From 49c8c6c0043b14cec0b0a794315feba1a8e7e923 Mon Sep 17 00:00:00 2001 From: daniel1210 <8622091+daniel1210@user.noreply.gitee.com> Date: Tue, 20 Jan 2026 16:34:38 +0800 Subject: [PATCH 01/10] feat: upgrade mind-cluster dependency and support Ascend 910C SuperPod device registration Signed-off-by: daniel1210 <8622091+daniel1210@user.noreply.gitee.com> Signed-off-by: ashergaga <1214443299@qq.com> --- cmd/main.go | 13 +- go.mod | 4 +- internal/manager/manager.go | 7 +- internal/server/server.go | 11 +- .../component/ascend-common/README.md | 8 + .../ascend-common/api/ascend-operator/LICENSE | 201 ++ .../api/ascend-operator/README.md | 164 ++ .../apis/batch/v1/ascendjob_types.go | 85 + .../apis/batch/v1/constants.go | 53 + .../ascend-operator/apis/batch/v1/defaults.go | 137 + .../ascend-operator/apis/batch/v1/register.go | 52 + .../apis/batch/v1/zz_generated.deepcopy.go | 137 + .../apis/batch/v1/zz_generated.defaults.go | 53 + .../client/clientset/versioned/clientset.go | 114 + .../clientset/versioned/scheme/register.go | 39 + .../versioned/typed/batch/v1/client.go | 110 + .../clientset/versioned/typed/batch/v1/job.go | 221 ++ .../externalversions/batch/interface.go | 49 + .../externalversions/batch/v1/interface.go | 48 + .../externalversions/batch/v1/job.go | 99 + .../informers/externalversions/factory.go | 207 ++ .../informers/externalversions/generic.go | 71 + .../internalinterfaces/factory_interfaces.go | 40 + .../listers/batch/v1/expansion_generated.go | 26 + .../client/listers/batch/v1/job.go | 108 + .../component/ascend-common/api/consts.go | 222 ++ .../ascend-common/api/default_name.go | 188 ++ .../ascend-common/api/publicfault.go | 32 + .../ascend-common/api/slownet/fault_net.go | 77 + .../ascend-common/api/superpoddevice.go | 36 + .../component/ascend-common/api/type.go | 30 + .../common-utils/cache/lrucache.go | 394 +++ .../common-utils/cache/lrucache_test.go | 304 +++ .../ascend-common/common-utils/hwlog/api.go | 310 +++ .../common-utils/hwlog/api_test.go 
| 165 ++ .../common-utils/hwlog/hwlog_adaptor.go | 174 ++ .../common-utils/hwlog/hwlog_adaptor_test.go | 126 + .../common-utils/hwlog/log_limiter.go | 156 ++ .../common-utils/hwlog/logger.go | 242 ++ .../common-utils/hwlog/logger_test.go | 217 ++ .../ascend-common/common-utils/hwlog/rolog.go | 447 ++++ .../common-utils/hwlog/rolog_test.go | 687 +++++ .../ascend-common/common-utils/hwlog/types.go | 49 + .../ascend-common/common-utils/hwlog/utils.go | 98 + .../common-utils/hwlog/utils_test.go | 38 + .../common-utils/limiter/limit_handler.go | 226 ++ .../limiter/limit_handler_test.go | 119 + .../common-utils/limiter/limit_listener.go | 161 ++ .../limiter/limit_listener_test.go | 125 + .../common-utils/limiter/limit_writer.go | 64 + .../common-utils/limiter/limit_writer_test.go | 37 + .../common-utils/rand/rand_linux.go | 71 + .../common-utils/rand/rand_linux_test.go | 54 + .../ascend-common/common-utils/rand/random.go | 28 + .../common-utils/rand/random_test.go | 32 + .../ascend-common/common-utils/utils/env.go | 35 + .../common-utils/utils/env_test.go | 51 + .../ascend-common/common-utils/utils/file.go | 176 ++ .../common-utils/utils/file_check.go | 240 ++ .../common-utils/utils/file_check_test.go | 194 ++ .../common-utils/utils/file_test.go | 169 ++ .../common-utils/utils/file_watcher.go | 85 + .../common-utils/utils/file_watcher_test.go | 81 + .../common-utils/utils/interface.go | 29 + .../common-utils/utils/interface_test.go | 36 + .../common-utils/utils/ip_utils.go | 98 + .../common-utils/utils/ip_utils_test.go | 182 ++ .../ascend-common/common-utils/utils/path.go | 382 +++ .../common-utils/utils/path_test.go | 232 ++ .../common-utils/utils/pwd_util.go | 75 + .../common-utils/utils/pwd_util_test.go | 59 + .../ascend-common/common-utils/utils/slice.go | 129 + .../common-utils/utils/slice_test.go | 536 ++++ .../common-utils/utils/strings.go | 75 + .../common-utils/utils/strings_test.go | 84 + .../ascend-common/devmanager/a310mgr.go | 25 + 
.../ascend-common/devmanager/a310pmgr.go | 35 + .../ascend-common/devmanager/a910mgr.go | 31 + .../devmanager/common/constants.go | 272 ++ .../ascend-common/devmanager/common/types.go | 435 ++++ .../ascend-common/devmanager/common/utils.go | 305 +++ .../devmanager/common/utils_test.go | 163 ++ .../devmanager/dcmi/constants.go | 78 + .../ascend-common/devmanager/dcmi/dcmi.go | 2213 +++++++++++++++++ .../devmanager/dcmi/dcmi_interface_api.h | 596 +++++ .../ascend-common/devmanager/devmanager.go | 1197 +++++++++ .../devmanager/devmanager_910a3_mock.go | 30 + .../devmanager/devmanager_910a3_mock_err.go | 43 + .../devmanager/devmanager_hccs_test.go | 166 ++ .../devmanager/devmanager_mock.go | 370 +++ .../devmanager/devmanager_mock_err.go | 369 +++ .../devmanager/devmanager_test.go | 78 + .../devmanager/hccn/hccn_tool.go | 335 +++ .../devmanager/hccn/hccn_tool_test.go | 49 + mind-cluster/component/ascend-common/go.mod | 55 + mind-cluster/component/ascend-common/go.sum | 492 ++++ .../component/npu-exporter/.gitignore | 1 + mind-cluster/component/npu-exporter/LICENSE | 201 ++ mind-cluster/component/npu-exporter/README.md | 42 + .../component/npu-exporter/build/Dockerfile | 21 + .../npu-exporter/build/Dockerfile-310P-1usoc | 31 + .../component/npu-exporter/build/build.sh | 80 + .../component/npu-exporter/build/build_ch.sh | 74 + .../build/metricConfiguration.json | 13 + .../build/npu-exporter-310P-1usoc.yaml | 167 ++ .../npu-exporter/build/npu-exporter.yaml | 140 ++ .../build/pluginConfiguration.json | 4 + .../npu-exporter/build/run_for_310P_1usoc.sh | 32 + .../component/npu-exporter/build/test.sh | 75 + .../npu-exporter/cmd/npu-exporter/main.go | 545 ++++ .../common/collector_for_container.go | 109 + .../common/collector_for_container_test.go | 137 + .../collector/common/constants.go | 140 ++ .../collector/common/metrics_collector.go | 192 ++ .../common/metrics_collector_test.go | 231 ++ .../collector/common/npu_collector.go | 423 ++++ 
.../collector/common/npu_collector_test.go | 547 ++++ .../npu-exporter/collector/common/types.go | 50 + .../collector/config/metrics_config.go | 208 ++ .../collector/config/metrics_config_test.go | 216 ++ .../collector/container/isula/isula_api.pb.go | 870 +++++++ .../collector/container/isula/isula_api.proto | 118 + .../container/isula/isula_api_grpc.pb.go | 107 + .../container/isula/isula_container.go | 39 + .../collector/container/isula/isulad.pb.go | 278 +++ .../collector/container/isula/isulad.proto | 35 + .../container/isula/isulad_grpc.pb.go | 105 + .../collector/container/parser.go | 630 +++++ .../collector/container/parser_test.go | 1027 ++++++++ .../collector/container/runtime_ops.go | 413 +++ .../collector/container/runtime_ops_test.go | 568 +++++ .../npu-exporter/collector/container/utils.go | 133 + .../collector/container/utils_test.go | 329 +++ .../collector/container/v1/containerd.pb.go | 310 +++ .../collector/container/v1/containerd.proto | 62 + .../collector/container/v1/spec.go | 59 + .../collector/metrics/collector_for_ddr.go | 142 ++ .../collector/metrics/collector_for_hbm.go | 228 ++ .../metrics/collector_for_hbm_test.go | 115 + .../collector/metrics/collector_for_hccs.go | 312 +++ .../metrics/collector_for_hccs_test.go | 150 ++ .../metrics/collector_for_network.go | 190 ++ .../collector/metrics/collector_for_npu.go | 453 ++++ .../metrics/collector_for_optical.go | 200 ++ .../collector/metrics/collector_for_pcie.go | 234 ++ .../collector/metrics/collector_for_roce.go | 263 ++ .../collector/metrics/collector_for_sio.go | 120 + .../metrics/collector_for_version.go | 56 + .../collector/metrics/collector_for_vnpu.go | 169 ++ .../metrics/collector_for_vnpu_test.go | 202 ++ .../collector/metrics/collector_test.go | 548 ++++ .../collector/metrics/common_utils.go | 193 ++ .../collector/metrics/common_utils_test.go | 165 ++ .../collector/testdata/prometheus_metrics | 166 ++ .../collector/testdata/prometheus_metrics2 | 6 + 
mind-cluster/component/npu-exporter/go.mod | 63 + mind-cluster/component/npu-exporter/go.sum | 561 +++++ .../npu-exporter/platforms/inputs/all/npu.go | 20 + .../platforms/inputs/npu/README.md | 107 + .../npu-exporter/platforms/inputs/npu/npu.go | 104 + .../platforms/inputs/npu/npu_test.go | 174 ++ .../platforms/inputs/npu/sample.conf | 9 + .../platforms/prom/prometheus_collector.go | 103 + .../prom/prometheus_collector_test.go | 159 ++ .../component/npu-exporter/plugins/README.md | 388 +++ .../plugins/collector_for_text_file.go | 358 +++ .../npu-exporter/plugins/register.go | 21 + .../utils/logger/general_logger.go | 76 + .../npu-exporter/utils/logger/logger.go | 174 ++ .../npu-exporter/utils/logger/logger_test.go | 119 + .../utils/logger/telegraf_logger.go | 82 + .../component/npu-exporter/utils/utils.go | 52 + .../npu-exporter/utils/utils_test.go | 103 + .../npu-exporter/versions/version.go | 23 + 174 files changed, 32608 insertions(+), 13 deletions(-) create mode 100644 mind-cluster/component/ascend-common/README.md create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/LICENSE create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/README.md create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/ascendjob_types.go create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/constants.go create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/defaults.go create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/register.go create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/zz_generated.deepcopy.go create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/zz_generated.defaults.go create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/clientset.go create mode 100644 
mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/scheme/register.go create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/typed/batch/v1/client.go create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/typed/batch/v1/job.go create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/interface.go create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/v1/interface.go create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/v1/job.go create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/factory.go create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/generic.go create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/internalinterfaces/factory_interfaces.go create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/listers/batch/v1/expansion_generated.go create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/listers/batch/v1/job.go create mode 100644 mind-cluster/component/ascend-common/api/consts.go create mode 100644 mind-cluster/component/ascend-common/api/default_name.go create mode 100644 mind-cluster/component/ascend-common/api/publicfault.go create mode 100644 mind-cluster/component/ascend-common/api/slownet/fault_net.go create mode 100644 mind-cluster/component/ascend-common/api/superpoddevice.go create mode 100644 mind-cluster/component/ascend-common/api/type.go create mode 100644 mind-cluster/component/ascend-common/common-utils/cache/lrucache.go create mode 100644 mind-cluster/component/ascend-common/common-utils/cache/lrucache_test.go create 
mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/api.go create mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/api_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/hwlog_adaptor.go create mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/hwlog_adaptor_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/log_limiter.go create mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/logger.go create mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/logger_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/rolog.go create mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/rolog_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/types.go create mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/utils.go create mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/utils_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/limiter/limit_handler.go create mode 100644 mind-cluster/component/ascend-common/common-utils/limiter/limit_handler_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/limiter/limit_listener.go create mode 100644 mind-cluster/component/ascend-common/common-utils/limiter/limit_listener_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/limiter/limit_writer.go create mode 100644 mind-cluster/component/ascend-common/common-utils/limiter/limit_writer_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/rand/rand_linux.go create mode 100644 mind-cluster/component/ascend-common/common-utils/rand/rand_linux_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/rand/random.go create mode 100644 mind-cluster/component/ascend-common/common-utils/rand/random_test.go create mode 100644 
mind-cluster/component/ascend-common/common-utils/utils/env.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/env_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/file.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/file_check.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/file_check_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/file_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/file_watcher.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/file_watcher_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/interface.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/interface_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/ip_utils.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/ip_utils_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/path.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/path_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/pwd_util.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/pwd_util_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/slice.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/slice_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/strings.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/strings_test.go create mode 100644 mind-cluster/component/ascend-common/devmanager/a310mgr.go create mode 100644 mind-cluster/component/ascend-common/devmanager/a310pmgr.go create mode 100644 mind-cluster/component/ascend-common/devmanager/a910mgr.go create mode 
100644 mind-cluster/component/ascend-common/devmanager/common/constants.go create mode 100644 mind-cluster/component/ascend-common/devmanager/common/types.go create mode 100644 mind-cluster/component/ascend-common/devmanager/common/utils.go create mode 100644 mind-cluster/component/ascend-common/devmanager/common/utils_test.go create mode 100644 mind-cluster/component/ascend-common/devmanager/dcmi/constants.go create mode 100644 mind-cluster/component/ascend-common/devmanager/dcmi/dcmi.go create mode 100644 mind-cluster/component/ascend-common/devmanager/dcmi/dcmi_interface_api.h create mode 100644 mind-cluster/component/ascend-common/devmanager/devmanager.go create mode 100644 mind-cluster/component/ascend-common/devmanager/devmanager_910a3_mock.go create mode 100644 mind-cluster/component/ascend-common/devmanager/devmanager_910a3_mock_err.go create mode 100644 mind-cluster/component/ascend-common/devmanager/devmanager_hccs_test.go create mode 100644 mind-cluster/component/ascend-common/devmanager/devmanager_mock.go create mode 100644 mind-cluster/component/ascend-common/devmanager/devmanager_mock_err.go create mode 100644 mind-cluster/component/ascend-common/devmanager/devmanager_test.go create mode 100644 mind-cluster/component/ascend-common/devmanager/hccn/hccn_tool.go create mode 100644 mind-cluster/component/ascend-common/devmanager/hccn/hccn_tool_test.go create mode 100644 mind-cluster/component/ascend-common/go.mod create mode 100644 mind-cluster/component/ascend-common/go.sum create mode 100644 mind-cluster/component/npu-exporter/.gitignore create mode 100644 mind-cluster/component/npu-exporter/LICENSE create mode 100644 mind-cluster/component/npu-exporter/README.md create mode 100644 mind-cluster/component/npu-exporter/build/Dockerfile create mode 100644 mind-cluster/component/npu-exporter/build/Dockerfile-310P-1usoc create mode 100644 mind-cluster/component/npu-exporter/build/build.sh create mode 100644 
mind-cluster/component/npu-exporter/build/build_ch.sh create mode 100644 mind-cluster/component/npu-exporter/build/metricConfiguration.json create mode 100644 mind-cluster/component/npu-exporter/build/npu-exporter-310P-1usoc.yaml create mode 100644 mind-cluster/component/npu-exporter/build/npu-exporter.yaml create mode 100644 mind-cluster/component/npu-exporter/build/pluginConfiguration.json create mode 100644 mind-cluster/component/npu-exporter/build/run_for_310P_1usoc.sh create mode 100644 mind-cluster/component/npu-exporter/build/test.sh create mode 100644 mind-cluster/component/npu-exporter/cmd/npu-exporter/main.go create mode 100644 mind-cluster/component/npu-exporter/collector/common/collector_for_container.go create mode 100644 mind-cluster/component/npu-exporter/collector/common/collector_for_container_test.go create mode 100644 mind-cluster/component/npu-exporter/collector/common/constants.go create mode 100644 mind-cluster/component/npu-exporter/collector/common/metrics_collector.go create mode 100644 mind-cluster/component/npu-exporter/collector/common/metrics_collector_test.go create mode 100644 mind-cluster/component/npu-exporter/collector/common/npu_collector.go create mode 100644 mind-cluster/component/npu-exporter/collector/common/npu_collector_test.go create mode 100644 mind-cluster/component/npu-exporter/collector/common/types.go create mode 100644 mind-cluster/component/npu-exporter/collector/config/metrics_config.go create mode 100644 mind-cluster/component/npu-exporter/collector/config/metrics_config_test.go create mode 100644 mind-cluster/component/npu-exporter/collector/container/isula/isula_api.pb.go create mode 100644 mind-cluster/component/npu-exporter/collector/container/isula/isula_api.proto create mode 100644 mind-cluster/component/npu-exporter/collector/container/isula/isula_api_grpc.pb.go create mode 100644 mind-cluster/component/npu-exporter/collector/container/isula/isula_container.go create mode 100644 
mind-cluster/component/npu-exporter/collector/container/isula/isulad.pb.go create mode 100644 mind-cluster/component/npu-exporter/collector/container/isula/isulad.proto create mode 100644 mind-cluster/component/npu-exporter/collector/container/isula/isulad_grpc.pb.go create mode 100644 mind-cluster/component/npu-exporter/collector/container/parser.go create mode 100644 mind-cluster/component/npu-exporter/collector/container/parser_test.go create mode 100644 mind-cluster/component/npu-exporter/collector/container/runtime_ops.go create mode 100644 mind-cluster/component/npu-exporter/collector/container/runtime_ops_test.go create mode 100644 mind-cluster/component/npu-exporter/collector/container/utils.go create mode 100644 mind-cluster/component/npu-exporter/collector/container/utils_test.go create mode 100644 mind-cluster/component/npu-exporter/collector/container/v1/containerd.pb.go create mode 100644 mind-cluster/component/npu-exporter/collector/container/v1/containerd.proto create mode 100644 mind-cluster/component/npu-exporter/collector/container/v1/spec.go create mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_ddr.go create mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_hbm.go create mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_hbm_test.go create mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_hccs.go create mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_hccs_test.go create mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_network.go create mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_npu.go create mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_optical.go create mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_pcie.go create mode 100644 
mind-cluster/component/npu-exporter/collector/metrics/collector_for_roce.go create mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_sio.go create mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_version.go create mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_vnpu.go create mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_vnpu_test.go create mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_test.go create mode 100644 mind-cluster/component/npu-exporter/collector/metrics/common_utils.go create mode 100644 mind-cluster/component/npu-exporter/collector/metrics/common_utils_test.go create mode 100644 mind-cluster/component/npu-exporter/collector/testdata/prometheus_metrics create mode 100644 mind-cluster/component/npu-exporter/collector/testdata/prometheus_metrics2 create mode 100644 mind-cluster/component/npu-exporter/go.mod create mode 100644 mind-cluster/component/npu-exporter/go.sum create mode 100644 mind-cluster/component/npu-exporter/platforms/inputs/all/npu.go create mode 100644 mind-cluster/component/npu-exporter/platforms/inputs/npu/README.md create mode 100644 mind-cluster/component/npu-exporter/platforms/inputs/npu/npu.go create mode 100644 mind-cluster/component/npu-exporter/platforms/inputs/npu/npu_test.go create mode 100644 mind-cluster/component/npu-exporter/platforms/inputs/npu/sample.conf create mode 100644 mind-cluster/component/npu-exporter/platforms/prom/prometheus_collector.go create mode 100644 mind-cluster/component/npu-exporter/platforms/prom/prometheus_collector_test.go create mode 100644 mind-cluster/component/npu-exporter/plugins/README.md create mode 100644 mind-cluster/component/npu-exporter/plugins/collector_for_text_file.go create mode 100644 mind-cluster/component/npu-exporter/plugins/register.go create mode 100644 mind-cluster/component/npu-exporter/utils/logger/general_logger.go create 
mode 100644 mind-cluster/component/npu-exporter/utils/logger/logger.go create mode 100644 mind-cluster/component/npu-exporter/utils/logger/logger_test.go create mode 100644 mind-cluster/component/npu-exporter/utils/logger/telegraf_logger.go create mode 100644 mind-cluster/component/npu-exporter/utils/utils.go create mode 100644 mind-cluster/component/npu-exporter/utils/utils_test.go create mode 100644 mind-cluster/component/npu-exporter/versions/version.go diff --git a/cmd/main.go b/cmd/main.go index c77c244..b233829 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -17,7 +17,6 @@ package main import ( - "context" "flag" "fmt" "os" @@ -29,7 +28,7 @@ import ( "github.com/Project-HAMi/ascend-device-plugin/internal/server" "github.com/Project-HAMi/ascend-device-plugin/version" "github.com/fsnotify/fsnotify" - "huawei.com/npu-exporter/v6/common-utils/hwlog" + "huawei.com/npu-exporter/utils/logger" "k8s.io/klog/v2" "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" ) @@ -117,11 +116,11 @@ func main() { checkFlags() klog.Infof("version: %s", version.GetVersion()) klog.Infof("using config file: %s", *configFile) - config := &hwlog.LogConfig{ - OnlyToStdout: true, - LogLevel: *hwLoglevel, - } - err := hwlog.InitRunLogger(config, context.Background()) + + logger.HwLogConfig.OnlyToStdout = true + logger.HwLogConfig.LogLevel = *hwLoglevel + + err := logger.InitLogger("Prometheus") if err != nil { klog.Fatalf("init huawei run logger failed, %v", err) } diff --git a/go.mod b/go.mod index 86c9b48..6ea6f74 100644 --- a/go.mod +++ b/go.mod @@ -3,6 +3,7 @@ module github.com/Project-HAMi/ascend-device-plugin go 1.22.2 require ( + ascend-common v0.0.0 github.com/Project-HAMi/HAMi v0.0.0 github.com/fsnotify/fsnotify v1.7.0 google.golang.org/grpc v1.63.2 @@ -57,6 +58,7 @@ require ( ) replace ( + ascend-common => ./mind-cluster/component/ascend-common github.com/Project-HAMi/HAMi v0.0.0 => github.com/Project-HAMi/HAMi v0.0.0-20250901013025-61c6cbe7d480 - huawei.com/npu-exporter/v6 => 
gitee.com/ascend/ascend-npu-exporter/v6 v6.0.0-RC3 + huawei.com/npu-exporter => ./mind-cluster/component/npu-exporter ) diff --git a/internal/manager/manager.go b/internal/manager/manager.go index db92bd8..d070ea9 100644 --- a/internal/manager/manager.go +++ b/internal/manager/manager.go @@ -20,9 +20,10 @@ import ( "fmt" "sort" + "ascend-common/devmanager" + "ascend-common/devmanager/dcmi" + "github.com/Project-HAMi/ascend-device-plugin/internal" - "huawei.com/npu-exporter/v6/devmanager" - "huawei.com/npu-exporter/v6/devmanager/dcmi" "k8s.io/klog/v2" ) @@ -45,7 +46,7 @@ type AscendManager struct { } func NewAscendManager() (*AscendManager, error) { - mgr, err := devmanager.AutoInit("") + mgr, err := devmanager.AutoInit("", 30) if err != nil { return nil, err } diff --git a/internal/server/server.go b/internal/server/server.go index 67dcb8d..22f0927 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -44,6 +44,7 @@ const ( // PodAllocAnno = "huawei.com/AscendDevices" NodeLockAscend = "hami.io/mutex.lock" Ascend910Prefix = "Ascend910" + Ascend910CType = "Ascend910C" ) var ( @@ -191,10 +192,16 @@ func (ps *PluginServer) registerKubelet() error { return nil } -func (ps *PluginServer) getDeviceNetworkID(idx int) (int, error) { +func (ps *PluginServer) getDeviceNetworkID(idx int, deviceType string) (int, error) { + // For Ascend910C devices, all modules (dies) are interconnected via HCCS + if deviceType == Ascend910CType { + return 0, nil + } + if idx > 3 { return 1, nil } + return 0, nil } @@ -214,7 +221,7 @@ func (ps *PluginServer) registerHAMi() error { Health: dev.Health, } if strings.HasPrefix(device.Type, Ascend910Prefix) { - NetworkID, err := ps.getDeviceNetworkID(i) + NetworkID, err := ps.getDeviceNetworkID(i, device.Type) if err != nil { return fmt.Errorf("get networkID error: %v", err) } diff --git a/mind-cluster/component/ascend-common/README.md b/mind-cluster/component/ascend-common/README.md new file mode 100644 index 0000000..fa7f1b8 
--- /dev/null +++ b/mind-cluster/component/ascend-common/README.md @@ -0,0 +1,8 @@ +# AscendCommon + +# 组件介绍 +提供公共代码给其他组件使用,组件包括NPU-Exporter等。 + +# 说明 + +1. 编译NPU-Exporter等组件时,AscendCommon要放在同一目录下 \ No newline at end of file diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/LICENSE b/mind-cluster/component/ascend-common/api/ascend-operator/LICENSE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/mind-cluster/component/ascend-common/api/ascend-operator/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/README.md b/mind-cluster/component/ascend-common/api/ascend-operator/README.md new file mode 100644 index 0000000..20c2f61 --- /dev/null +++ b/mind-cluster/component/ascend-common/api/ascend-operator/README.md @@ -0,0 +1,164 @@ +# ascend-operator-apis + +## 介绍 + +ascend-operator-apis旨在为用户提供AscendJob API,及其Clientsets, Listers、Informers。使用户能轻松对AscendJob进行CRUD操作。 + +## 接口说明 + +1. 创建clientsets + + ```go + NewForConfig(c *rest.Config)(*Clientset, error) + ``` + + | Parameters | Input/Output | Parameter Type | Description | + | ---------- | ------------ | -------------- | ------------------------------------------------------------ | + | c | Input | *rest.Config | 客户端配置文件,由k8s提供的接口生成。包括cluster host、证书等信息 | + | - | Output | *Clientset | Client集合,包括AscendJob client和discovery client | + | - | Output | error | 错误信息 | + +2. 创建AscendJob + + ```go + Create(ctx context.Context, job *v1.AscendJob, opts metav1.CreateOptions)(*v1.AscendJob, error) + ``` + + | Parameters | Input/Output | Parameter Type | Description | + | ---------- | ------------ | -------------------- | ----------------- | + | ctx | Input | context.Context | 上下文,协程控制 | + | job | Input | *v1.AscendJob | AscendJob对象指针 | + | opts | Input | metav1.CreateOptions | 创建选项 | + | - | Output | *v1.AscendJob | AscendJob对象指针 | + | - | Output | error | 错误信息 | + +3. 
获取AscendJob + + ```go + Get(ctx context.Context, name string, opts metav1.GetOptions)(*v1.AscendJob, error) + ``` + + | Parameters | Input/Output | Parameter Type | Description | + | ---------- | ------------ | -------------------- | ----------------- | + | ctx | Input | context.Context | 上下文,协程控制 | + | name | Input | string | AscendJob名称 | + | opts | Input | metav1.GetOptions | 获取选项 | + | - | Output | *v1.AscendJob | AscendJob对象指针 | + | - | Output | error | 错误信息 | + +4. 列举AscendJob + + ```go + List(ctx context.Context, opts metav1.ListOptions)(*v1.AscendJobList, error) + ``` + + | Parameters | Input/Output | Parameter Type | Description | + | ---------- | ------------ | -------------------- | --------------------- | + | ctx | Input | context.Context | 上下文,协程控制 | + | opts | Input | metav1.ListOptions | 列举选项 | + | - | Output | *v1.AscendJobList | AscendJobList对象指针 | + | - | Output | error | 错误信息 | + +5. 观察AscendJob + + ```go + Watch(ctx context.Context, opts metav1.ListOptions)(watch.Interface, error) + ``` + + | Parameters | Input/Output | Parameter Type | Description | + | ---------- | ------------ | ------------------ | ---------------- | + | ctx | Input | context.Context | 上下文,协程控制 | + | opts | Input | metav1.ListOptions | 列举选项 | + | - | Output | watch.Interface | watch类接口 | + | - | Output | error | 错误信息 | + +6. 更新AscendJob + + ```go + Update(ctx context.Context, job *v1.AscendJob, opts metav1.UpdateOptions)(*v1.AscendJob, error) + ``` + + | Parameters | Input/Output | Parameter Type | Description | + | ---------- | ------------ | -------------------- | ----------------- | + | ctx | Input | context.Context | 上下文,协程控制 | + | job | Input | *v1.AscendJob | AscendJob对象指针 | + | opts | Input | metav1.UpdateOptions | 更新选项 | + | - | Output | *v1.AscendJob | AscendJob对象指针 | + | - | Output | error | 错误信息 | + +7. 
更新AscendJob状态 + + ```go + UpdateStatus(ctx context.Context, job *v1.AscendJob, opts metav1.UpdateOptions)(*v1.AscendJob, error) + ``` + + | Parameters | Input/Output | Parameter Type | Description | + | ---------- | ------------ | -------------------- | ----------------- | + | ctx | Input | context.Context | 上下文,协程控制 | + | job | Input | *v1.AscendJob | AscendJob对象指针 | + | opts | Input | metav1.UpdateOptions | 更新选项 | + | - | Output | *v1.AscendJob | AscendJob对象指针 | + | - | Output | error | 错误信息 | + +8. 补丁AscendJob + + ```go + Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions, subresources ...string) (*v1.AscendJob, error) + ``` + + | Parameters | Input/Output | Parameter Type | Description | + | ------------ | ------------ | --------------- | ----------------- | + | ctx | Input | context.Context | 上下文,协程控制 | + | name | Input | string | AscendJob名称 | + | pt | Input | types.PatchType | patch类型 | + | data | Input | []byte | patch信息 | + | subresources | Input | ...string | 子信息 | + | - | Output | *v1.AscendJob | AscendJob对象指针 | + | - | Output | error | 错误信息 | + +9. 删除AscendJob + + ```go + Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error + ``` + + | Parameters | Input/Output | Parameter Type | Description | + | ---------- | ------------ | -------------------- | ---------------- | + | ctx | Input | context.Context | 上下文,协程控制 | + | name | Input | string | AscendJob名称 | + | opts | Input | metav1.DeleteOptions | 删除选项 | + | - | Output | error | 错误信息 | + +10. 批量删除AscendJob + + ```go + DeleteCollection(ctx context.Context,opts metav1.DeleteOptions, listOpts metav1.ListOptions) error + ``` + + | Parameters | Input/Output | Parameter Type | Description | + | ---------- | ------------ | -------------------- | ---------------- | + | ctx | Input | context.Context | 上下文,协程控制 | + | opts | Input | metav1.DeleteOptions | 删除选项 | + | listOpts | Input | metav1.ListOptions | 列举选项 | + | - | Output | error | 错误信息 | + +11. 
创建informerFactory + + ```go + NewSharedInformerFactory(client versioned.Interface, defaultResync time.Duration) sharedInformerFactory + ``` + + | Parameters | Input/Output | Parameter Type | Description | + | ------------- | ------------ | --------------------- | ------------------ | + | client | Input | versioned.Interface | client类接口 | + | defaultResync | Input | time.Duration | 默认的重新同步时间 | + | - | Output | sharedInformerFactory | informer类接口 | + +12. 创建informer + + ```go + sharedInformerFactory.Batch().V1().Jobs().Informer() + ``` + + + diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/ascendjob_types.go b/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/ascendjob_types.go new file mode 100644 index 0000000..7bd1d65 --- /dev/null +++ b/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/ascendjob_types.go @@ -0,0 +1,85 @@ +/* +Copyright 2023 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package v1 is used to define AscendJob object and its initialization. +package v1 + +import ( + commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// AscendJob is the Schema for the AscendJob API +type AscendJob struct { + // Standard Kubernetes type metadata. + metav1.TypeMeta `json:",inline"` + + // +optional + metav1.ObjectMeta `json:"metadata,omitempty"` + + // Specification of the desired state of the AscendJob. 
+ // +optional + Spec AscendJobSpec `json:"spec,omitempty"` + + // Most recently observed status of the AscendJob. + // Populated by the system. + // Read-only. + // +optional + Status commonv1.JobStatus `json:"status,omitempty"` +} + +// AscendJobSpec defines the desired state of AscendJob +type AscendJobSpec struct { + // RunPolicy encapsulates various runtime policies of the distributed training + // job, for example how to clean up resources and how long the job can stay + // active. + // +kubebuilder:validation:Optional + RunPolicy commonv1.RunPolicy `json:"runPolicy"` + + // SuccessPolicy defines the policy to mark the AscendJob as succeeded. + // Default to "", using the default rules. + // +optional + SuccessPolicy *SuccessPolicy `json:"successPolicy,omitempty"` + + // SchedulerName defines the job scheduler with gang-scheduling enabled + SchedulerName string `json:"schedulerName,omitempty"` + + /* A map of ReplicaType (type) to ReplicaSpec (value). Specifies the ML cluster configuration. + For example, + { + "Scheduler": ReplicaSpec, + "Worker": ReplicaSpec, + } + */ + ReplicaSpecs map[commonv1.ReplicaType]*commonv1.ReplicaSpec `json:"replicaSpecs"` +} + +// AscendJobList contains a list of AscendJob +type AscendJobList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []AscendJob `json:"items"` +} + +// SuccessPolicy is the success policy. 
+type SuccessPolicy string + +const ( + // SuccessPolicyDefault is the default policy of success + SuccessPolicyDefault SuccessPolicy = "" + // SuccessPolicyAllWorkers is the 'AllWorkers' policy of success + SuccessPolicyAllWorkers SuccessPolicy = "AllWorkers" +) diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/constants.go b/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/constants.go new file mode 100644 index 0000000..9341682 --- /dev/null +++ b/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/constants.go @@ -0,0 +1,53 @@ +/* +Copyright 2023 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1 + +import ( + "github.com/kubeflow/common/pkg/apis/common/v1" +) + +const ( + // GroupName is the group name used in this package. + GroupName = "mindxdl.gitee.com" + + // FrameworkKey is the key of the label + FrameworkKey = "framework" + + // DefaultPort is default value of the port. 
+ DefaultPort = 2222 + + // MindSporeFrameworkName is the name of ML Framework + MindSporeFrameworkName = "mindspore" + // MindSporeReplicaTypeScheduler is the type for Scheduler of distribute ML + MindSporeReplicaTypeScheduler v1.ReplicaType = "Scheduler" + + // PytorchFrameworkName is the name of ML Framework + PytorchFrameworkName = "pytorch" + // PytorchReplicaTypeMaster is the type for Scheduler of distribute ML + PytorchReplicaTypeMaster v1.ReplicaType = "Master" + + // TensorflowFrameworkName is the name of ML Framework + TensorflowFrameworkName = "tensorflow" + // TensorflowReplicaTypeChief is the type for Scheduler of distribute ML + TensorflowReplicaTypeChief v1.ReplicaType = "Chief" + + // ReplicaTypeWorker this is also used for non-distributed AscendJob + ReplicaTypeWorker v1.ReplicaType = "Worker" + + // DefaultRestartPolicy is default RestartPolicy for MSReplicaSpec. + DefaultRestartPolicy = v1.RestartPolicyNever +) diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/defaults.go b/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/defaults.go new file mode 100644 index 0000000..4d5c124 --- /dev/null +++ b/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/defaults.go @@ -0,0 +1,137 @@ +/* +Copyright 2023 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package v1 + +import ( + "errors" + "fmt" + "strings" + + commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" + "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/runtime" + + "ascend-common/api" +) + +// Int32 is a helper routine that allocates a new int32 value +// to store v and returns a pointer to it. +func Int32(v int32) *int32 { + return &v +} + +// addDefaultingFuncs is used to register default funcs +func addDefaultingFuncs(scheme *runtime.Scheme) error { + return RegisterDefaults(scheme) +} + +// setDefaultPort adds the api.DefaultPortName port to the container named api.DefaultContainerName (or to the first container if none matches) unless it is already present; spec.Containers must be non-empty. +func setDefaultPort(spec *v1.PodSpec) { + index := 0 + for i, container := range spec.Containers { + if container.Name == api.DefaultContainerName { + index = i + break + + } + } + hasASJobPort := false + for _, port := range spec.Containers[index].Ports { + if port.Name == api.DefaultPortName { + hasASJobPort = true + break + } + } + if !hasASJobPort { + spec.Containers[index].Ports = append(spec.Containers[index].Ports, v1.ContainerPort{ + Name: api.DefaultPortName, + ContainerPort: DefaultPort, + }) + } +} + +func setDefaultReplicas(spec *commonv1.ReplicaSpec) { + if spec.Replicas == nil { + spec.Replicas = Int32(1) + } + if spec.RestartPolicy == "" { + spec.RestartPolicy = DefaultRestartPolicy + } +} + +// setTypeNamesToCamelCase sets the name of all replica types from any case to correct case. +func setTypeNamesToCamelCase(job *AscendJob) { + setTypeNameToCamelCase(job, MindSporeReplicaTypeScheduler) + setTypeNameToCamelCase(job, ReplicaTypeWorker) + setTypeNameToCamelCase(job, PytorchReplicaTypeMaster) + setTypeNameToCamelCase(job, TensorflowReplicaTypeChief) +} + +// setTypeNameToCamelCase sets the name of the replica type from any case to correct case. +// E.g. from master to Master; from WORKER to Worker. 
+func setTypeNameToCamelCase(job *AscendJob, typ commonv1.ReplicaType) { + for t := range job.Spec.ReplicaSpecs { + if strings.EqualFold(string(t), string(typ)) && t != typ { + spec := job.Spec.ReplicaSpecs[t] + delete(job.Spec.ReplicaSpecs, t) + job.Spec.ReplicaSpecs[typ] = spec + return + } + } +} + +// SetDefaultsAscendJob sets any unspecified values to defaults. +func SetDefaultsAscendJob(job *AscendJob) { + // Set default cleanpod policy to None. + if job == nil { + return + } + + if job.Spec.RunPolicy.CleanPodPolicy == nil { + running := commonv1.CleanPodPolicyNone + job.Spec.RunPolicy.CleanPodPolicy = &running + } + // Set default success policy to "". + if job.Spec.SuccessPolicy == nil { + defaultPolicy := SuccessPolicyDefault + job.Spec.SuccessPolicy = &defaultPolicy + } + + // Update the key of replicaSpecs to camel case. + setTypeNamesToCamelCase(job) + + for rt, spec := range job.Spec.ReplicaSpecs { + // Set default replicas to 1. + setDefaultReplicas(spec) + // Set default port to ml container. + if rt == MindSporeReplicaTypeScheduler || rt == PytorchReplicaTypeMaster || rt == TensorflowReplicaTypeChief { + setDefaultPort(&spec.Template.Spec) + } + } +} + +// GetJobFramework gets the framework name of the AscendJob from its labels. +func GetJobFramework(job *AscendJob) (string, error) { + if job == nil || job.Labels == nil { + return "", errors.New("job or job labels is nil") + } + frame, ok := job.Labels[FrameworkKey] + if !ok { + return "", fmt.Errorf("job<%s-%s> label framework is not set", job.Namespace, job.Name) + } + return frame, nil +} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/register.go b/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/register.go new file mode 100644 index 0000000..5813e39 --- /dev/null +++ b/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/register.go @@ -0,0 +1,52 @@ +/* +Copyright 2023 Huawei Technologies Co., Ltd. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1 + +import ( + "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" +) + +var ( + // SchemeGroupVersion is the group version used to register these objects. + SchemeGroupVersion = schema.GroupVersion{Group: GroupName, Version: "v1"} + // SchemeBuilder points to a list of functions added to Scheme. + SchemeBuilder = runtime.NewSchemeBuilder(addKnownTypes) + // AddToScheme adds the types in this group-version to the given scheme. + AddToScheme = SchemeBuilder.AddToScheme +) + +// Resource takes an unqualified resource and returns a Group-qualified GroupResource. +func Resource(resource string) schema.GroupResource { + return SchemeGroupVersion.WithResource(resource).GroupResource() +} + +// addKnownTypes adds the set of types defined in this package to the supplied scheme. 
+func addKnownTypes(scheme *runtime.Scheme) error { + scheme.AddKnownTypes(SchemeGroupVersion, + &AscendJob{}, + &AscendJobList{}, + ) + + v1.AddToGroupVersion(scheme, SchemeGroupVersion) + return nil +} + +func init() { + SchemeBuilder.Register(addDefaultingFuncs) +} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/zz_generated.deepcopy.go b/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/zz_generated.deepcopy.go new file mode 100644 index 0000000..695038b --- /dev/null +++ b/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/zz_generated.deepcopy.go @@ -0,0 +1,137 @@ +//go:build !ignore_autogenerated +// +build !ignore_autogenerated + +/* +Copyright 2023 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by controller-gen. DO NOT EDIT. + +package v1 + +import ( + commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" + "k8s.io/apimachinery/pkg/runtime" +) + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AscendJob) DeepCopyInto(out *AscendJob) { + if in == nil { + return + } + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MSJob. 
+func (in *AscendJob) DeepCopy() *AscendJob { + if in == nil { + return nil + } + out := new(AscendJob) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *AscendJob) DeepCopyObject() runtime.Object { + if in == nil { + return nil + } + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AscendJobList) DeepCopyInto(out *AscendJobList) { + if in == nil { + return + } + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]AscendJob, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MSJobList. +func (in *AscendJobList) DeepCopy() *AscendJobList { + if in == nil { + return nil + } + out := new(AscendJobList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *AscendJobList) DeepCopyObject() runtime.Object { + if in == nil { + return nil + } + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *AscendJobSpec) DeepCopyInto(out *AscendJobSpec) { + if in == nil { + return + } + *out = *in + in.RunPolicy.DeepCopyInto(&out.RunPolicy) + if in.SuccessPolicy != nil { + in, out := &in.SuccessPolicy, &out.SuccessPolicy + *out = new(SuccessPolicy) + **out = **in + } + if in.ReplicaSpecs != nil { + in, out := &in.ReplicaSpecs, &out.ReplicaSpecs + *out = make(map[commonv1.ReplicaType]*commonv1.ReplicaSpec, len(*in)) + for key, val := range *in { + var outVal *commonv1.ReplicaSpec + if val == nil { + (*out)[key] = nil + } else { + in, out := &val, &outVal + *out = new(commonv1.ReplicaSpec) + (*in).DeepCopyInto(*out) + } + (*out)[key] = outVal + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MSJobSpec. +func (in *AscendJobSpec) DeepCopy() *AscendJobSpec { + if in == nil { + return nil + } + out := new(AscendJobSpec) + in.DeepCopyInto(out) + return out +} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/zz_generated.defaults.go b/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/zz_generated.defaults.go new file mode 100644 index 0000000..e9b774a --- /dev/null +++ b/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/zz_generated.defaults.go @@ -0,0 +1,53 @@ +//go:build !ignore_autogenerated +// +build !ignore_autogenerated + +/* +Copyright 2023 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +// Code generated by defaulter-gen. DO NOT EDIT. + +package v1 + +import ( + "errors" + + runtime "k8s.io/apimachinery/pkg/runtime" +) + +// RegisterDefaults adds defaulters functions to the given scheme. +// Public to allow building arbitrary schemes. +// All generated defaulters are covering - they call all nested defaulters. +func RegisterDefaults(scheme *runtime.Scheme) error { + if scheme == nil { + return errors.New("scheme is nil") + } + scheme.AddTypeDefaultingFunc(&AscendJob{}, func(obj interface{}) { SetObjectDefaults_AscendJob(obj.(*AscendJob)) }) + scheme.AddTypeDefaultingFunc(&AscendJobList{}, func(obj interface{}) { SetObjectDefaults_AscendJobList(obj.(*AscendJobList)) }) + return nil +} + +func SetObjectDefaults_AscendJob(in *AscendJob) { + SetDefaultsAscendJob(in) +} + +func SetObjectDefaults_AscendJobList(in *AscendJobList) { + if in == nil { + return + } + for i := range in.Items { + a := &in.Items[i] + SetObjectDefaults_AscendJob(a) + } +} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/clientset.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/clientset.go new file mode 100644 index 0000000..0d4add4 --- /dev/null +++ b/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/clientset.go @@ -0,0 +1,114 @@ +/* +Copyright 2023 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +// Package versioned is used to define the Clientset interface and struct, and its initialization. +package versioned + +import ( + "fmt" + "net/http" + + "k8s.io/client-go/discovery" + "k8s.io/client-go/rest" + "k8s.io/client-go/util/flowcontrol" + + "ascend-common/api/ascend-operator/client/clientset/versioned/typed/batch/v1" +) + +// Interface is the interface definition for Clientset. +type Interface interface { + Discovery() discovery.DiscoveryInterface + BatchV1() v1.BatchV1Interface +} + +// Clientset contains the clients for groups. Each group has exactly one +// version included in a Clientset. +type Clientset struct { + *discovery.DiscoveryClient + batchV1 *v1.BatchV1Client +} + +// BatchV1 retrieves the BatchV1Client +func (c *Clientset) BatchV1() v1.BatchV1Interface { + if c == nil { + return nil + } + return c.batchV1 +} + +// Discovery retrieves the DiscoveryClient +func (c *Clientset) Discovery() discovery.DiscoveryInterface { + if c == nil { + return nil + } + return c.DiscoveryClient +} + +// NewForConfig creates a new Clientset for the given config, which must be non-nil (it is dereferenced without a check). +// If config's RateLimiter is not set and QPS and Burst are acceptable, +// NewForConfig will generate a rate-limiter in configShallowCopy. +// NewForConfig is equivalent to NewForConfigAndClient(c, httpClient), +// where httpClient was generated with rest.HTTPClientFor(c). +func NewForConfig(c *rest.Config) (*Clientset, error) { + configShallowCopy := *c + + // share the transport between all clients + httpClient, err := rest.HTTPClientFor(&configShallowCopy) + if err != nil { + return nil, err + } + + return NewForConfigAndClient(&configShallowCopy, httpClient) +} + +// NewForConfigAndClient creates a new Clientset for the given config and http client. +// Note the http client provided takes precedence over the configured transport values. +// If config's RateLimiter is not set and QPS and Burst are acceptable, +// NewForConfigAndClient will generate a rate-limiter in configShallowCopy. 
+func NewForConfigAndClient(c *rest.Config, httpClient *http.Client) (*Clientset, error) {
+	if c == nil || httpClient == nil {
+		return nil, fmt.Errorf("nil pointer")
+	}
+	configShallowCopy := *c // copy, so the RateLimiter assigned below never lands in the caller's config
+	if configShallowCopy.RateLimiter == nil && configShallowCopy.QPS > 0 {
+		if configShallowCopy.Burst <= 0 { // reject QPS > 0 with a non-positive burst before building the limiter
+			return nil, fmt.Errorf("burst is required to be greater than 0 " +
+				"when RateLimiter is not set and QPS is set to greater than 0")
+		}
+		configShallowCopy.RateLimiter = flowcontrol.NewTokenBucketRateLimiter(configShallowCopy.QPS, configShallowCopy.Burst)
+	}
+
+	var cs Clientset
+	var err error
+	cs.batchV1, err = v1.NewForConfigAndClient(&configShallowCopy, httpClient) // same config copy and http client for both clients
+	if err != nil {
+		return nil, err
+	}
+	cs.DiscoveryClient, err = discovery.NewDiscoveryClientForConfigAndClient(&configShallowCopy, httpClient)
+	if err != nil {
+		return nil, err
+	}
+	return &cs, nil
+}
+
+// New creates a new Clientset whose batch and discovery clients both wrap the given RESTClient.
+func New(c rest.Interface) *Clientset {
+	var cs Clientset
+	cs.batchV1 = v1.New(c)
+
+	cs.DiscoveryClient = discovery.NewDiscoveryClient(c)
+	return &cs
+}
diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/scheme/register.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/scheme/register.go
new file mode 100644
index 0000000..58a99b0
--- /dev/null
+++ b/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/scheme/register.go
@@ -0,0 +1,39 @@
+/*
+Copyright 2023 Huawei Technologies Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package scheme holds the runtime.Scheme, codec factory and parameter codec used by the clientset.
+package scheme
+
+import (
+	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/runtime/serializer"
+	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
+
+	"ascend-common/api/ascend-operator/apis/batch/v1"
+)
+
+// RuntimeScheme is the Scheme instance that init passes to v1.AddToScheme.
+var RuntimeScheme = runtime.NewScheme()
+
+// Codecs is the CodecFactory built on RuntimeScheme.
+var Codecs = serializer.NewCodecFactory(RuntimeScheme)
+
+// ParameterCodec is the parameter codec built on RuntimeScheme.
+var ParameterCodec = runtime.NewParameterCodec(RuntimeScheme)
+
+func init() {
+	utilruntime.Must(v1.AddToScheme(RuntimeScheme)) // Must panics if AddToScheme reports an error
+}
diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/typed/batch/v1/client.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/typed/batch/v1/client.go
new file mode 100644
index 0000000..7dd8264
--- /dev/null
+++ b/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/typed/batch/v1/client.go
@@ -0,0 +1,110 @@
+/*
+Copyright 2023 Huawei Technologies Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package v1 is used to define some client- and job-related interfaces, initialization operations,
+// and method implementations.
+package v1
+
+import (
+	"errors"
+	"net/http"
+
+	"k8s.io/client-go/rest"
+
+	"ascend-common/api/ascend-operator/apis/batch/v1"
+	"ascend-common/api/ascend-operator/client/clientset/versioned/scheme"
+)
+
+// BatchV1Interface is the batch client interface: REST client access plus the JobsGetter.
+type BatchV1Interface interface {
+	RESTClient() rest.Interface
+	JobsGetter
+}
+
+// BatchV1Client is the client structure; it wraps the rest.Interface used for all requests.
+type BatchV1Client struct {
+	restClient rest.Interface
+}
+
+// Jobs returns a JobInterface for the given namespace, or nil when the receiver is nil.
+func (c *BatchV1Client) Jobs(namespace string) JobInterface {
+	if c == nil {
+		return nil
+	}
+	return newJobs(c, namespace)
+}
+
+// RESTClient returns a RESTClient that is used to communicate
+// with API server by this client implementation; nil when the receiver is nil.
+func (c *BatchV1Client) RESTClient() rest.Interface {
+	if c == nil {
+		return nil
+	}
+	return c.restClient
+}
+
+// NewForConfig creates a new BatchV1Client for the given config; a nil config yields an error.
+// NewForConfig is equivalent to NewForConfigAndClient(c, httpClient),
+// where httpClient was generated with rest.HTTPClientFor(c).
+func NewForConfig(c *rest.Config) (*BatchV1Client, error) {
+	if c == nil {
+		return nil, errors.New(nilPointError)
+	}
+	config := *c // defaults are written to this copy, not to the caller's config
+	if err := setConfigDefaults(&config); err != nil {
+		return nil, err
+	}
+	httpClient, err := rest.HTTPClientFor(&config)
+	if err != nil {
+		return nil, err
+	}
+	return NewForConfigAndClient(&config, httpClient)
+}
+
+func setConfigDefaults(config *rest.Config) error { // fills group version, API path, serializer, default user agent
+	gv := v1.SchemeGroupVersion // local copy: GroupVersion points at gv, not at v1.SchemeGroupVersion itself
+	config.GroupVersion = &gv
+	config.APIPath = "/apis"
+	config.NegotiatedSerializer = scheme.Codecs.WithoutConversion()
+
+	if config.UserAgent == "" { // a user agent supplied by the caller is kept
+		config.UserAgent = rest.DefaultKubernetesUserAgent()
+	}
+
+	return nil
+}
+
+// NewForConfigAndClient creates a new BatchV1Client for the given config and http client.
+// Note the http client provided takes precedence over the configured transport values.
+func NewForConfigAndClient(c *rest.Config, h *http.Client) (*BatchV1Client, error) {
+	if c == nil || h == nil {
+		return nil, errors.New(nilPointError)
+	}
+	config := *c // defaults are applied to this copy, leaving the caller's config untouched
+	if err := setConfigDefaults(&config); err != nil {
+		return nil, err
+	}
+	client, err := rest.RESTClientForConfigAndClient(&config, h)
+	if err != nil {
+		return nil, err
+	}
+	return &BatchV1Client{restClient: client}, nil
+}
+
+// New creates a new BatchV1Client for the given RESTClient.
+func New(c rest.Interface) *BatchV1Client {
+	return &BatchV1Client{restClient: c}
+}
diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/typed/batch/v1/job.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/typed/batch/v1/job.go
new file mode 100644
index 0000000..a6527ad
--- /dev/null
+++ b/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/typed/batch/v1/job.go
@@ -0,0 +1,221 @@
+/*
+Copyright 2023 Huawei Technologies Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package v1
+
+import (
+	"context"
+	"errors"
+	"time"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/apimachinery/pkg/watch"
+	"k8s.io/client-go/rest"
+
+	"ascend-common/api"
+	"ascend-common/api/ascend-operator/apis/batch/v1"
+	"ascend-common/api/ascend-operator/client/clientset/versioned/scheme"
+)
+
+const (
+	nilPointError = "nil pointer" // message of the error returned for nil receivers and nil arguments
+)
+
+// JobsGetter has a method to return a JobInterface.
+// A group's client should implement this interface.
+type JobsGetter interface {
+	Jobs(namespace string) JobInterface
+}
+
+// JobInterface has methods to work with Job (AscendJob) resources.
+type JobInterface interface {
+	Create(ctx context.Context, job *v1.AscendJob, opts metav1.CreateOptions) (*v1.AscendJob, error)
+	Update(ctx context.Context, job *v1.AscendJob, opts metav1.UpdateOptions) (*v1.AscendJob, error)
+	UpdateStatus(ctx context.Context, job *v1.AscendJob, opts metav1.UpdateOptions) (*v1.AscendJob, error)
+	Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error
+	DeleteCollection(ctx context.Context, opts metav1.DeleteOptions, listOpts metav1.ListOptions) error
+	Get(ctx context.Context, name string, opts metav1.GetOptions) (*v1.AscendJob, error)
+	List(ctx context.Context, opts metav1.ListOptions) (*v1.AscendJobList, error)
+	Watch(ctx context.Context, opts metav1.ListOptions) (watch.Interface, error)
+	Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions,
+		subresources ...string) (result *v1.AscendJob, err error)
+	// JobExpansion
+}
+
+// jobs implements JobInterface on top of a rest.Interface, scoped to the namespace ns.
+type jobs struct {
+	client rest.Interface
+	ns     string
+}
+
+func (j *jobs) Create(ctx context.Context, job *v1.AscendJob, opts metav1.CreateOptions) (*v1.AscendJob, error) {
+	if j == nil || job == nil { // same guard as Update/UpdateStatus: fail locally rather than POST without a job
+		return nil, errors.New(nilPointError)
+	}
+	result := &v1.AscendJob{}
+	err := j.client.Post().
+		Namespace(j.ns).
+		Resource(api.AscendJobsLowerCase).
+		VersionedParams(&opts, scheme.ParameterCodec).
+		Body(job).
+		Do(ctx).
+		Into(result)
+	return result, err
+}
+
+func (j *jobs) Update(ctx context.Context, job *v1.AscendJob, opts metav1.UpdateOptions) (*v1.AscendJob,
+	error) {
+	if j == nil || job == nil { // job.Name is read below, so job must not be nil
+		return nil, errors.New(nilPointError)
+	}
+	result := &v1.AscendJob{}
+	err := j.client.Put().
+		Namespace(j.ns).
+		Resource(api.AscendJobsLowerCase).
+		Name(job.Name).
+		VersionedParams(&opts, scheme.ParameterCodec).
+		Body(job).
+		Do(ctx).
+		Into(result)
+	return result, err
+}
+
+func (j *jobs) UpdateStatus(ctx context.Context, job *v1.AscendJob, opts metav1.UpdateOptions) (*v1.AscendJob,
+	error) {
+	if j == nil || job == nil { // job.Name is read below, so job must not be nil
+		return nil, errors.New(nilPointError)
+	}
+	result := &v1.AscendJob{}
+	err := j.client.Put().
+		Namespace(j.ns).
+		Resource(api.AscendJobsLowerCase).
+		Name(job.Name).
+		SubResource("status").
+		VersionedParams(&opts, scheme.ParameterCodec).
+		Body(job).
+		Do(ctx).
+		Into(result)
+	return result, err
+}
+
+func (j *jobs) Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error {
+	if j == nil {
+		return errors.New(nilPointError)
+	}
+	return j.client.Delete().
+		Namespace(j.ns).
+		Resource(api.AscendJobsLowerCase).
+		Name(name).
+		Body(&opts).
+		Do(ctx).
+		Error()
+}
+
+func (j *jobs) DeleteCollection(ctx context.Context, opts metav1.DeleteOptions, listOpts metav1.ListOptions) error {
+	if j == nil {
+		return errors.New(nilPointError)
+	}
+	var timeout time.Duration // stays zero unless listOpts carries TimeoutSeconds
+	if listOpts.TimeoutSeconds != nil {
+		timeout = time.Duration(*listOpts.TimeoutSeconds) * time.Second
+	}
+	return j.client.Delete().
+		Namespace(j.ns).
+		Resource(api.AscendJobsLowerCase).
+		VersionedParams(&listOpts, scheme.ParameterCodec).
+		Timeout(timeout).
+		Body(&opts).
+		Do(ctx).
+		Error()
+}
+
+func (j *jobs) Get(ctx context.Context, name string, opts metav1.GetOptions) (*v1.AscendJob, error) {
+	if j == nil {
+		return nil, errors.New(nilPointError)
+	}
+	result := &v1.AscendJob{}
+	err := j.client.Get().
+		Namespace(j.ns).
+		Resource(api.AscendJobsLowerCase).
+		Name(name).
+		VersionedParams(&opts, scheme.ParameterCodec).
+		Do(ctx).
+		Into(result)
+	return result, err
+}
+
+func (j *jobs) List(ctx context.Context, opts metav1.ListOptions) (*v1.AscendJobList, error) {
+	if j == nil {
+		return nil, errors.New(nilPointError)
+	}
+	var timeout time.Duration // stays zero unless opts carries TimeoutSeconds
+	if opts.TimeoutSeconds != nil {
+		timeout = time.Duration(*opts.TimeoutSeconds) * time.Second
+	}
+	result := &v1.AscendJobList{}
+	err := j.client.Get().
+		Namespace(j.ns).
+		Resource(api.AscendJobsLowerCase).
+		VersionedParams(&opts, scheme.ParameterCodec).
+		Timeout(timeout).
+		Do(ctx).
+		Into(result)
+	return result, err
+}
+
+func (j *jobs) Watch(ctx context.Context, opts metav1.ListOptions) (watch.Interface, error) {
+	if j == nil {
+		return nil, errors.New(nilPointError)
+	}
+	var timeout time.Duration // stays zero unless opts carries TimeoutSeconds
+	if opts.TimeoutSeconds != nil {
+		timeout = time.Duration(*opts.TimeoutSeconds) * time.Second
+	}
+	opts.Watch = true // opts is a by-value parameter, so the caller's options are not modified
+	return j.client.Get().
+		Namespace(j.ns).
+		Resource(api.AscendJobsLowerCase).
+		VersionedParams(&opts, scheme.ParameterCodec).
+		Timeout(timeout).
+		Watch(ctx)
+}
+
+func (j *jobs) Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions,
+	subresources ...string) (*v1.AscendJob, error) {
+	if j == nil {
+		return nil, errors.New(nilPointError)
+	}
+	result := &v1.AscendJob{}
+	err := j.client.Patch(pt).
+		Namespace(j.ns).
+		Resource(api.AscendJobsLowerCase).
+		Name(name).
+		SubResource(subresources...).
+		VersionedParams(&opts, scheme.ParameterCodec).
+		Body(data).
+		Do(ctx).
+		Into(result)
+	return result, err
+}
+
+// newJobs returns a jobs bound to c's RESTClient and the given namespace.
+func newJobs(c *BatchV1Client, namespace string) *jobs {
+	return &jobs{
+		client: c.RESTClient(),
+		ns:     namespace,
+	}
+}
diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/interface.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/interface.go
new file mode 100644
index 0000000..78b5d12
--- /dev/null
+++ b/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/interface.go
@@ -0,0 +1,49 @@
+/*
+Copyright 2023 Huawei Technologies Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package batch is used to define interfaces.
+package batch
+
+import (
+	"ascend-common/api/ascend-operator/client/informers/externalversions/batch/v1"
+	"ascend-common/api/ascend-operator/client/informers/externalversions/internalinterfaces"
+)
+
+// Interface provides access to each of this group's versions.
+type Interface interface {
+	// V1 provides access to shared informers for resources in V1.
+	V1() v1.Interface
+}
+
+type group struct {
+	factory          internalinterfaces.SharedInformerFactory
+	namespace        string
+	tweakListOptions internalinterfaces.TweakListOptionsFunc
+}
+
+// New returns a new Interface.
+func New(f internalinterfaces.SharedInformerFactory, namespace string,
+	tweakListOptions internalinterfaces.TweakListOptionsFunc) Interface {
+	return &group{factory: f, namespace: namespace, tweakListOptions: tweakListOptions}
+}
+
+// V1 returns a new v1.Interface built from the group's factory, namespace and list-option tweak.
+func (g *group) V1() v1.Interface {
+	if g == nil {
+		return nil
+	}
+	return v1.New(g.factory, g.namespace, g.tweakListOptions)
+}
diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/v1/interface.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/v1/interface.go
new file mode 100644
index 0000000..a4f0466
--- /dev/null
+++ b/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/v1/interface.go
@@ -0,0 +1,48 @@
+/*
+Copyright 2023 Huawei Technologies Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package v1 is used to implement job informer-related methods.
+package v1
+
+import (
+	"ascend-common/api/ascend-operator/client/informers/externalversions/internalinterfaces"
+)
+
+// Interface provides access to all the informers in this group version.
+type Interface interface {
+	// Jobs returns a JobInformer.
+	Jobs() JobInformer
+}
+
+type version struct {
+	factory          internalinterfaces.SharedInformerFactory
+	namespace        string
+	tweakListOptions internalinterfaces.TweakListOptionsFunc
+}
+
+// New returns a new Interface.
+func New(f internalinterfaces.SharedInformerFactory, namespace string,
+	tweakListOptions internalinterfaces.TweakListOptionsFunc) Interface {
+	return &version{factory: f, namespace: namespace, tweakListOptions: tweakListOptions}
+}
+
+// Jobs returns a JobInformer carrying the version's factory, namespace and list-option tweak.
+func (v *version) Jobs() JobInformer {
+	if v == nil {
+		return nil
+	}
+	return &jobInformer{factory: v.factory, namespace: v.namespace, tweakListOptions: v.tweakListOptions}
+}
diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/v1/job.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/v1/job.go
new file mode 100644
index 0000000..e5f0b1c
--- /dev/null
+++ b/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/v1/job.go
@@ -0,0 +1,99 @@
+/*
+Copyright 2023 Huawei Technologies Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package v1
+
+import (
+	"context"
+	"time"
+
+	"k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/watch"
+	"k8s.io/client-go/tools/cache"
+
+	batchv1 "ascend-common/api/ascend-operator/apis/batch/v1"
+	"ascend-common/api/ascend-operator/client/clientset/versioned"
+	"ascend-common/api/ascend-operator/client/informers/externalversions/internalinterfaces"
+	batchlister "ascend-common/api/ascend-operator/client/listers/batch/v1"
+)
+
+// JobInformer provides access to a shared informer and lister for
+// Jobs (the informer is created for the batchv1.AscendJob type).
+type JobInformer interface {
+	Informer() cache.SharedIndexInformer
+	Lister() batchlister.JobLister
+}
+
+type jobInformer struct {
+	factory          internalinterfaces.SharedInformerFactory
+	tweakListOptions internalinterfaces.TweakListOptionsFunc
+	namespace        string
+}
+
+// NewJobInformer constructs a new informer for Job type with no list-option tweak (it passes nil).
+// Always prefer using an informer factory to get a shared informer instead of getting an independent
+// one. This reduces memory footprint and number of connections to the server.
+func NewJobInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration,
+	indexers cache.Indexers) cache.SharedIndexInformer {
+	return NewFilteredJobInformer(client, namespace, resyncPeriod, indexers, nil)
+}
+
+// NewFilteredJobInformer constructs a new informer for Job type.
+// Always prefer using an informer factory to get a shared informer instead of getting an independent
+// one. This reduces memory footprint and number of connections to the server.
+func NewFilteredJobInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration,
+	indexers cache.Indexers, tweakListOptions internalinterfaces.TweakListOptionsFunc) cache.SharedIndexInformer {
+	return cache.NewSharedIndexInformer(
+		&cache.ListWatch{
+			ListFunc: func(options v1.ListOptions) (runtime.Object, error) {
+				if tweakListOptions != nil { // the tweak is optional; NewJobInformer passes nil
+					tweakListOptions(&options)
+				}
+				return client.BatchV1().Jobs(namespace).List(context.TODO(), options)
+			},
+			WatchFunc: func(options v1.ListOptions) (watch.Interface, error) {
+				if tweakListOptions != nil { // the tweak is optional; NewJobInformer passes nil
+					tweakListOptions(&options)
+				}
+				return client.BatchV1().Jobs(namespace).Watch(context.TODO(), options)
+			},
+		},
+		&batchv1.AscendJob{},
+		resyncPeriod,
+		indexers,
+	)
+}
+
+func (f *jobInformer) defaultInformer(client versioned.Interface,
+	resyncPeriod time.Duration) cache.SharedIndexInformer {
+	return NewFilteredJobInformer(client, f.namespace, resyncPeriod, cache.Indexers{
+		cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}, f.tweakListOptions)
+}
+
+func (f *jobInformer) Informer() cache.SharedIndexInformer {
+	if f == nil || f.factory == nil { // without a factory there is nothing to create or share the informer
+		return nil
+	}
+	return f.factory.InformerFor(&batchv1.AscendJob{}, f.defaultInformer)
+}
+
+func (f *jobInformer) Lister() batchlister.JobLister {
+	if informer := f.Informer(); informer != nil { // Informer is nil for a nil receiver or factory; never call into it then
+		return batchlister.NewJobLister(informer.GetIndexer())
+	}
+	return nil
+}
diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/factory.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/factory.go
new file mode 100644
index 0000000..5fec15f
--- /dev/null
+++ b/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/factory.go
@@ -0,0 +1,207 @@
+/*
+Copyright 2023 Huawei Technologies Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package externalversions
+
+import (
+	"reflect"
+	"sync"
+	"time"
+
+	"k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/runtime/schema"
+	"k8s.io/client-go/tools/cache"
+
+	"ascend-common/api/ascend-operator/client/clientset/versioned"
+	"ascend-common/api/ascend-operator/client/informers/externalversions/batch"
+	"ascend-common/api/ascend-operator/client/informers/externalversions/internalinterfaces"
+)
+
+// SharedInformerFactory provides shared informers for resources in all known
+// API group versions.
+type SharedInformerFactory interface {
+	internalinterfaces.SharedInformerFactory
+	ForResource(resource schema.GroupVersionResource) (GenericInformer, error)
+	WaitForCacheSync(stopCh <-chan struct{}) map[reflect.Type]bool
+
+	Batch() batch.Interface
+}
+
+// SharedInformerOption defines the functional option type for SharedInformerFactory.
+type SharedInformerOption func(*sharedInformerFactory) *sharedInformerFactory
+
+type sharedInformerFactory struct {
+	client           versioned.Interface
+	namespace        string
+	tweakListOptions internalinterfaces.TweakListOptionsFunc
+	lock             sync.Mutex // held by Start, WaitForCacheSync and InformerFor while they touch the maps below
+	defaultResync    time.Duration
+	customResync     map[reflect.Type]time.Duration // per-type override of defaultResync
+
+	informers map[reflect.Type]cache.SharedIndexInformer
+	// startedInformers is used for tracking which informers have been started.
+	// This allows Start() to be called multiple times safely.
+	startedInformers map[reflect.Type]bool
+}
+
+// WithCustomResyncConfig sets a custom resync period for the specified informer types.
+func WithCustomResyncConfig(resyncConfig map[v1.Object]time.Duration) SharedInformerOption {
+	return func(factory *sharedInformerFactory) *sharedInformerFactory {
+		if factory == nil { // nothing to configure; the nil factory is handed back unchanged
+			return factory
+		}
+
+		if factory.customResync == nil {
+			factory.customResync = make(map[reflect.Type]time.Duration)
+		}
+
+		for k, v := range resyncConfig {
+			factory.customResync[reflect.TypeOf(k)] = v // keyed by the dynamic type of the object, as InformerFor looks it up
+		}
+		return factory
+	}
+}
+
+// WithTweakListOptions sets a custom filter on all listers of the configured SharedInformerFactory.
+func WithTweakListOptions(tweakListOptions internalinterfaces.TweakListOptionsFunc) SharedInformerOption {
+	return func(factory *sharedInformerFactory) *sharedInformerFactory {
+		if factory == nil {
+			return nil
+		}
+		factory.tweakListOptions = tweakListOptions
+		return factory
+	}
+}
+
+// WithNamespace limits the SharedInformerFactory to the specified namespace.
+func WithNamespace(namespace string) SharedInformerOption {
+	return func(factory *sharedInformerFactory) *sharedInformerFactory {
+		if factory == nil {
+			return nil
+		}
+		factory.namespace = namespace
+		return factory
+	}
+}
+
+// NewSharedInformerFactory constructs a new instance of sharedInformerFactory for all namespaces.
+func NewSharedInformerFactory(client versioned.Interface, defaultResync time.Duration) SharedInformerFactory {
+	return NewSharedInformerFactoryWithOptions(client, defaultResync)
+}
+
+// NewSharedInformerFactoryWithOptions constructs a new instance of a SharedInformerFactory with additional options.
+func NewSharedInformerFactoryWithOptions(client versioned.Interface, defaultResync time.Duration,
+	options ...SharedInformerOption) SharedInformerFactory {
+	factory := &sharedInformerFactory{
+		client:           client,
+		namespace:        v1.NamespaceAll, // default scope; WithNamespace overrides it
+		defaultResync:    defaultResync,
+		informers:        make(map[reflect.Type]cache.SharedIndexInformer),
+		startedInformers: make(map[reflect.Type]bool),
+		customResync:     make(map[reflect.Type]time.Duration),
+	}
+
+	// Apply all options, in the order given; each option returns the factory to continue with
+	for _, opt := range options {
+		factory = opt(factory)
+	}
+
+	return factory
+}
+
+// Start launches every registered informer that has not been started yet; a nil factory is a no-op.
+func (f *sharedInformerFactory) Start(stopCh <-chan struct{}) {
+	if f == nil {
+		return
+	}
+	f.lock.Lock()
+	defer f.lock.Unlock()
+
+	if f.startedInformers == nil {
+		f.startedInformers = make(map[reflect.Type]bool)
+	}
+
+	for informerType, informer := range f.informers {
+		if !f.startedInformers[informerType] { // already-started informers are skipped, so repeated Start calls are safe
+			go informer.Run(stopCh) // each informer runs in its own goroutine and is handed stopCh
+			f.startedInformers[informerType] = true
+		}
+	}
+}
+
+// WaitForCacheSync waits for the caches of all started informers and reports the result per informer type.
+func (f *sharedInformerFactory) WaitForCacheSync(stopCh <-chan struct{}) map[reflect.Type]bool {
+	informers := func() map[reflect.Type]cache.SharedIndexInformer { // snapshot under the lock; waiting happens outside it
+		if f == nil {
+			return nil
+		}
+		f.lock.Lock()
+		defer f.lock.Unlock()
+
+		informers := map[reflect.Type]cache.SharedIndexInformer{}
+		for informerType, informer := range f.informers {
+			if f.startedInformers[informerType] { // informers that were never started are left out
+				informers[informerType] = informer
+			}
+		}
+		return informers
+	}()
+
+	res := map[reflect.Type]bool{}
+	for informType, informer := range informers {
+		res[informType] = cache.WaitForCacheSync(stopCh, informer.HasSynced)
+	}
+	return res
+}
+
+// InformerFor returns the SharedIndexInformer for obj, creating it with newFunc and the factory's
+// client the first time that type is requested.
+func (f *sharedInformerFactory) InformerFor(obj runtime.Object,
+	newFunc internalinterfaces.NewInformerFunc) cache.SharedIndexInformer {
+	if f == nil {
+		return nil
+	}
+
+	f.lock.Lock()
+	defer f.lock.Unlock()
+
+	informerType := reflect.TypeOf(obj) // informers are keyed by the dynamic type of obj
+	informer, exists := f.informers[informerType]
+	if exists { // one informer per type: later callers get the one created first
+		return informer
+	}
+
+	resyncPeriod, exists := f.customResync[informerType]
+	if !exists { // no per-type override registered, fall back to the factory default
+		resyncPeriod = f.defaultResync
+	}
+
+	informer = newFunc(f.client, resyncPeriod)
+	if f.informers == nil { // the lookup above tolerates a nil map, the store below does not
+		f.informers = make(map[reflect.Type]cache.SharedIndexInformer)
+	}
+	f.informers[informerType] = informer
+
+	return informer
+}
+
+func (f *sharedInformerFactory) Batch() batch.Interface { // batch group informers; nil for a nil factory
+	if f == nil {
+		return nil
+	}
+	return batch.New(f, f.namespace, f.tweakListOptions)
+}
diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/generic.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/generic.go
new file mode 100644
index 0000000..95db6d0
--- /dev/null
+++ b/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/generic.go
@@ -0,0 +1,71 @@
+/*
+Copyright 2023 Huawei Technologies Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + +package externalversions + +import ( + "errors" + "fmt" + + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/tools/cache" + + "ascend-common/api" + "ascend-common/api/ascend-operator/apis/batch/v1" +) + +// GenericInformer is type of SharedIndexInformer which will locate and delegate to other +// sharedInformers based on type +type GenericInformer interface { + Informer() cache.SharedIndexInformer + Lister() cache.GenericLister +} + +type genericInformer struct { + informer cache.SharedIndexInformer + resource schema.GroupResource +} + +// Informer returns the SharedIndexInformer. +func (f *genericInformer) Informer() cache.SharedIndexInformer { + if f == nil { + return nil + } + return f.informer +} + +// Lister returns the GenericLister. +func (f *genericInformer) Lister() cache.GenericLister { + if f == nil { + return nil + } + return cache.NewGenericLister(f.Informer().GetIndexer(), f.resource) +} + +// ForResource gives generic access to a shared informer of the matching type +// extend this to unknown resources with a client pool +func (f *sharedInformerFactory) ForResource(resource schema.GroupVersionResource) (GenericInformer, error) { + if f == nil { + return nil, errors.New("nil pointer") + } + switch resource { + case v1.SchemeGroupVersion.WithResource(api.AscendJobsLowerCase): + return &genericInformer{resource: resource.GroupResource(), informer: f.Batch().V1().Jobs().Informer()}, nil + default: + } + + return nil, fmt.Errorf("no informer found for %v", resource) +} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/internalinterfaces/factory_interfaces.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/internalinterfaces/factory_interfaces.go new file mode 100644 index 0000000..5602b78 --- /dev/null +++ b/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/internalinterfaces/factory_interfaces.go 
@@ -0,0 +1,40 @@ +/* +Copyright 2019 Bloomberg Finance LP. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package internalinterfaces is used to define informer-related interfaces. +package internalinterfaces + +import ( + "time" + + "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/tools/cache" + + "ascend-common/api/ascend-operator/client/clientset/versioned" +) + +// NewInformerFunc takes versioned.Interface and time.Duration to return a SharedIndexInformer. +type NewInformerFunc func(versioned.Interface, time.Duration) cache.SharedIndexInformer + +// SharedInformerFactory a small interface to allow for adding an informer without an import cycle +type SharedInformerFactory interface { + Start(stopCh <-chan struct{}) + InformerFor(obj runtime.Object, newFunc NewInformerFunc) cache.SharedIndexInformer +} + +// TweakListOptionsFunc is a function that transforms a v1.ListOptions. +type TweakListOptionsFunc func(*v1.ListOptions) diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/listers/batch/v1/expansion_generated.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/listers/batch/v1/expansion_generated.go new file mode 100644 index 0000000..9ed431c --- /dev/null +++ b/mind-cluster/component/ascend-common/api/ascend-operator/client/listers/batch/v1/expansion_generated.go @@ -0,0 +1,26 @@ +/* +Copyright 2024 Huawei Technologies Co., Ltd. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package v1 is used to define job-related interfaces. +package v1 + +// JobListerExpansion allows custom methods to be added to +// JobLister. +type JobListerExpansion interface{} + +// JobNamespaceListerExpansion allows custom methods to be added to +// JobNamespaceLister. +type JobNamespaceListerExpansion interface{} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/listers/batch/v1/job.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/listers/batch/v1/job.go new file mode 100644 index 0000000..084a913 --- /dev/null +++ b/mind-cluster/component/ascend-common/api/ascend-operator/client/listers/batch/v1/job.go @@ -0,0 +1,108 @@ +/* +Copyright 2024 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package v1
+
+import (
+	"errors"
+
+	k8serr "k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/client-go/tools/cache"
+
+	"ascend-common/api/ascend-operator/apis/batch/v1"
+)
+
+// JobLister helps list Jobs.
+// All objects returned here must be treated as read-only.
+type JobLister interface {
+	// List lists all Jobs in the indexer.
+	// Objects returned here must be treated as read-only.
+	List(selector labels.Selector) (ret []*v1.AscendJob, err error)
+	// Jobs returns an object that can list and get Jobs.
+	Jobs(namespace string) JobNamespaceLister
+	JobListerExpansion
+}
+
+// jobLister implements the JobLister interface on top of a cache.Indexer.
+type jobLister struct {
+	indexer cache.Indexer
+}
+
+// NewJobLister returns a new JobLister that reads from the given indexer.
+func NewJobLister(indexer cache.Indexer) JobLister {
+	return &jobLister{indexer: indexer}
+}
+
+// List lists all Jobs in the indexer that match selector.
+// A nil receiver yields the error "nil pointer". Otherwise the error reported
+// by cache.ListAll is returned together with whatever was collected.
+func (s *jobLister) List(selector labels.Selector) ([]*v1.AscendJob, error) {
+	if s == nil {
+		return nil, errors.New("nil pointer")
+	}
+	var ret []*v1.AscendJob
+	err := cache.ListAll(s.indexer, selector, func(m interface{}) {
+		// Unchecked assertion: an item of any other type makes this panic.
+		ret = append(ret, m.(*v1.AscendJob))
+	})
+	return ret, err
+}
+
+// Jobs returns an object that can list and get Jobs in the given namespace.
+// A nil receiver yields a nil JobNamespaceLister.
+func (s *jobLister) Jobs(namespace string) JobNamespaceLister {
+	if s == nil {
+		return nil
+	}
+	return jobNamespaceLister{indexer: s.indexer, namespace: namespace}
+}
+
+// JobNamespaceLister helps list and get Jobs.
+// All objects returned here must be treated as read-only.
+type JobNamespaceLister interface {
+	// List lists all Jobs in the indexer for a given namespace.
+	// Objects returned here must be treated as read-only.
+	List(selector labels.Selector) (ret []*v1.AscendJob, err error)
+	// Get retrieves the Job from the indexer for a given namespace and name.
+	// Objects returned here must be treated as read-only.
+	Get(name string) (*v1.AscendJob, error)
+	JobNamespaceListerExpansion
+}
+
+// jobNamespaceLister implements the JobNamespaceLister
+// interface. It is a small value type: the shared indexer plus the namespace
+// every lookup is restricted to.
+type jobNamespaceLister struct {
+	indexer   cache.Indexer
+	namespace string
+}
+
+// List lists all Jobs in the indexer for a given namespace.
+// The error reported by cache.ListAllByNamespace is returned as is.
+func (s jobNamespaceLister) List(selector labels.Selector) ([]*v1.AscendJob, error) {
+	var ret []*v1.AscendJob
+	err := cache.ListAllByNamespace(s.indexer, s.namespace, selector, func(m interface{}) {
+		// Unchecked assertion: an item of any other type makes this panic.
+		ret = append(ret, m.(*v1.AscendJob))
+	})
+	return ret, err
+}
+
+// Get retrieves the Job from the indexer for a given namespace and name.
+// The lookup key is "<namespace>/<name>". An indexer error is returned
+// unchanged; a missing key yields a NotFound error built for resource "job".
+func (s jobNamespaceLister) Get(name string) (*v1.AscendJob, error) {
+	obj, exists, err := s.indexer.GetByKey(s.namespace + "/" + name)
+	if err != nil {
+		return nil, err
+	}
+	if !exists {
+		return nil, k8serr.NewNotFound(v1.Resource("job"), name)
+	}
+	// Unchecked assertion: a stored object of any other type makes this panic.
+	return obj.(*v1.AscendJob), nil
+}
diff --git a/mind-cluster/component/ascend-common/api/consts.go b/mind-cluster/component/ascend-common/api/consts.go
new file mode 100644
index 0000000..01881ce
--- /dev/null
+++ b/mind-cluster/component/ascend-common/api/consts.go
@@ -0,0 +1,222 @@
+// Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved.
+ +// Package api common const +package api + +// Env +const ( + NodeNameEnv = "NODE_NAME" + + // PtWorldSizeEnv the total number of npu used for the task for PyTorch + PtWorldSizeEnv = "WORLD_SIZE" + // PtLocalWorldSizeEnv number of npu used per pod for PyTorch + PtLocalWorldSizeEnv = "LOCAL_WORLD_SIZE" + // PtLocalRankEnv logic id List of npu used by pod for PyTorch + PtLocalRankEnv = "LOCAL_RANK" + + // TfWorkerSizeEnv the total number of npu used for the task for TensorFlow + TfWorkerSizeEnv = "CM_WORKER_SIZE" + // TfLocalWorkerEnv number of npu used per pod for TensorFlow + TfLocalWorkerEnv = "CM_LOCAL_WORKER" + + // MsWorkerNumEnv the total number of npu used for the task for MindSpore + MsWorkerNumEnv = "MS_WORKER_NUM" + // MsLocalWorkerEnv number of npu used per pod for MindSpore + MsLocalWorkerEnv = "MS_LOCAL_WORKER" +) + +// NameSpace +const ( + DLNamespace = "mindx-dl" + ClusterNS = "cluster-system" + KubeNS = "kube-system" +) + +// Node +const ( + // NPUChipMemoryLabel label value is npu chip memory + NPUChipMemoryLabel = "mind-cluster/npu-chip-memory" + + // NodeSNAnnotation annotation value is node sn + NodeSNAnnotation = "product-serial-number" + // BaseDevInfoAnno annotation value is device base info + BaseDevInfoAnno = "baseDeviceInfos" + + // AcceleratorTypeKey the node label key of accelerator type + AcceleratorTypeKey = "accelerator-type" + // AcceleratorTypeModule910A3SuperPod for 910A3-SuperPod hardware + AcceleratorTypeModule910A3SuperPod = "module-a3-16-super-pod" +) + +// Pod +const ( + // PodUsedHardwareTypeAnno annotation value is the hardware type that real used in pod + PodUsedHardwareTypeAnno = "mind-cluster/hardware-type" + // PodRankIndexAnno annotation value is rank index of the pod + PodRankIndexAnno = "hccl/rankIndex" + // SuperPodIDAnno annotation key of the super pod id + SuperPodIDAnno = "super-pod-id" + + // Hotswitch Annotations + + // InHotSwitchFlowKey in hot switch flow key + InHotSwitchFlowKey = "inHotSwitchFlow" + // 
InHotSwitchFlowValue in hot switch flow true + InHotSwitchFlowValue = "true" + // BackupNewPodNameKey backup new pod name key + BackupNewPodNameKey = "backupNewPodName" + // BackupSourcePodNameKey backup source pod name key + BackupSourcePodNameKey = "backupSourcePodName" + // NeedOperatorOpeKey need operator ope key + NeedOperatorOpeKey = "needOperatorOpe" + // NeedVolcanoOpeKey need volcano ope key + NeedVolcanoOpeKey = "needVolcanoOpe" + // OpeTypeDelete ope type delete + OpeTypeDelete = "delete" + // OpeTypeCreate ope type create + OpeTypeCreate = "create" + // PodTypeKey pod type key + PodTypeKey = "podType" + // PodTypeBackup pod type backup + PodTypeBackup = "backup" + // DefaultRetryTimes default retry times + DefaultRetryTimes = 3 + // MasterPodRank master pod rank + MasterPodRank = "0" +) + +// PodGroup +const ( + // AtlasTaskLabel label value task kind, eg. ascend-910, ascend-{xxx}b + AtlasTaskLabel = "ring-controller.atlas" +) + +// ConfigMap +const ( + // DeviceInfoCMDataKey device-info-cm data key, record device info + DeviceInfoCMDataKey = "DeviceInfoCfg" + // SwitchInfoCMDataKey device-info-cm data key, record switch info + SwitchInfoCMDataKey = "SwitchInfoCfg" + // NodeInfoCMDataKey node-info-cm data key, record node info + NodeInfoCMDataKey = "NodeInfo" + // PubFaultCMDataKey public fault cm data key, record public fault info + PubFaultCMDataKey = "PublicFault" + + // CIMCMLabelKey cm label key, who uses these cms + CIMCMLabelKey = "mx-consumer-cim" + // PubFaultCMLabelKey public fault cm label key + PubFaultCMLabelKey = "mc-consumer-publicfault" +) + +const ( + // FaultJobCmName fault job cm name + FaultJobCmName = "fault-job-info" +) + +const ( + // PodScheduleLabel pod schedule label + PodScheduleLabel = "pod-rescheduling" + // ProcessScheduleLabel process schedule label + ProcessScheduleLabel = "process-recover-enable" + // RecoverStrategyKey recover strategy key in job annotation + RecoverStrategyKey = "recover-strategy" +) + +// process 
schedule strategy +const ( + // RecoverStrategy recover strategy + RecoverStrategy = "recover" + // RetryStrategy retry strategy + RetryStrategy = "retry" + // InPlaceStrategy recover in place strategy + InPlaceStrategy = "recover-in-place" + // DumpStrategy dump strategy + DumpStrategy = "dump" + // ExitStrategy exit strategy + ExitStrategy = "exit" + // ElasticTraining elastic-training strategy + ElasticTraining = "elastic-training" +) + +// process schedule common env +const ( + // ProcessRecoverEnv process recover env + ProcessRecoverEnv = "PROCESS_RECOVER" + // ElasticRecoverEnv elastic process recover env + ElasticRecoverEnv = "ELASTIC_PROCESS_RECOVER_ENABLE" + // EnableRestartEnv enable restart env + EnableRestartEnv = "ENABLE_RESTART_FAULT_PROCESS" +) + +// process schedule pytorch env +const ( + // HighAvailableEnv high available env + HighAvailableEnv = "HIGH_AVAILABILITY" + // PtCloseWatchDogKey pt close watch dog key + PtCloseWatchDogKey = "HCCL_ASYNC_ERROR_HANDLING" + // PtCloseWatchDogValue pt close watch dog value + PtCloseWatchDogValue = "0" +) + +// process schedule ms env +const ( + // MsRecoverEnv ms recover env + MsRecoverEnv = "MS_ENABLE_TFT" + // EnableMS enable ms + EnableMS = "MINDIO_FOR_MINDSPORE" + // MsDumpStrategy ms dump strategy + MsDumpStrategy = "TTP:1" + // MsUceStrategy ms uce strategy + MsUceStrategy = "UCE:1" + // MsArfStrategy ms arf strategy + MsArfStrategy = "ARF:1" + // MsHcceStrategy ms hcce strategy + MsHcceStrategy = "HCCE:1" + // MsRscStrategy ms rsc strategy + MsRscStrategy = "RSC:1" + // MsCloseWatchDogKey ms close watch dog key + MsCloseWatchDogKey = "MS_ENABLE_THM" + // MsCloseWatchDogValue ms close watch dog value + MsCloseWatchDogValue = `{HCCL_WATCHDOG:0}` +) + +const ( + //EnableFunc Enable Func + EnableFunc = "on" + // EnableFlag enable flag + EnableFlag = "1" + // PytorchFramework framework + PytorchFramework = "pytorch" + // MindSporeFramework framework + MindSporeFramework = "mindspore" +) + +const ( + // 
RescheduleInPlaceKey reschedule in place key + RescheduleInPlaceKey = "reschedule-in-place" + // RescheduleInPlaceValue reschedule in place value + RescheduleInPlaceValue = "true" +) + +const ( + // DeviceResetTimeout device reset timeout + DeviceResetTimeout = "deviceResetTimeout" + // DefaultDeviceResetTimeout default device reset timeout is 60 seconds + DefaultDeviceResetTimeout = 60 + // MinDeviceResetTimeout min device reset timeout is 10 seconds + MinDeviceResetTimeout = 10 + // MaxDeviceResetTimeout max device reset timeout is 600 seconds + MaxDeviceResetTimeout = 600 +) + +const ( + // SubHealthyStrategy config in pod group label for subHealthy fault strategy + SubHealthyStrategy = "subHealthyStrategy" + // SubHealthyHotSwitch strategy name of hot switch + SubHealthyHotSwitch = "hotSwitch" +) + +const ( + // MinAvailableKey decide minAvailable of task + MinAvailableKey = "huawei.com/schedule_minAvailable" +) diff --git a/mind-cluster/component/ascend-common/api/default_name.go b/mind-cluster/component/ascend-common/api/default_name.go new file mode 100644 index 0000000..7f0ae6c --- /dev/null +++ b/mind-cluster/component/ascend-common/api/default_name.go @@ -0,0 +1,188 @@ +// Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. 
+ +// Package api common brand moniker +package api + +// common +const ( + // Pod910DeviceAnno annotation value is for generating 910 hccl rank table + Pod910DeviceAnno = "ascend.kubectl.kubernetes.io/ascend-910-configuration" + + // ResourceNamePrefix pre resource name + ResourceNamePrefix = "huawei.com/" + // PodRealAlloc pod annotation key, means pod real mount device + PodRealAlloc = "AscendReal" + + // PodAnnotationAscendReal pod annotation ascend real + PodAnnotationAscendReal = "huawei.com/AscendReal" + + // Ascend brand name + Ascend = "Ascend" + // AscendJob job kind is AscendJob + AscendJob = "AscendJob" + // AscendJobsLowerCase for ascend jobs lowercase + AscendJobsLowerCase = "ascendjobs" + + // AscendOperator ascend-Operator + AscendOperator = "ascend-Operator" +) + +// common 910 +const ( + // Ascend910 for 910 chip + Ascend910 = "Ascend910" + // Ascend910Lowercase for 910 chip lowercase + Ascend910Lowercase = "ascend910" + // HuaweiAscend910 ascend 910 chip with prefix + HuaweiAscend910 = "huawei.com/Ascend910" + // Ascend910MinuxPrefix name prefix of ascend 910 chip + Ascend910MinuxPrefix = "Ascend910-" + // Ascend910MinuxCase minus type of ascend 910 chip + Ascend910MinuxCase = "ascend-910" + // Ascend910No 910 chip number + Ascend910No = "910" +) + +// common 910 A1 +const ( + // Ascend910A ascend 910A chip + Ascend910A = "Ascend910" + // Ascend910APattern regular expression for 910A + Ascend910APattern = `^910` +) + +// common 910 A2 +const ( + // Ascend910B ascend 910B chip + Ascend910B = "Ascend910B" + // Ascend910BPattern regular expression for 910B + Ascend910BPattern = `^(910B\d{1}|A2G\d{1})` +) + +// common 910 A3 +const ( + // Ascend910A3 ascend Ascend910A3 chip + Ascend910A3 = "Ascend910A3" +) + +// common 310 +const ( + // Ascend310 ascend 310 chip + Ascend310 = "Ascend310" + // Ascend310Lowercase ascend 310 chip lowercase + Ascend310Lowercase = "ascend310" + // Ascend310No 310 chip number + Ascend310No = "310" + // HuaweiAscend310 
ascend 310 chip with prefix + HuaweiAscend310 = "huawei.com/Ascend310" + // Ascend310MinuxPrefix name prefix of ascend 310 chip + Ascend310MinuxPrefix = "Ascend310-" +) + +// common 310B +const ( + // Ascend310B ascend 310B chip + Ascend310B = "Ascend310B" + // Ascend310BNo 310B chip number + Ascend310BNo = "310B" +) + +// common 310P +const ( + // Ascend310P ascend 310P chip + Ascend310P = "Ascend310P" + // Ascend310PLowercase ascend 310P chip lowercase + Ascend310PLowercase = "ascend310P" + // Ascend310PNo 310P chip number + Ascend310PNo = "310P" + // Ascend310PPattern regular expression for 310P + Ascend310PPattern = `^(310P\d{0,1}|I2\d{0,1})` + // HuaweiAscend310P ascend 310P chip with prefix + HuaweiAscend310P = "huawei.com/Ascend310P" + // Ascend310PMinuxPrefix name prefix of ascend 310P chip + Ascend310PMinuxPrefix = "Ascend310P-" +) + +// device plugin +const ( + // Use310PMixedInsert use 310P Mixed insert + Use310PMixedInsert = "use310PMixedInsert" + // Ascend310PMix dp use310PMixedInsert parameter usage + Ascend310PMix = "ascend310P-V, ascend310P-VPro, ascend310P-IPro" + // A300IA2Label the value of the A300I A2 node label + A300IA2Label = "card-910b-infer" + // A300IDuoLabel the value of the A300I Duo node label + A300IDuoLabel = "card-300i-duo" + //UseAscendDocker UseAscendDocker parameter + UseAscendDocker = "useAscendDocker" +) + +// docker runtime +const ( + // AscendDockerRuntime ascend-docker-runtime + AscendDockerRuntime = "ascend-docker-runtime" + // AscendDockerHook ascend-docker-hook + AscendDockerHook = "ascend-docker-hook" + // AscendDockerDestroy ascend-docker-destroy + AscendDockerDestroy = "ascend-docker-destroy" + // AscendDockerCli ascend-docker-cli + AscendDockerCli = "ascend-docker-cli" + + // AscendDockerRuntimeEnv env variable + AscendDockerRuntimeEnv = "ASCEND_DOCKER_RUNTIME" + // AscendVisibleDevicesEnv env variable + AscendVisibleDevicesEnv = "ASCEND_VISIBLE_DEVICES" + // AscendRuntimeOptionsEnv env variable + 
AscendRuntimeOptionsEnv = "ASCEND_RUNTIME_OPTIONS" + // AscendRuntimeMountsEnv env variable + AscendRuntimeMountsEnv = "ASCEND_RUNTIME_MOUNTS" + // AscendAllowLinkEnv env variable + AscendAllowLinkEnv = "ASCEND_ALLOW_LINK" + // AscendVnpuSpescEnv env variable + AscendVnpuSpescEnv = "ASCEND_VNPU_SPECS" + + // RunTimeLogDir dir path of runtime + RunTimeLogDir = "/var/log/ascend-docker-runtime/" + // HookRunLogPath run log path of hook + HookRunLogPath = "/var/log/ascend-docker-runtime/hook-run.log" + // InstallHelperRunLogPath run log path of install helper + InstallHelperRunLogPath = "/var/log/ascend-docker-runtime/install-helper-run.log" + // RunTimeRunLogPath run log path of runtime + RunTimeRunLogPath = "/var/log/ascend-docker-runtime/runtime-run.log" + + // RunTimeDConfigPath config path + RunTimeDConfigPath = "/etc/ascend-docker-runtime.d" +) + +// npu exporter +const ( + // DevicePathPattern device path pattern + DevicePathPattern = `^/dev/davinci\d+$` + // HccsBWProfilingTimeStr preset parameter name + HccsBWProfilingTimeStr = "hccsBWProfilingTime" + // Hccs log options domain value + Hccs = "hccs" + // Prefix pre statistic info + Prefix = "npu_chip_info_hccs_statistic_info_" + // BwPrefix pre bandwidth info + BwPrefix = "npu_chip_info_hccs_bandwidth_info_" + // AscendDeviceInfo + AscendDeviceInfo = "ASCEND_VISIBLE_DEVICES" +) + +const ( + // AscendJobKind is the kind name + AscendJobKind = "AscendJob" + // DefaultContainerName the default container name for AscendJob. + DefaultContainerName = "ascend" + // DefaultPortName is name of the port used to communicate between other process. + DefaultPortName = "ascendjob-port" + // ControllerName is the name of controller,used in log. 
+ ControllerName = "ascendjob-controller" + // OperatorName name of operator + OperatorName = "ascend-operator" + // LogModuleName name of log module + LogModuleName = "hwlog" + // OperatorLogFilePath Operator log file name + OperatorLogFilePath = "/var/log/mindx-dl/ascend-operator/ascend-operator.log" +) diff --git a/mind-cluster/component/ascend-common/api/publicfault.go b/mind-cluster/component/ascend-common/api/publicfault.go new file mode 100644 index 0000000..8561145 --- /dev/null +++ b/mind-cluster/component/ascend-common/api/publicfault.go @@ -0,0 +1,32 @@ +// Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. + +// Package api structs for public fault +package api + +// PubFaultInfo struct for public fault input +type PubFaultInfo struct { + Id string `json:"id"` + TimeStamp int64 `json:"timestamp"` + Version string `json:"version"` + Resource string `json:"resource"` + Faults []Fault `json:"faults"` +} + +// Fault public fault cm item Fault +type Fault struct { + FaultId string `json:"faultId"` + FaultType string `json:"faultType"` + FaultCode string `json:"faultCode"` + FaultTime int64 `json:"faultTime"` + Assertion string `json:"assertion"` + FaultLocation map[string]string `json:"faultLocation"` + Influence []Influence `json:"influence"` + Description string `json:"description"` +} + +// Influence public fault cm item Influence +type Influence struct { + NodeName string `json:"nodeName"` + NodeSN string `json:"nodeSN"` + DeviceIds []int32 `json:"deviceIds"` +} diff --git a/mind-cluster/component/ascend-common/api/slownet/fault_net.go b/mind-cluster/component/ascend-common/api/slownet/fault_net.go new file mode 100644 index 0000000..eacde6a --- /dev/null +++ b/mind-cluster/component/ascend-common/api/slownet/fault_net.go @@ -0,0 +1,77 @@ +// Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. 
+
+// Package slownet holds path helpers shared by the network fault detection components.
+package slownet
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+
+	"ascend-common/common-utils/hwlog"
+	"ascend-common/common-utils/utils"
+)
+
+const (
+	// rasNetRootPathKey is the environment variable that holds the root directory.
+	rasNetRootPathKey = "RAS_NET_ROOT_PATH"
+	// netFaultSubPath is the directory directly below the root used by every path built here.
+	netFaultSubPath = "cluster"
+	// detectConf is the file name returned by GetConfigPathForDetect.
+	detectConf = "cathelper.conf"
+)
+
+// GetRasNetRootPath reads the root directory from the RAS_NET_ROOT_PATH environment variable.
+// It returns an error when the variable is empty, when utils.IsDir rejects the value,
+// or when utils.CheckPath fails; on success it returns the path produced by utils.CheckPath.
+func GetRasNetRootPath() (string, error) {
+	rootPath := os.Getenv(rasNetRootPathKey)
+	if len(rootPath) == 0 {
+		return "", fmt.Errorf("env %s not exists, please config it before starting", rasNetRootPathKey)
+	}
+	if !utils.IsDir(rootPath) {
+		return "", fmt.Errorf("env %s=%s, which is not dir", rasNetRootPathKey, rootPath)
+	}
+	safeRootPath, err := utils.CheckPath(rootPath)
+	if err != nil {
+		return "", fmt.Errorf("env %s=%s, which is invalid, err: %v", rasNetRootPathKey, rootPath, err)
+	}
+	return safeRootPath, nil
+}
+
+// GetPingListFilePath returns the ping-list task file used for ping mesh:
+// <root>/cluster/super-pod-<superPodId>/ping_list_<serverIndex>.json.
+// NOTE(review): unlike GetSuperPodInfoFilePath, the joined path is not passed through
+// utils.CheckPath here - confirm that callers validate superPodId and serverIndex.
+func GetPingListFilePath(superPodId, serverIndex string) (string, error) {
+	rootPath, err := GetRasNetRootPath()
+	if err != nil {
+		return "", err
+	}
+	return filepath.Join(rootPath, netFaultSubPath, fmt.Sprintf("super-pod-%s", superPodId),
+		fmt.Sprintf("ping_list_%s.json", serverIndex)), nil
+}
+
+// GetSuperPodInfoFilePath returns the super pod info file path:
+// <root>/cluster/<superPodPrefix>-<superPodID>/<superPodPrefix>-<superPodID>.json.
+// Failures are logged through hwlog and returned to the caller.
+func GetSuperPodInfoFilePath(superPodID, superPodPrefix string) (string, error) {
+	rootPath, err := GetRasNetRootPath()
+	if err != nil {
+		hwlog.RunLog.Errorf("get ras net root path failed, err : %v", err)
+		return "", err
+	}
+	superPodPathName := fmt.Sprintf("%s-%s", superPodPrefix, superPodID)
+	fileName := fmt.Sprintf("%s.json", superPodPathName)
+	filePath := filepath.Join(rootPath, netFaultSubPath, superPodPathName, fileName)
+	// CheckPath is used only as a validity gate; the joined path, not CheckPath's result, is returned.
+	if _, errInfo := utils.CheckPath(filePath); errInfo != nil {
+		hwlog.RunLog.Errorf("file path is invalid, err: %v", errInfo)
+		return "", errInfo
+	}
+	return filePath, nil
+}
+
+// GetConfigPathForDetect the config
path for network fault detect so +func GetConfigPathForDetect(superPodId string) (string, error) { + rasNetRootPath, err := GetRasNetRootPath() + if err != nil { + hwlog.RunLog.Errorf("get ras net root path failed, err: %v", err) + return "", err + } + confPath := filepath.Join(rasNetRootPath, netFaultSubPath, fmt.Sprintf("super-pod-%s", superPodId), detectConf) + if _, errInfo := utils.CheckPath(confPath); errInfo != nil { + hwlog.RunLog.Errorf("file path is invalid, err: %v", errInfo) + return "", errInfo + } + return confPath, nil +} diff --git a/mind-cluster/component/ascend-common/api/superpoddevice.go b/mind-cluster/component/ascend-common/api/superpoddevice.go new file mode 100644 index 0000000..4039dcb --- /dev/null +++ b/mind-cluster/component/ascend-common/api/superpoddevice.go @@ -0,0 +1,36 @@ +// Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. + +// Package api structs for SuperPodDevice +package api + +import "k8s.io/apimachinery/pkg/util/sets" + +// NpuBaseInfo is the base info of npu +type NpuBaseInfo struct { + IP string + SuperDeviceID uint32 +} + +// NodeDevice node device info +type NodeDevice struct { + NodeName string + ServerID string + ServerType string `json:"-"` + DeviceMap map[string]string // key: dev phyID, value: superPod device id +} + +// SuperPodDevice super node device info, key is superPodID, value is NodeDevice +type SuperPodDevice struct { + Version string + SuperPodID string + NodeDeviceMap map[string]*NodeDevice +} + +// SuperPodFaultInfos super pod fault info +type SuperPodFaultInfos struct { + SdIds []string + FaultNodes sets.String + NodeNames []string + FaultTimes int64 + JobId string `json:"JobId,omitempty"` +} diff --git a/mind-cluster/component/ascend-common/api/type.go b/mind-cluster/component/ascend-common/api/type.go new file mode 100644 index 0000000..9a2cde1 --- /dev/null +++ b/mind-cluster/component/ascend-common/api/type.go @@ -0,0 +1,30 @@ +// Copyright (c) Huawei Technologies Co., 
Ltd. 2025-2025. All rights reserved. + +// Package api common const +package api + +// ResetCmInfo is the reset config info of a task +type ResetCmInfo struct { + RankList []*DevFaultnfo + UpdateTime int64 + RetryTime int + FaultFlushing bool + GracefulExit int + RestartFaultProcess bool +} + +// DevFaultnfo is the device info of a task +type DevFaultnfo struct { + RankId int + FaultInfo +} + +// FaultInfo is the fault info of device +type FaultInfo struct { + LogicId int32 + Status string + Policy string + InitialPolicy string + ErrorCode []int64 + ErrorCodeHex string +} diff --git a/mind-cluster/component/ascend-common/common-utils/cache/lrucache.go b/mind-cluster/component/ascend-common/common-utils/cache/lrucache.go new file mode 100644 index 0000000..0c0d420 --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/cache/lrucache.go @@ -0,0 +1,394 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/
+
+// Package cache implements a memory-based LRU local cache.
+package cache
+
+import (
+	"container/list"
+	"errors"
+	"fmt"
+	"math"
+	"sync"
+	"time"
+)
+
+const (
+	// segmentCount is the fixed number of buckets held by a ConcurrencyLRUCache.
+	segmentCount = 16
+	int64One    int64 = 1
+	int64Zero   int64 = 0
+	// negInt64One is the expire-time sentinel that means "never expires".
+	negInt64One int64 = -1
+	intTwo = 2
+	// hashInit and prime32 are the 32-bit FNV offset basis and FNV prime used by index.
+	hashInit uint32 = 2166136261
+	prime32  uint32 = 16777619
+	// twentyYears is the largest expireTime accepted by Set and SetIfNX.
+	twentyYears time.Duration = 20 * 365 * 24 * time.Hour
+)
+
+var (
+	// notInitErr is returned when the cache (or its first bucket) is nil.
+	notInitErr = errors.New("not initializes")
+	// paraErr reports an invalid argument; validate also returns it for an uninitialized cache.
+	paraErr = errors.New("parameter error")
+)
+
+// cacheEle is the payload stored in the Value of each list element of a bucket.
+type cacheEle struct {
+	key  string
+	data interface{}
+	// expireTime is an absolute deadline in Unix nanoseconds, or negInt64One for no expiry.
+	expireTime int64
+}
+
+// lruCache is one bucket: a linked list of entries plus a key index, guarded by mu.
+// Entries are moved to the front of the list whenever they are read or updated.
+type lruCache struct {
+	maxSize   int
+	elemIndex map[string]*list.Element
+	*list.List
+	mu sync.Mutex
+}
+
+// ConcurrencyLRUCache is a memory-based LRU local cache split into 16 buckets to improve concurrent performance.
+// The LRU order is not global: it is kept per bucket only,
+// which is sufficient as a strategy for clearing the cache.
+type ConcurrencyLRUCache struct {
+	segment    int
+	cacheBuket [segmentCount]*lruCache
+}
+
+// Set creates or updates an element using key.
+// key: the identity of an element
+// value: new value of the element
+// expireTime: time to live; -1 means the element never expires, otherwise it must lie in [0, twentyYears]
+// It returns notInitErr for a nil or uninitialized cache, paraErr for an expireTime outside that range,
+// and otherwise the result of the bucket's setValue.
+func (cl *ConcurrencyLRUCache) Set(key string, value interface{}, expireTime time.Duration) error {
+	if cl == nil || cl.cacheBuket[0] == nil {
+		return notInitErr
+	}
+	// Only values below -1 or above twentyYears are rejected, so 0 is accepted here.
+	if expireTime < time.Duration(negInt64One) || expireTime > twentyYears {
+		return paraErr
+	}
+	cacheIndex := cl.index(key)
+	if cacheIndex < 0 || cacheIndex >= segmentCount {
+		return errors.New("index out of valid value")
+	}
+	return cl.cacheBuket[cacheIndex].setValue(key, value, expireTime)
+}
+
+// Get get the value of a cached element by key.
+// If the key does not exist, nil and an error are returned.
+// key: the identity of an element
+// return:
+// value: the cached value, nil if the key does not exist or has expired
+// err: notInitErr for a nil or uninitialized cache, otherwise the error reported by the bucket; nil on success
+func (cl *ConcurrencyLRUCache) Get(key string) (interface{}, error) {
+	if cl == nil || cl.cacheBuket[0] == nil {
+		return nil, notInitErr
+	}
+	cacheIndex := cl.index(key)
+	if cacheIndex < 0 || cacheIndex >= segmentCount {
+		return nil, errors.New("index out of valid value")
+	}
+	return cl.cacheBuket[cacheIndex].getValue(key)
+}
+
+// Delete deletes the value stored under key; no error is returned.
+// It is a silent no-op for a nil or uninitialized cache and for an out-of-range bucket index.
+func (cl *ConcurrencyLRUCache) Delete(key string) {
+	if cl == nil || cl.cacheBuket[0] == nil {
+		return
+	}
+	cacheIndex := cl.index(key)
+	if cacheIndex < 0 || cacheIndex >= segmentCount {
+		return
+	}
+	cl.cacheBuket[cacheIndex].delValue(key)
+}
+
+// SetIfNX stores value only if the key does not exist or has expired and then returns true; otherwise it returns false.
+// It also returns false, without storing anything, for a nil or uninitialized cache, for an expireTime
+// below -1 or above twentyYears, and for an out-of-range bucket index.
+func (cl *ConcurrencyLRUCache) SetIfNX(key string, value interface{}, expireTime time.Duration) bool {
+	if cl == nil || cl.cacheBuket[0] == nil {
+		return false
+	}
+	if expireTime < time.Duration(negInt64One) || expireTime > twentyYears {
+		return false
+	}
+	cacheIndex := cl.index(key)
+	if cacheIndex < 0 || cacheIndex >= segmentCount {
+		return false
+	}
+	return cl.cacheBuket[cacheIndex].setIfNotExist(key, value, expireTime)
+}
+
+// INCR adds one to the value (must be int64) of the key; if the key does not exist, it is initialized with 0 and then incremented.
+// The arguments are checked by validate first, so expireTime must be positive or -1.
+func (cl *ConcurrencyLRUCache) INCR(key string, expireTime time.Duration) (int64, error) {
+	if err := validate(cl, expireTime); err != nil {
+		return 0, err
+	}
+	cacheIndex := cl.index(key)
+	if cacheIndex < 0 || cacheIndex >= segmentCount {
+		return 0, errors.New("index out of valid value")
+	}
+	return cl.cacheBuket[cacheIndex].increment(key, expireTime)
+}
+
+// DECR minus one to the value(must int64) of the key,if the key not exist, initialize with 0 and then
+// minus one.
+// The arguments are checked by validate first, so expireTime must be positive or -1.
+func (cl *ConcurrencyLRUCache) DECR(key string, expireTime time.Duration) (int64, error) {
+	if err := validate(cl, expireTime); err != nil {
+		return 0, err
+	}
+	cacheIndex := cl.index(key)
+	if cacheIndex < 0 || cacheIndex >= segmentCount {
+		return 0, errors.New("index out of valid value")
+	}
+	return cl.cacheBuket[cacheIndex].decrement(key, expireTime)
+}
+
+// validate checks the arguments shared by INCR and DECR.
+// It returns paraErr when the cache is nil or its first bucket is unset, and when
+// expireTime is neither positive nor -1. No upper bound is applied to expireTime.
+// NOTE(review): Set and Get report an uninitialized cache with notInitErr, this
+// helper reports it with paraErr - confirm whether that difference is intended.
+func validate(cl *ConcurrencyLRUCache, expireTime time.Duration) error {
+	if cl == nil || cl.cacheBuket[0] == nil {
+		return paraErr
+	}
+	if expireTime <= 0 && expireTime != time.Duration(negInt64One) {
+		return paraErr
+	}
+	return nil
+}
+
+// index hashes the key and maps the hash onto a bucket number.
+// The hash starts at hashInit and, for every byte of the key, is multiplied by
+// prime32 and then XOR-ed with the byte. The result is masked with segment-1,
+// which is a valid bucket number because New sets segment to segmentCount (16).
+func (cl *ConcurrencyLRUCache) index(key string) int {
+	var hash = hashInit
+	for i := 0; i < len(key); i++ {
+		hash *= prime32
+		hash ^= uint32(key[i])
+	}
+	return int(hash & (uint32(cl.segment) - 1))
+}
+
+// New creates an instance of ConcurrencyLRUCache.
+// maxEntries is the requested cache size. It is split over the 16 buckets, each
+// bucket getting maxSize = maxEntries/16 rounded up, so the sum of the bucket
+// sizes is maxEntries rounded up to the next multiple of 16.
+// New returns nil when maxEntries is not positive.
+func New(maxEntries int) *ConcurrencyLRUCache {
+	if maxEntries <= 0 {
+		return nil
+	}
+	size := maxEntries / segmentCount
+	remain := maxEntries % segmentCount
+	if remain > 0 {
+		size++
+	}
+	var cache [segmentCount]*lruCache
+	for i := 0; i < segmentCount; i++ {
+		cache[i] = &lruCache{
+			maxSize:   size,
+			elemIndex: make(map[string]*list.Element, segmentCount),
+			List:      list.New(),
+			mu:        sync.Mutex{},
+		}
+	}
+	return &ConcurrencyLRUCache{
+		segment:    segmentCount,
+		cacheBuket: cache,
+	}
+}
+
+// setValue creates or overwrites the entry for key in this bucket.
+// An existing entry is moved to the front of the list and refreshed in place.
+// An entry whose payload is not a *cacheEle is removed and reported as an error.
+func (c *lruCache) setValue(key string, value interface{}, expireTime time.Duration) error {
+	if c == nil || c.elemIndex == nil {
+		// Return the shared sentinel (same message as before) so that this
+		// path matches Set, Get and increment.
+		return notInitErr
+	}
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	v, ok := c.elemIndex[key]
+	if !ok {
+		// the key is not cached yet
+		c.setInner(key, value, expireTime)
+		return nil
+	}
+	ele, ok := v.Value.(*cacheEle)
+	if !ok {
+		c.safeDeleteByKey(key, v)
+		return errors.New("cacheElement convert failed")
+	}
+	c.MoveToFront(v)
+ pkgElement(ele, value, expireTime) + return nil +} + +func pkgElement(ele *cacheEle, value interface{}, expireTime time.Duration) { + ele.data = value + if expireTime == time.Duration(negInt64One) { + ele.expireTime = negInt64One + return + } + ele.expireTime = time.Now().UnixNano() + int64(expireTime) +} + +func (c *lruCache) getValue(key string) (interface{}, error) { + if c == nil || c.elemIndex == nil { + return nil, errors.New("not initializes") + } + c.mu.Lock() + defer c.mu.Unlock() + v, ok := c.elemIndex[key] + if !ok { + return nil, errors.New("no value found") + } + c.MoveToFront(v) + ele, ok := v.Value.(*cacheEle) + if !ok { + c.safeDeleteByKey(key, v) + return nil, errors.New("cacheElement convert failed") + } + if ele.expireTime != negInt64One && time.Now().UnixNano() > ele.expireTime { + // if cache expired + c.safeDeleteByKey(key, v) + return nil, errors.New("the key was expired") + } + return ele.data, nil +} + +// Delete delete an element +func (c *lruCache) delValue(key string) { + if c == nil || c.elemIndex == nil { + return + } + c.mu.Lock() + defer c.mu.Unlock() + if v, ok := c.elemIndex[key]; ok { + c.safeDeleteByKey(key, v) + } +} + +func (c *lruCache) setIfNotExist(key string, value interface{}, expireTime time.Duration) bool { + if c == nil || c.elemIndex == nil { + return false + } + c.mu.Lock() + defer c.mu.Unlock() + v, ok := c.elemIndex[key] + if !ok { + // if the cache not exist + c.setInner(key, value, expireTime) + return true + } + ele, ok := v.Value.(*cacheEle) + if !ok { + c.safeDeleteByKey(key, v) + return false + } + c.MoveToFront(v) + if ele.expireTime == negInt64One || time.Now().UnixNano() < ele.expireTime { + return false + } + // if cache expired + pkgElement(ele, value, expireTime) + return true +} + +func (c *lruCache) increment(key string, expireTime time.Duration) (int64, error) { + if c == nil || c.elemIndex == nil { + return 0, notInitErr + } + c.mu.Lock() + defer c.mu.Unlock() + v, ok := c.elemIndex[key] + if !ok { 
+ c.setInner(key, int64One, expireTime) + return int64One, nil + } + ele, ok := v.Value.(*cacheEle) + if !ok { + c.safeDeleteByKey(key, v) + c.setInner(key, int64One, expireTime) + return int64One, nil + } + c.MoveToFront(v) + if ele.expireTime == negInt64One || time.Now().UnixNano() < ele.expireTime { + newValue, ok := ele.data.(int64) + if !ok || newValue == math.MaxInt64 { + return 0, fmt.Errorf("the cache value is not valid, ok:%v", ok) + } + newValue++ + pkgElement(ele, newValue, expireTime) + return newValue, nil + } + // if cache expired + pkgElement(ele, int64One, expireTime) + return int64One, nil +} + +func (c *lruCache) decrement(key string, expireTime time.Duration) (int64, error) { + if c == nil || c.elemIndex == nil { + return 0, notInitErr + } + c.mu.Lock() + defer c.mu.Unlock() + v, ok := c.elemIndex[key] + if !ok { + // if the cache not exist + c.setInner(key, negInt64One, expireTime) + return negInt64One, nil + } + ele, ok := v.Value.(*cacheEle) + if !ok { + c.safeDeleteByKey(key, v) + c.setInner(key, negInt64One, expireTime) + return negInt64One, nil + } + c.MoveToFront(v) + if ele.expireTime == negInt64One || time.Now().UnixNano() < ele.expireTime { + newValue, ok := ele.data.(int64) + if !ok || newValue == math.MinInt64 { + return 0, fmt.Errorf("the cache value is not valid, ok:%v", ok) + } + newValue-- + pkgElement(ele, newValue, expireTime) + return newValue, nil + } + // if cache expired + pkgElement(ele, negInt64One, expireTime) + return negInt64One, nil +} + +func (c *lruCache) setInner(key string, value interface{}, expireTime time.Duration) { + if c == nil { + return + } + if c.Len()+1 > c.maxSize { + c.safeRemoveOldest() + } + newElem := &cacheEle{ + key: key, + data: value, + expireTime: negInt64One, + } + if expireTime != time.Duration(negInt64One) { + newElem.expireTime = time.Now().UnixNano() + int64(expireTime) + } + e := c.PushFront(newElem) + c.elemIndex[key] = e +} + +func (c *lruCache) safeDeleteByKey(key string, v 
*list.Element) { + if c == nil { + return + } + c.List.Remove(v) + delete(c.elemIndex, key) +} + +func (c *lruCache) safeRemoveOldest() { + if c == nil { + return + } + v := c.List.Back() + if v == nil { + return + } + c.List.Remove(v) + ele, ok := v.Value.(*cacheEle) + if !ok { + return + } + delete(c.elemIndex, ele.key) +} diff --git a/mind-cluster/component/ascend-common/common-utils/cache/lrucache_test.go b/mind-cluster/component/ascend-common/common-utils/cache/lrucache_test.go new file mode 100644 index 0000000..a8b5ea0 --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/cache/lrucache_test.go @@ -0,0 +1,304 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/
+
+// Package cache implement a memory-based LRU local cache
+package cache
+
+import (
+	"container/list"
+	"fmt"
+	"math"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/smartystreets/goconvey/convey"
+)
+
+const (
+	// cacheTime is the entry lifetime, in milliseconds, used by the expiration tests
+	cacheTime = 500
+	// goRoutineCount is the number of goroutines started by the concurrency tests
+	goRoutineCount = 10
+)
+
+// TestSet covers Set/Get: expiration, overwriting, uninitialized buckets, the never-expire
+// sentinel and invalid expiration values.
+func TestSet(t *testing.T) {
+	cache := New(1)
+	convey.Convey("test lru cacheTime", t, func() {
+		cache.Set("testkey1", "1", cacheTime*time.Millisecond)
+		v, err := cache.Get("testkey1")
+		convey.So(err, convey.ShouldEqual, nil)
+		convey.So(v, convey.ShouldEqual, "1")
+		time.Sleep(cacheTime * time.Millisecond)
+		v, err = cache.Get("testkey1")
+		convey.So(v, convey.ShouldEqual, nil)
+		convey.So(err, convey.ShouldNotEqual, nil)
+	})
+	convey.Convey("test set twice", t, func() {
+		cache.Set("testkey1", "1", time.Minute)
+		v, err := cache.Get("testkey1")
+		convey.So(err, convey.ShouldEqual, nil)
+		convey.So(v, convey.ShouldEqual, "1")
+		cache.Set("testkey1", "2", time.Minute)
+		v, err = cache.Get("testkey1")
+		convey.So(err, convey.ShouldEqual, nil)
+		convey.So(v, convey.ShouldEqual, "2")
+	})
+	convey.Convey("SET failed", t, func() {
+		c := &lruCache{}
+		err := c.setValue("test", "1", time.Minute)
+		convey.So(err.Error(), convey.ShouldEqual, "not initializes")
+		_, err = c.getValue("test")
+		convey.So(err.Error(), convey.ShouldEqual, "not initializes")
+	})
+	convey.Convey("SET not expired", t, func() {
+		cache.Set("testkey2", "1", time.Second)
+		err := cache.Set("testkey2", "1", time.Duration(negInt64One))
+		convey.So(err, convey.ShouldEqual, nil)
+		v, err := cache.Get("testkey2")
+		convey.So(err, convey.ShouldEqual, nil)
+		convey.So(v, convey.ShouldEqual, "1")
+	})
+	convey.Convey("SET parameter error", t, func() {
+		err := cache.Set("testkey2", "1", -time.Second)
+		convey.So(err.Error(), convey.ShouldEqual, "parameter error")
+	})
+}
+
+// TestDelete checks that a deleted key can no longer be read and that deleting from an
+// uninitialized bucket does not panic.
+func TestDelete(t *testing.T) {
+	cache := New(1)
+	convey.Convey("test lru delete", t, func() {
+		cache.Set("testkey1", "1", time.Minute)
+		v, err := cache.Get("testkey1")
+		convey.So(err, convey.ShouldEqual, nil)
+		convey.So(v, convey.ShouldEqual, "1")
+		cache.Delete("testkey1")
+		v, err = cache.Get("testkey1")
+		convey.So(v, convey.ShouldEqual, nil)
+		convey.So(err, convey.ShouldNotEqual, nil)
+	})
+	convey.Convey("Delete no thing", t, func() {
+		c := &lruCache{}
+		c.delValue("test")
+	})
+}
+
+// TestSetIfNX checks that SetIfNX only stores a value when the key is absent or expired.
+func TestSetIfNX(t *testing.T) {
+	cache := New(1)
+	convey.Convey("SetIfNX set parameter error", t, func() {
+		r := cache.SetIfNX("testkey1", "1", -time.Millisecond)
+		convey.So(r, convey.ShouldEqual, false)
+	})
+	convey.Convey("SetIfNX set success", t, func() {
+		r := cache.SetIfNX("testkey1", "1", cacheTime*time.Millisecond)
+		convey.So(r, convey.ShouldEqual, true)
+	})
+	convey.Convey("SetIfNX set success failed", t, func() {
+		r := cache.SetIfNX("testkey1", "1", cacheTime*time.Millisecond)
+		convey.So(r, convey.ShouldEqual, false)
+	})
+	time.Sleep(cacheTime * time.Millisecond)
+	convey.Convey("SetIfNX set success", t, func() {
+		r := cache.SetIfNX("testkey1", "1", time.Second)
+		convey.So(r, convey.ShouldEqual, true)
+	})
+	convey.Convey("SetIfNX expireTime -1", t, func() {
+		r := cache.SetIfNX("testkey", "1", time.Duration(negInt64One))
+		convey.So(r, convey.ShouldEqual, true)
+		r = cache.SetIfNX("testkey", "1", time.Duration(negInt64One))
+		convey.So(r, convey.ShouldEqual, false)
+	})
+}
+
+// TestSetIfNXConcurrencyTest checks that exactly one of several concurrent SetIfNX calls wins.
+func TestSetIfNXConcurrencyTest(t *testing.T) {
+	cache := New(1)
+	convey.Convey("SetIfNX concurrency test", t, func() {
+		var count = 0
+		count = testSetIfNX(cache, count)
+		convey.So(count, convey.ShouldEqual, 1)
+	})
+}
+
+// testSetIfNX runs goRoutineCount concurrent SetIfNX calls on one key and returns count
+// increased by the number of calls that reported success.
+func testSetIfNX(cache *ConcurrencyLRUCache, count int) int {
+	l := sync.Mutex{}
+	wg := sync.WaitGroup{}
+	wg.Add(goRoutineCount)
+	for i := 0; i < goRoutineCount; i++ {
+		go func() {
+			defer wg.Done()
+			r := cache.SetIfNX("testkey2", "1", time.Second)
+			if r {
+				l.Lock()
+				count++
+				l.Unlock()
+			}
+		}()
+	}
+	wg.Wait()
+	return count
+}
+
+// TestINCRConcurrencyTest checks that concurrent INCR calls on one key count up to goRoutineCount.
+func TestINCRConcurrencyTest(t *testing.T) {
+	cache := New(1)
+	convey.Convey("INCR concurrency test", t, func() {
+		got := testIncr(cache)
+		convey.So(got, convey.ShouldEqual, goRoutineCount)
+	})
+}
+
+// testIncr runs goRoutineCount concurrent INCR calls on one key and returns the largest
+// value any of them observed.
+func testIncr(cache *ConcurrencyLRUCache) int64 {
+	var maxSeen = int64Zero
+	l := sync.Mutex{}
+	wg := sync.WaitGroup{}
+	wg.Add(goRoutineCount)
+	for i := 0; i < goRoutineCount; i++ {
+		go func() {
+			// always release the WaitGroup: returning on error without Done would block wg.Wait forever
+			defer wg.Done()
+			r, err := cache.INCR("testkey1", time.Second)
+			if err != nil {
+				return
+			}
+			l.Lock()
+			if r > maxSeen {
+				maxSeen = r
+			}
+			l.Unlock()
+		}()
+	}
+	wg.Wait()
+	return maxSeen
+}
+
+// TestDECRConcurrencyTest checks that concurrent DECR calls count a preset value down to zero.
+func TestDECRConcurrencyTest(t *testing.T) {
+	cache := New(1)
+	cache.Set("testkey1", int64(goRoutineCount), time.Minute)
+	convey.Convey("DECR concurrency test", t, func() {
+		got := testDecr(cache)
+		convey.So(got, convey.ShouldEqual, 0)
+	})
+}
+
+// testDecr runs goRoutineCount concurrent DECR calls on one key and returns the smallest
+// value any of them observed.
+func testDecr(cache *ConcurrencyLRUCache) int64 {
+	var minSeen = int64(math.MaxInt)
+	l := sync.Mutex{}
+	wg := sync.WaitGroup{}
+	wg.Add(goRoutineCount)
+	for i := 0; i < goRoutineCount; i++ {
+		go func() {
+			// always release the WaitGroup: returning on error without Done would block wg.Wait forever
+			defer wg.Done()
+			r, err := cache.DECR("testkey1", time.Second)
+			if err != nil {
+				return
+			}
+			l.Lock()
+			if r < minSeen {
+				minSeen = r
+			}
+			l.Unlock()
+		}()
+	}
+	wg.Wait()
+	return minSeen
+}
+
+// TestINCR covers INCR on an uninitialized bucket, with an invalid expiration, on a fresh
+// key, on an existing key and on an expired key.
+func TestINCR(t *testing.T) {
+	cache := New(1)
+	convey.Convey("not initializes", t, func() {
+		c := &lruCache{}
+		_, err := c.increment("test", time.Minute)
+		convey.So(err, convey.ShouldEqual, notInitErr)
+	})
+	convey.Convey("parameter error", t, func() {
+		_, err := cache.INCR("testkey", -time.Minute)
+		convey.So(err, convey.ShouldEqual, paraErr)
+	})
+	convey.Convey("INCR success", t, func() {
+		r, err := cache.INCR("testkey", time.Minute)
+		convey.So(r, convey.ShouldEqual, 1)
+		convey.So(err, convey.ShouldEqual, nil)
+		r, err = cache.INCR("testkey", time.Minute)
+		convey.So(r, convey.ShouldEqual, intTwo)
+		convey.So(err, convey.ShouldEqual, nil)
+	})
+
+	convey.Convey("INCR success when exits", t, func() {
+		cache.Set("testkey1", int64Zero, cacheTime*time.Millisecond)
+		r, err := cache.INCR("testkey1", cacheTime*time.Millisecond)
+		convey.So(r, convey.ShouldEqual, 1)
+		convey.So(err, convey.ShouldEqual, nil)
+		time.Sleep(cacheTime * time.Millisecond)
+		// the entry has expired, so counting restarts at one
+		r, err = cache.INCR("testkey1", time.Minute)
+		convey.So(r, convey.ShouldEqual, 1)
+		convey.So(err, convey.ShouldEqual, nil)
+	})
+}
+
+// TestDECR covers DECR on an uninitialized bucket, with an invalid expiration, on a fresh
+// key, on an existing key and on an expired key.
+func TestDECR(t *testing.T) {
+	cache := New(1)
+	convey.Convey("not initializes", t, func() {
+		c := &lruCache{}
+		_, err := c.decrement("test", time.Minute)
+		convey.So(err, convey.ShouldEqual, notInitErr)
+	})
+	convey.Convey("parameter error", t, func() {
+		_, err := cache.DECR("testkey1", -time.Minute)
+		convey.So(err, convey.ShouldEqual, paraErr)
+	})
+	convey.Convey("DECR success", t, func() {
+		r, err := cache.DECR("testkey1", time.Minute)
+		convey.So(r, convey.ShouldEqual, negInt64One)
+		convey.So(err, convey.ShouldEqual, nil)
+		cache.Set("testkey1", int64One, time.Minute)
+		r, err = cache.DECR("testkey1", time.Minute)
+		convey.So(r, convey.ShouldEqual, 0)
+		convey.So(err, convey.ShouldEqual, nil)
+	})
+	convey.Convey("Decr success when exits", t, func() {
+		cache.Set("testkey2", int64One, cacheTime*time.Millisecond)
+		r, err := cache.DECR("testkey2", cacheTime*time.Millisecond)
+		convey.So(r, convey.ShouldEqual, 0)
+		convey.So(err, convey.ShouldEqual, nil)
+		time.Sleep(cacheTime * time.Millisecond)
+		// the entry has expired, so counting restarts at minus one
+		r, err = cache.DECR("testkey2", time.Minute)
+		convey.So(err, convey.ShouldEqual, nil)
+		convey.So(r, convey.ShouldEqual, negInt64One)
+	})
+}
+
+// TestLRU checks that the oldest entry is evicted once a bucket of size two receives a third key.
+func TestLRU(t *testing.T) {
+	convey.Convey("not initializes", t, func() {
+		c := &lruCache{
+			maxSize:   intTwo,
+			elemIndex: make(map[string]*list.Element, segmentCount),
+			List:      list.New(),
+			mu:        sync.Mutex{},
+		}
+		c.setValue("test", "1", time.Minute)
+		c.setValue("test1", "1", time.Minute)
+		c.setValue("test2", "1", time.Minute)
+		_, err := c.getValue("test")
+		convey.So(err.Error(), convey.ShouldEqual, "no value found")
+	})
+}
+
+// BenchmarkSetIfNx measures SetIfNX with a distinct key per iteration.
+func BenchmarkSetIfNx(b *testing.B) {
+	cache := New(1)
+	for n := 0; n < b.N; n++ {
+		cache.SetIfNX(fmt.Sprintf("key%d", n), "xx", time.Second)
+	}
+}
+
+// BenchmarkINCR measures INCR on a single key.
+func BenchmarkINCR(b *testing.B) {
+	cache := New(1)
+	for n := 0; n < b.N; n++ {
+		cache.INCR("sdds", time.Second)
+	}
+}
diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/api.go b/mind-cluster/component/ascend-common/common-utils/hwlog/api.go
new file mode 100644
index 0000000..65de3e7
--- /dev/null
+++ b/mind-cluster/component/ascend-common/common-utils/hwlog/api.go
@@ -0,0 +1,310 @@
+/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+// Package hwlog provides the capability of processing Huawei log rules.
+package hwlog
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"log"
+	"os"
+	"path"
+)
+
+// Log levels in ascending severity. Starting at iota-1 makes debug -1 and info 0.
+const (
+	logDebugLv = iota - 1
+	logInfoLv
+	logWarnLv
+	logErrorLv
+	logCriticalLv
+)
+
+// logger bundles one standard-library logger per level with the level filter, the maximum
+// line length handed to printHelper and the LogLimiter used by FlushMem.
+type logger struct {
+	lgDebug    *log.Logger
+	lgInfo     *log.Logger
+	lgWarn     *log.Logger
+	lgError    *log.Logger
+	lgCritical *log.Logger
+	lgCtrl     *LogLimiter
+	lgLevel    int
+	lgMaxLine  int
+}
+
+// initLogWriter creates the five per-level loggers on top of w, each with its own prefix
+// and a date plus microsecond timestamp.
+func (lg *logger) initLogWriter(w io.Writer) {
+	lg.lgDebug = log.New(w, "[DEBUG]    ", log.Ldate|log.Lmicroseconds)
+	lg.lgInfo = log.New(w, "[INFO]     ", log.Ldate|log.Lmicroseconds)
+	lg.lgWarn = log.New(w, "[WARN]     ", log.Ldate|log.Lmicroseconds)
+	lg.lgError = log.New(w, "[ERROR]    ", log.Ldate|log.Lmicroseconds)
+	lg.lgCritical = log.New(w, "[Critical] ", log.Ldate|log.Lmicroseconds)
+}
+
+// setLoggerLevel sets the level filter; a value outside [minLogLevel, maxLogLevel] falls
+// back to 0, which is logInfoLv.
+func (lg *logger) setLoggerLevel(lv int) {
+	if lv < minLogLevel || lv > maxLogLevel {
+		lg.lgLevel = 0
+		return
+	}
+	lg.lgLevel = lv
+}
+
+// setLoggerMaxLine sets the maximum line length; a value that is not positive or exceeds
+// maxEachLineLen falls back to defaultMaxEachLineLen.
+func (lg *logger) setLoggerMaxLine(lml int) {
+	if lml <= 0 || lml > maxEachLineLen {
+		lg.lgMaxLine = defaultMaxEachLineLen
+		return
+	}
+	lg.lgMaxLine = lml
+}
+
+// setLoggerWriter selects the output according to config: stdout only, the rate-limited
+// rolling file only, or both.
+// NOTE(review): lgCtrl is only assigned when both outputs are used, so FlushMem has no
+// limiter to flush in the stdout-only and file-only modes - confirm that this is intended.
+func (lg *logger) setLoggerWriter(config *LogConfig) {
+	rollLogger := &Logs{
+		FileName:   config.LogFileName,
+		Capacity:   config.FileMaxSize, // megabytes
+		SaveVolume: config.MaxBackups,
+		SaveTime:   config.MaxAge, // days
+	}
+	logWriter := &LogLimiter{
+		Logs:        rollLogger,
+		ExpiredTime: config.ExpiredTime, // seconds
+		CacheSize:   config.CacheSize,
+	}
+	if config.OnlyToStdout {
+		lg.initLogWriter(os.Stdout)
+		return
+	}
+	if config.OnlyToFile {
+		lg.initLogWriter(logWriter)
+		return
+	}
+	writer := io.MultiWriter(os.Stdout, logWriter)
+	lg.initLogWriter(writer)
+	lg.lgCtrl = logWriter
+}
+
+// setLogger validates config and applies writer, level and maximum line length.
+// Unless the logger writes to stdout only, it then logs a success message and changes the
+// mode of the log file to LogFileMode. It returns the validation error, or an error when the
+// file mode cannot be changed.
+func (lg *logger) setLogger(config *LogConfig) error {
+	if err := validateLogConfigFiled(config); err != nil {
+		return err
+	}
+	lg.setLoggerWriter(config)
+	lg.setLoggerLevel(config.LogLevel)
+	lg.setLoggerMaxLine(config.MaxLineLength)
+	// a stdout-only logger has no log file, so the success message and the chmod are skipped
+	if config.OnlyToStdout {
+		return nil
+	}
+	lg.Info(fmt.Sprintf("%s's logger init success", path.Base(config.LogFileName)))
+	if err := os.Chmod(config.LogFileName, LogFileMode); err != nil {
+		lg.Errorf("change file mode failed: %v", err)
+		return fmt.Errorf("set log file mode failed")
+	}
+	return nil
+}
+
+// isInit reports whether all five per-level loggers have been created.
+func (lg *logger) isInit() bool {
+	return lg.lgDebug != nil && lg.lgInfo != nil && lg.lgWarn != nil && lg.lgError != nil && lg.lgCritical != nil
+}
+
+// Debug logs args at debug level, formatted like fmt.Sprint, without a context.
+func (lg *logger) Debug(args ...interface{}) {
+	lg.DebugWithCtx(nil, args...)
+}
+
+// Debugf logs a formatted message at debug level without a context.
+func (lg *logger) Debugf(format string, args ...interface{}) {
+	lg.DebugfWithCtx(nil, format, args...)
+}
+
+// DebugWithCtx logs args at debug level, formatted like fmt.Sprint, together with ctx.
+func (lg *logger) DebugWithCtx(ctx context.Context, args ...interface{}) {
+	// a nil logger skips the level filter so that validate reports it instead of a nil dereference
+	if lg != nil && lg.lgLevel > logDebugLv {
+		return
+	}
+	if lg.validate() {
+		printHelper(lg.lgDebug, fmt.Sprint(args...), lg.lgMaxLine, ctx)
+	}
+}
+
+// DebugfWithCtx logs a formatted message at debug level together with ctx.
+func (lg *logger) DebugfWithCtx(ctx context.Context, format string, args ...interface{}) {
+	if lg != nil && lg.lgLevel > logDebugLv {
+		return
+	}
+	if lg.validate() {
+		printHelper(lg.lgDebug, fmt.Sprintf(format, args...), lg.lgMaxLine, ctx)
+	}
+}
+
+// Info logs args at info level, formatted like fmt.Sprint, without a context.
+func (lg *logger) Info(args ...interface{}) {
+	lg.InfoWithCtx(nil, args...)
+}
+
+// Infof logs a formatted message at info level without a context.
+func (lg *logger) Infof(format string, args ...interface{}) {
+	lg.InfofWithCtx(nil, format, args...)
+}
+
+// InfoWithCtx logs args at info level, formatted like fmt.Sprint, together with ctx.
+// Use Info when there is no context.
+func (lg *logger) InfoWithCtx(ctx context.Context, args ...interface{}) {
+	if lg != nil && lg.lgLevel > logInfoLv {
+		return
+	}
+	if lg.validate() {
+		printHelper(lg.lgInfo, fmt.Sprint(args...), lg.lgMaxLine, ctx)
+	}
+}
+
+// InfofWithCtx logs a formatted message at info level together with ctx.
+// Use Infof when there is no context.
+func (lg *logger) InfofWithCtx(ctx context.Context, format string, args ...interface{}) {
+	if lg != nil && lg.lgLevel > logInfoLv {
+		return
+	}
+	if lg.validate() {
+		printHelper(lg.lgInfo, fmt.Sprintf(format, args...), lg.lgMaxLine, ctx)
+	}
+}
+
+// Warn logs args at warn level, formatted like fmt.Sprint, without a context.
+func (lg *logger) Warn(args ...interface{}) {
+	lg.WarnWithCtx(nil, args...)
+}
+
+// Warnf logs a formatted message at warn level without a context.
+func (lg *logger) Warnf(format string, args ...interface{}) {
+	lg.WarnfWithCtx(nil, format, args...)
+}
+
+// WarnWithCtx logs args at warn level, formatted like fmt.Sprint, together with ctx.
+// Use Warn when there is no context.
+func (lg *logger) WarnWithCtx(ctx context.Context, args ...interface{}) {
+	if lg != nil && lg.lgLevel > logWarnLv {
+		return
+	}
+	if lg.validate() {
+		printHelper(lg.lgWarn, fmt.Sprint(args...), lg.lgMaxLine, ctx)
+	}
+}
+
+// WarnfWithCtx logs a formatted message at warn level together with ctx.
+// Use Warnf when there is no context.
+func (lg *logger) WarnfWithCtx(ctx context.Context, format string, args ...interface{}) {
+	if lg != nil && lg.lgLevel > logWarnLv {
+		return
+	}
+	if lg.validate() {
+		printHelper(lg.lgWarn, fmt.Sprintf(format, args...), lg.lgMaxLine, ctx)
+	}
+}
+
+// WarnfWithLimit logs a formatted warning at most ProblemOccurMaxNumbers times (default 3)
+// per domain and id. domain is the log type of the message and id uniquely identifies the
+// occurrence within that type; the counter can be reset by calling ResetErrCnt.
+func (lg *logger) WarnfWithLimit(domain string, id interface{}, format string, args ...interface{}) {
+	if needPrint, extraErrLog := IsNeedPrintWithSpecifiedCounts(domain, id, ProblemOccurMaxNumbers); needPrint {
+		format = fmt.Sprintf("%s %s", format, extraErrLog)
+		lg.WarnfWithCtx(nil, format, args...)
+	}
+}
+
+// Error logs args at error level, formatted like fmt.Sprint, without a context.
+func (lg *logger) Error(args ...interface{}) {
+	lg.ErrorWithCtx(nil, args...)
+}
+
+// Errorf logs a formatted message at error level without a context.
+func (lg *logger) Errorf(format string, args ...interface{}) {
+	lg.ErrorfWithCtx(nil, format, args...)
+}
+
+// ErrorfWithLimit logs a formatted error at most ProblemOccurMaxNumbers times (default 3)
+// per domain and id. domain is the log type of the message and id uniquely identifies the
+// occurrence within that type; the counter can be reset by calling ResetErrCnt.
+func (lg *logger) ErrorfWithLimit(domain string, id interface{}, format string, args ...interface{}) {
+	if needPrint, extraErrLog := IsNeedPrintWithSpecifiedCounts(domain, id, ProblemOccurMaxNumbers); needPrint {
+		format = fmt.Sprintf("%s %s", format, extraErrLog)
+		lg.ErrorfWithCtx(nil, format, args...)
+	}
+}
+
+// ErrorfWithSpecifiedCounts logs a formatted error at most maxCounts times per domain and id.
+// domain is the log type of the message and id uniquely identifies the occurrence within
+// that type; the counter can be reset by calling ResetErrCnt.
+func (lg *logger) ErrorfWithSpecifiedCounts(domain string, id interface{}, maxCounts int,
+	format string, args ...interface{}) {
+	if needPrint, extraErrLog := IsNeedPrintWithSpecifiedCounts(domain, id, maxCounts); needPrint {
+		format = fmt.Sprintf("%s %s", format, extraErrLog)
+		lg.ErrorfWithCtx(nil, format, args...)
+	}
+}
+
+// ErrorWithCtx logs args at error level, formatted like fmt.Sprint, together with ctx.
+// Use Error when there is no context.
+func (lg *logger) ErrorWithCtx(ctx context.Context, args ...interface{}) {
+	if lg != nil && lg.lgLevel > logErrorLv {
+		return
+	}
+	if lg.validate() {
+		printHelper(lg.lgError, fmt.Sprint(args...), lg.lgMaxLine, ctx)
+	}
+}
+
+// ErrorfWithCtx logs a formatted message at error level together with ctx.
+// Use Errorf when there is no context.
+func (lg *logger) ErrorfWithCtx(ctx context.Context, format string, args ...interface{}) {
+	if lg != nil && lg.lgLevel > logErrorLv {
+		return
+	}
+	if lg.validate() {
+		printHelper(lg.lgError, fmt.Sprintf(format, args...), lg.lgMaxLine, ctx)
+	}
+}
+
+// Critical logs args at critical level, formatted like fmt.Sprint, without a context.
+func (lg *logger) Critical(args ...interface{}) {
+	lg.CriticalWithCtx(nil, args...)
+}
+
+// Criticalf logs a formatted message at critical level without a context.
+func (lg *logger) Criticalf(format string, args ...interface{}) {
+	lg.CriticalfWithCtx(nil, format, args...)
+}
+
+// CriticalWithCtx logs args at critical level, formatted like fmt.Sprint, together with ctx.
+// Use Critical when there is no context.
+func (lg *logger) CriticalWithCtx(ctx context.Context, args ...interface{}) {
+	if lg != nil && lg.lgLevel > logCriticalLv {
+		return
+	}
+	if lg.validate() {
+		printHelper(lg.lgCritical, fmt.Sprint(args...), lg.lgMaxLine, ctx)
+	}
+}
+
+// CriticalfWithCtx logs a formatted message at critical level together with ctx.
+// Use Criticalf when there is no context.
+func (lg *logger) CriticalfWithCtx(ctx context.Context, format string, args ...interface{}) {
+	if lg != nil && lg.lgLevel > logCriticalLv {
+		return
+	}
+	if lg.validate() {
+		printHelper(lg.lgCritical, fmt.Sprintf(format, args...), lg.lgMaxLine, ctx)
+	}
+}
+
+// validate reports whether lg is non-nil and fully initialized; otherwise it prints a
+// notice to stdout and returns false. It is safe to call on a nil logger.
+func (lg *logger) validate() bool {
+	if lg == nil || !lg.isInit() {
+		fmt.Println("Fatal function's logger is nil")
+		return false
+	}
+	return true
+}
+
+// FlushMem writes the contents of the memory to the disk by flushing the LogLimiter.
+// It returns an error instead of panicking when called on a nil logger.
+func (lg *logger) FlushMem() error {
+	if lg == nil {
+		return fmt.Errorf("logger is nil")
+	}
+	// NOTE(review): lgCtrl is nil unless the logger writes to both stdout and file; whether
+	// LogLimiter.Flush tolerates a nil receiver is not visible here - confirm.
+	return lg.lgCtrl.Flush()
+}
diff --git
a/mind-cluster/component/ascend-common/common-utils/hwlog/api_test.go b/mind-cluster/component/ascend-common/common-utils/hwlog/api_test.go
new file mode 100644
index 0000000..ecdcef6
--- /dev/null
+++ b/mind-cluster/component/ascend-common/common-utils/hwlog/api_test.go
@@ -0,0 +1,165 @@
+/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+// Package hwlog test file
+package hwlog
+
+import (
+	"fmt"
+	"io/fs"
+	"os"
+	"path"
+	"path/filepath"
+	"testing"
+
+	"github.com/agiledragon/gomonkey/v2"
+	"github.com/smartystreets/goconvey/convey"
+
+	"ascend-common/common-utils/utils"
+)
+
+// TestNewLogger checks that setLogger succeeds for a stdout-only config and for a file-only
+// config once utils.CheckPath and os.Chmod are patched to succeed.
+func TestNewLogger(t *testing.T) {
+	convey.Convey("test api", t, func() {
+		convey.Convey("test setLogger func", func() {
+			lgConfig := &LogConfig{
+				OnlyToStdout: true,
+			}
+			lg := new(logger)
+			err := lg.setLogger(lgConfig)
+			convey.So(err, convey.ShouldBeNil)
+			// test for log file
+			mockPathCheck := gomonkey.ApplyFunc(utils.CheckPath, func(_ string) (string, error) {
+				return "", nil
+			})
+			// despite its name, mockMkdir patches os.Chmod
+			mockMkdir := gomonkey.ApplyFunc(os.Chmod, func(_ string, _ fs.FileMode) error {
+				return nil
+			})
+			defer mockPathCheck.Reset()
+			defer mockMkdir.Reset()
+			lgConfig = &LogConfig{
+				LogFileName: path.Join(filepath.Dir(os.Args[0]), "t.log"),
+				OnlyToFile:  true,
+				MaxBackups:  DefaultMaxBackups,
+				MaxAge:      DefaultMinSaveAge,
+				CacheSize:   DefaultCacheSize,
+				ExpiredTime: DefaultExpiredTime,
+			}
+			err = lg.setLogger(lgConfig)
+			convey.So(err, convey.ShouldBeNil)
+		})
+	})
+}
+
+// TestLoggerPrint calls every print method once with LogLevel -1 and once more after
+// setLoggerLevel is given an out-of-range level; it only checks that nothing panics.
+func TestLoggerPrint(t *testing.T) {
+	convey.Convey("test api", t, func() {
+		convey.Convey("test logger print func", func() {
+			lgConfig := &LogConfig{
+				OnlyToStdout: true,
+				LogLevel:     -1,
+			}
+			lg := new(logger)
+			err := lg.setLogger(lgConfig)
+			convey.So(err, convey.ShouldBeNil)
+			lg.Debug("test debug")
+			lg.Debugf("test debugf")
+			lg.Info("test info")
+			lg.Infof("test infof")
+			lg.Warn("test warn")
+			lg.Warnf("test warnf")
+			lg.Error("test error")
+			lg.Errorf("test errorf")
+			lg.Critical("test critical")
+			lg.Criticalf("test criticalf")
+			// an out-of-range level is passed on purpose
+			lg.setLoggerLevel(maxLogLevel + 1)
+			lg.Debug("test debug")
+			lg.Debugf("test debugf")
+			lg.Info("test info")
+			lg.Infof("test infof")
+			lg.Warn("test warn")
+			lg.Warnf("test warnf")
+			lg.Error("test error")
+			lg.Errorf("test errorf")
+			lg.Critical("test critical")
+			lg.Criticalf("test criticalf")
+		})
+	})
+}
+// TestLoggerPrintWithLimit calls ErrorfWithLimit repeatedly for one domain and id, resets
+// the counter with ResetErrCnt and calls it again; it only checks that nothing panics.
+func TestLoggerPrintWithLimit(t *testing.T) {
+	convey.Convey("test api", t, func() {
+		convey.Convey("test logger print func with limit", func() {
+			lgConfig := &LogConfig{
+				OnlyToStdout: true,
+				LogLevel:     -1,
+			}
+			lg := new(logger)
+			err := lg.setLogger(lgConfig)
+			convey.So(err, convey.ShouldBeNil)
+			domain := "hccs"
+			logicId := 1
+
+			errFormat := "collect failed ,err:%v"
+			collectErr := fmt.Errorf("detail errs,logicId(%d)", logicId)
+			lg.ErrorfWithLimit(domain, logicId, errFormat, collectErr)
+			lg.ErrorfWithLimit(domain, logicId, errFormat, collectErr)
+			lg.ErrorfWithLimit(domain, logicId, errFormat, collectErr)
+			lg.ErrorfWithLimit(domain, logicId, errFormat, collectErr)
+			ResetErrCnt(domain, logicId)
+			lg.ErrorfWithLimit(domain, logicId, errFormat, collectErr)
+			lg.ErrorfWithLimit(domain, logicId, errFormat, collectErr)
+		})
+	})
+}
+
+// TestWarnfWithLimit calls WarnfWithLimit repeatedly for one domain and id, resets the
+// counter with ResetErrCnt and calls it again; it only checks that nothing panics.
+func TestWarnfWithLimit(t *testing.T) {
+	convey.Convey("test api", t, func() {
+		convey.Convey("test warn logger print func with limit", func() {
+			lgConfig := &LogConfig{
+				OnlyToStdout: true,
+				LogLevel:     -1,
+			}
+			lg := new(logger)
+			err := lg.setLogger(lgConfig)
+			convey.So(err, convey.ShouldBeNil)
+			domain := "hccs"
+			logicId := 1
+
+			errFormat := "collect failed ,err:%v"
+			collectErr := fmt.Errorf("detail errs,logicId(%d)", logicId)
+			lg.WarnfWithLimit(domain, logicId, errFormat, collectErr)
+			lg.WarnfWithLimit(domain, logicId, errFormat, collectErr)
+			lg.WarnfWithLimit(domain, logicId, errFormat, collectErr)
+			lg.WarnfWithLimit(domain, logicId, errFormat, collectErr)
+			ResetErrCnt(domain, logicId)
+			lg.WarnfWithLimit(domain, logicId, errFormat, collectErr)
+			lg.WarnfWithLimit(domain, logicId, errFormat, collectErr)
+		})
+	})
+}
+
+// TestValidate checks that validate is false for a fresh logger and true after setLogger succeeds.
+func TestValidate(t *testing.T) {
+	convey.Convey("test api", t, func() {
+		convey.Convey("test validate", func() {
+			lg := new(logger)
+			res := lg.validate()
+			convey.So(res, convey.ShouldBeFalse)
+			lgConfig := &LogConfig{
+				OnlyToStdout: true,
+			}
+			err := lg.setLogger(lgConfig)
+			convey.So(err, convey.ShouldBeNil)
+			res = lg.validate()
+			convey.So(res, convey.ShouldBeTrue)
+		})
+	})
+}
diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/hwlog_adaptor.go b/mind-cluster/component/ascend-common/common-utils/hwlog/hwlog_adaptor.go
new file mode 100644
index 0000000..5e5c567
--- /dev/null
+++ b/mind-cluster/component/ascend-common/common-utils/hwlog/hwlog_adaptor.go
@@ -0,0 +1,174 @@
+/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+// Package hwlog provides the capability of processing Huawei log rules.
+package hwlog
+
+import (
+	"context"
+	"errors"
+)
+
+// RunLog run logger
+var RunLog *logger
+
+// InitRunLogger initializes the package-level RunLog from config.
+// It returns an error when config is nil, when setLogger rejects config, or when the logger
+// is still not initialized afterwards. When RunLog is already initialized it only logs a
+// warning and returns nil. ctx is not used by this function.
+func InitRunLogger(config *LogConfig, ctx context.Context) error {
+	if config == nil {
+		return errors.New("run logger config is nil")
+	}
+	if RunLog != nil && RunLog.isInit() {
+		RunLog.Warn("run logger is been initialized")
+		return nil
+	}
+	// new never returns nil, so the allocation needs no check
+	RunLog = new(logger)
+	if err := RunLog.setLogger(config); err != nil {
+		return err
+	}
+	if !RunLog.isInit() {
+		return errors.New("run logger init failed")
+	}
+	return nil
+}
+
+// OpLog operate logger
+var OpLog *logger
+
+// InitOperateLogger initializes the package-level OpLog from config.
+// It returns an error when config is nil, when setLogger rejects config, or when the logger
+// is still not initialized afterwards. When OpLog is already initialized it only logs a
+// warning and returns nil. ctx is not used by this function.
+func InitOperateLogger(config *LogConfig, ctx context.Context) error {
+	if config == nil {
+		return errors.New("operate logger config is nil")
+	}
+	if OpLog != nil && OpLog.isInit() {
+		OpLog.Warn("operate logger is been initialized")
+		return nil
+	}
+	// new never returns nil, so the allocation needs no check
+	OpLog = new(logger)
+	if err := OpLog.setLogger(config); err != nil {
+		return err
+	}
+	if !OpLog.isInit() {
+		return errors.New("operate logger init failed")
+	}
+	return nil
+}
+
+// SecLog security logger
+var SecLog *logger
+
+// InitSecurityLogger initializes the package-level SecLog from config.
+// It returns an error when config is nil, when setLogger rejects config, or when the logger
+// is still not initialized afterwards. When SecLog is already initialized it only logs a
+// warning and returns nil. ctx is not used by this function.
+func InitSecurityLogger(config *LogConfig, ctx context.Context) error {
+	if config == nil {
+		return errors.New("security logger config is nil")
+	}
+	if SecLog != nil && SecLog.isInit() {
+		SecLog.Warn("security logger is been initialized")
+		return nil
+	}
+	// new never returns nil, so the allocation needs no check
+	SecLog = new(logger)
+	if err := SecLog.setLogger(config); err != nil {
+		return err
+	}
+	if !SecLog.isInit() {
+		return errors.New("security logger init failed")
+	}
+	return nil
+}
+
+// UserLog user logger
+var UserLog *logger
+
+//
InitUserLogger initialize user logger
+// It returns an error when config is nil, when setLogger rejects config, or when the logger
+// is still not initialized afterwards. When UserLog is already initialized it only logs a
+// warning and returns nil. ctx is not used by this function.
+func InitUserLogger(config *LogConfig, ctx context.Context) error {
+	if config == nil {
+		return errors.New("user logger config is nil")
+	}
+	if UserLog != nil && UserLog.isInit() {
+		UserLog.Warn("user logger is been initialized")
+		return nil
+	}
+	// new never returns nil, so the allocation needs no check
+	UserLog = new(logger)
+	if err := UserLog.setLogger(config); err != nil {
+		return err
+	}
+	if !UserLog.isInit() {
+		return errors.New("user logger init failed")
+	}
+	return nil
+}
+
+// DebugLog debug logger
+var DebugLog *logger
+
+// InitDebugLogger initializes the package-level DebugLog from config.
+// It returns an error when config is nil, when setLogger rejects config, or when the logger
+// is still not initialized afterwards. When DebugLog is already initialized it only logs a
+// warning and returns nil. ctx is not used by this function.
+func InitDebugLogger(config *LogConfig, ctx context.Context) error {
+	if config == nil {
+		return errors.New("debug logger config is nil")
+	}
+	if DebugLog != nil && DebugLog.isInit() {
+		DebugLog.Warn("debug logger is been initialized")
+		return nil
+	}
+	// new never returns nil, so the allocation needs no check
+	DebugLog = new(logger)
+	if err := DebugLog.setLogger(config); err != nil {
+		return err
+	}
+	if !DebugLog.isInit() {
+		return errors.New("debug logger init failed")
+	}
+	return nil
+}
+
+// CustomLogger wraps a logger so that callers outside this package can hold and use one.
+type CustomLogger struct {
+	*logger
+}
+
+// NewCustomLogger creates a new, independent logger from config and wraps it.
+// It returns an error when config is nil, when setLogger rejects config, or when the logger
+// is still not initialized afterwards. ctx is not used by this function.
+func NewCustomLogger(config *LogConfig, ctx context.Context) (*CustomLogger, error) {
+	if config == nil {
+		return nil, errors.New("custom logger config is nil")
+	}
+	log := new(logger)
+	if err := log.setLogger(config); err != nil {
+		return nil, err
+	}
+	if !log.isInit() {
+		return nil, errors.New("logger init failed")
+	}
+	return &CustomLogger{logger: log}, nil
+}
+
+// SetCustomLogger wraps an existing logger in a CustomLogger; it returns nil when log is nil.
+func SetCustomLogger(log *logger) *CustomLogger {
+	if log == nil {
+		return nil
+	}
+	return &CustomLogger{logger: log}
+}
diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/hwlog_adaptor_test.go b/mind-cluster/component/ascend-common/common-utils/hwlog/hwlog_adaptor_test.go
new file mode 100644 index 0000000..a32e9be --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/hwlog/hwlog_adaptor_test.go @@ -0,0 +1,126 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package hwlog test file +package hwlog + +import ( + "context" + "errors" + "testing" + + "github.com/smartystreets/goconvey/convey" +) + +func TestInitRunLogger(t *testing.T) { + convey.Convey("test hwlog adaptor", t, func() { + convey.Convey("test init run log", func() { + ctx, cancel := context.WithCancel(context.TODO()) + err := InitRunLogger(nil, ctx) + convey.So(err, convey.ShouldBeError, errors.New("run logger config is nil")) + lgConfig := &LogConfig{OnlyToStdout: true} + err = InitRunLogger(lgConfig, ctx) + convey.So(err, convey.ShouldBeNil) + // repeat initialize + err = InitRunLogger(lgConfig, ctx) + convey.So(err, convey.ShouldBeNil) + cancel() + }) + }) +} + +func TestNewCustomLogger(t *testing.T) { + convey.Convey("test hwlog adaptor", t, func() { + convey.Convey("test init custom log", func() { + ctx, cancel := context.WithCancel(context.TODO()) + _, err := NewCustomLogger(nil, ctx) + convey.So(err, convey.ShouldBeError, errors.New("custom logger config is nil")) + lgConfig := &LogConfig{OnlyToStdout: true} + _, err = NewCustomLogger(lgConfig, ctx) + convey.So(err, convey.ShouldBeNil) + // repeat initialize + _, err = NewCustomLogger(lgConfig, ctx) + convey.So(err, 
convey.ShouldBeNil) + cancel() + }) + }) +} + +func TestInitOperateLogger(t *testing.T) { + convey.Convey("test hwlog adaptor", t, func() { + convey.Convey("test init operate log", func() { + ctx, cancel := context.WithCancel(context.TODO()) + err := InitOperateLogger(nil, ctx) + convey.So(err, convey.ShouldBeError, errors.New("operate logger config is nil")) + lgConfig := &LogConfig{OnlyToStdout: true} + err = InitOperateLogger(lgConfig, ctx) + convey.So(err, convey.ShouldBeNil) + // repeat initialize + err = InitOperateLogger(lgConfig, ctx) + convey.So(err, convey.ShouldBeNil) + cancel() + }) + }) +} + +func TestInitSecurityLogger(t *testing.T) { + convey.Convey("test hwlog adaptor", t, func() { + convey.Convey("test init security log", func() { + ctx, cancel := context.WithCancel(context.TODO()) + err := InitSecurityLogger(nil, ctx) + convey.So(err, convey.ShouldBeError, errors.New("security logger config is nil")) + lgConfig := &LogConfig{OnlyToStdout: true} + err = InitSecurityLogger(lgConfig, ctx) + convey.So(err, convey.ShouldBeNil) + // repeat initialize + err = InitSecurityLogger(lgConfig, ctx) + convey.So(err, convey.ShouldBeNil) + cancel() + }) + }) +} + +func TestInitUserLogger(t *testing.T) { + convey.Convey("test hwlog adaptor", t, func() { + convey.Convey("test init user log", func() { + ctx, cancel := context.WithCancel(context.TODO()) + err := InitUserLogger(nil, ctx) + convey.So(err, convey.ShouldBeError, errors.New("user logger config is nil")) + lgConfig := &LogConfig{OnlyToStdout: true} + err = InitUserLogger(lgConfig, ctx) + convey.So(err, convey.ShouldBeNil) + // repeat initialize + err = InitUserLogger(lgConfig, ctx) + convey.So(err, convey.ShouldBeNil) + cancel() + }) + }) +} + +func TestInitDebugLogger(t *testing.T) { + convey.Convey("test hwlog adaptor", t, func() { + convey.Convey("test init debug log", func() { + ctx, cancel := context.WithCancel(context.TODO()) + err := InitDebugLogger(nil, ctx) + convey.So(err, convey.ShouldBeError, 
errors.New("debug logger config is nil")) + lgConfig := &LogConfig{OnlyToStdout: true} + err = InitDebugLogger(lgConfig, ctx) + convey.So(err, convey.ShouldBeNil) + // repeat initialize + err = InitDebugLogger(lgConfig, ctx) + convey.So(err, convey.ShouldBeNil) + cancel() + }) + }) +} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/log_limiter.go b/mind-cluster/component/ascend-common/common-utils/hwlog/log_limiter.go new file mode 100644 index 0000000..88cfb9d --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/hwlog/log_limiter.go @@ -0,0 +1,156 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package hwlog provides the capability of processing Huawei log rules. 
+package hwlog + +import ( + "fmt" + "sync" + "time" + + "ascend-common/common-utils/cache" +) + +const ( + // MaxCacheSize indicates the maximum log cache size + MaxCacheSize = 100 * 1024 + // MaxExpiredTime indicates the maximum log cache expired time + MaxExpiredTime = 60 * 60 + // DefaultCacheSize indicates the default log cache size + DefaultCacheSize = 10 * 1024 + // DefaultExpiredTime indicates the default log cache expired time + DefaultExpiredTime = 1 + cutPreLen = 46 + // ProblemOccurMaxNumbers indicates the maximum number of times that the same problem can occur + ProblemOccurMaxNumbers = 3 +) + +var ( + errorMap sync.Map +) + +// LogLimiter encapsulates Logs and provides the log traffic limiting capability +// to prevent too many duplicate logs. +type LogLimiter struct { + // Logs is a log rotate instance + Logs *Logs + logCache *cache.ConcurrencyLRUCache + logMu sync.Mutex + doOnce sync.Once + + logExpiredTime time.Duration + // CacheSize indicates the size of log cache + CacheSize int + // ExpiredTime indicates the expired time of log cache + ExpiredTime int +} + +// Write implements io.Writer. It encapsulates the Write method of Los and uses +// the lru cache to prevent duplicate log writing. +func (l *LogLimiter) Write(d []byte) (int, error) { + if l == nil { + return 0, fmt.Errorf("log limiter pointer does not exist") + } + + l.logMu.Lock() + defer l.logMu.Unlock() + + if l.ExpiredTime == 0 || l.CacheSize == 0 { + return l.Logs.Write(d) + } + + l.doOnce.Do(func() { + l.validateLimiterConf() + l.logCache = cache.New(l.CacheSize) + l.logExpiredTime = time.Duration(int64(l.ExpiredTime) * int64(time.Second)) + }) + + if l.logCache == nil { + l.logCache = cache.New(DefaultCacheSize) + } + if !l.logCache.SetIfNX(string(d[cutPreLen:]), "v", l.logExpiredTime) { + return 0, nil + } + + return l.Logs.Write(d) +} + +// Close implements io.Closer. It encapsulates the Close method of Logs. 
+func (l *LogLimiter) Close() error { + if l == nil { + return fmt.Errorf("log limiter pointer does not exist") + } + + l.logMu.Lock() + defer l.logMu.Unlock() + + return l.Logs.Close() +} + +// Flush encapsulates the Flush method of Logs. +func (l *LogLimiter) Flush() error { + if l == nil { + return fmt.Errorf("log limiter pointer does not exist") + } + + l.logMu.Lock() + defer l.logMu.Unlock() + + return l.Logs.Flush() +} + +// validateLimiterConf verifies the external input parameters in the LogLimiter. +func (l *LogLimiter) validateLimiterConf() { + if l.CacheSize < 0 || l.CacheSize > MaxCacheSize { + l.CacheSize = DefaultCacheSize + } + if l.ExpiredTime < 0 || l.ExpiredTime > MaxExpiredTime { + l.ExpiredTime = DefaultExpiredTime + } +} + +func getKey(domain string, id interface{}) string { + return fmt.Sprintf("%d_%s", id, domain) +} + +// IsNeedPrintWithSpecifiedCounts check whether print the error message, +// if the error message (domain_id as a unique identifier) has been printed +// for problemOccurMaxNumbers times, return false +func IsNeedPrintWithSpecifiedCounts(domain string, id interface{}, problemOccurMaxNumbers int) (bool, string) { + key := getKey(domain, id) + cnt, _ := errorMap.LoadOrStore(key, 0) + intCnt, ok := cnt.(int) + extraErrLog := "" + if !ok { + // the counter type is abnormal, print by default + return true, extraErrLog + } + if intCnt >= problemOccurMaxNumbers { + return false, extraErrLog + } + intCnt += 1 + errorMap.Store(key, intCnt) + if intCnt == problemOccurMaxNumbers { + extraErrLog = fmt.Sprintf(".The error log has been printed for %v times "+ + "and will not be printed any more", problemOccurMaxNumbers) + } + return true, extraErrLog + +} + +// ResetErrCnt reset the error count +func ResetErrCnt(domain string, id interface{}) { + errorMap.Delete(getKey(domain, id)) +} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/logger.go b/mind-cluster/component/ascend-common/common-utils/hwlog/logger.go new file 
mode 100644 index 0000000..f659fbc --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/hwlog/logger.go @@ -0,0 +1,242 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package hwlog provides the capability of processing Huawei log rules. +package hwlog + +import ( + "errors" + "fmt" + "os" + "path" + "regexp" + "strings" + + "github.com/fsnotify/fsnotify" + + "ascend-common/common-utils/utils" +) + +const ( + // DefaultFileMaxSize the default maximum size of a single log file is 20 MB + DefaultFileMaxSize = 20 + // DefaultMinSaveAge the minimum storage duration of backup logs is 7 days + DefaultMinSaveAge = 7 + // DefaultMaxSaveAge the maximum storage duration of backup logs is 700 days + DefaultMaxSaveAge = 700 + // DefaultMaxBackups the default number of backup log + DefaultMaxBackups = 30 + // LogFileMode log file mode + LogFileMode os.FileMode = 0640 + // BackupLogFileMode backup log file mode + BackupLogFileMode os.FileMode = 0400 + // LogDirMode log dir mode + LogDirMode = 0750 + backUpLogRegex = `^.+-[0-9]{4}-[0-9]{2}-[0-9T]{5}-[0-9]{2}-[0-9]{2}\.[0-9]{2,4}` + bitsize = 64 + stackDeep = 3 + pathLen = 2 + minLogLevel = -1 + maxLogLevel = 3 + maxEachLineLen = 1048576 + defaultMaxEachLineLen = 256 +) + +// LogConfig log module config +type LogConfig struct { + // log file path + LogFileName string + // only write to std out, default value: false + OnlyToStdout bool + // only 
write to file, default value: false + OnlyToFile bool + // log level, -1-debug, 0-info, 1-warning, 2-error 3-critical default value: 0 + LogLevel int + // size of a single log file (MB), default value: 20MB + FileMaxSize int + // MaxLineLength Max length of each log line, default value: 256 + MaxLineLength int + // maximum number of backup log files, default value: 30 + MaxBackups int + // maximum number of days for backup log files, default value: 7 + MaxAge int + // whether backup files need to be compressed, default value: false + IsCompress bool + // expiration time for log cache, default value: 1s + ExpiredTime int + // Size of log cache space, default: 10240 + CacheSize int +} + +var reg = regexp.MustCompile(backUpLogRegex) + +type validateFunc func(config *LogConfig) error + +func checkDir(fileDir string) error { + if !utils.IsExist(fileDir) { + if err := os.MkdirAll(fileDir, LogDirMode); err != nil { + return fmt.Errorf("create dirs failed") + } + return nil + } + if err := os.Chmod(fileDir, LogDirMode); err != nil { + return fmt.Errorf("change log dir mode failed") + } + return nil +} + +func createFile(filePath string) error { + fileName := path.Base(filePath) + if !utils.IsExist(filePath) { + f, err := os.OpenFile(filePath, os.O_RDWR|os.O_CREATE|os.O_TRUNC, LogFileMode) + if err != nil { + return fmt.Errorf("create file(%s) failed", fileName) + } + defer func() { + if err := f.Close(); err != nil { + fmt.Printf("close file failed: %v\n", err) + return + } + }() + } + return nil +} + +func checkAndCreateLogFile(filePath string) error { + if !utils.IsFile(filePath) { + return fmt.Errorf("config path is not file") + } + fileDir := path.Dir(filePath) + if err := checkDir(fileDir); err != nil { + return err + } + if err := createFile(filePath); err != nil { + return err + } + return nil +} + +func validateLogConfigFileMaxSize(config *LogConfig) error { + if config.FileMaxSize == 0 { + config.FileMaxSize = DefaultFileMaxSize + return nil + } + if 
config.FileMaxSize < 0 || config.FileMaxSize > DefaultFileMaxSize { + return fmt.Errorf("the size of a single log file range is (0, 20] MB") + } + + return nil +} + +func validateLogConfigBackups(config *LogConfig) error { + if config.MaxBackups <= 0 || config.MaxBackups > DefaultMaxBackups { + return fmt.Errorf("the number of backup log file range is (0, 30]") + } + return nil +} + +func validateLogConfigMaxAge(config *LogConfig) error { + fmt.Printf("MaxAge %s", config.MaxAge) + if config.MaxAge < DefaultMinSaveAge || config.MaxAge > DefaultMaxSaveAge { + return fmt.Errorf("the maxage of backup logs range is [7,700]") + } + return nil +} + +func validateLogLevel(config *LogConfig) error { + if config.LogLevel < minLogLevel || config.LogLevel > maxLogLevel { + return fmt.Errorf("the log level range should be [-1, 3]") + } + return nil +} + +func validateMaxLineLength(config *LogConfig) error { + if config.MaxLineLength == 0 { + config.MaxLineLength = defaultMaxEachLineLen + return nil + } + if config.MaxLineLength < 0 || config.MaxLineLength > maxEachLineLen { + return fmt.Errorf("the max length of each log line should be in the range (0, 1048576]") + } + return nil +} + +func getValidateFuncList() []validateFunc { + var funcList []validateFunc + funcList = append(funcList, validateLogConfigFileMaxSize, validateLogConfigBackups, validateMaxLineLength, + validateLogConfigMaxAge, validateLogLevel, validateLogConfigLimiter) + return funcList +} + +func validateLogConfigFiled(config *LogConfig) error { + if config.OnlyToStdout { + return nil + } + if _, err := utils.CheckPath(config.LogFileName); err != nil && err != os.ErrNotExist { + return fmt.Errorf("config log path is not absolute path: %v", err) + } + if strings.Contains(config.LogFileName, "..") || strings.Contains(config.LogFileName, "./") { + return errors.New("log path include invalid char") + } + + if err := checkAndCreateLogFile(config.LogFileName); err != nil { + return err + } + validateFuncList := 
getValidateFuncList() + for _, vaFunc := range validateFuncList { + if err := vaFunc(config); err != nil { + return err + } + } + + return nil +} + +func validateLogConfigLimiter(config *LogConfig) error { + if config.ExpiredTime < 0 || config.ExpiredTime > MaxExpiredTime { + return fmt.Errorf("the expired time of log cache range is [0, 3600], the value 0 disables the limiter") + } + if config.CacheSize < 0 || config.CacheSize > MaxCacheSize { + return fmt.Errorf("the size of log cache range is [0, 102400], the value 0 disables the limiter") + } + return nil +} + +func changeFileMode(l *logger, event fsnotify.Event, logFileFullPath string) { + if l == nil { + fmt.Println("changeFileMode logger is nil") + return + } + var logMode = LogFileMode + logPath := path.Dir(logFileFullPath) + changedFileName := path.Base(event.Name) + if isTargetLog(changedFileName) { + logMode = BackupLogFileMode + } + changedLogFilePath := path.Join(logPath, changedFileName) + if !utils.IsExist(changedLogFilePath) { + return + } + fPath, err := utils.CheckPath(changedLogFilePath) + if err != nil { + l.Errorf("wrong file path: %v", err) + return + } + if errChmod := os.Chmod(fPath, logMode); errChmod != nil { + l.Errorf("set file mode failed, filename: %s", changedFileName) + } +} +func isTargetLog(fileName string) bool { + return reg.MatchString(fileName) +} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/logger_test.go b/mind-cluster/component/ascend-common/common-utils/hwlog/logger_test.go new file mode 100644 index 0000000..f91b663 --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/hwlog/logger_test.go @@ -0,0 +1,217 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package hwlog test file +package hwlog + +import ( + "errors" + "io/fs" + "os" + "testing" + + "github.com/agiledragon/gomonkey/v2" + "github.com/fsnotify/fsnotify" + "github.com/smartystreets/goconvey/convey" + + "ascend-common/common-utils/utils" +) + +func TestCheckDir(t *testing.T) { + convey.Convey("test logger", t, func() { + convey.Convey("test check dir func", func() { + mockStat := gomonkey.ApplyFunc(os.Stat, func(_ string) (fs.FileInfo, error) { + return nil, os.ErrNotExist + }) + mockMkDir := gomonkey.ApplyFunc(os.MkdirAll, func(_ string, _ fs.FileMode) error { + return nil + }) + defer mockStat.Reset() + defer mockMkDir.Reset() + err := checkDir("log") + convey.So(err, convey.ShouldBeNil) + }) + }) +} + +func TestCreateFile(t *testing.T) { + convey.Convey("test logger", t, func() { + convey.Convey("test create file", func() { + mockExist := gomonkey.ApplyFunc(utils.IsExist, func(_ string) bool { + return false + }) + mockCreate := gomonkey.ApplyFunc(os.Create, func(_ string) (*os.File, error) { + return nil, nil + }) + defer mockExist.Reset() + defer mockCreate.Reset() + err := createFile("log") + convey.So(err, convey.ShouldBeNil) + }) + }) +} + +func TestCheckAndCreateLogFile(t *testing.T) { + convey.Convey("test logger", t, func() { + convey.Convey("test checkAndCreateLogFile func", func() { + mockCreate := gomonkey.ApplyFunc(createFile, func(_ string) error { + return nil + }) + defer mockCreate.Reset() + err := checkAndCreateLogFile("log") + convey.So(err, convey.ShouldBeNil) + }) + }) +} + +func TestValidateLogConfigFileMaxSize(t 
*testing.T) { + convey.Convey("test logger", t, func() { + convey.Convey("test validate max size func", func() { + conf := &LogConfig{} + err := validateLogConfigFileMaxSize(conf) + convey.So(err, convey.ShouldBeNil) + convey.So(conf.FileMaxSize, convey.ShouldEqual, DefaultFileMaxSize) + conf.FileMaxSize = -1 + err = validateLogConfigFileMaxSize(conf) + convey.So(err, convey.ShouldBeError) + conf.FileMaxSize = DefaultFileMaxSize + 1 + err = validateLogConfigFileMaxSize(conf) + convey.So(err, convey.ShouldBeError) + }) + }) +} + +func TestValidateLogConfigBackups(t *testing.T) { + convey.Convey("test logger", t, func() { + convey.Convey("test validate backups func", func() { + conf := &LogConfig{MaxBackups: DefaultMaxBackups} + err := validateLogConfigBackups(conf) + convey.So(err, convey.ShouldBeNil) + conf.MaxBackups = 0 + err = validateLogConfigBackups(conf) + convey.So(err, convey.ShouldBeError) + conf.FileMaxSize = DefaultMaxBackups + 1 + err = validateLogConfigBackups(conf) + convey.So(err, convey.ShouldBeError) + }) + }) +} + +func TestValidateLogConfigMaxAge(t *testing.T) { + convey.Convey("test logger", t, func() { + convey.Convey("test validate max age func", func() { + conf := &LogConfig{MaxAge: DefaultMinSaveAge} + err := validateLogConfigMaxAge(conf) + convey.So(err, convey.ShouldBeNil) + conf.MaxAge = 0 + err = validateLogConfigMaxAge(conf) + convey.So(err, convey.ShouldBeError) + }) + }) +} + +func TestValidateLogLevel(t *testing.T) { + convey.Convey("test logger", t, func() { + convey.Convey("test validate log level func", func() { + conf := &LogConfig{} + err := validateLogLevel(conf) + convey.So(err, convey.ShouldBeNil) + conf.LogLevel = minLogLevel - 1 + err = validateLogLevel(conf) + convey.So(err, convey.ShouldBeError) + conf.LogLevel = maxLogLevel + 1 + err = validateLogLevel(conf) + convey.So(err, convey.ShouldBeError) + }) + }) +} + +func TestValidateMaxLineLength(t *testing.T) { + convey.Convey("test logger", t, func() { + 
convey.Convey("test validate max line length func", func() { + conf := &LogConfig{} + err := validateMaxLineLength(conf) + convey.So(err, convey.ShouldBeNil) + convey.So(conf.MaxLineLength, convey.ShouldEqual, defaultMaxEachLineLen) + conf.MaxLineLength = -1 + err = validateMaxLineLength(conf) + convey.So(err, convey.ShouldNotBeNil) + conf.MaxLineLength = maxEachLineLen + 1 + err = validateMaxLineLength(conf) + convey.So(err, convey.ShouldNotBeNil) + }) + }) +} + +func TestValidateLogConfigFiled(t *testing.T) { + convey.Convey("test logger", t, func() { + convey.Convey("test validate config filed func", func() { + mockCheckPath := gomonkey.ApplyFunc(utils.CheckPath, func(_ string) (string, error) { + return "", nil + }) + mockCheckAndCreate := gomonkey.ApplyFunc(checkAndCreateLogFile, func(_ string) error { + return nil + }) + defer mockCheckPath.Reset() + defer mockCheckAndCreate.Reset() + conf := &LogConfig{ + MaxBackups: DefaultMaxBackups, + MaxAge: DefaultMinSaveAge, + CacheSize: DefaultCacheSize, + ExpiredTime: DefaultExpiredTime, + } + err := validateLogConfigFiled(conf) + convey.So(err, convey.ShouldBeNil) + }) + convey.Convey("test validate config filed func, log file is relative path", func() { + mockCheckPath := gomonkey.ApplyFunc(utils.CheckPath, func(_ string) (string, error) { + return "", nil + }) + mockCheckAndCreate := gomonkey.ApplyFunc(checkAndCreateLogFile, func(_ string) error { + return nil + }) + defer mockCheckPath.Reset() + defer mockCheckAndCreate.Reset() + conf := &LogConfig{ + MaxBackups: DefaultMaxBackups, + MaxAge: DefaultMinSaveAge, + CacheSize: DefaultCacheSize, + ExpiredTime: DefaultExpiredTime, + LogFileName: "../", + } + err := validateLogConfigFiled(conf) + expErr := errors.New("log path include invalid char") + convey.So(err, convey.ShouldResemble, expErr) + }) + }) +} + +func TestChangeFileMode(t *testing.T) { + convey.Convey("test logger", t, func() { + convey.Convey("test changeFileMode func", func() { + changeFileMode(nil, 
fsnotify.Event{}, "log") + mockExist := gomonkey.ApplyFunc(utils.IsExist, func(_ string) bool { + return true + }) + mockChmod := gomonkey.ApplyFunc(os.Chmod, func(_ string, _ fs.FileMode) error { + return nil + }) + defer mockExist.Reset() + defer mockChmod.Reset() + lg := new(logger) + evt := fsnotify.Event{Name: "run-2022-01-01T00-00-00.123.log"} + changeFileMode(lg, evt, "log") + }) + }) +} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/rolog.go b/mind-cluster/component/ascend-common/common-utils/hwlog/rolog.go new file mode 100644 index 0000000..cc07bb2 --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/hwlog/rolog.go @@ -0,0 +1,447 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package hwlog provides the capability of processing Huawei log rules. +package hwlog + +import ( + "errors" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "sort" + "strings" + "sync" + "time" +) + +const ( + oneDaySeconds = 24 * 60 * 60 + defaultCapacity = 20 + timeFormat = "2006-01-02T15-04-05.000" + kilobytes = 1024 + defaultDirPermission = 0750 + defaultFilePermission = 0600 + defaultBackupPermission = 0400 + maxCapacity = 20 + minSaveVolume = 1 + maxSaveVolume = 30 + maxSaveTime = 700 + minSaveTime = 7 +) + +// Logs is an io.WriteCloser. +type Logs struct { + file *os.File + mutex sync.Mutex + rmOnce sync.Once + + // FileName is the file where logs are written. 
+ FileName string `json:"filename" yaml:"filename"` + + // Capacity is the maximum number of bytes before the log file + // is rotated, and the default value is 128 megabytes. + Capacity int `json:"capacity" yaml:"capacity"` + + // SaveTime is the maximum number of days for retaining old log + // files. It calculates the retention time based on the timestamp + // of the old log file name and the current time. + SaveTime int `json:"savetime" yaml:"savetime"` + + // SaveVolume is the maximum number of old log files that can be + // retained. It saves all old files by default. + SaveVolume int `json:"savevolume" yaml:"savevolume"` + + // UTC determines whether to use the local time of the computer + // or the UTC time as the timestamp in the formatted backup file. + LocalOrUTC bool `json:"localorutc" yaml:"localorutc"` + + length int64 + rmCh chan bool +} + +// logFile is a struct that is used to return filename and +// timestamp. +type logFile struct { + fileInfo os.FileInfo + timeStamp time.Time +} + +var ( + // mByte is used to convert capacity into bytes. + mByte = kilobytes * kilobytes +) + +// Write implements io.Writer. If a write would not cause the size of +// the log file to exceed Capacity, the log file is written normally. +// If a write would cause the size of the log file to exceed Capacity, +// but the write length is less than Capacity, the log file is closed, +// renamed to include a timestamp of the current time, and a new log +// is created using the original log file name. If the length of a write +// is greater than the Capacity, an error is returned. 
+func (l *Logs) Write(d []byte) (int, error) { + if l == nil { + return 0, fmt.Errorf("logs pointer does not exist") + } + + l.mutex.Lock() + defer l.mutex.Unlock() + + writeLenth := int64(len(d)) + if writeLenth > l.maxLenth() { + return 0, fmt.Errorf("the write lenth %d is greater than the maximum file size %d", + writeLenth, l.maxLenth(), + ) + } + + if l.file == nil { + if err := l.openOrCreateFile(writeLenth); err != nil { + return 0, err + } + } + fileInfo, err := l.file.Stat() + if err != nil { + return 0, err + } + l.length = fileInfo.Size() + if writeLenth+l.length > l.maxLenth() { + if err := l.roll(); err != nil { + return 0, err + } + } + + n, err := l.file.Write(d) + if err != nil { + return 0, err + } + l.length += int64(n) + return n, err +} + +// Roll causes Logs to close the existing log file and create a new log +// file immediately. The purpose of this function is to provide rotation +// outside the normal rotation rule, e.g. in response to SIGHUP. After +// rotation, the deletion of the old log files is initiated. +func (l *Logs) Roll() error { + if l == nil { + return fmt.Errorf("logs pointer does not exist") + } + + l.mutex.Lock() + defer l.mutex.Unlock() + return l.roll() +} + +// Close implements io.Closer. It closses the current log file. +func (l *Logs) Close() error { + if l == nil { + return fmt.Errorf("logs pointer does not exist") + } + + l.mutex.Lock() + defer l.mutex.Unlock() + + return l.close() +} + +// Flush persist the contents of the current memory. +func (l *Logs) Flush() error { + if l == nil { + return fmt.Errorf("logs pointer does not exist") + } + + l.mutex.Lock() + defer l.mutex.Unlock() + if l.file == nil { + return nil + } + return l.file.Sync() +} + +// maxLenth return the number of bytes of the maximum log size +// before rotating. 
+func (l *Logs) maxLenth() int64 { + if l.Capacity > 0 && l.Capacity < maxCapacity { + return int64(l.Capacity) * int64(mByte) + } + return int64(defaultCapacity * mByte) +} + +// fileName return the name of the log file. +func (l *Logs) fileName() string { + if l.FileName != "" { + return l.FileName + } + logName := filepath.Base(os.Args[0]) + "-mindx-dl.log" + return filepath.Join(os.TempDir(), logName) +} + +// openOrCreateFile opens the log file if it exists and the +// current write would not exceed the Capacity. It will create +// a new file if there is no such file or the write would exceed +// the Capacity. +func (l *Logs) openOrCreateFile(writeLen int64) error { + l.remove() + + name := l.fileName() + message, err := os.Stat(name) + if os.IsNotExist(err) { + return l.create() + } + + if err != nil { + return fmt.Errorf("failed to get log file message: %v", err) + } + + if writeLen+message.Size() >= l.maxLenth() { + return l.roll() + } + + f, err := os.OpenFile(name, os.O_APPEND|os.O_WRONLY, defaultFilePermission) + if err != nil { + return l.create() + } + l.file = f + l.length = message.Size() + return nil +} + +// create creates a new log file for writing, and backs up the +// old log file. The file is closed when this method is invoked +// by default. 
+func (l *Logs) create() error { + if err := os.MkdirAll(l.getDir(), defaultDirPermission); err != nil { + return fmt.Errorf("unable to create directory for new log file: %v", err) + } + + fileName, fileMode := l.fileName(), os.FileMode(defaultFilePermission) + if message, err := os.Stat(fileName); err == nil { + fileMode = message.Mode() + backupName := l.backup() + if err := os.Rename(fileName, backupName); err != nil { + return fmt.Errorf("failed to rename the log file: %v", err) + } + if err := os.Chmod(backupName, defaultBackupPermission); err != nil { + return fmt.Errorf("failed to change backup log file permission: %v", err) + } + } + newFile, err := os.OpenFile(fileName, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, fileMode) + if err != nil { + return fmt.Errorf("unable to open new log file: %v", err) + } + l.length, l.file = 0, newFile + return nil +} + +// backup generates a backup file name based on the original file +// name and inserts a timestamp between the file name and extension. +// The timestamp uses the UTC time by default. +func (l *Logs) backup() string { + prefix, extension := l.getPreAndExt() + return filepath.Join(l.getDir(), fmt.Sprintf("%s%s%s", prefix, l.getTimestamp(), extension)) +} + +// getDir returns the directory for the current filename. +func (l *Logs) getDir() string { + return filepath.Dir(l.fileName()) +} + +// getPreAndExt returns the prefix name and extension name +// from Logs's filename. +func (l *Logs) getPreAndExt() (string, string) { + name := filepath.Base(l.fileName()) + extension := filepath.Ext(name) + prefix := name[:len(name)-len(extension)] + "-" + return prefix, extension +} + +// getTimestamp returns the timestamp of current time, and +// uses UTC time by default. +func (l *Logs) getTimestamp() string { + t := time.Now() + if !l.LocalOrUTC { + t = t.UTC() + } + return t.Format(timeFormat) +} + +// roll rotates the log file, close the existing log file and +// create a new one immediately. 
After rotating, this method +// deletes the old log files according to the configuration. +func (l *Logs) roll() error { + if err := l.close(); err != nil { + return err + } + if err := l.create(); err != nil { + return err + } + l.remove() + return nil +} + +// close closes the file if it is open. +func (l *Logs) close() error { + if l.file == nil { + return nil + } + err := l.file.Sync() + if err != nil { + return err + } + err = l.file.Close() + l.file = nil + return err +} + +// remove delete outdated log files, starting the remove +// goroutine if necessary. +func (l *Logs) remove() { + l.rmOnce.Do(func() { + l.rmCh = make(chan bool, 1) + go l.removeRun() + }) + select { + case l.rmCh <- true: + default: + } +} + +// removeRun manages the deletion of the old log files after +// rotating, which runs in a goroutine. +func (l *Logs) removeRun() { + for range l.rmCh { + if err := l.removeRunOnce(); err != nil { + fmt.Println("failed to remove runonce: ", err) + } + } +} + +// removeRunOnce performs removal of outdated log files. +// Old log files are removed if the number of old files +// exceed the Capacity or the retention time of old files +// is greater than SaveTime. 
+func (l *Logs) removeRunOnce() error { + if l.SaveVolume == 0 && l.SaveTime == 0 { + return nil + } + + if err := checkParam(l.SaveVolume, l.SaveTime); err != nil { + return err + } + + oldFiles, err := l.oldFilesList() + if err != nil { + return err + } + + var removeFiles []logFile + if l.SaveTime > 0 { + delTime := time.Now().Unix() - int64(l.SaveTime)*oneDaySeconds + var remainingFiles []logFile + for _, f := range oldFiles { + if f.timeStamp.Unix() <= delTime { + removeFiles = append(removeFiles, f) + continue + } + remainingFiles = append(remainingFiles, f) + } + oldFiles = remainingFiles + } + + if l.SaveVolume > 0 && l.SaveVolume < len(oldFiles) { + saved := make(map[string]struct{}, len(oldFiles)) + var remainingFiles []logFile + for _, f := range oldFiles { + saved[f.fileInfo.Name()] = struct{}{} + if l.SaveVolume >= len(saved) { + remainingFiles = append(remainingFiles, f) + continue + } + removeFiles = append(removeFiles, f) + } + oldFiles = remainingFiles + } + + for _, f := range removeFiles { + rmError := os.Remove(filepath.Join(l.getDir(), f.fileInfo.Name())) + if rmError != nil { + err = rmError + } + } + return err +} + +// oldFilesList returns the list of backup log files sorted +// by ModTime. These backup log files are stored in the same +// directory as the current log file. 
+func (l *Logs) oldFilesList() ([]logFile, error) { + logFiles, err := ioutil.ReadDir(l.getDir()) + if err != nil { + return nil, fmt.Errorf("unable to open the log file directory: %v", err) + } + + prefix, extension := l.getPreAndExt() + + var oldFiles []logFile + + for _, file := range logFiles { + if file.IsDir() { + continue + } + if timeStamp, err := l.extractTime(file.Name(), prefix, extension); err == nil { + oldFiles = append(oldFiles, logFile{fileInfo: file, timeStamp: timeStamp}) + continue + } + } + sort.Slice(oldFiles, func(i, j int) bool { + if i < 0 || i > len(oldFiles) || j < 0 || j > len(oldFiles) { + return false + } + return oldFiles[i].timeStamp.After(oldFiles[j].timeStamp) + }) + + return oldFiles, nil +} + +// extractTime extracts the formatted time from file name by +// stripping the prefix and extension of the file name. This +// prevents fileName from being confused with time.parse. +func (l *Logs) extractTime(name, prefix, extension string) (time.Time, error) { + if !strings.HasSuffix(name, extension) { + return time.Time{}, errors.New("unmatched extension") + } + + if !strings.HasPrefix(name, prefix) { + return time.Time{}, errors.New("unmatched prefix") + } + + timeStamp := name[len(prefix) : len(name)-len(extension)] + return time.Parse(timeFormat, timeStamp) +} + +// checkParam checks whether the parameters are correct +func checkParam(volume int, time int) error { + if volume != 0 { + if volume < minSaveVolume || volume > maxSaveVolume { + return fmt.Errorf("the value of savevolume is incorrect") + } + } + if time != 0 { + if time < minSaveTime || time > maxSaveTime { + return fmt.Errorf("the value of savetime is incorrect") + } + } + return nil +} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/rolog_test.go b/mind-cluster/component/ascend-common/common-utils/hwlog/rolog_test.go new file mode 100644 index 0000000..67807bd --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/hwlog/rolog_test.go @@ 
-0,0 +1,687 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package hwlog provides the capability of processing Huawei log rules. +package hwlog + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "testing" + "time" + + "github.com/agiledragon/gomonkey/v2" + "github.com/smartystreets/goconvey/convey" +) + +const ( + testDirPermission = 0700 + testFilePermission = 0600 + testMByte = 1 + testCapacity = 10 + testCapacity2 = 100 + testCapacity3 = 5 + testSaveTime = 10 + testSaveTime2 = 7 + testSaveVolume = 3 + testSaveVolume2 = 1 + fileCountOne = 1 + fileCountTwo = 2 + fileCountFour = 4 + waitTime = 50 + oneDayHour = 24 + sevenDays = 7 + fourteenDays = 14 + twentyOneDays = 21 + testYear = 2014 + testMonth = 5 + testDay = 4 + testHour = 14 + testMin = 44 + testSec = 33 + testNsec = 555000000 +) + +// TestCreate for test the function of create log file +func TestCreate(t *testing.T) { + convey.Convey("TestCreate", t, func() { + dir := makeTempDir("TestCrate") + defer os.RemoveAll(dir) + l := &Logs{ + FileName: getLogFile(dir), + } + defer l.Close() + + input := []byte("foobarfoobar!") + fileWrite(input, l) + existWithContent(input, getLogFile(dir)) + fileCount(fileCountOne, dir) + }) +} + +// TestOpenFile for test the function of open log file +func TestOpenFile(t *testing.T) { + convey.Convey("TestOpenFile", t, func() { + dir := makeTempDir("TestOpenFile") + defer 
os.RemoveAll(dir) + fileName := getLogFile(dir) + data := []byte("foo!") + err := ioutil.WriteFile(fileName, data, testFilePermission) + convey.So(err, convey.ShouldBeNil) + existWithContent(data, fileName) + + l := &Logs{ + FileName: fileName, + } + defer l.Close() + + b := []byte("boo!") + fileWrite(b, l) + existWithContent(append(data, b...), fileName) + fileCount(fileCountOne, dir) + }) +} + +// TestWriteTooLong for test the processing of the overlong write error +func TestWriteTooLong(t *testing.T) { + convey.Convey("TestWriteTooLong", t, func() { + mByte = testMByte + dir := makeTempDir("TestWriteTooLong") + defer os.RemoveAll(dir) + + l := &Logs{ + FileName: getLogFile(dir), + Capacity: testCapacity3, + } + defer l.Close() + + b := []byte("barrrrrrrrrrrrrrrrr!") + n, err := l.Write(b) + convey.So(err, convey.ShouldNotBeNil) + convey.So(0, convey.ShouldEqual, n) + convey.So(err.Error(), convey.ShouldEqual, fmt.Sprintf( + "the write lenth %d is greater than the maximum file size %d", len(b), l.Capacity)) + _, err = os.Stat(getLogFile(dir)) + convey.So(err, shouldNotBeExist) + }) +} + +// TestMakeLogDir for test the function of make log file directory +func TestMakeLogDir(t *testing.T) { + convey.Convey("TestMakeLogDir", t, func() { + dir := time.Now().Format("TestMakeLogDir" + timeFormat) + dir = filepath.Join(os.TempDir(), dir) + defer os.RemoveAll(dir) + + fileName := getLogFile(dir) + l := &Logs{ + FileName: fileName, + } + defer l.Close() + + b := []byte("boo!") + fileWrite(b, l) + existWithContent(b, getLogFile(dir)) + fileCount(fileCountOne, dir) + }) +} + +// TestDefaultFileName for test default log file name +func TestDefaultFileName(t *testing.T) { + convey.Convey("TestDefaultFileName", t, func() { + dir := os.TempDir() + fileName := filepath.Join(dir, filepath.Base(os.Args[0])+"-mindx-dl.log") + defer os.Remove(fileName) + + l := &Logs{} + defer l.Close() + + b := []byte("boo!") + fileWrite(b, l) + existWithContent(b, fileName) + }) +} + +// 
// TestAutoRoll checks automatic rolling: a write that would exceed the
// capacity moves the current file to a timestamped backup and starts a new
// log file.
func TestAutoRoll(t *testing.T) {
	convey.Convey("TestAutoRoll", t, func() {
		mByte = testMByte
		dir := makeTempDir("TestAutoRoll")
		defer os.RemoveAll(dir)
		currentTime := time.Now()

		fileName := getLogFile(dir)
		l := &Logs{
			FileName: fileName,
			Capacity: testCapacity,
		}
		defer l.Close()

		b := []byte("aoo!")
		fileWrite(b, l)
		existWithContent(b, fileName)
		fileCount(fileCountOne, dir)

		// From here on time.Now reports currentTime + 7 days, which fixes
		// the timestamp used in the backup file name.
		patch1 := gomonkey.ApplyFunc(time.Now, func() time.Time {
			time1 := currentTime
			return time1.Add(time.Hour * oneDayHour * sevenDays)
		})
		defer patch1.Reset()

		b2 := []byte("foooooo!")
		fileWrite(b2, l)
		existWithContent(b2, fileName)
		existWithContent(b, getBackupFile(dir, time.Now()))
		fileCount(fileCountTwo, dir)
	})
}

// TestFirstWriteRoll checks that the very first write already rolls when
// the existing log file plus the new data would exceed the capacity.
func TestFirstWriteRoll(t *testing.T) {
	convey.Convey("TestFirstWriteRoll", t, func() {
		mByte = testMByte
		dir := makeTempDir("TestFirstWriteRoll")
		defer os.RemoveAll(dir)
		currentTime := time.Now()

		fileName := getLogFile(dir)
		l := &Logs{
			FileName: fileName,
			Capacity: testCapacity,
		}
		defer l.Close()

		// Pre-existing log file content.
		start := []byte("boooooo!")
		err := ioutil.WriteFile(fileName, start, testFilePermission)
		convey.So(err, convey.ShouldBeNil)
		patch1 := gomonkey.ApplyFunc(time.Now, func() time.Time {
			time1 := currentTime
			return time1.Add(time.Hour * oneDayHour * sevenDays)
		})
		defer patch1.Reset()

		b := []byte("fooo!")
		fileWrite(b, l)
		existWithContent(b, fileName)
		existWithContent(start, getBackupFile(dir, time.Now()))
		fileCount(fileCountTwo, dir)
	})
}

// TestSaveVolumeCase1 checks that backups beyond SaveVolume are deleted
// after a roll.
func TestSaveVolumeCase1(t *testing.T) {
	convey.Convey("TestSaveVolumeCase1", t, func() {
		mByte = testMByte
		dir := makeTempDir("TestSaveVolumeCase1")
		defer os.RemoveAll(dir)
		currentTime := time.Now()

		fileName := getLogFile(dir)
		l := &Logs{
			FileName:   fileName,
			Capacity:   testCapacity,
			SaveVolume: testSaveVolume2,
		}
		defer l.Close()

		b := []byte("boo!")
		fileWrite(b, l)
		existWithContent(b, fileName)
		fileCount(fileCountOne, dir)

		// First roll, with the clock at +7 days.
		// NOTE(review): patch1 is reset explicitly below rather than via
		// defer, so it appears to stay active if an assertion in between
		// aborts the test; confirm against goconvey's failure handling.
		patch1 := gomonkey.ApplyFunc(time.Now, func() time.Time {
			time1 := currentTime
			return time1.Add(time.Hour * oneDayHour * sevenDays)
		})
		b2 := []byte("foooooo!")
		fileWrite(b2, l)
		secondFileName := getBackupFile(dir, time.Now())
		existWithContent(b, secondFileName)
		existWithContent(b2, fileName)
		fileCount(fileCountTwo, dir)

		// Second roll, with the clock at +14 days.
		patch1.Reset()
		patch2 := gomonkey.ApplyFunc(time.Now, func() time.Time {
			time2 := currentTime
			return time2.Add(time.Hour * oneDayHour * fourteenDays)
		})
		defer patch2.Reset()
		b3 := []byte("baaaaaar!")
		fileWrite(b3, l)
		thirdFileName := getBackupFile(dir, time.Now())
		existWithContent(b2, thirdFileName)
		existWithContent(b3, fileName)
		// Give the clean-up goroutine time to delete the surplus backup.
		<-time.After(time.Millisecond * waitTime)
		fileCount(fileCountTwo, dir)
		existWithContent(b2, thirdFileName)
		convey.So(secondFileName, shouldNotExist)
	})
}

// TestSaveVolumeCase2 checks that the SaveVolume clean-up only touches
// backup files: an unrelated file and a directory named like a backup must
// survive.
func TestSaveVolumeCase2(t *testing.T) {
	convey.Convey("TestSaveVolumeCase2", t, func() {
		mByte = testMByte
		dir := makeTempDir("TestSaveVolumeCase2")
		defer os.RemoveAll(dir)
		currentTime := time.Now()

		fileName := getLogFile(dir)
		l := &Logs{FileName: fileName, Capacity: testCapacity, SaveVolume: testSaveVolume2}
		defer l.Close()

		b := []byte("boo!")
		fileWrite(b, l)
		patch1 := gomonkey.ApplyFunc(time.Now, func() time.Time {
			time1 := currentTime
			return time1.Add(time.Hour * oneDayHour * sevenDays)
		})
		b2 := []byte("baaaaaar!")
		fileWrite(b2, l)
		secondFileName := getBackupFile(dir, time.Now())

		patch1.Reset()
		patch2 := gomonkey.ApplyFunc(time.Now, func() time.Time {
			time2 := currentTime
			return time2.Add(time.Hour * oneDayHour * fourteenDays)
		})
		// A file that does not match the backup naming scheme ...
		notLogFile := getLogFile(dir) + ".foo"
		err := ioutil.WriteFile(notLogFile, []byte("data"), testFilePermission)
		convey.So(err, convey.ShouldBeNil)
		// ... and a directory whose name looks exactly like a backup file.
		notLogFileDir := getBackupFile(dir, time.Now())
		err = os.Mkdir(notLogFileDir, testDirPermission)
		convey.So(err, convey.ShouldBeNil)

		patch2.Reset()
		patch3 := gomonkey.ApplyFunc(time.Now, func() time.Time {
			time3 := currentTime
			return time3.Add(time.Hour * oneDayHour * twentyOneDays)
		})
		defer patch3.Reset()
		thirdFileName := getBackupFile(dir, time.Now())
		b3 := []byte("baaaaaaz!")
		fileWrite(b3, l)
		existWithContent(b2, thirdFileName)
		<-time.After(time.Millisecond * waitTime)
		// Expected entries: log file, newest backup, the .foo file and the directory.
		fileCount(fileCountFour, dir)
		existWithContent(b3, fileName)
		convey.So(secondFileName, shouldNotExist)
		convey.So(notLogFile, shouldExist)
		convey.So(notLogFileDir, shouldExist)
	})
}

// TestCleanupExistingBackupFiles checks that backups which already exist
// before the logger is created are subject to the SaveVolume clean-up too.
func TestCleanupExistingBackupFiles(t *testing.T) {
	convey.Convey("TestCleanupExistingBackupFiles", t, func() {
		mByte = testMByte
		dir := makeTempDir("TestCleanupExistingBackupFiles")
		defer os.RemoveAll(dir)
		currentTime := time.Now()

		// Two pre-existing backups (now and +7 days) plus a log file.
		data := []byte("data")
		backup := getBackupFile(dir, time.Now())
		err := ioutil.WriteFile(backup, data, testFilePermission)
		convey.So(err, convey.ShouldBeNil)

		patch1 := gomonkey.ApplyFunc(time.Now, func() time.Time {
			time1 := currentTime
			return time1.Add(time.Hour * oneDayHour * sevenDays)
		})
		backup = getBackupFile(dir, time.Now())
		err = ioutil.WriteFile(backup, data, testFilePermission)
		convey.So(err, convey.ShouldBeNil)
		fileName := getLogFile(dir)
		err = ioutil.WriteFile(fileName, data, testFilePermission)
		convey.So(err, convey.ShouldBeNil)

		l := &Logs{
			FileName:   fileName,
			Capacity:   testCapacity,
			SaveVolume: testSaveVolume2,
		}
		defer l.Close()

		patch1.Reset()
		patch2 := gomonkey.ApplyFunc(time.Now, func() time.Time {
			time2 := currentTime
			return time2.Add(time.Hour * oneDayHour * fourteenDays)
		})
		defer patch2.Reset()
		b2 := []byte("foooooo!")
		fileWrite(b2, l)

		<-time.After(time.Millisecond * waitTime)

		fileCount(fileCountTwo, dir)
	})
}

// TestSaveTime checks that backups whose timestamp is at least SaveTime
// days old are deleted after a roll.
func TestSaveTime(t *testing.T) {
	convey.Convey("TestSaveTime", t, func() {
		mByte = testMByte
		dir := makeTempDir("TestSaveTime")
		defer os.RemoveAll(dir)
		currentTime := time.Now()

		fileName := getLogFile(dir)
		l := &Logs{
			FileName: fileName,
			Capacity: testCapacity,
			SaveTime: testSaveTime2,
		}
		defer l.Close()

		patch1 := gomonkey.ApplyFunc(time.Now, func() time.Time {
			time1 := currentTime
			return time1.Add(time.Hour * oneDayHour * sevenDays)
		})
		b := []byte("zoo!")
		fileWrite(b, l)
		existWithContent(b, fileName)
		fileCount(fileCountOne, dir)

		// Clock at +14 days: the write rolls and the backup is stamped "now",
		// so it is not old enough to be removed.
		patch1.Reset()
		patch2 := gomonkey.ApplyFunc(time.Now, func() time.Time {
			time2 := currentTime
			return time2.Add(time.Hour * oneDayHour * fourteenDays)
		})
		b2 := []byte("foooooo!")
		fileWrite(b2, l)
		existWithContent(b, getBackupFile(dir, time.Now()))

		<-time.After(waitTime * time.Millisecond)

		fileCount(fileCountTwo, dir)
		existWithContent(b2, fileName)
		existWithContent(b, getBackupFile(dir, time.Now()))

		// Clock at +21 days: the +14 days backup is now 7 days old and is
		// expected to be gone after the clean-up, leaving two files again.
		patch2.Reset()
		patch3 := gomonkey.ApplyFunc(time.Now, func() time.Time {
			time3 := currentTime
			return time3.Add(time.Hour * oneDayHour * twentyOneDays)
		})
		defer patch3.Reset()
		b3 := []byte("baaaaar!")
		fileWrite(b3, l)
		existWithContent(b2, getBackupFile(dir, time.Now()))

		<-time.After(waitTime * time.Millisecond)

		fileCount(fileCountTwo, dir)
		existWithContent(b3, fileName)
		existWithContent(b2, getBackupFile(dir, time.Now()))
	})
}

// TestOldLogFilesList checks that oldFilesList returns only the backup
// files, ordered newest first.
func TestOldLogFilesList(t *testing.T) {
	convey.Convey("TestOldLogFilesList", t, func() {
		mByte = testMByte
		dir := makeTempDir("TestOldLogFiles")
		defer os.RemoveAll(dir)
		currentTime := time.Now()

		fileName := getLogFile(dir)
		data := []byte("data")
		// NOTE(review): the files in this test are written with
		// testDirPermission (0700); testFilePermission looks intended - confirm.
		err := ioutil.WriteFile(fileName, data, testDirPermission)
		convey.So(err, convey.ShouldBeNil)
		// Round-trip through timeFormat so t1 has the same precision as the
		// timestamp parsed back from the file name.
		t1, err := time.Parse(timeFormat, currentTime.UTC().Format(timeFormat))
		convey.So(err, convey.ShouldBeNil)
		backup := getBackupFile(dir, currentTime)
		err = ioutil.WriteFile(backup, data, testDirPermission)
		convey.So(err, convey.ShouldBeNil)

		patch := gomonkey.ApplyFunc(time.Now, func() time.Time {
			time1 := currentTime
			return time1.Add(time.Hour * oneDayHour * sevenDays)
		})
		defer patch.Reset()
		t2, err := time.Parse(timeFormat, time.Now().UTC().Format(timeFormat))
		convey.So(err, convey.ShouldBeNil)
		backup2 := getBackupFile(dir, time.Now())
		err = ioutil.WriteFile(backup2, data, testDirPermission)
		convey.So(err, convey.ShouldBeNil)

		l := &Logs{FileName: fileName}
		files, err := l.oldFilesList()
		convey.So(err, convey.ShouldBeNil)
		convey.So(fileCountTwo, convey.ShouldEqual, len(files))
		// The newer backup (t2) must come first.
		convey.So(t2, convey.ShouldEqual, files[0].timeStamp)
		convey.So(t1, convey.ShouldEqual, files[1].timeStamp)
	})
}

// TestExtractTime checks the parsing of the timestamp embedded in backup
// file names, including names with a missing extension, a missing prefix
// or no timestamp at all.
func TestExtractTime(t *testing.T) {
	convey.Convey("TestExtractTime", t, func() {
		l := &Logs{FileName: "/var/log/myfoo/foo.log"}
		prefix, extention := l.getPreAndExt()

		tests := []struct {
			fileName string
			want     time.Time
			wantErr  bool
		}{
			{"foo-2014-05-04T14-44-33.555.log", time.Date(
				testYear, testMonth, testDay, testHour, testMin, testSec, testNsec, time.UTC), false},
			{"foo-2014-05-04T14-44-33.555", time.Time{}, true},
			{"2014-05-04T14-44-33.555.log", time.Time{}, true},
			{"foo.log", time.Time{}, true},
		}

		for _, test := range tests {
			got, err := l.extractTime(test.fileName, prefix, extention)
			convey.So(got, convey.ShouldEqual, test.want)
			convey.So(err != nil, convey.ShouldEqual, test.wantErr)
		}
	})
}

// TestLocalTime for test the situation that current
// time is the local time: with LocalOrUTC set, the backup file name must
// carry the local timestamp instead of the UTC one.
func TestLocalTime(t *testing.T) {
	convey.Convey("TestLocalTime", t, func() {
		mByte = testMByte
		dir := makeTempDir("TestLocalTime")
		defer os.RemoveAll(dir)
		currentTime := time.Now()

		l := &Logs{
			FileName:   getLogFile(dir),
			Capacity:   testCapacity,
			LocalOrUTC: true,
		}
		defer l.Close()

		// Freeze the clock so the backup name is predictable.
		patch := gomonkey.ApplyFunc(time.Now, func() time.Time {
			return currentTime
		})
		defer patch.Reset()
		b := []byte("boo!")
		fileWrite(b, l)

		b2 := []byte("fooooooo!")
		fileWrite(b2, l)
		existWithContent(b2, getLogFile(dir))
		existWithContent(b, getBackupFileLocal(dir, currentTime))
	})
}

// TestRoll checks explicit rolling through Roll, including the SaveVolume
// clean-up that follows each roll.
func TestRoll(t *testing.T) {
	convey.Convey("TestRoll", t, func() {
		dir := makeTempDir("TestRotate")
		defer os.RemoveAll(dir)
		currentTime := time.Now()

		fileName := getLogFile(dir)
		l := &Logs{
			FileName:   fileName,
			SaveVolume: testSaveVolume2,
			Capacity:   testCapacity2, // megabytes
		}
		defer l.Close()

		patch1 := gomonkey.ApplyFunc(time.Now, func() time.Time {
			time1 := currentTime
			return time1.Add(time.Hour * oneDayHour * sevenDays)
		})
		b := []byte("boo!")
		fileWrite(b, l)
		existWithContent(b, fileName)
		fileCount(fileCountOne, dir)

		// First explicit roll at +14 days: the written data moves to the
		// backup and the log file starts empty.
		patch1.Reset()
		patch2 := gomonkey.ApplyFunc(time.Now, func() time.Time {
			time2 := currentTime
			return time2.Add(time.Hour * oneDayHour * fourteenDays)
		})
		err := l.Roll()
		convey.So(err, convey.ShouldBeNil)

		<-time.After(waitTime * time.Millisecond)

		filename2 := getBackupFile(dir, time.Now())
		existWithContent(b, filename2)
		existWithContent([]byte{}, fileName)
		fileCount(fileCountTwo, dir)

		// Second explicit roll at +21 days: the empty log file becomes the
		// only remaining backup.
		patch2.Reset()
		patch3 := gomonkey.ApplyFunc(time.Now, func() time.Time {
			time3 := currentTime
			return time3.Add(time.Hour * oneDayHour * twentyOneDays)
		})
		defer patch3.Reset()
		err = l.Roll()
		convey.So(err, convey.ShouldBeNil)

		<-time.After(waitTime * time.Millisecond)

		filename3 := getBackupFile(dir, time.Now())
		existWithContent([]byte{}, filename3)
		existWithContent([]byte{}, fileName)
		fileCount(fileCountTwo, dir)

		b2 := []byte("foooooo!")
		fileWrite(b2, l)
		existWithContent(b2, fileName)
	})
}

// TestJson checks that a Logs value can be populated from its JSON
// configuration form.
func TestJson(t *testing.T) {
	convey.Convey("TestJson", t, func() {
		// [1:] drops the newline that follows the opening back quote.
		data := []byte(`
{
	"filename": "foo",
	"capacity": 10,
	"savetime": 10,
	"savevolume": 3,
	"localorutc": true
}`[1:])

		l := Logs{}
		err := json.Unmarshal(data, &l)
		convey.So(err, convey.ShouldBeNil)
		convey.So("foo", convey.ShouldEqual, l.FileName)
		convey.So(testCapacity, convey.ShouldEqual, l.Capacity)
		convey.So(testSaveTime, convey.ShouldEqual, l.SaveTime)
		convey.So(testSaveVolume, convey.ShouldEqual, l.SaveVolume)
		convey.So(true, convey.ShouldEqual, l.LocalOrUTC)
	})
}

// makeTempDir creates a directory in the OS temp directory whose name is
// built from name and the current time, asserts that the creation succeeded
// and returns its path.
// NOTE(review): name is passed through time.Format as part of the layout,
// so digits in it (e.g. the "1" in "TestSaveVolumeCase1") are presumably
// interpreted as layout elements - confirm this is acceptable.
func makeTempDir(name string) string {
	dir := time.Now().Format(name + timeFormat)
	dir = filepath.Join(os.TempDir(), dir)
	err := os.Mkdir(dir, testDirPermission)
	convey.So(err, convey.ShouldBeNil)
	return dir
}

// existWithContent asserts that the file at path dir exists and that its
// size and content equal content.
func existWithContent(content []byte, dir string) {
	info, err := os.Stat(dir)
	convey.So(err, convey.ShouldBeNil)
	convey.So(int64(len(content)), convey.ShouldEqual, info.Size())

	b, err := ioutil.ReadFile(dir)
	convey.So(err, convey.ShouldBeNil)
	convey.So(content, convey.ShouldResemble, b)
}

// getLogFile returns the path of the log file ("foobar.log") inside dir.
func getLogFile(dir string) string {
	return filepath.Join(dir, "foobar.log")
}

// getBackupFile returns the backup path in dir for a roll that happened at
// t, with the timestamp rendered in UTC.
func getBackupFile(dir string, t time.Time) string {
	return filepath.Join(dir, "foobar-"+t.UTC().Format(timeFormat)+".log")
}

// getBackupFileLocal is like getBackupFile but renders the timestamp in the
// location t already carries (no conversion to UTC).
func getBackupFileLocal(dir string, t time.Time) string {
	return filepath.Join(dir, "foobar-"+t.Format(timeFormat)+".log")
}

// fileCount checks that the number of files in the
directory is exp. +func fileCount(exp int, dir string) { + files, err := ioutil.ReadDir(dir) + convey.So(err, convey.ShouldBeNil) + convey.So(len(files), convey.ShouldEqual, exp) +} + +func fileWrite(b []byte, l *Logs) { + n, err := l.Write(b) + convey.So(err, convey.ShouldBeNil) + convey.So(len(b), convey.ShouldEqual, n) +} + +func shouldNotBeExist(actual interface{}, expected ...interface{}) string { + err, ok := actual.(error) + if !ok { + return "incorrect parameter type" + } + if os.IsNotExist(err) { + return "" + } + return "File exists, but should not have been created" +} +func shouldNotExist(actual interface{}, expected ...interface{}) string { + path, ok := actual.(string) + if !ok { + return "incorrect parameter type" + } + _, err := os.Stat(path) + if os.IsNotExist(err) { + return "" + } + return fmt.Sprintf("expected to get os.IsNotExist, but instead got %v", err) +} + +func shouldExist(actual interface{}, expected ...interface{}) string { + path, ok := actual.(string) + if !ok { + return "incorrect parameter type" + } + _, err := os.Stat(path) + if err != nil { + return fmt.Sprintf("expected file to exist, but got error from os.Stat: %v", err) + } + return "" +} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/types.go b/mind-cluster/component/ascend-common/common-utils/hwlog/types.go new file mode 100644 index 0000000..e97c80b --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/hwlog/types.go @@ -0,0 +1,49 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package hwlog provides the capability of processing Huawei log rules. +package hwlog + +import "errors" + +// ContextKey especially for context value +// to solve problem of "should not use basic type untyped string as key in context.WithValue" +type ContextKey string + +// String the implement of String method +func (c ContextKey) String() string { + return string(c) +} + +const ( + // UserID used for context value key of "ID" + UserID ContextKey = "UserID" + // ReqID used for context value key of "requestID" + ReqID ContextKey = "RequestID" + // extraDeepKey used for context value key of "extraDeepKey" + extraDeepKey ContextKey = "extraDeepKey" +) + +// SelfLogWriter used this to replace some opensource log +type SelfLogWriter struct { +} + +// Write implement the interface of io.writer +func (l *SelfLogWriter) Write(p []byte) (int, error) { + if RunLog == nil { + return -1, errors.New("hwlog is not initialized") + } + RunLog.Info(string(p)) + return len(p), nil +} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/utils.go b/mind-cluster/component/ascend-common/common-utils/hwlog/utils.go new file mode 100644 index 0000000..40955f4 --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/hwlog/utils.go @@ -0,0 +1,98 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package hwlog provides the capability of processing Huawei log rules. +package hwlog + +import ( + "bytes" + "context" + "fmt" + "log" + "runtime" + "strings" +) + +// printHelper helper function for log printing +func printHelper(lg *log.Logger, msg string, maxLogLength int, ctx ...context.Context) { + str := getCallerInfo(ctx...) + trimMsg := strings.Replace(msg, "\r", " ", -1) + trimMsg = strings.Replace(trimMsg, "\n", " ", -1) + runeArr := []rune(trimMsg) + if length := len(runeArr); length > maxLogLength { + trimMsg = string(runeArr[:maxLogLength]) + } + lg.Println(str + trimMsg) +} + +// getCallerInfo gets the caller's information +func getCallerInfo(ctx ...context.Context) string { + var deep = stackDeep + var userID interface{} + var traceID interface{} + for _, c := range ctx { + if c == nil { + deep++ + continue + } + userID = c.Value(UserID) + traceID = c.Value(ReqID) + if val := c.Value(extraDeepKey); val != nil { + currentVal, _ := val.(int) // security type assertions, invalid values are automatically zeroed + deep += currentVal + } + } + var funcName string + pc, codePath, codeLine, ok := runtime.Caller(deep) + if ok { + funcName = runtime.FuncForPC(pc).Name() + } + p := strings.Split(codePath, "/") + l := len(p) + if l == pathLen { + funcName = p[l-1] + } else if l > pathLen { + funcName = fmt.Sprintf("%s/%s", p[l-pathLen], p[l-1]) + } + callerPath := fmt.Sprintf("%s:%d", funcName, codeLine) + goroutineID := getGoroutineID() + str := fmt.Sprintf("%-8s%s ", goroutineID, callerPath) + if userID != nil || traceID != nil { + str = fmt.Sprintf("%s{%#v}-{%#v} ", str, userID, traceID) + } + return str +} + +// getCallerGoroutineID gets the goroutineID +func getGoroutineID() string { + b := make([]byte, bitsize, bitsize) + b = b[:runtime.Stack(b, false)] + b = bytes.TrimPrefix(b, []byte("goroutine ")) + b = b[:bytes.IndexByte(b, ' ')] + return string(b) +} + +// DeepIncrease increases the stack depth by 1 +func DeepIncrease(ctx context.Context) 
context.Context { + if ctx == nil { + return context.WithValue(context.Background(), extraDeepKey, 1) + } + + var currentVal int + if val := ctx.Value(extraDeepKey); val != nil { + currentVal, _ = val.(int) // security type assertions, invalid values are automatically zeroed + } + + return context.WithValue(ctx, extraDeepKey, currentVal+1) +} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/utils_test.go b/mind-cluster/component/ascend-common/common-utils/hwlog/utils_test.go new file mode 100644 index 0000000..ca2bda2 --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/hwlog/utils_test.go @@ -0,0 +1,38 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package hwlog test file +package hwlog + +import ( + "context" + "testing" + + "github.com/smartystreets/goconvey/convey" +) + +func TestUtilsFunc(t *testing.T) { + convey.Convey("test utils", t, func() { + convey.Convey("test utils func", func() { + lg := new(logger) + conf := &LogConfig{OnlyToStdout: true} + userCtx := context.TODO() + userCtx = context.WithValue(userCtx, UserID, 0) + userCtx = context.WithValue(userCtx, ReqID, 0) + err := lg.setLogger(conf) + convey.So(err, convey.ShouldBeNil) + printHelper(lg.lgInfo, "test", defaultMaxEachLineLen) + }) + }) +} diff --git a/mind-cluster/component/ascend-common/common-utils/limiter/limit_handler.go b/mind-cluster/component/ascend-common/common-utils/limiter/limit_handler.go new file mode 100644 index 0000000..fdab9a8 --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/limiter/limit_handler.go @@ -0,0 +1,226 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package limiter implement a token bucket limiter +package limiter + +import ( + "context" + "errors" + "fmt" + "math" + "net/http" + "regexp" + "strconv" + "strings" + "syscall" + "time" + + "ascend-common/common-utils/cache" + "ascend-common/common-utils/hwlog" + "ascend-common/common-utils/utils" +) + +const ( + kilo = 1000.0 + // DefaultDataLimit default http body limit size + DefaultDataLimit = 1024 * 1024 * 10 + defaultMaxConcurrency = 1024 + maxStringLen = 20 + // DefaultCacheSize default cache size + DefaultCacheSize = 1024 * 100 + arrLen = 2 + // IPReqLimitReg ip request limit regex string + IPReqLimitReg = "^[1-9]\\d{0,2}/[1-9]\\d{0,2}$" +) + +type limitHandler struct { + concurrency chan struct{} + httpHandler http.Handler + log bool + method string + limitBytes int64 + ipExpiredTime time.Duration + ipCache *cache.ConcurrencyLRUCache +} + +// HandlerConfig the configuration of the limitHandler +type HandlerConfig struct { + // PrintLog whether you need print access log, when use gin framework, suggest to set false,otherwise set true + PrintLog bool + // Method only allow setting http method pass + Method string + // LimitBytes set the max http body size + LimitBytes int64 + // TotalConCurrency set the program total concurrent http request + TotalConCurrency int + // IPConCurrency set the signle IP concurrent http request "2/1sec" + IPConCurrency string + // CacheSize the local cacheSize + CacheSize int +} + +// StatusResponseWriter the writer record the http status +type StatusResponseWriter struct { + http.ResponseWriter + http.Hijacker + Status int +} + +// WriteHeader override the WriteHeader method +func (w *StatusResponseWriter) WriteHeader(status int) { + w.ResponseWriter.WriteHeader(status) + w.Status = status +} + +// ServeHTTP implement http.Handler +func (h *limitHandler) ServeHTTP(w http.ResponseWriter, req *http.Request) { + req.Body = http.MaxBytesReader(w, req.Body, h.limitBytes) + ctx := initContext(req) + path := req.URL.Path + 
clientUserAgent := req.UserAgent() + clientIP := utils.ClientIP(req) + if clientIP != "" && h.ipCache != nil { + if !h.ipCache.SetIfNX(fmt.Sprintf("key-%s", clientIP), "v", h.ipExpiredTime) { + hwlog.RunLog.WarnfWithCtx(ctx, "Single IP request reject:%s: %s <%3d> |%15s |%s |%d ", req.Method, + path, http.StatusServiceUnavailable, clientIP, clientUserAgent, syscall.Getuid()) + http.Error(w, "503 too busy", http.StatusServiceUnavailable) + return + } + } + select { + case _, ok := <-h.concurrency: + if !ok { + // channel closed and no need return token + return + } + if h.method != "" && req.Method != h.method { + http.NotFound(w, req) + // recover token to the bucket + h.concurrency <- struct{}{} + return + } + hwlog.RunLog.Debugf("token count:%d", len(h.concurrency)) + start := time.Now() + statusRes := newResponse(w) + h.httpHandler.ServeHTTP(statusRes, req) + stop := time.Since(start) + h.concurrency <- struct{}{} + latency := int(math.Ceil(float64(stop.Nanoseconds()) / kilo / kilo)) + if h.log { + hwlog.RunLog.InfofWithCtx(ctx, "%s %s: %s <%3d> (%dms) |%15s |%s |%d", req.Proto, req.Method, path, + statusRes.Status, latency, clientIP, clientUserAgent, syscall.Getuid()) + } + default: + hwlog.RunLog.WarnfWithCtx(ctx, "Total reject request:%s: %s <%3d> |%15s |%s |%d ", req.Method, path, + http.StatusServiceUnavailable, clientIP, clientUserAgent, syscall.Getuid()) + http.Error(w, "503 too busy", http.StatusServiceUnavailable) + } +} + +func newResponse(w http.ResponseWriter) *StatusResponseWriter { + jk, ok := w.(http.Hijacker) + if !ok { + hwlog.RunLog.Warn("hijack not implement") + } + statusRes := &StatusResponseWriter{ + ResponseWriter: w, + Status: http.StatusOK, + Hijacker: jk, + } + return statusRes +} + +func initContext(req *http.Request) context.Context { + ctx := context.Background() + reqID := req.Header.Get(hwlog.ReqID.String()) + if reqID != "" { + ctx = context.WithValue(context.Background(), hwlog.ReqID, reqID) + } + id := 
req.Header.Get(hwlog.UserID.String()) + if id != "" { + ctx = context.WithValue(ctx, hwlog.UserID, id) + } + return ctx +} + +// NewLimitHandler new a bucket-token limiter +func NewLimitHandler(maxConcur, maxConcurrency int, handler http.Handler, printLog bool) (http.Handler, error) { + return NewLimitHandlerWithMethod(maxConcur, maxConcurrency, handler, printLog, "") +} + +// NewLimitHandlerWithMethod new a bucket-token limiter with specific http method +func NewLimitHandlerWithMethod(maxConcur, maxConcurrency int, handler http.Handler, printLog bool, + httpMethod string) (http.Handler, error) { + if maxConcur < 1 || maxConcur > maxConcurrency { + return nil, errors.New("maxConcurrency parameter error") + } + conchan := make(chan struct{}, maxConcur) + return createHandler(conchan, handler, printLog, httpMethod, DefaultDataLimit), nil +} + +func createHandler(ch chan struct{}, handler http.Handler, printLog bool, + httpMethod string, bodySizeLimit int64) *limitHandler { + h := &limitHandler{ + concurrency: ch, + httpHandler: handler, + log: printLog, + method: httpMethod, + limitBytes: bodySizeLimit, + ipExpiredTime: time.Duration(-1), + } + for i := 0; i < cap(ch); i++ { + h.concurrency <- struct{}{} + } + return h +} + +// NewLimitHandlerV2 new a bucket-token limiter which contains limit request by IP +func NewLimitHandlerV2(handler http.Handler, conf *HandlerConfig) (http.Handler, error) { + if conf == nil { + return nil, errors.New("parameter error") + } + if conf.TotalConCurrency < 1 || conf.TotalConCurrency > defaultMaxConcurrency { + return nil, errors.New("totalConCurrency parameter error") + } + if len(conf.Method) > maxStringLen { + return nil, errors.New("method parameter error") + } + if conf.CacheSize <= 0 { + hwlog.RunLog.Info("use default cache size") + conf.CacheSize = DefaultCacheSize + } + reg := regexp.MustCompile(IPReqLimitReg) + if !reg.Match([]byte(conf.IPConCurrency)) { + return nil, errors.New("IPConCurrency parameter error") + } + conchan 
:= make(chan struct{}, conf.TotalConCurrency) + h := createHandler(conchan, handler, conf.PrintLog, conf.Method, conf.LimitBytes) + arr := strings.Split(conf.IPConCurrency, "/") + if len(arr) != arrLen || arr[0] == "0" { + return nil, errors.New("IPConCurrency parameter error") + } + arr1, err := strconv.ParseInt(arr[1], 0, 0) + if err != nil { + return nil, fmt.Errorf("IPConCurrency parameter(%s) error, parse to int failed: %v", arr[1], err) + } + arr0, err := strconv.ParseInt(arr[0], 0, 0) + if err != nil || arr0 == 0 { + return nil, fmt.Errorf("IPConCurrency parameter(%s) error,parse to int failed: %v", arr[0], err) + } + h.ipExpiredTime = time.Duration(arr1 * int64(time.Second) / arr0) + h.ipCache = cache.New(DefaultCacheSize) + return h, nil + +} diff --git a/mind-cluster/component/ascend-common/common-utils/limiter/limit_handler_test.go b/mind-cluster/component/ascend-common/common-utils/limiter/limit_handler_test.go new file mode 100644 index 0000000..69dbb8e --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/limiter/limit_handler_test.go @@ -0,0 +1,119 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package limiter implement a token bucket limiter +package limiter + +import ( + "context" + "net/http" + "net/url" + "testing" + + "github.com/agiledragon/gomonkey/v2" + "github.com/smartystreets/goconvey/convey" + + "ascend-common/common-utils/hwlog" +) + +func init() { + config := hwlog.LogConfig{ + OnlyToStdout: true, + } + hwlog.InitRunLogger(&config, context.TODO()) +} +func TestServeHTTP(t *testing.T) { + convey.Convey("test limitHandler serveHTTP", t, func() { + h, w, r := initVarable() + convey.Convey("header contains reqID and userID,", func() { + mock := gomonkey.ApplyMethodFunc(h.httpHandler, "ServeHTTP", func(http.ResponseWriter, + *http.Request) { + return + }) + defer mock.Reset() + h.ServeHTTP(w.ResponseWriter, r) + convey.So(len(h.concurrency), convey.ShouldEqual, 1) + }) + convey.Convey("token channel close,", func() { + mock := gomonkey.ApplyFunc(http.Error, func(http.ResponseWriter, string, int) { + return + }) + defer mock.Reset() + _, ok := <-h.concurrency + if !ok { + return + } + h.ServeHTTP(w.ResponseWriter, r) + convey.So(len(h.concurrency), convey.ShouldEqual, 0) + }) + }) +} + +func initVarable() (*limitHandler, StatusResponseWriter, *http.Request) { + lh, err := NewLimitHandler(1, len2, http.DefaultServeMux, false) + if err != nil { + return nil, StatusResponseWriter{}, nil + } + v, ok := lh.(*limitHandler) + if !ok { + return nil, StatusResponseWriter{}, nil + } + w := StatusResponseWriter{ + ResponseWriter: nil, + Status: 0, + } + r := &http.Request{ + URL: &url.URL{ + Path: "test.com", + }, + Header: map[string][]string{"userID": {"1"}, "reqID": {"requestIDxxxx"}}, + Method: "GET", + } + return v, w, r +} + +func TestNewLimitHandlerV2(t *testing.T) { + conf := &HandlerConfig{ + PrintLog: false, + Method: "", + LimitBytes: DefaultDataLimit, + TotalConCurrency: defaultMaxConcurrency, + IPConCurrency: "2/1", + CacheSize: DefaultCacheSize, + } + convey.Convey("normal situation,no err return", t, func() { + _, err := 
NewLimitHandlerV2(http.DefaultServeMux, conf) + convey.So(err, convey.ShouldEqual, nil) + }) + convey.Convey("IPConCurrency parameter error", t, func() { + conf.IPConCurrency = "2021/1" + _, err := NewLimitHandlerV2(http.DefaultServeMux, conf) + convey.So(err, convey.ShouldNotEqual, nil) + }) + convey.Convey("cacheSize parameter error", t, func() { + conf.CacheSize = 0 + _, err := NewLimitHandlerV2(http.DefaultServeMux, conf) + convey.So(err, convey.ShouldNotEqual, nil) + }) + convey.Convey("method parameter error", t, func() { + conf.Method = "20/iajsdkjas2jhjdklsjkldjsdfasd1" + _, err := NewLimitHandlerV2(http.DefaultServeMux, conf) + convey.So(err, convey.ShouldNotEqual, nil) + }) + convey.Convey("TotalConCurrency parameter error", t, func() { + conf.TotalConCurrency = 0 + _, err := NewLimitHandlerV2(http.DefaultServeMux, conf) + convey.So(err, convey.ShouldNotEqual, nil) + }) +} diff --git a/mind-cluster/component/ascend-common/common-utils/limiter/limit_listener.go b/mind-cluster/component/ascend-common/common-utils/limiter/limit_listener.go new file mode 100644 index 0000000..b81d511 --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/limiter/limit_listener.go @@ -0,0 +1,161 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package limiter implement a token bucket limit listener, refer to "golang.org/x/net/netutil" and +// change the acquire method, if acquire failed, return false immediately +package limiter + +import ( + "errors" + "fmt" + "net" + "strings" + "sync" + "time" + + "ascend-common/common-utils/cache" + "ascend-common/common-utils/hwlog" +) + +const ( + maxConnection = 1024 + maxIPConnection = 512 + + largeMaxConnection = 16384 +) + +func commonLimitListener(l net.Listener, totalConnLimit, IPConnLimit, cacheSize int) (net.Listener, error) { + if IPConnLimit < 0 || IPConnLimit > maxIPConnection { + return nil, errors.New("the parameter IPConnLimit is illegal") + } + bucket := make(chan struct{}, totalConnLimit) + ll := &localLimitListener{ + Listener: l, + buckets: bucket, + ipConnLimit: int64(IPConnLimit), + } + if cacheSize > 0 { + ll.ipCache = cache.New(cacheSize) + } + return ll, nil +} + +// LimitListener returns a Listener that accepts at most n connections at the same time +func LimitListener(l net.Listener, totalConnLimit, IPConnLimit, cacheSize int) (net.Listener, error) { + if totalConnLimit < 0 || totalConnLimit > maxConnection { + return nil, errors.New("the parameter totalConnLimit is illegal") + } + return commonLimitListener(l, totalConnLimit, IPConnLimit, cacheSize) +} + +type localLimitListener struct { + net.Listener + buckets chan struct{} + closeOnce sync.Once + ipCache *cache.ConcurrencyLRUCache + ipConnLimit int64 +} + +// acquire acquires the limiting semaphore. 
Returns true if successfully +// accquired, false if the listener is closed or reach the max limit +func (l *localLimitListener) acquire() bool { + select { + case l.buckets <- struct{}{}: + return true + default: + return false + } +} +func (l *localLimitListener) release() { <-l.buckets } + +// Accept implement net.Listener interface +func (l *localLimitListener) Accept() (net.Conn, error) { + c, err := l.Listener.Accept() + if err != nil { + return nil, err + } + // ip connection limit + ip, cacheKey := getIpAndKey(c) + if ip != "" && l.ipCache != nil { + if counts, err := l.ipCache.INCR(cacheKey, -1); err == nil && counts > l.ipConnLimit { + hwlog.RunLog.Warn("ip connections reach max limit, connection will to force closed") + return closeImmediately(c, l.ipCache), nil + } + } + // total tcp connection limit + if l.acquire() { + return &limitListenerConn{Conn: c, release: l.release, ipCache: l.ipCache}, nil + } + hwlog.RunLog.Warn("limit forbidden, connection will to force closed") + return closeImmediately(c, l.ipCache), nil + +} + +func getIpAndKey(c net.Conn) (string, string) { + ipWithPort := c.RemoteAddr().String() + if ipWithPort != "" { + s := strings.Split(ipWithPort, ":") + return s[0], fmt.Sprintf("key-conn-%s", s[0]) + } + return "", "" +} + +func closeImmediately(c net.Conn, lruCache *cache.ConcurrencyLRUCache) net.Conn { + // once the connection reach the max limit, force close the connection + tcpConn, ok := c.(*net.TCPConn) + if ok { + if err := tcpConn.SetLinger(0); err != nil { + hwlog.RunLog.Warnf("Error when setting linger: %s", err) + } + } + + err := c.Close() + if err != nil { + hwlog.RunLog.Warn(err) + } + return &limitListenerConn{Conn: c, release: func() {}, ipCache: lruCache} +} + +// Close implement net.Listener interface +func (l *localLimitListener) Close() error { + err := l.Listener.Close() + l.closeOnce.Do(func() { close(l.buckets) }) + return err +} + +type limitListenerConn struct { + net.Conn + releaseOnce sync.Once + release 
func() + ipCache *cache.ConcurrencyLRUCache +} + +// Close override net.Conn interface +func (l *limitListenerConn) Close() error { + err := l.Conn.Close() + if err != nil { + hwlog.RunLog.Debugf("close grpc connect failed: %v", err) + return fmt.Errorf("close grpc connect failed: %v", err) + } + l.releaseOnce.Do(l.release) + ip, cacheKey := getIpAndKey(l.Conn) + if ip != "" && l.ipCache != nil { + d, err := l.ipCache.DECR(cacheKey, time.Hour) + if err != nil { + hwlog.RunLog.Error(err) + } + hwlog.RunLog.Debugf("decrement ip connections %d", d) + } + return err +} diff --git a/mind-cluster/component/ascend-common/common-utils/limiter/limit_listener_test.go b/mind-cluster/component/ascend-common/common-utils/limiter/limit_listener_test.go new file mode 100644 index 0000000..631e1bb --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/limiter/limit_listener_test.go @@ -0,0 +1,125 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package limiter implement a token bucket limiter +package limiter + +import ( + "errors" + "net" + "testing" + + "github.com/agiledragon/gomonkey/v2" + "github.com/smartystreets/goconvey/convey" +) + +const ( + len2 = 2 +) + +func TestLimitListenerAccept(t *testing.T) { + convey.Convey("test Accept function", t, func() { + + limitLor, err := LimitListener(&mockLicener{}, len2, len2, DefaultCacheSize) + if err != nil { + return + } + l, ok := limitLor.(*localLimitListener) + if !ok { + return + } + mock2 := gomonkey.ApplyFunc(getIpAndKey, func(net.Conn) (string, string) { + return "127.0.0.1", "key-127.0.0.1" + }) + defer mock2.Reset() + convey.Convey("acquire token success", func() { + _, err = l.Accept() + convey.So(err, convey.ShouldEqual, nil) + }) + + convey.Convey("accept failed", func() { + mock := gomonkey.ApplyMethodFunc(l.Listener, "Accept", func() (net.Conn, error) { + return nil, errors.New("mock error") + }) + defer mock.Reset() + con, err := l.Accept() + convey.So(err, convey.ShouldNotEqual, nil) + convey.So(con, convey.ShouldEqual, nil) + }) + + convey.Convey("acquire token failed", func() { + mock := gomonkey.ApplyPrivateMethod(l, "acquire", func(*localLimitListener) bool { + return false + }) + defer mock.Reset() + con, err := l.Accept() + convey.So(err, convey.ShouldEqual, nil) + conm, ok := con.(*limitListenerConn) + if !ok { + return + } + convey.So(conm.release, convey.ShouldNotEqual, nil) + }) + + }) +} + +type mockLicener struct { +} + +func (l *mockLicener) Accept() (net.Conn, error) { + return &net.TCPConn{}, nil +} + +func (l *mockLicener) Addr() net.Addr { + return &net.IPAddr{ + IP: []byte("127.0.0.1"), + Zone: "", + } +} + +func (l *mockLicener) Close() error { + return nil +} + +func TestGetIpAndKey(t *testing.T) { + convey.Convey("test getIp function", t, func() { + c := net.TCPConn{} + mock := gomonkey.ApplyMethodFunc(&c, "RemoteAddr", func() net.Addr { + return &net.IPAddr{ + IP: []byte("127.0.0.1"), + Zone: "", + } + }) + 
defer mock.Reset() + ip, _ := getIpAndKey(&c) + convey.So(ip, convey.ShouldNotEqual, "") + }) +} + +func TestLimitListener(t *testing.T) { + convey.Convey("test new listener function success", t, func() { + l, err := LimitListener(&mockLicener{}, maxConnection, maxIPConnection, DefaultDataLimit) + convey.So(l, convey.ShouldNotEqual, nil) + convey.So(err, convey.ShouldEqual, nil) + }) + convey.Convey("test new listener function", t, func() { + _, err := LimitListener(&mockLicener{}, maxConnection+1, maxIPConnection, DefaultDataLimit) + convey.So(err, convey.ShouldNotEqual, nil) + }) + convey.Convey("test new listener function", t, func() { + _, err := LimitListener(&mockLicener{}, maxConnection, maxIPConnection+1, DefaultDataLimit) + convey.So(err, convey.ShouldNotEqual, nil) + }) +} diff --git a/mind-cluster/component/ascend-common/common-utils/limiter/limit_writer.go b/mind-cluster/component/ascend-common/common-utils/limiter/limit_writer.go new file mode 100644 index 0000000..9117d07 --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/limiter/limit_writer.go @@ -0,0 +1,64 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package limiter implement a writer limiter +package limiter + +import ( + "bytes" + "errors" + + "ascend-common/common-utils/hwlog" +) + +const defaultLimit = 1024 + +// LimitedWriter limit the size of written data +type LimitedWriter struct { + buffer *bytes.Buffer + limit int + size int +} + +// NewLimitedWriter create a LimitedWriter +func NewLimitedWriter(limit int) *LimitedWriter { + if limit <= 0 { + hwlog.RunLog.Warnf("limit: %v is invalid, set default limit: %v", limit, defaultLimit) + limit = defaultLimit + } + return &LimitedWriter{ + buffer: &bytes.Buffer{}, + limit: limit, + } +} + +// Write write bytes to buffer +func (lw *LimitedWriter) Write(p []byte) (int, error) { + if lw.size+len(p) > lw.limit { + return 0, errors.New("buffer limit exceeded") + } + n, err := lw.buffer.Write(p) + if err == nil { + lw.size += n + } + return n, err +} + +// GetBufferBytes get buffer bytes +func (lw *LimitedWriter) GetBufferBytes() []byte { + if lw.buffer == nil { + return []byte{} + } + return lw.buffer.Bytes() +} diff --git a/mind-cluster/component/ascend-common/common-utils/limiter/limit_writer_test.go b/mind-cluster/component/ascend-common/common-utils/limiter/limit_writer_test.go new file mode 100644 index 0000000..9a308f3 --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/limiter/limit_writer_test.go @@ -0,0 +1,37 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package limiter implement a writer limiter +package limiter + +import ( + "io" + "testing" + + "github.com/smartystreets/goconvey/convey" +) + +func TestLimitWriterWrite(t *testing.T) { + convey.Convey("test limiter Writer write function", t, func() { + data := []byte("test") + limitBuffer := NewLimitedWriter(len(data)) + + n, err := limitBuffer.Write(data) + convey.So(err, convey.ShouldBeNil) + convey.So(n, convey.ShouldEqual, len(data)) + n, err = limitBuffer.Write(data) + convey.So(err, convey.ShouldEqual, io.EOF) + convey.So(n, convey.ShouldEqual, 0) + }) +} diff --git a/mind-cluster/component/ascend-common/common-utils/rand/rand_linux.go b/mind-cluster/component/ascend-common/common-utils/rand/rand_linux.go new file mode 100644 index 0000000..1a97a1b --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/rand/rand_linux.go @@ -0,0 +1,71 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package rand implement the security rand +package rand + +import ( + "errors" + "fmt" + "io" + "os" + "runtime" + "sync" + "time" +) + +const ( + maxReadSize = 1<<25 - 1 +) + +// A randomReader satisfies reads by reading the file named name. 
+type randomReader struct { + f io.Reader + mu sync.Mutex +} + +func init() { + Reader = &randomReader{} +} + +func warnBlocked() { + fmt.Println("mindx-security/rand: blocked for 60 seconds waiting to read random data from the kernel") +} + +var supportOs = "linux" + +// Read implements the interface of io.Reader +func (r *randomReader) Read(b []byte) (int, error) { + t := time.AfterFunc(time.Minute, warnBlocked) + defer t.Stop() + if len(b) > maxReadSize { + return 0, errors.New("byte size is too large") + } + r.mu.Lock() + defer r.mu.Unlock() + if runtime.GOOS != supportOs { + return 0, errors.New("not supported") + } + f, err := os.Open("/dev/random") + if err != nil { + return 0, err + } + defer func() { + err = f.Close() + if err != nil { + fmt.Println("close random file failed") + } + }() + return f.Read(b) +} diff --git a/mind-cluster/component/ascend-common/common-utils/rand/rand_linux_test.go b/mind-cluster/component/ascend-common/common-utils/rand/rand_linux_test.go new file mode 100644 index 0000000..b02d9d6 --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/rand/rand_linux_test.go @@ -0,0 +1,54 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package rand implement the security rand +package rand + +import ( + "testing" + + "github.com/agiledragon/gomonkey/v2" + "github.com/smartystreets/goconvey/convey" +) + +const ( + illegalSize = 1 << 25 +) + +func TestInnerRead(t *testing.T) { + convey.Convey("test random read func", t, func() { + reader := &randomReader{} + convey.Convey("read size too large, err returned", func() { + bs := make([]byte, illegalSize, illegalSize) + r, err := reader.Read(bs) + convey.So(err.Error(), convey.ShouldEqual, "byte size is too large") + convey.So(r, convey.ShouldEqual, 0) + }) + convey.Convey("windows,err returned", func() { + mock := gomonkey.ApplyGlobalVar(&supportOs, "windows") + defer mock.Reset() + bs := make([]byte, 1, 1) + r, err := reader.Read(bs) + convey.So(err.Error(), convey.ShouldEqual, "not supported") + convey.So(r, convey.ShouldEqual, 0) + }) + convey.Convey("normal situation,no err returned", func() { + // the length of byte is one, to prevent block when generate random + bs := make([]byte, 1, 1) + r, err := reader.Read(bs) + convey.So(err, convey.ShouldEqual, nil) + convey.So(r, convey.ShouldEqual, 1) + }) + }) +} diff --git a/mind-cluster/component/ascend-common/common-utils/rand/random.go b/mind-cluster/component/ascend-common/common-utils/rand/random.go new file mode 100644 index 0000000..353d868 --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/rand/random.go @@ -0,0 +1,28 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// Reader rand reader to generate security random bytes
//
// It is installed by a platform specific init function and stays nil on
// platforms that provide none.
var Reader io.Reader

// readerNotSetError is returned by Read while no Reader is installed.
type readerNotSetError struct{}

// Error implements the error interface.
func (readerNotSetError) Error() string {
	return "rand: Reader is not initialized on this platform"
}

// Read is a helper function that calls Reader.Read using io.ReadFull.
//
// It fills b completely or returns an error together with the number of bytes
// read so far. When no Reader is installed it returns an error instead of
// panicking on the nil interface.
func Read(b []byte) (int, error) {
	if Reader == nil {
		return 0, readerNotSetError{}
	}
	return io.ReadFull(Reader, b)
}
+*/ + +// Package rand implement the security rand +package rand + +import ( + "testing" + + "github.com/smartystreets/goconvey/convey" +) + +func TestRead(t *testing.T) { + convey.Convey("package function test,normal situation", t, func() { + // the length of byte is one, to prevent block when generate random + bs := make([]byte, 1, 1) + l, err := Read(bs) + convey.So(err, convey.ShouldEqual, nil) + convey.So(l, convey.ShouldEqual, 1) + }) +} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/env.go b/mind-cluster/component/ascend-common/common-utils/utils/env.go new file mode 100644 index 0000000..4402375 --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/utils/env.go @@ -0,0 +1,35 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
// GetCurrentUid get current uid
//
// It returns the numeric user id of the current user. An error is returned
// when the user cannot be looked up or when its id is not a decimal number
// that fits into 32 bits (negative or oversized ids are rejected instead of
// being wrapped silently).
func GetCurrentUid() (uint32, error) {
	userInfo, err := user.Current()
	if err != nil {
		return 0, fmt.Errorf("get current user info failed: %v", err)
	}
	// bitSize 32 makes ParseUint reject everything uint32 cannot hold
	uid, err := strconv.ParseUint(userInfo.Uid, 10, 32)
	if err != nil {
		return 0, fmt.Errorf("convert uid to int failed: %v", err)
	}
	return uint32(uid), nil
}
+*/ + +// Package utils env test +package utils + +import ( + "fmt" + "os/user" + "testing" + + "github.com/agiledragon/gomonkey/v2" + "github.com/smartystreets/goconvey/convey" +) + +func TestGetCurrentUid(t *testing.T) { + convey.Convey("test func GetCurrentUid success", t, func() { + var p1 = gomonkey.ApplyFuncReturn(user.Current, &user.User{Uid: "0"}, nil) + defer p1.Reset() + uid, err := GetCurrentUid() + convey.So(err, convey.ShouldBeNil) + convey.So(uid, convey.ShouldEqual, 0) + }) + convey.Convey("test func GetCurrentUid failed, get current user info failed", t, func() { + var p1 = gomonkey.ApplyFuncReturn(user.Current, nil, testErr) + defer p1.Reset() + uid, err := GetCurrentUid() + expErr := fmt.Errorf("get current user info failed: %v", testErr) + convey.So(err, convey.ShouldResemble, expErr) + convey.So(uid, convey.ShouldEqual, 0) + }) + convey.Convey("test func GetCurrentUid failed, uid is invalid", t, func() { + var p1 = gomonkey.ApplyFuncReturn(user.Current, &user.User{Uid: "invalid uid"}, nil) + defer p1.Reset() + uid, err := GetCurrentUid() + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldContainSubstring, "convert uid to int failed") + convey.So(uid, convey.ShouldEqual, 0) + }) +} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/file.go b/mind-cluster/component/ascend-common/common-utils/utils/file.go new file mode 100644 index 0000000..253e2b5 --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/utils/file.go @@ -0,0 +1,176 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+// Package utils provides the util func
+package utils
+
+import (
+	"errors"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"reflect"
+	"strings"
+)
+
+const (
+	// FileMode file privilege
+	FileMode = 0600
+	// Size10M bytes of 10M
+	Size10M = 10 * 1024 * 1024
+	maxSize = 1024 * 1024 * 1024
+)
+
+// ReadLimitBytes read limit length of contents from file path
+// The path is validated with CheckPath before it is opened, and limitLength
+// must be within [0, maxSize]. A file shorter than limitLength yields just its
+// content. An empty file is reported as a read error (io.EOF) when
+// limitLength is greater than zero.
+func ReadLimitBytes(path string, limitLength int) ([]byte, error) {
+	if limitLength < 0 || limitLength > maxSize {
+		return nil, errors.New("the limit length is not valid")
+	}
+
+	key, err := CheckPath(path)
+	if err != nil {
+		return nil, err
+	}
+	file, err := os.OpenFile(key, os.O_RDONLY, FileMode)
+	if err != nil {
+		return nil, fmt.Errorf("open file with read-only and %04o mode failed", FileMode)
+	}
+	defer closeFile(file)
+	buf := make([]byte, limitLength)
+	// A single Read is allowed to return fewer bytes than are available, so
+	// keep reading until the buffer is full or the file ends.
+	n, err := io.ReadFull(file, buf)
+	if err != nil && !errors.Is(err, io.ErrUnexpectedEOF) {
+		return nil, fmt.Errorf("read file failed: %v", err)
+	}
+	return buf[:n], nil
+}
+
+// LoadFile load file content
+// It returns (nil, nil) when filePath is empty or does not exist; otherwise at
+// most Size10M bytes of the file are returned.
+func LoadFile(filePath string) ([]byte, error) {
+	if filePath == "" {
+		return nil, nil
+	}
+	absPath, err := filepath.Abs(filePath)
+	if err != nil {
+		return nil, fmt.Errorf("the filePath is invalid: %v", err)
+	}
+	if !IsExist(absPath) {
+		return nil, nil
+	}
+
+	return ReadLimitBytes(absPath, Size10M)
+}
+
+// closeFile closes file if it is not nil. The close error is dropped on
+// purpose: it is only used on paths where nothing useful can be done with it.
+func closeFile(file *os.File) {
+	if file == nil {
+		return
+	}
+	_ = file.Close()
+}
+
+// CopyFile copy file
+// src (and dst, when it already exists) is validated with CheckPath. The
+// content of src is written to dst, which is created or truncated, and dst
+// receives the mode of src. A directory given as src is rejected before dst
+// is touched.
+func CopyFile(src, dst string) error {
+	src, err := CheckPath(src)
+	if err != nil {
+		return err
+	}
+	if IsExist(dst) {
+		dst, err = CheckPath(dst)
+		if err != nil {
+			return err
+		}
+	}
+
+	srcFile, err := os.Open(src)
+	if err != nil {
+		return err
+	}
+	defer closeFile(srcFile)
+
+	// Stat the opened handle rather than the path, so the mode belongs to the
+	// file that is actually read.
+	srcInfo, err := srcFile.Stat()
+	if err != nil {
+		return err
+	}
+	if srcInfo.IsDir() {
+		return errors.New("the source path is a directory")
+	}
+
+	dstFile, err := os.OpenFile(dst, os.O_RDWR|os.O_CREATE|os.O_TRUNC, srcInfo.Mode())
+	if err != nil {
+		return err
+	}
+	if _, err = io.Copy(dstFile, srcFile); err != nil {
+		closeFile(dstFile)
+		return err
+	}
+	// A failed Close can mean the data never reached the disk, so report it.
+	if err = dstFile.Close(); err != nil {
+		return err
+	}
+	return os.Chmod(dst, srcInfo.Mode())
+}
+
+// CopyDir recursively copy files
+// dst is created with the mode of src when it does not exist. The copy is
+// refused when src is not a directory, or when dst is src itself or lies
+// inside src.
+func CopyDir(src string, dst string) error {
+	srcInfo, err := os.Stat(src)
+	if err != nil {
+		return err
+	}
+	if !srcInfo.IsDir() {
+		return errors.New("the source path is not a directory")
+	}
+	// dst has to exist before subFolder can resolve symlinks on it
+	if err = os.MkdirAll(dst, srcInfo.Mode()); err != nil {
+		return err
+	}
+	if subFolder(src, dst) {
+		return errors.New("the destination directory is a subdirectory of the source directory")
+	}
+	fds, err := ioutil.ReadDir(src)
+	if err != nil {
+		return err
+	}
+	for _, fd := range fds {
+		srcPath := filepath.Join(src, fd.Name())
+		dstPath := filepath.Join(dst, fd.Name())
+		if fd.IsDir() {
+			err = CopyDir(srcPath, dstPath)
+		} else {
+			err = CopyFile(srcPath, dstPath)
+		}
+		if err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// subFolder reports whether dst is src itself or lies inside src. Both paths
+// are resolved with filepath.EvalSymlinks and compared component by
+// component; if either path cannot be resolved the answer is false.
+func subFolder(src, dst string) bool {
+	if src == dst {
+		return true
+	}
+	srcReal, err := filepath.EvalSymlinks(src)
+	if err != nil {
+		return false
+	}
+	dstReal, err := filepath.EvalSymlinks(dst)
+	if err != nil {
+		return false
+	}
+	srcList := strings.Split(srcReal, string(os.PathSeparator))
+	dstList := strings.Split(dstReal, string(os.PathSeparator))
+	if len(srcList) > len(dstList) {
+		return false
+	}
+	return reflect.DeepEqual(srcList, dstList[:len(srcList)])
+}
diff --git a/mind-cluster/component/ascend-common/common-utils/utils/file_check.go b/mind-cluster/component/ascend-common/common-utils/utils/file_check.go
new file mode 100644
index 0000000..4134245
--- /dev/null
+++ b/mind-cluster/component/ascend-common/common-utils/utils/file_check.go
@@ -0,0 +1,263 @@
+/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+// Package utils provides the util func
+package utils
+
+import (
+	"fmt"
+	"io/fs"
+	"os"
+	"path/filepath"
+	"strings"
+	"syscall"
+)
+
+const (
+	notValidPath           = "not-valid-file-path"
+	maxAllowFileSize int64 = 1024 * 100 // in megabytes
+	oneMegabytes     int64 = 1024 * 1024
+	// DefaultWhiteList default white list in string
+	DefaultWhiteList = "-_./~"
+	// DefaultStringLength default string max length
+	DefaultStringLength = 256
+	// DefaultPathLength default path max length
+	DefaultPathLength = 4096
+)
+
+// RealFileChecker validates path as a regular file no larger than size megabytes.
+// It returns the absolute path on success; on failure the returned path is the
+// notValidPath placeholder so that it cannot be used by mistake.
+// checkParent additionally validates every parent directory up to "/", and
+// allowLink controls whether symbolic links along the path are tolerated.
+func RealFileChecker(path string, checkParent, allowLink bool, size int64) (string, error) {
+	realPath, fileInfo, err := realPathChecker(path, checkParent, allowLink)
+	if err != nil {
+		return notValidPath, err
+	}
+	if fileInfo.IsDir() {
+		return notValidPath, fmt.Errorf("invalid dir")
+	}
+	if !fileInfo.Mode().IsRegular() {
+		return notValidPath, fmt.Errorf("invalid regular file")
+	}
+	if size > maxAllowFileSize || size < 0 {
+		return notValidPath, fmt.Errorf("invalid size")
+	}
+	if fileInfo.Size() > size*oneMegabytes {
+		return notValidPath, fmt.Errorf("size too large")
+	}
+	return realPath, nil
+}
+
+// RealDirChecker validates path as a directory and returns its absolute path.
+// The checkParent and allowLink flags have the same meaning as in RealFileChecker.
+func RealDirChecker(path string, checkParent, allowLink bool) (string, error) {
+	realPath, fileInfo, err := realPathChecker(path, checkParent, allowLink)
+	if err != nil {
+		return notValidPath, err
+	}
+	if !fileInfo.IsDir() {
+		return notValidPath, fmt.Errorf("is not dir")
+	}
+	return realPath, nil
+}
+
+// PathStringChecker validates the path string only (length, allowed characters
+// and nesting depth) without inspecting the named file, and returns the absolute path.
+func PathStringChecker(path string) (string, error) {
+	realPath, err := filepath.Abs(path)
+	if err != nil {
+		return notValidPath, err
+	}
+	if len(realPath) > DefaultPathLength {
+		return notValidPath, fmt.Errorf("path over max path length")
+	}
+	if !stringChecker(realPath, 0, DefaultPathLength) {
+		return notValidPath, fmt.Errorf("invalid path")
+	}
+	if err = pathDepthChecker(realPath, 0); err != nil {
+		return notValidPath, err
+	}
+	return realPath, nil
+}
+
+// VerifyFile verify the file after it is opened.
+// It checks that the file is at most size megabytes, is not reported as a
+// symbolic link, and is owned by the effective user of the current process.
+func VerifyFile(file *os.File, size int64) error {
+	fileInfo, err := file.Stat()
+	if err != nil {
+		return err
+	}
+	if size > maxAllowFileSize || size < 0 {
+		return fmt.Errorf("invalid size")
+	}
+	if fileInfo.Size() > size*oneMegabytes {
+		return fmt.Errorf("file size error %v", fileInfo.Size())
+	}
+	if (fileInfo.Mode() & fs.ModeSymlink) != 0 {
+		return fmt.Errorf("file is softlink")
+	}
+	// a checked assertion: Sys() is not guaranteed to hold a *syscall.Stat_t,
+	// and an unchecked one would panic instead of returning an error
+	stat, ok := fileInfo.Sys().(*syscall.Stat_t)
+	if !ok {
+		return fmt.Errorf("can not get file stat")
+	}
+	if stat.Uid != uint32(os.Geteuid()) {
+		return fmt.Errorf("file owner incorrect")
+	}
+	return nil
+}
+
+// SafeChmod after the verification is complete, run the chmod command.
+// The mode is applied through the opened handle, so the file that was
+// verified is the file that is changed.
+func SafeChmod(path string, size int64, mode os.FileMode) error {
+	file, err := os.Open(path)
+	if err != nil {
+		return err
+	}
+	defer file.Close()
+	if err = VerifyFile(file, size); err != nil {
+		return err
+	}
+	return file.Chmod(mode)
+}
+
+// realPathChecker resolves path to an absolute path, validates the string and
+// the file system entry it names, and returns the path together with its FileInfo.
+func realPathChecker(path string, checkParent, allowLink bool) (string, os.FileInfo, error) {
+	realPath, err := filepath.Abs(path)
+	if err != nil {
+		return notValidPath, nil, err
+	}
+	if len(realPath) > DefaultPathLength {
+		return notValidPath, nil, fmt.Errorf("path over max path length")
+	}
+	if !stringChecker(realPath, 0, DefaultPathLength) {
+		return notValidPath, nil, fmt.Errorf("invalid path")
+	}
+	if err = fileChecker(realPath, true, checkParent, allowLink, 0); err != nil {
+		return notValidPath, nil, err
+	}
+	fileInfo, err := os.Stat(realPath)
+	if err != nil {
+		return notValidPath, nil, err
+	}
+	return realPath, fileInfo, nil
+}
+
+// fileChecker validates type, setuid/setgid bits, owner and write permission of
+// path. With checkParent set it walks up and validates each parent directory
+// too; deep counts the levels visited and is capped at maxDepth.
+func fileChecker(path string, allowDir, checkParent, allowLink bool, deep int) error {
+	const maxDepth int = 99
+	if deep > maxDepth {
+		return fmt.Errorf("over maxDepth %d", maxDepth)
+	}
+	fileInfo, err := normalFileCheck(path, allowDir, allowLink)
+	if err != nil {
+		return err
+	}
+	if err = checkOwnerAndPermission(fileInfo, path); err != nil {
+		return err
+	}
+	if path != "/" && checkParent {
+		return fileChecker(filepath.Dir(path), true, true, allowLink, deep+1)
+	}
+	return nil
+}
+
+// pathDepthChecker fails when path is nested deeper than maxDepth levels below "/".
+func pathDepthChecker(path string, deep int) error {
+	const maxDepth int = 99
+	if deep > maxDepth {
+		return fmt.Errorf("over maxDepth %d", maxDepth)
+	}
+	if path != "/" {
+		return pathDepthChecker(filepath.Dir(path), deep+1)
+	}
+	return nil
+}
+
+// checkOwnerAndPermission rejects entries that are writable by group or others,
+// or that are owned by neither root nor the current user.
+func checkOwnerAndPermission(fileInfo os.FileInfo, filePath string) error {
+	const groupOtherWrite os.FileMode = 0022
+	perm := fileInfo.Mode().Perm()
+	if perm&groupOtherWrite != 0 {
+		return fmt.Errorf("write permission not right %v %v", filePath, perm)
+	}
+	stat, ok := fileInfo.Sys().(*syscall.Stat_t)
+	if !ok {
+		return fmt.Errorf("can not get stat %v", filePath)
+	}
+	if uid := int(stat.Uid); uid != 0 && uid != os.Getuid() {
+		return fmt.Errorf("owner not right %v %v", filePath, uid)
+	}
+	return nil
+}
+
+// normalFileCheck stats filePath and rejects symbolic links (unless allowLink),
+// entries that are not regular files (or directories when allowDir is set),
+// and entries carrying the setuid or setgid bit.
+func normalFileCheck(filePath string, allowDir, allowLink bool) (os.FileInfo, error) {
+	realPath, err := filepath.EvalSymlinks(filePath)
+	if err != nil || (realPath != filePath && !allowLink) {
+		return nil, fmt.Errorf("symlinks or not existed, failed %v, %v", filePath, err)
+	}
+	fileInfo, err := os.Stat(filePath)
+	if err != nil {
+		return nil, fmt.Errorf("get file stat failed %v", err)
+	}
+	if allowDir && !fileInfo.Mode().IsRegular() && !fileInfo.IsDir() {
+		return nil, fmt.Errorf("not regular file/dir %v", filePath)
+	}
+	if !allowDir && !fileInfo.Mode().IsRegular() {
+		return nil, fmt.Errorf("not regular file %v", filePath)
+	}
+	if fileInfo.Mode()&os.ModeSetuid != 0 {
+		return nil, fmt.Errorf("setuid not allowed %v", filePath)
+	}
+	if fileInfo.Mode()&os.ModeSetgid != 0 {
+		return nil, fmt.Errorf("setgid not allowed %v", filePath)
+	}
+	return fileInfo, nil
+}
+
+// isValidCode reports whether c is an ASCII letter or digit.
+func isValidCode(c rune) bool {
+	return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || ('0' <= c && c <= '9')
+}
+
+// isInWhiteList reports whether c is one of the extra characters in DefaultWhiteList.
+func isInWhiteList(c rune) bool {
+	return strings.ContainsRune(DefaultWhiteList, c)
+}
+
+// stringChecker reports whether text is longer than minLength, shorter than
+// maxLength (both bounds exclusive) and made only of allowed characters.
+func stringChecker(text string, minLength, maxLength int) bool {
+	if len(text) <= minLength || len(text) >= maxLength {
+		return false
+	}
+	for _, char := range text {
+		if !isValidCode(char) && !isInWhiteList(char) {
+			return false
+		}
+	}
+	return true
+}
diff --git a/mind-cluster/component/ascend-common/common-utils/utils/file_check_test.go b/mind-cluster/component/ascend-common/common-utils/utils/file_check_test.go
new file mode
100644
index 0000000..3c8e065
--- /dev/null
+++ b/mind-cluster/component/ascend-common/common-utils/utils/file_check_test.go
@@ -0,0 +1,217 @@
+/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+// Package utils test for file check utils
+package utils
+
+import (
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+)
+
+// errContains reports whether err is non-nil and its message contains sub.
+func errContains(err error, sub string) bool {
+	return err != nil && strings.Contains(err.Error(), sub)
+}
+
+// TestNormalFileCheckRegularFile covers directories, device files, symlinks,
+// regular files and missing paths.
+func TestNormalFileCheckRegularFile(t *testing.T) {
+	tmpDir, filePath, err := createTestFile(t, "test_file.txt")
+	if err != nil {
+		t.Fatalf("create file failed %q: %s", filePath, err)
+	}
+	defer removeTmpDir(t, tmpDir)
+	linkPath := filepath.Join(tmpDir, "syslink")
+	if err = os.Symlink(filePath, linkPath); err != nil {
+		t.Fatalf("create symlink failed %q: %s", filePath, err)
+	}
+
+	if _, err = normalFileCheck(tmpDir, true, false); err != nil {
+		t.Fatalf("check allow dir failed %q: %s", tmpDir, err)
+	}
+
+	// errContains guards against a nil error, which would otherwise panic on err.Error()
+	if _, err = normalFileCheck(tmpDir, false, false); !errContains(err, "not regular file") {
+		t.Fatalf("check not allow dir failed %q: %v", tmpDir, err)
+	}
+
+	if _, err = normalFileCheck("/dev/zero", true, false); !errContains(err, "not regular file/dir") {
+		t.Fatalf("check /dev/zero failed: %v", err)
+	}
+
+	if _, err = normalFileCheck(linkPath, false, false); !errContains(err, "symlinks") {
+		t.Fatalf("check symlinks failed %q: %v", linkPath, err)
+	}
+
+	if _, err = normalFileCheck(filePath, false, false); err != nil {
+		t.Fatalf("check failed %q: %s", filePath, err)
+	}
+
+	missing := filepath.Join(tmpDir, "notexisted")
+	if _, err = normalFileCheck(missing, false, false); !errContains(err, "not existed") {
+		t.Fatalf("check not existed failed %q: %v", missing, err)
+	}
+}
+
+// TestRealFileChecker checks the size limit of RealFileChecker.
+func TestRealFileChecker(t *testing.T) {
+	tmpDir, filePath, err := createTestFile(t, "test_file.txt")
+	if err != nil {
+		t.Fatalf("create file failed %q: %s", filePath, err)
+	}
+	defer removeTmpDir(t, tmpDir)
+	const permission os.FileMode = 0700
+	err = os.WriteFile(filePath, []byte("hello\n"), permission)
+	if err != nil {
+		t.Fatalf("create file failed %q: %s", filePath, err)
+	}
+	if _, err = RealFileChecker(filePath, false, true, 0); err == nil {
+		t.Fatalf("size check wrong 0 %q: expected an error", filePath)
+	}
+	if _, err = RealFileChecker(filePath, false, true, 1); err != nil {
+		t.Fatalf("size check wrong 1 %q: %s", filePath, err)
+	}
+}
+
+// TestRealFileCheckerInside checks that fileChecker refuses to go past its depth limit.
+func TestRealFileCheckerInside(t *testing.T) {
+	tmpDir, filePath, err := createTestFile(t, "test_file.txt")
+	if err != nil {
+		t.Fatalf("create file failed %q: %s", filePath, err)
+	}
+	defer removeTmpDir(t, tmpDir)
+	const permission os.FileMode = 0700
+	const deep int = 100
+	err = os.WriteFile(filePath, []byte("hello\n"), permission)
+	if err != nil {
+		t.Fatalf("create file failed %q: %s", filePath, err)
+	}
+	if err = fileChecker(filePath, false, false, false, deep); err == nil {
+		t.Fatalf("depth check wrong %q: expected an error", filePath)
+	}
+}
+
+// TestRealDirChecker checks that only directories are accepted.
+func TestRealDirChecker(t *testing.T) {
+	tmpDir, filePath, err := createTestFile(t, "test_file.txt")
+	if err != nil {
+		t.Fatalf("create file failed %q: %s", filePath, err)
+	}
+	defer removeTmpDir(t, tmpDir)
+	if _, err = RealDirChecker(filePath, false, true); err == nil {
+		t.Fatalf("should be dir 0 %q: expected an error", filePath)
+	}
+	if _, err = RealDirChecker(tmpDir, false, true); err != nil {
+		t.Fatalf("should be dir 1 %q: %s", tmpDir, err)
+	}
+}
+
+// TestVerifyFile checks the size limit and the symlink handling of VerifyFile.
+func TestVerifyFile(t *testing.T) {
+	tmpDir, filePath, err := createTestFile(t, "test_file.txt")
+	if err != nil {
+		t.Fatalf("create file failed %q: %s", filePath, err)
+	}
+	defer removeTmpDir(t, tmpDir)
+	linkPath := filepath.Join(tmpDir, "syslink")
+	if err = os.Symlink(filePath, linkPath); err != nil {
+		t.Fatalf("create symlink failed %q: %s", filePath, err)
+	}
+	file, err := os.Open(filePath)
+	if err != nil {
+		t.Fatalf("open file failed")
+	}
+	defer file.Close()
+	linkFile, err := os.Open(linkPath)
+	if err != nil {
+		t.Fatalf("open file failed")
+	}
+	defer linkFile.Close()
+	const permission os.FileMode = 0700
+	err = os.WriteFile(filePath, []byte("hello\n"), permission)
+	if err != nil {
+		t.Fatalf("create file failed %q: %s", filePath, err)
+	}
+	if err = VerifyFile(file, 0); err == nil {
+		t.Fatalf("size check wrong 0 %q: expected an error", filePath)
+	}
+	if err = VerifyFile(file, 1); err != nil {
+		t.Fatalf("size check wrong 1 %q: %s", filePath, err)
+	}
+	if err = VerifyFile(linkFile, 1); err != nil && !strings.Contains(err.Error(), "symlinks") {
+		t.Fatalf("check symlinks failed %q: %s", linkPath, err)
+	}
+}
+
+// TestStringChecker checks the length bounds and the character white list.
+func TestStringChecker(t *testing.T) {
+	if ok := stringChecker("0123456789abcABC", 0, DefaultStringLength); !ok {
+		t.Fatalf("failed on regular letters")
+	}
+	const testSize = 3
+	if ok := stringChecker("123", 0, testSize); ok {
+		t.Fatalf("failed on max length")
+	}
+	if ok := stringChecker("1234", 0, testSize); ok {
+		t.Fatalf("failed on max length")
+	}
+	if ok := stringChecker("12", 0, testSize); !ok {
+		t.Fatalf("failed on max length")
+	}
+	if ok := stringChecker("", 0, testSize); ok {
+		t.Fatalf("failed on min length")
+	}
+	if ok := stringChecker("123", testSize, DefaultStringLength); ok {
+		t.Fatalf("failed on min length")
+	}
+	if ok := stringChecker("123%", 0, DefaultStringLength); ok {
+		t.Fatalf("failed on strange words")
+	}
+	if ok := stringChecker("123.-/~", 0, DefaultStringLength); !ok {
+		t.Fatalf("failed on strange words")
+	}
+}
+
+// createTestFile creates <tmp>/__test__/<fileName> with mode 0600 and returns
+// the directory, the file path and a nil error; any failure aborts the test.
+func createTestFile(t *testing.T, fileName string) (string, string, error) {
+	t.Helper()
+	const fileMode os.FileMode = 0600
+	const permission os.FileMode = 0700
+	dir := filepath.Join(os.TempDir(), "__test__")
+	if err := os.MkdirAll(dir, permission); err != nil {
+		t.Fatalf("MkdirAll failed %q: %s", dir, err)
+	}
+	// the file must live inside dir so that removeTmpDir cleans it up as well
+	filePath := filepath.Join(dir, fileName)
+	f, err := os.Create(filePath)
+	if err != nil {
+		t.Fatalf("create file failed %q: %s", filePath, err)
+	}
+	defer f.Close()
+	if err = f.Chmod(fileMode); err != nil {
+		t.Fatalf("change file mode failed %q: %s", filePath, err)
+	}
+	return dir, filePath, nil
+}
+
+// removeTmpDir removes the directory made by createTestFile; a failure is only logged.
+func removeTmpDir(t *testing.T, tmpDir string) {
+	if err := os.RemoveAll(tmpDir); err != nil {
+		t.Logf("removeall %v failed: %v", tmpDir, err)
+	}
+}
diff --git a/mind-cluster/component/ascend-common/common-utils/utils/file_test.go b/mind-cluster/component/ascend-common/common-utils/utils/file_test.go
new file mode 100644
index 0000000..8f91417
--- /dev/null
+++ b/mind-cluster/component/ascend-common/common-utils/utils/file_test.go
@@ -0,0 +1,169 @@
+/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+// Package utils provides the util func
+package utils
+
+import (
+	"errors"
+	"fmt"
+	"os"
+	"path/filepath"
+	"reflect"
+	"testing"
+
+	"github.com/agiledragon/gomonkey/v2"
+	"github.com/smartystreets/goconvey/convey"
+)
+
+// TestReadLimitBytes exercises ReadLimitBytes with an empty path, a readable
+// file, an invalid limit, a stubbed CheckPath failure and a stubbed Read failure.
+func TestReadLimitBytes(t *testing.T) {
+	convey.Convey("test ReadLimitBytes func", t, func() {
+		convey.Convey("should return nil given empty string", func() {
+			emptyString := ""
+			const limitLength = 10
+			res, err := ReadLimitBytes(emptyString, limitLength)
+			convey.So(res, convey.ShouldBeNil)
+			convey.So(err, convey.ShouldBeError)
+		})
+
+		convey.Convey("should not return nil given valid path", func() {
+			const limitLength = 10
+			// NOTE(review): the relative path presumably resolves to the module's
+			// go.mod when the test runs from this package directory; confirm.
+			res, err := ReadLimitBytes("../../go.mod", limitLength)
+			convey.So(res, convey.ShouldNotBeNil)
+			convey.So(err, convey.ShouldBeNil)
+		})
+
+		convey.Convey("should return nil given invalid limit length", func() {
+			const limitLength = -1
+			res, err := ReadLimitBytes("../../go.mod", limitLength)
+			convey.So(res, convey.ShouldBeNil)
+			convey.So(err.Error(), convey.ShouldEqual, "the limit length is not valid")
+		})
+
+		convey.Convey("should return nil when check path failed", func() {
+			// replace CheckPath so that the path validation step fails
+			checkStub := gomonkey.ApplyFunc(CheckPath, func(path string) (string, error) {
+				return "", errors.New("check failed")
+			})
+			defer checkStub.Reset()
+			const limitLength = 10
+			res, err := ReadLimitBytes("../../go.mod", limitLength)
+			convey.So(res, convey.ShouldBeNil)
+			convey.So(err.Error(), convey.ShouldEqual, "check failed")
+		})
+
+		convey.Convey("should return nil when read file failed", func() {
+			// file is only used to obtain the *os.File type for patching its Read method
+			var file *os.File
+			checkStub := gomonkey.ApplyMethod(reflect.TypeOf(file), "Read",
+				func(_ *os.File, _ []byte) (int, error) {
+					return 0, errors.New("read file failed")
+				})
+			defer checkStub.Reset()
+			const limitLength = 10
+			res, err := ReadLimitBytes("../../go.mod", limitLength)
+			convey.So(res, convey.ShouldBeNil)
+			convey.So(err.Error(), convey.ShouldEqual, "read file failed: read file failed")
+		})
+	})
+}
+
+// TestLoadFile exercises LoadFile with an empty path, a missing file, a
+// readable file, a stubbed filepath.Abs failure and a stubbed ReadLimitBytes failure.
+func TestLoadFile(t *testing.T) {
+	convey.Convey("test LoadFile func", t, func() {
+		// NOTE(review): the description says an error is expected, but the
+		// assertions below require both the result and the error to be nil.
+		convey.Convey("should return error given empty path", func() {
+			res, err := LoadFile("")
+			convey.So(res, convey.ShouldBeNil)
+			convey.So(err, convey.ShouldBeNil)
+		})
+
+		convey.Convey("should return nil given path not existing", func() {
+			res, err := LoadFile("xxxx")
+			convey.So(res, convey.ShouldBeNil)
+			convey.So(err, convey.ShouldBeNil)
+		})
+
+		convey.Convey("should not return nil given valid path", func() {
+			res, err := LoadFile("../../go.mod")
+			convey.So(res, convey.ShouldNotBeNil)
+			convey.So(err, convey.ShouldBeNil)
+		})
+
+		convey.Convey("should return nil given invalid path", func() {
+			// replace filepath.Abs so that resolving the path fails
+			absStub := gomonkey.ApplyFunc(filepath.Abs, func(path string) (string, error) {
+				return "", errors.New("the path is invalid")
+			})
+			defer absStub.Reset()
+			res, err := LoadFile("../../go.mod")
+			convey.So(res, convey.ShouldBeNil)
+			convey.So(err.Error(), convey.ShouldEqual, "the filePath is invalid: the path is invalid")
+		})
+
+		convey.Convey("should return nil when read file failed", func() {
+			// replace ReadLimitBytes so that the read step fails
+			readStub := gomonkey.ApplyFunc(ReadLimitBytes, func(path string, limitLength int) ([]byte, error) {
+				return nil, errors.New("read file failed")
+			})
+			defer readStub.Reset()
+			res, err := LoadFile("../../go.mod")
+			convey.So(res, convey.ShouldBeNil)
+			convey.So(err.Error(), convey.ShouldEqual, "read file failed")
+		})
+	})
+}
+
+// TestCopyDir exercises CopyDir with empty paths, a file as source, a
+// directory as source and a file as destination.
+// NOTE(review): the successful case writes to ../utils_test and this test
+// does not remove it; the only removal visible here is at the end of TestCopyFile.
+func TestCopyDir(t *testing.T) {
+	convey.Convey("test CopyDir func", t, func() {
+		convey.Convey("should return error given empty src path", func() {
+			err := CopyDir("", "")
+			convey.So(err, convey.ShouldNotBeNil)
+		})
+		convey.Convey("should return error given file src path", func() {
+			err := CopyDir("../../go.mod", "")
+			convey.So(err, convey.ShouldNotBeNil)
+		})
+		convey.Convey("should return nil given dir src path", func() {
+			err := CopyDir("../utils", "../utils_test")
+			convey.So(err, convey.ShouldBeNil)
+		})
+		convey.Convey("should return error given file dst path", func() {
+			err := CopyDir("../utils", "../utils_test/file_test.go")
+			convey.So(err, convey.ShouldNotBeNil)
+		})
+	})
+}
+
+// TestCopyFile exercises CopyFile with empty paths, directories given as
+// source or destination, and a regular file copy.
+// NOTE(review): several cases use paths under ../utils_test, which is
+// presumably created by TestCopyDir running earlier; confirm the ordering.
+func TestCopyFile(t *testing.T) {
+	convey.Convey("test CopyFile func", t, func() {
+		convey.Convey("should return error given empty src file path", func() {
+			err := CopyFile("", "../utils_test/file_test.go")
+			convey.So(err, convey.ShouldNotBeNil)
+		})
+		convey.Convey("should return error given empty dst path", func() {
+			err := CopyFile("../utils_test/file_test.go", "")
+			convey.So(err, convey.ShouldNotBeNil)
+		})
+		convey.Convey("should return error given dir scr path", func() {
+			err := CopyFile("../utils", "../utils_test/file_test.go")
+			convey.So(err, convey.ShouldNotBeNil)
+		})
+		convey.Convey("should return error given dir dst path", func() {
+			err := CopyFile("../utils/file_test.go", "../utils_test")
+			convey.So(err, convey.ShouldNotBeNil)
+		})
+		convey.Convey("should return nil given file scr and dst path", func() {
+			err := CopyFile("../utils/file_test.go", "../utils_test/file_test.go")
+			convey.So(err, convey.ShouldBeNil)
+		})
+	})
+	// remove the copy target directory used by the cases above
+	if err := os.RemoveAll("../utils_test"); err != nil {
+		fmt.Print("remove util_test file failed")
+	}
+}
diff --git a/mind-cluster/component/ascend-common/common-utils/utils/file_watcher.go b/mind-cluster/component/ascend-common/common-utils/utils/file_watcher.go
new file mode 100644
index 0000000..78f4266
--- /dev/null
+++ b/mind-cluster/component/ascend-common/common-utils/utils/file_watcher.go
@@ -0,0 +1,85 @@
+/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+// Package utils offer utils for file watcher
+package utils
+
+import (
+	"fmt"
+	"os"
+
+	"github.com/fsnotify/fsnotify"
+)
+
+// FileWatcher struct file watcher
+// It wraps an fsnotify.Watcher; the accessor methods tolerate a nil receiver
+// or an unset watcher.
+type FileWatcher struct {
+	watcher *fsnotify.Watcher
+}
+
+// NewFileWatcher new FileWatcher
+// The caller owns the returned watcher and should release it with Close.
+func NewFileWatcher() (*FileWatcher, error) {
+	watcher, err := fsnotify.NewWatcher()
+	if err != nil {
+		return nil, err
+	}
+	return &FileWatcher{watcher: watcher}, nil
+}
+
+// WatchFile add file to watch
+// The file must exist and its path must pass PathStringChecker. An error is
+// returned, instead of a panic, when the watcher has not been initialized.
+func (fw *FileWatcher) WatchFile(filePath string) error {
+	if fw == nil || fw.watcher == nil {
+		return fmt.Errorf("file watcher is not initialized")
+	}
+	if _, err := os.Stat(filePath); err != nil {
+		return err
+	}
+	if _, err := PathStringChecker(filePath); err != nil {
+		return err
+	}
+	return fw.watcher.Add(filePath)
+}
+
+// Events get event channel
+// It returns nil when the watcher has not been initialized.
+func (fw *FileWatcher) Events() chan fsnotify.Event {
+	if fw == nil || fw.watcher == nil {
+		return nil
+	}
+	return fw.watcher.Events
+}
+
+// Errors get error channel
+// It returns nil when the watcher has not been initialized.
+func (fw *FileWatcher) Errors() chan error {
+	if fw == nil || fw.watcher == nil {
+		return nil
+	}
+	return fw.watcher.Errors
+}
+
+// Close to close the file watcher
+// Closing an uninitialized watcher is a no-op.
+func (fw *FileWatcher) Close() error {
+	if fw == nil || fw.watcher == nil {
+		return nil
+	}
+	return fw.watcher.Close()
+}
+
+// GetFileWatcherChan get eventCh and errCh for file watcher
+// It creates a FileWatcher that already watches filePath. When the file
+// cannot be watched, the freshly created watcher is closed before the error
+// is returned, so nothing is leaked.
+func GetFileWatcherChan(filePath string) (*FileWatcher, error) {
+	watcher, err := NewFileWatcher()
+	if err != nil {
+		return nil, fmt.Errorf("new file watcher failed, error: %v", err)
+	}
+	if err = watcher.WatchFile(filePath); err != nil {
+		// best effort: the watch failure is the error worth reporting
+		_ = watcher.Close()
+		return nil, fmt.Errorf("watch file <%s> failed, error: %v", filePath, err)
+	}
+	fmt.Printf("watching file <%s>...\n", filePath)
+	return watcher, nil
+}
diff --git a/mind-cluster/component/ascend-common/common-utils/utils/file_watcher_test.go b/mind-cluster/component/ascend-common/common-utils/utils/file_watcher_test.go
new file mode 100644
index 0000000..32220da
--- /dev/null
+++ b/mind-cluster/component/ascend-common/common-utils/utils/file_watcher_test.go
@@ -0,0 +1,81 @@
+/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+// Package utils test for file watcher utils
+package utils
+
+import (
+	"errors"
+	"fmt"
+	"os"
+	"testing"
+
+	"github.com/agiledragon/gomonkey/v2"
+	"github.com/fsnotify/fsnotify"
+	"github.com/smartystreets/goconvey/convey"
+)
+
+var testErr = errors.New("test error")
+
+const (
+	testFilePath = "./test.txt"
+	errFilePath  = "./not_exist_file.txt"
+)
+
+// TestGetFileWatcherChan covers the success path, a failing fsnotify.NewWatcher,
+// a missing file, and the nil-safe accessors of an empty FileWatcher.
+func TestGetFileWatcherChan(t *testing.T) {
+	prepareTestFile(t)
+	defer removeFile()
+
+	// PathStringChecker is stubbed so that only the watcher logic is exercised
+	p1 := gomonkey.ApplyFuncReturn(PathStringChecker, "", nil)
+	defer p1.Reset()
+	convey.Convey("test func GetFileWatcherChan success", t, func() {
+		watcher, err := GetFileWatcherChan(testFilePath)
+		// Close tolerates a nil watcher, so it is safe even if creation failed
+		defer watcher.Close()
+		convey.So(err, convey.ShouldBeNil)
+	})
+	convey.Convey("test func GetFileWatcherChan failed, new watcher err", t, func() {
+		p2 := gomonkey.ApplyFuncReturn(fsnotify.NewWatcher, nil, testErr)
+		defer p2.Reset()
+		_, err := GetFileWatcherChan(testFilePath)
+		expErr := fmt.Errorf("new file watcher failed, error: %v", testErr)
+		convey.So(err, convey.ShouldResemble, expErr)
+	})
+	convey.Convey("test func GetFileWatcherChan failed, file does not exist", t, func() {
+		_, err := GetFileWatcherChan(errFilePath)
+		expErr := fmt.Sprintf("watch file <%s> failed", errFilePath)
+		convey.So(err.Error(), convey.ShouldContainSubstring, expErr)
+	})
+	convey.Convey("test func GetFileWatcherChan failed, watcher is nil", t, func() {
+		var watcher = &FileWatcher{}
+		eventCh := watcher.Events()
+		convey.So(eventCh, convey.ShouldBeNil)
+		errCh := watcher.Errors()
+		convey.So(errCh, convey.ShouldBeNil)
+		err := watcher.Close()
+		convey.So(err, convey.ShouldBeNil)
+	})
+}
+
+// prepareTestFile creates the file watched by the tests.
+func prepareTestFile(t *testing.T) {
+	const mode644 = 0644
+	err := os.WriteFile(testFilePath, []byte("file context"), mode644)
+	if err != nil {
+		t.Error(err)
+	}
+}
+
+// removeFile deletes the test file. A file that is already gone is fine; any
+// other failure is reported.
+func removeFile() {
+	if err := os.Remove(testFilePath); err != nil && !errors.Is(err, os.ErrNotExist) {
+		fmt.Printf("remove file %s failed, %v\n", testFilePath, err)
+	}
+}
diff --git
a/mind-cluster/component/ascend-common/common-utils/utils/interface.go b/mind-cluster/component/ascend-common/common-utils/utils/interface.go
new file mode 100644
index 0000000..7ccae4d
--- /dev/null
+++ b/mind-cluster/component/ascend-common/common-utils/utils/interface.go
@@ -0,0 +1,35 @@
+/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+// Package utils offer the some utils for certificate handling
+package utils
+
+import "reflect"
+
+// IsNil reports whether i holds no usable value: either the interface itself
+// is nil, or it wraps a nil chan, func, map, pointer, slice or unsafe pointer.
+// Values of any other kind can never be nil, so they yield false.
+func IsNil(i interface{}) bool {
+	if i == nil {
+		return true
+	}
+	switch v := reflect.ValueOf(i); v.Kind() {
+	case reflect.Chan, reflect.Func, reflect.Interface, reflect.Map,
+		reflect.Ptr, reflect.Slice, reflect.UnsafePointer:
+		// only these kinds are accepted by Value.IsNil without panicking
+		return v.IsNil()
+	default:
+		return false
+	}
+}
diff --git a/mind-cluster/component/ascend-common/common-utils/utils/interface_test.go b/mind-cluster/component/ascend-common/common-utils/utils/interface_test.go
new file mode 100644
index 0000000..f2ce878
--- /dev/null
+++ b/mind-cluster/component/ascend-common/common-utils/utils/interface_test.go
@@ -0,0 +1,36 @@
+/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+// Package utils offer the some utils for certificate handling
+package utils
+
+import (
+	"testing"
+
+	"github.com/smartystreets/goconvey/convey"
+)
+
+// TestIsNil compares plain == nil with IsNil for an empty interface, an interface wrapping a nil *int, and a string.
+func TestIsNil(t *testing.T) {
+	var empty interface{}
+	typedNil, text := interface{}((*int)(nil)), interface{}("dd")
+	convey.Convey("test IsNil func, type and data is both nil", t, func() {
+		convey.So(empty == nil, convey.ShouldEqual, true)
+		convey.So(typedNil == nil, convey.ShouldEqual, false)
+		convey.So(text == nil, convey.ShouldEqual, false)
+		convey.So(IsNil(empty), convey.ShouldEqual, true)
+		convey.So(IsNil(typedNil), convey.ShouldEqual, true)
+		convey.So(IsNil(text), convey.ShouldEqual, false)
+	})
+}
diff --git a/mind-cluster/component/ascend-common/common-utils/utils/ip_utils.go b/mind-cluster/component/ascend-common/common-utils/utils/ip_utils.go
new file mode 100644
index 0000000..f3ed96e
--- /dev/null
+++ b/mind-cluster/component/ascend-common/common-utils/utils/ip_utils.go
@@ -0,0 +1,98 @@
+/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+// Package utils offer the some utils for certificate handling
+package utils
+
+import (
+	"errors"
+	"net"
+	"net/http"
+	"regexp"
+	"strings"
+)
+
+const (
+	domainReg = "^[a-zA-Z0-9][a-zA-Z0-9.-]{1,256}[a-zA-Z0-9]$"
+)
+
+// domainPattern is domainReg compiled once, so CheckDomain does not recompile
+// the expression on every call.
+var domainPattern = regexp.MustCompile(domainReg)
+
+// ClientIP try to get the clientIP
+// The sources are tried in order: the first entry of the X-Forwarded-For
+// header, the X-Real-Ip header, and finally the host part of r.RemoteAddr.
+// An empty string is returned when none of them yields a value or r is nil.
+// The header values are returned as sent, without being parsed as addresses.
+func ClientIP(r *http.Request) string {
+	if r == nil {
+		return ""
+	}
+	// get forward ip firstly: only the first entry of the list is considered
+	forwarded := r.Header.Get("X-Forwarded-For")
+	if idx := strings.IndexByte(forwarded, ','); idx >= 0 {
+		forwarded = forwarded[:idx]
+	}
+	if ip := strings.TrimSpace(forwarded); ip != "" {
+		return ip
+	}
+	// try get ip from "X-Real-Ip"
+	if ip := strings.TrimSpace(r.Header.Get("X-Real-Ip")); ip != "" {
+		return ip
+	}
+	ip, _, err := net.SplitHostPort(strings.TrimSpace(r.RemoteAddr))
+	if err != nil {
+		return ""
+	}
+	return ip
+}
+
+// CheckDomain check domain which by regex and blacklist
+// Every domain must match domainReg. When forLocalUsage is true the domain
+// must additionally not consist of digits only and must not contain "localhost".
+func CheckDomain(domain string, forLocalUsage bool) error {
+	if !domainPattern.MatchString(domain) {
+		return errors.New("domain does not match allowed regex")
+	}
+	if !forLocalUsage {
+		return nil
+	}
+	if IsDigitString(domain) {
+		return errors.New("domain can not be all digits")
+	}
+	if strings.Contains(domain, "localhost") {
+		return errors.New("domain can not contain localhost")
+	}
+	return nil
+}
+
+// IsHostValid check if the host is valid
+// A host that parses as an IP address is checked with IsIPValid; anything
+// else is checked as a domain name with CheckDomain(host, false).
+func IsHostValid(host string) error {
+	parsedIp := net.ParseIP(host)
+	if parsedIp != nil {
+		return IsIPValid(parsedIp)
+	}
+	return CheckDomain(host, false)
+}
+
+// IsIPValid check ip valid
+// It rejects a nil IP, an IP that is neither IPv4 nor IPv6, the unspecified
+// (all zeros) address and multicast addresses.
+func IsIPValid(parsedIp net.IP) error {
+	if parsedIp == nil {
+		return errors.New("parse ip is nil")
+	}
+	if parsedIp.To4() == nil && parsedIp.To16() == nil {
+		return errors.New("not a valid ipv4 or ipv6 ip")
+	}
+	if parsedIp.IsUnspecified() {
+		return errors.New("is all zeros ip")
+	}
+	if parsedIp.IsMulticast() {
+		return errors.New("is multicast ip")
+	}
+	return nil
+}
diff --git a/mind-cluster/component/ascend-common/common-utils/utils/ip_utils_test.go b/mind-cluster/component/ascend-common/common-utils/utils/ip_utils_test.go
new file mode 100644
index 0000000..6ad93ab
--- /dev/null
+++ b/mind-cluster/component/ascend-common/common-utils/utils/ip_utils_test.go
@@ -0,0 +1,182 @@
+/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/ + +// Package utils offer the some utils for certificate handling +package utils + +import ( + "net/http" + "testing" + + "github.com/smartystreets/goconvey/convey" +) + +const ( + localhost = "127.0.0.1" + localhostLoop = "0.0.0.0" +) + +func TestClientIP(t *testing.T) { + convey.Convey("test ClientIP func", t, func() { + convey.Convey("get IP from X-Forwarded-For", func() { + ip := ClientIP(mockRequest(map[string][]string{"X-Forwarded-For": {localhost, localhostLoop}})) + convey.So(ip, convey.ShouldEqual, localhost) + }) + convey.Convey("get IP from X-Real-Ip", func() { + ip := ClientIP(mockRequest(map[string][]string{"X-Forwarded-For": {}, + "X-Real-Ip": {localhost}})) + convey.So(ip, convey.ShouldEqual, localhost) + }) + convey.Convey("get IP from RemoteAddr", func() { + ip := ClientIP(mockRequest(map[string][]string{"X-Forwarded-For": {}, + "X-Real-Ip": {}})) + convey.So(ip, convey.ShouldEqual, localhost) + }) + convey.Convey("get IP from RemoteAddr failed", func() { + ip := ClientIP(&http.Request{RemoteAddr: localhost}) + convey.So(ip, convey.ShouldEqual, "") + }) + convey.Convey("get IP failed", func() { + ip := ClientIP(&http.Request{}) + convey.So(ip, convey.ShouldEqual, "") + }) + }) +} + +func mockRequest(header map[string][]string) *http.Request { + return &http.Request{ + Method: "GET", + URL: nil, + Proto: "HTTP", + ProtoMajor: 0, + ProtoMinor: 0, + Header: header, + ContentLength: 0, + Close: false, + Host: "www.test.com", + RemoteAddr: "127.0.0.1:8080", + } +} + +func TestCheckDomain(t *testing.T) { + convey.Convey("CheckDomain function test suite", t, func() { + testDomainFormatValidation() + testLocalUsageConstraints() + testParameterCombinations() + }) +} + +// Test domain format validation +func testDomainFormatValidation() { + convey.Convey("Validate domain format rules", func() { + convey.Convey("Valid domain should pass validation", func() { + err := CheckDomain("example.com", false) + convey.So(err, convey.ShouldBeNil) + }) + + 
convey.Convey("Domain with special characters should be rejected", func() { + err := CheckDomain("example@com", false) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldContainSubstring, "domain does not match allowed regex") + }) + + convey.Convey("Domain starting with hyphen should be rejected", func() { + err := CheckDomain("-example.com", false) + convey.So(err, convey.ShouldNotBeNil) + }) + }) +} + +// Test local usage constraints +func testLocalUsageConstraints() { + convey.Convey("Validate constraints for local usage (forLocalUsage=true)", func() { + convey.Convey("All-digit domain should be rejected", func() { + err := CheckDomain("123456", true) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldContainSubstring, "domain can not be all digits") + }) + + convey.Convey("Domain containing 'localhost' should be rejected", func() { + err := CheckDomain("my-localhost.com", true) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldContainSubstring, "domain can not contain localhost") + }) + + convey.Convey("Valid local domain should pass validation", func() { + err := CheckDomain("local-app.example", true) + convey.So(err, convey.ShouldBeNil) + }) + }) +} + +// Test parameter combinations +func testParameterCombinations() { + convey.Convey("Validate parameter combinations", func() { + convey.Convey("All-digit restriction ignored when forLocalUsage=false", func() { + err := CheckDomain("123456", false) + convey.So(err, convey.ShouldBeNil) + }) + + convey.Convey("DNS check skipped when forLocalUsage=false", func() { + err := CheckDomain("unresolvable.test", false) + convey.So(err, convey.ShouldBeNil) + }) + }) +} + +func TestIsHostValid(t *testing.T) { + tests := []struct { + name string + ip string + wantErr bool + errMsg string + }{ + { + name: "invalid IP format but domain", ip: "not.an.ip", + wantErr: false, + }, + { + name: "valid IPv4", ip: "192.168.1.1", wantErr: false, + }, + { 
+ name: "valid IPv6", ip: "2001:0db8:85a3:0000:0000:8a2e:0370:7334", + wantErr: false, + }, + { + name: "unspecified IPv4", ip: "0.0.0.0", + wantErr: true, errMsg: "is all zeros ip", + }, + { + name: "unspecified IPv6", ip: "::", + wantErr: true, errMsg: "is all zeros ip", + }, + { + name: "IPv6 multicast", ip: "ff02::1", + wantErr: true, errMsg: "is multicast ip", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := IsHostValid(tt.ip) + if (err != nil) != tt.wantErr { + t.Errorf("IsIPValid() error = %v, wantErr %v", err, tt.wantErr) + return + } + if err != nil && err.Error() != tt.errMsg { + t.Errorf("IsIPValid() error = %v, wantErrMsg %v", + err.Error(), tt.errMsg) + } + }) + } +} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/path.go b/mind-cluster/component/ascend-common/common-utils/utils/path.go new file mode 100644 index 0000000..b3150b9 --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/utils/path.go @@ -0,0 +1,382 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package utils provides the util func +package utils + +import ( + "bufio" + "errors" + "fmt" + "io" + "io/fs" + "log" + "os" + "os/exec" + "path" + "path/filepath" + "strings" + "syscall" +) + +const ( + dirMode = 0700 + + rootUID = 0 + maxPathDepth = 20 + maxPathLength = 1024 + // DefaultWriteFileMode default file mode for write permission check + DefaultWriteFileMode = 0022 + + ldSplitLen = 2 + ldLibNameIndex = 0 + ldLibPathIndex = 1 + ldCommand = "/sbin/ldconfig" + ldParam = "--print-cache" + // LdLibPath LD_LIBRARY_PATH + LdLibPath = "LD_LIBRARY_PATH" + grepCommand = "/bin/grep" +) + +// IsDir check whether the path is a directory. +func IsDir(path string) bool { + if path == "" { + return false + } + + if !IsExist(path) { + return path[len(path)-1:] == "/" + } + s, err := os.Stat(path) + if err != nil { + return false + } + return s.IsDir() +} + +// IsFile check whether the path is a file +func IsFile(path string) bool { + if path == "" { + return false + } + return !IsDir(path) +} + +// IsSoftlink check whether the path is softlink +func IsSoftlink(path string) (bool, error) { + file, err := os.Open(path) + if err != nil { + return false, err + } + defer file.Close() + fileInfo, err := file.Stat() + if err != nil { + return false, err + } + if (fileInfo.Mode() & fs.ModeSymlink) != 0 { + return true, nil + } + return false, nil +} + +// IsExist check whether the path exists, If the file is a symbolic link, the returned the final FileInfo +func IsExist(filePath string) bool { + _, err := os.Stat(filePath) + if err == nil { + return true + } + if os.IsExist(err) { + return true + } + return false +} + +// IsLexist check whether the path exists, If the file is a symbolic link, the returned FileInfo +// describes the symbolic link +func IsLexist(filePath string) bool { + _, err := os.Lstat(filePath) + if err == nil { + return true + } + if os.IsExist(err) { + return true + } + return false +} + +// CheckPath validate given path and return resolved 
absolute path +func CheckPath(path string) (string, error) { + if path == "" { + return path, nil + } + origin := path + for !IsLexist(path) { + path = filepath.Dir(path) + if path == "." { + return "", os.ErrNotExist + } + } + absPath, err := filepath.Abs(path) + if err != nil { + return "", fmt.Errorf("get the absolute path failed: %v", err) + } + resoledPath, err := filepath.EvalSymlinks(absPath) + if err != nil { + if strings.Contains(err.Error(), "no such file or directory") { + return "", os.ErrNotExist + } + return "", fmt.Errorf("get the symlinks path failed: %v", err) + } + if absPath != resoledPath { + return "", errors.New("can't support symlinks") + } + // get the original full path + absOrigin, err := filepath.Abs(origin) + if err != nil { + return "", fmt.Errorf("get the absolute path failed: %v", err) + } + return absOrigin, nil +} + +// MakeSureDir create directory. The last element of path should end with slash, or it will be omitted. +func MakeSureDir(path string) error { + dir := filepath.Dir(path) + if IsExist(dir) { + return nil + } + + if err := os.MkdirAll(dir, dirMode); err != nil { + return fmt.Errorf("create directory failed: %v", err) + } + + return nil +} + +// CheckMode check input file mode whether includes invalid mode. +// For example, if read operation of group and other is forbidden, then call CheckMode(inputFileMode, 0044). +// All operations are forbidden for group and other, then call CheckMode(inputFileMode, 0077). 
+// Write operation is forbidden for group and other by default, with calling CheckMode(inputFileMode) +func CheckMode(mode os.FileMode, optional ...os.FileMode) bool { + var targetMode os.FileMode + if len(optional) > 0 { + targetMode = optional[0] + } else { + targetMode = DefaultWriteFileMode + } + checkMode := uint32(mode) & uint32(targetMode) + return checkMode == 0 +} + +// CheckOwnerAndPermission check path owner and permission +func CheckOwnerAndPermission(verifyPath string, mode os.FileMode, uid uint32) (string, error) { + if verifyPath == "" { + return verifyPath, errors.New("empty path") + } + absPath, err := filepath.Abs(verifyPath) + if err != nil { + return "", fmt.Errorf("abs failed %v", err) + } + resoledPath, err := filepath.EvalSymlinks(absPath) + if err != nil { + return "", fmt.Errorf("evalSymlinks failed %v", err) + } + // if symlinks + if absPath != resoledPath { + // check symlinks its self owner + pathInfo, err := os.Lstat(absPath) + if err != nil { + return "", fmt.Errorf("lstat failed, %v", err) + } + stat, ok := pathInfo.Sys().(*syscall.Stat_t) + if !ok || stat.Uid != uid { + return "", errors.New("symlinks owner may not root") + } + } + pathInfo, err := os.Stat(resoledPath) + if err != nil { + return "", fmt.Errorf("stat failed %v", err) + } + stat, ok := pathInfo.Sys().(*syscall.Stat_t) + if !ok || stat.Uid != uid || !CheckMode(pathInfo.Mode(), mode) { + return "", errors.New("check uid or mode failed") + } + return resoledPath, nil +} + +// DoCheckOwnerAndPermission check path owner and permission +func DoCheckOwnerAndPermission(path string, mode os.FileMode, uid uint32) error { + if !IsExist(path) { + return nil + } + pathInfo, err := os.Stat(path) + if err != nil { + return fmt.Errorf("stat failed %v", err) + } + stat, ok := pathInfo.Sys().(*syscall.Stat_t) + if !ok || stat.Uid != uid || !CheckMode(pathInfo.Mode(), mode) { + return fmt.Errorf("check uid or mode failed : %v", path) + } + return nil +} + +func checkAbsPath(libPath 
string) (string, error) { + absLibPath, err := CheckOwnerAndPermission(libPath, DefaultWriteFileMode, rootUID) + if err != nil { + return "", fmt.Errorf("%s: %v", libPath, err) + } + count := 0 + fPath := absLibPath + for { + if count >= maxPathDepth { + break + } + count++ + if fPath == "/" { + return absLibPath, nil + } + fPath = filepath.Dir(fPath) + if _, err := CheckOwnerAndPermission(fPath, DefaultWriteFileMode, rootUID); err != nil { + return "", fmt.Errorf("%s: %v", fPath, err) + } + } + return "", errors.New("absolute path check failed") +} + +func checkLibsPath(libraryPaths []string) (string, error) { + errs := make([]string, 0, len(libraryPaths)) + for _, libraryAbsName := range libraryPaths { + absLibPath, err := checkAbsPath(libraryAbsName) + if err == nil { + return absLibPath, nil + } + errs = append(errs, fmt.Sprintf("%s;", err.Error())) + } + return "", fmt.Errorf("lib path is invalid, %v", errs) +} + +func getLibFromEnv(libraryName string) (string, error) { + ldLibraryPath := os.Getenv(LdLibPath) + if len(ldLibraryPath) > maxPathLength { + return "", fmt.Errorf("invalid library path env") + } + libraryPaths := strings.Split(ldLibraryPath, ":") + targetLibs := make([]string, 0, len(ldLibraryPath)) + for _, libraryPath := range libraryPaths { + libraryAbsName := path.Join(libraryPath, libraryName) + if len(libraryAbsName) > maxPathLength || !IsLexist(libraryAbsName) { + continue + } + targetLibs = append(targetLibs, libraryAbsName) + } + if len(libraryPaths) == 0 { + return "", errors.New("file path no exist or too long") + } + return checkLibsPath(targetLibs) +} + +func trimSpaceTable(data string) string { + data = strings.Replace(data, " ", "", -1) + data = strings.Replace(data, "\t", "", -1) + data = strings.Replace(data, "\n", "", -1) + return data +} + +func parserLibPath(line, libraryName string) string { + ldInfo := strings.Split(line, "=>") + if len(ldInfo) < ldSplitLen { + return "" + } + libNames := strings.Split(ldInfo[ldLibNameIndex], " 
") + for index, libName := range libNames { + if index >= maxPathDepth { + break + } + if len(libName) == 0 { + continue + } + if name := trimSpaceTable(libName); name != libraryName { + continue + } + return trimSpaceTable(ldInfo[ldLibPathIndex]) + } + return "" +} + +func parseLibFromLdCmd(libraryName string) (string, error) { + ldCmd := exec.Command(ldCommand, ldParam) + grepCmd := exec.Command(grepCommand, libraryName) + ldCmdStdout, err := ldCmd.StdoutPipe() + if err != nil { + return "", fmt.Errorf("command exec failed: %v", err) + } + grepCmd.Stdin = ldCmdStdout + stdout, err := grepCmd.StdoutPipe() + if err != nil { + return "", fmt.Errorf("get pipe failed: %v", err) + } + if err = grepCmd.Start(); err != nil { + return "", fmt.Errorf("command exec failed: %v", err) + } + if err = ldCmd.Run(); err != nil { + return "", fmt.Errorf("command exec failed: %v", err) + } + defer func() { + if err = grepCmd.Wait(); err != nil { + log.Printf("command exec failed, %v", err) + } + }() + reader := bufio.NewReader(stdout) + count := 0 + line := "" + for { + if count >= maxPathLength { + err = errors.New("too many items in command stdout") + break + } + count++ + line, err = reader.ReadString('\n') + if err != nil || io.EOF == err { + break + } + if libPath := parserLibPath(line, libraryName); libPath != "" { + return libPath, nil + } + } + return "", fmt.Errorf("can't find valid lib: %v", err) +} + +func getLibFromLdCmd(libraryName string) (string, error) { + libraryAbsName, err := parseLibFromLdCmd(libraryName) + if err != nil { + return "", err + } + var absLibPath string + if absLibPath, err = checkAbsPath(libraryAbsName); err == nil { + return absLibPath, nil + } + return "", fmt.Errorf("driver lib is not exist or it's permission is invalid, %v", err) +} + +// GetDriverLibPath get driver lib path from ld config +func GetDriverLibPath(libraryName string) (string, error) { + var libPath string + var envErr, cmdErr error + if libPath, envErr = 
getLibFromEnv(libraryName); envErr == nil { + return libPath, nil + } + if libPath, cmdErr = getLibFromLdCmd(libraryName); cmdErr == nil { + return libPath, nil + } + return "", fmt.Errorf("cannot found valid driver lib, fromEnv: %v, fromLdCmd: %v", envErr, cmdErr) +} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/path_test.go b/mind-cluster/component/ascend-common/common-utils/utils/path_test.go new file mode 100644 index 0000000..4e2346f --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/utils/path_test.go @@ -0,0 +1,232 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package utils provides the util func +package utils + +import ( + "errors" + "fmt" + "os" + "path/filepath" + "syscall" + "testing" + "time" + + "github.com/agiledragon/gomonkey/v2" + "github.com/smartystreets/goconvey/convey" +) + +func TestIsDir(t *testing.T) { + convey.Convey("test logger", t, func() { + convey.Convey("test IsDir func", func() { + res := IsDir("/tmp/") + convey.So(res, convey.ShouldBeTrue) + res = IsDir("/utils/") + convey.So(res, convey.ShouldBeTrue) + res = IsDir("") + convey.So(res, convey.ShouldBeFalse) + }) + }) +} + +func TestIsFile(t *testing.T) { + convey.Convey("test IsFile func", t, func() { + res := IsFile("/tmp/") + convey.So(res, convey.ShouldBeFalse) + res = IsFile("") + convey.So(res, convey.ShouldBeFalse) + }) +} + +func TestIsExist(t *testing.T) { + convey.Convey("test IsExist func", t, func() { + res := IsExist("/xxxx/") + convey.So(res, convey.ShouldBeFalse) + }) +} + +func TestIsLexist(t *testing.T) { + convey.Convey("test IsLexist func", t, func() { + res := IsLexist("/xxxx/") + convey.So(res, convey.ShouldBeFalse) + }) +} + +func TestCheckPath(t *testing.T) { + convey.Convey("test CheckPath func", t, func() { + convey.Convey("should return itself given empty string", func() { + res, err := CheckPath("") + convey.So(res, convey.ShouldBeEmpty) + convey.So(err, convey.ShouldBeNil) + }) + + convey.Convey("should return error given not exist path", func() { + res, err := CheckPath("xxxxxxx") + convey.So(res, convey.ShouldBeEmpty) + convey.So(err.Error(), convey.ShouldEqual, "file does not exist") + }) + + convey.Convey("should return resolve path given normal path", func() { + res, err := CheckPath("../../go.mod") + convey.So(res, convey.ShouldNotBeEmpty) + convey.So(err, convey.ShouldBeNil) + }) + + convey.Convey("should return err when get abs path failed", func() { + absStub := gomonkey.ApplyFunc(filepath.Abs, func(path string) (string, error) { + return "", errors.New("abs failed") + }) + defer absStub.Reset() + 
res, err := CheckPath("../../go.mod") + convey.So(res, convey.ShouldBeEmpty) + convey.So(err.Error(), convey.ShouldEqual, "get the absolute path failed: abs failed") + }) + + convey.Convey("should return err when get eval symbol link failed", func() { + symStub := gomonkey.ApplyFunc(filepath.EvalSymlinks, func(path string) (string, error) { + return "", errors.New("symlinks path failed") + }) + defer symStub.Reset() + res, err := CheckPath("../../go.mod") + convey.So(res, convey.ShouldBeEmpty) + convey.So(err.Error(), convey.ShouldEqual, "get the symlinks path failed: symlinks path failed") + }) + + convey.Convey("should return err given symbol link", func() { + symStub := gomonkey.ApplyFunc(filepath.EvalSymlinks, func(path string) (string, error) { + return "xxx", nil + }) + defer symStub.Reset() + res, err := CheckPath("../../go.mod") + convey.So(res, convey.ShouldBeEmpty) + convey.So(err.Error(), convey.ShouldEqual, "can't support symlinks") + }) + + }) +} + +func TestMakeSureDir(t *testing.T) { + convey.Convey("test MakeSureDir func", t, func() { + convey.Convey("normal situation, no err returned", func() { + err := MakeSureDir("./testdata/tmp/test") + convey.So(err, convey.ShouldEqual, nil) + }) + convey.Convey("abnormal situation,err returned", func() { + mock := gomonkey.ApplyFunc(os.MkdirAll, func(name string, perm os.FileMode) error { + return fmt.Errorf("error") + }) + defer mock.Reset() + err := MakeSureDir("./xxxx/xxx") + convey.So(err.Error(), convey.ShouldEqual, "create directory failed: error") + }) + }) +} + +func TestGetDriverLibPath(t *testing.T) { + convey.Convey("test GetDriverLibPath func", t, func() { + convey.Convey("should return itself given empty string", func() { + err := os.Setenv(LdLibPath, "") + convey.So(err, convey.ShouldBeNil) + res, err := GetDriverLibPath("") + convey.So(res, convey.ShouldBeEmpty) + convey.So(err, convey.ShouldBeError) + }) + + convey.Convey("should return path when getLibFromEnv succeed", func() { + envStub := 
gomonkey.ApplyFunc(getLibFromEnv, func(libraryName string) (string, error) { + return "/test", nil + }) + defer envStub.Reset() + res, err := GetDriverLibPath("") + convey.So(res, convey.ShouldEqual, "/test") + convey.So(err, convey.ShouldBeNil) + }) + + convey.Convey("should return path when getLibFromEnv failed but getLibFromLdCmd succeed", func() { + envStub := gomonkey.ApplyFunc(getLibFromEnv, func(libraryName string) (string, error) { + return "", errors.New("failed") + }) + defer envStub.Reset() + cmdStub := gomonkey.ApplyFunc(getLibFromLdCmd, func(libraryName string) (string, error) { + return "/test", nil + }) + defer cmdStub.Reset() + res, err := GetDriverLibPath("") + convey.So(res, convey.ShouldEqual, "/test") + convey.So(err, convey.ShouldBeNil) + }) + + }) +} + +type mockFileInfo struct { + mode os.FileMode + sys interface{} +} + +func (m *mockFileInfo) Name() string { return "mock" } +func (m *mockFileInfo) Size() int64 { return 0 } +func (m *mockFileInfo) Mode() os.FileMode { return m.mode } +func (m *mockFileInfo) ModTime() time.Time { return time.Now() } +func (m *mockFileInfo) IsDir() bool { return false } +func (m *mockFileInfo) Sys() interface{} { return m.sys } + +func TestDoCheckOwnerAndPermission(t *testing.T) { + var testPath = "/test" + var testMode os.FileMode = 0660 + var excludePermissions os.FileMode = 0002 + patch := gomonkey.NewPatches() + defer patch.Reset() + convey.Convey("should return nil when path is not exist", t, func() { + patch.ApplyFuncReturn(IsExist, false) + defer patch.Reset() + err := DoCheckOwnerAndPermission(testPath, excludePermissions, rootUID) + convey.So(err, convey.ShouldBeNil) + }) + + patch.ApplyFuncReturn(IsExist, true) + convey.Convey("should return err when stat failed", t, func() { + patch.ApplyFuncReturn(os.Stat, nil, os.ErrNotExist) + defer patch.Reset() + err := DoCheckOwnerAndPermission(testPath, excludePermissions, rootUID) + convey.So(err.Error(), convey.ShouldContainSubstring, "stat failed") + }) + + 
convey.Convey("should return err when get uid failed", t, func() { + patch.ApplyFuncReturn(os.Stat, &mockFileInfo{mode: testMode, sys: "invalid-type"}, nil) + defer patch.Reset() + + err := DoCheckOwnerAndPermission(testPath, excludePermissions, rootUID) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldContainSubstring, "check uid or mode failed") + }) + + convey.Convey("should return err when permission check failure", t, func() { + patch.ApplyFuncReturn(os.Stat, &mockFileInfo{mode: testMode, sys: &syscall.Stat_t{Uid: rootUID}}, nil) + patch.ApplyFuncReturn(CheckMode, false) + defer patch.Reset() + err := DoCheckOwnerAndPermission(testPath, excludePermissions, rootUID) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldContainSubstring, "check uid or mode failed") + }) + + convey.Convey("should return nil where all checks pass", t, func() { + patch.ApplyFuncReturn(os.Stat, &mockFileInfo{mode: testMode, sys: &syscall.Stat_t{Uid: rootUID}}, nil) + patch.ApplyFuncReturn(CheckMode, true) + defer patch.Reset() + err := DoCheckOwnerAndPermission(testPath, excludePermissions, rootUID) + convey.So(err, convey.ShouldBeNil) + }) +} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/pwd_util.go b/mind-cluster/component/ascend-common/common-utils/utils/pwd_util.go new file mode 100644 index 0000000..49c2f36 --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/utils/pwd_util.go @@ -0,0 +1,75 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package utils this file for password handler +package utils + +import ( + "bytes" + "errors" + "regexp" +) + +const ( + lowercaseCharactersRegex = `[a-z]{1,}` + uppercaseCharactersRegex = `[A-Z]{1,}` + baseNumberRegex = `[0-9]{1,}` + specialCharactersRegex = `[!\"#$%&'()*+,\-. /:;<=>?@[\\\]^_\x60{|}~]{1,}` + passWordRegex = `^[a-zA-Z0-9!\"#$%&'()*+,\-. /:;<=>?@[\\\]^_\x60{|}~]{8,64}$` + minComplexCount = 2 +) + +// CheckPassWordComplexity check password complexity +func CheckPassWordComplexity(s []byte) error { + complexCheckRegexArr := []string{ + lowercaseCharactersRegex, + uppercaseCharactersRegex, + baseNumberRegex, + specialCharactersRegex, + } + complexCount := 0 + for _, pattern := range complexCheckRegexArr { + if matched, err := regexp.Match(pattern, s); matched && err == nil { + complexCount++ + } + } + if complexCount < minComplexCount { + return errors.New("password complex not meet the requirement") + } + return nil +} + +// ValidatePassWord validate password +func ValidatePassWord(userName string, passWord []byte) error { + if err := commonCheckForPassWord(userName, passWord); err != nil { + return err + } + return CheckPassWordComplexity(passWord) +} + +func commonCheckForPassWord(userName string, passWord []byte) error { + if matched, err := regexp.Match(passWordRegex, passWord); err != nil || !matched { + return errors.New("password not meet requirement") + } + var userNameByte []byte = []byte(userName) + if bytes.Equal(userNameByte, passWord) { + return errors.New("password cannot equals username") + } + var reverseUserName = 
ReverseString(userName) + var reverseUserNameByte []byte = []byte(reverseUserName) + if bytes.Equal(reverseUserNameByte, passWord) { + return errors.New("password cannot equal reversed username") + } + return nil +} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/pwd_util_test.go b/mind-cluster/component/ascend-common/common-utils/utils/pwd_util_test.go new file mode 100644 index 0000000..808c231 --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/utils/pwd_util_test.go @@ -0,0 +1,59 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package utils this file for password handler +package utils + +import ( + "testing" + + "github.com/smartystreets/goconvey/convey" +) + +var ( + truePasswd = []byte("aA0!\"#$%&'()*+,-. 
/:;<=>?@[\\]^_`{|}~") + falsePasswd1 = []byte("userName") + falsePasswd2 = []byte("12345678") + falsePasswd3 = []byte("1234567") + falsePasswd4 = []byte("emaNresu.") + falsePasswd5 = []byte("不支持特殊字符测试test") +) + +// TestCommonCheckForPassWord test common check for passWord +func TestCommonCheckForPassWord(t *testing.T) { + convey.Convey("correct password", t, func() { + err := ValidatePassWord("userName", truePasswd) + convey.So(err, convey.ShouldBeNil) + }) + convey.Convey("username == password", t, func() { + err := ValidatePassWord("userName", falsePasswd1) + convey.So(err.Error(), convey.ShouldEqual, "password cannot equals username") + }) + convey.Convey("complex not meet the requirement", t, func() { + err := ValidatePassWord("userName", falsePasswd2) + convey.So(err.Error(), convey.ShouldEqual, "password complex not meet the requirement") + }) + convey.Convey("password too short", t, func() { + err := ValidatePassWord("userName", falsePasswd3) + convey.So(err.Error(), convey.ShouldEqual, "password not meet requirement") + }) + convey.Convey("username equal reverse password", t, func() { + err := ValidatePassWord(".userName", falsePasswd4) + convey.So(err.Error(), convey.ShouldEqual, "password cannot equal reversed username") + }) + convey.Convey("test special ", t, func() { + err := ValidatePassWord("userName", falsePasswd5) + convey.So(err.Error(), convey.ShouldEqual, "password not meet requirement") + }) +} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/slice.go b/mind-cluster/component/ascend-common/common-utils/utils/slice.go new file mode 100644 index 0000000..f673bc1 --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/utils/slice.go @@ -0,0 +1,129 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
// Contains reports whether target is present in sources.
// A nil or empty slice never contains anything.
//
// It delegates to slices.Contains from the standard library, the same helper
// CheckSliceSupport in this file relies on.
func Contains[T comparable](sources []T, target T) bool {
	return slices.Contains(sources, target)
}
// RemoveDuplicates returns a new slice holding the elements of slice with every
// repeated value dropped; the first occurrence of each value keeps its relative
// position. The input is not modified and the result is never nil.
func RemoveDuplicates[T comparable](slice []T) []T {
	// len(slice) is an upper bound for both containers, so neither has to grow.
	seen := make(map[T]struct{}, len(slice))
	result := make([]T, 0, len(slice))
	for _, item := range slice {
		if _, ok := seen[item]; ok {
			continue
		}
		seen[item] = struct{}{}
		result = append(result, item)
	}
	return result
}
All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package utils this file for slice utils +package utils + +import ( + "fmt" + "reflect" + "testing" + + "github.com/smartystreets/goconvey/convey" +) + +const ( + decimal1A = 26 + decimalFF = 255 + decimalNeg10 = 16 + decimalNegFF = -255 +) + +func buildHexStringToIntTestCase() []struct { + name string + input []string + expected map[int64]struct{} +} { + return []struct { + name string + input []string + expected map[int64]struct{} + }{ + { + name: "01 - Valid hex strings", + input: []string{"1A", "FF", "10"}, + expected: map[int64]struct{}{ + decimal1A: {}, + decimalFF: {}, + decimalNeg10: {}, + }, + }, + { + name: "02 - Invalid hex strings", + input: []string{"xyz", "ghijk"}, + expected: map[int64]struct{}{}, + }, + { + name: "03 - Empty input array", + input: []string{}, + expected: map[int64]struct{}{}, + }, + { + name: "04 - Duplicate values should be deduplicated", + input: []string{"0x1A", "1A", "0x1a"}, // All represent 26 in decimal + expected: map[int64]struct{}{ + decimal1A: {}, + }, + }, + { + name: "05 - Mixed valid and invalid inputs", + input: []string{"0x1A", "xyz", "0xFF", "invalid", "0x10"}, + expected: map[int64]struct{}{}, + }, + { + name: "06 - Negative hex numbers", + input: []string{"-0x1A", "-FF"}, + expected: map[int64]struct{}{ + decimalNegFF: {}, + }, + }, + } +} + +func TestHexStringToInt(t *testing.T) { + for _, tt := range buildHexStringToIntTestCase() { + t.Run(tt.name, 
func(t *testing.T) { + result := StringTool.HexStringToInt(tt.input) + for i := range tt.expected { + fmt.Println(i) + } + if len(result) != len(tt.expected) { + t.Errorf("Expected map length %d, but got %d", len(tt.expected), len(result)) + return + } + for key := range tt.expected { + if _, exists := result[key]; !exists { + t.Errorf("Expected key %d not found in result", key) + } + } + for key := range result { + if _, exists := tt.expected[key]; !exists { + t.Errorf("Unexpected key %d found in result", key) + } + } + }) + } +} + +func TestSameElementInMap(t *testing.T) { + for _, tt := range buildSameElementInMapTestCase() { + t.Run(tt.name, func(t *testing.T) { + result := SameElementInMap(tt.sources, tt.targets) + if result != tt.expected { + t.Errorf("SameElementInMap() = %v, expected %v", result, tt.expected) + } + }) + } +} + +func buildSameElementInMapTestCase() []struct { + name string + sources map[int]struct{} + targets []int + expected bool +} { + return []struct { + name string + sources map[int]struct{} + targets []int + expected bool + }{ + { + name: "01 There are identical elements present", + sources: map[int]struct{}{1: {}, 2: {}, 3: {}}, + targets: []int{4, 5, 2}, + expected: true, + }, + { + name: "02 There are no identical elements present\n", + sources: map[int]struct{}{1: {}, 2: {}, 3: {}}, + targets: []int{4, 5, 6}, + expected: false, + }, + { + name: "03 target is nil", + sources: map[int]struct{}{1: {}, 2: {}}, + targets: []int{}, + expected: false, + }, + { + name: "04 source is nil", + sources: map[int]struct{}{}, + targets: []int{1, 2, 3}, + expected: false, + }, + { + name: "05 source and target are both nil", + sources: map[int]struct{}{}, + targets: []int{}, + expected: false, + }, + } +} + +func TestSameElementInMap_StringType(t *testing.T) { + sources := map[string]struct{}{ + "apple": {}, + "banana": {}, + "orange": {}, + } + targets := []string{"grape", "apple", "kiwi"} + result := SameElementInMap(sources, targets) + if 
!result { + t.Errorf("SameElementInMap() with string type should return true, got false") + } + targetsNoMatch := []string{"grape", "kiwi", "mango"} + resultNoMatch := SameElementInMap(sources, targetsNoMatch) + if resultNoMatch { + t.Errorf("SameElementInMap() with string type should return false, got true") + } +} + +func TestContains(t *testing.T) { + for _, tt := range buildContainsTestCase() { + t.Run(tt.name, func(t *testing.T) { + switch s1 := tt.source.(type) { + case []int: + s2 := tt.target.(int) + result := Contains(s1, s2) + if !reflect.DeepEqual(result, tt.expected) { + t.Errorf("Contains() = %v, want %v", result, tt.expected) + } + case []string: + s2 := tt.target.(string) + result := Contains(s1, s2) + if !reflect.DeepEqual(result, tt.expected) { + t.Errorf("Contains() = %v, want %v", result, tt.expected) + } + default: + t.Errorf("unsupported type") + } + }) + } +} + +func buildContainsTestCase() []struct { + name string + source interface{} + target interface{} + expected bool +} { + return []struct { + name string + source interface{} + target interface{} + expected bool + }{ + { + name: "01 contains for int type", + source: []int{1, 2, 3, 4}, + target: 1, + expected: true, + }, + { + name: "02 not contains for int type", + source: []int{1, 2, 3, 4}, + target: 0, + expected: false, + }, + { + name: "03 contains for string type", + source: []string{"1", "2", "3", "4"}, + target: "1", + expected: true, + }, + { + name: "04 not contains for string type", + source: []string{"1", "2", "3", "4"}, + target: "0", + expected: false, + }, + { + name: "05 empty source slice", + source: []int{}, + target: 1, + expected: false, + }, + } +} + +func TestRemove(t *testing.T) { + for _, tt := range buildRemoveTestCase() { + t.Run(tt.name, func(t *testing.T) { + switch s1 := tt.source.(type) { + case []int: + s2 := tt.target.(int) + result := Remove(s1, s2) + expected := tt.expected.([]int) + if !reflect.DeepEqual(result, expected) { + t.Errorf("Contains() = %v, 
want %v", result, expected) + } + case []string: + s2 := tt.target.(string) + result := Remove(s1, s2) + expected := tt.expected.([]string) + if !reflect.DeepEqual(result, expected) { + t.Errorf("RemoveElementsNotInSecond() = %v, want %v", result, expected) + } + default: + t.Errorf("unsupported type") + } + }) + } +} + +func buildRemoveTestCase() []struct { + name string + source interface{} + target interface{} + expected interface{} +} { + return []struct { + name string + source interface{} + target interface{} + expected interface{} + }{ + { + name: "01 contains for int type", + source: []int{1, 2, 3, 4}, + target: 1, + expected: []int{2, 3, 4}, + }, + { + name: "02 not contains for int type", + source: []int{1, 2, 3, 4}, + target: 0, + expected: []int{1, 2, 3, 4}, + }, + { + name: "03 contains for string type", + source: []string{"1", "2", "3", "4"}, + target: "1", + expected: []string{"2", "3", "4"}, + }, + { + name: "04 not contains for string type", + source: []string{"1", "2", "3", "4"}, + target: "0", + expected: []string{"1", "2", "3", "4"}, + }, + { + name: "05 empty source slice", + source: []int{}, + target: 1, + expected: []int{}, + }, + } +} + +func buildRemoveElementsNotInSecondTestCase() []struct { + name string + slice1 interface{} + slice2 interface{} + expected interface{} +} { + return []struct { + name string + slice1 interface{} + slice2 interface{} + expected interface{} + }{ + { + name: "01 Basic functionality - integer slices with partial overlap", + slice1: []int{1, 2, 3, 4}, + slice2: []int{2, 4, 6, 8}, + expected: []int{2, 4}, + }, + { + name: "02 Empty first slice", + slice1: []int{}, + slice2: []int{1, 2, 3}, + expected: []int{}, + }, + { + name: "03 Empty second slice", + slice1: []int{1, 2, 3}, + slice2: []int{}, + expected: []int{}, + }, + { + name: "04 Both slices empty", + slice1: []int{}, + slice2: []int{}, + expected: []int{}, + }, + { + name: "05 No intersection between slices", + slice1: []int{1, 2, 3}, + slice2: []int{4, 
5, 6}, + expected: []int{}, + }, + { + name: "06 String type test", + slice1: []string{"1", "2", "3"}, + slice2: []string{"2", "3", "4"}, + expected: []string{"2", "3"}, + }, + } +} + +func TestRemoveElementsNotInSecond(t *testing.T) { + for _, tt := range buildRemoveElementsNotInSecondTestCase() { + t.Run(tt.name, func(t *testing.T) { + switch s1 := tt.slice1.(type) { + case []int: + s2 := tt.slice2.([]int) + expected := tt.expected.([]int) + result := RemoveElementsNotInSecond(s1, s2) + if !reflect.DeepEqual(result, expected) { + t.Errorf("RemoveElementsNotInSecond() = %v, want %v", result, expected) + } + case []string: + s2 := tt.slice2.([]string) + expected := tt.expected.([]string) + result := RemoveElementsNotInSecond(s1, s2) + if !reflect.DeepEqual(result, expected) { + t.Errorf("RemoveElementsNotInSecond() = %v, want %v", result, expected) + } + default: + t.Errorf("unsupported type") + } + }) + } +} + +func buildRemoveEleSliTestCase() []struct { + name string + source interface{} + target interface{} + expected interface{} +} { + return []struct { + name string + source interface{} + target interface{} + expected interface{} + }{ + { + name: "01 int type", + source: []int{1, 2, 3, 4, 5}, + target: []int{2, 4}, + expected: []int{1, 3, 5}, + }, + { + name: "02 source is empty for int type", + source: []int{}, + target: []int{1, 2}, + expected: []int{}, + }, + { + name: "03 target is empty for int type", + source: []int{1, 2, 3}, + target: []int{}, + expected: []int{1, 2, 3}, + }, + { + name: "04 source and target are both empty for int type", + source: []int{}, + target: []int{}, + expected: []int{}, + }, + { + name: "05 string type", + source: []string{"a", "b", "c", "d"}, + target: []string{"b", "d"}, + expected: []string{"a", "c"}, + }, + } +} + +func TestRemoveEleSli(t *testing.T) { + for _, tt := range buildRemoveEleSliTestCase() { + t.Run(tt.name, func(t *testing.T) { + switch s1 := tt.source.(type) { + case []int: + s2 := tt.target.([]int) + expected 
:= tt.expected.([]int) + result := RemoveEleSli(s1, s2) + if !reflect.DeepEqual(result, expected) { + t.Errorf("RemoveEleSli() = %v, want %v", result, expected) + } + case []string: + s2 := tt.target.([]string) + expected := tt.expected.([]string) + result := RemoveEleSli(s1, s2) + if !reflect.DeepEqual(result, expected) { + t.Errorf("RemoveEleSli() = %v, want %v", result, expected) + } + default: + t.Errorf("unsupported type") + } + }) + } +} + +func buildRemoveDuplicatesCase() []struct { + name string + input interface{} + expected interface{} +} { + return []struct { + name string + input interface{} + expected interface{} + }{ + { + name: "01 empty slice for int type", + input: []int{}, + expected: []int{}, + }, + { + name: "02 no duplicates for int type", + input: []int{1, 2, 3}, + expected: []int{1, 2, 3}, + }, + { + name: "03 with duplicates for int type", + input: []int{1, 2, 2, 3, 1, 4}, + expected: []int{1, 2, 3, 4}, + }, + { + name: "04 with duplicates for string type", + input: []string{"1", "3", "3", "4"}, + expected: []string{"1", "3", "4"}, + }, + } +} + +func TestRemoveDuplicates(t *testing.T) { + for _, tt := range buildRemoveDuplicatesCase() { + t.Run(tt.name, func(t *testing.T) { + switch s1 := tt.input.(type) { + case []int: + expected := tt.expected.([]int) + result := RemoveDuplicates(s1) + if !reflect.DeepEqual(result, expected) { + t.Errorf("RemoveDuplicates() = %v, want %v", result, expected) + } + case []string: + expected := tt.expected.([]string) + result := RemoveDuplicates(s1) + if !reflect.DeepEqual(result, expected) { + t.Errorf("RemoveDuplicates() = %v, want %v", result, expected) + } + default: + t.Errorf("unsupported type") + } + }) + } +} + +func TestCheckSliceSupport(t *testing.T) { + convey.Convey("test TestCheckSliceSupport, check ok", t, func() { + elements := []int64{1, 2} + expects := []int64{1, 2, 3} + err := CheckSliceSupport(elements, expects) + convey.So(err, convey.ShouldBeNil) + }) + convey.Convey("test 
TestCheckSliceSupport, check fail", t, func() { + elements := []int64{1, 2, 4} + expects := []int64{1, 2, 3} + err := CheckSliceSupport(elements, expects) + convey.So(err, convey.ShouldNotBeNil) + }) +} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/strings.go b/mind-cluster/component/ascend-common/common-utils/utils/strings.go new file mode 100644 index 0000000..c3d98aa --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/utils/strings.go @@ -0,0 +1,75 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
const (
	// maskLen is the number of leading characters replaced by the mask.
	maskLen = 2
)

// ReplacePrefix returns source with its first maskLen characters replaced by
// prefix. An empty prefix defaults to "****". When source has maskLen
// characters or fewer, only the prefix is returned.
//
// Characters are counted as runes, so multi-byte UTF-8 input is masked on
// character boundaries. Slicing the rune slice by the byte length of source
// would run past its end for such input.
func ReplacePrefix(source, prefix string) string {
	if prefix == "" {
		prefix = "****"
	}
	runes := []rune(source)
	if len(runes) <= maskLen {
		return prefix
	}
	return prefix + string(runes[maskLen:])
}

// MaskPrefix masks the beginning of source with "****".
func MaskPrefix(source string) string {
	return ReplacePrefix(source, "")
}

// GetSha256Code returns the SHA-256 digest of data, or nil if writing to the
// hash fails.
func GetSha256Code(data []byte) []byte {
	hash256 := sha256.New()
	if _, err := hash256.Write(data); err != nil {
		fmt.Println(err)
		return nil
	}
	return hash256.Sum(nil)
}

// ReverseString returns s with its runes in reverse order.
func ReverseString(s string) string {
	runes := []rune(s)
	for start, end := 0, len(runes)-1; start < end; start, end = start+1, end-1 {
		runes[start], runes[end] = runes[end], runes[start]
	}
	return string(runes)
}

// IsDigitString reports whether s is non-empty and consists only of runes for
// which unicode.IsDigit is true.
func IsDigitString(s string) bool {
	if len(s) == 0 {
		return false
	}
	for _, c := range s {
		if !unicode.IsDigit(c) {
			return false
		}
	}
	return true
}
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package utils provides the util func +package utils + +import ( + "testing" + + "github.com/smartystreets/goconvey/convey" +) + +const byteLength = 32 + +func TestReplacePrefix(t *testing.T) { + convey.Convey("relative path", t, func() { + path := ReplacePrefix("./testdata/cert/ca.crt", "****") + convey.So(path, convey.ShouldEqual, "****testdata/cert/ca.crt") + }) + convey.Convey("abconvey.Solute path", t, func() { + path := ReplacePrefix("/testdata/cert/ca.crt", "****") + convey.So(path, convey.ShouldEqual, "****estdata/cert/ca.crt") + }) + convey.Convey("path length less than 2", t, func() { + path := ReplacePrefix("/", "****") + convey.So(path, convey.ShouldEqual, "****") + }) + convey.Convey("empty string", t, func() { + path := ReplacePrefix("", "****") + convey.So(path, convey.ShouldEqual, "****") + }) + +} + +func TestMaskPrefix(t *testing.T) { + convey.Convey("relative path", t, func() { + path := MaskPrefix("./testdata/cert/ca.crt") + convey.So(path, convey.ShouldEqual, "****testdata/cert/ca.crt") + }) + convey.Convey("abconvey.Solute path", t, func() { + path := MaskPrefix("/testdata/cert/ca.crt") + convey.So(path, convey.ShouldEqual, "****estdata/cert/ca.crt") + }) + convey.Convey("path length less than 2", t, func() { + path := MaskPrefix("/") + convey.So(path, convey.ShouldEqual, "****") + }) + convey.Convey("empty string", t, func() { + path := MaskPrefix("") + convey.So(path, convey.ShouldEqual, "****") + }) + +} + +func TestGetSha256Code(t *testing.T) { + convey.Convey("test sha256", t, func() { + hashs := GetSha256Code([]byte("this is a 
test sentence")) + convey.So(len(hashs), convey.ShouldEqual, byteLength) + }) +} + +func TestIsDigitString(t *testing.T) { + convey.Convey("test IsDigitString", t, func() { + convey.Convey("case IsDigitString is true", func() { + str := "123" + convey.ShouldBeTrue(IsDigitString(str)) + }) + convey.Convey("case IsDigitString is false", func() { + str := "123a" + convey.ShouldBeFalse(IsDigitString(str)) + }) + }) +} diff --git a/mind-cluster/component/ascend-common/devmanager/a310mgr.go b/mind-cluster/component/ascend-common/devmanager/a310mgr.go new file mode 100644 index 0000000..081f167 --- /dev/null +++ b/mind-cluster/component/ascend-common/devmanager/a310mgr.go @@ -0,0 +1,25 @@ +/* Copyright(C) 2021. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package devmanager this Ascend310 device manager +package devmanager + +import ( + "ascend-common/devmanager/dcmi" +) + +// A310Manager Ascend310 device manager +type A310Manager struct { + dcmi.DcManager +} diff --git a/mind-cluster/component/ascend-common/devmanager/a310pmgr.go b/mind-cluster/component/ascend-common/devmanager/a310pmgr.go new file mode 100644 index 0000000..b32d1fa --- /dev/null +++ b/mind-cluster/component/ascend-common/devmanager/a310pmgr.go @@ -0,0 +1,35 @@ +/* Copyright(C) 2021. Huawei Technologies Co.,Ltd. All rights reserved. 
// A310PManager is the Ascend310P device manager. It embeds dcmi.DcManager and
// overrides the power query with the MCU-based one defined below.
type A310PManager struct {
	dcmi.DcManager
}

// DcGetDevicePowerInfo returns the power reading for the given card on 310P.
// It delegates to DcGetMcuPowerInfo; deviceID is accepted but not used.
func (d *A310PManager) DcGetDevicePowerInfo(cardID, deviceID int32) (float32, error) {
	return d.DcGetMcuPowerInfo(cardID)
}

// DcGetMcuPowerInfo returns the result of dcmi.FuncDcmiMcuGetPowerInfo for
// cardID. This function is only for Ascend310P.
func (d *A310PManager) DcGetMcuPowerInfo(cardID int32) (float32, error) {
	return dcmi.FuncDcmiMcuGetPowerInfo(cardID)
}
// A910Manager is the Ascend910 device manager. It embeds dcmi.DcManager and
// adds the HBM query defined below.
type A910Manager struct {
	dcmi.DcManager
}

// DcGetHbmInfo returns the HBM information of the given card and device by
// calling dcmi.FuncDcmiGetDeviceHbmInfo. Only for Ascend910.
func (d *A910Manager) DcGetHbmInfo(cardID, deviceID int32) (*common.HbmInfo, error) {
	return dcmi.FuncDcmiGetDeviceHbmInfo(cardID, deviceID)
}
// DeviceType pairs a numeric device type code with its display name.
type DeviceType struct {
	// Code is the device type code.
	Code int32
	// Name is the device type name.
	Name string
}

var (
	// ProfilingTime is used when getting PCIe bandwidth.
	ProfilingTime int

	// HccsBWProfilingTime is used when getting hccs bandwidth.
	HccsBWProfilingTime int

	// a3BoardIds is the set of A3 board IDs.
	a3BoardIds = sets.NewInt32(A900A3SuperPodBin1BoardId, A900A3SuperPodBin2BoardId,
		A900A3SuperPodBin3BoardId, A800IA3BoardId)

	// a900A3SuperPodMainBoardIds is the set of A900 A3 SuperPod main board IDs.
	a900A3SuperPodMainBoardIds = sets.NewInt32(A900A3SuperPodMainBoardId1, A900A3SuperPodMainBoardId2)

	// a9000A3SuperPodMainBoardIds is the set of A9000 A3 SuperPod main board IDs.
	a9000A3SuperPodMainBoardIds = sets.NewInt32(A9000A3SuperPodMainBoardId1, A9000A3SuperPodMainBoardId2)
)

// DeviceType values for utilization.
var (
	// AICore Ascend310 & Ascend910
	AICore = DeviceType{Code: 2, Name: "AICore"}
	// HbmUtilization is the utilization rate of hbm.
	HbmUtilization = DeviceType{Code: 6, Name: "Hbm"}
	// VectorCore Ascend310P
	VectorCore = DeviceType{Code: 12, Name: "VectorCore"}
	// Overall is the overall utilization rate of the NPU.
	Overall = DeviceType{Code: 13, Name: "Overall"}
)

// DeviceType values for frequency.
var (
	// AICoreCurrentFreq Ascend310 & Ascend910 & Ascend910B & Ascend310P
	AICoreCurrentFreq = DeviceType{Code: 7, Name: "AICore Current"}
)

const (
	// Success is the interface return code for success.
	Success = 0
	// DeviceNotReadyErrCodeStr is the dcmi interface "device not ready" error code as a string.
	DeviceNotReadyErrCodeStr = "-8012"
	// DeviceNotReadyErrCode is the dcmi interface "device not ready" error code.
	DeviceNotReadyErrCode = -8012
	// CardDropFaultCode is the card drop fault code.
	CardDropFaultCode = 0x40F84E00
	// RetError is returned when a function failed.
	RetError = -1
	// Percent is the constant 100.
	Percent = 100
	// MaxErrorCodeCount is the number of error codes.
	MaxErrorCodeCount = 128
	// UnRetError is the error value for unsigned int returns.
	UnRetError = math.MaxUint32
	// Abnormal is the status string for an abnormal state.
	Abnormal = "Abnormal"
	// ChannelStateOk means the out-of-band channel is ok for resetting.
	ChannelStateOk = 1

	// HiAIMaxCardID is the max card id for an Ascend chip.
	HiAIMaxCardID = math.MaxInt32

	// HiAIMaxCardNum is the max card number.
	HiAIMaxCardNum = 64

	// HiAIMaxDeviceNum is the max device number.
	HiAIMaxDeviceNum = 4

	// NpuType represents an npu chip.
	NpuType = 0

	// ReduceOnePercent is the factor used to scale a value to one percent.
	ReduceOnePercent = 0.01
	// ReduceTenth is the factor used to scale a value to one tenth.
	ReduceTenth = 0.1
	// DefaultTemperatureWhenQueryFailed is the value used when getting the temperature failed.
	DefaultTemperatureWhenQueryFailed = -275

	// Ascend310P is the Ascend 310P chip name.
	Ascend310P = "Ascend310P"
	// Ascend910 is the Ascend 910 chip name.
	Ascend910 = "Ascend910"
	// Ascend910B is the Ascend 910B chip name.
	Ascend910B = "Ascend910B"
	// Ascend910A3 is the Ascend 910A3 chip name.
	Ascend910A3 = "Ascend910A3"
	// Atlas200ISoc is the 200 soc environment name.
	Atlas200ISoc = "Atlas 200I SoC A1"

	// DcmiApiTimeout is the dcmi interface timeout in seconds.
	DcmiApiTimeout = 1

	// SubscribeAllDevice is the device ID used to subscribe to all devices.
	SubscribeAllDevice = -1
	// MinVDevID is the min value of a virtual device id.
	MinVDevID = 100
	// MaxVDevID is the max value of a virtual device id.
	MaxVDevID = 1124

	// InvalidID is the invalid ID value.
	InvalidID = 0xffffffff

	// FailedMetricValue is the value reported for a failed metric.
	FailedMetricValue = -1

	// FailedValue is the value reported for a failed query.
	FailedValue = 0xffffffff

	// MaxErrorCodeLen is the max length of an error code for Prometheus.
	MaxErrorCodeLen = 10
)

const (
	// BootStartFinish means the chip hot reset has finished.
	BootStartFinish = 16
)

const (
	// FaultRecover means a device fault recovered.
	FaultRecover = int8(0)
	// FaultOccur means a device fault occurred.
	FaultOccur = int8(1)
	// FaultOnce means a one-off device fault.
	FaultOnce = int8(2)
)

const (
	// AMPMode is the AMP chip work mode.
	AMPMode = "AMP"
	// SMPMode is the SMP chip work mode.
	SMPMode = "SMP"

	// NetworkInit is the init status of the chip network.
	NetworkInit = 6
	// NetworkSuccess means the chip network is healthy.
	NetworkSuccess = 0

	// MaxProcNum is the process number on the device side.
	MaxProcNum = 32
	// UnitMB is the number of bytes in one MB.
	UnitMB float64 = 1024 * 1024

	// Chip910 is the chip name 910.
	Chip910 = "910"

	// A300IA2BoardId is the board id of A300I A2 and 910proB.
	A300IA2BoardId = 0x28

	// A300IA2GB64BoardId is the board id of A300I A2 64GB.
	A300IA2GB64BoardId = 0x29

	// A900A3SuperPodBin1BoardId is the board id of A900/A9000 A3 SuperPod Bin1.
	A900A3SuperPodBin1BoardId = 0xb0

	// A900A3SuperPodBin2BoardId is the board id of A900/A9000 A3 SuperPod Bin2.
	A900A3SuperPodBin2BoardId = 0xb1

	// A900A3SuperPodBin3BoardId is the board id of A900/A9000 A3 SuperPod Bin3.
	A900A3SuperPodBin3BoardId = 0xb2

	// A800IA3BoardId is the board id of A800I A3.
	A800IA3BoardId = 0xb3

	// A900A3SuperPodMainBoardId1 is the board id of A900 A3 SuperPod MainBoard1.
	A900A3SuperPodMainBoardId1 = 0x18

	// A900A3SuperPodMainBoardId2 is the board id of A900 A3 SuperPod MainBoard2.
	A900A3SuperPodMainBoardId2 = 0x19

	// A800IA3MainBoardId is the main board id of A800I A3.
	A800IA3MainBoardId = 0x14

	// A9000A3SuperPodMainBoardId1 is the board id of A9000 A3 SuperPod MainBoard1.
	A9000A3SuperPodMainBoardId1 = 0x1C

	// A9000A3SuperPodMainBoardId2 is the board id of A9000 A3 SuperPod MainBoard2.
	A9000A3SuperPodMainBoardId2 = 0x1D
)

// Log limit domains for metrics.
const (
	// DomainForLogicIdErr is the domain for failures to get cardId and deviceId by logicID.
	DomainForLogicIdErr = "logicID"
)

// DcmiDeviceType represents the dcmi device type.
type DcmiDeviceType int32

const (
	// DcmiDeviceTypeDDR represents the component type DCMI_DEVICE_TYPE_DDR
	DcmiDeviceTypeDDR DcmiDeviceType = 0
	// DcmiDeviceTypeSRAM represents the component type DCMI_DEVICE_TYPE_SRAM
	DcmiDeviceTypeSRAM DcmiDeviceType = 1
	// DcmiDeviceTypeHBM represents the component type DCMI_DEVICE_TYPE_HBM
	DcmiDeviceTypeHBM DcmiDeviceType = 2
	// DcmiDeviceTypeNPU represents the component type DCMI_DEVICE_TYPE_NPU
	DcmiDeviceTypeNPU DcmiDeviceType = 3
	// DcmiDeviceTypeNONE represents the component type DCMI_DEVICE_TYPE_NONE
	DcmiDeviceTypeNONE DcmiDeviceType = 0xff
)

const (
	// ErrMsgInitCardListFailed is used where initialization of the card list fails
	ErrMsgInitCardListFailed = "get card list failed for init"
	// ErrMsgGetBoardInfoFailed is used where there is a failure in getting board info
	ErrMsgGetBoardInfoFailed = "get board info failed, no card found"
)

const (
	// MaxHccspingMeshAddr is the max number of hccsping addresses
	MaxHccspingMeshAddr = 1024
	// MinPktSize is the min packet size
	MinPktSize = 1792
	// MaxPktSize is the max packet size
	MaxPktSize = 3000
	// MinPktSendNum is the min packet send number
	MinPktSendNum = 1
	// MaxPktSendNum is the max packet send number
	MaxPktSendNum = 1000
	// MinPktInterval is the min packet interval
	MinPktInterval = 1
	// MaxPktInterval is the max packet interval
	MaxPktInterval = 1000
	// MinTaskInterval is the min task interval
	MinTaskInterval = 1
	// MaxTaskInterval is the max task interval
	MaxTaskInterval = 60
	// InternalPingMeshTaskID is the inner ping mesh task id
	InternalPingMeshTaskID uint = 0
	// ExternalPingMeshTaskID is the outer ping mesh task id
	ExternalPingMeshTaskID uint = 1
	// DefaultPingMeshPortID is the default ping mesh port
	DefaultPingMeshPortID = 0
	// DefaultPktSize is the default packet size (equal to MinPktSize)
	DefaultPktSize = 1792
	// DefaultPktSendNum is the default packet send number
	DefaultPktSendNum = 10
	// DefaultPktInterval is the default packet interval
	DefaultPktInterval = 10
	// DefaultTimeout is the default timeout
	DefaultTimeout = 1
)
b/mind-cluster/component/ascend-common/devmanager/common/types.go @@ -0,0 +1,435 @@ +/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package common define common types +package common + +// MemoryInfo memory information struct +type MemoryInfo struct { + MemorySize uint64 `json:"memory_size"` + MemoryAvailable uint64 `json:"memory_available"` + Frequency uint32 `json:"memory_frequency"` + Utilization uint32 `json:"memory_utilization"` +} + +// HbmInfo high bandwidth memory info +type HbmInfo struct { + MemorySize uint64 `json:"memory_size"` // total size,MB + Frequency uint32 `json:"hbm_frequency"` // frequency MHz + Usage uint64 `json:"memory_usage"` // memory usage,MB + Temp int32 `json:"hbm_temperature"` // temperature + BandWidthUtilRate uint32 `json:"hbm_bandwidth_util"` // bandwidth utilization +} + +// HbmAggregateInfo more comprehensive high bandwidth memory information with ecc information +type HbmAggregateInfo struct { + *HbmInfo + ECCInfo *ECCInfo `json:"hbm_ecc_info"` // ECC information +} + +// ChipInfo chip info +type ChipInfo struct { + Type string `json:"chip_type"` + Name string `json:"chip_name"` + Version string `json:"chip_version"` + NpuName string `json:"npu_name"` + AICoreCnt int `json:"aicore_cnt"` +} + +// ChipBaseInfo all id of chip +type ChipBaseInfo struct { + PhysicID int32 + LogicID int32 + CardID int32 + DeviceID int32 +} + +// CgoCreateVDevOut create virtual device 
output info +type CgoCreateVDevOut struct { + VDevID uint32 + PcieBus uint32 + PcieDevice uint32 + PcieFunc uint32 + VfgID uint32 + Reserved []uint8 +} + +// CgoCreateVDevRes create virtual device input info +type CgoCreateVDevRes struct { + VDevID uint32 + VfgID uint32 + TemplateName string + Reserved []uint8 +} + +// CgoBaseResource base resource info +type CgoBaseResource struct { + Token uint64 + TokenMax uint64 + TaskTimeout uint64 + VfgID uint32 + VipMode uint8 + Reserved []uint8 +} + +// CgoComputingResource compute resource info +type CgoComputingResource struct { + // accelator resource + Aic float32 + Aiv float32 + Dsa uint16 + Rtsq uint16 + Acsq uint16 + Cdqm uint16 + CCore uint16 + Ffts uint16 + Sdma uint16 + PcieDma uint16 + + // memory resource, MB as unit + MemorySize uint64 + + // id resource + EventID uint32 + NotifyID uint32 + StreamID uint32 + ModelID uint32 + + // cpu resource + TopicScheduleAicpu uint16 + HostCtrlCPU uint16 + HostAicpu uint16 + DeviceAicpu uint16 + TopicCtrlCPUSlot uint16 + + Reserved []uint8 +} + +// CgoMediaResource media resource info +type CgoMediaResource struct { + Jpegd float32 + Jpege float32 + Vpc float32 + Vdec float32 + Pngd float32 + Venc float32 + Reserved []uint8 +} + +// CgoVDevQueryInfo virtual resource special info +type CgoVDevQueryInfo struct { + Name string + Status uint32 + IsContainerUsed uint32 + Vfid uint32 + VfgID uint32 + ContainerID uint64 + Base CgoBaseResource + Computing CgoComputingResource + Media CgoMediaResource +} + +// CgoVDevQueryStru virtual resource info +type CgoVDevQueryStru struct { + VDevID uint32 + QueryInfo CgoVDevQueryInfo +} + +// CgoSocFreeResource soc free resource info +type CgoSocFreeResource struct { + VfgNum uint32 + VfgBitmap uint32 + Base CgoBaseResource + Computing CgoComputingResource + Media CgoMediaResource +} + +// CgoSocTotalResource soc total resource info +type CgoSocTotalResource struct { + VDevNum uint32 + VDevID []uint32 + VfgNum uint32 + VfgBitmap uint32 + Base 
CgoBaseResource + Computing CgoComputingResource + Media CgoMediaResource +} + +// CgoSuperPodInfo super pod info +type CgoSuperPodInfo struct { + SdId uint32 + ScaleType uint32 + SuperPodId uint32 + ServerId uint32 + Reserve []uint32 +} + +// VirtualDevInfo virtual device infos +type VirtualDevInfo struct { + TotalResource CgoSocTotalResource + FreeResource CgoSocFreeResource + VDevInfo []CgoVDevQueryStru + VDevActivityInfo []VDevActivityInfo +} + +// DevFaultInfo device's fault info +type DevFaultInfo struct { + EventID int64 + LogicID int32 + ModuleType int8 // ModuleType prototype is dcmi node_type + ModuleID int8 // ModuleID prototype is dcmi node_id + SubModuleType int8 // SubModuleType prototype is dcmi sub_node_type + SubModuleID int8 // SubModuleID prototype is dcmi sub_node_id + Severity int8 + Assertion int8 + AlarmRaisedTime int64 +} + +// DevProcessInfo device process info +type DevProcessInfo struct { + DevProcArray []DevProcInfo + ProcNum int32 +} + +// DevProcInfo process info in device side +type DevProcInfo struct { + Pid int32 + // the total amount of memory occupied by the device side OS and allocated by the business, unit is MB + MemUsage float64 +} + +// BoardInfo board info of device +type BoardInfo struct { + BoardId uint32 + PcbId uint32 + BomId uint32 + SlotId uint32 +} + +// VDevActivityInfo vNPU activity info for 310P +type VDevActivityInfo struct { + VDevID uint32 + VDevAiCoreRate uint32 + VDevTotalMem uint64 + VDevUsedMem uint64 + VDevAiCore float64 + IsVirtualDev bool +} + +// PCIEBwStat contains pcie bandwidth +type PCIEBwStat struct { + PcieRxPBw PcieStatValue + PcieRxNPBw PcieStatValue + PcieRxCPLBw PcieStatValue + PcieTxPBw PcieStatValue + PcieTxNPBw PcieStatValue + PcieTxCPLBw PcieStatValue +} + +// PcieStatValue pcie stat three value, like [min_bw,max_bw,avg_bw] +type PcieStatValue struct { + PcieMinBw int32 + PcieMaxBw int32 + PcieAvgBw int32 +} + +// DeviceNetworkHealth dcmi_get_device_network_health api return value +type 
DeviceNetworkHealth struct { + HealthCode uint32 + RetCode int32 +} + +// ECCInfo dcmi_get_device_ecc_info api return value +type ECCInfo struct { + EnableFlag int32 + SingleBitErrorCnt int64 + DoubleBitErrorCnt int64 + TotalSingleBitErrorCnt int64 + TotalDoubleBitErrorCnt int64 + SingleBitIsolatedPagesCnt int64 + DoubleBitIsolatedPagesCnt int64 +} + +// NpuNetInfo network info of npu +type NpuNetInfo struct { + // The optical info + OpticalInfo *OpticalInfo + // The transfer rate of network port + LinkSpeedInfo *LinkSpeedInfo + // Historical link statistics of network ports + LinkStatInfo *LinkStatInfo + // Statistics about packets + StatInfo *StatInfo + // Network port real-time bandwidth + BandwidthInfo *BandwidthInfo + // LinkStatusInfo refers to the link state + LinkStatusInfo *LinkStatusInfo +} + +// BandwidthInfo contains network port real-time bandwidth +type BandwidthInfo struct { + // TxValue transform speed + TxValue float64 `json:"tx_value"` + // RxValue receive speed + RxValue float64 `json:"rx_value"` +} + +// HccsStatisticInfo contains hccs statistic info +type HccsStatisticInfo struct { + TxCnt []uint64 + RxCnt []uint64 + CrcErrCnt []uint64 + retryCnt []uint64 + reservedFieldCnt []uint64 +} + +// HccsBandwidthInfo contains hccs bandwidth info +type HccsBandwidthInfo struct { + ProfilingTime uint32 + TotalTxbw float64 + TotalRxbw float64 + TxBandwidth []float64 + RxBandwidth []float64 +} + +// SioCrcErrStatisticInfo contains sio crc error statistic info +type SioCrcErrStatisticInfo struct { + TxErrCnt int64 + RxErrCnt int64 + Reserved []uint32 +} + +// StatInfo the statistics about packets +type StatInfo struct { + // Total number of pause frames received by the MAC + MacRxPauseNum float64 + // Total number of pause frames sent by MAC + MacTxPauseNum float64 + // Total number of PFC frames received by MAC + MacRxPfcPktNum float64 + // Total number of PFC frames sent by MAC + MacTxPfcPktNum float64 + // Total number of bad packets received by MAC + 
MacRxBadPktNum float64 + // Total number of bad packets sent by MAC + MacTxBadPktNum float64 + // The total number of packets received by the RoCE network card + RoceRxAllPktNum float64 + // The total number of packets sent by the RoCE network card + RoceTxAllPktNum float64 + // The number of bad packets received by the RoCE network card + RoceRxErrPktNum float64 + // The number of bad packets sent by the RoCE network card + RoceTxErrPktNum float64 + // The number of CNP type packets received by the RoCE network card + RoceRxCnpPktNum float64 + // The number of CNP type packets sent by the RoCE network card + RoceTxCnpPktNum float64 + // Number of RoCE network card retry messages + RoceNewPktRtyNum float64 + // Total number of bytes of bad packets sent by MAC + MacTxBadOctNum float64 + // Total number of bytes of bad packets received by MAC + MacRxBadOctNum float64 + // The number of unexpected ACK messages received by the RoCE network card + RoceUnexpectedAckNum float64 + // The number of out-of-order packets received by the RoCE network card + RoceOutOfOrderNum float64 + // The number of packets with domain segment verification errors received by the RoCE network card + RoceVerificationErrNum float64 + // The number of messages generated by abnormal QP connection status received by the RoCE network card + RoceQpStatusErrNum float64 + // The number of ecn + RoceEcnDBNum float64 + // The number of err info + MacRXFcsErrPktNum float64 +} + +// LinkStatInfo refers to the historical link statistics, including the times of link-up +type LinkStatInfo struct { + // The times of link-up + LinkUPNum float64 +} + +// LinkStatusInfo refers to the link state +type LinkStatusInfo struct { + // The state of link + LinkState string +} + +// LinkSpeedInfo the transfer rate of network port +type LinkSpeedInfo struct { + // The rate of network port + Speed float64 +} + +// OpticalInfo indicates the optical module information +type OpticalInfo struct { + // Optical module status, 
indicating whether it is in place (present) + OpticalState float64 + // Power sent by No.0 optical module + OpticalTxPower0 float64 + // Power sent by No.1 optical module + OpticalTxPower1 float64 + // Power sent by No.2 optical module + OpticalTxPower2 float64 + // Power sent by No.3 optical module + OpticalTxPower3 float64 + // Reception power of No.0 optical module + OpticalRxPower0 float64 + // Reception power of No.1 optical module + OpticalRxPower1 float64 + // Reception power of No.2 optical module + OpticalRxPower2 float64 + // Reception power of No.3 optical module + OpticalRxPower3 float64 + // Optical module voltage + OpticalVcc float64 + // Optical module temperature + OpticalTemp float64 +} + +// HccspingMeshOperate refers to the operation of hccsping mesh +type HccspingMeshOperate struct { + DstAddr string + PktSize int + PktSendNum int + PktInterval int + Timeout int + TaskInterval int + TaskId int +} + +// HccspingMeshInfo refers to the result of hccsping mesh +type HccspingMeshInfo struct { + DstAddr []string + SucPktNum []uint + FailPktNum []uint + MaxTime []int + MinTime []int + AvgTime []int + TP95Time []int + ReplyStatNum []int + PingTotalNum []int + DestNum int +} + +// ElabelInfo elabel information structure +type ElabelInfo struct { + ProductName string + Model string + Manufacturer string + ManufacturerDate string + SerialNumber string +} diff --git a/mind-cluster/component/ascend-common/devmanager/common/utils.go b/mind-cluster/component/ascend-common/devmanager/common/utils.go new file mode 100644 index 0000000..87e14df --- /dev/null +++ b/mind-cluster/component/ascend-common/devmanager/common/utils.go @@ -0,0 +1,305 @@ +/* Copyright(C) 2021. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package common this for util method +package common + +import ( + "fmt" + "math" + "regexp" + "strings" + + "ascend-common/api" + "ascend-common/common-utils/hwlog" +) + +var ( + reg910A = regexp.MustCompile(api.Ascend910APattern) + reg910B = regexp.MustCompile(api.Ascend910BPattern) + reg310P = regexp.MustCompile(api.Ascend310PPattern) +) + +// IsGreaterThanOrEqualInt32 check num range +func IsGreaterThanOrEqualInt32(num int64) bool { + if num >= int64(math.MaxInt32) { + return true + } + + return false +} + +// IsValidUtilizationRate valid utilization rate is 0-100 +func IsValidUtilizationRate(num uint32) bool { + if num > uint32(Percent) || num < 0 { + return false + } + + return true +} + +// IsValidChipInfo valid chip info is or not empty +func IsValidChipInfo(chip *ChipInfo) bool { + return chip.Name != "" || chip.Type != "" || chip.Version != "" +} + +// IsValidBoardInfo check whether the board info is valid +func IsValidBoardInfo(board *BoardInfo) bool { + return board.BoardId != InvalidID || board.PcbId != InvalidID || + board.BomId != InvalidID || board.SlotId != InvalidID +} + +// IsValidMainBoardInfo check whether the mainBoardId is valid +func IsValidMainBoardInfo(mainBoardId uint32) bool { + return mainBoardId != InvalidID +} + +// IsValidCardID valid card id +func IsValidCardID(cardID int32) bool { + // for cardID, please watch the maximum value of the driver is changed in the future version + return cardID >= 0 && cardID < HiAIMaxCardID +} + +// IsValidDeviceID valid device id +func IsValidDeviceID(deviceID int32) bool { + return deviceID 
>= 0 && deviceID < HiAIMaxDeviceNum +} + +// IsValidLogicIDOrPhyID valid logic id +func IsValidLogicIDOrPhyID(id int32) bool { + return id >= 0 && id < HiAIMaxCardNum*HiAIMaxDeviceNum +} + +// IsValidCardIDAndDeviceID check two params both needs meet the requirement +func IsValidCardIDAndDeviceID(cardID, deviceID int32) bool { + if !IsValidCardID(cardID) { + return false + } + + return IsValidDeviceID(deviceID) +} + +// IsValidDevNumInCard valid devNum in card +func IsValidDevNumInCard(num int32) bool { + return num > 0 && num <= HiAIMaxDeviceNum +} + +// IsValidVDevID valid vir device id +func IsValidVDevID(vDevID uint32) bool { + return vDevID >= MinVDevID && vDevID < MaxVDevID +} + +// IsValidPortID valid port id +func IsValidPortID(portID int) bool { + return portID == DefaultPingMeshPortID +} + +// IsValidTaskID valid task id +func IsValidTaskID(taskID uint) bool { + return taskID == InternalPingMeshTaskID || taskID == ExternalPingMeshTaskID +} + +// IsValidHccspingMeshOperate valid hccsping mesh operate +func IsValidHccspingMeshOperate(operate HccspingMeshOperate) error { + if len(operate.DstAddr) > MaxHccspingMeshAddr { + return fmt.Errorf("dst addr length %d is invalid, should not be greater than %d", len(operate.DstAddr), + MaxHccspingMeshAddr) + } + if operate.PktSize < MinPktSize || operate.PktSize > MaxPktSize { + return fmt.Errorf("pkt size %d is invalid, should be between %d and %d", operate.PktSize, MinPktSize, MaxPktSize) + } + if operate.PktSendNum < MinPktSendNum || operate.PktSendNum > MaxPktSendNum { + return fmt.Errorf("pkt send num %d is invalid, should be between %d and %d", operate.PktSendNum, + MinPktSendNum, MaxPktSendNum) + } + if operate.PktInterval < MinPktInterval || operate.PktInterval > MaxPktInterval { + return fmt.Errorf("pkt interval %d is invalid, should be between %d and %d", operate.PktInterval, + MinPktInterval, MaxPktInterval) + } + if operate.TaskInterval < MinTaskInterval || operate.TaskInterval > MaxTaskInterval { + return 
fmt.Errorf("task interval %d is invalid, should be between %d and %d", operate.TaskInterval, + MinTaskInterval, MaxTaskInterval) + } + if !IsValidTaskID(uint(operate.TaskId)) { + return fmt.Errorf("task id %d is invalid", operate.TaskId) + } + return nil +} + +// GetDeviceTypeByChipName get device type by chipName +func GetDeviceTypeByChipName(chipName string) string { + if reg310P.MatchString(chipName) { + return api.Ascend310P + } + if strings.Contains(chipName, api.Ascend310BNo) { + return api.Ascend310B + } + if strings.Contains(chipName, api.Ascend310No) { + return api.Ascend310 + } + if reg910B.MatchString(chipName) { + return api.Ascend910B + } + if reg910A.MatchString(chipName) { + return api.Ascend910A + } + return "" +} + +func get910TemplateNameList() map[string]struct{} { + return map[string]struct{}{"vir16": {}, "vir08": {}, "vir04": {}, "vir02": {}, "vir01": {}} +} + +func get910BTemplateNameList() map[string]struct{} { + return map[string]struct{}{ + "vir03_1c_8g": {}, "vir05_1c_8g": {}, "vir05_1c_16g": {}, + "vir06_1c_16g": {}, "vir10_3c_16g": {}, "vir10_3c_16g_nm": {}, + "vir10_3c_32g": {}, "vir10_4c_16g_m": {}, "vir12_3c_32g": {}} +} + +func get310PTemplateNameList() map[string]struct{} { + return map[string]struct{}{"vir04": {}, "vir02": {}, "vir01": {}, "vir04_3c": {}, "vir02_1c": {}, + "vir04_4c_dvpp": {}, "vir04_3c_ndvpp": {}} +} + +// IsValidTemplateName check template name meet the requirement +func IsValidTemplateName(devType, templateName string) bool { + isTemplateNameValid := false + switch devType { + case api.Ascend310P: + _, isTemplateNameValid = get310PTemplateNameList()[templateName] + case api.Ascend910A: + _, isTemplateNameValid = get910TemplateNameList()[templateName] + case api.Ascend910B: + _, isTemplateNameValid = get910BTemplateNameList()[templateName] + default: + } + return isTemplateNameValid +} + +// RemoveDuplicate remove duplicate device +func RemoveDuplicate(list *[]string) []string { + listValueMap := 
make(map[string]string, len(*list)) + var rmDupValueList []string + for _, value := range *list { + listValueMap[value] = value + } + for _, value := range listValueMap { + rmDupValueList = append(rmDupValueList, value) + } + return rmDupValueList +} + +// GetNpuName get npu name eg: name-type-version +func GetNpuName(chipInfo *ChipInfo) string { + if chipInfo == nil { + return "" + } + if len(chipInfo.Name) == 0 && len(chipInfo.Type) == 0 && len(chipInfo.Version) == 0 { + return "" + } + return fmt.Sprintf("%s-%s-%s", chipInfo.Name, chipInfo.Type, chipInfo.Version) +} + +// SetExternalParams transmit npu-exporter's startup parameters +func SetExternalParams(profilingTime int) { + ProfilingTime = profilingTime +} + +// SetHccsBWProfilingTime set hccs bw profiling time +func SetHccsBWProfilingTime(hccsbwProfilingTime int) { + HccsBWProfilingTime = hccsbwProfilingTime +} + +// DeepCopyChipInfo copy chip info deeply +func DeepCopyChipInfo(chipInfo *ChipInfo) *ChipInfo { + if chipInfo == nil { + return nil + } + + return &ChipInfo{ + Type: chipInfo.Type, + Name: chipInfo.Name, + Version: chipInfo.Version, + } +} + +// DeepCopyVDevActivityInfo copy VDevActivityInfo deeply +func DeepCopyVDevActivityInfo(vDevActivityInfo *VDevActivityInfo) *VDevActivityInfo { + if vDevActivityInfo == nil { + return nil + } + + return &VDevActivityInfo{ + VDevID: vDevActivityInfo.VDevID, + VDevAiCoreRate: vDevActivityInfo.VDevAiCoreRate, + VDevTotalMem: vDevActivityInfo.VDevTotalMem, + VDevUsedMem: vDevActivityInfo.VDevUsedMem, + VDevAiCore: vDevActivityInfo.VDevAiCore, + IsVirtualDev: vDevActivityInfo.IsVirtualDev, + } +} + +// DeepCopySlice Deep copy slice +func deepCopySlice(slice interface{}) interface{} { + + switch v := slice.(type) { + case []int: + newSlice := make([]int, len(v)) + copy(newSlice, v) + return newSlice + case []uint32: + newSlice := make([]uint32, len(v)) + copy(newSlice, v) + return newSlice + case []float64: + newSlice := make([]float64, len(v)) + copy(newSlice, v) 
+ return newSlice + default: + hwlog.RunLog.Warn("Unsupported slice type") + return slice + } +} + +// GetDevType get device type by chip name,boardId +func GetDevType(chipName string, boardId uint32) string { + var devType string + if Is910A3Chip(boardId) { + devType = api.Ascend910A3 + } else { + devType = GetDeviceTypeByChipName(chipName) + } + return devType +} + +// Is910A3Chip current chip is 910A3 or not,include A900A3 and A9000A3 +func Is910A3Chip(boardId uint32) bool { + return a3BoardIds.Has(int32(boardId)) +} + +// IsA900A3SuperPod current product is A900A3 super pod or not +func IsA900A3SuperPod(mainBoardId uint32) bool { + return a900A3SuperPodMainBoardIds.Has(int32(mainBoardId)) +} + +// IsA9000A3SuperPod current product is A9000A3 super pod or not +func IsA9000A3SuperPod(mainBoardId uint32) bool { + return a9000A3SuperPodMainBoardIds.Has(int32(mainBoardId)) +} + +// Is800IA3Chip current chip is 800IA3 or not +func Is800IA3Chip(mainBoardId uint32) bool { + return mainBoardId == A800IA3MainBoardId +} diff --git a/mind-cluster/component/ascend-common/devmanager/common/utils_test.go b/mind-cluster/component/ascend-common/devmanager/common/utils_test.go new file mode 100644 index 0000000..548a1c0 --- /dev/null +++ b/mind-cluster/component/ascend-common/devmanager/common/utils_test.go @@ -0,0 +1,163 @@ +/* Copyright(C) 2021. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package common + +import ( + "fmt" + "strings" + "testing" + + "github.com/smartystreets/goconvey/convey" +) + +// TestDeepCopyHccsBandwidthInfo TestDeepCopySlice +func TestDeepCopyHccsBandwidthInfo(t *testing.T) { + + convey.Convey("should copy a new []int", t, func() { + slice := []int{1, 2} + newSlice := deepCopySlice(slice) + convey.So(&newSlice, convey.ShouldNotEqual, &slice) + }) + + convey.Convey("should copy a new []int32", t, func() { + slice := []uint32{1, 2} + + newSlice := deepCopySlice(slice) + convey.So(&newSlice, convey.ShouldNotEqual, &slice) + }) + + convey.Convey("should copy a new []float64", t, func() { + slice := []float64{1, 2} + newSlice := deepCopySlice(slice) + convey.So(&newSlice, convey.ShouldNotEqual, &slice) + }) +} + +func TestIsValidPortID(t *testing.T) { + convey.Convey("Given a port ID", t, func() { + convey.Convey("01-When the port ID is invalid, should return false", func() { + portID1 := 1 + convey.So(IsValidPortID(portID1), convey.ShouldBeFalse) + }) + + convey.Convey("02-When the port ID is the default, should return true", func() { + portID3 := DefaultPingMeshPortID + convey.So(IsValidPortID(portID3), convey.ShouldBeTrue) + }) + }) +} + +func TestIsValidTaskID(t *testing.T) { + convey.Convey("Given a task ID", t, func() { + convey.Convey("01-When the task ID is valid, should return true", func() { + taskID1 := InternalPingMeshTaskID + convey.So(IsValidTaskID(taskID1), convey.ShouldBeTrue) + + taskID2 := ExternalPingMeshTaskID + convey.So(IsValidTaskID(taskID2), convey.ShouldBeTrue) + }) + + convey.Convey("02-When the task ID is invalid, should return false", func() { + const taskID3 = 3 + convey.So(IsValidTaskID(taskID3), convey.ShouldBeFalse) + }) + }) +} + +func defaultHccspingMeshOperate() HccspingMeshOperate { + return HccspingMeshOperate{ + DstAddr: "1111", + PktSize: MinPktSize, + PktSendNum: MinPktSendNum, + PktInterval: MinPktInterval, + TaskInterval: MinTaskInterval, + TaskId: int(InternalPingMeshTaskID), + } 
+} + +func check(op HccspingMeshOperate, expectedErr error) { + err := IsValidHccspingMeshOperate(op) + convey.So(err, convey.ShouldResemble, expectedErr) +} + +func expectedError(pattern string, current, min, max int) error { + return fmt.Errorf(pattern, current, min, max) +} + +func TestIsValidHccspingMeshOperate01(t *testing.T) { + convey.Convey("Given a pingmesh operate", t, func() { + op := defaultHccspingMeshOperate() + convey.Convey("01-When operation valid, should return nil", func() { + check(op, nil) + }) + var expectedErr error + convey.Convey("01-When the dst addr is invalid, should return error", func() { + op.DstAddr = strings.Repeat("a", MaxHccspingMeshAddr+1) + expectedErr = fmt.Errorf("dst addr length %d is invalid, should not be greater than %d", len(op.DstAddr), + MaxHccspingMeshAddr) + check(op, expectedErr) + }) + op.DstAddr = "1111" + convey.Convey("02-When the pkt size is invalid, should return error", func() { + pattern := "pkt size %d is invalid, should be between %d and %d" + op.PktSize = MinPktSize - 1 + check(op, expectedError(pattern, op.PktSize, MinPktSize, MaxPktSize)) + op.PktSize = MaxPktSize + 1 + check(op, expectedError(pattern, op.PktSize, MinPktSize, MaxPktSize)) + }) + op.PktSize = MinPktSize + convey.Convey("03-When the pkt send num is invalid, should return error", func() { + pattern := "pkt send num %d is invalid, should be between %d and %d" + op.PktSendNum = MinPktSendNum - 1 + check(op, expectedError(pattern, op.PktSendNum, MinPktSendNum, MaxPktSendNum)) + op.PktSendNum = MaxPktSendNum + 1 + check(op, expectedError(pattern, op.PktSendNum, MinPktSendNum, MaxPktSendNum)) + }) + op.TaskInterval = MinTaskInterval + convey.Convey("06-When the task id is invalid, should return error", func() { + op.TaskId = int(ExternalPingMeshTaskID) + 1 + expectedErr = fmt.Errorf("task id %d is invalid", op.TaskId) + check(op, expectedErr) + }) + }) +} + +func TestIsValidHccspingMeshOperate02(t *testing.T) { + convey.Convey("Given a pingmesh 
operate", t, func() { + op := defaultHccspingMeshOperate() + convey.Convey("04-When the pkt interval is invalid, should return error", func() { + pattern := "pkt interval %d is invalid, should be between %d and %d" + op.PktInterval = MinPktInterval - 1 + check(op, expectedError(pattern, op.PktInterval, MinPktInterval, MaxPktInterval)) + op.PktInterval = MaxPktInterval + 1 + check(op, expectedError(pattern, op.PktInterval, MinPktInterval, MaxPktInterval)) + }) + op.PktInterval = MinPktInterval + convey.Convey("05-When the task interval is invalid, should return error", func() { + pattern := "task interval %d is invalid, should be between %d and %d" + op.TaskInterval = MinTaskInterval - 1 + check(op, expectedError(pattern, op.TaskInterval, MinTaskInterval, MaxTaskInterval)) + op.TaskInterval = MaxTaskInterval + 1 + check(op, expectedError(pattern, op.TaskInterval, MinTaskInterval, MaxTaskInterval)) + }) + op.TaskInterval = MinTaskInterval + var expectedErr error + convey.Convey("06-When the task id is invalid, should return error", func() { + op.TaskId = int(ExternalPingMeshTaskID) + 1 + expectedErr = fmt.Errorf("task id %d is invalid", op.TaskId) + check(op, expectedErr) + }) + }) +} diff --git a/mind-cluster/component/ascend-common/devmanager/dcmi/constants.go b/mind-cluster/component/ascend-common/devmanager/dcmi/constants.go new file mode 100644 index 0000000..bd68af3 --- /dev/null +++ b/mind-cluster/component/ascend-common/devmanager/dcmi/constants.go @@ -0,0 +1,78 @@ +/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package dcmi this for constants +package dcmi + +// MainCmd main command enum +type MainCmd uint32 + +// VDevMngSubCmd virtual device manager sub command +type VDevMngSubCmd uint32 + +// DieType present chip die type +type DieType int32 + +const ( + // dcmiMaxVdevNum is max number of vdevice, value is from driver specification + dcmiMaxVdevNum = 32 + // dcmiMaxReserveNum is max number of reserve, value is from driver specification + dcmiMaxReserveNum = 8 + // dcmiVDevResNameLen length of vnpu resource name + dcmiVDevResNameLen = 16 + // dcmiHccsMaxPcsNum max pcs number for hccs + dcmiHccsMaxPcsNum = 16 + + maxChipNameLen = 32 + productTypeLen = 64 + dcmiVersionLen = 32 + + // MainCmdChipInf main cmd chip inf + MainCmdChipInf MainCmd = 12 + // MainCmdHccs main cmd of hccs + MainCmdHccs MainCmd = 16 + // MainCmdVDevMng virtual device manager + MainCmdVDevMng MainCmd = 52 + // MainCmdSio SIO status between die + MainCmdSio MainCmd = 56 + + // VmngSubCmdGetVDevResource get virtual device resource info + VmngSubCmdGetVDevResource VDevMngSubCmd = 0 + // VmngSubCmdGetTotalResource get total resource info + VmngSubCmdGetTotalResource VDevMngSubCmd = 1 + // VmngSubCmdGetFreeResource get free resource info + VmngSubCmdGetFreeResource VDevMngSubCmd = 2 + // VmngSubCmdGetVDevActivity get vir device activity info + VmngSubCmdGetVDevActivity VDevMngSubCmd = 5 + // CinfSubCmdGetSPodInfo get super pod info + CinfSubCmdGetSPodInfo VDevMngSubCmd = 1 + // SioSubCmdCrcErrStatistics get SIO err statistics info + SioSubCmdCrcErrStatistics VDevMngSubCmd = 0 + // 
HccsSubCmdGetStatisticInfo get statistic info + HccsSubCmdGetStatisticInfo VDevMngSubCmd = 3 + // HccsSubCmdGetStatisticInfoU64 get statistic info in u64 + HccsSubCmdGetStatisticInfoU64 VDevMngSubCmd = 5 + + // NDIE NDie ID, only Ascend910 has + NDIE DieType = 0 + // VDIE VDie ID, it can be the uuid of chip + VDIE DieType = 1 + // DieIDCount die id array max length + DieIDCount = 5 + + // ipAddrTypeV6 ip address type of IPv6 + ipAddrTypeV6 = 1 + + agentdrvProfDataNum = 3 +) diff --git a/mind-cluster/component/ascend-common/devmanager/dcmi/dcmi.go b/mind-cluster/component/ascend-common/devmanager/dcmi/dcmi.go new file mode 100644 index 0000000..834397c --- /dev/null +++ b/mind-cluster/component/ascend-common/devmanager/dcmi/dcmi.go @@ -0,0 +1,2213 @@ +/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package dcmi this for dcmi manager +package dcmi + +// #cgo LDFLAGS: -ldl +/* + #include + #include + #include + #include + + #include "dcmi_interface_api.h" + + static void *dcmiHandle; + #define SO_NOT_FOUND -99999 + #define FUNCTION_NOT_FOUND -99998 + #define SUCCESS 0 + #define ERROR_UNKNOWN -99997 + #define CALL_FUNC(name,...) 
if(name##_func==NULL){return FUNCTION_NOT_FOUND;}return name##_func(__VA_ARGS__); + + // dcmi + static int (*dcmi_init_func)(); + static int dcmi_init_new(){ + CALL_FUNC(dcmi_init) + } + + static int (*dcmi_get_card_num_list_func)(int *card_num,int *card_list,int list_length); + static int dcmi_get_card_num_list_new(int *card_num,int *card_list,int list_length){ + CALL_FUNC(dcmi_get_card_num_list,card_num,card_list,list_length) + } + + static int (*dcmi_get_device_num_in_card_func)(int card_id,int *device_num); + static int dcmi_get_device_num_in_card_new(int card_id,int *device_num){ + CALL_FUNC(dcmi_get_device_num_in_card,card_id,device_num) + } + + static int (*dcmi_get_device_logic_id_func)(int *device_logic_id,int card_id,int device_id); + static int dcmi_get_device_logic_id_new(int *device_logic_id,int card_id,int device_id){ + CALL_FUNC(dcmi_get_device_logic_id,device_logic_id,card_id,device_id) + } + + static int (*dcmi_create_vdevice_func)(int card_id, int device_id, struct dcmi_create_vdev_res_stru *vdev, + struct dcmi_create_vdev_out *out); + int dcmi_create_vdevice(int card_id, int device_id, struct dcmi_create_vdev_res_stru *vdev, + struct dcmi_create_vdev_out *out){ + CALL_FUNC(dcmi_create_vdevice,card_id,device_id,vdev,out) + } + + static int (*dcmi_get_device_info_func)(int card_id, int device_id, enum dcmi_main_cmd main_cmd, + unsigned int sub_cmd,void *buf, unsigned int *size); + int dcmi_get_device_info(int card_id, int device_id, enum dcmi_main_cmd main_cmd, unsigned int sub_cmd, void *buf, + unsigned int *size){ + CALL_FUNC(dcmi_get_device_info,card_id,device_id,main_cmd,sub_cmd,buf,size) + } + + static int (*dcmi_get_hccs_link_bandwidth_info_func)(int card_id, int device_id, +struct dcmi_hccs_bandwidth_info *hccs_bandwidth_info); + int dcmi_get_hccs_link_bandwidth_info(int card_id, int device_id, +struct dcmi_hccs_bandwidth_info *hccs_bandwidth_info){ + CALL_FUNC(dcmi_get_hccs_link_bandwidth_info,card_id,device_id,hccs_bandwidth_info) + } + + 
static int (*dcmi_set_destroy_vdevice_func)(int card_id,int device_id, unsigned int VDevid); + int dcmi_set_destroy_vdevice(int card_id,int device_id, unsigned int VDevid){ + CALL_FUNC(dcmi_set_destroy_vdevice,card_id,device_id,VDevid) + } + + static int (*dcmi_get_device_type_func)(int card_id,int device_id,enum dcmi_unit_type *device_type); + int dcmi_get_device_type(int card_id,int device_id,enum dcmi_unit_type *device_type){ + CALL_FUNC(dcmi_get_device_type,card_id,device_id,device_type) + } + + static int (*dcmi_get_device_health_func)(int card_id, int device_id, unsigned int *health); + int dcmi_get_device_health(int card_id, int device_id, unsigned int *health){ + CALL_FUNC(dcmi_get_device_health,card_id,device_id,health) + } + + static int (*dcmi_get_device_utilization_rate_func)(int card_id, int device_id, int input_type, + unsigned int *utilization_rate); + int dcmi_get_device_utilization_rate(int card_id, int device_id, int input_type, unsigned int *utilization_rate){ + CALL_FUNC(dcmi_get_device_utilization_rate,card_id,device_id,input_type,utilization_rate) + } + + static int (*dcmi_get_device_temperature_func)(int card_id, int device_id, int *temperature); + int dcmi_get_device_temperature(int card_id, int device_id, int *temperature){ + CALL_FUNC(dcmi_get_device_temperature,card_id,device_id,temperature) + } + + static int (*dcmi_get_device_voltage_func)(int card_id, int device_id, unsigned int *voltage); + int dcmi_get_device_voltage(int card_id, int device_id, unsigned int *voltage){ + CALL_FUNC(dcmi_get_device_voltage,card_id,device_id,voltage) + } + + static int (*dcmi_get_device_power_info_func)(int card_id, int device_id, int *power); + int dcmi_get_device_power_info(int card_id, int device_id, int *power){ + CALL_FUNC(dcmi_get_device_power_info,card_id,device_id,power) + } + + static int (*dcmi_get_device_frequency_func)(int card_id, int device_id, enum dcmi_freq_type input_type, + unsigned int *frequency); + int dcmi_get_device_frequency(int 
card_id, int device_id, enum dcmi_freq_type input_type, unsigned int *frequency){ + CALL_FUNC(dcmi_get_device_frequency,card_id,device_id,input_type,frequency) + } + + static int (*dcmi_get_device_memory_info_v3_func)(int card_id, int device_id, + struct dcmi_get_memory_info_stru *memory_info); + int dcmi_get_device_memory_info_v3(int card_id, int device_id, struct dcmi_get_memory_info_stru *memory_info){ + CALL_FUNC(dcmi_get_device_memory_info_v3,card_id,device_id,memory_info) + } + + static int (*dcmi_get_device_hbm_info_func)(int card_id, int device_id, struct dcmi_hbm_info *hbm_info); + int dcmi_get_device_hbm_info(int card_id, int device_id, struct dcmi_hbm_info *hbm_info){ + CALL_FUNC(dcmi_get_device_hbm_info,card_id,device_id,hbm_info) + } + + static int (*dcmi_get_device_errorcode_v2_func)(int card_id, int device_id, int *error_count, + unsigned int *error_code_list, unsigned int list_len); + int dcmi_get_device_errorcode_v2(int card_id, int device_id, int *error_count, + unsigned int *error_code_list, unsigned int list_len){ + CALL_FUNC(dcmi_get_device_errorcode_v2,card_id,device_id,error_count,error_code_list,list_len) + } + + static int (*dcmi_get_device_chip_info_func)(int card_id, int device_id, struct dcmi_chip_info *chip_info); + int dcmi_get_device_chip_info(int card_id, int device_id, struct dcmi_chip_info *chip_info){ + CALL_FUNC(dcmi_get_device_chip_info,card_id,device_id,chip_info) + } + + static int (*dcmi_get_device_chip_info_v2_func)(int card_id, int device_id, struct dcmi_chip_info_v2 *chip_info); + int dcmi_get_device_chip_info_v2(int card_id, int device_id, struct dcmi_chip_info_v2 *chip_info){ + CALL_FUNC(dcmi_get_device_chip_info_v2,card_id,device_id,chip_info) + } + + static int (*dcmi_get_device_phyid_from_logicid_func)(unsigned int logicid, unsigned int *phyid); + int dcmi_get_device_phyid_from_logicid(unsigned int logicid, unsigned int *phyid){ + CALL_FUNC(dcmi_get_device_phyid_from_logicid,logicid,phyid) + } + + static int 
(*dcmi_get_device_logicid_from_phyid_func)(unsigned int phyid, unsigned int *logicid); + int dcmi_get_device_logicid_from_phyid(unsigned int phyid, unsigned int *logicid){ + CALL_FUNC(dcmi_get_device_logicid_from_phyid,phyid,logicid) + } + + static int (*dcmi_get_device_ip_func)(int card_id, int device_id, enum dcmi_port_type input_type, int port_id, + struct dcmi_ip_addr *ip, struct dcmi_ip_addr *mask); + int dcmi_get_device_ip(int card_id, int device_id, enum dcmi_port_type input_type, int port_id, + struct dcmi_ip_addr *ip, struct dcmi_ip_addr *mask){ + CALL_FUNC(dcmi_get_device_ip,card_id,device_id,input_type,port_id,ip,mask) + } + + static int (*dcmi_get_device_network_health_func)(int card_id, int device_id, + enum dcmi_rdfx_detect_result *result); + int dcmi_get_device_network_health(int card_id, int device_id, enum dcmi_rdfx_detect_result *result){ + CALL_FUNC(dcmi_get_device_network_health,card_id,device_id,result) + } + + static int (*dcmi_get_card_list_func)(int *card_num, int *card_list, int list_len); + int dcmi_get_card_list(int *card_num, int *card_list, int list_len){ + CALL_FUNC(dcmi_get_card_list,card_num,card_list,list_len) + } + + static int (*dcmi_get_device_id_in_card_func)(int card_id, int *device_id_max, int *mcu_id, int *cpu_id); + int dcmi_get_device_id_in_card(int card_id, int *device_id_max, int *mcu_id, int *cpu_id){ + CALL_FUNC(dcmi_get_device_id_in_card,card_id,device_id_max,mcu_id,cpu_id) + } + + static int (*dcmi_get_memory_info_func)(int card_id, int device_id, + struct dcmi_memory_info_stru *device_memory_info); + int dcmi_get_memory_info(int card_id, int device_id, struct dcmi_memory_info_stru *device_memory_info){ + CALL_FUNC(dcmi_get_memory_info,card_id,device_id,device_memory_info) + } + + static int (*dcmi_get_device_errorcode_func)(int card_id, int device_id, int *error_count, unsigned int *error_code, + int *error_width); + int dcmi_get_device_errorcode(int card_id, int device_id, int *error_count, unsigned int *error_code, 
+ int *error_width){ + CALL_FUNC(dcmi_get_device_errorcode,card_id,device_id,error_count,error_code,error_width) + } + + static int (*dcmi_get_card_id_device_id_from_logicid_func)(int *card_id, int *device_id, + unsigned int device_logic_id); + int dcmi_get_card_id_device_id_from_logicid(int *card_id, int *device_id, unsigned int device_logic_id){ + CALL_FUNC(dcmi_get_card_id_device_id_from_logicid,card_id,device_id,device_logic_id) + } + + static int (*dcmi_mcu_get_power_info_func)(int card_id, int *power); + static int dcmi_mcu_get_power_info_new(int card_id, int *power){ + CALL_FUNC(dcmi_mcu_get_power_info,card_id,power) + } + + static int (*dcmi_get_product_type_func)(int card_id, int device_id, char *product_type_str, int buf_size); + int dcmi_get_product_type(int card_id, int device_id, char *product_type_str, int buf_size){ + CALL_FUNC(dcmi_get_product_type,card_id,device_id,product_type_str,buf_size) + } + + static int (*dcmi_get_card_elabel_v2_func)(int card_id, struct dcmi_elabel_info *elabel_info); + int dcmi_get_card_elabel_v2(int card_id, struct dcmi_elabel_info *elabel_info){ + CALL_FUNC(dcmi_get_card_elabel_v2,card_id,elabel_info) + } + + static int (*dcmi_set_device_reset_func)(int card_id, int device_id, enum dcmi_reset_channel channel_type); + int dcmi_set_device_reset(int card_id, int device_id, enum dcmi_reset_channel channel_type){ + CALL_FUNC(dcmi_set_device_reset,card_id,device_id,channel_type) + } + + static int (*dcmi_get_device_outband_channel_state_func)(int card_id, int device_id, int* channel_state); + int dcmi_get_device_outband_channel_state(int card_id, int device_id, int* channel_state){ + CALL_FUNC(dcmi_get_device_outband_channel_state,card_id,device_id,channel_state) + } + + static int (*dcmi_pre_reset_soc_func)(int card_id, int device_id); + int dcmi_pre_reset_soc(int card_id, int device_id){ + CALL_FUNC(dcmi_pre_reset_soc,card_id,device_id) + } + + static int (*dcmi_rescan_soc_func)(int card_id, int device_id); + int 
dcmi_rescan_soc(int card_id, int device_id){ + CALL_FUNC(dcmi_rescan_soc,card_id,device_id) + } + + static int (*dcmi_get_netdev_brother_device_func)(int card_id, int device_id, int* brother_card_id); + int dcmi_get_netdev_brother_device(int card_id, int device_id, int* brother_card_id){ + CALL_FUNC(dcmi_get_netdev_brother_device,card_id,device_id,brother_card_id) + } + + static int (*dcmi_get_device_boot_status_func)(int card_id, int device_id, enum dcmi_boot_status *boot_status); + int dcmi_get_device_boot_status(int card_id, int device_id, enum dcmi_boot_status *boot_status){ + CALL_FUNC(dcmi_get_device_boot_status,card_id,device_id,boot_status) + } + + void goEventFaultCallBack(struct dcmi_dms_fault_event); + static void event_handler(struct dcmi_event *fault_event) { + goEventFaultCallBack(fault_event->event_t.dms_event); + } + + static int (*dcmi_subscribe_fault_event_func)(int card_id, int device_id, struct dcmi_event_filter filter, + void (*f_name)(struct dcmi_event *fault_event)); + int dcmi_subscribe_fault_event(int card_id, int device_id, struct dcmi_event_filter filter){ + CALL_FUNC(dcmi_subscribe_fault_event,card_id,device_id,filter,event_handler) + } + + static int (*dcmi_get_npu_work_mode_func)(int card_id, unsigned char *work_mode); + int dcmi_get_npu_work_mode(int card_id, unsigned char *work_mode){ + CALL_FUNC(dcmi_get_npu_work_mode,card_id,work_mode) + } + + static int (*dcmi_get_device_die_v2_func)(int card_id, int device_id, enum dcmi_die_type input_type, + struct dcmi_die_id *die_id); + int dcmi_get_device_die_v2(int card_id, int device_id, enum dcmi_die_type input_type, struct dcmi_die_id *die_id){ + CALL_FUNC(dcmi_get_device_die_v2,card_id,device_id,input_type,die_id) + } + + static int (*dcmi_get_device_resource_info_func)(int card_id, int device_id, struct dcmi_proc_mem_info *proc_info, + int *proc_num); + int dcmi_get_device_resource_info(int card_id, int device_id, struct dcmi_proc_mem_info *proc_info, int *proc_num){ + 
CALL_FUNC(dcmi_get_device_resource_info,card_id,device_id,proc_info,proc_num) + } + + static int (*dcmi_get_device_pcie_info_v2_func)(int card_id, int device_id, struct dcmi_pcie_info_all *pcie_info); + int dcmi_get_device_pcie_info_v2(int card_id, int device_id, struct dcmi_pcie_info_all *pcie_info){ + CALL_FUNC(dcmi_get_device_pcie_info_v2,card_id,device_id,pcie_info) + } + + static int (*dcmi_get_device_board_info_func)(int card_id, int device_id, struct dcmi_board_info *board_info); + int dcmi_get_device_board_info(int card_id, int device_id, struct dcmi_board_info *board_info){ + CALL_FUNC(dcmi_get_device_board_info,card_id,device_id,board_info) + } + + static int (*dcmi_get_pcie_link_bandwidth_info_func)(int card_id, int device_id, + struct dcmi_pcie_link_bandwidth_info *pcie_link_bandwidth_info); + int dcmi_get_pcie_link_bandwidth_info(int card_id, int device_id, + struct dcmi_pcie_link_bandwidth_info *pcie_link_bandwidth_info){ + CALL_FUNC(dcmi_get_pcie_link_bandwidth_info,card_id,device_id,pcie_link_bandwidth_info) + } + + static int (*dcmi_get_dcmi_version_func)(char *dcmi_ver, int buf_size); + int dcmi_get_dcmi_version(char *dcmi_ver, int buf_size){ + CALL_FUNC(dcmi_get_dcmi_version,dcmi_ver,buf_size) + } + + static int (*dcmi_get_device_ecc_info_func)(int card_id, int device_id, enum dcmi_device_type input_type, + struct dcmi_ecc_info *device_ecc_info); + int dcmi_get_device_ecc_info(int card_id, int device_id, enum dcmi_device_type input_type, + struct dcmi_ecc_info *device_ecc_info){ + CALL_FUNC(dcmi_get_device_ecc_info,card_id,device_id,input_type,device_ecc_info) + } + + static int (*dcmi_get_mainboard_id_func)(int card_id, int device_id, unsigned int *mainboard_id); + int dcmi_get_mainboard_id(int card_id, int device_id, unsigned int *mainboard_id){ + CALL_FUNC(dcmi_get_mainboard_id,card_id,device_id,mainboard_id) + } + + static int (*dcmi_start_hccsping_mesh_func)(int card_id, int device_id, int port_id, +struct dcmi_hccsping_mesh_operate 
*hccsping_mesh); + int dcmi_start_hccsping_mesh(int card_id, int device_id, int port_id, +struct dcmi_hccsping_mesh_operate *hccsping_mesh){ + CALL_FUNC(dcmi_start_hccsping_mesh,card_id,device_id,port_id,hccsping_mesh) +} + static int (*dcmi_stop_hccsping_mesh_func)(int card_id, int device_id, int port_id, unsigned int task_id); + int dcmi_stop_hccsping_mesh(int card_id, int device_id, int port_id, unsigned int task_id){ + CALL_FUNC(dcmi_stop_hccsping_mesh,card_id,device_id,port_id,task_id) + } + + static int (*dcmi_get_hccsping_mesh_info_func)(int card_id, int device_id, int port_id, unsigned int task_id, +struct dcmi_hccsping_mesh_info *hccsping_mesh_info); + int dcmi_get_hccsping_mesh_info(int card_id, int device_id, int port_id, unsigned int task_id, +struct dcmi_hccsping_mesh_info *hccsping_mesh_info){ + CALL_FUNC(dcmi_get_hccsping_mesh_info,card_id,device_id,port_id,task_id,hccsping_mesh_info) +} + + static int (*dcmi_get_hccsping_mesh_state_func)(int card_id, int device_id, int port_id, unsigned int task_id, +unsigned int *state); + int dcmi_get_hccsping_mesh_state(int card_id, int device_id, int port_id, unsigned int task_id, +unsigned int *state){ + CALL_FUNC(dcmi_get_hccsping_mesh_state,card_id,device_id,port_id,task_id,state) +} + + static int (*dcmi_get_spod_node_status_func)(int card_id, int device_id, unsigned int sdid, unsigned int *status); + int dcmi_get_spod_node_status(int card_id, int device_id, unsigned int sdid, unsigned int *status){ + CALL_FUNC(dcmi_get_spod_node_status,card_id,device_id,sdid,status) + } + + static int (*dcmi_set_spod_node_status_func)(int card_id, int device_id, unsigned int sdid, unsigned int status); + int dcmi_set_spod_node_status(int card_id, int device_id, unsigned int sdid, unsigned int status){ + CALL_FUNC(dcmi_set_spod_node_status,card_id,device_id,sdid,status) + } + + // load .so files and functions + static int dcmiInit_dl(const char* dcmiLibPath){ + if (dcmiLibPath == NULL) { + fprintf (stderr,"lib path is 
null\n"); + return SO_NOT_FOUND; + } + dcmiHandle = dlopen(dcmiLibPath,RTLD_LAZY | RTLD_GLOBAL); + if (dcmiHandle == NULL){ + fprintf (stderr,"%s\n",dlerror()); + return SO_NOT_FOUND; + } + + dcmi_init_func = dlsym(dcmiHandle,"dcmi_init"); + + dcmi_get_card_num_list_func = dlsym(dcmiHandle,"dcmi_get_card_num_list"); + + dcmi_get_device_num_in_card_func = dlsym(dcmiHandle,"dcmi_get_device_num_in_card"); + + dcmi_get_device_logic_id_func = dlsym(dcmiHandle,"dcmi_get_device_logic_id"); + + dcmi_create_vdevice_func = dlsym(dcmiHandle,"dcmi_create_vdevice"); + + dcmi_get_device_info_func = dlsym(dcmiHandle,"dcmi_get_device_info"); + + dcmi_set_destroy_vdevice_func = dlsym(dcmiHandle,"dcmi_set_destroy_vdevice"); + + dcmi_get_device_type_func = dlsym(dcmiHandle,"dcmi_get_device_type"); + + dcmi_get_device_health_func = dlsym(dcmiHandle,"dcmi_get_device_health"); + + dcmi_get_device_utilization_rate_func = dlsym(dcmiHandle,"dcmi_get_device_utilization_rate"); + + dcmi_get_device_temperature_func = dlsym(dcmiHandle,"dcmi_get_device_temperature"); + + dcmi_get_device_voltage_func = dlsym(dcmiHandle,"dcmi_get_device_voltage"); + + dcmi_get_device_power_info_func = dlsym(dcmiHandle,"dcmi_get_device_power_info"); + + dcmi_get_device_frequency_func = dlsym(dcmiHandle,"dcmi_get_device_frequency"); + + dcmi_get_device_memory_info_v3_func = dlsym(dcmiHandle,"dcmi_get_device_memory_info_v3"); + + dcmi_get_device_hbm_info_func = dlsym(dcmiHandle,"dcmi_get_device_hbm_info"); + + dcmi_get_device_errorcode_v2_func = dlsym(dcmiHandle,"dcmi_get_device_errorcode_v2"); + + dcmi_get_device_chip_info_func = dlsym(dcmiHandle,"dcmi_get_device_chip_info"); + + dcmi_get_device_chip_info_v2_func = dlsym(dcmiHandle,"dcmi_get_device_chip_info_v2"); + + dcmi_get_device_phyid_from_logicid_func = dlsym(dcmiHandle,"dcmi_get_device_phyid_from_logicid"); + + dcmi_get_device_logicid_from_phyid_func = dlsym(dcmiHandle,"dcmi_get_device_logicid_from_phyid"); + + dcmi_get_device_ip_func = 
dlsym(dcmiHandle,"dcmi_get_device_ip"); + + dcmi_get_device_network_health_func = dlsym(dcmiHandle,"dcmi_get_device_network_health"); + + dcmi_get_card_list_func = dlsym(dcmiHandle,"dcmi_get_card_list"); + + dcmi_get_device_id_in_card_func = dlsym(dcmiHandle,"dcmi_get_device_id_in_card"); + + dcmi_get_memory_info_func = dlsym(dcmiHandle,"dcmi_get_memory_info"); + + dcmi_get_device_errorcode_func = dlsym(dcmiHandle,"dcmi_get_device_errorcode"); + + dcmi_get_card_id_device_id_from_logicid_func = dlsym(dcmiHandle,"dcmi_get_card_id_device_id_from_logicid"); + + dcmi_mcu_get_power_info_func = dlsym(dcmiHandle,"dcmi_mcu_get_power_info"); + + dcmi_get_product_type_func = dlsym(dcmiHandle,"dcmi_get_product_type"); + + dcmi_get_card_elabel_v2_func = dlsym(dcmiHandle,"dcmi_get_card_elabel_v2"); + + dcmi_set_device_reset_func = dlsym(dcmiHandle,"dcmi_set_device_reset"); + + dcmi_get_device_outband_channel_state_func = dlsym(dcmiHandle,"dcmi_get_device_outband_channel_state"); + + dcmi_pre_reset_soc_func = dlsym(dcmiHandle,"dcmi_pre_reset_soc"); + + dcmi_rescan_soc_func = dlsym(dcmiHandle,"dcmi_rescan_soc"); + + dcmi_get_netdev_brother_device_func = dlsym(dcmiHandle,"dcmi_get_netdev_brother_device"); + + dcmi_get_device_boot_status_func = dlsym(dcmiHandle,"dcmi_get_device_boot_status"); + + dcmi_subscribe_fault_event_func = dlsym(dcmiHandle,"dcmi_subscribe_fault_event"); + + dcmi_get_npu_work_mode_func = dlsym(dcmiHandle, "dcmi_get_npu_work_mode"); + + dcmi_get_device_die_v2_func = dlsym(dcmiHandle, "dcmi_get_device_die_v2"); + + dcmi_get_device_resource_info_func = dlsym(dcmiHandle, "dcmi_get_device_resource_info"); + + dcmi_get_device_pcie_info_v2_func = dlsym(dcmiHandle, "dcmi_get_device_pcie_info_v2"); + + dcmi_get_device_board_info_func = dlsym(dcmiHandle, "dcmi_get_device_board_info"); + + dcmi_get_pcie_link_bandwidth_info_func = dlsym(dcmiHandle, "dcmi_get_pcie_link_bandwidth_info"); + + dcmi_get_dcmi_version_func = dlsym(dcmiHandle,"dcmi_get_dcmi_version"); + + 
dcmi_get_device_ecc_info_func = dlsym(dcmiHandle,"dcmi_get_device_ecc_info"); + + dcmi_get_mainboard_id_func = dlsym(dcmiHandle, "dcmi_get_mainboard_id"); + + dcmi_get_hccs_link_bandwidth_info_func = dlsym(dcmiHandle,"dcmi_get_hccs_link_bandwidth_info"); + + dcmi_start_hccsping_mesh_func = dlsym(dcmiHandle,"dcmi_start_hccsping_mesh"); + + dcmi_stop_hccsping_mesh_func = dlsym(dcmiHandle,"dcmi_stop_hccsping_mesh"); + + dcmi_get_hccsping_mesh_info_func = dlsym(dcmiHandle,"dcmi_get_hccsping_mesh_info"); + + dcmi_get_hccsping_mesh_state_func = dlsym(dcmiHandle,"dcmi_get_hccsping_mesh_state"); + + dcmi_get_spod_node_status_func = dlsym(dcmiHandle,"dcmi_get_spod_node_status"); + + dcmi_set_spod_node_status_func = dlsym(dcmiHandle,"dcmi_set_spod_node_status"); + + return SUCCESS; + } + + static int dcmiShutDown(void){ + if (dcmiHandle == NULL) { + return SUCCESS; + } + return (dlclose(dcmiHandle) ? ERROR_UNKNOWN : SUCCESS); + } +*/ +import "C" +import ( + "errors" + "fmt" + "math" + "net" + "strconv" + "strings" + "time" + "unsafe" + + "ascend-common/common-utils/hwlog" + "ascend-common/common-utils/utils" + "ascend-common/devmanager/common" +) + +// CDcmiMemoryInfoV3 the c struct of memoryInfo for v3 +type CDcmiMemoryInfoV3 = C.struct_dcmi_get_memory_info_stru + +// CDcmiMemoryInfoV1 the c struct of memoryInfo for v1 +type CDcmiMemoryInfoV1 = C.struct_dcmi_memory_info_stru + +// DcDriverInterface interface for dcmi +type DcDriverInterface interface { + DcInit() error + DcShutDown() error + + DcGetDcmiVersion() (string, error) + DcGetDeviceCount() (int32, error) + DcGetLogicIDList() (int32, []int32, error) + DcGetDeviceHealth(int32, int32) (int32, error) + DcGetDeviceNetWorkHealth(int32, int32) (uint32, error) + DcGetDeviceUtilizationRate(int32, int32, common.DeviceType) (int32, error) + DcGetDeviceTemperature(int32, int32) (int32, error) + DcGetDeviceVoltage(int32, int32) (float32, error) + DcGetDevicePowerInfo(int32, int32) (float32, error) + DcGetDeviceFrequency(int32, 
int32, common.DeviceType) (uint32, error) + DcGetMemoryInfo(int32, int32) (*common.MemoryInfo, error) + DcGetHbmInfo(int32, int32) (*common.HbmInfo, error) + DcGetDeviceErrorCode(int32, int32) (int32, int64, error) + DcGetChipInfo(int32, int32) (*common.ChipInfo, error) + DcGetPhysicIDFromLogicID(int32) (int32, error) + DcGetLogicIDFromPhysicID(int32) (int32, error) + DcGetDeviceLogicID(int32, int32) (int32, error) + DcGetDeviceIPAddress(int32, int32, int32) (string, error) + DcGetMcuPowerInfo(int32) (float32, error) + DcGetDieID(int32, int32, DieType) (string, error) + DcGetPCIeBusInfo(int32, int32) (string, error) + + DcGetCardList() (int32, []int32, error) + DcGetDeviceNumInCard(int32) (int32, error) + DcSetDestroyVirtualDevice(int32, int32, uint32) error + DcCreateVirtualDevice(int32, int32, common.CgoCreateVDevRes) (common.CgoCreateVDevOut, error) + DcGetDeviceVDevResource(int32, int32, uint32) (common.CgoVDevQueryStru, error) + DcGetDeviceTotalResource(int32, int32) (common.CgoSocTotalResource, error) + DcGetDeviceFreeResource(int32, int32) (common.CgoSocFreeResource, error) + DcGetVDevActivityInfo(int32, int32, uint32) (common.VDevActivityInfo, error) + DcVGetDeviceInfo(int32, int32) (common.VirtualDevInfo, error) + DcGetCardIDDeviceID(int32) (int32, int32, error) + DcCreateVDevice(int32, common.CgoCreateVDevRes) (common.CgoCreateVDevOut, error) + DcGetVDeviceInfo(int32) (common.VirtualDevInfo, error) + DcDestroyVDevice(int32, uint32) error + DcGetProductType(int32, int32) (string, error) + DcGetNpuWorkMode(int32) (int, error) + DcSetDeviceReset(int32, int32) error + DcGetBrotherCardID(int32, int32) (int32, error) + DcPreResetSoc(int32, int32) error + DcGetOutBandChannelState(int32, int32) error + DcSetDeviceResetOutBand(int32, int32) error + DcRescanSoc(int32, int32) error + DcGetDeviceBootStatus(int32) (int, error) + DcGetSuperPodInfo(int32, int32) (common.CgoSuperPodInfo, error) + + DcGetDeviceAllErrorCode(int32, int32) (int32, []int64, error) + 
DcSubscribeDeviceFaultEvent(int32, int32) error + DcSetFaultEventCallFunc(func(common.DevFaultInfo)) + DcGetDevProcessInfo(int32, int32) (*common.DevProcessInfo, error) + DcGetDeviceBoardInfo(int32, int32) (common.BoardInfo, error) + DcGetPCIEBandwidth(int32, int32, int) (common.PCIEBwStat, error) + DcGetDeviceEccInfo(int32, int32, common.DcmiDeviceType) (*common.ECCInfo, error) + DcGetSioInfo(int32, int32) (common.SioCrcErrStatisticInfo, error) + DcGetHccsStatisticInfo(int32, int32) (common.HccsStatisticInfo, error) + DcGetHccsStatisticInfoU64(int32, int32) (common.HccsStatisticInfo, error) + DcGetDeviceMainBoardInfo(int32, int32) (uint32, error) + DcGetHccsBandwidthInfo(int32, int32, int) (common.HccsBandwidthInfo, error) + + DcStartHccsPingMesh(int32, int32, int, common.HccspingMeshOperate) error + DcStopHccsPingMesh(int32, int32, int, uint) error + DcGetHccsPingMeshInfo(int32, int32, int, uint) (*common.HccspingMeshInfo, error) + DcGetHccsPingMeshState(int32, int32, int, uint) (int, error) + DcGetSuperPodStatus(int32, int32, uint32) (int, error) + DcSetSuperPodStatus(int32, int32, uint32, uint32) error + DcGetCardElabelV2(int32) (common.ElabelInfo, error) +} + +const ( + dcmiLibraryName = "libdcmi.so" + templateNameLen = 32 + ipAddrListLen = 1024 + hcclpingMeshMaxNum = 48 +) + +var faultEventCallFunc func(common.DevFaultInfo) = nil +var ( + dcmiErrMap = map[int32]string{ + -8001: "The input parameter is incorrect", + -8002: "Permission error", + -8003: "The memory interface operation failed", + -8004: "The security function failed to be executed", + -8005: "Internal errors", + -8006: "Response timed out", + -8007: "Invalid deviceID", + -8008: "The device does not exist", + -8009: "ioctl returns failed", + -8010: "The message failed to be sent", + -8011: "Message reception failed", + -8012: "Not ready yet,please try again", + -8013: "This API is not supported in containers", + -8014: "The file operation failed", + -8015: "Reset failed", + -8016: "Reset cancels", 
+ -8017: "Upgrading", + -8020: "Device resources are occupied", + -8022: "Partition consistency check,inconsistent partitions were found", + -8023: "The configuration information does not exist", + -8255: "Device ID/function is not supported", + -99997: "dcmi shutdown failed", + -99998: "The called function is missing,please upgrade the driver", + -99999: "dcmi libdcmi.so failed to load", + } +) + +// DcManager for manager dcmi interface +type DcManager struct{} + +// DcStartHccsPingMesh start hccs ping mesh +func (d *DcManager) DcStartHccsPingMesh(cardID int32, deviceID int32, portID int, + operate common.HccspingMeshOperate) error { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + if !common.IsValidPortID(portID) { + return fmt.Errorf("portID(%d) is invalid", portID) + } + if err := common.IsValidHccspingMeshOperate(operate); err != nil { + return fmt.Errorf("operate(%v) is invalid, err: %v", operate, err) + } + dtsAddrLsit := [ipAddrListLen]C.char{0} + for i := 0; i < len(operate.DstAddr) && i < len(dtsAddrLsit); i++ { + dtsAddrLsit[i] = C.char(operate.DstAddr[i]) + } + + op := C.struct_dcmi_hccsping_mesh_operate{ + dst_addr_list: dtsAddrLsit, + pkt_size: C.int(operate.PktSize), + pkt_send_num: C.int(operate.PktSendNum), + pkt_interval: C.int(operate.PktInterval), + timeout: C.int(operate.Timeout), + task_interval: C.int(operate.TaskInterval), + task_id: C.int(operate.TaskId), + } + if retCode := C.dcmi_start_hccsping_mesh(C.int(cardID), C.int(deviceID), C.int(portID), + &op); retCode != common.Success { + return fmt.Errorf("dcmi start hccs ping mesh failed cardID(%d) deviceID(%d) error code: %d", + cardID, deviceID, int32(retCode)) + } + + return nil +} + +// DcStopHccsPingMesh stop hccs ping mesh +func (d *DcManager) DcStopHccsPingMesh(cardID int32, deviceID int32, portID int, taskID uint) error { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return 
fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + if !common.IsValidPortID(portID) { + return fmt.Errorf("portID(%d) is invalid", portID) + } + if !common.IsValidTaskID(taskID) { + return fmt.Errorf("taskID(%d) is invalid", taskID) + } + if retCode := C.dcmi_stop_hccsping_mesh(C.int(cardID), C.int(deviceID), C.int(portID), + C.uint(taskID)); retCode != common.Success { + return fmt.Errorf("dcmi stop hccs ping mesh failed cardID(%d) deviceID(%d) error code: %d", + cardID, deviceID, int32(retCode)) + } + return nil +} + +// DcGetHccsPingMeshInfo get hccs ping mesh info +func (d *DcManager) DcGetHccsPingMeshInfo(cardID int32, deviceID int32, portID int, + taskID uint) (*common.HccspingMeshInfo, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return nil, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + if !common.IsValidPortID(portID) { + return nil, fmt.Errorf("portID(%d) is invalid", portID) + } + if !common.IsValidTaskID(taskID) { + return nil, fmt.Errorf("taskID(%d) is invalid", taskID) + } + var info C.struct_dcmi_hccsping_mesh_info + if retCode := C.dcmi_get_hccsping_mesh_info(C.int(cardID), C.int(deviceID), C.int(portID), C.uint(taskID), + &info); retCode != common.Success { + return nil, fmt.Errorf("dcmi get hccs ping mesh info failed cardID(%d) deviceID(%d) error code: %d", + cardID, deviceID, int32(retCode)) + } + return convertHccspingMeshInfo(&info) +} + +func convertHccspingMeshInfo(cInfo *C.struct_dcmi_hccsping_mesh_info) (*common.HccspingMeshInfo, error) { + if int(cInfo.dest_num) > hcclpingMeshMaxNum { + return nil, fmt.Errorf("dest_num(%d) is invalid, should not be greater than %d", int(cInfo.dest_num), + hcclpingMeshMaxNum) + } + info := &common.HccspingMeshInfo{} + for i := 0; i < int(cInfo.dest_num); i++ { + info.DstAddr = append(info.DstAddr, convertToString(cInfo.dst_addr[i])) + info.SucPktNum = append(info.SucPktNum, uint(cInfo.suc_pkt_num[i])) + info.FailPktNum = 
append(info.FailPktNum, uint(cInfo.fail_pkt_num[i])) + info.MaxTime = append(info.MaxTime, int(cInfo.max_time[i])) + info.MinTime = append(info.MinTime, int(cInfo.min_time[i])) + info.AvgTime = append(info.AvgTime, int(cInfo.avg_time[i])) + info.TP95Time = append(info.TP95Time, int(cInfo.tp95_time[i])) + info.ReplyStatNum = append(info.ReplyStatNum, int(cInfo.reply_stat_num[i])) + info.PingTotalNum = append(info.PingTotalNum, int(cInfo.ping_total_num[i])) + } + info.DestNum = int(cInfo.dest_num) + return info, nil +} + +// DcGetHccsPingMeshState get hccs ping mesh state +func (d *DcManager) DcGetHccsPingMeshState(cardID int32, deviceID int32, portID int, taskID uint) (int, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.RetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + if !common.IsValidPortID(portID) { + return common.RetError, fmt.Errorf("portID(%d) is invalid", portID) + } + if !common.IsValidTaskID(taskID) { + return common.RetError, fmt.Errorf("taskID(%d) is invalid", taskID) + } + var state C.uint + if retCode := C.dcmi_get_hccsping_mesh_state(C.int(cardID), C.int(deviceID), C.int(portID), C.uint(taskID), + &state); retCode != common.Success { + return common.RetError, fmt.Errorf("dcmi get hccs ping mesh state failed cardID(%d) deviceID(%d) error "+ + "code: %d", cardID, deviceID, int32(retCode)) + } + return int(state), nil +} + +// DcInit load symbol and initialize dcmi +func (d *DcManager) DcInit() error { + dcmiLibPath, err := utils.GetDriverLibPath(dcmiLibraryName) + if err != nil { + return err + } + cDcmiTemplateName := C.CString(dcmiLibPath) + defer C.free(unsafe.Pointer(cDcmiTemplateName)) + if retCode := C.dcmiInit_dl(cDcmiTemplateName); retCode != C.SUCCESS { + return fmt.Errorf("dcmi lib load failed, error code: %d", int32(retCode)) + } + if retCode := C.dcmi_init_new(); retCode != C.SUCCESS { + return fmt.Errorf("dcmi init failed, error code: %d", int32(retCode)) + } + return 
nil +} + +// DcShutDown clean the dynamically loaded resource +func (d *DcManager) DcShutDown() error { + if retCode := C.dcmiShutDown(); retCode != C.SUCCESS { + return fmt.Errorf("dcmi shut down failed, error code: %d", int32(retCode)) + } + + return nil +} + +// DcGetCardList get card list +func (d *DcManager) DcGetCardList() (int32, []int32, error) { + var ids [common.HiAIMaxCardNum]C.int + var cNum C.int + if retCode := C.dcmi_get_card_list(&cNum, &ids[0], common.HiAIMaxCardNum); int32(retCode) != common. + Success { + return common.RetError, nil, fmt.Errorf("get card list failed, error code: %d", int32(retCode)) + } + // checking card's quantity + if cNum <= 0 || cNum > common.HiAIMaxCardNum { + return common.RetError, nil, fmt.Errorf("get error card quantity: %d", int32(cNum)) + } + var cardNum = int32(cNum) + var i int32 + var cardIDList []int32 + for i = 0; i < cardNum; i++ { + cardID := int32(ids[i]) + if cardID < 0 { + hwlog.RunLog.Errorf("get invalid card ID: %d", cardID) + continue + } + cardIDList = append(cardIDList, cardID) + } + return cardNum, cardIDList, nil +} + +// DcGetDeviceNumInCard get device number in the npu card +func (d *DcManager) DcGetDeviceNumInCard(cardID int32) (int32, error) { + if !common.IsValidCardID(cardID) { + return common.RetError, fmt.Errorf("cardID(%d) is invalid", cardID) + } + var deviceNum C.int + if retCode := C.dcmi_get_device_num_in_card_new(C.int(cardID), &deviceNum); int32(retCode) != common.Success { + return common.RetError, fmt.Errorf("get device count on the card failed, error code: %d", int32(retCode)) + } + if !common.IsValidDevNumInCard(int32(deviceNum)) { + return common.RetError, fmt.Errorf("get error device quantity: %d", int32(deviceNum)) + } + return int32(deviceNum), nil +} + +// DcGetDeviceLogicID get device logicID +func (d *DcManager) DcGetDeviceLogicID(cardID, deviceID int32) (int32, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.RetError, fmt.Errorf("cardID(%d) 
or deviceID(%d) is invalid", cardID, deviceID) + } + var logicID C.int + if retCode := C.dcmi_get_device_logic_id_new(&logicID, C.int(cardID), + C.int(deviceID)); int32(retCode) != common.Success { + return common.RetError, fmt.Errorf("failed to get logicID by cardID(%d) and deviceID(%d), error code: %d", + cardID, deviceID, int32(retCode)) + } + + // check whether logicID is invalid + if !common.IsValidLogicIDOrPhyID(int32(logicID)) { + return common.RetError, fmt.Errorf("get invalid logicID: %d", int32(logicID)) + } + return int32(logicID), nil +} + +// DcSetDestroyVirtualDevice destroy virtual device +func (d *DcManager) DcSetDestroyVirtualDevice(cardID, deviceID int32, vDevID uint32) error { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + if retCode := C.dcmi_set_destroy_vdevice(C.int(cardID), C.int(deviceID), + C.uint(vDevID)); int32(retCode) != common.Success { + return fmt.Errorf("destroy virtual device failed, error code: %d", int32(retCode)) + } + return nil +} + +func convertCreateVDevOut(cCreateVDevOut C.struct_dcmi_create_vdev_out) common.CgoCreateVDevOut { + cgoCreateVDevOut := common.CgoCreateVDevOut{ + VDevID: uint32(cCreateVDevOut.vdev_id), + PcieBus: uint32(cCreateVDevOut.pcie_bus), + PcieDevice: uint32(cCreateVDevOut.pcie_device), + PcieFunc: uint32(cCreateVDevOut.pcie_func), + VfgID: uint32(cCreateVDevOut.vfg_id), + } + return cgoCreateVDevOut +} + +// DcCreateVirtualDevice create virtual device +func (d *DcManager) DcCreateVirtualDevice(cardID, deviceID int32, vDevInfo common.CgoCreateVDevRes) (common. 
+ CgoCreateVDevOut, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.CgoCreateVDevOut{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + if len(vDevInfo.TemplateName) > templateNameLen { + return common.CgoCreateVDevOut{}, fmt.Errorf("the length of template name exceeds the upper limit") + } + cTemplateName := [templateNameLen]C.char{0} + for i := 0; i < len(vDevInfo.TemplateName); i++ { + cTemplateName[i] = C.char(vDevInfo.TemplateName[i]) + } + deviceCreateStr := C.struct_dcmi_create_vdev_res_stru{ + vdev_id: C.uint(vDevInfo.VDevID), + vfg_id: C.uint(vDevInfo.VfgID), + template_name: cTemplateName, + } + + var createVDevOut C.struct_dcmi_create_vdev_out + if retCode := C.dcmi_create_vdevice(C.int(cardID), C.int(deviceID), &deviceCreateStr, + &createVDevOut); int32(retCode) != common.Success { + return common.CgoCreateVDevOut{}, fmt.Errorf("create vdevice failed, error is: %d", int32(retCode)) + } + + return convertCreateVDevOut(createVDevOut), nil +} + +func convertToString(cgoArr [dcmiVDevResNameLen]C.char) string { + var charArr []rune + for _, v := range cgoArr { + if v == 0 { + break + } + charArr = append(charArr, rune(v)) + } + return string(charArr) +} + +func convertBaseResource(cBaseResource C.struct_dcmi_base_resource) common.CgoBaseResource { + baseResource := common.CgoBaseResource{ + Token: uint64(cBaseResource.token), + TokenMax: uint64(cBaseResource.token_max), + TaskTimeout: uint64(cBaseResource.task_timeout), + VfgID: uint32(cBaseResource.vfg_id), + VipMode: uint8(cBaseResource.vip_mode), + } + return baseResource +} + +func convertComputingResource(cComputingResource C.struct_dcmi_computing_resource) common.CgoComputingResource { + computingResource := common.CgoComputingResource{ + Aic: float32(cComputingResource.aic), + Aiv: float32(cComputingResource.aiv), + Dsa: uint16(cComputingResource.dsa), + Rtsq: uint16(cComputingResource.rtsq), + Acsq: uint16(cComputingResource.acsq), + 
Cdqm: uint16(cComputingResource.cdqm), + CCore: uint16(cComputingResource.c_core), + Ffts: uint16(cComputingResource.ffts), + Sdma: uint16(cComputingResource.sdma), + PcieDma: uint16(cComputingResource.pcie_dma), + MemorySize: uint64(cComputingResource.memory_size), + EventID: uint32(cComputingResource.event_id), + NotifyID: uint32(cComputingResource.notify_id), + StreamID: uint32(cComputingResource.stream_id), + ModelID: uint32(cComputingResource.model_id), + TopicScheduleAicpu: uint16(cComputingResource.topic_schedule_aicpu), + HostCtrlCPU: uint16(cComputingResource.host_ctrl_cpu), + HostAicpu: uint16(cComputingResource.host_aicpu), + DeviceAicpu: uint16(cComputingResource.device_aicpu), + TopicCtrlCPUSlot: uint16(cComputingResource.topic_ctrl_cpu_slot), + } + return computingResource +} + +func convertMediaResource(cMediaResource C.struct_dcmi_media_resource) common.CgoMediaResource { + mediaResource := common.CgoMediaResource{ + Jpegd: float32(cMediaResource.jpegd), + Jpege: float32(cMediaResource.jpege), + Vpc: float32(cMediaResource.vpc), + Vdec: float32(cMediaResource.vdec), + Pngd: float32(cMediaResource.pngd), + Venc: float32(cMediaResource.venc), + } + return mediaResource +} + +func convertVDevQueryInfo(cVDevQueryInfo C.struct_dcmi_vdev_query_info) common.CgoVDevQueryInfo { + name := convertToString(cVDevQueryInfo.name) + vDevQueryInfo := common.CgoVDevQueryInfo{ + Name: string(name), + Status: uint32(cVDevQueryInfo.status), + IsContainerUsed: uint32(cVDevQueryInfo.is_container_used), + Vfid: uint32(cVDevQueryInfo.vfid), + VfgID: uint32(cVDevQueryInfo.vfg_id), + ContainerID: uint64(cVDevQueryInfo.container_id), + Base: convertBaseResource(cVDevQueryInfo.base), + Computing: convertComputingResource(cVDevQueryInfo.computing), + Media: convertMediaResource(cVDevQueryInfo.media), + } + return vDevQueryInfo +} + +func convertVDevQueryStru(cVDevQueryStru C.struct_dcmi_vdev_query_stru) common.CgoVDevQueryStru { + vDevQueryStru := common.CgoVDevQueryStru{ + 
VDevID: uint32(cVDevQueryStru.vdev_id), + QueryInfo: convertVDevQueryInfo(cVDevQueryStru.query_info), + } + return vDevQueryStru +} + +// DcGetDeviceVDevResource get virtual device resource info +func (d *DcManager) DcGetDeviceVDevResource(cardID, deviceID int32, vDevID uint32) (common.CgoVDevQueryStru, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.CgoVDevQueryStru{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var cMainCmd = C.enum_dcmi_main_cmd(MainCmdVDevMng) + subCmd := VmngSubCmdGetVDevResource + var vDevResource C.struct_dcmi_vdev_query_stru + size := C.uint(unsafe.Sizeof(vDevResource)) + vDevResource.vdev_id = C.uint(vDevID) + if retCode := C.dcmi_get_device_info(C.int(cardID), C.int(deviceID), cMainCmd, C.uint(subCmd), + unsafe.Pointer(&vDevResource), &size); int32(retCode) != common.Success { + return common.CgoVDevQueryStru{}, fmt.Errorf("get device info failed, error is: %d", int32(retCode)) + } + return convertVDevQueryStru(vDevResource), nil +} + +func convertSocTotalResource(cSocTotalResource C.struct_dcmi_soc_total_resource) common.CgoSocTotalResource { + socTotalResource := common.CgoSocTotalResource{ + VDevNum: uint32(cSocTotalResource.vdev_num), + VfgNum: uint32(cSocTotalResource.vfg_num), + VfgBitmap: uint32(cSocTotalResource.vfg_bitmap), + Base: convertBaseResource(cSocTotalResource.base), + Computing: convertComputingResource(cSocTotalResource.computing), + Media: convertMediaResource(cSocTotalResource.media), + } + for i := uint32(0); i < uint32(cSocTotalResource.vdev_num) && i < dcmiMaxVdevNum; i++ { + socTotalResource.VDevID = append(socTotalResource.VDevID, uint32(cSocTotalResource.vdev_id[i])) + } + return socTotalResource +} + +// DcGetDeviceTotalResource get device total resource info +func (d *DcManager) DcGetDeviceTotalResource(cardID, deviceID int32) (common.CgoSocTotalResource, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return 
common.CgoSocTotalResource{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var cMainCmd = C.enum_dcmi_main_cmd(MainCmdVDevMng) + subCmd := VmngSubCmdGetTotalResource + var totalResource C.struct_dcmi_soc_total_resource + size := C.uint(unsafe.Sizeof(totalResource)) + if retCode := C.dcmi_get_device_info(C.int(cardID), C.int(deviceID), cMainCmd, C.uint(subCmd), + unsafe.Pointer(&totalResource), &size); int32(retCode) != common.Success { + return common.CgoSocTotalResource{}, fmt.Errorf("get device info failed, error is: %d", int32(retCode)) + } + if uint32(totalResource.vdev_num) > dcmiMaxVdevNum { + return common.CgoSocTotalResource{}, fmt.Errorf("get error virtual quantity: %d", + uint32(totalResource.vdev_num)) + } + + return convertSocTotalResource(totalResource), nil +} + +func convertSuperPodInfo(cSuperPodInfo C.struct_dcmi_spod_info) common.CgoSuperPodInfo { + superPodInfo := common.CgoSuperPodInfo{ + SdId: uint32(cSuperPodInfo.sdid), + ScaleType: uint32(cSuperPodInfo.scale_type), + SuperPodId: uint32(cSuperPodInfo.super_pod_id), + ServerId: uint32(cSuperPodInfo.server_id), + } + + for i := uint32(0); i < dcmiMaxReserveNum; i++ { + superPodInfo.Reserve = append(superPodInfo.Reserve, uint32(cSuperPodInfo.reserve[i])) + } + + return superPodInfo +} + +// DcGetSuperPodInfo get super pod info of an NPU device, other unit types are rejected +func (d *DcManager) DcGetSuperPodInfo(cardID, deviceID int32) (common.CgoSuperPodInfo, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.CgoSuperPodInfo{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + + var unitType C.enum_dcmi_unit_type + if retCode := C.dcmi_get_device_type(C.int(cardID), C.int(deviceID), &unitType); int32(retCode) != common.Success { + return common.CgoSuperPodInfo{}, fmt.Errorf("get device type failed, error is: %d", int32(retCode)) + } + if int32(unitType) != common.NpuType { + return common.CgoSuperPodInfo{}, fmt.Errorf("not support unit type: 
%d", int32(unitType)) + } + + var cMainCmd = C.enum_dcmi_main_cmd(MainCmdChipInf) + subCmd := CinfSubCmdGetSPodInfo + var sPodInfo C.struct_dcmi_spod_info + size := C.uint(unsafe.Sizeof(sPodInfo)) + if retCode := C.dcmi_get_device_info(C.int(cardID), C.int(deviceID), cMainCmd, C.uint(subCmd), + unsafe.Pointer(&sPodInfo), &size); int32(retCode) != common.Success { + return common.CgoSuperPodInfo{}, fmt.Errorf("get super pod info failed, error is: %d", int32(retCode)) + } + + return convertSuperPodInfo(sPodInfo), nil +} + +func convertSocFreeResource(cSocFreeResource C.struct_dcmi_soc_free_resource) common.CgoSocFreeResource { + socFreeResource := common.CgoSocFreeResource{ + VfgNum: uint32(cSocFreeResource.vfg_num), + VfgBitmap: uint32(cSocFreeResource.vfg_bitmap), + Base: convertBaseResource(cSocFreeResource.base), + Computing: convertComputingResource(cSocFreeResource.computing), + Media: convertMediaResource(cSocFreeResource.media), + } + return socFreeResource +} + +// DcGetDeviceFreeResource get device free resource info +func (d *DcManager) DcGetDeviceFreeResource(cardID, deviceID int32) (common.CgoSocFreeResource, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.CgoSocFreeResource{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var cMainCmd = C.enum_dcmi_main_cmd(MainCmdVDevMng) + subCmd := VmngSubCmdGetFreeResource + var freeResource C.struct_dcmi_soc_free_resource + size := C.uint(unsafe.Sizeof(freeResource)) + if retCode := C.dcmi_get_device_info(C.int(cardID), C.int(deviceID), cMainCmd, C.uint(subCmd), + unsafe.Pointer(&freeResource), &size); int32(retCode) != common.Success { + return common.CgoSocFreeResource{}, fmt.Errorf("get device info failed, error is: %d", int32(retCode)) + } + return convertSocFreeResource(freeResource), nil +} + +// DcGetVDevActivityInfo get vir device activity info by virtual device id +func (d *DcManager) DcGetVDevActivityInfo(cardID, deviceID int32, vDevID 
uint32) (common.VDevActivityInfo, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.VDevActivityInfo{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + if !common.IsValidVDevID(vDevID) { + return common.VDevActivityInfo{}, fmt.Errorf("vDevID(%d) invalid", vDevID) + } + var cMainCmd = C.enum_dcmi_main_cmd(MainCmdVDevMng) + subCmd := VmngSubCmdGetVDevActivity + var vDevActivityInfo C.struct_dcmi_vdev_query_stru + size := C.uint(unsafe.Sizeof(vDevActivityInfo)) + vDevActivityInfo.vdev_id = C.uint(vDevID) + if retCode := C.dcmi_get_device_info(C.int(cardID), C.int(deviceID), cMainCmd, C.uint(subCmd), + unsafe.Pointer(&vDevActivityInfo), &size); int32(retCode) != common.Success { + return common.VDevActivityInfo{}, fmt.Errorf("retCode: %d", int32(retCode)) + } + totalMemSize := uint64(vDevActivityInfo.query_info.computing.vdev_memory_total) + freeMemSize := uint64(vDevActivityInfo.query_info.computing.vdev_memory_free) + if freeMemSize > totalMemSize { // compare before subtracting: a uint64 difference wraps around instead of going negative + return common.VDevActivityInfo{}, errors.New("used memory value abnormal") + } + return common.VDevActivityInfo{ + VDevID: vDevID, + VDevAiCoreRate: uint32(vDevActivityInfo.query_info.computing.vdev_aicore_utilization), + VDevTotalMem: totalMemSize, + VDevUsedMem: totalMemSize - freeMemSize, + IsVirtualDev: true, + }, nil +} + +// DcVGetDeviceInfo get vdevice resource info +func (d *DcManager) DcVGetDeviceInfo(cardID, deviceID int32) (common.VirtualDevInfo, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.VirtualDevInfo{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var unitType C.enum_dcmi_unit_type + if retCode := C.dcmi_get_device_type(C.int(cardID), C.int(deviceID), &unitType); int32(retCode) != common.Success { + return common.VirtualDevInfo{}, fmt.Errorf("get device type failed, error is: %d", int32(retCode)) + } + if int32(unitType) != common.NpuType { + return common.VirtualDevInfo{}, 
fmt.Errorf("not support unit type: %d", int32(unitType)) + } + + cgoDcmiSocTotalResource, err := d.DcGetDeviceTotalResource(cardID, deviceID) + if err != nil { + return common.VirtualDevInfo{}, fmt.Errorf("get device total resource failed, error is: %v", err) + } + + cgoDcmiSocFreeResource, err := d.DcGetDeviceFreeResource(cardID, deviceID) + if err != nil { + return common.VirtualDevInfo{}, fmt.Errorf("get device free resource failed, error is: %v", err) + } + dcmiVDevInfo := common.VirtualDevInfo{ + TotalResource: cgoDcmiSocTotalResource, + FreeResource: cgoDcmiSocFreeResource, + } + for _, vDevID := range cgoDcmiSocTotalResource.VDevID { + cgoVDevQueryStru, err := d.DcGetDeviceVDevResource(cardID, deviceID, vDevID) + if err != nil { + return common.VirtualDevInfo{}, fmt.Errorf("get device virtual resource failed, error is: %v", err) + } + dcmiVDevInfo.VDevInfo = append(dcmiVDevInfo.VDevInfo, cgoVDevQueryStru) + vDevActivityInfo, err := d.DcGetVDevActivityInfo(cardID, deviceID, vDevID) + if err != nil { + hwlog.RunLog.Warnf("get cur vDev's activity info failed, err: %s", err) + continue + } + vDevActivityInfo.VDevAiCore = float64(cgoVDevQueryStru.QueryInfo.Computing.Aic) + dcmiVDevInfo.VDevActivityInfo = append(dcmiVDevInfo.VDevActivityInfo, vDevActivityInfo) + } + return dcmiVDevInfo, nil +} + +// DcGetCardIDDeviceID get card id and device id from logic id +func (d *DcManager) DcGetCardIDDeviceID(logicID int32) (int32, int32, error) { + if !common.IsValidLogicIDOrPhyID(logicID) { + return common.RetError, common.RetError, fmt.Errorf("input invalid logicID: %d", logicID) + } + var cardID, deviceID C.int + if retCode := C.dcmi_get_card_id_device_id_from_logicid(&cardID, &deviceID, + C.uint(logicID)); int32(retCode) != common.Success { + return common.RetError, common.RetError, + fmt.Errorf("failed to get card id and device id by logicID(%d), errorcode is: %d", logicID, + int32(retCode)) + } + if !common.IsValidCardIDAndDeviceID(int32(cardID), int32(deviceID)) { + 
return common.RetError, common.RetError, fmt.Errorf("failed to get card id and device id, "+ + "cardID(%d) or deviceID(%d) is invalid", int32(cardID), int32(deviceID)) + } + + return int32(cardID), int32(deviceID), nil +} + +// DcCreateVDevice create virtual device by logic id +func (d *DcManager) DcCreateVDevice(logicID int32, vDevInfo common.CgoCreateVDevRes) (common. + CgoCreateVDevOut, error) { + if !common.IsValidLogicIDOrPhyID(logicID) { + return common.CgoCreateVDevOut{}, fmt.Errorf("input invalid logicID: %d", logicID) + } + cardID, deviceID, err := d.DcGetCardIDDeviceID(logicID) + if err != nil { + return common.CgoCreateVDevOut{}, fmt.Errorf("get card id and device id failed, error is: %v", err) + } + + createVDevOut, err := d.DcCreateVirtualDevice(cardID, deviceID, vDevInfo) + if err != nil { + return common.CgoCreateVDevOut{}, fmt.Errorf("create virtual device failed, error is: %v", err) + } + return createVDevOut, nil +} + +// DcGetVDeviceInfo get virtual device info by logic id +func (d *DcManager) DcGetVDeviceInfo(logicID int32) (common.VirtualDevInfo, error) { + if !common.IsValidLogicIDOrPhyID(logicID) { + return common.VirtualDevInfo{}, fmt.Errorf("input invalid logicID: %d", logicID) + } + cardID, deviceID, err := d.DcGetCardIDDeviceID(logicID) + if err != nil { + return common.VirtualDevInfo{}, fmt.Errorf("get card id and device id failed, error is: %v", err) + } + + dcmiVDevInfo, err := d.DcVGetDeviceInfo(cardID, deviceID) + if err != nil { + return common.VirtualDevInfo{}, fmt.Errorf("get virtual device info failed, error is: %v", err) + } + return dcmiVDevInfo, nil +} + +// DcDestroyVDevice destroy spec virtual device by logic id +func (d *DcManager) DcDestroyVDevice(logicID int32, vDevID uint32) error { + if !common.IsValidLogicIDOrPhyID(logicID) { + return fmt.Errorf("input invalid logicID: %d", logicID) + } + cardID, deviceID, err := d.DcGetCardIDDeviceID(logicID) + if err != nil { + return fmt.Errorf("get card id and device id failed, 
error is: %v", err) + } + + if err = d.DcSetDestroyVirtualDevice(cardID, deviceID, vDevID); err != nil { + return fmt.Errorf("destroy virtual device failed, error is: %v", err) + } + return nil +} + +// DcGetDeviceVoltage the accuracy is 0.01v. +func (d *DcManager) DcGetDeviceVoltage(cardID, deviceID int32) (float32, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.RetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var vol C.uint + if retCode := C.dcmi_get_device_voltage(C.int(cardID), C.int(deviceID), &vol); int32(retCode) != common.Success { + return common.RetError, fmt.Errorf("failed to obtain the voltage based on card_id(%d) and "+ + "device_id(%d), error code: %d", cardID, deviceID, int32(retCode)) + } + // the voltage's value is error if it's greater than or equal to MaxInt32 + if common.IsGreaterThanOrEqualInt32(int64(vol)) { + return common.RetError, fmt.Errorf("voltage value out of range(max is int32), "+ + "card_id(%d) and device_id(%d), voltage: %d", cardID, deviceID, int64(vol)) + } + + return float32(vol) * common.ReduceOnePercent, nil +} + +// DcGetDevicePowerInfo the accuracy is 0.1w, the result like: 8.2 +func (d *DcManager) DcGetDevicePowerInfo(cardID, deviceID int32) (float32, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.RetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var cpower C.int + if retCode := C.dcmi_get_device_power_info(C.int(cardID), C.int(deviceID), + &cpower); int32(retCode) != common.Success { + return common.RetError, fmt.Errorf("failed to obtain the power based on card_id(%d) and device_id(%d)"+ + ", error code: %d", cardID, deviceID, int32(retCode)) + } + parsedPower := float32(cpower) + if parsedPower < 0 { + return common.RetError, fmt.Errorf("get wrong device power, card_id(%d) and device_id(%d), power: %f", + cardID, deviceID, parsedPower) + } + + return parsedPower * 
common.ReduceTenth, nil + +} + +// DcGetDeviceFrequency get device frequency, unit MHz +// Ascend910B with frequency type: 2,6,7,9 +// Ascend910 with frequency type: 2,6,7,9 +// Ascend310 with frequency type: 1,2,6,7,9 +// Ascend310P with frequency type: 1,2,7,9,12 +// more information see common.DeviceType +func (d *DcManager) DcGetDeviceFrequency(cardID, deviceID int32, devType common.DeviceType) (uint32, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.UnRetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var cFrequency C.uint + if retCode := C.dcmi_get_device_frequency(C.int(cardID), C.int(deviceID), C.enum_dcmi_freq_type(devType.Code), + &cFrequency); int32(retCode) != common.Success { + return common.UnRetError, + buildDcmiErr(cardID, deviceID, fmt.Sprintf("frequency (name: %v, code:%d)", devType.Name, devType.Code), retCode) + } + // check whether cFrequency is too big + if common.IsGreaterThanOrEqualInt32(int64(cFrequency)) || int64(cFrequency) < 0 { + return common.UnRetError, fmt.Errorf("frequency value out of range [0, int32),card_id(%d) and device_id(%d), "+ + "frequency (name: %v, code:%d): %d", cardID, deviceID, devType.Name, devType.Code, int64(cFrequency)) + } + return uint32(cFrequency), nil +} + +// DcGetMemoryInfo use v3 interface to query memory info +func (d *DcManager) DcGetMemoryInfo(cardID, deviceID int32) (*common.MemoryInfo, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return nil, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var cmInfoV3 CDcmiMemoryInfoV3 + if retCode := C.dcmi_get_device_memory_info_v3(C.int(cardID), C.int(deviceID), + &cmInfoV3); int32(retCode) != common.Success { + return nil, fmt.Errorf("failed to obtain the memory info by v3 interface based on card_id("+ + "%d) and device_id(%d), error code: %d", cardID, deviceID, int32(retCode)) + } + + if uint64(cmInfoV3.memory_size) < 
uint64(cmInfoV3.memory_available) { + return nil, fmt.Errorf("failed to obtain the memory info by v3 interface based on card_id("+ + "%d) and device_id(%d), total memory is less than available memory", cardID, deviceID) + } + + return &common.MemoryInfo{ + MemorySize: uint64(cmInfoV3.memory_size), + MemoryAvailable: uint64(cmInfoV3.memory_available), + Frequency: uint32(cmInfoV3.freq), + Utilization: uint32(cmInfoV3.utiliza), + }, nil + +} + +// FuncDcmiGetDeviceHbmInfo dcmi_get_device_hbm_info function for outer invoke, only for Ascend910 +func FuncDcmiGetDeviceHbmInfo(cardID, deviceID int32) (*common.HbmInfo, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return nil, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var cHbmInfo C.struct_dcmi_hbm_info + if retCode := C.dcmi_get_device_hbm_info(C.int(cardID), C.int(deviceID), + &cHbmInfo); int32(retCode) != common.Success { + return nil, buildDcmiErr(cardID, deviceID, "high bandwidth memory info", retCode) + } + hbmTemp := int32(cHbmInfo.temp) + if hbmTemp < 0 { + return nil, fmt.Errorf("get wrong device HBM temporary, card_id(%d) and device_id(%d), HBM.temp: %d", + cardID, deviceID, hbmTemp) + } + return &common.HbmInfo{ + MemorySize: uint64(cHbmInfo.memory_size), + Frequency: uint32(cHbmInfo.freq), + Usage: uint64(cHbmInfo.memory_usage), + Temp: hbmTemp, + BandWidthUtilRate: uint32(cHbmInfo.bandwith_util_rate)}, nil +} + +// DcGetHbmInfo get HBM information A310/A310P not support +func (d *DcManager) DcGetHbmInfo(cardID, deviceID int32) (*common.HbmInfo, error) { + return &common.HbmInfo{ + MemorySize: 0, + Frequency: 0, + Usage: 0, + Temp: 0, + BandWidthUtilRate: 0}, nil +} + +// DcGetDeviceErrorCode get the error count and errorcode of the device,only return the first errorcode +func (d *DcManager) DcGetDeviceErrorCode(cardID, deviceID int32) (int32, int64, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.RetError, 
common.RetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, + deviceID) + } + var errCount C.int + var errCodeArray [common.MaxErrorCodeCount]C.uint + if retCode := C.dcmi_get_device_errorcode_v2(C.int(cardID), C.int(deviceID), &errCount, &errCodeArray[0], + common.MaxErrorCodeCount); int32(retCode) != common.Success { + return common.RetError, common.RetError, fmt.Errorf("failed to obtain the device errorcode based on "+ + "card_id(%d) and device_id(%d), error code: %d, error count: %d", cardID, deviceID, int32(retCode), + int32(errCount)) + } + + if int32(errCount) < 0 || int32(errCount) > common.MaxErrorCodeCount { + return common.RetError, common.RetError, fmt.Errorf("get wrong errorcode count, "+ + "card_id(%d) and device_id(%d), errorcode count: %d", cardID, deviceID, int32(errCount)) + } + + return int32(errCount), int64(errCodeArray[0]), nil +} + +// DcGetDeviceCount get device count +func (d *DcManager) DcGetDeviceCount() (int32, error) { + devNum, _, err := d.DcGetLogicIDList() + if err != nil { + return common.RetError, fmt.Errorf("get device count failed, error: %v", err) + } + return devNum, nil +} + +// DcGetLogicIDList get device logic id list +func (d *DcManager) DcGetLogicIDList() (int32, []int32, error) { + logicIDs := make([]int32, 0) + var totalNum int32 + _, cardList, err := d.DcGetCardList() + if err != nil { + return common.RetError, logicIDs, fmt.Errorf("get card list failed, error: %v", err) + } + for _, cardID := range cardList { + devNumInCard, err := d.DcGetDeviceNumInCard(cardID) + if err != nil { + return common.RetError, logicIDs, fmt.Errorf("get device num by cardID: %d failed, error: %v", + cardID, err) + } + totalNum += devNumInCard + if totalNum > common.HiAIMaxDeviceNum*common.HiAIMaxCardNum { + return common.RetError, nil, fmt.Errorf("get device num: %d greater than %d", + totalNum, common.HiAIMaxDeviceNum*common.HiAIMaxCardNum) + } + for devID := int32(0); devID < devNumInCard; devID++ { + logicID, err := 
d.DcGetDeviceLogicID(cardID, devID) + if err != nil { + return common.RetError, nil, fmt.Errorf("get device (cardID: %d, deviceID: %d) logic id "+ + "failed, error: %v", cardID, devID, err) + } + logicIDs = append(logicIDs, logicID) + } + } + return totalNum, logicIDs, nil +} + +// DcGetDeviceHealth get device health +func (d *DcManager) DcGetDeviceHealth(cardID, deviceID int32) (int32, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.RetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var health C.uint + if retCode := C.dcmi_get_device_health(C.int(cardID), C.int(deviceID), + &health); int32(retCode) != common.Success { + return common.RetError, fmt.Errorf("get device (cardID: %d, deviceID: %d) health state failed, ret "+ + "code: %d, health code: %d", cardID, deviceID, int32(retCode), int64(health)) + } + if common.IsGreaterThanOrEqualInt32(int64(health)) { + return common.RetError, fmt.Errorf("get wrong health state , device (cardID: %d, deviceID: %d) "+ + "health: %d", cardID, deviceID, int64(health)) + } + return int32(health), nil +} + +// DcGetDeviceUtilizationRate get device utils rate by id +func (d *DcManager) DcGetDeviceUtilizationRate(cardID, deviceID int32, devType common.DeviceType) (int32, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.RetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var rate C.uint + if retCode := C.dcmi_get_device_utilization_rate(C.int(cardID), C.int(deviceID), C.int(devType.Code), + &rate); int32(retCode) != common.Success { + return common.RetError, + buildDcmiErr(cardID, deviceID, fmt.Sprintf("utilization (name: %v, code:%d)", devType.Name, devType.Code), retCode) + } + if !common.IsValidUtilizationRate(uint32(rate)) { + return common.RetError, fmt.Errorf("get wrong device (cardID: %d, deviceID: %d) "+ + "utilization (name: %v, code:%d): %d", cardID, deviceID, devType.Name, devType.Code, 
uint32(rate)) + } + return int32(rate), nil +} + +// DcGetDeviceTemperature get the device temperature +func (d *DcManager) DcGetDeviceTemperature(cardID, deviceID int32) (int32, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.RetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var temp C.int + if retCode := C.dcmi_get_device_temperature(C.int(cardID), C.int(deviceID), + &temp); int32(retCode) != common.Success { + return common.RetError, fmt.Errorf("get device (cardID: %d, deviceID: %d) temperature failed, error "+ + "code is : %d", cardID, deviceID, int32(retCode)) + } + parsedTemp := int32(temp) + if parsedTemp < int32(common.DefaultTemperatureWhenQueryFailed) { + return common.RetError, fmt.Errorf("get wrong device temperature, devcie (cardID: %d, deviceID: %d), "+ + "temperature: %d", cardID, deviceID, parsedTemp) + } + return parsedTemp, nil +} + +func convertUCharToCharArr(cgoArr [maxChipNameLen]C.uchar) []byte { + var charArr []byte + for _, v := range cgoArr { + if v == 0 { + break + } + charArr = append(charArr, byte(v)) + } + return charArr +} + +// DcGetChipInfo get the chip info by cardID and deviceID +func (d *DcManager) DcGetChipInfo(cardID, deviceID int32) (*common.ChipInfo, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return nil, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var chipInfo C.struct_dcmi_chip_info_v2 + chip := &common.ChipInfo{} + if rCode := C.dcmi_get_device_chip_info_v2(C.int(cardID), C.int(deviceID), &chipInfo); int32(rCode) != common.Success { + hwlog.RunLog.Debugf("get device ChipInfo information failed, cardID(%d), deviceID(%d),"+ + " error code: %d", cardID, deviceID, int32(rCode)) + var oldChipInfo C.struct_dcmi_chip_info + if rCode = C.dcmi_get_device_chip_info(C.int(cardID), C.int(deviceID), &oldChipInfo); int32(rCode) != common.Success { + return nil, fmt.Errorf("get device ChipInfo information failed, 
cardID(%d), deviceID(%d),"+ + " error code: %d", cardID, deviceID, int32(rCode)) + } + chip.Name = string(convertUCharToCharArr(oldChipInfo.chip_name)) + chip.Type = string(convertUCharToCharArr(oldChipInfo.chip_type)) + chip.Version = string(convertUCharToCharArr(oldChipInfo.chip_ver)) + chip.AICoreCnt = int(oldChipInfo.aicore_cnt) + } else { + chip.Name = string(convertUCharToCharArr(chipInfo.chip_name)) + chip.Type = string(convertUCharToCharArr(chipInfo.chip_type)) + chip.Version = string(convertUCharToCharArr(chipInfo.chip_ver)) + chip.AICoreCnt = int(chipInfo.aicore_cnt) + chip.NpuName = string(convertUCharToCharArr(chipInfo.npu_name)) + } + if !common.IsValidChipInfo(chip) { + return nil, fmt.Errorf("get device ChipInfo information failed, chip info is empty,"+ + " cardID(%d), deviceID(%d)", cardID, deviceID) + } + + return chip, nil +} + +// DcGetPhysicIDFromLogicID get physicID from logicID +func (d *DcManager) DcGetPhysicIDFromLogicID(logicID int32) (int32, error) { + if !common.IsValidLogicIDOrPhyID(logicID) { + return common.RetError, fmt.Errorf("logicID(%d) is invalid", logicID) + } + var physicID C.uint + if rCode := C.dcmi_get_device_phyid_from_logicid(C.uint(logicID), &physicID); int32(rCode) != common.Success { + return common.RetError, fmt.Errorf("get physic id from logicID(%d) failed, error code: %d", logicID, int32(rCode)) + } + if !common.IsValidLogicIDOrPhyID(int32(physicID)) { + return common.RetError, fmt.Errorf("get wrong physicID(%d) from logicID(%d)", uint32(physicID), logicID) + } + return int32(physicID), nil +} + +// DcGetDeviceIPAddress get device IP address by cardID and deviceID +func (d *DcManager) DcGetDeviceIPAddress(cardID, deviceID, ipType int32) (string, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return "", fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var portType C.enum_dcmi_port_type = 1 + var portID C.int + var ipAddress C.struct_dcmi_ip_addr + var maskAddress 
C.struct_dcmi_ip_addr + if ipType == ipAddrTypeV6 { + ipAddress.ip_type = ipAddrTypeV6 + } + rCode := C.dcmi_get_device_ip(C.int(cardID), C.int(deviceID), portType, portID, &ipAddress, &maskAddress) + if int32(rCode) != common.Success { + return "", fmt.Errorf("get device IP address failed, cardID(%d), deviceID(%d), error code: %d", + cardID, deviceID, int32(rCode)) + } + if ipType == ipAddrTypeV6 { + return d.buildIPv6Addr(ipAddress) + } + return d.buildIPv4Addr(ipAddress) +} + +func (d *DcManager) buildIPv4Addr(ipAddress C.struct_dcmi_ip_addr) (string, error) { + deviceIP := make([]string, 0, net.IPv4len) + for key, val := range ipAddress.u_addr { + if key >= net.IPv4len { + break + } + deviceIP = append(deviceIP, fmt.Sprintf("%v", val)) + } + if netIP := net.ParseIP(strings.Join(deviceIP, ".")); netIP != nil { + return netIP.String(), nil + } + return "", fmt.Errorf("the device IPv4 address is invalid, value: %v", deviceIP) +} + +func (d *DcManager) buildIPv6Addr(ipAddress C.struct_dcmi_ip_addr) (string, error) { + deviceIP := make([]byte, 0, net.IPv6len) + for key, val := range ipAddress.u_addr { + if key >= net.IPv6len { + break + } + deviceIP = append(deviceIP, byte(val)) + } + if netIP := net.IP(deviceIP); len(netIP) == net.IPv6len { // a converted slice is never nil, so validate the length instead + return netIP.String(), nil + } + return "", fmt.Errorf("the device IPv6 address is invalid, value: %v", deviceIP) +} + +func callDcmiGetDeviceNetworkHealth(cardID, deviceID int32, result chan<- common.DeviceNetworkHealth) { + var healthCode C.enum_dcmi_rdfx_detect_result + rCode := C.dcmi_get_device_network_health(C.int(cardID), C.int(deviceID), &healthCode) + result <- common.DeviceNetworkHealth{HealthCode: uint32(healthCode), RetCode: int32(rCode)} +} + +// DcGetDeviceNetWorkHealth get device network health by cardID and deviceID +func (d *DcManager) DcGetDeviceNetWorkHealth(cardID, deviceID int32) (uint32, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.UnRetError, fmt.Errorf("cardID(%d) or 
deviceID(%d) is invalid", cardID, deviceID) + } + + result := make(chan common.DeviceNetworkHealth, 1) + go callDcmiGetDeviceNetworkHealth(cardID, deviceID, result) + select { + case res := <-result: + if res.RetCode != common.Success { + return common.UnRetError, fmt.Errorf("get device network healthCode failed, cardID(%d),"+ + " deviceID(%d), ret code: %d, health code: %d", cardID, deviceID, res.RetCode, res.HealthCode) + } + + if int32(res.HealthCode) < 0 || int32(res.HealthCode) > int32(math.MaxInt8) { + return common.UnRetError, fmt.Errorf("get wrong device network healthCode, cardID(%d), deviceID(%d),"+ + " error healthCode: %d", cardID, deviceID, int32(res.HealthCode)) + } + + return res.HealthCode, nil + // dcmi_get_device_network_health is occasionally blocked for a long time, because of retrying, + // after the card dropped. This timeout stops waiting for the dcmi interface (the call itself keeps + // running in its goroutine) if invoking time exceeds common.DcmiApiTimeout seconds. + case <-time.After(common.DcmiApiTimeout * time.Second): + return common.UnRetError, fmt.Errorf("accessing dcmi_get_device_network_health interface timeout, "+ + "cardID(%d), deviceID(%d)", cardID, deviceID) + } +} + +// DcGetLogicIDFromPhysicID get logicID from physicID +func (d *DcManager) DcGetLogicIDFromPhysicID(physicID int32) (int32, error) { + if !common.IsValidLogicIDOrPhyID(physicID) { + return common.RetError, fmt.Errorf("physicID(%d) is invalid", physicID) + } + var logicID C.uint + if rCode := C.dcmi_get_device_logicid_from_phyid(C.uint(physicID), &logicID); int32(rCode) != common.Success { + return common.RetError, fmt.Errorf("get logicID from physicID(%d) failed, error code: %d", + physicID, int32(rCode)) + } + + if !common.IsValidLogicIDOrPhyID(int32(logicID)) { + return common.RetError, fmt.Errorf("get wrong logicID(%d) from physicID(%d)", uint32(logicID), physicID) + } + return int32(logicID), nil +} + +// FuncDcmiMcuGetPowerInfo dcmi_mcu_get_power_info_new function for outer invoke +func 
FuncDcmiMcuGetPowerInfo(cardID int32) (float32, error) { + var power C.int + if retCode := C.dcmi_mcu_get_power_info_new(C.int(cardID), &power); int32(retCode) != common.Success { + return common.RetError, fmt.Errorf("mcu_get_power_info failed, error code is:%d", int32(retCode)) + } + parsedPower := float32(power) + if parsedPower < 0 { + return common.RetError, fmt.Errorf("get wrong mcu_get_power_info, cardID: %d, power: %f", cardID, + parsedPower) + } + return parsedPower * common.ReduceTenth, nil +} + +// DcGetMcuPowerInfo this function is only for Ascend310P, A910/A310 not support +func (d *DcManager) DcGetMcuPowerInfo(cardID int32) (float32, error) { + return 0, nil +} + +// DcGetProductType get product type by dcmi interface +func (d *DcManager) DcGetProductType(cardID, deviceID int32) (string, error) { + cProductType := C.CString(string(make([]byte, productTypeLen))) + defer C.free(unsafe.Pointer(cProductType)) + err := C.dcmi_get_product_type(C.int(cardID), C.int(deviceID), (*C.char)(cProductType), productTypeLen+1) + if err != 0 { + return "", fmt.Errorf("get product type failed, errCode: %d", int32(err)) + } + return C.GoString(cProductType), nil +} + +// DcGetNpuWorkMode get npu work mode, this function is only for Ascend910, A310/310P not support +func (d *DcManager) DcGetNpuWorkMode(cardID int32) (int, error) { + var cWorkMode C.uchar + err := C.dcmi_get_npu_work_mode(C.int(cardID), &cWorkMode) + if err != 0 { + return common.RetError, fmt.Errorf("get npu work mode failed, errCode: %d", int32(err)) + } + return int(cWorkMode), nil +} + +// DcSetDeviceReset reset spec device chip +func (d *DcManager) DcSetDeviceReset(cardID, deviceID int32) error { + var channelType C.enum_dcmi_reset_channel = C.INBAND_CHANNEL + return d.setDeviceReset(cardID, deviceID, channelType) +} + +// DcGetBrotherCardID get brother card id +func (d *DcManager) DcGetBrotherCardID(cardID, deviceID int32) (int32, error) { + var broCardID C.int + errCode := 
C.dcmi_get_netdev_brother_device(C.int(cardID), C.int(deviceID), &broCardID) + if errCode != common.Success { + return common.RetError, fmt.Errorf("unable to get brother card, errCode: %v", errCode) + } + return int32(broCardID), nil +} + +// DcGetOutBandChannelState get out band channel state +func (d *DcManager) DcGetOutBandChannelState(cardID, deviceID int32) error { + var channelState C.int + errCode := C.dcmi_get_device_outband_channel_state(C.int(cardID), C.int(deviceID), &channelState) + if errCode != common.Success { + return fmt.Errorf("get out band channel state error, errCode: %v", errCode) + } + if channelState != common.ChannelStateOk { + return fmt.Errorf("chip reset not support, channel state: %v", channelState) + } + return nil +} + +// DcPreResetSoc pre reset soc, used before reset out band +func (d *DcManager) DcPreResetSoc(cardID, deviceID int32) error { + errCode := C.dcmi_pre_reset_soc(C.int(cardID), C.int(deviceID)) + if errCode != common.Success { + return fmt.Errorf("pre reset failed, cardID: %v, deviceID: %v, errCode: %v", cardID, deviceID, errCode) + } + return nil +} + +// DcSetDeviceResetOutBand reset spec device chip out band +func (d *DcManager) DcSetDeviceResetOutBand(cardID, deviceID int32) error { + var channelType C.enum_dcmi_reset_channel = C.OUTBAND_CHANNEL + return d.setDeviceReset(cardID, deviceID, channelType) +} + +// DcRescanSoc trigger soc rescan, non-blocking +func (d *DcManager) DcRescanSoc(cardID, deviceID int32) error { + errCode := C.dcmi_rescan_soc(C.int(cardID), C.int(deviceID)) + if errCode != common.Success { + return fmt.Errorf("fail to rescan chip cardID %d, deviceID %v, errCode: %v", cardID, deviceID, errCode) + } + return nil +} + +func (d *DcManager) setDeviceReset(cardID, deviceID int32, channelType C.enum_dcmi_reset_channel) error { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + if errCode := 
C.dcmi_set_device_reset(C.int(cardID), C.int(deviceID), channelType); errCode != 0 { + return fmt.Errorf("cardID(%d) and deviceID(%d) hot reset errCode: %v", cardID, deviceID, errCode) + } + return nil +} + +// DcGetDeviceBootStatus get NPU boot status +func (d *DcManager) DcGetDeviceBootStatus(logicID int32) (int, error) { + if !common.IsValidLogicIDOrPhyID(logicID) { + return common.RetError, fmt.Errorf("input invalid logicID: %d", logicID) + } + cardID, deviceID, err := d.DcGetCardIDDeviceID(logicID) + if err != nil { + return common.RetError, fmt.Errorf("failed to get cardID and deviceID by logicID(%d)", logicID) + } + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.RetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var bootStatus C.enum_dcmi_boot_status = C.DCMI_BOOT_STATUS_FINISH + if errCode := C.dcmi_get_device_boot_status(C.int(cardID), C.int(deviceID), &bootStatus); errCode != 0 { + return common.RetError, fmt.Errorf("device boot status errCode: %v", errCode) + } + return int(bootStatus), nil +} + +// DcGetDeviceAllErrorCode get the error count and all error codes of the device +func (d *DcManager) DcGetDeviceAllErrorCode(cardID, deviceID int32) (int32, []int64, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.RetError, nil, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, + deviceID) + } + var errCount C.int + var errCodeArray [common.MaxErrorCodeCount]C.uint + retCode := C.dcmi_get_device_errorcode_v2(C.int(cardID), C.int(deviceID), &errCount, &errCodeArray[0], + common.MaxErrorCodeCount) + + var health C.uint + healthRetCode := C.dcmi_get_device_health(C.int(cardID), C.int(deviceID), &health) + + if int32(retCode) != common.Success && int32(healthRetCode) != common.DeviceNotReadyErrCode { + return common.RetError, nil, fmt.Errorf("failed to obtain the device errorcode based on cardID("+ + "%d) and deviceID(%d), error code: %d, error count: %d", 
cardID, deviceID, int32(retCode), int32(errCount)) + } + + errCodes := make([]int64, 0, len(errCodeArray)) + for _, errCode := range errCodeArray { + if int64(errCode) != 0 { + errCodes = append(errCodes, int64(errCode)) + } + } + + if int32(healthRetCode) == common.DeviceNotReadyErrCode { + hwlog.RunLog.Errorf("device errorcode v2 ret code: %d, device health ret code: %d, device not ready, "+ + "maybe a card drop fault occurred on cardID(%d) and deviceID(%d)", int32(retCode), int32(healthRetCode), + cardID, deviceID) + errCount += 1 + errCodes = append(errCodes, common.CardDropFaultCode) + } + + if int32(errCount) < 0 || int32(errCount) > common.MaxErrorCodeCount { + return common.RetError, nil, fmt.Errorf("get wrong errorcode count, "+ + "cardID(%d) and deviceID(%d), errorcode count: %d", cardID, deviceID, int32(errCount)) + } + + return int32(errCount), errCodes, nil +} + +// DcSubscribeDeviceFaultEvent subscribe device fault, callback with func 'faultEventCallFunc' +func (d *DcManager) DcSubscribeDeviceFaultEvent(cardID, deviceID int32) error { + if faultEventCallFunc == nil { + return errors.New("callFunc is invalid, can't start subscribe") + } + + var filter C.struct_dcmi_event_filter + if rCode := C.dcmi_subscribe_fault_event(C.int(cardID), C.int(deviceID), filter); int32(rCode) != common.Success { + return fmt.Errorf("subscribe fault event failed, cardID(%d) and deviceID(%d), error code: %d", + cardID, deviceID, int32(rCode)) + } + return nil +} + +// DcSetFaultEventCallFunc set fault event call back func +func (d *DcManager) DcSetFaultEventCallFunc(businessFunc func(common.DevFaultInfo)) { + faultEventCallFunc = businessFunc +} + +//export goEventFaultCallBack +func goEventFaultCallBack(event C.struct_dcmi_dms_fault_event) { + if faultEventCallFunc == nil { + hwlog.RunLog.Errorf("no fault event call back func") + return + } + // recovery event recorded fault event occurrence time, the recovery event time cannot be obtained. 
+ // Therefore, all event occurrence time is recorded as the current host time when the event is received. + devFaultInfo := common.DevFaultInfo{ + EventID: int64(event.event_id), + LogicID: int32(event.deviceid), + ModuleType: int8(event.node_type), + ModuleID: int8(event.node_id), + SubModuleType: int8(event.sub_node_type), + SubModuleID: int8(event.sub_node_id), + Severity: int8(event.severity), + Assertion: int8(event.assertion), + AlarmRaisedTime: time.Now().UnixMilli(), + } + faultEventCallFunc(devFaultInfo) +} + +// DcGetDieID get chip die ID, like VDieID or NDieID, only Ascend910 has NDieID +func (d *DcManager) DcGetDieID(cardID, deviceID int32, dcmiDieType DieType) (string, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return "", fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + + if dcmiDieType != VDIE && dcmiDieType != NDIE { + return "", fmt.Errorf("dcmi die type can only be one of %d or %d", VDIE, NDIE) + } + + var dieIDObj C.struct_dcmi_die_id + if retCode := C.dcmi_get_device_die_v2(C.int(cardID), C.int(deviceID), + C.enum_dcmi_die_type(dcmiDieType), &dieIDObj); int32(retCode) != common.Success { + return "", buildDcmiErr(cardID, deviceID, "chip die ID", retCode) + } + + const hexBase = 16 + dieIDStr := make([]string, DieIDCount) + + hwlog.RunLog.Debugf("cardID(%d), deviceID(%d) get die type(%d) value %v", cardID, deviceID, dcmiDieType, + dieIDObj.soc_die) + for i := 0; i < DieIDCount; i++ { + s := strconv.FormatUint(uint64(dieIDObj.soc_die[i]), hexBase) + // Each part of the die id consists of 8 characters, and if the length is not enough, + // zero is added at the beginning + dieIDStr[i] = fmt.Sprintf("%08s", s) + } + return strings.ToUpper(strings.Join(dieIDStr, "-")), nil +} + +// DcGetDevProcessInfo chip process info +func (d *DcManager) DcGetDevProcessInfo(cardID, deviceID int32) (*common.DevProcessInfo, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return nil, 
fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + + var procList [common.MaxProcNum]C.struct_dcmi_proc_mem_info + var procNum C.int + + if retCode := C.dcmi_get_device_resource_info(C.int(cardID), C.int(deviceID), &procList[0], + &procNum); int32(retCode) != common.Success { + return nil, buildDcmiErr(cardID, deviceID, "device resource", retCode) + } + + if int32(procNum) < 0 || int32(procNum) > common.MaxProcNum { + return nil, fmt.Errorf("get invalid proccess num (%d), cardID(%d) and deviceID(%d)", int32(procNum), cardID, + deviceID) + } + + return convertToDevResourceInfo(procList, int32(procNum)), nil +} + +func convertToDevResourceInfo(procList [common.MaxProcNum]C.struct_dcmi_proc_mem_info, + procNum int32) *common.DevProcessInfo { + if procNum < 0 || procNum > common.MaxProcNum { + hwlog.RunLog.Errorf("process num %v is not within in the range [0~%v]", procNum, common.MaxProcNum) + return nil + } + + info := new(common.DevProcessInfo) + if procNum == 0 { + return info + } + + info.ProcNum = procNum + for i := int32(0); i < procNum; i++ { + proc := common.DevProcInfo{ + Pid: int32(procList[i].proc_id), + MemUsage: float64(procList[i].proc_mem_usage) / common.UnitMB, // convert byte to MB + } + info.DevProcArray = append(info.DevProcArray, proc) + } + + return info +} + +// DcGetPCIeBusInfo pcie bus info +func (d *DcManager) DcGetPCIeBusInfo(cardID, deviceID int32) (string, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return "", fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + + var pcieInfo C.struct_dcmi_pcie_info_all + + if retCode := C.dcmi_get_device_pcie_info_v2(C.int(cardID), + C.int(deviceID), &pcieInfo); int32(retCode) != common.Success { + return "", buildDcmiErr(cardID, deviceID, "pcie bus", retCode) + } + + info := fmt.Sprintf("%04X:%02X:%02X.%-4X", int32(pcieInfo.domain), uint32(pcieInfo.bdf_busid), + uint32(pcieInfo.bdf_deviceid), uint32(pcieInfo.bdf_funcid)) + 
hwlog.RunLog.Debugf("pcie bus info is: '%s'", info) + + return strings.TrimRight(info, " "), nil +} + +// DcGetDeviceBoardInfo return board info of device +func (d *DcManager) DcGetDeviceBoardInfo(cardID, deviceID int32) (common.BoardInfo, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.BoardInfo{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + + var cBoardInfo C.struct_dcmi_board_info + + if retCode := C.dcmi_get_device_board_info(C.int(cardID), C.int(deviceID), + &cBoardInfo); int32(retCode) != common.Success { + return common.BoardInfo{}, buildDcmiErr(cardID, deviceID, "board info", retCode) + } + + return common.BoardInfo{ + BoardId: uint32(cBoardInfo.board_id), + PcbId: uint32(cBoardInfo.pcb_id), + BomId: uint32(cBoardInfo.bom_id), + SlotId: uint32(cBoardInfo.slot_id), + }, nil +} + +// DcGetPCIEBandwidth get pcie bandwidth value +func (d *DcManager) DcGetPCIEBandwidth(cardID, deviceID int32, profilingTime int) (common.PCIEBwStat, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.PCIEBwStat{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var dcmiPCIEBandwidth C.struct_dcmi_pcie_link_bandwidth_info + var pcieBandwidth common.PCIEBwStat + dcmiPCIEBandwidth.profiling_time = C.int(profilingTime) + retCode := C.dcmi_get_pcie_link_bandwidth_info(C.int(cardID), C.int(deviceID), &dcmiPCIEBandwidth) + if int32(retCode) != common.Success { + return pcieBandwidth, buildDcmiErr(cardID, deviceID, "PCIEBandwidth", retCode) + } + + pcieBandwidth.PcieRxPBw = d.convertPcieBw(dcmiPCIEBandwidth.rx_p_bw) + pcieBandwidth.PcieRxNPBw = d.convertPcieBw(dcmiPCIEBandwidth.rx_np_bw) + pcieBandwidth.PcieRxCPLBw = d.convertPcieBw(dcmiPCIEBandwidth.rx_cpl_bw) + + pcieBandwidth.PcieTxPBw = d.convertPcieBw(dcmiPCIEBandwidth.tx_p_bw) + pcieBandwidth.PcieTxNPBw = d.convertPcieBw(dcmiPCIEBandwidth.tx_np_bw) + pcieBandwidth.PcieTxCPLBw = 
d.convertPcieBw(dcmiPCIEBandwidth.tx_cpl_bw) + + return pcieBandwidth, nil +} + +func (d *DcManager) convertPcieBw(pcieBwArr [agentdrvProfDataNum]C.uint) common.PcieStatValue { + return common.PcieStatValue{ + PcieMinBw: int32(pcieBwArr[0]), + PcieMaxBw: int32(pcieBwArr[1]), + PcieAvgBw: int32(pcieBwArr[agentdrvProfDataNum-1]), + } +} + +// DcGetDcmiVersion return dcmi version +func (d *DcManager) DcGetDcmiVersion() (string, error) { + cDcmiVer := C.CString(string(make([]byte, dcmiVersionLen))) + defer C.free(unsafe.Pointer(cDcmiVer)) + if retCode := C.dcmi_get_dcmi_version((*C.char)(cDcmiVer), dcmiVersionLen+1); int32(retCode) != common.Success { + return "", fmt.Errorf("get dcmi version failed, errCode: %d", int32(retCode)) + } + return C.GoString(cDcmiVer), nil +} + +// DcGetDeviceEccInfo get ECC info +func (d *DcManager) DcGetDeviceEccInfo(cardID, deviceID int32, inputType common.DcmiDeviceType) ( + *common.ECCInfo, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return nil, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + dcmiDeviceType, err := d.getInputType(inputType) + if err != nil { + return nil, err + } + var deviceEccInfo C.struct_dcmi_ecc_info + if retCode := C.dcmi_get_device_ecc_info(C.int(cardID), C.int(deviceID), dcmiDeviceType, + &deviceEccInfo); retCode != 0 { + return nil, buildDcmiErr(cardID, deviceID, "dcmi device ECC", retCode) + } + eccInfo := &common.ECCInfo{ + EnableFlag: int32(deviceEccInfo.enable_flag), + SingleBitErrorCnt: int64(deviceEccInfo.single_bit_error_cnt), + DoubleBitErrorCnt: int64(deviceEccInfo.double_bit_error_cnt), + TotalSingleBitErrorCnt: int64(deviceEccInfo.total_single_bit_error_cnt), + TotalDoubleBitErrorCnt: int64(deviceEccInfo.total_double_bit_error_cnt), + SingleBitIsolatedPagesCnt: int64(deviceEccInfo.single_bit_isolated_pages_cnt), + DoubleBitIsolatedPagesCnt: int64(deviceEccInfo.double_bit_isolated_pages_cnt), + } + return eccInfo, nil +} + +// 
DcGetHccsStatisticInfo get HCCS statistic info +func (d *DcManager) DcGetHccsStatisticInfo(cardID, deviceID int32) (common.HccsStatisticInfo, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.HccsStatisticInfo{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var cMainCmd = C.enum_dcmi_main_cmd(MainCmdHccs) + subCmd := HccsSubCmdGetStatisticInfo + var hccsStatisticInfo C.struct_dcmi_hccs_statistic_info + // Use a secure function to get the address (for cleanCode) + addr, err := getAddrWithOffset(unsafe.Pointer(&hccsStatisticInfo), unsafe.Sizeof(hccsStatisticInfo), 0) + if err != nil { + return common.HccsStatisticInfo{}, fmt.Errorf("get hccsStatisticInfo addr failed, error is: %v", err) + } + size := C.uint(unsafe.Sizeof(hccsStatisticInfo)) + if retCode := C.dcmi_get_device_info(C.int(cardID), C.int(deviceID), cMainCmd, C.uint(subCmd), + addr, &size); int32(retCode) != common.Success { + return common.HccsStatisticInfo{}, buildDcmiErr(cardID, deviceID, "hccs statistic", retCode) + } + return convertHccsStatisticInfoStruct(hccsStatisticInfo), nil +} + +// DcGetHccsStatisticInfoU64 get HCCS statistic info +func (d *DcManager) DcGetHccsStatisticInfoU64(cardID, deviceID int32) (common.HccsStatisticInfo, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.HccsStatisticInfo{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var cMainCmd = C.enum_dcmi_main_cmd(MainCmdHccs) + subCmd := HccsSubCmdGetStatisticInfoU64 + var hccsStatisticInfo C.struct_dcmi_hccs_statistic_info_u64 + // Use a secure function to get the address (for cleanCode) + addr, err := getAddrWithOffset(unsafe.Pointer(&hccsStatisticInfo), unsafe.Sizeof(hccsStatisticInfo), 0) + if err != nil { + return common.HccsStatisticInfo{}, fmt.Errorf("get hccsStatisticInfo addr failed, error is: %v", err) + } + size := C.uint(unsafe.Sizeof(hccsStatisticInfo)) + if retCode := 
C.dcmi_get_device_info(C.int(cardID), C.int(deviceID), cMainCmd, C.uint(subCmd), + addr, &size); int32(retCode) != common.Success { + return common.HccsStatisticInfo{}, buildDcmiErr(cardID, deviceID, "hccs statistic", retCode) + } + return convertHccsStatisticInfoStructU64(hccsStatisticInfo), nil +} + +func convertHccsStatisticInfoStruct(hccsStatisticInfo C.struct_dcmi_hccs_statistic_info) common.HccsStatisticInfo { + cgoHccsStatisticInfo := common.HccsStatisticInfo{} + for i := uint32(0); i < dcmiHccsMaxPcsNum; i++ { + cgoHccsStatisticInfo.TxCnt = append(cgoHccsStatisticInfo.TxCnt, uint64(hccsStatisticInfo.tx_cnt[i])) + cgoHccsStatisticInfo.CrcErrCnt = append(cgoHccsStatisticInfo.CrcErrCnt, uint64(hccsStatisticInfo.crc_err_cnt[i])) + cgoHccsStatisticInfo.RxCnt = append(cgoHccsStatisticInfo.RxCnt, uint64(hccsStatisticInfo.rx_cnt[i])) + } + return cgoHccsStatisticInfo +} + +func convertHccsStatisticInfoStructU64(hccsStatisticInfo C.struct_dcmi_hccs_statistic_info_u64) common.HccsStatisticInfo { + cgoHccsStatisticInfo := common.HccsStatisticInfo{} + for i := uint32(0); i < dcmiHccsMaxPcsNum; i++ { + cgoHccsStatisticInfo.TxCnt = append(cgoHccsStatisticInfo.TxCnt, uint64(hccsStatisticInfo.tx_cnt[i])) + cgoHccsStatisticInfo.CrcErrCnt = append(cgoHccsStatisticInfo.CrcErrCnt, uint64(hccsStatisticInfo.crc_err_cnt[i])) + cgoHccsStatisticInfo.RxCnt = append(cgoHccsStatisticInfo.RxCnt, uint64(hccsStatisticInfo.rx_cnt[i])) + } + return cgoHccsStatisticInfo +} + +// DcGetHccsBandwidthInfo get HCCS bandwidth info +func (d *DcManager) DcGetHccsBandwidthInfo(cardID int32, deviceID int32, + profilingTime int) (common.HccsBandwidthInfo, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.HccsBandwidthInfo{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var hccsBandwidthInfo C.struct_dcmi_hccs_bandwidth_info + hccsBandwidthInfo.profiling_time = C.int(profilingTime) + if retCode := 
C.dcmi_get_hccs_link_bandwidth_info(C.int(cardID), C.int(deviceID), + &hccsBandwidthInfo); int32(retCode) != common.Success { + return common.HccsBandwidthInfo{}, buildDcmiErr(cardID, deviceID, "hccs bandwidth", retCode) + } + return convertHccsBandwidthInfoStruct(hccsBandwidthInfo), nil +} + +func convertHccsBandwidthInfoStruct(hccsBandwidthInfo C.struct_dcmi_hccs_bandwidth_info) common.HccsBandwidthInfo { + cgoHccsBWInfo := common.HccsBandwidthInfo{} + cgoHccsBWInfo.ProfilingTime = uint32(hccsBandwidthInfo.profiling_time) + cgoHccsBWInfo.TotalTxbw = float64(hccsBandwidthInfo.total_txbw) + cgoHccsBWInfo.TotalRxbw = float64(hccsBandwidthInfo.total_rxbw) + for i := uint32(0); i < dcmiHccsMaxPcsNum; i++ { + cgoHccsBWInfo.TxBandwidth = append(cgoHccsBWInfo.TxBandwidth, float64(hccsBandwidthInfo.tx_bandwidth[i])) + cgoHccsBWInfo.RxBandwidth = append(cgoHccsBWInfo.RxBandwidth, float64(hccsBandwidthInfo.rx_bandwidth[i])) + } + return cgoHccsBWInfo +} + +// DcGetSioInfo get SIO info +func (d *DcManager) DcGetSioInfo(cardID, deviceID int32) (common.SioCrcErrStatisticInfo, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.SioCrcErrStatisticInfo{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var cMainCmd = C.enum_dcmi_main_cmd(MainCmdSio) + subCmd := SioSubCmdCrcErrStatistics + var sioInfo C.struct_dcmi_sio_crc_err_statistic_info + // Use a secure function to get the address (for cleanCode) + addr, err := getAddrWithOffset(unsafe.Pointer(&sioInfo), unsafe.Sizeof(sioInfo), 0) + if err != nil { + return common.SioCrcErrStatisticInfo{}, fmt.Errorf("get sioInfo addr failed, error is: %v", err) + } + size := C.uint(unsafe.Sizeof(sioInfo)) + if retCode := C.dcmi_get_device_info(C.int(cardID), C.int(deviceID), cMainCmd, C.uint(subCmd), + addr, &size); int32(retCode) != common.Success { + return common.SioCrcErrStatisticInfo{}, buildDcmiErr(cardID, deviceID, "super pod sio", retCode) + } + return 
convertSioInfoStruct(sioInfo), nil +} + +func convertSioInfoStruct(sPodSioInfo C.struct_dcmi_sio_crc_err_statistic_info) common.SioCrcErrStatisticInfo { + cgoSPodSioInfo := common.SioCrcErrStatisticInfo{ + TxErrCnt: int64(sPodSioInfo.tx_error_count), + RxErrCnt: int64(sPodSioInfo.rx_error_count), + } + for i := uint32(0); i < dcmiMaxReserveNum; i++ { + cgoSPodSioInfo.Reserved = append(cgoSPodSioInfo.Reserved, uint32(sPodSioInfo.reserved[i])) + } + return cgoSPodSioInfo +} + +func (d *DcManager) getInputType(inputType common.DcmiDeviceType) (C.enum_dcmi_device_type, error) { + switch inputType { + case common.DcmiDeviceTypeDDR: + return C.DCMI_DEVICE_TYPE_DDR, nil + case common.DcmiDeviceTypeSRAM: + return C.DCMI_DEVICE_TYPE_SRAM, nil + case common.DcmiDeviceTypeHBM: + return C.DCMI_DEVICE_TYPE_HBM, nil + case common.DcmiDeviceTypeNPU: + return C.DCMI_DEVICE_TYPE_NPU, nil + case common.DcmiDeviceTypeNONE: + return C.DCMI_DEVICE_TYPE_NONE, nil + default: + return C.DCMI_DEVICE_TYPE_NONE, fmt.Errorf("invalid input type for getting device ecc info") + } +} + +// Define a safe function to get address offsets (for cleanCode) +func getAddrWithOffset(addr unsafe.Pointer, length uintptr, offset uintptr) (unsafe.Pointer, error) { + if offset > length { + return nil, fmt.Errorf("offset(%d) is invalid, length(%d)", offset, length) + } + return (unsafe.Pointer)(uintptr(addr) + offset), nil +} + +// DcGetDeviceMainBoardInfo return mainboardId of device +func (d *DcManager) DcGetDeviceMainBoardInfo(cardID, deviceID int32) (uint32, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return 0, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var cMainBoardId C.uint + if retCode := C.dcmi_get_mainboard_id(C.int(cardID), C.int(deviceID), + &cMainBoardId); int32(retCode) != common.Success { + return 0, buildDcmiErr(cardID, deviceID, "mainBoardId", retCode) + } + + return uint32(cMainBoardId), nil +} +func buildDcmiErr(cardID, deviceID int32, 
msg string, errCode C.int) error { + errDesc, ok := dcmiErrMap[int32(errCode)] + if !ok { + errDesc = "unknown error code" + } + return fmt.Errorf("cardID(%d),deviceID(%d):get %s info failed,error code: %v,error desc: %v", + cardID, deviceID, msg, errCode, errDesc) +} + +// DcGetSuperPodStatus get super pod status +func (d *DcManager) DcGetSuperPodStatus(cardID, deviceID int32, sdid uint32) (int, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return 0, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var status C.uint + if retCode := C.dcmi_get_spod_node_status(C.int(cardID), C.int(deviceID), + C.unsigned(sdid), &status); int32(retCode) != common.Success { + return 0, buildDcmiErr(cardID, deviceID, "GetSuperPodStatus", retCode) + } + return int(status), nil +} + +// DcSetSuperPodStatus set super pod status +func (d *DcManager) DcSetSuperPodStatus(cardID, deviceID int32, sdid, status uint32) error { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + if retCode := C.dcmi_set_spod_node_status(C.int(cardID), C.int(deviceID), + C.uint(sdid), C.uint(status)); int32(retCode) != common.Success { + return buildDcmiErr(cardID, deviceID, "DcSetSuperPodStatus", retCode) + } + return nil +} + +// DcGetCardElabelV2 get card elabel information +func (d *DcManager) DcGetCardElabelV2(cardID int32) (common.ElabelInfo, error) { + if !common.IsValidCardID(cardID) { + return common.ElabelInfo{}, fmt.Errorf("cardID(%d) is invalid", cardID) + } + var elabelInfo C.struct_dcmi_elabel_info + if retCode := C.dcmi_get_card_elabel_v2(C.int(cardID), &elabelInfo); int32(retCode) != common.Success { + return common.ElabelInfo{}, fmt.Errorf("cardID(%d): get elabel info failed, error code: %v", cardID, retCode) + } + return common.ElabelInfo{ + ProductName: C.GoString(&elabelInfo.product_name[0]), + Model: C.GoString(&elabelInfo.model[0]), + Manufacturer: 
C.GoString(&elabelInfo.manufacturer[0]), + ManufacturerDate: C.GoString(&elabelInfo.manufacturer_date[0]), + SerialNumber: C.GoString(&elabelInfo.serial_number[0]), + }, nil +} diff --git a/mind-cluster/component/ascend-common/devmanager/dcmi/dcmi_interface_api.h b/mind-cluster/component/ascend-common/devmanager/dcmi/dcmi_interface_api.h new file mode 100644 index 0000000..7ffe468 --- /dev/null +++ b/mind-cluster/component/ascend-common/devmanager/dcmi/dcmi_interface_api.h @@ -0,0 +1,596 @@ +/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/

#ifndef __DCMI_INTERFACE_API_H__
#define __DCMI_INTERFACE_API_H__

#ifdef __cplusplus
#if __cplusplus
extern "C" {
#endif
#endif /* __cplusplus */

/* Expands to `static`, giving internal linkage to declarations that carry it. */
#define DCMIDLLEXPORT static

#define MAX_CHIP_NAME_LEN 32 // Size in bytes of each chip name/type/version buffer
#define TEMPLATE_NAME_LEN 32 // Size of dcmi_create_vdev_res_stru.template_name
#define DIE_ID_COUNT 5 // Number of unsigned int elements in dcmi_die_id.soc_die
#define AGENTDRV_PROF_DATA_NUM 3
#define MAX_LENGTH 256 // Maximum length for elabel info fields

/*----------------------------------------------*
 * Structure description                        *
 *----------------------------------------------*/
/* Chip identification strings plus the AI core count. */
struct dcmi_chip_info {
    unsigned char chip_type[MAX_CHIP_NAME_LEN];
    unsigned char chip_name[MAX_CHIP_NAME_LEN];
    unsigned char chip_ver[MAX_CHIP_NAME_LEN];
    unsigned int aicore_cnt;
};

/* Same leading fields as dcmi_chip_info, followed by an npu_name buffer. */
struct dcmi_chip_info_v2 {
    unsigned char chip_type[MAX_CHIP_NAME_LEN];
    unsigned char chip_name[MAX_CHIP_NAME_LEN];
    unsigned char chip_ver[MAX_CHIP_NAME_LEN];
    unsigned int aicore_cnt;
    unsigned char npu_name[MAX_CHIP_NAME_LEN];
};

/* PCIe identity of a device: vendor/device IDs and the domain:bus:device.function address. */
struct dcmi_pcie_info_all {
    unsigned int venderid; /* vendor ID */
    unsigned int subvenderid; /* sub-vendor ID */
    unsigned int deviceid; /* device ID */
    unsigned int subdeviceid; /* sub-device ID */
    int domain;
    unsigned int bdf_busid;
    unsigned int bdf_deviceid;
    unsigned int bdf_funcid;
    unsigned char reserve[32]; /* the size of dcmi_pcie_info_all is 64 */
};

/* Die ID, stored as DIE_ID_COUNT unsigned int values. */
struct dcmi_die_id {
    unsigned int soc_die[DIE_ID_COUNT];
};

/* ECC enable flag and single/double-bit error and isolated-page counters. */
struct dcmi_ecc_info {
    int enable_flag;
    unsigned int single_bit_error_cnt;
    unsigned int double_bit_error_cnt;
    unsigned int total_single_bit_error_cnt;
    unsigned int total_double_bit_error_cnt;
    unsigned int single_bit_isolated_pages_cnt;
    unsigned int double_bit_isolated_pages_cnt;
    unsigned int single_bit_next_isolated_pages_cnt;
    unsigned int double_bit_next_isolated_pages_cnt;
};

/* HBM size, frequency, usage, temperature and bandwidth utilisation rate. */
struct dcmi_hbm_info {
    unsigned long long memory_size;
    unsigned int freq;
    unsigned long long memory_usage;
    int temp;
    unsigned int bandwith_util_rate;
};

/* Device memory information, including huge page accounting. */
struct dcmi_get_memory_info_stru {
    unsigned long long memory_size; /* unit:MB */
    unsigned long long memory_available; /* free + hugepages_free * hugepagesize */
    unsigned int freq;
    unsigned long hugepagesize; /* unit:KB */
    unsigned long hugepages_total;
    unsigned long hugepages_free;
    unsigned int utiliza; /* ddr memory info usages */
    /* NOTE(review): the 96-byte total below only adds up when unsigned long is 4 bytes; confirm the target ABI. */
    unsigned char reserve[60]; /* the size of dcmi_memory_info is 96 */
};

/* Address family selector stored in dcmi_ip_addr.ip_type. */
enum dcmi_ip_addr_type {
    DCMI_IPADDR_TYPE_V4 = 0, /** IPv4 */
    DCMI_IPADDR_TYPE_V6 = 1, /** IPv6 */
    DCMI_IPADDR_TYPE_ANY = 2 /** IPv4+IPv6 ("dual-stack") */
};

/* IP address: ip4 and ip6 share the same storage; ip_type records the address family. */
struct dcmi_ip_addr {
    union {
        unsigned char ip6[16];
        unsigned char ip4[4];
    } u_addr;
    enum dcmi_ip_addr_type ip_type;
};

enum dcmi_unit_type {
    NPU_TYPE = 0,
    MCU_TYPE = 1,
    CPU_TYPE = 2,
    INVALID_TYPE = 0xFF
};

/* Result codes of the device network health detection. */
enum dcmi_rdfx_detect_result {
    DCMI_RDFX_DETECT_OK = 0,
    DCMI_RDFX_DETECT_SOCK_FAIL = 1,
    DCMI_RDFX_DETECT_RECV_TIMEOUT = 2,
    DCMI_RDFX_DETECT_UNREACH = 3,
    DCMI_RDFX_DETECT_TIME_EXCEEDED = 4,
    DCMI_RDFX_DETECT_FAULT = 5,
    DCMI_RDFX_DETECT_INIT = 6,
    DCMI_RDFX_DETECT_THREAD_ERR = 7,
    DCMI_RDFX_DETECT_IP_SET = 8,
    DCMI_RDFX_DETECT_MAX = 0xFF
};

enum dcmi_port_type {
    DCMI_VNIC_PORT = 0,
    DCMI_ROCE_PORT = 1,
    DCMI_INVALID_PORT /* implicitly 2 */
};

/* Main command selector. Enumerators without an initialiser take the previous value plus one. */
enum dcmi_main_cmd {
    DCMI_MAIN_CMD_DVPP = 0,
    DCMI_MAIN_CMD_ISP, /* 1 */
    DCMI_MAIN_CMD_TS_GROUP_NUM, /* 2 */
    DCMI_MAIN_CMD_CAN, /* 3 */
    DCMI_MAIN_CMD_UART, /* 4 */
    DCMI_MAIN_CMD_UPGRADE = 5,
    DCMI_MAIN_CMD_HCCS = 16,
    DCMI_MAIN_CMD_TEMP = 50,
    DCMI_MAIN_CMD_SVM = 51,
    DCMI_MAIN_CMD_VDEV_MNG, /* 52 */
    DCMI_MAIN_CMD_SIO = 56,
    DCMI_MAIN_CMD_DEVICE_SHARE = 0x8001,
    DCMI_MAIN_CMD_MAX /* 0x8002 */
};

enum dcmi_freq_type {
    DCMI_FREQ_DDR = 1,
    DCMI_FREQ_CTRLCPU = 2,
    DCMI_FREQ_HBM = 6,
    DCMI_FREQ_AICORE_CURRENT_ = 7, /* NOTE(review): trailing underscore in the name looks unintentional; confirm before renaming */
    DCMI_FREQ_AICORE_MAX = 9,
    DCMI_FREQ_VECTORCORE_CURRENT = 12
};

enum dcmi_reset_channel {
    OUTBAND_CHANNEL = 0, // out-of-band reset
    INBAND_CHANNEL // in-band reset (implicitly 1)
};

enum dcmi_boot_status {
    DCMI_BOOT_STATUS_UNINIT = 0, // not initialised
    DCMI_BOOT_STATUS_BIOS, // BIOS starting (implicitly 1)
    DCMI_BOOT_STATUS_OS, // OS starting (implicitly 2)
    DCMI_BOOT_STATUS_FINISH // started (implicitly 3)
};

/* Memory/device kind selector; the first four enumerators are implicitly 0..3. */
enum dcmi_device_type {
    DCMI_DEVICE_TYPE_DDR,
    DCMI_DEVICE_TYPE_SRAM,
    DCMI_DEVICE_TYPE_HBM,
    DCMI_DEVICE_TYPE_NPU,
    DCMI_DEVICE_TYPE_NONE = 0xff
};

enum dcmi_event_type {
    DCMI_DMS_FAULT_EVENT = 0,
};

/* Die kind selector: NDIE is implicitly 0 and VDIE implicitly 1. */
enum dcmi_die_type {
    NDIE,
    VDIE
};

#define DCMI_VDEV_RES_NAME_LEN 16
#define DCMI_VDEV_SIZE 20
#define DCMI_VDEV_FOR_RESERVE 32
#define DCMI_SOC_SPLIT_MAX 32
#define DCMI_MAX_EVENT_NAME_LENGTH 256
#define DCMI_MAX_EVENT_DATA_LENGTH 32
/* Bit flags for event filtering: bit 0 = event ID, bit 1 = severity, bit 2 = node type. */
#define DCMI_EVENT_FILTER_FLAG_EVENT_ID (1UL << 0)
#define DCMI_EVENT_FILTER_FLAG_SERVERITY (1UL << 1)
#define DCMI_EVENT_FILTER_FLAG_NODE_TYPE (1UL << 2)
#define DCMI_MAX_EVENT_RESV_LENGTH 32
#define HCCS_MAX_PCS_NUM 16
#define HCCS_RES_PCS_NUM 64
#define IP_ADDR_LIST_LEN 1024
#define HCCS_PING_MESH_MAX_NUM 48
#define ADDR_MAX_LEN 16

/* Base resource description of a virtual device. */
struct dcmi_base_resource {
    unsigned long long token;
    unsigned long long token_max;
    unsigned long long task_timeout;
    unsigned int vfg_id;
    unsigned char vip_mode;
    unsigned char reserved[DCMI_VDEV_FOR_RESERVE - 1]; /* bytes aligned */
};

/* total types of computing resource */
struct dcmi_computing_resource {
    /* accelerator resource */
    float aic;
    float aiv;
    unsigned short dsa;
    unsigned short rtsq;
    unsigned short acsq;
    unsigned short cdqm;
    unsigned short c_core;
    unsigned short ffts;
    unsigned short sdma;
    unsigned short pcie_dma;

    /* memory resource, MB as unit */
    unsigned long long memory_size;

    /* id resource */
    unsigned int event_id;
    unsigned int notify_id;
    unsigned int stream_id;
    unsigned int model_id;

    /* cpu resource */
    unsigned short topic_schedule_aicpu;
    unsigned short host_ctrl_cpu;
    unsigned short host_aicpu;
    unsigned short device_aicpu;
    unsigned short topic_ctrl_cpu_slot;

    /* vnpu resource */
    unsigned int vdev_aicore_utilization;
+ unsigned long long vdev_memory_total; + unsigned long long vdev_memory_free; + + unsigned char reserved[DCMI_VDEV_FOR_RESERVE-DCMI_VDEV_SIZE]; +}; + +struct dcmi_media_resource { + /* dvpp resource */ + float jpegd; + float jpege; + float vpc; + float vdec; + float pngd; + float venc; + unsigned char reserved[DCMI_VDEV_FOR_RESERVE]; +}; + +struct dcmi_create_vdev_out { + unsigned int vdev_id; + unsigned int pcie_bus; + unsigned int pcie_device; + unsigned int pcie_func; + unsigned int vfg_id; + unsigned char reserved[DCMI_VDEV_FOR_RESERVE]; +}; + +struct dcmi_create_vdev_res_stru { + unsigned int vdev_id; + unsigned int vfg_id; + char template_name[TEMPLATE_NAME_LEN]; + unsigned char reserved[64]; +}; + +struct dcmi_vdev_query_info { + char name[DCMI_VDEV_RES_NAME_LEN]; + unsigned int status; + unsigned int is_container_used; + unsigned int vfid; + unsigned int vfg_id; + unsigned long long container_id; + struct dcmi_base_resource base; + struct dcmi_computing_resource computing; + struct dcmi_media_resource media; +}; + +/* for single search */ +struct dcmi_vdev_query_stru { + unsigned int vdev_id; + struct dcmi_vdev_query_info query_info; +}; + +struct dcmi_soc_free_resource { + unsigned int vfg_num; + unsigned int vfg_bitmap; + struct dcmi_base_resource base; + struct dcmi_computing_resource computing; + struct dcmi_media_resource media; +}; + +struct dcmi_soc_total_resource { + unsigned int vdev_num; + unsigned int vdev_id[DCMI_SOC_SPLIT_MAX]; + unsigned int vfg_num; + unsigned int vfg_bitmap; + struct dcmi_base_resource base; + struct dcmi_computing_resource computing; + struct dcmi_media_resource media; +}; + +struct dcmi_spod_info { + unsigned int sdid; + unsigned int scale_type; + unsigned int super_pod_id; + unsigned int server_id; + unsigned int reserve[8]; +}; + +struct dcmi_dms_fault_event { + unsigned int event_id; /* Event ID */ + unsigned short deviceid; /* Device ID */ + unsigned char node_type; /* Node type */ + unsigned char node_id; /* Node ID 
*/ + unsigned char sub_node_type; /* Subnode type */ + unsigned char sub_node_id; /* Subnode ID */ + unsigned char severity; /* Event severity. 0: warning; 1: minor; 2: major; 3: critical */ + unsigned char assertion; /* Event type. 0: fault recovery; 1: fault generation; 2: one-off event */ + int event_serial_num; /* Alarm serial number */ + int notify_serial_num; /* Notification serial number*/ + /* Time when the event occurs, presenting as the number of seconds that have elapsed since the Unix epoch. */ + unsigned long long alarm_raised_time; + char event_name[DCMI_MAX_EVENT_NAME_LENGTH]; /* Event description */ + char additional_info[DCMI_MAX_EVENT_DATA_LENGTH]; /* Additional event information */ + unsigned char resv[DCMI_MAX_EVENT_RESV_LENGTH]; /**< Reserves 32 bytes */ +}; + +struct dcmi_event { + enum dcmi_event_type type; /* Event type */ + union { + struct dcmi_dms_fault_event dms_event; /* Event content */ + } event_t; +}; + +struct dcmi_event_filter { + /* It can be used to enable one or all filter criteria. The filter criteria are as follows: + 0: disables the filter criteria. + DCMI_EVENT_FILTER_FLAG_EVENT_ID: receives only specified events. + DCMI_EVENT_FILTER_FLAG_SERVERITY: receives only the events of a specified level and higher levels. + DCMI_EVENT_FILTER_FLAG_NODE_TYPE: receives only events of a specified node type. */ + unsigned long long filter_flag; + /* Receives a specified event. For details, see the Health Management Error Definition. */ + unsigned int event_id; + /* Receives events of a specified level and higher levels. For details, + see the severity definition in the struct dcmi_dms_fault_event structure. */ + unsigned char severity; + /* Receives only events of a specified node type. For details, see the Health Management Error Definition. */ + unsigned char node_type; + unsigned char resv[DCMI_MAX_EVENT_RESV_LENGTH]; /* < Reserves 32 bytes. 
*/ +}; + +struct dcmi_proc_mem_info { + int proc_id; + // unit is byte + unsigned long proc_mem_usage; +}; + +struct dcmi_board_info { + unsigned int board_id; + unsigned int pcb_id; + unsigned int bom_id; + unsigned int slot_id; // slot_id indicates pcie slot ID of the chip +}; + +struct dcmi_pcie_link_bandwidth_info { + int profiling_time; + unsigned int tx_p_bw[AGENTDRV_PROF_DATA_NUM]; + unsigned int tx_np_bw[AGENTDRV_PROF_DATA_NUM]; + unsigned int tx_cpl_bw[AGENTDRV_PROF_DATA_NUM]; + unsigned int tx_np_lantency[AGENTDRV_PROF_DATA_NUM]; + unsigned int rx_p_bw[AGENTDRV_PROF_DATA_NUM]; + unsigned int rx_np_bw[AGENTDRV_PROF_DATA_NUM]; + unsigned int rx_cpl_bw[AGENTDRV_PROF_DATA_NUM]; +}; + +struct dcmi_hccs_statistic_info { + unsigned int tx_cnt[HCCS_MAX_PCS_NUM]; + unsigned int rx_cnt[HCCS_MAX_PCS_NUM]; + unsigned int crc_err_cnt[HCCS_MAX_PCS_NUM]; + unsigned int retry_cnt[HCCS_MAX_PCS_NUM]; + unsigned int reserved_field_cnt[HCCS_RES_PCS_NUM]; +}; + +struct dcmi_hccs_statistic_info_u64 { + unsigned long long tx_cnt[HCCS_MAX_PCS_NUM]; + unsigned long long rx_cnt[HCCS_MAX_PCS_NUM]; + unsigned long long crc_err_cnt[HCCS_MAX_PCS_NUM]; + unsigned long long retry_cnt[HCCS_MAX_PCS_NUM]; + unsigned long long reserved[HCCS_RES_PCS_NUM]; +}; + +struct dcmi_hccs_bandwidth_info { + int profiling_time; + double total_txbw; + double total_rxbw; + double tx_bandwidth[HCCS_MAX_PCS_NUM]; + double rx_bandwidth[HCCS_MAX_PCS_NUM]; +}; + +struct dcmi_sio_crc_err_statistic_info { + unsigned short tx_error_count; + unsigned short rx_error_count; + unsigned char reserved[8]; +}; + +struct dcmi_elabel_info { + char product_name[MAX_LENGTH]; + char model[MAX_LENGTH]; + char manufacturer[MAX_LENGTH]; + char manufacturer_date[MAX_LENGTH]; + char serial_number[MAX_LENGTH]; +}; + +struct dcmi_hccsping_mesh_operate { + char dst_addr_list[IP_ADDR_LIST_LEN]; + int pkt_size; + int pkt_send_num; + int pkt_interval; + int timeout; + int task_interval; + int task_id; +}; + +struct 
dcmi_hccsping_mesh_info { + char dst_addr[HCCS_PING_MESH_MAX_NUM][ADDR_MAX_LEN]; + unsigned int suc_pkt_num[HCCS_PING_MESH_MAX_NUM]; + unsigned int fail_pkt_num[HCCS_PING_MESH_MAX_NUM]; + long max_time[HCCS_PING_MESH_MAX_NUM]; + long min_time[HCCS_PING_MESH_MAX_NUM]; + long avg_time[HCCS_PING_MESH_MAX_NUM]; + long tp95_time[HCCS_PING_MESH_MAX_NUM]; + int reply_stat_num[HCCS_PING_MESH_MAX_NUM]; + unsigned long long ping_total_num[HCCS_PING_MESH_MAX_NUM]; + int dest_num; +}; + +#define DCMI_VERSION_1 +#define DCMI_VERSION_2 + +#if defined DCMI_VERSION_2 + +DCMIDLLEXPORT int dcmi_init(void); + +DCMIDLLEXPORT int dcmi_get_card_list(int *card_num, int *card_list, int list_len); + +DCMIDLLEXPORT int dcmi_get_device_num_in_card(int card_id, int *device_num); + +DCMIDLLEXPORT int dcmi_get_device_id_in_card(int card_id, int *device_id_max, int *mcu_id, int *cpu_id); + +DCMIDLLEXPORT int dcmi_get_device_type(int card_id, int device_id, enum dcmi_unit_type *device_type); + +DCMIDLLEXPORT int dcmi_get_device_pcie_info_v2(int card_id, int device_id, struct dcmi_pcie_info_all *pcie_info); + +DCMIDLLEXPORT int dcmi_get_device_chip_info(int card_id, int device_id, struct dcmi_chip_info *chip_info); + +DCMIDLLEXPORT int dcmi_get_device_chip_info_v2(int card_id, int device_id, struct dcmi_chip_info_v2 *chip_info); + +DCMIDLLEXPORT int dcmi_get_device_power_info(int card_id, int device_id, int *power); + +DCMIDLLEXPORT int dcmi_get_device_health(int card_id, int device_id, unsigned int *health); + +DCMIDLLEXPORT int dcmi_get_device_errorcode_v2( + int card_id, int device_id, int *error_count, unsigned int *error_code_list, unsigned int list_len); + +DCMIDLLEXPORT int dcmi_get_device_temperature(int card_id, int device_id, int *temperature); + +DCMIDLLEXPORT int dcmi_get_device_voltage(int card_id, int device_id, unsigned int *voltage); + +DCMIDLLEXPORT int dcmi_get_device_ecc_info(int card_id, int device_id, enum dcmi_device_type input_type, + struct dcmi_ecc_info *device_ecc_info); 
+ +DCMIDLLEXPORT int dcmi_get_device_frequency( + int card_id, int device_id, enum dcmi_freq_type input_type, unsigned int *frequency); + +DCMIDLLEXPORT int dcmi_get_device_hbm_info(int card_id, int device_id, struct dcmi_hbm_info *hbm_info); + +DCMIDLLEXPORT int dcmi_get_device_memory_info_v3(int card_id, int device_id, + struct dcmi_get_memory_info_stru *memory_info); + +DCMIDLLEXPORT int dcmi_get_device_utilization_rate( + int card_id, int device_id, int input_type, unsigned int *utilization_rate); + +DCMIDLLEXPORT int dcmi_get_device_info( + int card_id, int device_id, enum dcmi_main_cmd main_cmd, unsigned int sub_cmd, void *buf, unsigned int *size); + +DCMIDLLEXPORT int dcmi_get_device_ip(int card_id, int device_id, enum dcmi_port_type input_type, int port_id, + struct dcmi_ip_addr *ip, struct dcmi_ip_addr *mask); + +DCMIDLLEXPORT int dcmi_get_device_network_health(int card_id, int device_id, enum dcmi_rdfx_detect_result *result); + +DCMIDLLEXPORT int dcmi_get_device_logic_id(int *device_logic_id, int card_id, int device_id); + +DCMIDLLEXPORT int dcmi_create_vdevice(int card_id, int device_id, struct dcmi_create_vdev_res_stru *vdev, + struct dcmi_create_vdev_out *out); + +DCMIDLLEXPORT int dcmi_set_destroy_vdevice(int card_id, int device_id, unsigned int vdevid); + +DCMIDLLEXPORT int dcmi_get_device_phyid_from_logicid(unsigned int logicid, unsigned int *phyid); + +DCMIDLLEXPORT int dcmi_get_device_logicid_from_phyid(unsigned int phyid, unsigned int *logicid); + +DCMIDLLEXPORT int dcmi_get_card_id_device_id_from_logicid(int *card_id, int *device_id, unsigned int device_logic_id); + +DCMIDLLEXPORT int dcmi_get_card_id_device_id_from_phyid(int *card_id, int *device_id, unsigned int device_phy_id); + +DCMIDLLEXPORT int dcmi_get_product_type(int card_id, int device_id, char *product_type_str, int buf_size); + +DCMIDLLEXPORT int dcmi_set_device_reset(int card_id, int device_id, enum dcmi_reset_channel channel_type); + +DCMIDLLEXPORT int 
dcmi_get_device_outband_channel_state(int card_id, int device_id, int* channel_state); + +DCMIDLLEXPORT int dcmi_pre_reset_soc(int card_id, int device_id); + +DCMIDLLEXPORT int dcmi_rescan_soc(int card_id, int device_id); + +DCMIDLLEXPORT int dcmi_get_netdev_brother_device(int card_id, int device_id, int* brother_card_id); + +DCMIDLLEXPORT int dcmi_get_device_boot_status(int card_id, int device_id, enum dcmi_boot_status *boot_status); + +DCMIDLLEXPORT int dcmi_subscribe_fault_event(int card_id, int device_id, struct dcmi_event_filter filter); + +DCMIDLLEXPORT int dcmi_get_npu_work_mode(int card_id, unsigned char *work_mode); + +DCMIDLLEXPORT int dcmi_get_device_die_v2( + int card_id, int device_id, enum dcmi_die_type input_type, struct dcmi_die_id *die_id); + +DCMIDLLEXPORT int dcmi_get_device_resource_info (int card_id, int device_id, struct dcmi_proc_mem_info *proc_info, + int *proc_num); + +DCMIDLLEXPORT int dcmi_get_device_board_info (int card_id, int device_id, struct dcmi_board_info *board_info); + +DCMIDLLEXPORT int dcmi_get_pcie_link_bandwidth_info(int card_id, int device_id, + struct dcmi_pcie_link_bandwidth_info *pcie_link_bandwidth_info); + +DCMIDLLEXPORT int dcmi_get_dcmi_version (char *dcmi_ver, int buf_size); + +DCMIDLLEXPORT int dcmi_get_mainboard_id (int card_id, int device_id, unsigned int *mainboard_id); + +DCMIDLLEXPORT int dcmi_get_hccs_link_bandwidth_info (int card_id, int device_id, struct dcmi_hccs_bandwidth_info *hccs_bandwidth_info); + +DCMIDLLEXPORT int dcmi_start_hccsping_mesh(int card_id, int device_id, int port_id, struct dcmi_hccsping_mesh_operate *hccsping_mesh); + +DCMIDLLEXPORT int dcmi_stop_hccsping_mesh(int card_id, int device_id, int port_id, unsigned int task_id); + +DCMIDLLEXPORT int dcmi_get_hccsping_mesh_info(int card_id, int device_id, int port_id, unsigned int task_id, struct dcmi_hccsping_mesh_info *hccsping_mesh_reply); + +DCMIDLLEXPORT int dcmi_get_hccsping_mesh_state(int card_id, int device_id, int port_id, unsigned int 
task_id, unsigned int *state); + +DCMIDLLEXPORT int dcmi_get_spod_node_status(int card_id, int device_id, unsigned int sdid, unsigned int *status); + +DCMIDLLEXPORT int dcmi_set_spod_node_status(int card_id, int device_id, unsigned int sdid, unsigned int status); + +#endif + +#if defined DCMI_VERSION_1 +/* The following interfaces are V1 version interfaces. They are temporarily retained for compatibility + * and will be deleted in a later version. Please switch to the V2 version interface as soon as possible */ + +struct dcmi_memory_info_stru { + unsigned long long memory_size; + unsigned int freq; + unsigned int utiliza; +}; + +DCMIDLLEXPORT int dcmi_get_memory_info(int card_id, int device_id, struct dcmi_memory_info_stru *device_memory_info); + +DCMIDLLEXPORT int dcmi_get_device_errorcode( + int card_id, int device_id, int *error_count, unsigned int *error_code, int *error_width); + +DCMIDLLEXPORT int dcmi_mcu_get_power_info(int card_id, int *power); + +DCMIDLLEXPORT int dcmi_get_card_elabel_v2(int card_id, struct dcmi_elabel_info *elabel_info); +#endif + +#ifdef __cplusplus +#if __cplusplus +} +#endif +#endif /* __cplusplus */ + +#endif /* __DCMI_INTERFACE_API_H__ */ diff --git a/mind-cluster/component/ascend-common/devmanager/devmanager.go b/mind-cluster/component/ascend-common/devmanager/devmanager.go new file mode 100644 index 0000000..fe21931 --- /dev/null +++ b/mind-cluster/component/ascend-common/devmanager/devmanager.go @@ -0,0 +1,1197 @@ +/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package devmanager this for device driver manager +package devmanager + +import ( + "errors" + "fmt" + "math" + "strings" + "sync" + "time" + + "ascend-common/api" + "ascend-common/common-utils/hwlog" + "ascend-common/devmanager/common" + "ascend-common/devmanager/dcmi" +) + +// DeviceInterface for common device interface +type DeviceInterface interface { + Init() error + ShutDown() error + GetDcmiVersion() string + GetDeviceCount() (int32, error) + GetCardList() (int32, []int32, error) + GetDeviceNumInCard(cardID int32) (int32, error) + GetDeviceList() (int32, []int32, error) + GetChipBaseInfos() ([]*common.ChipBaseInfo, error) + GetDeviceHealth(logicID int32) (uint32, error) + GetDeviceNetWorkHealth(logicID int32) (uint32, error) + GetDeviceUtilizationRate(logicID int32, deviceType common.DeviceType) (uint32, error) + GetDeviceTemperature(logicID int32) (int32, error) + GetDeviceVoltage(logicID int32) (float32, error) + GetDevicePowerInfo(logicID int32) (float32, error) + GetMcuPowerInfo(cardID int32) (float32, error) + GetDeviceFrequency(logicID int32, deviceType common.DeviceType) (uint32, error) + GetDeviceMemoryInfo(logicID int32) (*common.MemoryInfo, error) + GetDeviceHbmInfo(logicID int32) (*common.HbmInfo, error) + GetDeviceErrorCode(logicID int32) (int32, int64, error) + GetChipInfo(logicID int32) (*common.ChipInfo, error) + GetPhysicIDFromLogicID(logicID int32) (int32, error) + GetLogicIDFromPhysicID(physicID int32) (int32, error) + GetDeviceLogicID(cardID, deviceID int32) (int32, error) + GetCardIDDeviceID(logicID int32) (int32, int32, error) + GetDeviceIPAddress(logicID, ipType int32) (string, error) + CreateVirtualDevice(logicID int32, vDevInfo common.CgoCreateVDevRes) (common.CgoCreateVDevOut, error) + GetVirtualDeviceInfo(logicID int32) (common.VirtualDevInfo, error) + DestroyVirtualDevice(logicID int32, vDevID uint32) error + 
GetDevType() string + GetProductTypeArray() []string + GetProductType(cardID, deviceID int32) (string, error) + GetAllProductType() ([]string, error) + GetNpuWorkMode() string + SetDeviceReset(cardID, deviceID int32) error + GetBrotherCardID(int32, int32) (int32, error) + PreResetSoc(int32, int32) error + GetOutBandChannelState(int32, int32) error + SetDeviceResetOutBand(int32, int32) error + RescanSoc(int32, int32) error + GetDeviceBootStatus(logicID int32) (int, error) + GetDeviceAllErrorCode(logicID int32) (int32, []int64, error) + SubscribeDeviceFaultEvent(logicID int32) error + SetFaultEventCallFunc(func(common.DevFaultInfo)) error + GetDieID(logicID int32, dcmiDieType dcmi.DieType) (string, error) + GetDevProcessInfo(logicID int32) (*common.DevProcessInfo, error) + GetPCIeBusInfo(logicID int32) (string, error) + GetBoardInfo(logicID int32) (common.BoardInfo, error) + GetCardElabelV2(cardID int32) (common.ElabelInfo, error) + GetPCIEBandwidth(logicID int32, profilingTime int) (common.PCIEBwStat, error) + SetIsTrainingCard() error + IsTrainingCard() bool + GetValidChipInfo() (common.ChipInfo, error) + GetDeviceEccInfo(logicID int32, dcmiDeviceType common.DcmiDeviceType) (*common.ECCInfo, error) + GetSuperPodInfo(int32) (common.CgoSuperPodInfo, error) + GetSioInfo(logicID int32) (*common.SioCrcErrStatisticInfo, error) + GetHccsStatisticInfo(logicID int32) (*common.HccsStatisticInfo, error) + GetHccsStatisticInfoInU64(logicID int32) (*common.HccsStatisticInfo, error) + GetMainBoardId() uint32 + GetHccsBandwidthInfo(logicID int32) (*common.HccsBandwidthInfo, error) + + DcStartHccsPingMesh(int32, int32, int, common.HccspingMeshOperate) error + DcStopHccsPingMesh(int32, int32, int, uint) error + DcGetHccsPingMeshInfo(int32, int32, int, uint) (*common.HccspingMeshInfo, error) + DcGetHccsPingMeshState(int32, int32, int, uint) (int, error) + DcGetSuperPodStatus(int32, int32, uint32) (int, error) + DcSetSuperPodStatus(int32, int32, uint32, uint32) error +} + +const ( + 
// init dcmi interface max retry times + maxRetries = 6 + // init dcmi interface retry delay + defaultRetryDelay = 10 +) + +var ( + devManager *DeviceManager = nil + devManagerOnce sync.Once + idCache sync.Map +) + +// npuIdMapping the mapping between the three IDs +type npuIdMapping struct { + logicId int32 + cardId int32 + deviceId int32 +} + +// GetDeviceManager singleton to init global device manager and init dcmi interface +func GetDeviceManager(resetTimeout int) (*DeviceManager, error) { + devManagerOnce.Do(func() { + // a common dcmi manager is created to init the dcmi interface; a specific manager can be assigned later + dcMgr := dcmi.DcManager{} + var retryDelay time.Duration = defaultRetryDelay + hwlog.RunLog.Infof("get card list from dcmi reset timeout is %d", resetTimeout) + for currentTime, retryCount := 0, 0; currentTime <= resetTimeout; currentTime += int(retryDelay) { + if err := dcMgr.DcInit(); err != nil { + hwlog.RunLog.Errorf("deviceManager init failed, prepare dcmi failed, err: %v", err) + return + } + cardNum, cardList, err := dcMgr.DcGetCardList() + if err == nil && int(cardNum) == len(cardList) { + hwlog.RunLog.Infof("deviceManager get cardList is %v, cardList length equal to cardNum: %v", + cardList, cardNum) + break + } + if diffTime := float64(resetTimeout - currentTime); diffTime > 0 { + retryDelay = time.Duration(math.Min(float64(defaultRetryDelay), diffTime)) + } + retryCount++ + hwlog.RunLog.Warnf("deviceManager get card list failed (attempt %d), cardNum=%d, cardList=%v, "+ + "err: %v", retryCount, cardNum, cardList, err) + if currentTime+int(retryDelay) <= resetTimeout { + if err = dcMgr.DcShutDown(); err != nil { + hwlog.RunLog.Errorf("deviceManager shut down failed, err: %v", err) + return + } + time.Sleep(retryDelay * time.Second) + continue + } + if int(cardNum) != len(cardList) { + hwlog.RunLog.Warnf("deviceManager get cardList is %v, but cardNum is %v, "+ + "please check whether the real number of npu matches the cardList", 
cardList, cardNum) + } + } + devManager = &DeviceManager{} + devManager.DcMgr = &dcMgr + dcmiVer, err := dcMgr.DcGetDcmiVersion() + if err != nil { + hwlog.RunLog.Warnf("deviceManager get dcmi version failed, err: %v", err) + } + hwlog.RunLog.Infof("the dcmi version is %s", dcmiVer) + devManager.dcmiVersion = dcmiVer + }) + if devManager == nil { + return nil, errors.New("device Manager is nil, may encounter an exception during initialization. " + + "You can check the system log to confirm") + } + return devManager, nil +} + +// DeviceManager common device manager for Ascend910/310P/310 +type DeviceManager struct { + // DcMgr for common dev manager + DcMgr dcmi.DcDriverInterface + // DevType the value is the same as the device type corresponding to the DcMgr variable. + // Options: api.Ascend310,api.Ascend310P,api.Ascend910 + DevType string + // ProductTypes product type in server, multi type will be in 310P mix scene + ProductTypes []string + // isTrainingCard whether the device is used for training + isTrainingCard bool + dcmiVersion string + // mainBoardId used to distinguish between A900A3SuperPod and A9000A3SuperPod + mainBoardId uint32 +} + +// GetProductTypeArray return product types +func (d *DeviceManager) GetProductTypeArray() []string { + return d.ProductTypes +} + +// GetDevType return dev type +func (d *DeviceManager) GetDevType() string { + return d.DevType +} + +// AutoInit auto detect npu chip type and return the corresponding processing object +func AutoInit(dType string, resetTimeout int) (*DeviceManager, error) { + chipInfo, boardInfo, err := getDeviceInfoForInit(resetTimeout) + if err != nil { + return nil, fmt.Errorf("auto init failed, err: %s", err) + } + var devMgr *DeviceManager + if devMgr, err = GetDeviceManager(resetTimeout); err != nil || devMgr == nil { + return nil, err + } + mainBoardId, err := getValidMainBoardInfo(devMgr.DcMgr) + if err != nil { + // Non-blocking when the main board ID is not found + hwlog.RunLog.Warn(err) + } + 
devMgr.mainBoardId = mainBoardId + var devType = common.GetDevType(chipInfo.Name, boardInfo.BoardId) + + switch devType { + case api.Ascend910A, api.Ascend910B, api.Ascend910A3: + devMgr.DcMgr = &A910Manager{} + case api.Ascend310P: + devMgr.DcMgr = &A310PManager{} + case api.Ascend310, api.Ascend310B: + devMgr.DcMgr = &A310Manager{} + default: + return nil, fmt.Errorf("unsupport device type (%s)", devType) + } + hwlog.RunLog.Infof("chipName: %v, devType: %v", chipInfo.Name, devType) + if dType != "" && devType != dType { + return nil, fmt.Errorf("the value of dType(%s) is inconsistent with the actual chip type(%s)", + dType, devType) + } + devMgr.DevType = devType + if err := devMgr.SetIsTrainingCard(); err != nil { + hwlog.RunLog.Errorf("auto recognize training card failed, err: %s", err) + } + + pTypes, err := devMgr.GetAllProductType() + if err != nil { + hwlog.RunLog.Debugf("auto init product types failed, err: %s", err) + } + devMgr.ProductTypes = pTypes + return devMgr, nil +} + +func getDeviceInfoForInit(resetTimeout int) (common.ChipInfo, common.BoardInfo, error) { + var mgr *DeviceManager + var err error + if mgr, err = GetDeviceManager(resetTimeout); err != nil || mgr == nil { + return common.ChipInfo{}, common.BoardInfo{}, fmt.Errorf("get chip info failed, err: %v", err) + } + dcMgr := mgr.DcMgr + chipInfo, err := getValidChipInfo(dcMgr) + if err != nil { + hwlog.RunLog.Error(err) + return common.ChipInfo{}, common.BoardInfo{}, err + } + boardInfo, err := getValidBoardInfo(dcMgr) + if err != nil { + hwlog.RunLog.Error(err) + return chipInfo, common.BoardInfo{}, err + } + + return chipInfo, boardInfo, nil +} + +func getValidChipInfo(dcMgr dcmi.DcDriverInterface) (common.ChipInfo, error) { + // get card list + cardNum, cardList, err := dcMgr.DcGetCardList() + if err != nil { + hwlog.RunLog.Error(err) + return common.ChipInfo{}, fmt.Errorf(common.ErrMsgInitCardListFailed) + } + if cardNum == 0 { + return common.ChipInfo{}, fmt.Errorf("get chip info failed, 
no card found") + } + // get device in card, then get chip info by cardID and deviceID + for _, cardID := range cardList { + devNum, err := dcMgr.DcGetDeviceNumInCard(cardID) + if err != nil || devNum == 0 { + hwlog.RunLog.Debugf("get device num by cardID(%d) failed, error: %v", cardID, err) + continue + } + for devID := int32(0); devID < devNum; devID++ { + chipInfo, err := dcMgr.DcGetChipInfo(cardID, devID) + if err != nil { + hwlog.RunLog.Debugf("get chip info failed by cardID(%d), deviceID(%d), error: %v", cardID, devID, + err) + continue + } + if !common.IsValidChipInfo(chipInfo) { + hwlog.RunLog.Debugf("invalid chip info by cardID(%d), deviceID(%d), error: %v", cardID, devID, + err) + continue + } + return *chipInfo, nil + } + } + return common.ChipInfo{}, errors.New("cannot get valid chip info") +} + +func getValidBoardInfo(dcMgr dcmi.DcDriverInterface) (common.BoardInfo, error) { + // get card list + cardNum, cardList, err := dcMgr.DcGetCardList() + if err != nil { + hwlog.RunLog.Error(err) + return common.BoardInfo{}, fmt.Errorf(common.ErrMsgInitCardListFailed) + } + if cardNum == 0 { + return common.BoardInfo{}, fmt.Errorf(common.ErrMsgGetBoardInfoFailed) + } + // get device in card, then get board info by cardID and deviceID + for _, cardID := range cardList { + devNum, err := dcMgr.DcGetDeviceNumInCard(cardID) + if err != nil || devNum == 0 { + hwlog.RunLog.Debugf("get device num by cardID %d failed, error is: %v", cardID, err) + continue + } + for devID := int32(0); devID < devNum; devID++ { + boardInfo, err := dcMgr.DcGetDeviceBoardInfo(cardID, devID) + if err != nil { + hwlog.RunLog.Debugf("get board info failed by cardID(%d), deviceID(%d), error: %v", cardID, devID, + err) + continue + } + if !common.IsValidBoardInfo(&boardInfo) { + hwlog.RunLog.Debugf("invalid board info by cardID(%d), deviceID(%d), error: %v", cardID, devID, + err) + continue + } + return boardInfo, nil + } + } + return common.BoardInfo{}, errors.New("cannot get valid board info") 
+} +func getValidMainBoardInfo(dcMgr dcmi.DcDriverInterface) (uint32, error) { + // get card list + cardNum, cardList, err := dcMgr.DcGetCardList() + if err != nil { + hwlog.RunLog.Error(err) + return 0, fmt.Errorf(common.ErrMsgInitCardListFailed) + } + if cardNum == 0 { + return 0, fmt.Errorf(common.ErrMsgGetBoardInfoFailed) + } + // get device in card, then get main board id by cardID and deviceID + for _, cardID := range cardList { + devNum, err := dcMgr.DcGetDeviceNumInCard(cardID) + if err != nil || devNum == 0 { + hwlog.RunLog.Debugf("get device num by cardID %d failed, error is: %v", cardID, err) + continue + } + for devID := int32(0); devID < devNum; devID++ { + mainBoardId, err := dcMgr.DcGetDeviceMainBoardInfo(cardID, devID) + if err != nil { + hwlog.RunLog.Debug(err) + continue + } + if !common.IsValidMainBoardInfo(mainBoardId) { + hwlog.RunLog.Warnf("invalid mainBoardId info by cardID(%d), deviceID(%d), error: %v", cardID, devID, err) + continue + } + return mainBoardId, nil + } + } + return 0, errors.New("cannot get main board id") +} + +// Init load symbol and initialize dcmi +func (d *DeviceManager) Init() error { + return d.DcMgr.DcInit() +} + +// ShutDown clean the dynamically loaded resource +func (d *DeviceManager) ShutDown() error { + return d.DcMgr.DcShutDown() +} + +// GetDeviceCount get npu device count +func (d *DeviceManager) GetDeviceCount() (int32, error) { + return d.DcMgr.DcGetDeviceCount() +} + +// GetCardList get all card list +func (d *DeviceManager) GetCardList() (int32, []int32, error) { + return d.DcMgr.DcGetCardList() +} + +// GetDeviceNumInCard get the number of devices in one card +func (d *DeviceManager) GetDeviceNumInCard(cardID int32) (int32, error) { + return d.DcMgr.DcGetDeviceNumInCard(cardID) +} + +// GetDeviceList get all device logicID list +func (d *DeviceManager) GetDeviceList() (int32, []int32, error) { + return d.DcMgr.DcGetLogicIDList() +} + +// GetDeviceHealth query npu device health status +func (d *DeviceManager) 
GetDeviceHealth(logicID int32) (uint32, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + hwlog.RunLog.Error(err) + return common.UnRetError, fmt.Errorf("failed to get health code by logicID(%d)", logicID) + } + healthCode, err := d.DcMgr.DcGetDeviceHealth(cardID, deviceID) + if err != nil { + hwlog.RunLog.Error(err) + return common.UnRetError, err + } + + return uint32(healthCode), nil +} + +// GetDeviceNetWorkHealth query npu device network health status +func (d *DeviceManager) GetDeviceNetWorkHealth(logicID int32) (uint32, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + hwlog.RunLog.Error(err) + return common.UnRetError, fmt.Errorf("failed to get network health code by logicID(%d)", logicID) + } + healthCode, err := d.DcMgr.DcGetDeviceNetWorkHealth(cardID, deviceID) + if err != nil { + hwlog.RunLog.Error(err) + return common.UnRetError, err + } + + return healthCode, nil +} + +// GetDeviceUtilizationRate get npu device utilization +func (d *DeviceManager) GetDeviceUtilizationRate(logicID int32, deviceType common.DeviceType) (uint32, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + hwlog.RunLog.Error(err) + return common.UnRetError, fmt.Errorf("failed to get utilization by logicID(%d)", logicID) + } + rate, err := d.DcMgr.DcGetDeviceUtilizationRate(cardID, deviceID, deviceType) + if err != nil { + return common.UnRetError, err + } + + return uint32(rate), nil +} + +// GetDeviceTemperature get npu device temperature +func (d *DeviceManager) GetDeviceTemperature(logicID int32) (int32, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + hwlog.RunLog.Error(err) + return common.RetError, fmt.Errorf("failed to get temperature by logicID(%d)", logicID) + } + temp, err := d.DcMgr.DcGetDeviceTemperature(cardID, deviceID) + if err != nil { + hwlog.RunLog.Error(err) + return common.RetError, fmt.Errorf("failed to get 
temperature by logicID(%d)", logicID) + } + + return temp, nil +} + +// GetDeviceVoltage get npu device voltage +func (d *DeviceManager) GetDeviceVoltage(logicID int32) (float32, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + hwlog.RunLog.Error(err) + return common.UnRetError, fmt.Errorf("failed to get voltage by logicID(%d)", logicID) + } + voltage, err := d.DcMgr.DcGetDeviceVoltage(cardID, deviceID) + if err != nil { + hwlog.RunLog.Error(err) + return common.UnRetError, fmt.Errorf("failed to get voltage by logicID(%d)", logicID) + } + + return voltage, nil +} + +// GetDevicePowerInfo get npu device power info +func (d *DeviceManager) GetDevicePowerInfo(logicID int32) (float32, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + hwlog.RunLog.Error(err) + return common.UnRetError, fmt.Errorf("failed to get power by logicID(%d)", logicID) + } + power, err := d.DcMgr.DcGetDevicePowerInfo(cardID, deviceID) + if err != nil { + hwlog.RunLog.Error(err) + return common.UnRetError, fmt.Errorf("failed to get power by logicID(%d)", logicID) + } + + return power, nil +} + +// GetDeviceFrequency get npu device work frequency +func (d *DeviceManager) GetDeviceFrequency(logicID int32, deviceType common.DeviceType) (uint32, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + hwlog.RunLog.Error(err) + return common.UnRetError, fmt.Errorf("failed to get frequency by logicID(%d)", logicID) + } + frequency, err := d.DcMgr.DcGetDeviceFrequency(cardID, deviceID, deviceType) + if err != nil { + hwlog.RunLog.Error(err) + return common.UnRetError, fmt.Errorf("failed to get frequency by logicID(%d)", logicID) + } + + return frequency, nil +} + +// GetDeviceMemoryInfo get npu memory information +func (d *DeviceManager) GetDeviceMemoryInfo(logicID int32) (*common.MemoryInfo, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + 
hwlog.RunLog.Error(err) + return nil, fmt.Errorf("failed to get memory info by logicID(%d)", logicID) + } + + // 910B and 910A3 don't have DDR module. Therefore, DDR information cannot be queried. + if d.DevType == api.Ascend910B || d.DevType == api.Ascend910A3 { + hwlog.RunLog.Debugf("%v doesn't have DDR module. Therefore, DDR information cannot be queried", d.DevType) + return nil, nil + } + + memInfo, err := d.DcMgr.DcGetMemoryInfo(cardID, deviceID) + if err != nil { + hwlog.RunLog.Error(err) + return nil, fmt.Errorf("failed to get memory info by logicID(%d)", logicID) + } + + return memInfo, nil +} + +// GetDeviceHbmInfo get npu HBM module memory and frequency information +func (d *DeviceManager) GetDeviceHbmInfo(logicID int32) (*common.HbmInfo, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + hwlog.RunLog.Error(err) + return nil, fmt.Errorf("failed to get hbm info by logicID(%d)", logicID) + } + hbmInfo, err := d.DcMgr.DcGetHbmInfo(cardID, deviceID) + if err != nil { + return nil, err + } + + return hbmInfo, nil +} + +// GetDeviceErrorCode get npu device error code +func (d *DeviceManager) GetDeviceErrorCode(logicID int32) (int32, int64, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + hwlog.RunLog.Error(err) + return common.RetError, common.RetError, fmt.Errorf("failed to get device error code by logicID(%d)", + logicID) + } + errCount, errCode, err := d.DcMgr.DcGetDeviceErrorCode(cardID, deviceID) + if err != nil { + hwlog.RunLog.Error(err) + return common.RetError, common.RetError, fmt.Errorf("failed to get device error code by logicID(%d)", + logicID) + } + + return errCount, errCode, nil +} + +// GetChipInfo get npu device chip info +func (d *DeviceManager) GetChipInfo(logicID int32) (*common.ChipInfo, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + hwlog.RunLog.Error(err) + return nil, fmt.Errorf("failed to get cardID and deviceID by 
logicID(%d), error: %v", logicID, err) + } + chipInfo, err := d.DcMgr.DcGetChipInfo(cardID, deviceID) + if err != nil { + hwlog.RunLog.Error(err) + return nil, fmt.Errorf("failed to get chip info code by logicID(%d)", logicID) + } + + return chipInfo, nil +} + +// GetPhysicIDFromLogicID get device physic id from logic id +func (d *DeviceManager) GetPhysicIDFromLogicID(logicID int32) (int32, error) { + physicID, err := d.DcMgr.DcGetPhysicIDFromLogicID(logicID) + if err != nil { + hwlog.RunLog.Error(err) + return common.RetError, fmt.Errorf("failed to get physicID by logicID(%d)", logicID) + } + + return physicID, nil +} + +// GetLogicIDFromPhysicID get device logic id from physic id +func (d *DeviceManager) GetLogicIDFromPhysicID(physicID int32) (int32, error) { + logicID, err := d.DcMgr.DcGetLogicIDFromPhysicID(physicID) + if err != nil { + hwlog.RunLog.Error(err) + return common.RetError, fmt.Errorf("failed to get logicID by physicID(%d)", physicID) + } + + return logicID, nil +} + +// GetDeviceLogicID get device logic id from card id and device id +func (d *DeviceManager) GetDeviceLogicID(cardID, deviceID int32) (int32, error) { + return d.DcMgr.DcGetDeviceLogicID(cardID, deviceID) +} + +// GetDeviceIPAddress get device ip address +func (d *DeviceManager) GetDeviceIPAddress(logicID, ipType int32) (string, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + return "", fmt.Errorf("failed to get cardID and deviceID by logicID(%d), %w", logicID, err) + } + return d.DcMgr.DcGetDeviceIPAddress(cardID, deviceID, ipType) +} + +// CreateVirtualDevice create virtual device +func (d *DeviceManager) CreateVirtualDevice( + logicID int32, vDevInfo common.CgoCreateVDevRes) (common.CgoCreateVDevOut, error) { + if !common.IsValidTemplateName(d.DevType, vDevInfo.TemplateName) { + return common.CgoCreateVDevOut{}, fmt.Errorf("input invalid template name: %s", vDevInfo.TemplateName) + } + return d.DcMgr.DcCreateVDevice(logicID, vDevInfo) +} + +// 
GetVirtualDeviceInfo get virtual device info +func (d *DeviceManager) GetVirtualDeviceInfo(logicID int32) (common.VirtualDevInfo, error) { + cgoVDevInfo, err := d.DcMgr.DcGetVDeviceInfo(logicID) + if err != nil { + hwlog.RunLog.Debug(err) + return common.VirtualDevInfo{}, fmt.Errorf("get virtual device info failed, error is: %v "+ + "and vdev num is: %d", err, int32(cgoVDevInfo.TotalResource.VDevNum)) + } + for _, vDevInfo := range cgoVDevInfo.VDevInfo { + if !common.IsValidTemplateName(d.DevType, vDevInfo.QueryInfo.Name) { + return common.VirtualDevInfo{}, fmt.Errorf("vdevice id %d, it's template name is invalid: %s", + vDevInfo.VDevID, vDevInfo.QueryInfo.Name) + } + } + return cgoVDevInfo, nil +} + +// DestroyVirtualDevice destroy virtual device +func (d *DeviceManager) DestroyVirtualDevice(logicID int32, vDevID uint32) error { + return d.DcMgr.DcDestroyVDevice(logicID, vDevID) +} + +// GetMcuPowerInfo get mcu power info for cardID +func (d *DeviceManager) GetMcuPowerInfo(cardID int32) (float32, error) { + return d.DcMgr.DcGetMcuPowerInfo(cardID) +} + +// GetCardIDDeviceID get cardID and deviceID by logicID +func (d *DeviceManager) GetCardIDDeviceID(logicID int32) (int32, int32, error) { + return d.getCardIdAndDeviceId(logicID) +} + +// GetProductType get product type by cardID and deviceID +func (d *DeviceManager) GetProductType(cardID, deviceID int32) (string, error) { + return d.DcMgr.DcGetProductType(cardID, deviceID) +} + +// GetAllProductType get all product type +func (d *DeviceManager) GetAllProductType() ([]string, error) { + productTypes := make([]string, 0) + cardNum, cardList, err := d.GetCardList() + if err != nil || cardNum == 0 { + hwlog.RunLog.Errorf("failed to get card list, err: %v", err) + return productTypes, err + } + for _, cardID := range cardList { + devNum, err := d.GetDeviceNumInCard(cardID) + if err != nil { + hwlog.RunLog.Debugf("get device num by cardID(%d) failed, error: %v", cardID, err) + continue + } + if devNum == 0 { + 
hwlog.RunLog.Debugf("not found device on card %d", cardID) + continue + } + for devID := int32(0); devID < devNum; devID++ { + productType, err := d.GetProductType(cardID, devID) + if err != nil { + hwlog.RunLog.Debugf("get product type by card %d deviceID %d failed, err: %v", cardID, devID, err) + continue + } + productTypes = append(productTypes, productType) + break + } + } + if len(productTypes) != 0 { + productTypes = common.RemoveDuplicate(&productTypes) + } + return productTypes, nil +} + +// GetNpuWorkMode get work mode of NPU +func (d *DeviceManager) GetNpuWorkMode() string { + if d.DevType == api.Ascend910B || d.DevType == api.Ascend910A3 { + hwlog.RunLog.Warnf("only AMP mode is available on %s", d.DevType) + return common.AMPMode + } + + _, cardList, err := d.DcMgr.DcGetCardList() + if err != nil { + hwlog.RunLog.Error(err) + return "" + } + if len(cardList) > 0 { + mode, err := d.DcMgr.DcGetNpuWorkMode(cardList[0]) + if err != nil { + hwlog.RunLog.Error(err) + return "" + } + if mode == 0 { + return common.AMPMode + } + return common.SMPMode + } + return "" +} + +// SetDeviceReset reset spec device +func (d *DeviceManager) SetDeviceReset(cardID, deviceID int32) error { + return d.DcMgr.DcSetDeviceReset(cardID, deviceID) +} + +// GetBrotherCardID get brother card id +func (d *DeviceManager) GetBrotherCardID(cardID, deviceID int32) (int32, error) { + return d.DcMgr.DcGetBrotherCardID(cardID, deviceID) +} + +// GetOutBandChannelState get out band channel state +func (d *DeviceManager) GetOutBandChannelState(cardID, deviceID int32) error { + return d.DcMgr.DcGetOutBandChannelState(cardID, deviceID) +} + +// PreResetSoc pre reset soc, used before reset out band +func (d *DeviceManager) PreResetSoc(cardID, deviceID int32) error { + return d.DcMgr.DcPreResetSoc(cardID, deviceID) +} + +// SetDeviceResetOutBand reset spec device out band +func (d *DeviceManager) SetDeviceResetOutBand(cardID, deviceID int32) error { + return 
d.DcMgr.DcSetDeviceResetOutBand(cardID, deviceID) +} + +// RescanSoc trigger soc rescan, non-blocking +func (d *DeviceManager) RescanSoc(cardID, deviceID int32) error { + return d.DcMgr.DcRescanSoc(cardID, deviceID) +} + +// GetDeviceBootStatus get device boot status +func (d *DeviceManager) GetDeviceBootStatus(logicID int32) (int, error) { + return d.DcMgr.DcGetDeviceBootStatus(logicID) +} + +// GetDeviceAllErrorCode get npu device all error code +func (d *DeviceManager) GetDeviceAllErrorCode(logicID int32) (int32, []int64, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + hwlog.RunLog.Error(err) + return common.RetError, nil, fmt.Errorf("failed to get cardID in get device error code by logicID(%d)", + logicID) + } + errCount, errCodes, err := d.DcMgr.DcGetDeviceAllErrorCode(cardID, deviceID) + if err != nil { + hwlog.RunLog.Error(err) + return common.RetError, nil, fmt.Errorf("failed to get device error code by logicID(%d)", logicID) + } + return errCount, errCodes, nil +} + +// SubscribeDeviceFaultEvent get npu device error code by subscribe +func (d *DeviceManager) SubscribeDeviceFaultEvent(logicID int32) error { + var cardID, deviceID int32 + if logicID == common.SubscribeAllDevice { + cardID = common.SubscribeAllDevice + deviceID = common.SubscribeAllDevice + } else { + var err error + cardID, deviceID, err = d.getCardIdAndDeviceId(logicID) + if err != nil { + hwlog.RunLog.Error(err) + return fmt.Errorf("failed to get cardID in subscribe device error code by logicID(%d)", logicID) + } + } + if err := d.DcMgr.DcSubscribeDeviceFaultEvent(cardID, deviceID); err != nil { + hwlog.RunLog.Error(err) + return fmt.Errorf("failed to subscribe device error code by logicID(%d)", logicID) + } + return nil +} + +// SetFaultEventCallFunc set fault event call func +func (d *DeviceManager) SetFaultEventCallFunc(businessFunc func(common.DevFaultInfo)) error { + if businessFunc == nil { + return errors.New("business func can't be nil") + } 
+ d.DcMgr.DcSetFaultEventCallFunc(businessFunc) + return nil +} + +// GetDieID return die id by dcmi die type, vdie id or ndie id +func (d *DeviceManager) GetDieID(logicID int32, dcmiDieType dcmi.DieType) (string, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + hwlog.RunLog.Error(err) + return "", fmt.Errorf("failed to get cardID in get device error code by logicID(%d)", logicID) + } + + return d.DcMgr.DcGetDieID(cardID, deviceID, dcmiDieType) +} + +// GetDevProcessInfo get process and process memory in device side +func (d *DeviceManager) GetDevProcessInfo(logicID int32) (*common.DevProcessInfo, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + hwlog.RunLog.Error(err) + return nil, fmt.Errorf("failed to get cardID in get device error code by logicID(%d)", logicID) + } + + return d.DcMgr.DcGetDevProcessInfo(cardID, deviceID) +} + +// GetPCIeBusInfo pcie bus info +func (d *DeviceManager) GetPCIeBusInfo(logicID int32) (string, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + hwlog.RunLog.Error(err) + return "", fmt.Errorf("failed to get cardID in get device error code by logicID(%d)", logicID) + } + + return d.DcMgr.DcGetPCIeBusInfo(cardID, deviceID) +} + +// GetBoardInfo return board info of device +func (d *DeviceManager) GetBoardInfo(logicID int32) (common.BoardInfo, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + hwlog.RunLog.Error(err) + return common.BoardInfo{}, fmt.Errorf("failed to get cardID in "+ + "get device error code by logicID(%d)", logicID) + } + + return d.DcMgr.DcGetDeviceBoardInfo(cardID, deviceID) +} + +// GetCardElabelV2 get card elabel information +func (d *DeviceManager) GetCardElabelV2(cardID int32) (common.ElabelInfo, error) { + return d.DcMgr.DcGetCardElabelV2(cardID) +} + +// GetPCIEBandwidth get pcie bandwidth +func (d *DeviceManager) GetPCIEBandwidth(logicID int32, profilingTime int) 
(common.PCIEBwStat, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + hwlog.RunLog.Error(err) + return common.PCIEBwStat{}, fmt.Errorf("get cardID(deviceID) failed, error by logicID(%d)", logicID) + } + pciePCIEBw, err := d.DcMgr.DcGetPCIEBandwidth(cardID, deviceID, profilingTime) + if err != nil { + return common.PCIEBwStat{}, err + } + return pciePCIEBw, nil +} + +// SetIsTrainingCard identifies whether it is a training card according to the usage of card +func (d *DeviceManager) SetIsTrainingCard() error { + devType := d.GetDevType() + if strings.HasPrefix(devType, api.Ascend310) { + d.isTrainingCard = false + return nil + } + + boardInfo := common.BoardInfo{} + cardNum, cardList, err := d.GetCardList() + if err != nil || cardNum == 0 { + hwlog.RunLog.Errorf("failed to get card list when set 'IsTrainingCard' err: %v", err) + return err + } + for _, cardID := range cardList { + devNum, err := d.GetDeviceNumInCard(cardID) + if err != nil { + hwlog.RunLog.Warnf("get device num by cardID(%d) failed when set 'IsTrainingCard', error: %v", cardID, err) + continue + } + if devNum == 0 { + hwlog.RunLog.Warnf("not found device on card %d when set 'IsTrainingCard'", cardID) + continue + } + + for devID := int32(0); devID < devNum; devID++ { + boardInfo, err = d.DcMgr.DcGetDeviceBoardInfo(cardID, devID) + if err != nil { + hwlog.RunLog.Warnf("get board info by card %d deviceID %d failed, err: %v", cardID, devID, err) + continue + } + break + } + if err == nil { + break + } + } + + if devType == api.Ascend910B && + (boardInfo.BoardId == common.A300IA2BoardId || boardInfo.BoardId == common.A300IA2GB64BoardId) { + d.isTrainingCard = false + return nil + } + + d.isTrainingCard = true + return nil +} + +// IsTrainingCard return true if it is a training card +func (d *DeviceManager) IsTrainingCard() bool { + return d.isTrainingCard +} + +// GetDcmiVersion get dcmi version +func (d *DeviceManager) GetDcmiVersion() string { + return 
d.dcmiVersion +} + +// GetMainBoardId get mainBoardId +func (d *DeviceManager) GetMainBoardId() uint32 { + return d.mainBoardId +} + +// GetValidChipInfo find a valid chip info from all cards +func (d *DeviceManager) GetValidChipInfo() (common.ChipInfo, error) { + chipInfo, err := getValidChipInfo(d.DcMgr) + if err != nil { + hwlog.RunLog.Error("failed to get valid chip info") + return common.ChipInfo{}, err + } + return chipInfo, nil +} + +// GetDeviceEccInfo query device ECC info +func (d *DeviceManager) GetDeviceEccInfo(logicID int32, dcmiDeviceType common.DcmiDeviceType) (*common.ECCInfo, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + hwlog.RunLog.Errorf("get cardID and deviceID by logicID(%d) failed, error: %v", logicID, err) + return nil, err + } + return d.DcMgr.DcGetDeviceEccInfo(cardID, deviceID, dcmiDeviceType) +} + +// GetSuperPodInfo get 910A3 super pod info +func (d *DeviceManager) GetSuperPodInfo(logicID int32) (common.CgoSuperPodInfo, error) { + if !common.IsValidLogicIDOrPhyID(logicID) { + return common.CgoSuperPodInfo{}, fmt.Errorf("input invalid logicID: %d", logicID) + } + + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + return common.CgoSuperPodInfo{}, fmt.Errorf("failed to get cardID and deviceID by logicID(%d) "+ + "when get super pod info, error: %v", logicID, err) + } + cgoSuperPodInfo, err := d.DcMgr.DcGetSuperPodInfo(cardID, deviceID) + if err != nil { + return common.CgoSuperPodInfo{}, fmt.Errorf("failed to get super pod info by logicID(%d), error: %v", + logicID, err) + } + + return cgoSuperPodInfo, nil +} + +// GetSioInfo get SIO info +func (d *DeviceManager) GetSioInfo(logicID int32) (*common.SioCrcErrStatisticInfo, error) { + if !common.IsValidLogicIDOrPhyID(logicID) { + return nil, fmt.Errorf("input invalid logicID when get sio info: %d", logicID) + } + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + return nil, fmt.Errorf("failed to 
get cardID and deviceID by logicID(%d) when get sio info , error: %v", logicID, err) + } + cgoSPodSioInfo, err := d.DcMgr.DcGetSioInfo(cardID, deviceID) + if err != nil { + return nil, err + } + + return &cgoSPodSioInfo, nil +} + +// GetHccsStatisticInfo get HCCS statistic info +func (d *DeviceManager) GetHccsStatisticInfo(logicID int32) (*common.HccsStatisticInfo, error) { + if !common.IsValidLogicIDOrPhyID(logicID) { + return buildFailedHccsInfo(), fmt.Errorf("input invalid logicID when get hccs statistic info: %d", logicID) + } + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + return buildFailedHccsInfo(), fmt.Errorf("failed to get cardID and deviceID by logicID(%d) "+ + "when get hccs statistic info, error: %v", logicID, err) + } + cgoHccsStatusInfo, err := d.DcMgr.DcGetHccsStatisticInfo(cardID, deviceID) + if err != nil { + return buildFailedHccsInfo(), err + + } + + return &cgoHccsStatusInfo, nil +} + +// GetHccsStatisticInfoInU64 get hccs statistic info in u64 +func (d *DeviceManager) GetHccsStatisticInfoInU64(logicID int32) (*common.HccsStatisticInfo, error) { + if !common.IsValidLogicIDOrPhyID(logicID) { + return buildFailedHccsInfo(), fmt.Errorf("input invalid logicID when get hccs statistic info: %d", logicID) + } + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + return buildFailedHccsInfo(), fmt.Errorf("failed to get cardID and deviceID by logicID(%d) "+ + "when get hccs statistic info, error: %v", logicID, err) + } + cgoHccsStatusInfo, err := d.DcMgr.DcGetHccsStatisticInfoU64(cardID, deviceID) + if err != nil { + return buildFailedHccsInfo(), err + } + return &cgoHccsStatusInfo, nil +} + +// GetHccsBandwidthInfo get hccs bandwidth info +func (d *DeviceManager) GetHccsBandwidthInfo(logicID int32) (*common.HccsBandwidthInfo, error) { + + if !common.IsValidLogicIDOrPhyID(logicID) { + return buildFailedHccsBWInfo(), fmt.Errorf("input invalid logicID when get hccs bandwidth info: %d", logicID) + } + 
cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + return buildFailedHccsBWInfo(), fmt.Errorf("failed to get cardID and deviceID by logicID(%d) "+ + "when get hccs bandwidth info, error: %v", logicID, err) + } + cgoHccsBandwidthInfo, err := d.DcMgr.DcGetHccsBandwidthInfo(cardID, deviceID, common.HccsBWProfilingTime) + if err != nil { + return buildFailedHccsBWInfo(), fmt.Errorf("failed to get hccs bandwidth info by cardId(%d) deviceID(%d), error: %v", + cardID, deviceID, err) + } + + return &cgoHccsBandwidthInfo, nil +} + +// buildFailedHccsInfo build failed hccs info +func buildFailedHccsInfo() *common.HccsStatisticInfo { + errorResult := &common.HccsStatisticInfo{ + TxCnt: make([]uint64, 8), + RxCnt: make([]uint64, 8), + CrcErrCnt: make([]uint64, 8), + } + for i := 0; i < 8; i++ { + errorResult.TxCnt[i] = common.FailedValue + errorResult.RxCnt[i] = common.FailedValue + errorResult.CrcErrCnt[i] = common.FailedValue + } + return errorResult +} + +// buildFailedHccsBWInfo build failed hccs bandwidth info +func buildFailedHccsBWInfo() *common.HccsBandwidthInfo { + errorResult := &common.HccsBandwidthInfo{ + ProfilingTime: uint32(common.HccsBWProfilingTime), + TotalTxbw: common.FailedValue, + TotalRxbw: common.FailedValue, + TxBandwidth: make([]float64, 8), + RxBandwidth: make([]float64, 8), + } + for i := 0; i < 8; i++ { + errorResult.TxBandwidth[i] = common.FailedValue + errorResult.RxBandwidth[i] = common.FailedValue + } + return errorResult +} + +func (d *DeviceManager) getCardIdAndDeviceId(logicID int32) (int32, int32, error) { + + if !common.IsValidLogicIDOrPhyID(logicID) { + return common.RetError, common.RetError, fmt.Errorf("input invalid logicID: %d", logicID) + } + + result, ok := idCache.Load(logicID) + if !ok { + return d.doGetCardIDAndDeviceID(logicID) + } + idMapping, ok := result.(npuIdMapping) + if !ok { + idCache.Delete(logicID) + return d.doGetCardIDAndDeviceID(logicID) + } + hwlog.RunLog.Debugf("get cardId and deviceId by 
logicID(%d) from cache, cardId:%v, deviceId:%v", + logicID, idMapping.cardId, idMapping.deviceId) + return idMapping.cardId, idMapping.deviceId, nil +} + +func (d *DeviceManager) doGetCardIDAndDeviceID(logicID int32) (int32, int32, error) { + cardId, deviceId, err := d.DcMgr.DcGetCardIDDeviceID(logicID) + if err != nil { + hwlog.RunLog.ErrorfWithLimit(common.DomainForLogicIdErr, logicID, + "failed to get cardId and deviceId by logicID(%d), error: %v", logicID, err) + return common.RetError, common.RetError, err + } + hwlog.ResetErrCnt(common.DomainForLogicIdErr, logicID) + hwlog.RunLog.Debugf("get cardId and deviceId by logicID(%d) from dcmi, cardId:%v, deviceId:%v", + logicID, cardId, deviceId) + idCache.Store(logicID, npuIdMapping{logicId: logicID, cardId: cardId, deviceId: deviceId}) + return cardId, deviceId, nil +} + +// GetChipBaseInfos get chip base info +func (d *DeviceManager) GetChipBaseInfos() ([]*common.ChipBaseInfo, error) { + _, cardList, err := d.DcMgr.DcGetCardList() + if err != nil { + return nil, fmt.Errorf("get card list failed, error: %v", err) + } + var chips = []*common.ChipBaseInfo{} + for _, cardID := range cardList { + devNumInCard, err := d.DcMgr.DcGetDeviceNumInCard(cardID) + if err != nil { + return nil, fmt.Errorf("get device num by cardID: %d failed, error: %v", + cardID, err) + } + for devID := int32(0); devID < devNumInCard; devID++ { + logicID, err := d.DcMgr.DcGetDeviceLogicID(cardID, devID) + if err != nil { + return nil, fmt.Errorf("get device (cardID: %d, deviceID: %d) logic id "+ + "failed, error: %v", cardID, devID, err) + } + physicID, err := d.DcMgr.DcGetPhysicIDFromLogicID(logicID) + if err != nil { + return nil, fmt.Errorf("get device (cardID: %d, deviceID: %d) physic id "+"failed, error: %v", + cardID, devID, err) + } + hwlog.RunLog.Infof("get chip base info, cardID: %d, deviceID: %d, logicID: %d, physicID: %d", cardID, + devID, logicID, physicID) + chips = append(chips, &common.ChipBaseInfo{ + PhysicID: physicID, + 
LogicID: logicID, + CardID: cardID, + DeviceID: devID, + }) + } + } + return chips, nil +} + +// DcStartHccsPingMesh start hccs ping mesh +func (d *DeviceManager) DcStartHccsPingMesh(cardID int32, deviceID int32, portID int, + operate common.HccspingMeshOperate) error { + return d.DcMgr.DcStartHccsPingMesh(cardID, deviceID, portID, operate) +} + +// DcStopHccsPingMesh stop hccs ping mesh +func (d *DeviceManager) DcStopHccsPingMesh(cardID int32, deviceID int32, portID int, taskID uint) error { + return d.DcMgr.DcStopHccsPingMesh(cardID, deviceID, portID, taskID) +} + +// DcGetHccsPingMeshInfo get hccs ping mesh info +func (d *DeviceManager) DcGetHccsPingMeshInfo(cardID int32, deviceID int32, portID int, + taskID uint) (*common.HccspingMeshInfo, error) { + return d.DcMgr.DcGetHccsPingMeshInfo(cardID, deviceID, portID, taskID) +} + +// DcGetHccsPingMeshState get hccs ping mesh state +func (d *DeviceManager) DcGetHccsPingMeshState(cardID int32, deviceID int32, portID int, taskID uint) (int, error) { + return d.DcMgr.DcGetHccsPingMeshState(cardID, deviceID, portID, taskID) +} + +// DcGetSuperPodStatus get super pod status +func (d *DeviceManager) DcGetSuperPodStatus(cardID int32, deviceID int32, sdid uint32) (int, error) { + var err error + var status int + for i := 0; i < maxRetries; i++ { + if status, err = d.DcMgr.DcGetSuperPodStatus(cardID, deviceID, sdid); err != nil { + hwlog.RunLog.Errorf("get super pod status failed, retry %d, cardID: %d, deviceID: %d, "+ + "sdid: %d, error: %v", i, cardID, deviceID, sdid, err) + continue + } + break + } + return status, err +} + +// DcSetSuperPodStatus set super pod status +func (d *DeviceManager) DcSetSuperPodStatus(cardID int32, deviceID int32, sdid, status uint32) error { + var err error + for i := 0; i < maxRetries; i++ { + if err = d.DcMgr.DcSetSuperPodStatus(cardID, deviceID, sdid, status); err != nil { + hwlog.RunLog.Errorf("set super pod status failed, retry %d, cardID: %d, deviceID: %d, "+ + "sdid: %d, status: %d, 
error: %v", i, cardID, deviceID, sdid, status, err) + continue + } + break + } + return err +} diff --git a/mind-cluster/component/ascend-common/devmanager/devmanager_910a3_mock.go b/mind-cluster/component/ascend-common/devmanager/devmanager_910a3_mock.go new file mode 100644 index 0000000..ca7121b --- /dev/null +++ b/mind-cluster/component/ascend-common/devmanager/devmanager_910a3_mock.go @@ -0,0 +1,30 @@ +/* Copyright(C) 2024. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package devmanager this for device driver manager mock +package devmanager + +import ( + "ascend-common/api" +) + +// DeviceManager910A3Mock common device manager mock for Ascend910A3 +type DeviceManager910A3Mock struct { + DeviceManagerMock +} + +// GetDevType return mock type +func (d *DeviceManager910A3Mock) GetDevType() string { + return api.Ascend910A3 +} diff --git a/mind-cluster/component/ascend-common/devmanager/devmanager_910a3_mock_err.go b/mind-cluster/component/ascend-common/devmanager/devmanager_910a3_mock_err.go new file mode 100644 index 0000000..817f06e --- /dev/null +++ b/mind-cluster/component/ascend-common/devmanager/devmanager_910a3_mock_err.go @@ -0,0 +1,43 @@ +/* Copyright(C) 2024. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package devmanager this for device driver manager error mock +package devmanager + +import ( + "errors" + + "ascend-common/api" + "ascend-common/devmanager/common" +) + +// DeviceManager910A3MockErr common device manager mock error for Ascend910A3 +type DeviceManager910A3MockErr struct { + DeviceManagerMockErr +} + +// GetDevType return mock type +func (d *DeviceManager910A3MockErr) GetDevType() string { + return api.Ascend910A3 +} + +// GetHccsStatisticInfo get hccs statistic info +func (d *DeviceManager910A3MockErr) GetHccsStatisticInfo(logicID int32) (*common.HccsStatisticInfo, error) { + return &common.HccsStatisticInfo{}, errors.New(errorMsg) +} + +// GetHccsBandwidthInfo get hccs bandwidth info +func (d *DeviceManager910A3MockErr) GetHccsBandwidthInfo(logicID int32) (*common.HccsBandwidthInfo, error) { + return &common.HccsBandwidthInfo{}, errors.New(errorMsg) +} diff --git a/mind-cluster/component/ascend-common/devmanager/devmanager_hccs_test.go b/mind-cluster/component/ascend-common/devmanager/devmanager_hccs_test.go new file mode 100644 index 0000000..3d7fff4 --- /dev/null +++ b/mind-cluster/component/ascend-common/devmanager/devmanager_hccs_test.go @@ -0,0 +1,166 @@ +/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package devmanager for device driver manager +package devmanager + +import ( + "errors" + "testing" + + "github.com/agiledragon/gomonkey/v2" + "github.com/smartystreets/goconvey/convey" + + "ascend-common/devmanager/common" + "ascend-common/devmanager/dcmi" +) + +const ( + mockLogicID int32 = 0 + mockCardID int32 = 0 + mockDeviceID int32 = 0 + invalidLogicID int32 = -1 + mockErrorMsg string = "mock error" + hccsArrayLen int = 8 +) + +type getHccsStatisticInfoInU64TestCase struct { + name string + logicID int32 + isValidID bool + getCardIDErr error + dcmiCallErr error + expectedErr bool +} + +func TestGetHccsStatisticInfoInU64(t *testing.T) { + testCases := buildGetHccsStatisticInfoInU64TestCases() + + for _, tc := range testCases { + convey.Convey(tc.name, t, func() { + patches := gomonkey.NewPatches() + defer patches.Reset() + + clearIdCache(tc.logicID) + manager := createMockDeviceManager() + setupGetHccsStatisticInfoInU64Patches(patches, manager, tc) + result, err := manager.GetHccsStatisticInfoInU64(tc.logicID) + verifyGetHccsStatisticInfoInU64Result(result, err, tc) + }) + } +} + +func clearIdCache(logicID int32) { + idCache.Delete(logicID) +} + +func buildGetHccsStatisticInfoInU64TestCases() []getHccsStatisticInfoInU64TestCase { + return []getHccsStatisticInfoInU64TestCase{ + {name: "should return failed info when logicID is invalid", + logicID: invalidLogicID, + isValidID: false, + expectedErr: true}, + {name: "should return failed info when getCardIdAndDeviceId fails", + logicID: mockLogicID, + isValidID: true, + getCardIDErr: 
errors.New(mockErrorMsg), + expectedErr: true}, + {name: "should return failed info when DcGetHccsStatisticInfoU64 fails", + logicID: mockLogicID, + isValidID: true, + dcmiCallErr: errors.New(mockErrorMsg), + expectedErr: true}, + {name: "should return success info when all operations succeed", + logicID: mockLogicID, + isValidID: true, + expectedErr: false}, + } +} + +func createMockDeviceManager() *DeviceManager { + return &DeviceManager{ + DcMgr: &dcmi.DcManager{}, + } +} + +func setupGetHccsStatisticInfoInU64Patches(patches *gomonkey.Patches, + manager *DeviceManager, tc getHccsStatisticInfoInU64TestCase) { + patches.ApplyFuncReturn(common.IsValidLogicIDOrPhyID, tc.isValidID) + if !tc.isValidID { + return + } + if tc.getCardIDErr != nil { + patches.ApplyMethodReturn(manager.DcMgr, "DcGetCardIDDeviceID", + mockCardID, mockDeviceID, tc.getCardIDErr) + } else { + patches.ApplyMethodReturn(manager.DcMgr, "DcGetCardIDDeviceID", + mockCardID, mockDeviceID, nil) + if tc.dcmiCallErr != nil { + patches.ApplyMethodReturn(manager.DcMgr, "DcGetHccsStatisticInfoU64", + common.HccsStatisticInfo{}, tc.dcmiCallErr) + } else { + mockHccsInfo := createMockHccsStatisticInfo() + patches.ApplyMethodReturn(manager.DcMgr, "DcGetHccsStatisticInfoU64", + mockHccsInfo, nil) + } + } +} + +func createMockHccsStatisticInfo() common.HccsStatisticInfo { + txCnt := make([]uint64, hccsArrayLen) + rxCnt := make([]uint64, hccsArrayLen) + crcErrCnt := make([]uint64, hccsArrayLen) + for i := 0; i < hccsArrayLen; i++ { + txCnt[i] = uint64(i + 1) + rxCnt[i] = uint64(i + 1) + crcErrCnt[i] = 0 + } + return common.HccsStatisticInfo{ + TxCnt: txCnt, + RxCnt: rxCnt, + CrcErrCnt: crcErrCnt, + } +} + +func verifyGetHccsStatisticInfoInU64Result(result *common.HccsStatisticInfo, + err error, tc getHccsStatisticInfoInU64TestCase) { + if tc.expectedErr { + convey.So(err, convey.ShouldNotBeNil) + convey.So(result, convey.ShouldNotBeNil) + verifyFailedHccsInfo(result) + } else { + convey.So(err, 
convey.ShouldBeNil) + convey.So(result, convey.ShouldNotBeNil) + verifySuccessHccsInfo(result) + } +} + +func verifyFailedHccsInfo(result *common.HccsStatisticInfo) { + convey.So(len(result.TxCnt), convey.ShouldEqual, hccsArrayLen) + convey.So(len(result.RxCnt), convey.ShouldEqual, hccsArrayLen) + convey.So(len(result.CrcErrCnt), convey.ShouldEqual, hccsArrayLen) + for i := 0; i < hccsArrayLen; i++ { + convey.So(result.TxCnt[i], convey.ShouldEqual, common.FailedValue) + convey.So(result.RxCnt[i], convey.ShouldEqual, common.FailedValue) + convey.So(result.CrcErrCnt[i], convey.ShouldEqual, common.FailedValue) + } +} + +func verifySuccessHccsInfo(result *common.HccsStatisticInfo) { + convey.So(len(result.TxCnt), convey.ShouldEqual, hccsArrayLen) + convey.So(len(result.RxCnt), convey.ShouldEqual, hccsArrayLen) + convey.So(len(result.CrcErrCnt), convey.ShouldEqual, hccsArrayLen) + convey.So(result.TxCnt[0], convey.ShouldEqual, uint64(1)) + convey.So(result.RxCnt[0], convey.ShouldEqual, uint64(1)) +} diff --git a/mind-cluster/component/ascend-common/devmanager/devmanager_mock.go b/mind-cluster/component/ascend-common/devmanager/devmanager_mock.go new file mode 100644 index 0000000..c3bde2b --- /dev/null +++ b/mind-cluster/component/ascend-common/devmanager/devmanager_mock.go @@ -0,0 +1,370 @@ +/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package devmanager this for device driver manager mock +package devmanager + +import ( + "ascend-common/api" + "ascend-common/devmanager/common" + "ascend-common/devmanager/dcmi" +) + +// DeviceManagerMock common device manager mock for Ascend910/310P/310 +type DeviceManagerMock struct { +} + +// DcStartHccsPingMesh start hccs ping mesh +func (d *DeviceManagerMock) DcStartHccsPingMesh(i int32, i2 int32, i3 int, operate common.HccspingMeshOperate) error { + return nil +} + +// DcStopHccsPingMesh stop hccs ping mesh +func (d *DeviceManagerMock) DcStopHccsPingMesh(i int32, i2 int32, i3 int, u uint) error { + return nil +} + +// DcGetHccsPingMeshInfo get hccs ping mesh info +func (d *DeviceManagerMock) DcGetHccsPingMeshInfo(i int32, i2 int32, i3 int, u uint) (*common.HccspingMeshInfo, error) { + return &common.HccspingMeshInfo{}, nil +} + +// DcGetHccsPingMeshState get hccs ping mesh state +func (d *DeviceManagerMock) DcGetHccsPingMeshState(i int32, i2 int32, i3 int, u uint) (int, error) { + return 0, nil +} + +// Init load symbol and initialize dcmi +func (d *DeviceManagerMock) Init() error { + return nil +} + +// ShutDown clean the dynamically loaded resource +func (d *DeviceManagerMock) ShutDown() error { + return nil +} + +// GetDevType return mock type +func (d *DeviceManagerMock) GetDevType() string { + return api.Ascend910A +} + +// GetDeviceCount get npu device count +func (d *DeviceManagerMock) GetDeviceCount() (int32, error) { + return 1, nil +} + +// GetCardList get all card list +func (d *DeviceManagerMock) GetCardList() (int32, []int32, error) { + return 1, []int32{0}, nil +} + +// GetDeviceNumInCard get all device list in one card +func (d *DeviceManagerMock) GetDeviceNumInCard(cardID int32) (int32, error) { + return 1, nil +} + +// GetDeviceList get all device logicID list +func (d *DeviceManagerMock) GetDeviceList() (int32, []int32, error) { + return 1, []int32{0}, nil +} + +// GetDeviceHealth query npu device health status +func (d 
*DeviceManagerMock) GetDeviceHealth(logicID int32) (uint32, error) { + return 0, nil +} + +// GetDeviceNetWorkHealth query npu device network health status +func (d *DeviceManagerMock) GetDeviceNetWorkHealth(logicID int32) (uint32, error) { + return 0, nil +} + +// GetDeviceUtilizationRate get npu device utilization +func (d *DeviceManagerMock) GetDeviceUtilizationRate(logicID int32, deviceType common.DeviceType) (uint32, error) { + return 1, nil +} + +// GetDeviceTemperature get npu device temperature +func (d *DeviceManagerMock) GetDeviceTemperature(logicID int32) (int32, error) { + return 1, nil +} + +// GetDeviceVoltage get npu device voltage +func (d *DeviceManagerMock) GetDeviceVoltage(logicID int32) (float32, error) { + return 1, nil +} + +// GetDevicePowerInfo get npu device power info +func (d *DeviceManagerMock) GetDevicePowerInfo(logicID int32) (float32, error) { + return 1, nil +} + +// GetDeviceFrequency get npu device work frequency +func (d *DeviceManagerMock) GetDeviceFrequency(logicID int32, deviceType common.DeviceType) (uint32, error) { + return 1, nil +} + +// GetDeviceMemoryInfo get npu memory information +func (d *DeviceManagerMock) GetDeviceMemoryInfo(logicID int32) (*common.MemoryInfo, error) { + return &common.MemoryInfo{ + MemorySize: 1, + MemoryAvailable: 1, + Frequency: 1, + Utilization: 1, + }, nil +} + +// GetDeviceHbmInfo get npu HBM module memory and frequency information +func (d *DeviceManagerMock) GetDeviceHbmInfo(logicID int32) (*common.HbmInfo, error) { + return &common.HbmInfo{ + MemorySize: 1, + Frequency: 1, + Usage: 1, + Temp: 1, + BandWidthUtilRate: 1, + }, nil +} + +// GetDeviceErrorCode get npu device error code +func (d *DeviceManagerMock) GetDeviceErrorCode(logicID int32) (int32, int64, error) { + return int32(0), int64(0), nil +} + +// GetChipInfo get npu device error code +func (d *DeviceManagerMock) GetChipInfo(logicID int32) (*common.ChipInfo, error) { + chip := &common.ChipInfo{ + Type: "ascend", + Name: 
common.Chip910, + Version: "v1", + } + return chip, nil +} + +// GetPhysicIDFromLogicID get device physic id from logic id +func (d *DeviceManagerMock) GetPhysicIDFromLogicID(logicID int32) (int32, error) { + return 1, nil +} + +// GetLogicIDFromPhysicID get device logic id from physic id +func (d *DeviceManagerMock) GetLogicIDFromPhysicID(physicID int32) (int32, error) { + return 1, nil +} + +// GetDeviceLogicID get device logic id from card id and device id +func (d *DeviceManagerMock) GetDeviceLogicID(cardID, deviceID int32) (int32, error) { + return 1, nil +} + +// GetDeviceIPAddress get device ip address +func (d *DeviceManagerMock) GetDeviceIPAddress(logicID, ipType int32) (string, error) { + if ipType == 0 { + return "127.0.0.1", nil + } + return "::1", nil +} + +// CreateVirtualDevice create virtual device +func (d *DeviceManagerMock) CreateVirtualDevice(logicID int32, vDevInfo common.CgoCreateVDevRes) (common. + CgoCreateVDevOut, error) { + return common.CgoCreateVDevOut{}, nil +} + +// GetVirtualDeviceInfo get virtual device info +func (d *DeviceManagerMock) GetVirtualDeviceInfo(logicID int32) (common.VirtualDevInfo, error) { + return common.VirtualDevInfo{}, nil +} + +// DestroyVirtualDevice destroy virtual device +func (d *DeviceManagerMock) DestroyVirtualDevice(logicID int32, vDevID uint32) error { + return nil +} + +// GetMcuPowerInfo get mcu power info for cardID +func (d *DeviceManagerMock) GetMcuPowerInfo(cardID int32) (float32, error) { + return 1, nil +} + +// GetCardIDDeviceID get cardID and deviceID by logicID +func (d *DeviceManagerMock) GetCardIDDeviceID(logicID int32) (int32, int32, error) { + return 0, 0, nil +} + +// GetProductType get product type success +func (d *DeviceManagerMock) GetProductType(cardID, deviceID int32) (string, error) { + return "", nil +} + +// GetAllProductType get all product type success +func (d *DeviceManagerMock) GetAllProductType() ([]string, error) { + return []string{}, nil +} + +// GetNpuWorkMode get npu 
chip work mode SMP success +func (d *DeviceManagerMock) GetNpuWorkMode() string { + return common.SMPMode +} + +// SetDeviceReset set device reset success +func (d *DeviceManagerMock) SetDeviceReset(cardID, deviceID int32) error { + return nil +} + +// GetDeviceBootStatus get device boot status success +func (d *DeviceManagerMock) GetDeviceBootStatus(logicID int32) (int, error) { + return common.BootStartFinish, nil +} + +// GetDeviceAllErrorCode get device all error code success +func (d *DeviceManagerMock) GetDeviceAllErrorCode(logicID int32) (int32, []int64, error) { + return 0, []int64{}, nil +} + +// SubscribeDeviceFaultEvent subscribe device fault event success +func (d *DeviceManagerMock) SubscribeDeviceFaultEvent(logicID int32) error { + return nil +} + +// SetFaultEventCallFunc set fault event call func success +func (d *DeviceManagerMock) SetFaultEventCallFunc(businessFunc func(common.DevFaultInfo)) error { + return nil +} + +// GetDieID get die id success +func (d *DeviceManagerMock) GetDieID(logicID int32, dcmiDieType dcmi.DieType) (string, error) { + return "ABCDEFGHIGKLMNOPQRSTUVWXYZ01234567890123", nil +} + +// GetDevProcessInfo get process info +func (d *DeviceManagerMock) GetDevProcessInfo(logicID int32) (*common.DevProcessInfo, error) { + return &common.DevProcessInfo{}, nil +} + +// GetPCIeBusInfo get pcie bus info +func (d *DeviceManagerMock) GetPCIeBusInfo(logicID int32) (string, error) { + return "0000:61:00.0", nil +} + +// GetBoardInfo Get board info +func (d *DeviceManagerMock) GetBoardInfo(logicID int32) (common.BoardInfo, error) { + return common.BoardInfo{}, nil +} + +// GetCardElabelV2 get card elabel information +func (d *DeviceManagerMock) GetCardElabelV2(cardID int32) (common.ElabelInfo, error) { + return common.ElabelInfo{}, nil +} + +// GetProductTypeArray test for get product type array +func (d *DeviceManagerMock) GetProductTypeArray() []string { + return []string{common.Atlas200ISoc} +} + +// GetPCIEBandwidth get pcie bandwidth 
+func (d *DeviceManagerMock) GetPCIEBandwidth(logicID int32, _ int) (common.PCIEBwStat, error) { + return common.PCIEBwStat{}, nil +} + +// SetIsTrainingCard set IsTrainingCard +func (d *DeviceManagerMock) SetIsTrainingCard() error { + return nil +} + +// IsTrainingCard get IsTrainingCard +func (d *DeviceManagerMock) IsTrainingCard() bool { + return true +} + +// GetDcmiVersion get dcmi version +func (d *DeviceManagerMock) GetDcmiVersion() string { + return "v1" +} + +// GetValidChipInfo get valid chip info from all npu +func (d *DeviceManagerMock) GetValidChipInfo() (common.ChipInfo, error) { + return common.ChipInfo{}, nil +} + +// GetDeviceEccInfo get device ECC info +func (d *DeviceManagerMock) GetDeviceEccInfo(logicID int32, + dcmiDeviceType common.DcmiDeviceType) (*common.ECCInfo, error) { + return &common.ECCInfo{EnableFlag: 1}, nil +} + +// GetSuperPodInfo get super pod info +func (d *DeviceManagerMock) GetSuperPodInfo(logicID int32) (common.CgoSuperPodInfo, error) { + return common.CgoSuperPodInfo{}, nil +} + +// GetSioInfo get sio info +func (d *DeviceManagerMock) GetSioInfo(logicID int32) (*common.SioCrcErrStatisticInfo, error) { + return &common.SioCrcErrStatisticInfo{ + TxErrCnt: 0, + RxErrCnt: 0, + }, nil +} + +// GetHccsStatisticInfo get hccs statistic info +func (d *DeviceManagerMock) GetHccsStatisticInfo(logicID int32) (*common.HccsStatisticInfo, error) { + return &common.HccsStatisticInfo{}, nil +} + +// GetHccsStatisticInfoInU64 get hccs statistic info in u64 +func (d *DeviceManagerMock) GetHccsStatisticInfoInU64(logicID int32) (*common.HccsStatisticInfo, error) { + return &common.HccsStatisticInfo{}, nil +} + +// GetMainBoardId get main board id +func (d *DeviceManagerMock) GetMainBoardId() uint32 { + return 0 +} + +// GetHccsBandwidthInfo get hccs statistic info +func (d *DeviceManagerMock) GetHccsBandwidthInfo(logicID int32) (*common.HccsBandwidthInfo, error) { + return &common.HccsBandwidthInfo{}, nil +} + +// GetBrotherCardID get brother 
card id +func (d *DeviceManagerMock) GetBrotherCardID(cardID, deviceID int32) (int32, error) { + const noneBroCard = -1 + return noneBroCard, nil +} + +// GetOutBandChannelState get out band channel state +func (d *DeviceManagerMock) GetOutBandChannelState(cardID, deviceID int32) error { + return nil +} + +// PreResetSoc pre reset soc, used before reset out band +func (d *DeviceManagerMock) PreResetSoc(cardID, deviceID int32) error { + return nil +} + +// SetDeviceResetOutBand reset spec device out band +func (d *DeviceManagerMock) SetDeviceResetOutBand(cardID, deviceID int32) error { + return nil +} + +// RescanSoc trigger soc rescan, non-blocking +func (d *DeviceManagerMock) RescanSoc(cardID, deviceID int32) error { + return nil +} + +// GetChipBaseInfos get chip base info +func (d *DeviceManagerMock) GetChipBaseInfos() ([]*common.ChipBaseInfo, error) { + return nil, nil +} + +func (d *DeviceManagerMock) DcGetSuperPodStatus(int32, int32, uint32) (int, error) { return 0, nil } + +func (d *DeviceManagerMock) DcSetSuperPodStatus(int32, int32, uint32, uint32) error { return nil } diff --git a/mind-cluster/component/ascend-common/devmanager/devmanager_mock_err.go b/mind-cluster/component/ascend-common/devmanager/devmanager_mock_err.go new file mode 100644 index 0000000..8ad8d7c --- /dev/null +++ b/mind-cluster/component/ascend-common/devmanager/devmanager_mock_err.go @@ -0,0 +1,369 @@ +/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package devmanager this for device driver manager error mock +package devmanager + +import ( + "errors" + + "ascend-common/api" + "ascend-common/devmanager/common" + "ascend-common/devmanager/dcmi" +) + +var errorMsg = "mock error" + +// DeviceManagerMockErr common device manager mock error for Ascend910/310P/310 +type DeviceManagerMockErr struct { +} + +// DcStartHccsPingMesh start hccs ping mesh +func (d *DeviceManagerMockErr) DcStartHccsPingMesh(i int32, i2 int32, i3 int, + operate common.HccspingMeshOperate) error { + return errors.New(errorMsg) +} + +// DcStopHccsPingMesh stop hccs ping mesh +func (d *DeviceManagerMockErr) DcStopHccsPingMesh(i int32, i2 int32, i3 int, u uint) error { + return errors.New(errorMsg) +} + +// DcGetHccsPingMeshInfo get hccs ping mesh info +func (d *DeviceManagerMockErr) DcGetHccsPingMeshInfo(i int32, i2 int32, i3 int, u uint) (*common.HccspingMeshInfo, + error) { + return nil, errors.New(errorMsg) +} + +// DcGetHccsPingMeshState get hccs ping mesh state +func (d *DeviceManagerMockErr) DcGetHccsPingMeshState(i int32, i2 int32, i3 int, u uint) (int, error) { + return 1, errors.New(errorMsg) +} + +// Init load symbol and initialize dcmi +func (d *DeviceManagerMockErr) Init() error { + return errors.New(errorMsg) +} + +// ShutDown clean the dynamically loaded resource +func (d *DeviceManagerMockErr) ShutDown() error { + return errors.New(errorMsg) +} + +// GetDevType return mock type +func (d *DeviceManagerMockErr) GetDevType() string { + return api.Ascend910A +} + +// GetDeviceCount get npu device count +func (d *DeviceManagerMockErr) GetDeviceCount() (int32, error) { + return 1, errors.New(errorMsg) +} + +// GetCardList get all card list +func (d *DeviceManagerMockErr) GetCardList() (int32, []int32, error) { + return 1, []int32{0}, errors.New(errorMsg) +} + +// GetDeviceNumInCard get all device list in one card +func (d 
*DeviceManagerMockErr) GetDeviceNumInCard(cardID int32) (int32, error) { + return 1, errors.New(errorMsg) +} + +// GetDeviceList get all device logicID list +func (d *DeviceManagerMockErr) GetDeviceList() (int32, []int32, error) { + return 1, []int32{0}, errors.New(errorMsg) +} + +// GetDeviceHealth query npu device health status +func (d *DeviceManagerMockErr) GetDeviceHealth(logicID int32) (uint32, error) { + return 0, errors.New(errorMsg) +} + +// GetDeviceNetWorkHealth query npu device network health status +func (d *DeviceManagerMockErr) GetDeviceNetWorkHealth(logicID int32) (uint32, error) { + return 0, errors.New(errorMsg) +} + +// GetDeviceUtilizationRate get npu device utilization +func (d *DeviceManagerMockErr) GetDeviceUtilizationRate(logicID int32, deviceType common.DeviceType) (uint32, error) { + return 1, errors.New(errorMsg) +} + +// GetDeviceTemperature get npu device temperature +func (d *DeviceManagerMockErr) GetDeviceTemperature(logicID int32) (int32, error) { + return 1, errors.New(errorMsg) +} + +// GetDeviceVoltage get npu device voltage +func (d *DeviceManagerMockErr) GetDeviceVoltage(logicID int32) (float32, error) { + return 1, errors.New(errorMsg) +} + +// GetDevicePowerInfo get npu device power info +func (d *DeviceManagerMockErr) GetDevicePowerInfo(logicID int32) (float32, error) { + return 1, errors.New(errorMsg) +} + +// GetDeviceFrequency get npu device work frequency +func (d *DeviceManagerMockErr) GetDeviceFrequency(logicID int32, deviceType common.DeviceType) (uint32, error) { + return 1, errors.New(errorMsg) +} + +// GetDeviceMemoryInfo get npu memory information +func (d *DeviceManagerMockErr) GetDeviceMemoryInfo(logicID int32) (*common.MemoryInfo, error) { + return &common.MemoryInfo{ + MemorySize: 1, + MemoryAvailable: 1, + Frequency: 1, + Utilization: 1, + }, errors.New(errorMsg) +} + +// GetDeviceHbmInfo get npu HBM module memory and frequency information +func (d *DeviceManagerMockErr) GetDeviceHbmInfo(logicID int32) 
(*common.HbmInfo, error) { + return &common.HbmInfo{ + MemorySize: 1, + Frequency: 1, + Usage: 1, + Temp: 1, + BandWidthUtilRate: 1, + }, errors.New(errorMsg) +} + +// GetDeviceErrorCode get npu device error code +func (d *DeviceManagerMockErr) GetDeviceErrorCode(logicID int32) (int32, int64, error) { + return int32(0), int64(0), errors.New(errorMsg) +} + +// GetChipInfo get npu device error code +func (d *DeviceManagerMockErr) GetChipInfo(logicID int32) (*common.ChipInfo, error) { + chip := &common.ChipInfo{ + Type: "ascend", + Name: common.Chip910, + Version: "v1", + } + return chip, errors.New(errorMsg) +} + +// GetPhysicIDFromLogicID get device physic id from logic id +func (d *DeviceManagerMockErr) GetPhysicIDFromLogicID(logicID int32) (int32, error) { + return 1, errors.New(errorMsg) +} + +// GetLogicIDFromPhysicID get device logic id from physic id +func (d *DeviceManagerMockErr) GetLogicIDFromPhysicID(physicID int32) (int32, error) { + return 1, errors.New(errorMsg) +} + +// GetDeviceLogicID get device logic id from card id and device id +func (d *DeviceManagerMockErr) GetDeviceLogicID(cardID, deviceID int32) (int32, error) { + return 1, errors.New(errorMsg) +} + +// GetDeviceIPAddress get device ip address +func (d *DeviceManagerMockErr) GetDeviceIPAddress(logicID, ipType int32) (string, error) { + return "127.0.0.1", errors.New(errorMsg) +} + +// CreateVirtualDevice create virtual device +func (d *DeviceManagerMockErr) CreateVirtualDevice(logicID int32, + vDevInfo common.CgoCreateVDevRes) (common.CgoCreateVDevOut, error) { + return common.CgoCreateVDevOut{}, errors.New(errorMsg) +} + +// GetVirtualDeviceInfo get virtual device info +func (d *DeviceManagerMockErr) GetVirtualDeviceInfo(logicID int32) (common.VirtualDevInfo, error) { + return common.VirtualDevInfo{}, errors.New(errorMsg) +} + +// DestroyVirtualDevice destroy virtual device +func (d *DeviceManagerMockErr) DestroyVirtualDevice(logicID int32, vDevID uint32) error { + return errors.New(errorMsg) 
+} + +// GetMcuPowerInfo get mcu power info for cardID +func (d *DeviceManagerMockErr) GetMcuPowerInfo(cardID int32) (float32, error) { + return 1, errors.New(errorMsg) +} + +// GetCardIDDeviceID get cardID and deviceID by logicID +func (d *DeviceManagerMockErr) GetCardIDDeviceID(logicID int32) (int32, int32, error) { + return 0, 0, errors.New(errorMsg) +} + +// GetProductType get product type failed +func (d *DeviceManagerMockErr) GetProductType(cardID, deviceID int32) (string, error) { + return "", errors.New("not found product type name") +} + +// GetAllProductType get all product type failed +func (d *DeviceManagerMockErr) GetAllProductType() ([]string, error) { + return []string{}, errors.New("not found product type name") +} + +// GetNpuWorkMode get npu work mode failed +func (d *DeviceManagerMockErr) GetNpuWorkMode() string { + return "" +} + +// SetDeviceReset set device reset failed +func (d *DeviceManagerMockErr) SetDeviceReset(cardID, deviceID int32) error { + return errors.New(errorMsg) +} + +// GetDeviceBootStatus get device boot status failed +func (d *DeviceManagerMockErr) GetDeviceBootStatus(logicID int32) (int, error) { + return common.RetError, errors.New(errorMsg) +} + +// GetDeviceAllErrorCode get device all error code failed +func (d *DeviceManagerMockErr) GetDeviceAllErrorCode(logicID int32) (int32, []int64, error) { + return common.RetError, nil, errors.New(errorMsg) +} + +// SubscribeDeviceFaultEvent subscribe device fault event failed +func (d *DeviceManagerMockErr) SubscribeDeviceFaultEvent(logicID int32) error { + return errors.New(errorMsg) +} + +// SetFaultEventCallFunc set fault event call func failed +func (d *DeviceManagerMockErr) SetFaultEventCallFunc(businessFunc func(common.DevFaultInfo)) error { + return errors.New(errorMsg) +} + +// GetDieID get die id failed +func (d *DeviceManagerMockErr) GetDieID(logicID int32, dcmiDieType dcmi.DieType) (string, error) { + return "", errors.New(errorMsg) +} + +// GetDevProcessInfo get process 
info +func (d *DeviceManagerMockErr) GetDevProcessInfo(logicID int32) (*common.DevProcessInfo, error) { + return nil, errors.New(errorMsg) +} + +// GetPCIeBusInfo get PCIe bus info +func (d *DeviceManagerMockErr) GetPCIeBusInfo(logicID int32) (string, error) { + return "", errors.New(errorMsg) +} + +// GetBoardInfo get board info +func (d *DeviceManagerMockErr) GetBoardInfo(logicID int32) (common.BoardInfo, error) { + return common.BoardInfo{}, errors.New(errorMsg) +} + +// GetProductTypeArray test for get empty product type array +func (d *DeviceManagerMockErr) GetProductTypeArray() []string { + return nil +} + +// GetPCIEBandwidth get pcie bandwidth +func (d *DeviceManagerMockErr) GetPCIEBandwidth(logicID int32, _ int) (common.PCIEBwStat, error) { + return common.PCIEBwStat{}, errors.New(errorMsg) +} + +// SetIsTrainingCard set IsTrainingCard +func (d *DeviceManagerMockErr) SetIsTrainingCard() error { + return errors.New(errorMsg) +} + +// IsTrainingCard get IsTrainingCard +func (d *DeviceManagerMockErr) IsTrainingCard() bool { + return false +} + +// GetDcmiVersion get dcmi version failed +func (d *DeviceManagerMockErr) GetDcmiVersion() string { + return "" +} + +// GetValidChipInfo get valid chip info from all npu +func (d *DeviceManagerMockErr) GetValidChipInfo() (common.ChipInfo, error) { + return common.ChipInfo{}, errors.New("failed to find chip info") +} + +// GetDeviceEccInfo get device ECC info +func (d *DeviceManagerMockErr) GetDeviceEccInfo(logicID int32, + dcmiDeviceType common.DcmiDeviceType) (*common.ECCInfo, error) { + return nil, errors.New("failed to get device ECC info") +} + +// GetSuperPodInfo get super pod info +func (d *DeviceManagerMockErr) GetSuperPodInfo(logicID int32) (common.CgoSuperPodInfo, error) { + return common.CgoSuperPodInfo{}, nil +} + +// GetSioInfo get sio info +func (d *DeviceManagerMockErr) GetSioInfo(logicID int32) (*common.SioCrcErrStatisticInfo, error) { + return nil, errors.New(errorMsg) +} + +// GetHccsStatisticInfo get 
hccs statistic info +func (d *DeviceManagerMockErr) GetHccsStatisticInfo(logicID int32) (*common.HccsStatisticInfo, error) { + return nil, errors.New(errorMsg) +} + +// GetHccsStatisticInfoInU64 get hccs statistic info in u64 +func (d *DeviceManagerMockErr) GetHccsStatisticInfoInU64(logicID int32) (*common.HccsStatisticInfo, error) { + return nil, errors.New(errorMsg) +} + +// GetMainBoardId get main board id +func (d *DeviceManagerMockErr) GetMainBoardId() uint32 { + return 0 +} + +// GetHccsBandwidthInfo get hccs statistic info +func (d *DeviceManagerMockErr) GetHccsBandwidthInfo(logicID int32) (*common.HccsBandwidthInfo, error) { + return nil, errors.New(errorMsg) +} + +// GetBrotherCardID get brother card id +func (d *DeviceManagerMockErr) GetBrotherCardID(cardID, deviceID int32) (int32, error) { + return -1, nil +} + +// GetOutBandChannelState get out band channel state +func (d *DeviceManagerMockErr) GetOutBandChannelState(cardID, deviceID int32) error { + return nil +} + +// PreResetSoc pre reset soc, used before reset out band +func (d *DeviceManagerMockErr) PreResetSoc(cardID, deviceID int32) error { + return nil +} + +// SetDeviceResetOutBand reset spec device out band +func (d *DeviceManagerMockErr) SetDeviceResetOutBand(cardID, deviceID int32) error { + return nil +} + +// RescanSoc trigger soc rescan, non-blocking +func (d *DeviceManagerMockErr) RescanSoc(cardID, deviceID int32) error { + return nil +} + +// GetChipBaseInfos get chip base info +func (d *DeviceManagerMockErr) GetChipBaseInfos() ([]*common.ChipBaseInfo, error) { + return nil, errors.New(errorMsg) +} + +func (d *DeviceManagerMockErr) DcGetSuperPodStatus(int32, int32, uint32) (int, error) { return 0, nil } + +func (d *DeviceManagerMockErr) DcSetSuperPodStatus(int32, int32, uint32, uint32) error { return nil } + +// GetCardElabelV2 get card elabel information +func (d *DeviceManagerMockErr) GetCardElabelV2(cardID int32) (common.ElabelInfo, error) { + return common.ElabelInfo{}, nil +} diff 
--git a/mind-cluster/component/ascend-common/devmanager/devmanager_test.go b/mind-cluster/component/ascend-common/devmanager/devmanager_test.go new file mode 100644 index 0000000..221a812 --- /dev/null +++ b/mind-cluster/component/ascend-common/devmanager/devmanager_test.go @@ -0,0 +1,78 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package devmanager for device driver manager +package devmanager + +import ( + "errors" + "testing" + + "github.com/agiledragon/gomonkey/v2" + "github.com/smartystreets/goconvey/convey" + + "ascend-common/common-utils/hwlog" + "ascend-common/devmanager/common" + "ascend-common/devmanager/dcmi" +) + +// TestGetCardIdAndDeviceId test the getCardIdAndDeviceId function +func TestGetCardIdAndDeviceId(t *testing.T) { + + var ( + cardId, deviceId = int32(0), int32(0) + err error + returnValue = int32(0) + errReturnValue = int32(-1) + ) + manager := &DeviceManager{DcMgr: &dcmi.DcManager{}} + convey.Convey("failed to get info by dcmi", t, func() { + mk2 := gomonkey.ApplyMethodReturn(manager.DcMgr, "DcGetCardIDDeviceID", + errReturnValue, errReturnValue, errors.New("mock err")) + defer mk2.Reset() + cardId, deviceId, err = manager.getCardIdAndDeviceId(0) + + convey.So(cardId, convey.ShouldEqual, common.RetError) + convey.So(deviceId, convey.ShouldEqual, common.RetError) + convey.So(err, convey.ShouldNotBeNil) + + }) + + mk := 
gomonkey.ApplyMethodReturn(manager.DcMgr, "DcGetCardIDDeviceID", returnValue, returnValue, nil) + defer mk.Reset() + + convey.Convey("get info from dcmi", t, func() { + testGetCardIdAndDeviceId(t, cardId, deviceId, err, manager) + }) + convey.Convey("get info from cache", t, func() { + testGetCardIdAndDeviceId(t, cardId, deviceId, err, manager) + }) + +} + +func testGetCardIdAndDeviceId(t *testing.T, cardId int32, deviceId int32, err error, manager *DeviceManager) { + cardId, deviceId, err = manager.getCardIdAndDeviceId(0) + + convey.So(cardId, convey.ShouldEqual, 0) + convey.So(deviceId, convey.ShouldEqual, 0) + convey.So(err, convey.ShouldBeNil) + +} +func init() { + config := hwlog.LogConfig{ + OnlyToStdout: true, + } + hwlog.InitRunLogger(&config, nil) +} diff --git a/mind-cluster/component/ascend-common/devmanager/hccn/hccn_tool.go b/mind-cluster/component/ascend-common/devmanager/hccn/hccn_tool.go new file mode 100644 index 0000000..b6388f4 --- /dev/null +++ b/mind-cluster/component/ascend-common/devmanager/hccn/hccn_tool.go @@ -0,0 +1,335 @@ +/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package hccn this for npu hccn info +package hccn + +import ( + "fmt" + "os" + "os/exec" + "strconv" + "strings" + + "ascend-common/common-utils/hwlog" + "ascend-common/common-utils/limiter" + "ascend-common/common-utils/utils" + "ascend-common/devmanager/common" +) + +const ( + space = " " + newLine = "\n" + colon = ":" + + // LinkUp npu interface up + LinkUp string = "UP" + // LinkDown npu interface down + LinkDown string = "DOWN" + + opticalPartLen = 2 + secondIndex = 2 + linkStatusPart = 3 + base64 = 64 + + cardHealthy = 0 + + normalCode = 1 + abnormalCode = 0 + + naValue = "NA" + notSupport = "not supported" + unknownStr = "Unknown!" + + limitSize = 1024 * 1024 +) + +func getInfoFromHccnTool(args ...string) (string, error) { + const hccnTool = "/usr/local/Ascend/driver/tools/hccn_tool" + if _, err := utils.CheckPath(hccnTool); err != nil { + return "", err + } + cmd := exec.Command(hccnTool, args...) + cmd.Env = []string{ + "PATH=" + os.Getenv("PATH"), + utils.LdLibPath + "=" + os.Getenv(utils.LdLibPath), + } + limitStdout := limiter.NewLimitedWriter(limitSize) + cmd.Stdout = limitStdout + cmd.Stderr = limiter.NewLimitedWriter(limitSize) + err := cmd.Run() + if err != nil { + return "", err + } + + return string(limitStdout.GetBufferBytes()), nil +} + +// GetNPULinkStatus exec "hccn_tool -i * -link -g" to get link status +func GetNPULinkStatus(phyID int32) (string, error) { + args := []string{"-i", strconv.Itoa(int(phyID)), "-link", "-g"} + // command example: hccn_tool -i 0 -link -g + // success result example is: link status: DOWN + outStr, err := getInfoFromHccnTool(args...) 
+ hwlog.RunLog.Debugf("hccn_tool command exec result: %v", outStr) + if err != nil { + return common.Abnormal, buildHccnErr(phyID, "link status", err) + } + replacedStr := strings.ReplaceAll(outStr, newLine, "") + outArr := strings.Split(replacedStr, space) + if len(outArr) != linkStatusPart { + return common.Abnormal, buildHccnErr(phyID, "link status", + fmt.Errorf("length of output %v is not equal to %v", outArr, linkStatusPart)) + } + + status := outArr[secondIndex] + hwlog.RunLog.Debugf("hccn_tool get npu link status: %s", status) + return status, nil +} + +// GetNPULinkSpeed exec "hccn_tool -i * -speed -g" to get link speed +func GetNPULinkSpeed(phyID int32) (int, error) { + args := []string{"-i", strconv.Itoa(int(phyID)), "-speed", "-g"} + // command example: hccn_tool -i 0 -speed -g + // success result example is: Speed: 100000 Mb/s + outStr, err := getInfoFromHccnTool(args...) + if err != nil { + return common.RetError, buildHccnErr(phyID, "link speed", err) + } + return getSpeedFromOutStr(outStr, phyID) +} + +func getSpeedFromOutStr(outStr string, phyID int32) (int, error) { + if strings.Contains(outStr, unknownStr) { + return common.RetError, buildHccnErr(phyID, "link speed", fmt.Errorf("npu link speed is unknown")) + } + replacedStr := strings.ReplaceAll(outStr, newLine, "") + outArr := strings.Split(replacedStr, space) + if len(outArr) != linkStatusPart { + return common.RetError, buildHccnErr(phyID, "link speed", fmt.Errorf("length of output %v is not equal to %v", + outArr, linkStatusPart)) + } + const midIndex = 1 + speed, err := strconv.Atoi(outArr[midIndex]) + if err != nil { + return common.RetError, buildHccnErr(phyID, "link speed", fmt.Errorf("covert speed from string failed: %s", err)) + } + + return speed, nil +} + +// GetNPULinkUpNum exec "hccn_tool -i * -link_stat -g" to get link up count +func GetNPULinkUpNum(phyID int32) (int, error) { + args := []string{"-i", strconv.Itoa(int(phyID)), "-link_stat", "-g"} + // command example: hccn_tool -i 
0 -link_stat -g + // success result include: [device x]link up count : y + outStr, err := getInfoFromHccnTool(args...) + if err != nil { + return common.RetError, buildHccnErr(phyID, "link stat", err) + } + + const ( + linkUpArrLen = 6 + linkUpStr = "link up count" + ) + linkUPCount := 0 + lines := strings.Split(outStr, newLine) + for _, line := range lines { + if line == "" || !strings.Contains(line, linkUpStr) { + continue + } + + linkUpArr := strings.Fields(line) + if len(linkUpArr) != linkUpArrLen { + return common.RetError, buildHccnErr(phyID, "link up num", fmt.Errorf("length of output %v is not "+ + "equal to %v", linkUpArr, linkUpArrLen)) + } + if linkUPCount, err = strconv.Atoi(linkUpArr[linkUpArrLen-1]); err != nil { + return common.RetError, buildHccnErr(phyID, "link up num", + fmt.Errorf("covert link up num from string failed: %s", err)) + } + return linkUPCount, nil + } + + return common.RetError, buildHccnErr(phyID, "link up num", fmt.Errorf("did not find link up count")) +} + +// GetNPUStatInfo exec "hccn_tool -i * -stat -g" to get stat info +func GetNPUStatInfo(phyID int32) (map[string]int, error) { + args := []string{"-i", strconv.Itoa(int(phyID)), "-stat", "-g"} + // command example: hccn_tool -i 0 -stat -g + // success result include: [device x]link up count : y + outStr, err := getInfoFromHccnTool(args...) 
+ if err != nil { + return nil, buildHccnErr(phyID, "stat", err) + } + lines := strings.Split(outStr, newLine) + statInfoMap := make(map[string]int) + const statPartLen = 2 + for _, line := range lines { + statParts := strings.Split(line, colon) + if len(statParts) != statPartLen || statParts[1] == "" { + continue + } + statNum, err := strconv.Atoi(statParts[1]) + if err != nil { + hwlog.RunLog.Errorf("covert stat num of [%s] from string failed: %s", statParts[1], err) + continue + } + statInfoMap[statParts[0]] = statNum + } + + return statInfoMap, nil +} + +// GetNPUOpticalInfo exec "hccn_tool -i * -optical -g" to get optical info +func GetNPUOpticalInfo(phyID int32) (map[string]string, error) { + args := []string{"-i", strconv.Itoa(int(phyID)), "-optical", "-g"} + // command example: hccn_tool -i 0 -optical -g + // success result include: [device x]link up count : y + outStr, err := getInfoFromHccnTool(args...) + if err != nil { + return nil, buildHccnErr(phyID, "optical", err) + } + lines := strings.Split(outStr, newLine) + opticalInfoMap := make(map[string]string) + for _, line := range lines { + opticalParts := strings.Split(line, colon) + if len(opticalParts) != opticalPartLen { + continue + } + opticalKey := strings.ReplaceAll(strings.TrimSpace(opticalParts[0]), space, "_") + opticalValue := strings.TrimSpace(opticalParts[1]) + opticalInfoMap[opticalKey] = opticalValue + } + + return opticalInfoMap, nil +} + +// GetNPUInterfaceTraffic exec "hccn_tool -i * -bandwidth -g" to get bandwidth info +func GetNPUInterfaceTraffic(phyID int32) (float64, float64, error) { + const ( + noTraffic = common.RetError + trafficPartLen = 4 + txStr = "TX:" + rxStr = "RX:" + ) + + args := []string{"-i", strconv.Itoa(int(phyID)), "-bandwidth", "-g"} + // command example: hccn_tool -i 0 -bandwidth -g + // success result has two lines: + // Bandwidth TX: 0.00 MB/sec + // Bandwidth RX: 0.00 MB/sec + outStr, err := getInfoFromHccnTool(args...) 
+ hwlog.RunLog.Debugf("hccn_tool command exec result: %v", outStr) + if err != nil { + return noTraffic, noTraffic, buildHccnErr(phyID, "interface traffic", err) + } + + var ( + tx = float64(noTraffic) + rx = float64(noTraffic) + ) + + lines := strings.Split(outStr, newLine) + for _, line := range lines { + if line == "" { + continue + } + + trafficArr := strings.Fields(line) + hwlog.RunLog.Debugf("npu bandwidth split as: %v", trafficArr) + if len(trafficArr) != trafficPartLen { + continue + } + if strings.Contains(line, txStr) { + tmpTx, err := strconv.ParseFloat(trafficArr[secondIndex], base64) + if err != nil { + hwlog.RunLog.Errorf("get float data from Bandwidth TX err: %s", err) + continue + } + tx = tmpTx + } + if strings.Contains(line, rxStr) { + tmpRx, err := strconv.ParseFloat(trafficArr[secondIndex], base64) + if err != nil { + hwlog.RunLog.Errorf("get float data from Bandwidth RX err: %s", err) + continue + } + rx = tmpRx + } + } + return tx, rx, nil +} + +// GetFloatDataFromStr get float data from string with space +func GetFloatDataFromStr(str, dataType string) float64 { + if str == "" || strings.Contains(str, naValue) || strings.Contains(str, notSupport) { + return common.RetError + } + dataParts := strings.Split(str, space) + if len(dataParts) != opticalPartLen { + errMsg := fmt.Sprintf("convert %v optical data type failed, "+ + "the length of optical data %v is %v not equal to %d. 
", dataType, dataParts, len(dataParts), opticalPartLen) + hwlog.RunLog.Error(errMsg) + return common.RetError + } + floatData, err := strconv.ParseFloat(dataParts[0], base64) + if err != nil { + hwlog.RunLog.Errorf("convert %v optical data type to a floating-point number failed, "+ + "get float data from string %v failed, err: %v", dataType, dataParts[0], err) + return common.RetError + } + return floatData +} + +// GetHealthCode return union healthy code +func GetHealthCode(healthCode uint32) int { + if healthCode == common.UnRetError { + return common.RetError + } + + if healthCode == cardHealthy { + return normalCode + } + return abnormalCode +} + +// GetLinkStatusCode return union link status code +func GetLinkStatusCode(status string) int { + if status == common.Abnormal { + return common.RetError + } + + if status == LinkUp { + return normalCode + } + return abnormalCode +} + +// GetNetworkHealthy return union network healthy code +func GetNetworkHealthy(netCode uint32) int { + if netCode == common.UnRetError { + return common.RetError + } + + if netCode == common.NetworkInit || netCode == common.NetworkSuccess { + return normalCode + } + return abnormalCode +} + +func buildHccnErr(phyID int32, msg string, err error) error { + return fmt.Errorf("phyID(%d),get npu %s info failed,error is :%v", phyID, msg, err) +} diff --git a/mind-cluster/component/ascend-common/devmanager/hccn/hccn_tool_test.go b/mind-cluster/component/ascend-common/devmanager/hccn/hccn_tool_test.go new file mode 100644 index 0000000..7d4fe17 --- /dev/null +++ b/mind-cluster/component/ascend-common/devmanager/hccn/hccn_tool_test.go @@ -0,0 +1,49 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package hccn this for npu hccn info +package hccn + +import ( + "fmt" + "strings" + "testing" +) + +func TestBuildHccnErr(t *testing.T) { + t.Run("normal error", func(t *testing.T) { + phyID := int32(1) + msg := "status" + originalErr := fmt.Errorf("permission denied") + + err := buildHccnErr(phyID, msg, originalErr) + + if !strings.Contains(err.Error(), "phyID(1)") { + t.Error("should contain phyID") + } + if !strings.Contains(err.Error(), "npu status") { + t.Error("should contain npu message") + } + if !strings.Contains(err.Error(), "permission denied") { + t.Error("should contain original error") + } + }) + + t.Run("nil error", func(t *testing.T) { + err := buildHccnErr(0, "", nil) + if !strings.Contains(err.Error(), "error is :nil") { + t.Error("should handle nil error") + } + }) +} diff --git a/mind-cluster/component/ascend-common/go.mod b/mind-cluster/component/ascend-common/go.mod new file mode 100644 index 0000000..e1e3bbb --- /dev/null +++ b/mind-cluster/component/ascend-common/go.mod @@ -0,0 +1,55 @@ +module ascend-common + +go 1.18 + +require ( + github.com/agiledragon/gomonkey/v2 v2.8.0 + github.com/fsnotify/fsnotify v1.6.0 + github.com/kubeflow/common v0.4.3 + github.com/smartystreets/goconvey v1.6.4 + k8s.io/api v0.25.3 + k8s.io/apimachinery v0.25.3 + k8s.io/client-go v0.25.3 +) + +require ( + github.com/PuerkitoBio/purell v1.1.1 // indirect + github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 // indirect + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/emicklei/go-restful/v3 v3.8.0 // indirect + 
github.com/go-logr/logr v1.2.3 // indirect + github.com/go-openapi/jsonpointer v0.19.5 // indirect + github.com/go-openapi/jsonreference v0.19.5 // indirect + github.com/go-openapi/swag v0.19.14 // indirect + github.com/gogo/protobuf v1.3.2 // indirect + github.com/golang/protobuf v1.5.2 // indirect + github.com/google/gnostic v0.5.7-v3refs // indirect + github.com/google/go-cmp v0.5.8 // indirect + github.com/google/gofuzz v1.1.0 // indirect + github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 // indirect + github.com/josharian/intern v1.0.0 // indirect + github.com/json-iterator/go v1.1.12 // indirect + github.com/jtolds/gls v4.20.0+incompatible // indirect + github.com/mailru/easyjson v0.7.6 // indirect + github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect + github.com/modern-go/reflect2 v1.0.2 // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d // indirect + golang.org/x/net v0.0.0-20220722155237-a158d28d115b // indirect + golang.org/x/oauth2 v0.0.0-20211104180415-d3ed0bb246c8 // indirect + golang.org/x/sys v0.8.0 // indirect + golang.org/x/term v0.0.0-20210927222741-03fcf44c2211 // indirect + golang.org/x/text v0.3.7 // indirect + golang.org/x/time v0.0.0-20220210224613-90d013bbcef8 // indirect + google.golang.org/appengine v1.6.7 // indirect + google.golang.org/protobuf v1.28.0 // indirect + gopkg.in/inf.v0 v0.9.1 // indirect + gopkg.in/yaml.v2 v2.4.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect + k8s.io/klog/v2 v2.70.1 // indirect + k8s.io/kube-openapi v0.0.0-20220803162953-67bda5d908f1 // indirect + k8s.io/utils v0.0.0-20220728103510-ee6ede2d64ed // indirect + sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 // indirect + sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect + sigs.k8s.io/yaml v1.3.0 // indirect +) diff --git a/mind-cluster/component/ascend-common/go.sum 
b/mind-cluster/component/ascend-common/go.sum new file mode 100644 index 0000000..000ced7 --- /dev/null +++ b/mind-cluster/component/ascend-common/go.sum @@ -0,0 +1,492 @@ +cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= +cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= +cloud.google.com/go v0.38.0/go.mod h1:990N+gfupTy94rShfmMCWGDn0LpTmnzTp2qbd1dvSRU= +cloud.google.com/go v0.44.1/go.mod h1:iSa0KzasP4Uvy3f1mN/7PiObzGgflwredwwASm/v6AU= +cloud.google.com/go v0.44.2/go.mod h1:60680Gw3Yr4ikxnPRS/oxxkBccT6SA1yMk63TGekxKY= +cloud.google.com/go v0.45.1/go.mod h1:RpBamKRgapWJb87xiFSdk4g1CME7QZg3uwTez+TSTjc= +cloud.google.com/go v0.46.3/go.mod h1:a6bKKbmY7er1mI7TEI4lsAkts/mkhTSZK8w33B4RAg0= +cloud.google.com/go v0.50.0/go.mod h1:r9sluTvynVuxRIOHXQEHMFffphuXHOMZMycpNR5e6To= +cloud.google.com/go v0.52.0/go.mod h1:pXajvRH/6o3+F9jDHZWQ5PbGhn+o8w9qiu/CffaVdO4= +cloud.google.com/go v0.53.0/go.mod h1:fp/UouUEsRkN6ryDKNW/Upv/JBKnv6WDthjR6+vze6M= +cloud.google.com/go v0.54.0/go.mod h1:1rq2OEkV3YMf6n/9ZvGWI3GWw0VoqH/1x2nd8Is/bPc= +cloud.google.com/go v0.56.0/go.mod h1:jr7tqZxxKOVYizybht9+26Z/gUq7tiRzu+ACVAMbKVk= +cloud.google.com/go v0.57.0/go.mod h1:oXiQ6Rzq3RAkkY7N6t3TcE6jE+CIBBbA36lwQ1JyzZs= +cloud.google.com/go v0.62.0/go.mod h1:jmCYTdRCQuc1PHIIJ/maLInMho30T/Y0M4hTdTShOYc= +cloud.google.com/go v0.65.0/go.mod h1:O5N8zS7uWy9vkA9vayVHs65eM1ubvY4h553ofrNHObY= +cloud.google.com/go/bigquery v1.0.1/go.mod h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o= +cloud.google.com/go/bigquery v1.3.0/go.mod h1:PjpwJnslEMmckchkHFfq+HTD2DmtT67aNFKH1/VBDHE= +cloud.google.com/go/bigquery v1.4.0/go.mod h1:S8dzgnTigyfTmLBfrtrhyYhwRxG72rYxvftPBK2Dvzc= +cloud.google.com/go/bigquery v1.5.0/go.mod h1:snEHRnqQbz117VIFhE8bmtwIDY80NLUZUMb4Nv6dBIg= +cloud.google.com/go/bigquery v1.7.0/go.mod h1://okPTzCYNXSlb24MZs83e2Do+h+VXtc4gLoIoXIAPc= +cloud.google.com/go/bigquery v1.8.0/go.mod h1:J5hqkt3O0uAFnINi6JXValWIb1v0goeZM77hZzJN/fQ= 
+cloud.google.com/go/datastore v1.0.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7TKLgZqpHYE= +cloud.google.com/go/datastore v1.1.0/go.mod h1:umbIZjpQpHh4hmRpGhH4tLFup+FVzqBi1b3c64qFpCk= +cloud.google.com/go/pubsub v1.0.1/go.mod h1:R0Gpsv3s54REJCy4fxDixWD93lHJMoZTyQ2kNxGRt3I= +cloud.google.com/go/pubsub v1.1.0/go.mod h1:EwwdRX2sKPjnvnqCa270oGRyludottCI76h+R3AArQw= +cloud.google.com/go/pubsub v1.2.0/go.mod h1:jhfEVHT8odbXTkndysNHCcx0awwzvfOlguIAii9o8iA= +cloud.google.com/go/pubsub v1.3.1/go.mod h1:i+ucay31+CNRpDW4Lu78I4xXG+O1r/MAHgjpRVR+TSU= +cloud.google.com/go/storage v1.0.0/go.mod h1:IhtSnM/ZTZV8YYJWCY8RULGVqBDmpoyjwiyrjsg+URw= +cloud.google.com/go/storage v1.5.0/go.mod h1:tpKbwo567HUNpVclU5sGELwQWBDZ8gh0ZeosJ0Rtdos= +cloud.google.com/go/storage v1.6.0/go.mod h1:N7U0C8pVQ/+NIKOBQyamJIeKQKkZ+mxpohlUTyfDhBk= +cloud.google.com/go/storage v1.8.0/go.mod h1:Wv1Oy7z6Yz3DshWRJFhqM/UCfaWIRTdp0RXyy7KQOVs= +cloud.google.com/go/storage v1.10.0/go.mod h1:FLPqc6j+Ki4BU591ie1oL6qBQGu2Bl/tZ9ullr3+Kg0= +dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= +github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= +github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= +github.com/PuerkitoBio/purell v1.1.1 h1:WEQqlqaGbrPkxLJWfBwQmfEAE1Z7ONdDLqrN38tNFfI= +github.com/PuerkitoBio/purell v1.1.1/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0= +github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 h1:d+Bc7a5rLufV/sSk/8dngufqelfh6jnri85riMAaF/M= +github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE= +github.com/agiledragon/gomonkey/v2 v2.8.0 h1:u2K2nNGyk0ippzklz1CWalllEB9ptD+DtSXeCX5O000= +github.com/agiledragon/gomonkey/v2 v2.8.0/go.mod h1:ap1AmDzcVOAz1YpeJ3TCzIgstoaWLA6jbbgxfB4w2iY= +github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod 
h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= +github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI= +github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= +github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= +github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= +github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/docopt/docopt-go v0.0.0-20180111231733-ee0de3bc6815/go.mod h1:WwZ+bS3ebgob9U8Nd0kOddGdZWjyMGR8Wziv+TBNwSE= +github.com/emicklei/go-restful/v3 v3.8.0 h1:eCZ8ulSerjdAiaNpF7GxXIE7ZCMo1moN1qX+S609eVw= +github.com/emicklei/go-restful/v3 v3.8.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= +github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= +github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= +github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= +github.com/fsnotify/fsnotify v1.6.0 h1:n+5WquG0fcWoWp6xPWfHdbskMCQaFnG6PfBrh1Ky4HY= +github.com/fsnotify/fsnotify v1.6.0/go.mod h1:sl3t1tCWJFWoRz9R8WJCbQihKKwmorjAbSClcnxKAGw= +github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU= +github.com/go-gl/glfw/v3.3/glfw 
v0.0.0-20191125211704-12ad95a8df72/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= +github.com/go-gl/glfw/v3.3/glfw v0.0.0-20200222043503-6f7a984d4dc4/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= +github.com/go-logr/logr v0.1.0/go.mod h1:ixOQHD9gLJUVQQ2ZOR7zLEifBX6tGkNJF4QyIY7sIas= +github.com/go-logr/logr v1.2.0/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/logr v1.2.3 h1:2DntVwHkVopvECVRSlL5PSo9eG+cAkDCuckLubN+rq0= +github.com/go-logr/logr v1.2.3/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-openapi/jsonpointer v0.19.3/go.mod h1:Pl9vOtqEWErmShwVjC8pYs9cog34VGT37dQOVbmoatg= +github.com/go-openapi/jsonpointer v0.19.5 h1:gZr+CIYByUqjcgeLXnQu2gHYQC9o73G2XUeOFYEICuY= +github.com/go-openapi/jsonpointer v0.19.5/go.mod h1:Pl9vOtqEWErmShwVjC8pYs9cog34VGT37dQOVbmoatg= +github.com/go-openapi/jsonreference v0.19.5 h1:1WJP/wi4OjB4iV8KVbH73rQaoialJrqv8gitZLxGLtM= +github.com/go-openapi/jsonreference v0.19.5/go.mod h1:RdybgQwPxbL4UEjuAruzK1x3nE69AqPYEJeo/TWfEeg= +github.com/go-openapi/swag v0.19.5/go.mod h1:POnQmlKehdgb5mhVOsnJFsivZCEZ/vjK9gh66Z9tfKk= +github.com/go-openapi/swag v0.19.14 h1:gm3vOOXfiuw5i9p5N9xJvfjvuofpyvLA9Wr6QfK5Fng= +github.com/go-openapi/swag v0.19.14/go.mod h1:QYRuS/SOXUCsnplDa677K7+DxSOj6IPNl/eQntq43wQ= +github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= +github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= +github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache v0.0.0-20191227052852-215e87163ea7/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/mock v1.1.1/go.mod 
h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= +github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= +github.com/golang/mock v1.3.1/go.mod h1:sBzyDLLjw3U8JLTeZvSv8jJB+tU5PVekmnlKIyFUx0Y= +github.com/golang/mock v1.4.0/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= +github.com/golang/mock v1.4.1/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= +github.com/golang/mock v1.4.3/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= +github.com/golang/mock v1.4.4/go.mod h1:l3mdAwkq5BuhzHwde/uurv3sEJeZMXNpwsxVWU71h+4= +github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= +github.com/golang/protobuf v1.3.4/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= +github.com/golang/protobuf v1.3.5/go.mod h1:6O5/vntMXwX2lRkT1hjjk0nAC1IDOTvTlVgjlRvqsdk= +github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8= +github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA= +github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= +github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= +github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= +github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8= +github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= +github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/golang/protobuf v1.5.2 h1:ROPKBNFfQgOUMifHyP+KYbvpjbdoFNs+aK7DXlji0Tw= 
+github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= +github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= +github.com/google/gnostic v0.5.7-v3refs h1:FhTMOKj2VhjpouxvWJAV1TL304uMlb9zcDqkl6cEI54= +github.com/google/gnostic v0.5.7-v3refs/go.mod h1:73MKFl6jIHelAJNaBGFzt3SPtZULs9dYrGFt8OiIsHQ= +github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= +github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.4.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.8 h1:e6P7q2lk1O+qJJb4BtCQXlK8vWEO8V1ZeuEdJNOqZyg= +github.com/google/go-cmp v0.5.8/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/gofuzz v1.1.0 h1:Hsa8mG0dQ46ij8Sl2AYJDUv1oA9/d6Vk+3LG99Oe02g= +github.com/google/gofuzz v1.1.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs= +github.com/google/martian/v3 v3.0.0/go.mod h1:y5Zk1BBys9G+gd6Jrk0W3cC1+ELVxBWuIGO+w/tUAp0= +github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= +github.com/google/pprof v0.0.0-20190515194954-54271f7e092f/go.mod 
h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= +github.com/google/pprof v0.0.0-20191218002539-d4f498aebedc/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= +github.com/google/pprof v0.0.0-20200212024743-f11f1df84d12/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= +github.com/google/pprof v0.0.0-20200229191704-1ebb73c60ed3/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= +github.com/google/pprof v0.0.0-20200430221834-fc25d7d30c6d/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= +github.com/google/pprof v0.0.0-20200708004538-1a94d8640e99/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= +github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= +github.com/google/uuid v1.1.2 h1:EVhdT+1Kseyi1/pUmXKaFxYsDNy9RQYkMWRH68J/W7Y= +github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg= +github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk= +github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 h1:EGx4pi6eqNxGaHF6qqu48+N2wcFQ5qg5FXgOdqsJ5d8= +github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= +github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= +github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= +github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= +github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= +github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= +github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= +github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod 
h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU= +github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk= +github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo= +github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU= +github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= +github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/pretty v0.2.0/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/kubeflow/common v0.4.3 h1:vVoOMNPOZK4wzZvQ4rsRLvC3SDi+J1fVKNHSXC/QRvU= +github.com/kubeflow/common v0.4.3/go.mod h1:Qb/5aON7/OWVkN8OnjRqqT0i8X/XzMekRIZ8lkLosj4= +github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= +github.com/mailru/easyjson v0.0.0-20190626092158-b2ccc519800e/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= +github.com/mailru/easyjson v0.7.6 h1:8yTIVnZgCoiM1TgqoeTl+LfU5Jg6/xL3QhGQnimLYnA= +github.com/mailru/easyjson v0.7.6/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/reflect2 v1.0.2 
h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= +github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e h1:fD57ERR4JtEqsWbfPhv4DMiApHyliiK5xCTNVSPiaAs= +github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= +github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= +github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d h1:zE9ykElWQ6/NYmHa3jpm/yHnI4xSofP+UP6SpjHcSeM= +github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc= +github.com/smartystreets/goconvey v1.6.4 h1:fv0U8FUIMPNf1L9lnHLvLhgicrIVChEkdzIKYqbNC9s= +github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA= +github.com/spf13/afero v1.2.2/go.mod h1:9ZxEEn6pIJ8Rxe320qSDBk6AsU0r9pR7Q4OcevTdifk= +github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= +github.com/stoewer/go-strcase v1.2.0/go.mod h1:IBiWB2sKIp3wVVQ3Y035++gc+knqhUQag1KpM8ahLw8= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/stretchr/testify 
v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= +github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PKk= +github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= +go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= +go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= +go.opencensus.io v0.22.3/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= +go.opencensus.io v0.22.4/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= +golang.org/x/exp v0.0.0-20190829153037-c13cbed26979/go.mod h1:86+5VVa7VpoJ4kLfm080zCjGlMRFzhUhsZKEZO7MGek= 
+golang.org/x/exp v0.0.0-20191030013958-a1ab85dbe136/go.mod h1:JXzH8nQsPlswgeRAPE3MuO9GYsAcnJvJ4vnMwN/5qkY= +golang.org/x/exp v0.0.0-20191129062945-2f5052295587/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= +golang.org/x/exp v0.0.0-20191227195350-da58074b4299/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= +golang.org/x/exp v0.0.0-20200119233911-0405dc783f0a/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= +golang.org/x/exp v0.0.0-20200207192155-f17229e696bd/go.mod h1:J/WKrq2StrnmMY6+EHIKF9dgMWnmCNThgcyBT1FY9mM= +golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6/go.mod h1:3jZMyOhIsHpP37uCMkUooju7aAi5cS1Q23tOzKc+0MU= +golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= +golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= +golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= +golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= +golang.org/x/lint v0.0.0-20190301231843-5614ed5bae6f/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= +golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/lint v0.0.0-20190409202823-959b441ac422/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/lint v0.0.0-20190909230951-414d861bb4ac/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/lint v0.0.0-20191125180803-fdd1cda4f05f/go.mod h1:5qLYkcX4OjUUV8bRuDixDT3tpyyb+LUpUlRWLxfhWrs= +golang.org/x/lint v0.0.0-20200130185559-910be7a94367/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= +golang.org/x/lint v0.0.0-20200302205851-738671d3881b/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= +golang.org/x/mobile 
v0.0.0-20190312151609-d3739f865fa6/go.mod h1:z+o9i4GpDbdi3rU15maQ/Ox0txvL9dWGYEHz965HBQE= +golang.org/x/mobile v0.0.0-20190719004257-d2bd2a29d028/go.mod h1:E/iHnbuqvinMTCcRqshq8CkpyQDoeVncDDYHnLhea+o= +golang.org/x/mod v0.0.0-20190513183733-4bf6d317e70e/go.mod h1:mXi4GBBbnImb6dmsKGUJ2LatrhH/nqhxcFungHvyanc= +golang.org/x/mod v0.1.0/go.mod h1:0QHyrYULN0/3qlju5TqG8bIK38QM8yzMo5ekMj3DlcY= +golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= +golang.org/x/mod v0.1.1-0.20191107180719-034126e5016b/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= +golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190501004415-9ce7a6920f09/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190503192946-f4e77d36d62c/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20190628185345-da137c7871d7/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net 
v0.0.0-20190724013045-ca1201d0de80/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20190827160401-ba9fcec4b297/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20191209160850-c0dbc17a3553/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200114155413-6afb5195e5aa/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200222125558-5a598a2470a0/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200301022130-244492dfa37a/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200501053045-e0ff5e5a1de5/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200506145744-7e3656a0809f/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200513185701-a91f0712d120/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200520182314-0ba52f642ac2/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200625001655-4c5254603344/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= +golang.org/x/net v0.0.0-20200707034311-ab3426394381/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= +golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= +golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b h1:PxfKdU9lEEDYjdIzOtC4qFWgkU2rGHdKlKowJSMN9h0= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod 
h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= +golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= +golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= +golang.org/x/oauth2 v0.0.0-20191202225959-858c2ad4c8b6/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= +golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= +golang.org/x/oauth2 v0.0.0-20211104180415-d3ed0bb246c8 h1:RerP+noqYHUQ8CMRcPlC2nvTa4dcBIjegkuWdcUDuqg= +golang.org/x/oauth2 v0.0.0-20211104180415-d3ed0bb246c8/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= +golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20200317015054-43a5402ce75a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20200625203802-6e8e738ad208/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod 
h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190502145724-3ef323f4f1fd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190507160741-ecd444e8653b/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190606165138-5da285871e9c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190624142023-c5567b49c5d0/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190726091711-fc99dfbffb4e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191001151750-bb3f8db39f24/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191204072324-ce4227a45e2e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191228213918-04cbcbbfeed8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200113162924-86b910548bc1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200122134326-e047566fdf82/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200212091648-12a6c2dcc1e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200302150141-5c8b2ff67527/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200331124033-c3d80250170d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 
+golang.org/x/sys v0.0.0-20200501052902-10377860bb8e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200511232937-7e40ca221e25/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200515095857-1151b9dac4a9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200523222454-059865788121/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200803210538-64077c9b5642/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.8.0 h1:EBmGv8NaZBZTWvrbjNoL6HVt+IVy3QDQpJs7VRIw3tU= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211 h1:JGgROgKl9N8DuW20oFS5gxc+lE67/N3FcwmBPMe7ArY= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/time 
v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/time v0.0.0-20220210224613-90d013bbcef8 h1:vVKdlvoWBphwdxWKrFZEuM0kGgGLxUOYcY4U/2Vjg44= +golang.org/x/time v0.0.0-20220210224613-90d013bbcef8/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= +golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190312151545-0bb0c0a6e846/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190312170243-e65039ee4138/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190425150028-36563e24a262/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/tools v0.0.0-20190506145303-2d16b83fe98c/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/tools v0.0.0-20190606124116-d0a3d012864b/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20190621195816-6e04913cbbac/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20190628153133-6cdbf07be9d0/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20190816200558-6889da9d5479/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20190911174233-4f2ddba30aff/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools 
v0.0.0-20191012152004-8de300cfc20a/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191113191852-77e3bb0ad9e7/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191115202509-3a792d9c32b2/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191125144606-a911d9008d1f/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191130070609-6e064ea0cf2d/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191216173652-a0e659d51361/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20191227053925-7b8e75db28f4/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200117161641-43d50277825c/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200122220014-bf1340f18c4a/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200204074204-1cc6d1ef6c74/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200207183749-b753a1ba74fa/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200212150539-ea181f53ac56/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200224181240-023911ca70b2/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200227222343-706bc42d1f0d/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200304193943-95d2e580d8eb/go.mod h1:o4KQGtdN14AW+yjsvvwRTJJuXz8XRtIHtEnmAXLyFUw= +golang.org/x/tools v0.0.0-20200312045724-11d5b4c81c7d/go.mod h1:o4KQGtdN14AW+yjsvvwRTJJuXz8XRtIHtEnmAXLyFUw= +golang.org/x/tools 
v0.0.0-20200331025713-a30bf2db82d4/go.mod h1:Sl4aGygMT6LrqrWclx+PTx3U+LnKx/seiNR+3G19Ar8= +golang.org/x/tools v0.0.0-20200501065659-ab2804fb9c9d/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20200512131952-2bc93b1c0c88/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20200515010526-7d3b6ebf133d/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20200618134242-20370b0cb4b2/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20200729194436-6467de6f59a7/go.mod h1:njjCfa9FT2d7l9Bc6FUM5FLjQPp3cFF28FI3qnDFljA= +golang.org/x/tools v0.0.0-20200804011535-6c149bb5ef0d/go.mod h1:njjCfa9FT2d7l9Bc6FUM5FLjQPp3cFF28FI3qnDFljA= +golang.org/x/tools v0.0.0-20200825202427-b303f430e36d/go.mod h1:njjCfa9FT2d7l9Bc6FUM5FLjQPp3cFF28FI3qnDFljA= +golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE= +google.golang.org/api v0.7.0/go.mod h1:WtwebWUNSVBH/HAw79HIFXZNqEvBhG+Ra+ax0hx3E3M= +google.golang.org/api v0.8.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg= +google.golang.org/api v0.9.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg= +google.golang.org/api v0.13.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= +google.golang.org/api v0.14.0/go.mod 
h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= +google.golang.org/api v0.15.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= +google.golang.org/api v0.17.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= +google.golang.org/api v0.18.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= +google.golang.org/api v0.19.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= +google.golang.org/api v0.20.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= +google.golang.org/api v0.22.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= +google.golang.org/api v0.24.0/go.mod h1:lIXQywCXRcnZPGlsd8NbLnOjtAoL6em04bJ9+z0MncE= +google.golang.org/api v0.28.0/go.mod h1:lIXQywCXRcnZPGlsd8NbLnOjtAoL6em04bJ9+z0MncE= +google.golang.org/api v0.29.0/go.mod h1:Lcubydp8VUV7KeIHD9z2Bys/sm/vGKnG1UHuDBSrHWM= +google.golang.org/api v0.30.0/go.mod h1:QGmEvQ87FHZNiUVJkT14jQNYJ4ZJjdRF23ZXz5138Fc= +google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= +google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= +google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= +google.golang.org/appengine v1.6.1/go.mod h1:i06prIuMbXzDqacNJfV5OdTW448YApPu5ww/cMBSeb0= +google.golang.org/appengine v1.6.5/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= +google.golang.org/appengine v1.6.6/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= +google.golang.org/appengine v1.6.7 h1:FZR1q0exgwxzPzp/aF+VccGrSfxfPpkBqjIIEq3ru6c= +google.golang.org/appengine v1.6.7/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= +google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= +google.golang.org/genproto v0.0.0-20190307195333-5fe7a883aa19/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= +google.golang.org/genproto v0.0.0-20190418145605-e7d98fc518a7/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= 
+google.golang.org/genproto v0.0.0-20190425155659-357c62f0e4bb/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= +google.golang.org/genproto v0.0.0-20190502173448-54afdca5d873/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= +google.golang.org/genproto v0.0.0-20190801165951-fa694d86fc64/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= +google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= +google.golang.org/genproto v0.0.0-20190911173649-1774047e7e51/go.mod h1:IbNlFCBrqXvoKpeg0TB2l7cyZUmoaFKYIwrEpbDKLA8= +google.golang.org/genproto v0.0.0-20191108220845-16a3f7862a1a/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20191115194625-c23dd37a84c9/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20191216164720-4f79533eabd1/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20191230161307-f3c370f40bfb/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20200115191322-ca5a22157cba/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20200122232147-0452cf42e150/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20200204135345-fa8e72b47b90/go.mod h1:GmwEX6Z4W5gMy59cAlVYjN9JhxgbQH6Gn+gFDQe2lzA= +google.golang.org/genproto v0.0.0-20200212174721-66ed5ce911ce/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200224152610-e50cd9704f63/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200228133532-8c2c7df3a383/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200305110556-506484158171/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200312145019-da6875a35672/go.mod 
h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200331122359-1ee6d9798940/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200430143042-b979b6f78d84/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200511104702-f5ebc3bea380/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200515170657-fc4c6c6a6587/go.mod h1:YsZOwe1myG/8QRHRsmBRE1LrgQY60beZKjly0O1fX9U= +google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= +google.golang.org/genproto v0.0.0-20200618031413-b414f8b61790/go.mod h1:jDfRM7FcilCzHH/e9qn6dsT145K34l5v+OpcnNgKAAA= +google.golang.org/genproto v0.0.0-20200729003335-053ba62fc06f/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= +google.golang.org/genproto v0.0.0-20200804131852-c06518451d9c/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= +google.golang.org/genproto v0.0.0-20200825200019-8632dd797987/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= +google.golang.org/genproto v0.0.0-20201019141844-1ed22bb0c154/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= +google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= +google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= +google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= +google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= +google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= +google.golang.org/grpc v1.26.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= +google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= +google.golang.org/grpc v1.27.1/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= +google.golang.org/grpc v1.28.0/go.mod 
h1:rpkK4SK4GF4Ach/+MFLZUBavHOvF2JJB5uozKKal+60= +google.golang.org/grpc v1.29.1/go.mod h1:itym6AZVZYACWQqET3MqgPpjcuV5QH3BxFS3IjizoKk= +google.golang.org/grpc v1.30.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak= +google.golang.org/grpc v1.31.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak= +google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= +google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= +google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= +google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= +google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= +google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGjtUeSXeh4= +google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c= +google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= +google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= +google.golang.org/protobuf v1.28.0 h1:w43yiav+6bVFTBQFZX0r7ipe9JQ1QsbMgHwbBziscLw= +google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 
v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f h1:BLraFXnmrev5lT+xlilqcH8XK9/i0At2xKjWk4p6zsU= +gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= +gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= +gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= +gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt0JzvZhAg= +honnef.co/go/tools v0.0.1-2020.1.3/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k= +honnef.co/go/tools v0.0.1-2020.1.4/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k= +k8s.io/api v0.25.3 h1:Q1v5UFfYe87vi5H7NU0p4RXC26PPMT8KOpr1TLQbCMQ= 
+k8s.io/api v0.25.3/go.mod h1:o42gKscFrEVjHdQnyRenACrMtbuJsVdP+WVjqejfzmI= +k8s.io/apimachinery v0.25.3 h1:7o9ium4uyUOM76t6aunP0nZuex7gDf8VGwkR5RcJnQc= +k8s.io/apimachinery v0.25.3/go.mod h1:jaF9C/iPNM1FuLl7Zuy5b9v+n35HGSh6AQ4HYRkCqwo= +k8s.io/client-go v0.25.3 h1:oB4Dyl8d6UbfDHD8Bv8evKylzs3BXzzufLiO27xuPs0= +k8s.io/client-go v0.25.3/go.mod h1:t39LPczAIMwycjcXkVc+CB+PZV69jQuNx4um5ORDjQA= +k8s.io/klog/v2 v2.0.0/go.mod h1:PBfzABfn139FHAV07az/IF9Wp1bkk3vpT2XSJ76fSDE= +k8s.io/klog/v2 v2.70.1 h1:7aaoSdahviPmR+XkS7FyxlkkXs6tHISSG03RxleQAVQ= +k8s.io/klog/v2 v2.70.1/go.mod h1:y1WjHnz7Dj687irZUWR/WLkLc5N1YHtjLdmgWjndZn0= +k8s.io/kube-openapi v0.0.0-20220803162953-67bda5d908f1 h1:MQ8BAZPZlWk3S9K4a9NCkIFQtZShWqoha7snGixVgEA= +k8s.io/kube-openapi v0.0.0-20220803162953-67bda5d908f1/go.mod h1:C/N6wCaBHeBHkHUesQOQy2/MZqGgMAFPqGsGQLdbZBU= +k8s.io/utils v0.0.0-20220728103510-ee6ede2d64ed h1:jAne/RjBTyawwAy0utX5eqigAwz/lQhTmy+Hr/Cpue4= +k8s.io/utils v0.0.0-20220728103510-ee6ede2d64ed/go.mod h1:jPW/WVKK9YHAvNhRxK0md/EJ228hCsBRufyofKtW8HA= +rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= +rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= +rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA= +sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 h1:iXTIw73aPyC+oRdyqqvVJuloN1p0AC/kzH07hu3NE+k= +sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= +sigs.k8s.io/structured-merge-diff/v4 v4.2.3 h1:PRbqxJClWWYMNV1dhaG4NsibJbArud9kFxnAMREiWFE= +sigs.k8s.io/structured-merge-diff/v4 v4.2.3/go.mod h1:qjx8mGObPmV2aSZepjQjbmb2ihdVs8cGKBraizNC69E= +sigs.k8s.io/yaml v1.3.0 h1:a2VclLzOGrwOHDiV8EfBGhvjHvP46CtW5j6POvhYGGo= +sigs.k8s.io/yaml v1.3.0/go.mod h1:GeOyir5tyXNByN85N/dRIT9es5UQNerPYEKK56eTBm8= diff --git a/mind-cluster/component/npu-exporter/.gitignore b/mind-cluster/component/npu-exporter/.gitignore new file mode 100644 index 0000000..723ef36 --- /dev/null 
+++ b/mind-cluster/component/npu-exporter/.gitignore @@ -0,0 +1 @@ +.idea \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/LICENSE b/mind-cluster/component/npu-exporter/LICENSE new file mode 100644 index 0000000..f49a4e1 --- /dev/null +++ b/mind-cluster/component/npu-exporter/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/README.md b/mind-cluster/component/npu-exporter/README.md new file mode 100644 index 0000000..4bde4a9 --- /dev/null +++ b/mind-cluster/component/npu-exporter/README.md @@ -0,0 +1,42 @@ +# NPU-Exporter + +# 组件介绍 + + +Prometheus(普罗米修斯)是一个开源的系统监测和警报工具包,Exporter就是专门为Prometheus提供数据源的组件。由于Prometheus社区的活跃和大量的使用,已经有很多厂商或者服务提供了Exporter,如Prometheus官方的Node Exporter,MySQL官方出的MySQL Server Exporter和NVIDIA的NVIDIA GPU Exporter。这些Exporter负责将特定监测对象的指标,转成Prometheus能够识别的数据格式,供Prometheus集成。NPU-Exporter是华为自研的专门收集华为NPU各种监测信息和指标,并封装成Prometheus专用数据格式的一个服务组件。 + + +# 编译NPU-Exporter + +1. 通过git拉取源码,获得npu-exporter。 + + 示例:Npu-Exporter源码放在/home/mind-cluster/component/npu-exporter目录下 + +2. 执行以下命令,进入Npu-Exporter构建目录,执行构建脚本,在“output”目录下生成二进制npu-exporter、yaml文件和Dockerfile等文件。 + + **cd** _/home/mind-cluster/component/_**npu-exporter/build/** + + **chmod +x build.sh** + + **./build.sh** + +3. 执行以下命令,查看**output**生成的软件列表。 + + **ll** _/home/mind-cluster/component/_**npu-exporter/output** + + ``` + drwxr-xr-x 2 root root 4096 Feb 23 07:10 . + drwxr-xr-x 10 root root 4096 Feb 23 07:10 ..
+ -r-------- 1 root root 623 Feb 23 07:10 Dockerfile + -r-------- 1 root root 623 Feb 23 07:10 Dockerfile-310P-1usoc + -r-------- 1 root root 623 Feb 23 07:10 metricConfiguration.json + -r-x------ 1 root root 25481072 Feb 23 07:10 npu-exporter + -r-------- 1 root root 3438 Feb 23 07:10 npu-exporter-310P-1usoc-v6.0.0.yaml + -r-------- 1 root root 3438 Feb 23 07:10 npu-exporter-v6.0.0.yaml + -r-------- 1 root root 623 Feb 23 07:10 pluginConfiguration.json + -r-x------ 1 root root 2579 Feb 23 07:10 run_for_310P_1usoc.sh + ``` + +# 说明 + +1. 当前Npu-Exporter仅支持http启动,如果需要使用https启动,请自行完成代码修改并适配Prometheus \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/build/Dockerfile b/mind-cluster/component/npu-exporter/build/Dockerfile new file mode 100644 index 0000000..24f9943 --- /dev/null +++ b/mind-cluster/component/npu-exporter/build/Dockerfile @@ -0,0 +1,21 @@ +FROM ubuntu:22.04 + +RUN useradd -d /home/HwHiAiUser -u 1000 -m -s /usr/sbin/nologin HwHiAiUser &&\ + usermod root -s /usr/sbin/nologin + +COPY ./npu-exporter /usr/local/bin/ +COPY ./metricConfiguration.json /usr/local/metricConfiguration.json +COPY ./pluginConfiguration.json /usr/local/pluginConfiguration.json + +RUN chown root:root /usr/local/bin/npu-exporter &&\ + chmod 750 -R /home/HwHiAiUser &&\ + chmod 550 /usr/local/bin/ &&\ + chmod 500 /usr/local/bin/npu-exporter &&\ + chmod 440 /usr/local/metricConfiguration.json &&\ + chmod 440 /usr/local/pluginConfiguration.json &&\ + echo 'umask 027' >> /etc/profile && \ + echo 'source /etc/profile' >> ~/.bashrc +ENV LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/add-ons:/usr/local/Ascend/driver/lib64:/usr/local/dcmi + +CMD /usr/local/bin/npu-exporter + diff --git a/mind-cluster/component/npu-exporter/build/Dockerfile-310P-1usoc b/mind-cluster/component/npu-exporter/build/Dockerfile-310P-1usoc new file mode 100644 index 0000000..5927f7d --- /dev/null +++ 
b/mind-cluster/component/npu-exporter/build/Dockerfile-310P-1usoc @@ -0,0 +1,31 @@ +FROM ubuntu:22.04 + +RUN groupadd -g 1000 HwHiAiUser && useradd -u 1000 -g HwHiAiUser -d /home/HwHiAiUser -m HwHiAiUser &&\ + groupadd -g 1101 HwDmUser && useradd -u 1101 -g HwDmUser -d /home/HwDmUser -m HwDmUser &&\ + groupadd -g 1102 HwBaseUser && useradd -u 1102 -g HwBaseUser -d /home/HwBaseUser -m HwBaseUser &&\ + usermod -a -G HwBaseUser HwHiAiUser &&\ + usermod -a -G HwDmUser HwHiAiUser &&\ + usermod -a -G HwBaseUser HwDmUser &&\ + usermod -a -G HwHiAiUser HwDmUser &&\ + usermod root -s /usr/sbin/nologin + +COPY ./npu-exporter /usr/local/bin/ +COPY ./run_for_310P_1usoc.sh / +COPY ./metricConfiguration.json /usr/local/metricConfiguration.json +COPY ./pluginConfiguration.json /usr/local/pluginConfiguration.json + +RUN chown root:root /usr/local/bin/npu-exporter &&\ + chmod 500 /run_for_310P_1usoc.sh &&\ + chmod 550 /usr/local/bin/ &&\ + chmod 500 /usr/local/bin/npu-exporter &&\ + chmod 440 /usr/local/metricConfiguration.json &&\ + chmod 440 /usr/local/pluginConfiguration.json &&\ + echo 'umask 027' >> /etc/profile && \ + echo 'source /etc/profile' >> ~/.bashrc + +RUN ln -s /lib /lib64 2>&1 >> /dev/null &&\ + mkdir -m 750 /var/driver -m 750 /var/dmp -m 750 /usr/slog -p -m 750 /home/drv/hdc_ppc &&\ + chown HwDmUser:HwDmUser /var/dmp &&\ + chown HwHiAiUser:HwHiAiUser /var/driver &&\ + chown HwHiAiUser:HwHiAiUser /home/drv/hdc_ppc &&\ + chown HwHiAiUser:HwHiAiUser /usr/slog \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/build/build.sh b/mind-cluster/component/npu-exporter/build/build.sh new file mode 100644 index 0000000..16c101d --- /dev/null +++ b/mind-cluster/component/npu-exporter/build/build.sh @@ -0,0 +1,80 @@ +#!/bin/bash +# Perform build npu-exporter +# Copyright @ Huawei Technologies CO., Ltd. 2020-2023. 
All rights reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +set -e +CUR_DIR=$(dirname $(readlink -f $0)) +TOP_DIR=$(realpath "${CUR_DIR}"/..) +export GO111MODULE="on" +VER_FILE="${TOP_DIR}"/service_config.ini +build_version="v6.0.0" +if [ -f "$VER_FILE" ]; then + line=$(sed -n '1p' "$VER_FILE" 2>&1) + #keep the chars after '=' and add char 'v', the final example is v3.0.0 + build_version="v"${line#*=} +fi + +arch=$(arch 2>&1) +echo "Build Architecture is" "${arch}" + +OUTPUT_NAME="npu-exporter" +DOCKER_FILE_NAME="Dockerfile" +A200ISOC_DOCKER_FILE_NAME="Dockerfile-310P-1usoc" +A200ISOC_RUN_SHELL="run_for_310P_1usoc.sh" + +function clean() { + rm -rf "${TOP_DIR}"/output + mkdir -p "${TOP_DIR}"/output +} + +function build() { + cd "${TOP_DIR}/cmd/npu-exporter" + export CGO_CFLAGS="-fstack-protector-strong -D_FORTIFY_SOURCE=2 -O2 -fPIC -ftrapv" # exported so that go build (cgo) actually sees the hardening flags + export CGO_CPPFLAGS="-fstack-protector-strong -D_FORTIFY_SOURCE=2 -O2 -fPIC -ftrapv" + go build -mod=mod -buildmode=pie -ldflags "-s -extldflags=-Wl,-z,now -X huawei.com/npu-exporter/v6/versions.BuildName=${OUTPUT_NAME} \ + -X huawei.com/npu-exporter/v6/versions.BuildVersion=${build_version}_linux-${arch}" \ + -o ${OUTPUT_NAME} + ls ${OUTPUT_NAME} + if [ $?
-ne 0 ]; then + echo "fail to find npu-exporter" + exit 1 + fi +} + +function mv_file() { + mv "${TOP_DIR}"/cmd/npu-exporter/${OUTPUT_NAME} "${TOP_DIR}"/output + cp "${TOP_DIR}"/build/npu-exporter.yaml "${TOP_DIR}"/output/npu-exporter-"${build_version}".yaml + cp "${TOP_DIR}"/build/npu-exporter-310P-1usoc.yaml "${TOP_DIR}"/output/npu-exporter-310P-1usoc-"${build_version}".yaml + cp "${TOP_DIR}"/build/metricConfiguration.json "${TOP_DIR}"/output/ + cp "${TOP_DIR}"/build/pluginConfiguration.json "${TOP_DIR}"/output/ + sed -i "s/npu-exporter:.*/npu-exporter:${build_version}/" "${TOP_DIR}"/output/npu-exporter-"${build_version}".yaml + sed -i "s/npu-exporter:.*/npu-exporter:${build_version}/" "${TOP_DIR}"/output/npu-exporter-310P-1usoc-"${build_version}".yaml + cp "${TOP_DIR}"/build/${DOCKER_FILE_NAME} "${TOP_DIR}"/output + cp "${TOP_DIR}"/build/${A200ISOC_DOCKER_FILE_NAME} "${TOP_DIR}"/output + cp "${TOP_DIR}"/build/${A200ISOC_RUN_SHELL} "${TOP_DIR}"/output + chmod 400 "${TOP_DIR}"/output/* + chmod 500 "${TOP_DIR}"/output/${OUTPUT_NAME} + chmod 500 "${TOP_DIR}"/output/${A200ISOC_RUN_SHELL} + +} + +function main() { + clean + build + mv_file +} + +main diff --git a/mind-cluster/component/npu-exporter/build/build_ch.sh b/mind-cluster/component/npu-exporter/build/build_ch.sh new file mode 100644 index 0000000..878fcbd --- /dev/null +++ b/mind-cluster/component/npu-exporter/build/build_ch.sh @@ -0,0 +1,74 @@ +#!/bin/bash +# Perform build npu-exporter +# Copyright @ Huawei Technologies CO., Ltd. 2025-2025. All rights reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +set -e +CUR_DIR=$(dirname $(readlink -f $0)) +TOP_DIR=$(realpath "${CUR_DIR}"/..) +export GO111MODULE="on" +VER_FILE="${TOP_DIR}"/service_config.ini +build_version="v6.0.0" +if [ -f "$VER_FILE" ]; then + line=$(sed -n '1p' "$VER_FILE" 2>&1) + #keep the chars after '=' and add char 'v', the final example is v3.0.0 + build_version="v"${line#*=} +fi + +arch=$(arch 2>&1) +echo "Build Architecture is" "${arch}" + +OUTPUT_NAME="npu-exporter" +DOCKER_FILE_NAME="Dockerfile" + + +function clean() { + rm -rf "${TOP_DIR}"/output + mkdir -p "${TOP_DIR}"/output +} + +function build() { + cd "${TOP_DIR}/cmd/npu-exporter" + export CGO_CFLAGS="-fstack-protector-strong -D_FORTIFY_SOURCE=2 -O2 -fPIC -ftrapv" # exported so that go build (cgo) actually sees the hardening flags + export CGO_CPPFLAGS="-fstack-protector-strong -D_FORTIFY_SOURCE=2 -O2 -fPIC -ftrapv" + go build -mod=mod -buildmode=pie -ldflags "-s -extldflags=-Wl,-z,now -X huawei.com/npu-exporter/v6/versions.BuildName=${OUTPUT_NAME} \ + -X huawei.com/npu-exporter/v6/versions.BuildVersion=${build_version}_linux-${arch}" \ + -o ${OUTPUT_NAME} + ls ${OUTPUT_NAME} + if [ $?
-ne 0 ]; then + echo "fail to find npu-exporter" + exit 1 + fi +} + +function mv_file() { + mv "${TOP_DIR}"/cmd/npu-exporter/${OUTPUT_NAME} "${TOP_DIR}"/output + cp "${TOP_DIR}"/build/npu-exporter.yaml "${TOP_DIR}"/output/npu-exporter-"${build_version}".yaml + sed -i "s/npu-exporter:.*/npu-exporter:${build_version}/" "${TOP_DIR}"/output/npu-exporter-"${build_version}".yaml + sed -i "s/ascend*/alan/" "${TOP_DIR}"/output/npu-exporter-"${build_version}".yaml + + cp "${TOP_DIR}"/build/${DOCKER_FILE_NAME} "${TOP_DIR}"/output + chmod 400 "${TOP_DIR}"/output/* + chmod 500 "${TOP_DIR}"/output/${OUTPUT_NAME} + +} + +function main() { + clean + build + mv_file +} + +main diff --git a/mind-cluster/component/npu-exporter/build/metricConfiguration.json b/mind-cluster/component/npu-exporter/build/metricConfiguration.json new file mode 100644 index 0000000..3dbd82b --- /dev/null +++ b/mind-cluster/component/npu-exporter/build/metricConfiguration.json @@ -0,0 +1,13 @@ +[ + {"metricsGroup": "ddr", "state": "ON"}, + {"metricsGroup": "hccs", "state": "ON"}, + {"metricsGroup": "npu", "state": "ON"}, + {"metricsGroup": "network", "state": "ON"}, + {"metricsGroup": "pcie", "state": "ON"}, + {"metricsGroup": "roce", "state": "ON"}, + {"metricsGroup": "sio", "state": "ON"}, + {"metricsGroup": "vnpu", "state": "ON"}, + {"metricsGroup": "version", "state": "ON"}, + {"metricsGroup": "optical", "state": "ON"}, + {"metricsGroup": "hbm", "state": "ON"} +] \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/build/npu-exporter-310P-1usoc.yaml b/mind-cluster/component/npu-exporter/build/npu-exporter-310P-1usoc.yaml new file mode 100644 index 0000000..3b6e22f --- /dev/null +++ b/mind-cluster/component/npu-exporter/build/npu-exporter-310P-1usoc.yaml @@ -0,0 +1,167 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: npu-exporter +--- +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: exporter-network-policy + namespace: npu-exporter +spec: + 
podSelector: + matchLabels: + app: npu-exporter + policyTypes: + - Ingress + - Egress + ingress: + - from: + - namespaceSelector: {} + podSelector: + matchLabels: + app: prometheus + egress: + - to: + - namespaceSelector: {} + podSelector: + matchLabels: + app: prometheus +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: npu-exporter-310p-1usoc + namespace: npu-exporter +spec: + selector: + matchLabels: + app: npu-exporter + template: + metadata: + ##### For Kubernetes versions lower than 1.19, seccomp is used with annotations. + annotations: + seccomp.security.alpha.kubernetes.io/pod: runtime/default + labels: + app: npu-exporter + spec: + ##### For Kubernetes version 1.19 and above, seccomp is used with securityContext:seccompProfile +# securityContext: +# seccompProfile: +# type: RuntimeDefault + automountServiceAccountToken: false + nodeSelector: + workerselector: dls-worker-node + servertype: soc + containers: + - name: npu-exporter + image: npu-exporter:v5.0.RC1 + resources: + requests: + memory: 1000Mi + cpu: 1000m + limits: + memory: 1000Mi + cpu: 1000m + imagePullPolicy: Never + command: [ "/bin/bash", "-c", "/run_for_310P_1usoc.sh"] + # pair firstly + securityContext: + privileged: true + readOnlyRootFilesystem: true + runAsUser: 0 + runAsGroup: 0 + ports: + - name: http + containerPort: 8082 + protocol: TCP + volumeMounts: + - name: log-npu-exporter + mountPath: /var/log/mindx-dl/npu-exporter + - name: localtime + mountPath: /etc/localtime + readOnly: true + - name: ascend-driver + mountPath: /usr/local/Ascend/driver + readOnly: true + - name: ascend-dcmi + mountPath: /usr/local/dcmi + readOnly: true + - name: libyaml + mountPath: /usr/lib64/libyaml-0.so.2 + readOnly: true + - name: docker-shim # delete when only use containerd + mountPath: /run/dockershim.sock + readOnly: true + - name: docker # delete when only use containerd + mountPath: /run/docker/containerd/containerd.sock + readOnly: true + - name: cri-dockerd # reserve when k8s 
version is 1.24+ and the container runtime is docker + mountPath: /var/run/cri-dockerd.sock + readOnly: true + - name: containerd + mountPath: /run/containerd + readOnly: true + - name: tmp + mountPath: /tmp + - name: dmp + mountPath: /var/dmp_daemon + readOnly: true + - name: slogd + mountPath: /var/slogd + readOnly: true + - name: hbasic + mountPath: /etc/hdcBasic.cfg + readOnly: true + - name: slogconf + mountPath: /etc/slog.conf + readOnly: true + volumes: + - name: log-npu-exporter + hostPath: + path: /var/log/mindx-dl/npu-exporter + type: Directory + - name: localtime + hostPath: + path: /etc/localtime + - name: libyaml + hostPath: + path: /usr/lib64/libyaml-0.so.2 + type: File + - name: ascend-driver + hostPath: + path: /usr/local/Ascend/driver + - name: ascend-dcmi + hostPath: + path: /usr/local/dcmi + - name: docker-shim # delete when only use containerd + hostPath: + path: /run/dockershim.sock + - name: docker # delete when only use containerd + hostPath: + path: /run/docker/containerd/containerd.sock + - name: cri-dockerd # reserve when k8s version is 1.24+ and the container runtime is docker + hostPath: + path: /var/run/cri-dockerd.sock + - name: containerd + hostPath: + path: /run/containerd + - name: tmp + hostPath: + path: /tmp + - name: dmp + hostPath: + path: /var/dmp_daemon + type: File + - name: slogd + hostPath: + path: /var/slogd + type: File + - name: hbasic + hostPath: + path: /etc/hdcBasic.cfg + type: File + - name: slogconf + hostPath: + path: /etc/slog.conf + type: File \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/build/npu-exporter.yaml b/mind-cluster/component/npu-exporter/build/npu-exporter.yaml new file mode 100644 index 0000000..970e3cf --- /dev/null +++ b/mind-cluster/component/npu-exporter/build/npu-exporter.yaml @@ -0,0 +1,140 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: npu-exporter +--- +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: exporter-network-policy + 
namespace: npu-exporter +spec: + podSelector: + matchLabels: + app: npu-exporter + policyTypes: + - Ingress + - Egress + ingress: + - from: + - namespaceSelector: {} + podSelector: + matchLabels: + app: prometheus + egress: + - to: + - namespaceSelector: {} + podSelector: + matchLabels: + app: prometheus +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: npu-exporter + namespace: npu-exporter +spec: + selector: + matchLabels: + app: npu-exporter + template: + metadata: + ##### For Kubernetes versions lower than 1.19, seccomp is used with annotations. + annotations: + seccomp.security.alpha.kubernetes.io/pod: runtime/default + labels: + app: npu-exporter + spec: + ##### For Kubernetes version 1.19 and above, seccomp is used with securityContext:seccompProfile +# securityContext: +# seccompProfile: +# type: RuntimeDefault + automountServiceAccountToken: false + nodeSelector: + workerselector: dls-worker-node + containers: + - name: npu-exporter + image: npu-exporter:v5.0.RC1 + resources: + requests: + memory: 1000Mi + cpu: 1000m + limits: + memory: 1000Mi + cpu: 1000m + imagePullPolicy: Never + command: [ "/bin/bash", "-c", "--"] + # pair firstly + args: [ "umask 027;npu-exporter -port=8082 -ip=0.0.0.0 -updateTime=5 + -logFile=/var/log/mindx-dl/npu-exporter/npu-exporter.log -logLevel=0 -containerMode=docker" ] + securityContext: + privileged: true + readOnlyRootFilesystem: true + runAsUser: 0 + runAsGroup: 0 + ports: + - name: http + containerPort: 8082 + protocol: TCP + volumeMounts: + - name: log-npu-exporter + mountPath: /var/log/mindx-dl/npu-exporter + - name: localtime + mountPath: /etc/localtime + readOnly: true + - name: ascend-driver + mountPath: /usr/local/Ascend/driver + readOnly: true + - name: ascend-dcmi + mountPath: /usr/local/dcmi + readOnly: true + - name: docker-shim # delete when only use containerd or isula + mountPath: /var/run/dockershim.sock + readOnly: true + - name: docker # delete when only use containerd or isula + mountPath: 
/var/run/docker + readOnly: true + - name: cri-dockerd # reserve when k8s version is 1.24+ and the container runtime is docker + mountPath: /var/run/cri-dockerd.sock + readOnly: true + - name: containerd # delete when only use isula + mountPath: /run/containerd + readOnly: true + - name: isulad # delete when use containerd or docker + mountPath: /run/isulad.sock + readOnly: true + - name: tmp + mountPath: /tmp + volumes: + - name: log-npu-exporter + hostPath: + path: /var/log/mindx-dl/npu-exporter + type: Directory + - name: localtime + hostPath: + path: /etc/localtime + - name: ascend-driver + hostPath: + path: /usr/local/Ascend/driver + - name: ascend-dcmi + hostPath: + path: /usr/local/dcmi + - name: docker-shim # delete when only use containerd or isula + hostPath: + path: /var/run/dockershim.sock + - name: docker # delete when only use containerd or isula + hostPath: + path: /var/run/docker + - name: cri-dockerd # reserve when k8s version is 1.24+ and the container runtime is docker + hostPath: + path: /var/run/cri-dockerd.sock + - name: containerd # delete when only use isula + hostPath: + path: /run/containerd + - name: isulad # delete when use containerd or docker + hostPath: + path: /run/isulad.sock + - name: tmp + hostPath: + path: /tmp + diff --git a/mind-cluster/component/npu-exporter/build/pluginConfiguration.json b/mind-cluster/component/npu-exporter/build/pluginConfiguration.json new file mode 100644 index 0000000..68823e0 --- /dev/null +++ b/mind-cluster/component/npu-exporter/build/pluginConfiguration.json @@ -0,0 +1,4 @@ +[ + {"metricsGroup": "MyPlugin", "state": "OFF"}, + {"metricsGroup": "text", "state": "ON"} +] \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/build/run_for_310P_1usoc.sh b/mind-cluster/component/npu-exporter/build/run_for_310P_1usoc.sh new file mode 100644 index 0000000..055ed41 --- /dev/null +++ b/mind-cluster/component/npu-exporter/build/run_for_310P_1usoc.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# 
Start slogd, dmp_daemon and npu-exporter in the 310P 1usoc container +# Copyright @ Huawei Technologies CO., Ltd. 2022-2022. All rights reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +set -e + +# run the log process (slogd) in the background +echo -e "[INFO]\t $(date +"%F %T:%N")\t start slogd server in background" +su - HwHiAiUser -c "export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/:/usr/lib64 && /var/slogd -d &" +echo -e "[INFO]\t $(date +"%F %T:%N")\t start dmp_daemon server in background" +# run the dcmi interface process (dmp_daemon) in the background +su - HwDmUser -c "export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/:/usr/lib64 && /var/dmp_daemon -I -M -U 8087 &" + +export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/add-ons:/usr/local/Ascend/driver/lib64:/usr/local/dcmi +# the host is openEuler, so "endpoint" and "containerd" are set explicitly to work with the default "-containerMode=docker" +# on openEuler the socket paths behind "endpoint" and "containerd" are not in the default locations +echo -e "[INFO]\t $(date +"%F %T:%N")\t start npu-exporter server" +/usr/local/bin/npu-exporter -port=8082 -ip=0.0.0.0 -updateTime=5 -logFile=/var/log/mindx-dl/npu-exporter/npu-exporter.log -logLevel=0 -containerMode=docker -endpoint=/run/dockershim.sock -containerd=/run/docker/containerd/containerd.sock + diff --git a/mind-cluster/component/npu-exporter/build/test.sh
b/mind-cluster/component/npu-exporter/build/test.sh new file mode 100644 index 0000000..097eb3a --- /dev/null +++ b/mind-cluster/component/npu-exporter/build/test.sh @@ -0,0 +1,75 @@ +#!/bin/bash +# Perform test for npu-exporter +# Copyright @ Huawei Technologies CO., Ltd. 2020-2020. All rights reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +set -e + + +# execute go test and echo result to report files +function execute_test() { + if ! (go test -v -race -coverprofile cov.out "${TOP_DIR}"/... >./"$file_input") + then + echo '****** go test cases error! ******' + cat $file_input + exit 1 + else + gocov convert cov.out | gocov-html >"$file_detail_output" + gotestsum --junitfile unit-tests.xml "${TOP_DIR}"/... + + total_coverage=$(go tool cover -func=cov.out | grep "total:" | awk '{print $3}'| sed 's/%//') + # round up + coverage=$(echo "$total_coverage" | awk '{if ($1 >= 0) print ($1 == int($1)) ? int($1) : int($1) + 1;\ + else print ($1 == int($1)) ? int($1) : int($1)}') + if [[ $coverage -ge 80 ]]; then + echo "coverage passed: $coverage%" + exit 0 + else + echo "coverage failed: $coverage%, it needs to be greater than 80%." 
+ exit 1 + fi + fi +} + + +export GO111MODULE="on" +export PATH=$GOPATH/bin:$PATH +export GOFLAGS="-gcflags=all=-l" +unset GOPATH +# if didn't install the following tools, please install firstly +#go get -insecure github.com/axw/gocov/gocov +#go get github.com/matm/gocov-html +CUR_DIR=$(dirname "$(readlink -f "$0")") +TOP_DIR=$(realpath "${CUR_DIR}"/..) + +file_input='testExporter.txt' +file_detail_output='api.html' + +if [ -f "${TOP_DIR}"/test ]; then + rm -rf "${TOP_DIR}"/test +fi +mkdir -p "${TOP_DIR}"/test +cd "${TOP_DIR}"/test +echo "clean old version test results" + +if [ -f "$file_input" ]; then + rm -rf "$file_input" +fi +if [ -f "$file_detail_output" ]; then + rm -rf "$file_detail_output" +fi + +echo "************************************* Start LLT Test *************************************" +execute_test +echo "************************************* End LLT Test *************************************" diff --git a/mind-cluster/component/npu-exporter/cmd/npu-exporter/main.go b/mind-cluster/component/npu-exporter/cmd/npu-exporter/main.go new file mode 100644 index 0000000..700b248 --- /dev/null +++ b/mind-cluster/component/npu-exporter/cmd/npu-exporter/main.go @@ -0,0 +1,545 @@ +/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package main +package main + +import ( + "context" + "errors" + "flag" + "fmt" + "log" + "net" + "net/http" + "os" + "regexp" + "strconv" + "strings" + "sync" + "time" + + "github.com/influxdata/telegraf/plugins/common/shim" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" + + "ascend-common/api" + "ascend-common/common-utils/hwlog" + "ascend-common/common-utils/limiter" + "ascend-common/devmanager" + "ascend-common/devmanager/common" + colcommon "huawei.com/npu-exporter/v6/collector/common" + "huawei.com/npu-exporter/v6/collector/config" + "huawei.com/npu-exporter/v6/collector/container" + _ "huawei.com/npu-exporter/v6/platforms/inputs/npu" + "huawei.com/npu-exporter/v6/platforms/prom" + "huawei.com/npu-exporter/v6/plugins" + "huawei.com/npu-exporter/v6/utils/logger" + "huawei.com/npu-exporter/v6/versions" +) + +var ( + port int + updateTime int + ip = "" + version bool + concurrency int + containerMode = "" + containerd = "" + endpoint = "" + limitIPReq = "" + platform = "" + textMetricsFilePath = "" + limitIPConn int + limitTotalConn int + cacheSize int + profilingTime int + hccsBWProfilingTime int + pollInterval time.Duration + deviceResetTimeout int +) + +const ( + portConst = 8082 + updateTimeConst = 5 + cacheTime = 100 * time.Second + portLeft = 1025 + portRight = 40000 + oneMinute = 60 + defaultConcurrency = 5 + defaultLogFile = "/var/log/mindx-dl/npu-exporter/npu-exporter.log" + containerModeDocker = "docker" + containerModeContainerd = "containerd" + containerModeIsula = "isula" + unixPre = "unix://" + timeout = 10 + maxHeaderBytes = 1024 + // tenDays ten days + tenDays = 10 + maxIPConnLimit = 128 + maxConcurrency = 512 + defaultConnection = 20 + maxProfilingTime = 2000 + minHccsBWProfilingTime = 1 + maxHccsBWProfilingTime = 1000 + defaultShutDownTimeout = 30 * time.Second +) + +const ( + prometheusPlatform = "Prometheus" + telegrafPlatform = "Telegraf" + pollIntervalStr = 
"poll_interval" + platformStr = "platform" + defaultProfilingTime = 200 + defaultHccsBwProfilingTime = 200 +) + +func main() { + flag.Parse() + if version { + fmt.Printf("NPU-exporter version: %s \n", versions.BuildVersion) + return + } + err := logger.InitLogger(platform) + if err != nil { + fmt.Fprintf(os.Stderr, "%v", err) + return + } + initPaprams() + err = paramValid(platform) + if err != nil { + return + } + dmgr, err := devmanager.AutoInit("", deviceResetTimeout) + if err != nil { + logger.Errorf("new npu collector failed, error is %v", err) + return + } + logger.Infof("npu exporter starting and the version is %s", versions.BuildVersion) + deviceParser := container.MakeDevicesParser(readCntMonitoringFlags()) + defer deviceParser.Close() + + if err := deviceParser.Init(); err != nil { + logger.Errorf("failed to init devices parser: %v", err) + } + deviceParser.Timeout = time.Duration(updateTime) * time.Second + + colcommon.Collector = colcommon.NewNpuCollector(cacheTime, time.Duration(updateTime)*time.Second, deviceParser, dmgr) + plugins.InitTextMetricsDesc(textMetricsFilePath) + plugins.RegisterPlugin() + config.Register(colcommon.Collector) + + ctx, cancel := context.WithCancel(context.Background()) + wg := &sync.WaitGroup{} + colcommon.InitCardInfo(wg, ctx, colcommon.Collector) + colcommon.StartContainerInfoCollect(ctx, cancel, wg, colcommon.Collector) + + colcommon.StartCollect(wg, ctx, colcommon.Collector) + switch platform { + case prometheusPlatform: + prometheusProcss(wg, ctx, cancel) + case telegrafPlatform: + telegrafProcess() + default: + err = fmt.Errorf("err platform input") + } + wg.Wait() +} + +func prometheusProcss(wg *sync.WaitGroup, ctx context.Context, cancel context.CancelFunc) { + c := prom.NewPrometheusCollector(colcommon.Collector) + reg := prometheus.NewRegistry() + reg.MustRegister(c) + + wg.Add(1) + go func() { + startServe(ctx, cancel, reg) + wg.Done() + }() +} + +func initPaprams() { + 
common.SetHccsBWProfilingTime(hccsBWProfilingTime) + common.SetExternalParams(profilingTime) +} + +func paramValid(platform string) error { + var err error + switch platform { + case prometheusPlatform: + err = paramValidInPrometheus() + case telegrafPlatform: + err = paramValidInTelegraf() + default: + err = fmt.Errorf("err platform input") + } + if err != nil { + logger.Error(err) + return err + } + return nil +} + +func initConfig() *limiter.HandlerConfig { + conf := &limiter.HandlerConfig{ + PrintLog: true, + Method: http.MethodGet, + LimitBytes: limiter.DefaultDataLimit, + TotalConCurrency: concurrency, + IPConCurrency: limitIPReq, + CacheSize: limiter.DefaultCacheSize, + } + return conf +} + +func newServerAndListener(conf *limiter.HandlerConfig) (*http.Server, net.Listener) { + handler, err := limiter.NewLimitHandlerV2(http.DefaultServeMux, conf) + if err != nil { + hwlog.RunLog.Error(err) + return nil, nil + } + s := &http.Server{ + Addr: ip + ":" + strconv.Itoa(port), + Handler: handler, + ReadTimeout: timeout * time.Second, + WriteTimeout: timeout * time.Second, + MaxHeaderBytes: maxHeaderBytes, + ErrorLog: log.New(&hwlog.SelfLogWriter{}, "", log.Lshortfile), + } + ln, err := net.Listen("tcp", s.Addr) + if err != nil { + logger.Errorf("listen ip and port error: %v", err) + return nil, nil + } + limitLs, err := limiter.LimitListener(ln, limitTotalConn, limitIPConn, limiter.DefaultCacheSize) + if err != nil { + hwlog.RunLog.Error(err) + return nil, nil + } + return s, limitLs +} + +func readCntMonitoringFlags() container.CntNpuMonitorOpts { + opts := container.CntNpuMonitorOpts{UseOciBackup: true, UseCriBackup: true} + switch containerMode { + case containerModeDocker: + opts.EndpointType = container.EndpointTypeDockerd + opts.OciEndpoint = container.DefaultDockerAddr + opts.CriEndpoint = container.DefaultDockerShim + case containerModeContainerd: + opts.EndpointType = container.EndpointTypeContainerd + opts.OciEndpoint = container.DefaultContainerdAddr + 
opts.CriEndpoint = container.DefaultContainerdAddr + case containerModeIsula: + opts.EndpointType = container.EndpointTypeIsula + opts.OciEndpoint = container.DefaultIsuladAddr + opts.CriEndpoint = container.DefaultIsuladAddr + default: + hwlog.RunLog.Error("invalid container mode setting,reset to docker") + opts.EndpointType = container.EndpointTypeDockerd + opts.OciEndpoint = container.DefaultDockerAddr + opts.CriEndpoint = container.DefaultDockerShim + } + if containerd != "" { + opts.OciEndpoint = containerd + opts.UseOciBackup = false + } + if endpoint != "" { + opts.CriEndpoint = endpoint + opts.UseCriBackup = false + } + return opts +} + +func checkIPAndPortInPrometheus() error { + if port < portLeft || port > portRight { + return errors.New("the port is invalid") + } + parsedIP := net.ParseIP(ip) + if parsedIP == nil { + return errors.New("the listen ip is invalid") + } + ip = parsedIP.String() + logger.Infof("listen on: %s", ip) + return nil +} + +func paramValidInPrometheus() error { + checks := []func() error{ + checkIPAndPortInPrometheus, + checkUpdateTime, + containerSockCheck, + checkLimitIPReqFormat, + checkLimitIPConn, + checkLimitTotalConn, + checkCacheSize, + checkConcurrency, + checkProfilingTime, + checkHccsBWProfilingTime, + checkDeviceResetTimeout, + checkPollIntervalInCmdLine, + } + + for _, check := range checks { + if err := check(); err != nil { + return err + } + } + return nil +} + +func checkUpdateTime() error { + if updateTime > oneMinute || updateTime < 1 { + return errors.New("the updateTime is invalid") + } + return nil +} + +func checkLimitIPReqFormat() error { + reg := regexp.MustCompile(limiter.IPReqLimitReg) + if !reg.Match([]byte(limitIPReq)) { + return errors.New("limitIPReq format error") + } + return nil +} + +func checkLimitIPConn() error { + if limitIPConn < 1 || limitIPConn > maxIPConnLimit { + return errors.New("limitIPConn is invalid") + } + return nil +} + +func checkLimitTotalConn() error { + if limitTotalConn < 1 || 
limitTotalConn > maxConcurrency { + return errors.New("limitTotalConn is invalid") + } + return nil +} + +func checkCacheSize() error { + if cacheSize < 1 || cacheSize > limiter.DefaultCacheSize*tenDays { + return errors.New("cacheSize is invalid") + } + return nil +} + +func checkConcurrency() error { + if concurrency < 1 || concurrency > maxConcurrency { + return errors.New("concurrency is invalid") + } + return nil +} + +func checkProfilingTime() error { + if profilingTime < 1 || profilingTime > maxProfilingTime { + return errors.New("profilingTime range error") + } + return nil +} + +func checkHccsBWProfilingTime() error { + if hccsBWProfilingTime < minHccsBWProfilingTime || hccsBWProfilingTime > maxHccsBWProfilingTime { + return errors.New("hccsBWProfilingTime range error") + } + return nil +} + +func checkDeviceResetTimeout() error { + if deviceResetTimeout < api.MinDeviceResetTimeout || deviceResetTimeout > api.MaxDeviceResetTimeout { + return errors.New("deviceResetTimeout range error") + } + return nil +} + +func checkPollIntervalInCmdLine() error { + cmdLine := strings.Join(os.Args[1:], "") + if strings.Contains(cmdLine, pollIntervalStr) { + return fmt.Errorf("%s is not support this scene", pollIntervalStr) + } + return nil +} + +func containerSockCheck() error { + if endpoint != "" && !strings.Contains(endpoint, ".sock") { + return errors.New("endpoint file is not sock address") + } + if containerd != "" && !strings.Contains(containerd, ".sock") { + return errors.New("containerd file is not sock address") + } + if endpoint != "" && !strings.Contains(endpoint, unixPre) { + endpoint = unixPre + endpoint + } + if containerd != "" && !strings.Contains(containerd, unixPre) { + containerd = unixPre + containerd + } + return nil +} + +func init() { + flag.IntVar(&port, "port", portConst, + "The server port of the http service,range[1025-40000]") + flag.StringVar(&ip, "ip", "", + "The listen ip of the service,0.0.0.0 is not recommended when install on Multi-NIC 
host") + flag.IntVar(&updateTime, "updateTime", updateTimeConst, + "Interval (seconds) to update the npu metrics cache,range[1-60]") + flag.BoolVar(&version, "version", false, + "If true,query the version of the program (default false)") + flag.StringVar(&containerMode, "containerMode", containerModeDocker, + "Set 'docker' for monitoring docker containers or 'containerd' for CRI & containerd") + flag.StringVar(&containerd, "containerd", "", + "The endpoint of containerd used for listening containers' events") + flag.StringVar(&endpoint, "endpoint", "", + "The endpoint of the CRI server to which will be connected") + flag.IntVar(&concurrency, "concurrency", defaultConcurrency, + "The max concurrency of the http server, range is [1-512]") + // hwlog configuration + flag.IntVar(&logger.HwLogConfig.LogLevel, "logLevel", 0, + "Log level, -1-debug, 0-info, 1-warning, 2-error, 3-critical(default 0)") + flag.IntVar(&logger.HwLogConfig.MaxAge, "maxAge", hwlog.DefaultMinSaveAge, + "Maximum number of days for backup log files, range [7, 700] days") + flag.StringVar(&logger.HwLogConfig.LogFileName, "logFile", defaultLogFile, + "Log file path. 
If the file size exceeds 20MB, will be rotated") + flag.IntVar(&logger.HwLogConfig.MaxBackups, "maxBackups", hwlog.DefaultMaxBackups, + "Maximum number of backup log files, range is (0, 30]") + flag.IntVar(&cacheSize, "cacheSize", limiter.DefaultCacheSize, "the cacheSize for ip limit,"+ + "range is [1,1024000],keep default normally") + flag.IntVar(&limitIPConn, "limitIPConn", defaultConcurrency, "the tcp connection limit for each Ip,"+ + "range is [1,128]") + flag.IntVar(&limitTotalConn, "limitTotalConn", defaultConnection, "the tcp connection limit for all"+ + " request,range is [1,512]") + flag.StringVar(&limitIPReq, "limitIPReq", "20/1", + "the http request limit counts for each Ip,20/1 means allow 20 request in 1 seconds") + flag.StringVar(&platform, "platform", "Prometheus", "the data reporting platform, "+ + "just support Prometheus and Telegraf") + flag.StringVar(&textMetricsFilePath, "textMetricsFilePath", "", + "text indicator collection path, only support specified one file path") + flag.DurationVar(&pollInterval, pollIntervalStr, 1*time.Second, + "how often to send metrics when use Telegraf plugin, "+ + "needs to be used with -platform=Telegraf, otherwise, it does not take effect") + flag.IntVar(&profilingTime, "profilingTime", defaultProfilingTime, + "config pcie bandwidth profiling time, range is [1, 2000]") + flag.IntVar(&hccsBWProfilingTime, api.HccsBWProfilingTimeStr, defaultHccsBwProfilingTime, + "config "+api.Hccs+" bandwidth profiling time, range is [1, 1000]") + flag.IntVar(&deviceResetTimeout, api.DeviceResetTimeout, api.DefaultDeviceResetTimeout, + "when npu-exporter starts, if the number of chips is insufficient, the maximum duration to wait for "+ + "the driver to report all chips, unit second, range [10, 600]") +} + +func indexHandler(w http.ResponseWriter, _ *http.Request) { + var proposal = "http" + _, err := w.Write([]byte( + ` + NPU-Exporter + +

NPU-Exporter

+

Welcome to use NPU-Exporter,the Prometheus metrics url is ` + proposal + `://ip:` + + strconv.Itoa(port) + `/metrics: Metrics

+ + `)) + if err != nil { + logger.Errorf("Write to response error: %v", err) + } +} + +func prometheusProcess() { + +} + +func startServe(ctx context.Context, cancel context.CancelFunc, reg *prometheus.Registry) { + http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{ErrorHandling: promhttp.ContinueOnError})) + http.Handle("/", http.HandlerFunc(indexHandler)) + conf := initConfig() + s, limitLs := newServerAndListener(conf) + if s == nil || limitLs == nil { + cancel() + return + } + + go func() { + logger.Warn("enable unsafe http server") + if err := s.Serve(limitLs); err != nil { + logger.Errorf("Http server error: %v and stopped", err) + cancel() + } + }() + + <-ctx.Done() + shutErr := func() error { + logger.Info("received stop signal, STOP http server") + ctxShutDown, timeOut := context.WithTimeout(context.Background(), defaultShutDownTimeout) + defer timeOut() + return s.Shutdown(ctxShutDown) + }() + if shutErr != nil { + logger.Errorf("shutdown http server error: %v", shutErr) + } +} + +func paramValidInTelegraf() error { + // cmdLine here must contain "-platform=Telegraf", otherwise, it will enter the Prometheus process + cmdLine := os.Args[1:] + + // store the preset parameter names in the map + presetParamsMap := map[string]bool{ + platformStr: true, + pollIntervalStr: true, + api.HccsBWProfilingTimeStr: true, + } + + if len(cmdLine) > len(presetParamsMap) { + return errors.New("too many parameters") + } + + var paramLen = 2 + // check every input params + for _, param := range cmdLine { + param = strings.TrimPrefix(param, "-") + split := strings.Split(param, "=") + if len(split) != paramLen { + return fmt.Errorf("the param [%s] is a wrong format", param) + } + paramName := split[0] + if !presetParamsMap[paramName] { + return fmt.Errorf("not support [%s] in Telegraf", paramName) + } + } + + if hccsBWProfilingTime < minHccsBWProfilingTime || hccsBWProfilingTime > maxHccsBWProfilingTime { + return errors.New(api.Hccs + "BWProfilingTime 
range error") + } + return nil +} + +func telegrafProcess() { + // create the shim. This is what will run your plugins. + shim := shim.New() + + // If no config is specified, all imported plugins are loaded. + // otherwise follow what the config asks for. + // Check for settings from a config toml file, + // (or just use whatever plugins were imported above) + configFile := "" + err := shim.LoadConfig(&configFile) + if err != nil { + fmt.Fprintf(os.Stderr, "Err loading input: %s\n", err) + return + } + + // run the input plugin(s) until stdin closes, or we receive a termination signal + if err := shim.Run(pollInterval); err != nil { + fmt.Fprintf(os.Stderr, "Err: %s\n", err) + return + } +} diff --git a/mind-cluster/component/npu-exporter/collector/common/collector_for_container.go b/mind-cluster/component/npu-exporter/collector/common/collector_for_container.go new file mode 100644 index 0000000..af46251 --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/common/collector_for_container.go @@ -0,0 +1,109 @@ +/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package common for general collector +package common + +import ( + "context" + "strings" + "sync" + "time" + + "ascend-common/common-utils/hwlog" + "huawei.com/npu-exporter/v6/collector/container" + "huawei.com/npu-exporter/v6/utils/logger" +) + +// StartContainerInfoCollect start collect container info +func StartContainerInfoCollect(ctx context.Context, cancelFunc context.CancelFunc, group *sync.WaitGroup, + n *NpuCollector) { + group.Add(1) + + go func() { + defer group.Done() + retryCount := 0 + collectContainerInfo := func() { + logger.Info("start to collect container info") + n.devicesParser.FetchAndParse(nil) + select { + case result := <-n.devicesParser.RecvResult(): + if err := n.cache.Set(containersDevicesCacheKey, result, n.cacheTime); err != nil { + logger.Error(err) + } + logger.Infof(UpdateCachePattern, containersDevicesCacheKey) + retryCount = 0 + case err := <-n.devicesParser.RecvErr(): + logger.Errorf("received error from device parser: %v", err) + if strings.Contains(err.Error(), "connection refused") { + retryCount++ + if retryCount == connectRefusedMaxRetry { + logger.Error("connection refused, task shutdown") + cancelFunc() + } + } + } + } + ticker := time.NewTicker(n.updateTime) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + logger.Info("received the stop signal,stop container info collect") + return + default: + collectContainerInfo() + if _, ok := <-ticker.C; !ok { + logger.Errorf(tickerFailedPattern, containersDevicesCacheKey) + return + } + } + } + }() +} + +// GetContainerNPUInfo get container npu info +func GetContainerNPUInfo(n *NpuCollector) map[int32]container.DevicesInfo { + obj, err := n.cache.Get(containersDevicesCacheKey) + // only run once to prevent wait when container info get failed + npuContainerInfoInit.Do(func() { + if err != nil { + logger.Warn("containers' devices info not found in cache, rebuilding") + resultChan := make(chan container.DevicesInfos, 1) + 
n.devicesParser.FetchAndParse(resultChan) + select { + case obj = <-resultChan: + case <-time.After(time.Second): + logger.Warn("rebuild container info cache timeout") + return + } + logger.Info("rebuild cache successfully") + } + }) + cntNpuInfos, ok := obj.(container.DevicesInfos) + if !ok { + logger.LogfWithOptions(logger.ErrorLevel, logger.LogOptions{Domain: DomainForContainerInfo, ID: 0}, + "error container npu info cache and convert failed") + return nil + } + hwlog.ResetErrCnt(DomainForContainerInfo, 0) + res := make(map[int32]container.DevicesInfo, initSize) + for _, v := range cntNpuInfos { + for _, deviceID := range v.Devices { + res[int32(deviceID)] = v + } + } + return res +} diff --git a/mind-cluster/component/npu-exporter/collector/common/collector_for_container_test.go b/mind-cluster/component/npu-exporter/collector/common/collector_for_container_test.go new file mode 100644 index 0000000..6412e12 --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/common/collector_for_container_test.go @@ -0,0 +1,137 @@ +/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package common for general collector +package common + +import ( + "sync" + "testing" + "time" + + "github.com/agiledragon/gomonkey/v2" + "github.com/smartystreets/goconvey/convey" + + "ascend-common/common-utils/cache" + "huawei.com/npu-exporter/v6/collector/container" +) + +const ( + testCacheTime = 60 * time.Second + testUpdateTime = 10 * time.Millisecond + testDeviceID0 = 0 + testDeviceID1 = 1 + testDeviceID2 = 2 + testContainerID1 = "container1" + testContainerID2 = "container2" + testContainerName1 = "test-container-1" + testContainerName2 = "test-container-2" +) + +var ( + testDevicesInfos = container.DevicesInfos{ + testContainerID1: { + ID: testContainerID1, + Name: testContainerName1, + Devices: []int{testDeviceID0, testDeviceID1}, + }, + testContainerID2: { + ID: testContainerID2, + Name: testContainerName2, + Devices: []int{testDeviceID2}, + }, + } +) + +func createTestNpuCollector() *NpuCollector { + parser := &container.DevicesParser{} + return &NpuCollector{ + cache: cache.New(cacheSize), + devicesParser: parser, + updateTime: testUpdateTime, + cacheTime: testCacheTime, + } +} + +func resetNpuContainerInfoInit() { + npuContainerInfoInit = sync.Once{} +} + +type getContainerNPUInfoTestCase struct { + name string + setupCache func(*NpuCollector) + mockParser func(*gomonkey.Patches, *container.DevicesParser) + expectedResult map[int32]container.DevicesInfo +} + +func createGetContainerNPUInfoTestCases() []getContainerNPUInfoTestCase { + return []getContainerNPUInfoTestCase{ + { + name: "should return container npu info when cache exists", + setupCache: func(n *NpuCollector) { + n.cache.Set(containersDevicesCacheKey, testDevicesInfos, testCacheTime) + }, + mockParser: func(patches *gomonkey.Patches, parser *container.DevicesParser) {}, + expectedResult: map[int32]container.DevicesInfo{ + int32(testDeviceID0): testDevicesInfos[testContainerID1], + int32(testDeviceID1): testDevicesInfos[testContainerID1], + int32(testDeviceID2): 
testDevicesInfos[testContainerID2], + }, + }, + { + name: "should rebuild cache when cache not exists", + setupCache: func(n *NpuCollector) {}, + mockParser: func(patches *gomonkey.Patches, parser *container.DevicesParser) { + patches.ApplyMethod(parser, "FetchAndParse", + func(p *container.DevicesParser, resultOut chan<- container.DevicesInfos) { + if resultOut != nil { + resultOut <- testDevicesInfos + } + }) + }, + expectedResult: map[int32]container.DevicesInfo{ + int32(testDeviceID0): testDevicesInfos[testContainerID1], + int32(testDeviceID1): testDevicesInfos[testContainerID1], + int32(testDeviceID2): testDevicesInfos[testContainerID2], + }, + }, + { + name: "should return nil when cache type conversion failed", + setupCache: func(n *NpuCollector) { + n.cache.Set(containersDevicesCacheKey, "invalid type", testCacheTime) + }, + mockParser: func(patches *gomonkey.Patches, parser *container.DevicesParser) {}, + expectedResult: nil, + }, + } +} + +func TestGetContainerNPUInfo(t *testing.T) { + testCases := createGetContainerNPUInfoTestCases() + + for _, tc := range testCases { + convey.Convey(tc.name, t, func() { + resetNpuContainerInfoInit() + n := createTestNpuCollector() + tc.setupCache(n) + + patches := gomonkey.NewPatches() + defer patches.Reset() + tc.mockParser(patches, n.devicesParser) + + result := GetContainerNPUInfo(n) + convey.So(result, convey.ShouldResemble, tc.expectedResult) + }) + } +} diff --git a/mind-cluster/component/npu-exporter/collector/common/constants.go b/mind-cluster/component/npu-exporter/collector/common/constants.go new file mode 100644 index 0000000..d7e1409 --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/common/constants.go @@ -0,0 +1,140 @@ +/* Copyright(C) 2021-2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package common for general constants +package common + +// metric label name +const ( + npuID = "id" + modelName = "model_name" + npuUUID = "vdie_id" + npuPCIEInfo = "pcie_bus_info" + namespace = "namespace" + podName = "pod_name" + cntrName = "container_name" +) + +const ( + // Healthy status of Health + Healthy = "Healthy" + // UnHealthy status of unhealth + UnHealthy = "UnHealthy" + // Abnormal status of Abnormal + Abnormal = "Abnormal" + + // LinkUp npu interface up + LinkUp = "UP" + // LinkDown npu interface down + LinkDown = "DOWN" + + // Base convert base + Base = 10 + // ContainerNameLen container name length + ContainerNameLen = 3 + // npuListCacheKey Cache key + npuListCacheKey = "npu-exporter-npu-list" + // Cache key for parsing-device result + containersDevicesCacheKey = "npu-exporter-containers-devices" + initSize = 8 + tickerFailedPattern = "%s ticker failed, task shutdown" + // UpdateCachePattern Update cache pattern + UpdateCachePattern = "update Cache,key is %s" + connectRefusedMaxRetry = 3 +) + +const ( + cacheSize = 128 + // NameSpaceIdx is the index of namespace in container name + NameSpaceIdx = 0 + // PodNameIdx is the index of pod name in container name + PodNameIdx = 1 + // ConNameIdx is the index of container name in container name + ConNameIdx = 2 + + // DecimalPlaces is the decimal places of float64 + DecimalPlaces = 2 + // BitSize is the bit size of float64 + BitSize = 64 + // GeneralDevTagKey is the default value of devTagKey in telegraf, it means the metric is not related to any device + GeneralDevTagKey = "GeneralDevTagKey" 
+) + +// log limit domains for metrics +const ( + // DomainForLogicIdErr domain for faild to get cardId and deviceId by logicID + DomainForLogicIdErr = "logicID" + + // DomainForHccs domain for hccs + DomainForHccs = "hccs" + + // DomainForDDR domain for DDR + DomainForDDR = "DDR" + + // DomainForSio domain for sio + DomainForSio = "sio" + + // DomainForHBM domain for HBM + DomainForHBM = "hbm" + + // DomainForHBMECC domain for hbmEcc + DomainForHBMECC = "hbmEcc" + + // DomainForHccsBW domain for hccs bandwidth + DomainForHccsBW = "hccsBw" + + // DomainForOptical domain for Optical + DomainForOptical = "optical" + + // DomainForLinkState domain for linkState + DomainForLinkState = "linkState" + + // DomainForBandwidth domain for bandwidth + DomainForBandwidth = "bandwidth" + + // DomainForLinkStat domain for linkStat + DomainForLinkStat = "linkStat" + + // DomainForLinkSpeed domain for linkSpeed + DomainForLinkSpeed = "linkSpeed" + + // DomainForRoce domain for roce + DomainForRoce = "roce" + + // DomainForMcuPower domain for mcu power + DomainForMcuPower = "mcuPower" + + // DomainForChipPower domain for chip power + DomainForChipPower = "chipPower" + + // DomainForAICoreUtilization domain for ai core utilization + DomainForAICoreUtilization = "AICoreUtilization" + + // DomainForVectorCoreUtilization domain for vector core utilization + DomainForVectorCoreUtilization = "vectorCoreUtilization" + + // DomainForProcess domain for process info + DomainForProcess = "processInfo" + + // DomainForHbmUtilization domain for High Bandwidth Memory Utilization + DomainForHbmUtilization = "hbmUtilization" + + // DomainForOverallUtilization domain for overall utilization + DomainForOverallUtilization = "overallUtilization" + + // DomainForPcieBandwidth domain for pcie bandwidth + DomainForPcieBandwidth = "pcieBandwidth" + // DomainForContainerInfo domain for pcie container info + DomainForContainerInfo = "containerInfo" +) diff --git 
a/mind-cluster/component/npu-exporter/collector/common/metrics_collector.go b/mind-cluster/component/npu-exporter/collector/common/metrics_collector.go new file mode 100644 index 0000000..d891649 --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/common/metrics_collector.go @@ -0,0 +1,192 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package common for general collector +package common + +import ( + "reflect" + "strings" + "sync" + + "github.com/prometheus/client_golang/prometheus" + + "ascend-common/api" + "huawei.com/npu-exporter/v6/collector/container" + "huawei.com/npu-exporter/v6/utils/logger" +) + +var ( + // CardLabel general card label + CardLabel = []string{npuID, modelName, npuUUID, npuPCIEInfo, namespace, podName, cntrName} + + noNeedToPrintUpdateLog = map[string]bool{ + "NetworkCollector": true, + "RoceCollector": true, + "OpticalCollector": true, + } +) + +// BuildDescSlice build desc slice +func BuildDescSlice(slice *[]*prometheus.Desc, name string, help string) { + *slice = append(*slice, BuildDesc(name, help)) +} + +// BuildDesc build desc +func BuildDesc(name string, help string) *prometheus.Desc { + return prometheus.NewDesc(name, help, CardLabel, nil) +} + +// BuildDescWithLabel build desc with label +func BuildDescWithLabel(name string, help string, label []string) *prometheus.Desc { + return prometheus.NewDesc(name, help, label, nil) +} + +// MetricsCollector 
metrics collector +type MetricsCollector interface { + // Describe report metrics to prometheus + Describe(ch chan<- *prometheus.Desc) + + // CollectToCache collect data to cache + CollectToCache(n *NpuCollector, chipList []HuaWeiAIChip) + + // UpdatePrometheus update prometheus + UpdatePrometheus(ch chan<- prometheus.Metric, n *NpuCollector, containerMap map[int32]container.DevicesInfo, + chips []HuaWeiAIChip) + + // UpdateTelegraf update telegraf + UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []HuaWeiAIChip) map[string]map[string]interface{} + + // PreCollect pre handle before collect + PreCollect(*NpuCollector, []HuaWeiAIChip) + + // PostCollect post handle after collect + PostCollect(*NpuCollector) + + // IsSupported Check whether the current hardware supports this metric + IsSupported(*NpuCollector) bool +} + +// MetricsCollectorAdapter base collector for metrics collector +type MetricsCollectorAdapter struct { + LocalCache sync.Map + Is910Series bool + ContainerMap map[int32]container.DevicesInfo + Chips []HuaWeiAIChip +} + +// Describe report metrics to prometheus +func (c *MetricsCollectorAdapter) Describe(ch chan<- *prometheus.Desc) { +} + +// CollectToCache collect data to cache +func (c *MetricsCollectorAdapter) CollectToCache(n *NpuCollector, chipList []HuaWeiAIChip) { +} + +// UpdatePrometheus update prometheus +func (c *MetricsCollectorAdapter) UpdatePrometheus(ch chan<- prometheus.Metric, n *NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []HuaWeiAIChip) { +} + +// UpdateTelegraf update telegraf +func (c *MetricsCollectorAdapter) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []HuaWeiAIChip) map[string]map[string]interface{} { + return fieldsMap +} + +// PreCollect pre handle before collect +func (c *MetricsCollectorAdapter) PreCollect(n *NpuCollector, chipList 
[]HuaWeiAIChip) { + if strings.Contains(n.Dmgr.GetDevType(), api.Ascend910A) { + c.Is910Series = true + } +} + +// PostCollect post handle after collect +func (c *MetricsCollectorAdapter) PostCollect(*NpuCollector) { +} + +// IsSupported Check whether the current hardware supports this metric +func (c *MetricsCollectorAdapter) IsSupported(*NpuCollector) bool { + return true +} + +// UpdateCache update cache +func UpdateCache[T any](n *NpuCollector, cacheKey string, localCache *sync.Map) { + var cacheInfo = make(map[int32]T) + obj, err := n.cache.Get(cacheKey) + if err != nil { + logger.Debugf("get info of %s failed: %v, use initial data", cacheKey, err) + } else { + if oldCacheInfo, ok := obj.(map[int32]T); ok { + cacheInfo = copyMap(oldCacheInfo) + } else { + logger.Debug("cache format invalid, reset") + } + } + + localCache.Range(func(key, value interface{}) bool { + finalKey, okKey := key.(int32) + finalValue, okValue := value.(T) + if okKey && okValue { + cacheInfo[finalKey] = finalValue + } + return true + }) + + err = n.cache.Set(cacheKey, cacheInfo, n.cacheTime) + if noNeedToPrintUpdateLog[cacheKey] { + return + } + if err != nil { + logger.Error(err) + } +} + +func copyMap[T any](oldCacheInfo map[int32]T) map[int32]T { + var cacheInfo = make(map[int32]T) + for key, value := range oldCacheInfo { + cacheInfo[key] = value + } + return cacheInfo +} + +// GetInfoFromCache get info from cache +func GetInfoFromCache[T any](n *NpuCollector, cacheKey string) map[int32]T { + res := make(map[int32]T) + obj, err := n.cache.Get(cacheKey) + if err != nil { + logger.Warn("cache not found, please wait for rebuild") + return res + } + + if data, ok := obj.(map[int32]T); ok { + return data + } + logger.Error("cache type mismatch") + return res +} + +// GetCacheKey Obtain the name of the struct pointer as the key of the cache +func GetCacheKey(ptr interface{}) string { + v := reflect.ValueOf(ptr) + if v.Kind() != reflect.Ptr { + return "" + } + v = v.Elem() + if v.Kind() != 
reflect.Struct { + return "" + } + return v.Type().Name() +} diff --git a/mind-cluster/component/npu-exporter/collector/common/metrics_collector_test.go b/mind-cluster/component/npu-exporter/collector/common/metrics_collector_test.go new file mode 100644 index 0000000..f66ceb5 --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/common/metrics_collector_test.go @@ -0,0 +1,231 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/

// Package common for general collector
package common

import (
	"reflect"
	"sync"
	"testing"

	"github.com/agiledragon/gomonkey/v2"
	"github.com/smartystreets/goconvey/convey"

	"ascend-common/api"
)

// TestCopyMap test copyMap.
// Each case passes a map[int32]testStruct to copyMap and hands the result to a
// per-case validate callback: nil and empty inputs must yield an empty, non-nil
// map, and populated inputs must come back with the same entries.
func TestCopyMap(t *testing.T) {
	type testStruct struct {
		name string
		age  int
	}
	mockString := "mock"
	tests := []struct {
		name     string
		input    map[int32]testStruct
		validate func(*testing.T, interface{})
	}{
		{name: "NilInput", input: (map[int32]testStruct)(nil),
			validate: func(t *testing.T, got interface{}) {
				g, ok := got.(map[int32]testStruct)
				if !ok || g == nil || len(g) != 0 {
					t.Errorf("should return empty map for nil input")
				}
			}},
		{name: "EmptyMap", input: map[int32]testStruct{},
			validate: func(t *testing.T, got interface{}) {
				if len(got.(map[int32]testStruct)) != 0 {
					t.Errorf("expected empty map")
				}
			}},
		{name: "SingleElement", input: map[int32]testStruct{1: {name: mockString, age: 1}},
			validate: func(t *testing.T, got interface{}) {
				g, ok := got.(map[int32]testStruct)
				if !ok || g[1].name != mockString || g[1].age != 1 || len(g) != 1 {
					t.Errorf("element mismatch")
				}
			}},
		{name: "MultipleElements", input: map[int32]testStruct{1: {name: mockString, age: 1}, 2: {name: mockString, age: 1}},
			validate: func(t *testing.T, got interface{}) {
				expected := map[int32]testStruct{1: {name: mockString, age: 1}, 2: {name: mockString, age: 1}}
				if !reflect.DeepEqual(got, expected) {
					t.Errorf("deepEqual failed")
				}
			}},
	}

	for _, tt := range tests {
		convey.Convey(tt.name, t, func() {
			got := copyMap[testStruct](tt.input)
			tt.validate(t, got)
		})
	}
}

// TestPreCollect test PreCollect of MetricsCollectorAdapter.
// GetDevType on the collector's device manager is patched to return the device
// type of the case, then the test asserts the resulting Is910Series flag:
// true for api.Ascend910 and false for api.Ascend310.
func TestPreCollect(t *testing.T) {
	tests := []struct {
		name       string
		deviceType string
		expected   bool
	}{
		{name: "TestPreCollect_" + api.Ascend910,
			deviceType: api.Ascend910,
			expected:   true,
		},
		{name: "TestPreCollect_" + api.Ascend310,
			deviceType: api.Ascend310,
			expected:   false,
		},
	}
	convey.Convey("TestPreCollect", t, func() {
		n := mockNewNpuCollector()
		// one adapter instance is shared by all cases, so every case overwrites Is910Series
		adapter := MetricsCollectorAdapter{
			Is910Series:  false,
			ContainerMap: nil,
			Chips:        nil,
		}
		for _, tt := range tests {
			convey.Convey(tt.name, func() {
				patches := gomonkey.NewPatches()
				defer patches.Reset()
				patches.ApplyMethodReturn(n.Dmgr, "GetDevType", tt.deviceType)
				adapter.PreCollect(n, nil)
				convey.So(adapter.Is910Series, convey.ShouldEqual, tt.expected)
			})
		}
	})
}

// cacheCase one table entry for TestUpdateCache
type cacheCase struct {
	// name the sub-case name passed to convey
	name string
	// cacheKey the key handed to UpdateCache and read back from the collector cache
	cacheKey string
	// preHandle runs right before UpdateCache is called
	preHandle func()
	// expected the expected number of entries in the cached map
	expected int
}

// buildTestsForUpdateCache build the cases of TestUpdateCache, every case expects
// the same number of cached entries
func buildTestsForUpdateCache(expected int) []cacheCase {
	tests := []cacheCase{
		{name: "TestUpdateCache_save info to cache",
			cacheKey:  "mockKey1",
			preHandle: func() {},
			expected:  expected,
		},
		{name: "TestUpdateCache_update old cache",
			cacheKey: "mockKey2",
			preHandle: func() {
				// writes the package-level map; the entry is not removed after the test
				noNeedToPrintUpdateLog["mockKey2"] = true
			},
			expected: expected,
		},
		{name: "TestUpdateCache_old cache is in incorrect type",
			cacheKey:  "mockKey3",
			preHandle: func() {},
			expected:  expected,
		},
	}
	return tests
}

// TestUpdateCache test UpdateCache.
// The cache is seeded with a map[int32]string under "mockKey2" and a wrongly typed
// map[int32]int under "mockKey3"; "mockKey1" is absent. For every key the test
// stores one entry in a sync.Map, calls UpdateCache and asserts that the cache then
// holds a map[int32]string with exactly one entry.
func TestUpdateCache(t *testing.T) {
	const key = int32(0)
	const expected = 1
	tests := buildTestsForUpdateCache(expected)

	n := mockNewNpuCollector()
	// data init
	n.cache.Set("mockKey2", map[int32]string{key: "0"}, n.cacheTime)
	n.cache.Set("mockKey3", map[int32]int{key: 0}, n.cacheTime)

	convey.Convey("TestUpdateCache", t, func() {

		for _, tt := range tests {
			convey.Convey(tt.name, func() {
				localCache := sync.Map{}
				localCache.Store(key, "mockValue")
				tt.preHandle()
				UpdateCache[string](n, tt.cacheKey, &localCache)

				data, err := n.cache.Get(tt.cacheKey)
				convey.So(err, convey.ShouldBeNil)
				map2, ok := data.(map[int32]string)
				convey.So(ok, convey.ShouldBeTrue)
				convey.So(len(map2), convey.ShouldEqual, tt.expected)
			})
		}

	})
}

// TestGetInfoFromCache test GetInfoFromCache.
// A missing key and a value of the wrong map type are both expected to produce a
// result of length 0; a correctly typed map with one entry is expected to produce
// a result of length 1.
func TestGetInfoFromCache(t *testing.T) {
	const key = int32(0)
	tests := []struct {
		name     string
		cacheKey string
		expected int
	}{
		{name: "TestGetInfoFromCache_no info in cache",
			cacheKey: "mockKey1",
			expected: 0,
		},
		{name: "TestGetInfoFromCache_correct",
			cacheKey: "mockKey2",
			expected: 1,
		},
		{name: "TestGetInfoFromCache_info in cache is in incorrect type",
			cacheKey: "mockKey3",
			expected: 0,
		},
	}
	n := mockNewNpuCollector()
	// data init
	n.cache.Set("mockKey2", map[int32]string{key: "mockValue"}, n.cacheTime)
	n.cache.Set("mockKey3", map[int32]int{key: 0}, n.cacheTime)
	for _, tt := range tests {
		convey.Convey(tt.name, t, func() {
			cache := GetInfoFromCache[string](n, tt.cacheKey)
			convey.So(len(cache), convey.ShouldEqual, tt.expected)
		})
	}
}

// TestGetCacheKey test GetCacheKey.
// A pointer to MetricsCollectorAdapter is expected to map to the key
// "MetricsCollectorAdapter"; an int and a struct value are expected to map to "".
func TestGetCacheKey(t *testing.T) {
	tests := []struct {
		name     string
		args     interface{}
		expected string
	}{
		{name: "TestGetCacheKey_ptr",
			args:     &MetricsCollectorAdapter{},
			expected: "MetricsCollectorAdapter",
		},
		{name: "TestGetCacheKey_int",
			args:     0,
			expected: "",
		},
		{name: "TestGetCacheKey_struct",
			args:     MetricsCollectorAdapter{},
			expected: "",
		},
	}

	convey.Convey("TestGetCacheKey", t, func() {
		for _, tt := range tests {
			convey.Convey(tt.name, func() {
				convey.So(GetCacheKey(tt.args), convey.ShouldEqual, tt.expected)
			})
		}
	})
}
diff --git a/mind-cluster/component/npu-exporter/collector/common/npu_collector.go b/mind-cluster/component/npu-exporter/collector/common/npu_collector.go
new file mode 100644
index 0000000..fee5312
--- /dev/null
+++ b/mind-cluster/component/npu-exporter/collector/common/npu_collector.go
@@ -0,0 +1,423 @@
/* Copyright(C) 2021-2025. Huawei Technologies Co.,Ltd. All rights reserved.
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package common for general collector +package common + +import ( + "context" + "sync" + "time" + + "ascend-common/api" + "ascend-common/common-utils/cache" + "ascend-common/devmanager" + "ascend-common/devmanager/common" + "ascend-common/devmanager/dcmi" + "huawei.com/npu-exporter/v6/collector/container" + "huawei.com/npu-exporter/v6/utils/logger" +) + +var ( + npuContainerInfoInit sync.Once + npuChipInfoInit sync.Once + // Collector base collector for prometheus and telegraf + Collector *NpuCollector + + // ChainForSingleGoroutine a list of collectors for single goroutine + ChainForSingleGoroutine []MetricsCollector + + // ChainForMultiGoroutine a list of collectors for multi goroutine + ChainForMultiGoroutine []MetricsCollector + + // ChainForCustomPlugin a list of collectors for plugin + ChainForCustomPlugin []MetricsCollector + + updateTimeForCardIds = time.Minute +) + +const ( + maxCollectTimeout = 10 * time.Second +) + +// NpuCollector for collect metrics +type NpuCollector struct { + cache *cache.ConcurrencyLRUCache + devicesParser *container.DevicesParser + updateTime time.Duration + cacheTime time.Duration + Dmgr *devmanager.DeviceManager +} + +// NewNpuCollector create a new collector +func NewNpuCollector(cacheTime time.Duration, updateTime time.Duration, + deviceParser *container.DevicesParser, dmgr *devmanager.DeviceManager) *NpuCollector { + CommonCollector := &NpuCollector{ + cache: cache.New(cacheSize), + cacheTime: cacheTime, + updateTime: updateTime, + devicesParser: deviceParser, + Dmgr: dmgr, + } + return CommonCollector +} + +// 
StartCollect start collect +func StartCollect(group *sync.WaitGroup, ctx context.Context, n *NpuCollector) { + npuChipInfoInitAtFirstTime(n) + startCollectSingleGoroutine(group, ctx, n) + startCollectForMultiGoroutine(group, ctx, n) + startCollectForPluginGoroutine(group, ctx, n) +} + +func startCollectForPluginGoroutine(group *sync.WaitGroup, ctx context.Context, n *NpuCollector) { + group.Add(1) + go func() { + defer group.Done() + ticker := time.NewTicker(n.updateTime) + defer ticker.Stop() + goroutinePreCollect(ChainForCustomPlugin, n) + defer goroutinePostCollect(ChainForCustomPlugin, n) + runPluginCollect(ctx, n, ticker) + }() +} + +func runPluginCollect(ctx context.Context, n *NpuCollector, ticker *time.Ticker) { + for { + select { + case <-ctx.Done(): + logger.Info("received the stop signal,stop plugin collect") + return + default: + collectPluginMetrics(n) + if _, ok := <-ticker.C; !ok { + logger.Errorf(tickerFailedPattern, "handling plugin collectors") + return + } + } + } +} + +func collectPluginMetrics(n *NpuCollector) { + chipList := getChipListCache(n) + for _, c := range ChainForCustomPlugin { + resultChan := make(chan struct{}, 1) + go func(cur MetricsCollector) { + cur.CollectToCache(n, chipList) + resultChan <- struct{}{} + }(c) + select { + case <-resultChan: + continue + case <-time.After(maxCollectTimeout): + logger.Errorf("collect timeout for %v", GetCacheKey(c)) + continue + } + + } +} + +func startCollectForMultiGoroutine(group *sync.WaitGroup, ctx context.Context, n *NpuCollector) { + chips := getChipListCache(n) + + group.Add(len(chips)) + for _, chip := range chips { + go func(chip HuaWeiAIChip) { + defer group.Done() + runChipCollector(ctx, n, chip) + }(chip) + } +} + +func runChipCollector(ctx context.Context, n *NpuCollector, chip HuaWeiAIChip) { + ticker := time.NewTicker(n.updateTime) + defer ticker.Stop() + goroutinePreCollect(ChainForMultiGoroutine, n) + defer goroutinePostCollect(ChainForMultiGoroutine, n) + for { + select { + 
case <-ctx.Done(): + logger.Infof("received the stop signal,stop collect network info of npu(%d)", chip.LogicID) + return + default: + singleChipSlice := []HuaWeiAIChip{chip} + for _, c := range ChainForMultiGoroutine { + c.CollectToCache(n, singleChipSlice) + } + if _, ok := <-ticker.C; !ok { + logger.Errorf(tickerFailedPattern, "collect for multigroutine ") + return + } + } + } +} + +func goroutinePreCollect(collectors []MetricsCollector, n *NpuCollector) { + chipList := getChipListCache(n) + for _, c := range collectors { + c.PreCollect(n, chipList) + } +} + +func goroutinePostCollect(collectors []MetricsCollector, n *NpuCollector) { + for _, c := range collectors { + c.PostCollect(n) + } +} + +func startCollectSingleGoroutine(group *sync.WaitGroup, ctx context.Context, n *NpuCollector) { + group.Add(1) + go func() { + defer group.Done() + ticker := time.NewTicker(n.updateTime) + defer ticker.Stop() + goroutinePreCollect(ChainForSingleGoroutine, n) + defer goroutinePostCollect(ChainForSingleGoroutine, n) + for { + select { + case <-ctx.Done(): + logger.Info("received the stop signal,stop npu base info collect") + return + default: + chipList := getChipListCache(n) + for _, c := range ChainForSingleGoroutine { + c.CollectToCache(n, chipList) + } + if _, ok := <-ticker.C; !ok { + logger.Errorf(tickerFailedPattern, "handling all collectors") + return + } + } + } + }() +} + +// npuChipInfoInitAtFirstTime When first enter, the cache data is empty, +// need to get the data from the device, and build the cache +func npuChipInfoInitAtFirstTime(n *NpuCollector) { + npuChipInfoInit.Do(func() { + _, err := n.cache.Get(npuListCacheKey) + if err != nil { + logger.Debug("no cache in first time, start to collect chip list and rebuild cache") + + npuInfo := getNPUChipList(n.Dmgr) + if err := n.cache.Set(npuListCacheKey, npuInfo, n.cacheTime); err != nil { + logger.Error(err) + } else { + logger.Infof(UpdateCachePattern, npuListCacheKey) + } + logger.Debug("rebuild cache 
successfully") + } + }) +} + +// InitCardInfo init card info +func InitCardInfo(group *sync.WaitGroup, ctx context.Context, n *NpuCollector) { + + group.Add(1) + go func() { + defer group.Done() + ticker := time.NewTicker(updateTimeForCardIds) + defer ticker.Stop() + for { + logger.Info("start to collect npu chip list info") + select { + case <-ctx.Done(): + logger.Info("received the stop signal,stop card info collect") + return + default: + npuInfo := getNPUChipList(n.Dmgr) + if err := n.cache.Set(npuListCacheKey, npuInfo, n.cacheTime); err != nil { + logger.Error(err) + } else { + logger.Infof(UpdateCachePattern, npuListCacheKey) + } + if _, ok := <-ticker.C; !ok { + logger.Errorf(tickerFailedPattern, npuListCacheKey) + return + } + } + } + }() +} + +func getNPUChipList(dmgr devmanager.DeviceInterface) (npuInfo []HuaWeiAIChip) { + chipList := make([]HuaWeiAIChip, 0) + + cardNum, cards, err := dmgr.GetCardList() + if err != nil || cardNum == 0 { + logger.Errorf("failed to get npu info, error is: %v", err) + return chipList + } + + chipListIDs := make([]int32, 0) + + for _, cardID := range cards { + deviceNum, _ := dmgr.GetDeviceNumInCard(cardID) + for deviceID := int32(0); deviceID < deviceNum; deviceID++ { + var chip HuaWeiAIChip + // get logicID + logicID, err := dmgr.GetDeviceLogicID(cardID, deviceID) + if err != nil { + logger.Errorf("get logic ID of card: %v device:%v failed: %v", cardID, deviceID, err) + continue + } + + chip.LogicID = logicID + chip.CardId = cardID + chip.MainBoardId = dmgr.GetMainBoardId() + + setPhyId(&chip, dmgr, cardID, deviceID) + setChipInfo(&chip, dmgr, cardID, deviceID) + setBoardInfo(&chip, dmgr, cardID, deviceID) + setVdieID(&chip, dmgr, cardID, deviceID) + assemblevNPUInfo(dmgr, logicID, &chip) + setPCIeBusInfo(logicID, dmgr, &chip) + setElabelInfo(&chip, dmgr, cardID) + + chipList = append(chipList, chip) + chipListIDs = append(chipListIDs, logicID) + } + } + + logger.Debugf("flush chip info list successed,chip num is : %v, 
chipLogicIDs: %v", + len(chipList), chipListIDs) + return chipList +} + +func setBoardInfo(chip *HuaWeiAIChip, dmgr devmanager.DeviceInterface, cardID int32, deviceID int32) { + boardInfo, err := dmgr.GetBoardInfo(chip.LogicID) + if err != nil { + logger.Errorf("get board info of card: %v device:%v failed: %v", cardID, deviceID, err) + boardInfo = common.BoardInfo{} + } + chip.BoardInfo = &boardInfo +} +func setVdieID(chip *HuaWeiAIChip, dmgr devmanager.DeviceInterface, cardID int32, deviceID int32) { + vdieID, err := dmgr.GetDieID(chip.LogicID, dcmi.VDIE) + if err != nil { + logger.Debug(err) + } + chip.VDieID = vdieID +} + +func setPhyId(chip *HuaWeiAIChip, dmgr devmanager.DeviceInterface, cardID int32, deviceID int32) { + phyID, err := dmgr.GetPhysicIDFromLogicID(chip.LogicID) + if err != nil { + logger.Errorf("get phy ID of card: %v device:%v failed: %v", cardID, deviceID, err) + } + chip.PhyId = phyID + chip.DeviceID = phyID +} +func setChipInfo(chip *HuaWeiAIChip, dmgr devmanager.DeviceInterface, cardID int32, deviceID int32) { + // get chip info + chipInfo, err := dmgr.GetChipInfo(chip.LogicID) + if err != nil { + logger.Errorf("get chip info of card: %v device:%v failed: %v", cardID, deviceID, err) + chipInfo = &common.ChipInfo{} + } + chip.ChipInfo = chipInfo +} + +func setPCIeBusInfo(logicID int32, dmgr devmanager.DeviceInterface, hwChip *HuaWeiAIChip) { + productTypes := dmgr.GetProductTypeArray() + pcieInfo, err := dmgr.GetPCIeBusInfo(logicID) + if err != nil { + if len(productTypes) == 1 && productTypes[0] == common.Atlas200ISoc { + logger.Debugf("pcie bus info is not supported on %s", common.Atlas200ISoc) + hwChip.PCIeBusInfo = "" + return + } + logger.Error(err) + pcieInfo = "" + } + hwChip.PCIeBusInfo = pcieInfo +} + +func setElabelInfo(chip *HuaWeiAIChip, dmgr devmanager.DeviceInterface, cardID int32) { + elabelInfo, err := dmgr.GetCardElabelV2(cardID) + if err != nil { + logger.Errorf("get elabel info of card: %v failed: %v", cardID, err) + 
chip.ElabelInfo = &common.ElabelInfo{SerialNumber: "NA"} + return + } + chip.ElabelInfo = &common.ElabelInfo{ + SerialNumber: elabelInfo.SerialNumber, + } +} + +func assemblevNPUInfo(dmgr devmanager.DeviceInterface, logicID int32, baseChipInfo *HuaWeiAIChip) { + if dmgr.GetDevType() != api.Ascend310P { + return + } + vDevInfos, err := dmgr.GetVirtualDeviceInfo(logicID) + if err != nil { + logger.Warnf("failed to get virtual device info,logicID(%d),err: %v", logicID, err) + baseChipInfo.VDevInfos = nil + } + if vDevInfos.TotalResource.VDevNum == 0 { + baseChipInfo.VDevInfos = &common.VirtualDevInfo{} + } + baseChipInfo.VDevInfos = &vDevInfos +} + +// GetChipListWithVNPU get chip list with vnpu +func GetChipListWithVNPU(n *NpuCollector) []HuaWeiAIChip { + result := make([]HuaWeiAIChip, 0) + chips := getChipListCache(n) + + for _, chipInfo := range chips { + isNeedHandleVnpu := n.Dmgr.GetDevType() == api.Ascend310P && chipInfo.VDevInfos != nil && + len(chipInfo.VDevInfos.VDevActivityInfo) > 0 + + if !isNeedHandleVnpu { + result = append(result, chipInfo) + continue + } + + for _, activityVDev := range chipInfo.VDevInfos.VDevActivityInfo { + vDevInfo := chipInfo + activityVDevCopy := activityVDev + vDevInfo.VDevActivityInfo = &activityVDevCopy + result = append(result, vDevInfo) + } + } + + return result + +} +func getChipListCache(n *NpuCollector) []HuaWeiAIChip { + obj, err := n.cache.Get(npuListCacheKey) + if err != nil { + logger.Errorf("get npu chip list from cache failed,err is : %v", err) + return make([]HuaWeiAIChip, 0) + } + if obj == nil { + logger.LogfWithOptions(logger.ErrorLevel, logger.LogOptions{Domain: "getChipListCache"}, + "there is no chip list info in cache,please check collect logs") + return make([]HuaWeiAIChip, 0) + } + + chipList, ok := obj.([]HuaWeiAIChip) + if !ok { + logger.Errorf("error npu chip info cache and convert failed,real type is (%T)", obj) + n.cache.Delete(npuListCacheKey) + return make([]HuaWeiAIChip, 0) + } + // if cache is empty 
or nil, return empty list + if len(chipList) == 0 { + return make([]HuaWeiAIChip, 0) + } + return chipList +} diff --git a/mind-cluster/component/npu-exporter/collector/common/npu_collector_test.go b/mind-cluster/component/npu-exporter/collector/common/npu_collector_test.go new file mode 100644 index 0000000..722079b --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/common/npu_collector_test.go @@ -0,0 +1,547 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2024. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
 */

// Package common for general collector
package common

import (
	"context"
	"errors"
	"strconv"
	"sync"
	"testing"
	"time"

	"github.com/agiledragon/gomonkey/v2"
	"github.com/smartystreets/goconvey/convey"
	"github.com/stretchr/testify/assert"

	"ascend-common/api"
	"ascend-common/common-utils/hwlog"
	"ascend-common/devmanager"
	"ascend-common/devmanager/common"
	"huawei.com/npu-exporter/v6/collector/container"
	"huawei.com/npu-exporter/v6/collector/container/isula"
	"huawei.com/npu-exporter/v6/collector/container/v1"
	"huawei.com/npu-exporter/v6/utils/logger"
)

var (
	// mockErr generic error returned by patched methods
	mockErr = errors.New("mockErr")
	// testError the error returned by the patched GetCardElabelV2
	testError = errors.New(testErrorMsg)
)

const (
	cacheTime         = 60 * time.Second
	npuCount          = 8
	defaultUpdateTime = 10 * time.Millisecond
	num2              = 2
	num100            = 100
	mockKey           = "mockKey"
	mockValue         = "mockValue"

	// Test constants for setElabelInfo
	testCardID           = int32(1)
	testProductName      = "Atlas 900"
	testModel            = "Atlas-900-9000"
	testManufacturer     = "Huawei"
	testManufacturerDate = "2023-01-01"
	testSerialNumber     = "SN123456789"
	testDefaultSerial    = "NA"
	testErrorMsg         = "get elabel info failed"
)

// mockContainerRuntimeOperator a ContainerRuntimeOperator whose methods all succeed
// and return empty values
type mockContainerRuntimeOperator struct{}

// Init implements ContainerRuntimeOperator
func (operator *mockContainerRuntimeOperator) Init() error {
	return nil
}

// Close implements ContainerRuntimeOperator
func (operator *mockContainerRuntimeOperator) Close() error {
	return nil
}

// GetContainers implements ContainerRuntimeOperator
func (operator *mockContainerRuntimeOperator) GetContainers(ctx context.Context) ([]*container.CommonContainer, error) {
	return []*container.CommonContainer{}, nil
}

// GetContainerInfoByID implements ContainerRuntimeOperator
func (operator *mockContainerRuntimeOperator) GetContainerInfoByID(ctx context.Context, id string) (v1.Spec, error) {
	return v1.Spec{}, nil
}

// GetIsulaContainerInfoByID implements ContainerRuntimeOperator
func (operator *mockContainerRuntimeOperator) GetIsulaContainerInfoByID(ctx context.Context,
	id string) (isula.ContainerJson, error) {
	return isula.ContainerJson{}, nil
}

// GetContainerType implements ContainerRuntimeOperator
func (operator *mockContainerRuntimeOperator) GetContainerType() string {
	return container.DefaultContainer
}

// mockScan4AscendDevices a stub that always reports device 1 and no error
func mockScan4AscendDevices(_ string) ([]int, bool, error) {
	return []int{1}, true, nil
}

// mockGetCgroupPath a stub that always returns an empty path and no error
func mockGetCgroupPath(controller, specCgroupsPath string) (string, error) {
	return "", nil
}

// makeMockDevicesParser build a DevicesParser backed by mockContainerRuntimeOperator
func makeMockDevicesParser() *container.DevicesParser {
	return &container.DevicesParser{
		RuntimeOperator: new(mockContainerRuntimeOperator),
	}
}

// newNpuCollectorTestCase the arguments of NewNpuCollector
type newNpuCollectorTestCase struct {
	cacheTime    time.Duration
	updateTime   time.Duration
	deviceParser *container.DevicesParser
	dmgr         *devmanager.DeviceManager
}

// TestNewNpuCollector test method of NewNpuCollector
func TestNewNpuCollector(t *testing.T) {
	tc := newNpuCollectorTestCase{
		cacheTime:    cacheTime,
		updateTime:   defaultUpdateTime,
		deviceParser: &container.DevicesParser{},
		dmgr:         &devmanager.DeviceManager{},
	}

	c := NewNpuCollector(tc.cacheTime, tc.updateTime, tc.deviceParser, tc.dmgr)

	assert.NotNil(t, c)
}

// testCase a generic table entry; mockPart holds either a mocked dependency or a
// preparation func, depending on the test that uses it
type testCase struct {
	name        string
	wantErr     bool
	mockPart    interface{}
	expectValue interface{}
	expectCount interface{}
}

// newTestCase build a testCase without expectValue and expectCount
func newTestCase(name string, wantErr bool, mockPart interface{}) testCase {
	return testCase{
		name:     name,
		wantErr:  wantErr,
		mockPart: mockPart,
	}
}

// TestGetChipInfo test method getNPUChipList.
// The result must never be nil, and it must be empty when the failing device
// manager mock is used.
func TestGetChipInfo(t *testing.T) {
	tests := []testCase{
		newTestCase("should return chip info successfully when dsmi works normally", false,
			&devmanager.DeviceManagerMock{}),
		newTestCase("should return nil when dsmi works abnormally", true, &devmanager.DeviceManagerMockErr{}),
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			chipInfo := getNPUChipList(tt.mockPart.(devmanager.DeviceInterface))
			t.Logf("%#v", chipInfo)
			assert.NotNil(t, chipInfo)
			if tt.wantErr {
				assert.Len(t, chipInfo, 0)
			} else {
				assert.NotNil(t, chipInfo)
			}
		})
	}
}

// init configures the logger to write to stdout only before any test runs
func init() {
	logger.HwLogConfig = &hwlog.LogConfig{
		OnlyToStdout: true,
	}
	logger.InitLogger("Prometheus")
}

// mockGetNPUChipList build npuCount chips whose card, phy, device and logic IDs all
// equal their index
func mockGetNPUChipList() []HuaWeiAIChip {
	chips := make([]HuaWeiAIChip, 0)
	for id := int32(0); id < npuCount; id++ {
		chip := HuaWeiAIChip{
			CardId:   id,
			PhyId:    id,
			DeviceID: id,
			LogicID:  id,
		}

		chips = append(chips, chip)
	}
	return chips
}

// TestInitCardInfo test method InitCardInfo.
// getNPUChipList is patched to return npuCount chips; after the refresh goroutine
// has had 100ms to run, the cached chip list must hold npuCount chips.
func TestInitCardInfo(t *testing.T) {
	patches := gomonkey.ApplyFuncReturn(getNPUChipList, mockGetNPUChipList())
	defer patches.Reset()
	convey.Convey("test InitCardInfo", t, func() {

		ctx, cancelFunc := context.WithCancel(context.Background())
		defer cancelFunc()
		npuCollector := mockNewNpuCollector()

		InitCardInfo(&sync.WaitGroup{}, ctx, npuCollector)
		time.Sleep(time.Millisecond * num100)
		cancelFunc()
		chips := getChipListCache(npuCollector)
		convey.So(len(chips), convey.ShouldEqual, npuCount)
	})
}

// TestGetChipListCache test method getChipListCache.
// mockPart of every case is a func that prepares the shared cache; the cases cover
// a missing key, a valid list, a nil value, a value of the wrong type and an empty
// list.
func TestGetChipListCache(t *testing.T) {
	npuCollector := mockNewNpuCollector()
	tests := []testCase{
		{name: "should return 0 chips when cache is nil", wantErr: false, mockPart: func() {}, expectCount: 0},
		{name: "should return chips : " + strconv.Itoa(npuCount), expectCount: npuCount, wantErr: false,
			mockPart: func() { npuCollector.cache.Set(npuListCacheKey, mockGetNPUChipList(), cacheTime) }},
		{name: "should return 0 chips when cache value is nil", wantErr: false, expectCount: 0,
			mockPart: func() { npuCollector.cache.Set(npuListCacheKey, nil, cacheTime) }},
		{name: "should return 0 chips when value is a incorrect type", expectCount: 0, wantErr: false,
			mockPart: func() { npuCollector.cache.Set(npuListCacheKey, &HuaWeiAIChip{}, cacheTime) }},
		{name: "should return 0 chips when cache is empty", expectCount: 0, wantErr: false,
			mockPart: func() { npuCollector.cache.Set(npuListCacheKey, []HuaWeiAIChip{}, cacheTime) },
		},
	}

	convey.Convey("getChipListCache", t, func() {
		for _, tt := range tests {
			convey.Convey(tt.name, func() {
				tt.mockPart.(func())()
				chips := getChipListCache(npuCollector)
				assert.Len(t, chips, tt.expectCount.(int))
				convey.So(len(chips), convey.ShouldEqual, tt.expectCount)
			})
		}
	})
}

// mockNewNpuCollector build a collector with empty DevicesParser and DeviceManager,
// cacheTime as cache time and defaultUpdateTime as update time
func mockNewNpuCollector() *NpuCollector {
	tc := newNpuCollectorTestCase{
		cacheTime:    cacheTime,
		updateTime:   defaultUpdateTime,
		deviceParser: &container.DevicesParser{},
		dmgr:         &devmanager.DeviceManager{},
	}
	c := NewNpuCollector(tc.cacheTime, tc.updateTime, tc.deviceParser, tc.dmgr)
	return c
}

// TestNpuChipInfoInitAtFirstTime test method npuChipInfoInitAtFirstTime.
// getNPUChipList is patched to return one chip; afterwards the cache must hold a
// []HuaWeiAIChip with exactly one element.
func TestNpuChipInfoInitAtFirstTime(t *testing.T) {
	n := mockNewNpuCollector()
	convey.Convey("TestNpuChipInfoInitAtFirstTime", t, func() {
		patches := gomonkey.NewPatches()
		defer patches.Reset()
		patches.ApplyFuncReturn(getNPUChipList, []HuaWeiAIChip{{CardId: 0}})
		// do test
		npuChipInfoInitAtFirstTime(n)
		// valid cache
		data, err := n.cache.Get(npuListCacheKey)
		convey.So(err, convey.ShouldBeNil)
		chips, ok := data.([]HuaWeiAIChip)
		convey.So(ok, convey.ShouldBeTrue)
		convey.So(len(chips), convey.ShouldEqual, 1)
	})
}

// patchCollectToCache replace CollectToCache of MetricsCollectorAdapter with a stub
// that stores mockValue under mockKey, so a test can detect that a collect round ran
func patchCollectToCache() *gomonkey.Patches {
	return gomonkey.ApplyMethod(&MetricsCollectorAdapter{}, "CollectToCache",
		func(_ *MetricsCollectorAdapter, n *NpuCollector, chipList []HuaWeiAIChip) {
			n.cache.Set(mockKey, mockValue, n.cacheTime)
		})
}

// TestStartCollectForMultiGoroutine test method startCollectForMultiGoroutine.
// getChipListCache is patched to return one chip; after one update period the
// value written by the patched CollectToCache must be in the cache.
func TestStartCollectForMultiGoroutine(t *testing.T) {
	n := mockNewNpuCollector()
	wg := sync.WaitGroup{}
	// overwrites the package-level chain, the old value is not restored
	ChainForMultiGoroutine = []MetricsCollector{
		&MetricsCollectorAdapter{},
		&MetricsCollectorAdapter{},
	}
	patches := patchCollectToCache()
	defer patches.Reset()
	patches.ApplyFuncReturn(getChipListCache, []HuaWeiAIChip{createChip()})
	convey.Convey("TestStartCollectForMultiGoroutine", t, func() {
		ctx, cancel := context.WithCancel(context.Background())
		startCollectForMultiGoroutine(&wg, ctx, n)
		time.Sleep(n.updateTime)
		cancel()
		data, err := n.cache.Get(mockKey)
		convey.So(err, convey.ShouldBeNil)
		value, ok := data.(string)
		convey.So(ok, convey.ShouldBeTrue)
		convey.So(value, convey.ShouldEqual, mockValue)
	})
}

// TestRunChipCollector test method runChipCollector.
// time.NewTicker is patched to return a ticker whose channel is already closed, so
// the receive from ticker.C in the collector reports a closed channel; the test
// asserts that the patched CollectToCache wrote its value to the cache.
func TestRunChipCollector(t *testing.T) {
	n := mockNewNpuCollector()
	patches := patchCollectToCache()
	defer patches.Reset()
	convey.Convey("TestRunChipCollector", t, func() {
		ctx, cancel := context.WithCancel(context.Background())
		tickCh := make(chan time.Time)
		patches.ApplyFuncReturn(time.NewTicker, &time.Ticker{C: tickCh})
		close(tickCh)
		go runChipCollector(ctx, n, createChip())
		time.Sleep(n.updateTime)
		cancel()
		data, err := n.cache.Get(mockKey)
		convey.So(err, convey.ShouldBeNil)
		value, ok := data.(string)
		convey.So(ok, convey.ShouldBeTrue)
		convey.So(value, convey.ShouldEqual, mockValue)
	})
}

// TestStartCollectSingleGoroutine test method startCollectSingleGoroutine.
// After one update period the value written by the patched CollectToCache must be
// in the cache.
func TestStartCollectSingleGoroutine(t *testing.T) {
	n := mockNewNpuCollector()
	wg := sync.WaitGroup{}
	// overwrites the package-level chain, the old value is not restored
	ChainForSingleGoroutine = []MetricsCollector{
		&MetricsCollectorAdapter{},
	}
	patches := patchCollectToCache()
	defer patches.Reset()
	convey.Convey("TestStartCollectSingleGoroutine", t, func() {
		ctx, cancel := context.WithCancel(context.Background())
		startCollectSingleGoroutine(&wg, ctx, n)
		time.Sleep(n.updateTime)
		cancel()
		data, err := n.cache.Get(mockKey)
		convey.So(err, convey.ShouldBeNil)
		value, ok := data.(string)
		convey.So(ok, convey.ShouldBeTrue)
		convey.So(value, convey.ShouldEqual, mockValue)
	})
}

// chipsCase one table entry for TestGetChipListWithVNPU
type chipsCase struct {
	// name the sub-case name passed to convey
	name string
	// devType the value the patched GetDevType returns
	devType string
	// buildChips prepares the chip returned by the patched getChipListCache
	buildChips func()
	// expectValue the expected length of the result of GetChipListWithVNPU
	expectValue int
}

// TestGetChipListWithVNPU test method GetChipListWithVNPU.
// A chip without virtual devices yields one entry on Ascend310P and on Ascend910;
// a chip with two active virtual devices on Ascend310P yields two entries.
func TestGetChipListWithVNPU(t *testing.T) {
	n := mockNewNpuCollector()
	// chip is assigned by buildChips of the running case and read by the patch below
	chip := HuaWeiAIChip{}
	tests := []chipsCase{
		{name: "TestGetChipListWithVNPU_310p_no_vnpu",
			devType: api.Ascend310P,
			buildChips: func() {
				chip = createChip()
			},
			expectValue: 1,
		},
		{name: "TestGetChipListWithVNPU_310p_2_vnpus",
			devType: api.Ascend310P,
			buildChips: func() {
				chip = createValidVnpuChip()
			},
			expectValue: num2,
		},
		{name: "TestGetChipListWithVNPU_910",
			devType: api.Ascend910,
			buildChips: func() {
				chip = createChip()
			},
			expectValue: 1,
		},
	}

	convey.Convey("TestGetChipListWithVNPU", t, func() {
		for _, tt := range tests {
			convey.Convey(tt.name, func() {
				tt.buildChips()
				patches := gomonkey.NewPatches()
				defer patches.Reset()
				patches.ApplyMethodReturn(n.Dmgr, "GetDevType", tt.devType)
				patches.ApplyFuncReturn(getChipListCache, []HuaWeiAIChip{chip})

				chips := GetChipListWithVNPU(n)
				convey.So(len(chips), convey.ShouldEqual, tt.expectValue)
			})
		}
	})
}

// createValidVnpuChip build the chip of createChip with two active virtual devices
func createValidVnpuChip() HuaWeiAIChip {
	chip := createChip()
	chip.VDevInfos = &common.VirtualDevInfo{
		VDevActivityInfo: []common.VDevActivityInfo{
			{
				VDevID:       0,
				VDevAiCore:   0,
				VDevTotalMem: 0,
				VDevUsedMem:  0,
				IsVirtualDev: true,
			},
			{
				VDevID:       1,
				VDevAiCore:   1,
				VDevTotalMem: 1,
				VDevUsedMem:  1,
				IsVirtualDev: true,
			},
		},
	}
	return chip
}

// createChip build a chip with all IDs 0 and an Ascend910 chip info, VDevInfos is nil
func createChip() HuaWeiAIChip {
	return HuaWeiAIChip{
		CardId:   0,
		PhyId:    0,
		DeviceID: 0,
		LogicID:  0,
		ChipInfo: &common.ChipInfo{
			Name:    api.Ascend910,
			Type:    "Ascend",
			Version: "V1",
		},
	}
}

// TestSetPCIeBusInfo test method setPCIeBusInfo.
// GetPCIeBusInfo is patched to return mockPcieBus together with the error of the
// case; the chip must carry the bus info on success and "" on error, for Ascend910
// as well as for Atlas200ISoc.
func TestSetPCIeBusInfo(t *testing.T) {
	const mockPcieBus = "0000:01:00.0"
	tests := []struct {
		name         string
		productTypes []string
		err          error
		expectValue  string
	}{{
		name:         "TestSetPCIeBusInfo_910",
		productTypes: []string{api.Ascend910},
		err:          nil,
		expectValue:  mockPcieBus,
	}, {
		name:         "TestSetPCIeBusInfo_910_err",
		productTypes: []string{api.Ascend910},
		err:          mockErr,
		expectValue:  "",
	}, {
		name:         "TestSetPCIeBusInfo_Atlas200ISoc",
		productTypes: []string{common.Atlas200ISoc},
		err:          nil,
		expectValue:  mockPcieBus,
	}, {
		name:         "TestSetPCIeBusInfo_Atlas200ISoc_err",
		productTypes: []string{common.Atlas200ISoc},
		err:          mockErr,
		expectValue:  "",
	}}
	// the same chip is reused by all cases, every case overwrites PCIeBusInfo
	chip := createChip()
	convey.Convey("TestSetPCIeBusInfo", t, func() {
		for _, tt := range tests {
			convey.Convey(tt.name, func() {
				dmgr := &devmanager.DeviceManager{ProductTypes: tt.productTypes}
				patches := gomonkey.NewPatches()
				defer patches.Reset()
				patches.ApplyMethodReturn(dmgr, "GetPCIeBusInfo", mockPcieBus, tt.err)

				setPCIeBusInfo(0, dmgr, &chip)
				convey.So(chip.PCIeBusInfo, convey.ShouldEqual, tt.expectValue)
			})
		}
	})
}

// setElabelInfoTestCase one table entry for TestSetElabelInfo.
// Only expectSerial is asserted by executeSetElabelInfoTest; the other expect*
// fields are filled in but not checked.
type setElabelInfoTestCase struct {
	name                   string
	cardID                 int32
	mockElabelInfo         common.ElabelInfo
	mockError              error
	expectSerial           string
	expectProduct          string
	expectModel            string
	expectManufacturer     string
	expectManufacturerDate string
}

// createSetElabelInfoTestCases build the cases of TestSetElabelInfo: a successful
// query and a failing one that must fall back to the default serial number
func createSetElabelInfoTestCases() []setElabelInfoTestCase {
	return []setElabelInfoTestCase{
		{
			name:   "should set elabel info successfully when GetCardElabelV2 returns valid data",
			cardID: testCardID,
			mockElabelInfo: common.ElabelInfo{
				ProductName:      testProductName,
				Model:            testModel,
				Manufacturer:     testManufacturer,
				ManufacturerDate: testManufacturerDate,
				SerialNumber:     testSerialNumber,
			},
			mockError:              nil,
			expectSerial:           testSerialNumber,
			expectProduct:          testProductName,
			expectModel:            testModel,
			expectManufacturer:     testManufacturer,
			expectManufacturerDate: testManufacturerDate,
		},
		{
			name:                   "should set default elabel info when GetCardElabelV2 returns error",
			cardID:                 testCardID,
			mockElabelInfo:         common.ElabelInfo{},
			mockError:              testError,
			expectSerial:           testDefaultSerial,
			expectProduct:          "",
			expectModel:            "",
			expectManufacturer:     "",
			expectManufacturerDate: "",
		},
	}
}

// executeSetElabelInfoTest run setElabelInfo against a device manager whose
// GetCardElabelV2 is patched with the values of tc, then assert that ElabelInfo is
// set and carries the expected serial number. Must be called inside a convey scope.
func executeSetElabelInfoTest(tc setElabelInfoTestCase) {
	// Create mock device manager
	mockDmgr := &devmanager.DeviceManager{}

	// Create test chip
	chip := &HuaWeiAIChip{}

	// Apply gomonkey patches
	patches := gomonkey.NewPatches()
	defer patches.Reset()

	patches.ApplyMethodReturn(mockDmgr, "GetCardElabelV2",
		tc.mockElabelInfo, tc.mockError)

	// Execute the function under test
	setElabelInfo(chip, mockDmgr, tc.cardID)

	// Verify results
	convey.So(chip.ElabelInfo, convey.ShouldNotBeNil)
	convey.So(chip.ElabelInfo.SerialNumber, convey.ShouldEqual, tc.expectSerial)
}

// TestSetElabelInfo test setElabelInfo method
func TestSetElabelInfo(t *testing.T) {
	testCases := createSetElabelInfoTestCases()

	convey.Convey("TestSetElabelInfo", t, func() {
		for _, tc := range testCases {
			convey.Convey(tc.name, func() {
				executeSetElabelInfoTest(tc)
			})
		}
	})
}
diff --git a/mind-cluster/component/npu-exporter/collector/common/types.go b/mind-cluster/component/npu-exporter/collector/common/types.go
new file mode 100644
index 0000000..4576c85
--- /dev/null
+++ b/mind-cluster/component/npu-exporter/collector/common/types.go
@@ -0,0 +1,50 @@
/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved.
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
// HuaWeiAIChip aggregates the identifiers and device-manager info structs
// describing a single NPU chip. Fields that carry a json tag use snake_case
// keys; MainBoardId, BoardInfo and PCIeBusInfo have no json tag.
type HuaWeiAIChip struct {

	// CardId npu card id
	CardId int32 `json:"card_id"`
	// PhyId npu chip phy id
	PhyId int32 `json:"phy_id"`
	// DeviceID the chip physical ID
	DeviceID int32 `json:"device_id"`
	// LogicID the chip logic ID
	LogicID int32 `json:"logic_id"`
	// VDieID the vdie id
	VDieID string `json:"vdie_id"`
	// MainBoardId main board id, used to distinguish between A900A3SuperPod and A9000A3SuperPod
	MainBoardId uint32
	// ChipInfo the chip info
	ChipInfo *common.ChipInfo `json:"chip_info"`
	// BoardInfo board info of the device; it is kept on the chip but not displayed
	BoardInfo *common.BoardInfo

	// VDevActivityInfo the active virtual device info
	VDevActivityInfo *common.VDevActivityInfo `json:"v_dev_activity_info"`
	// VDevInfos the virtual device info
	VDevInfos *common.VirtualDevInfo `json:"v_dev_infos"`
	// PCIeBusInfo PCIe bus info of the chip
	PCIeBusInfo string
	// ElabelInfo elabel info including SN (serial number)
	ElabelInfo *common.ElabelInfo `json:"elabel_info"`
}
+ See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package config for general collector +package config + +import ( + "encoding/json" + "fmt" + "reflect" + + "huawei.com/npu-exporter/v6/collector/common" + "huawei.com/npu-exporter/v6/collector/metrics" + "huawei.com/npu-exporter/v6/utils/logger" + + "ascend-common/common-utils/utils" +) + +var ( + // singleGoroutineMap metrics in this map will be collected in single goroutine + singleGoroutineMap = map[string]common.MetricsCollector{ + groupHccs: &metrics.HccsCollector{}, + groupNpu: &metrics.BaseInfoCollector{}, + groupSio: &metrics.SioCollector{}, + groupVersion: &metrics.VersionCollector{}, + groupHbm: &metrics.HbmCollector{}, + groupDDR: &metrics.DdrCollector{}, + groupVnpu: &metrics.VnpuCollector{}, + groupPcie: &metrics.PcieCollector{}, + } + // multiGoroutineMap metrics in this map will be collected in multi goroutine + multiGoroutineMap = map[string]common.MetricsCollector{ + groupNetwork: &metrics.NetworkCollector{}, + groupRoce: &metrics.RoceCollector{}, + groupOptical: &metrics.OpticalCollector{}, + } + // pluginCollectorMap metrics in this map will be collected in plugin goroutine + pluginCollectorMap = map[string]common.MetricsCollector{} + presetConfigs = make([]map[string]string, 0) + pluginConfigs = make([]map[string]string, 0) + + defaultPresetConfigs = []map[string]string{ + {metricsGroup: groupDDR, state: stateOn}, + {metricsGroup: groupHccs, state: stateOn}, + {metricsGroup: groupNpu, state: stateOn}, + {metricsGroup: groupNetwork, state: stateOn}, + {metricsGroup: groupPcie, state: stateOn}, + {metricsGroup: groupRoce, state: stateOn}, + {metricsGroup: groupSio, state: stateOn}, + {metricsGroup: groupVnpu, state: stateOn}, + {metricsGroup: groupVersion, state: stateOn}, + {metricsGroup: groupOptical, state: stateOn}, + {metricsGroup: groupHbm, state: stateOn}, + } + defaultPluginConfigs = []map[string]string{ + {metricsGroup: 
groupText, state: stateOn}, + } +) + +const ( + metricsGroup = "metricsGroup" + state = "state" + + groupDDR = "ddr" + groupHccs = "hccs" + groupNpu = "npu" + groupNetwork = "network" + groupPcie = "pcie" + groupRoce = "roce" + groupSio = "sio" + groupVnpu = "vnpu" + groupVersion = "version" + groupOptical = "optical" + groupHbm = "hbm" + groupText = "text" + + stateOn = "ON" + stateOFF = "OFF" +) + +const ( + PresetConfigPath = "/usr/local/metricConfiguration.json" + PluginConfigPath = "/usr/local/pluginConfiguration.json" +) + +func loadConfiguration() { + if fileBytes := loadFromFile(PresetConfigPath); fileBytes == nil { + logger.Warnf("load config from file %s failed, use default config", PresetConfigPath) + presetConfigs = defaultPresetConfigs + } else { + initConfiguration(fileBytes, &presetConfigs) + } + if fileBytes := loadFromFile(PluginConfigPath); fileBytes == nil { + logger.Warnf("load config from file %s failed, use default config", PluginConfigPath) + pluginConfigs = defaultPluginConfigs + } else { + initConfiguration(fileBytes, &pluginConfigs) + } +} + +func loadFromFile(filePath string) []byte { + fileBytes, err := utils.LoadFile(filePath) + if err != nil { + return nil + } + return fileBytes +} + +func initConfiguration(fileBytes []byte, configs *[]map[string]string) { + if err := json.Unmarshal(fileBytes, configs); err != nil { + logger.Errorf("unmarshal config byte failed: %v", err) + return + } +} + +// AddPluginCollector add plugin collector to cache +func AddPluginCollector(name string, collector common.MetricsCollector) error { + if _, exist := pluginCollectorMap[name]; exist { + logger.Errorf("plugin collector %v already exist", name) + return fmt.Errorf("plugin collector %v already exist", name) + } + logger.Infof("add plugin collector %v ok", name) + pluginCollectorMap[name] = collector + return nil +} + +// DeletePluginCollector delete plugin collector from cache +func DeletePluginCollector(name string) { + if _, exist := 
pluginCollectorMap[name]; !exist { + logger.Warnf("plugin collector %v does not exist", name) + return + } + logger.Infof("delete plugin collector %v ok", name) + delete(pluginCollectorMap, name) +} + +// Register register collector to cache +func Register(n *common.NpuCollector) { + loadConfiguration() + + for _, config := range presetConfigs { + metricsGroupName := config[metricsGroup] + + if config[state] != stateOn { + logger.Infof("metricsGroup [%v] is off", metricsGroupName) + continue + } + logger.Infof("metricsGroup [%v] is on", metricsGroupName) + collector, exist := singleGoroutineMap[metricsGroupName] + if exist && collector.IsSupported(n) { + common.ChainForSingleGoroutine = append(common.ChainForSingleGoroutine, collector) + } + + collector, exist = multiGoroutineMap[metricsGroupName] + if exist && collector.IsSupported(n) { + common.ChainForMultiGoroutine = append(common.ChainForMultiGoroutine, collector) + } + } + + for _, config := range pluginConfigs { + metricsGroupName := config[metricsGroup] + + if config[state] != stateOn { + logger.Infof("plugin collector [%v] is off", metricsGroupName) + continue + } + logger.Infof("plugin collector [%v] is on", metricsGroupName) + collector, exist := pluginCollectorMap[metricsGroupName] + if exist && collector.IsSupported(n) { + logger.Infof("add plugin collector:%v", metricsGroupName) + common.ChainForCustomPlugin = append(common.ChainForCustomPlugin, collector) + } + + } + + logger.Infof("ChainForSingleGoroutine:%#v", common.ChainForSingleGoroutine) + logger.Infof("ChainForMultiGoroutine:%#v", common.ChainForMultiGoroutine) + logger.Infof("ChainForCustomPlugin:%#v", common.ChainForCustomPlugin) +} + +// UnRegister delete collector from chain +func UnRegister(worker reflect.Type) { + logger.Debugf("unRegister collector:%v", worker) + unRegisterChain(worker, &common.ChainForSingleGoroutine) + unRegisterChain(worker, &common.ChainForMultiGoroutine) + unRegisterChain(worker, &common.ChainForCustomPlugin) +} + 
+func unRegisterChain(worker reflect.Type, chain *[]common.MetricsCollector) { + newChain := make([]common.MetricsCollector, 0) + for _, collector := range *chain { + if reflect.TypeOf(collector) != worker { + newChain = append(newChain, collector) + } + } + *chain = newChain +} diff --git a/mind-cluster/component/npu-exporter/collector/config/metrics_config_test.go b/mind-cluster/component/npu-exporter/collector/config/metrics_config_test.go new file mode 100644 index 0000000..974ed3e --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/config/metrics_config_test.go @@ -0,0 +1,216 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package config for general collector +package config + +import ( + "ascend-common/common-utils/utils" + "reflect" + "testing" + + "github.com/agiledragon/gomonkey/v2" + "github.com/smartystreets/goconvey/convey" + + "ascend-common/common-utils/hwlog" + "huawei.com/npu-exporter/v6/collector/common" + "huawei.com/npu-exporter/v6/collector/metrics" + "huawei.com/npu-exporter/v6/utils/logger" +) + +func init() { + logger.HwLogConfig = &hwlog.LogConfig{ + OnlyToStdout: true, + } + logger.InitLogger("Prometheus") + initChain() +} + +func initChain() { + common.ChainForSingleGoroutine = []common.MetricsCollector{} + common.ChainForMultiGoroutine = []common.MetricsCollector{} +} + +func TestInitConfiguration(t *testing.T) { + convey.Convey("TestInitConfiguration", t, func() { + initConfiguration([]byte("test"), &presetConfigs) + convey.So(len(presetConfigs), convey.ShouldEqual, 0) + }) +} + +func TestLoadConfiguration(t *testing.T) { + convey.Convey("TestLoadConfiguration", t, func() { + patches := gomonkey.NewPatches() + defer patches.Reset() + convey.Convey("load config ok", func() { + patches.ApplyFunc(loadFromFile, func(filePath string) []byte { + if filePath == PresetConfigPath { + filePath = "../../build/metricConfiguration.json" + } else if filePath == PluginConfigPath { + filePath = "../../build/pluginConfiguration.json" + } + fileBytes, _ := utils.LoadFile(filePath) + return fileBytes + }) + defer func() { + presetConfigs = make([]map[string]string, 0) + pluginConfigs = make([]map[string]string, 0) + }() + loadConfiguration() + convey.So(len(presetConfigs), convey.ShouldBeGreaterThan, 0) + convey.So(len(pluginConfigs), convey.ShouldBeGreaterThan, 0) + }) + convey.Convey("load config fail", func() { + presetConfigs = make([]map[string]string, 0) + pluginConfigs = make([]map[string]string, 0) + patches.ApplyFunc(loadFromFile, func(filePath string) []byte { + return nil + }) + loadConfiguration() + convey.So(len(presetConfigs), convey.ShouldEqual, 
len(defaultPresetConfigs)) + convey.So(len(pluginConfigs), convey.ShouldEqual, len(defaultPluginConfigs)) + }) + }) +} + +func TestAddPluginCollector(t *testing.T) { + convey.Convey("TestAddPluginCollector", t, func() { + convey.Convey("add plugin ok", func() { + pluginCollectorMap = make(map[string]common.MetricsCollector) + defer func() { + pluginCollectorMap = make(map[string]common.MetricsCollector) + }() + err := AddPluginCollector("test", &metrics.HccsCollector{}) + convey.So(err, convey.ShouldBeNil) + }) + convey.Convey("add plugin fail", func() { + pluginCollectorMap["test"] = &metrics.HccsCollector{} + defer func() { + pluginCollectorMap = make(map[string]common.MetricsCollector) + }() + err := AddPluginCollector("test", &metrics.HccsCollector{}) + convey.So(err, convey.ShouldNotBeNil) + }) + }) +} + +func TestDeletePluginCollector(t *testing.T) { + convey.Convey("TestDeletePluginCollector", t, func() { + convey.Convey("delete plugin ok", func() { + pluginCollectorMap["test"] = &metrics.HccsCollector{} + DeletePluginCollector("test") + convey.So(pluginCollectorMap["test"], convey.ShouldBeNil) + }) + convey.Convey("delete plugin fail", func() { + pluginCollectorMap = make(map[string]common.MetricsCollector) + DeletePluginCollector("test") + convey.So(len(pluginCollectorMap), convey.ShouldEqual, 0) + }) + }) +} + +func TestRegister(t *testing.T) { + convey.Convey("TestRegister", t, func() { + n := &common.NpuCollector{} + patches := gomonkey.NewPatches() + defer patches.Reset() + // Mock IsSupported method to always return true + patches.ApplyMethodReturn(&metrics.HccsCollector{}, "IsSupported", true) + patches.ApplyMethodReturn(&metrics.BaseInfoCollector{}, "IsSupported", true) + patches.ApplyMethodReturn(&metrics.SioCollector{}, "IsSupported", true) + patches.ApplyMethodReturn(&metrics.VersionCollector{}, "IsSupported", true) + patches.ApplyMethodReturn(&metrics.HbmCollector{}, "IsSupported", true) + patches.ApplyMethodReturn(&metrics.DdrCollector{}, 
"IsSupported", true) + patches.ApplyMethodReturn(&metrics.VnpuCollector{}, "IsSupported", true) + patches.ApplyMethodReturn(&metrics.PcieCollector{}, "IsSupported", true) + patches.ApplyMethodReturn(&metrics.NetworkCollector{}, "IsSupported", true) + patches.ApplyMethodReturn(&metrics.RoceCollector{}, "IsSupported", true) + patches.ApplyMethodReturn(&metrics.OpticalCollector{}, "IsSupported", true) + patches.ApplyFunc(loadConfiguration, func() { + initConfiguration(loadFromFile("../../build/metricConfiguration.json"), &presetConfigs) + initConfiguration(loadFromFile("../../build/pluginConfiguration.json"), &pluginConfigs) + }) + Register(n) + convey.Convey("Should add collectors to ChainForSingleGoroutine", func() { + convey.So(len(common.ChainForSingleGoroutine), convey.ShouldBeGreaterThan, 0) + }) + convey.Convey("Should add collectors to ChainForMultiGoroutine", func() { + convey.So(len(common.ChainForMultiGoroutine), convey.ShouldBeGreaterThan, 0) + }) + }) +} + +func TestUnRegister(t *testing.T) { + convey.Convey("TestUnRegister", t, func() { + // Initialize chains with some collectors + common.ChainForSingleGoroutine = []common.MetricsCollector{ + &metrics.HccsCollector{}, + &metrics.BaseInfoCollector{}, + } + common.ChainForMultiGoroutine = []common.MetricsCollector{ + &metrics.NetworkCollector{}, + &metrics.RoceCollector{}, + } + + convey.Convey("When UnRegister is called with HccsCollector type", func() { + UnRegister(reflect.TypeOf(&metrics.HccsCollector{})) + + convey.Convey("Should remove HccsCollector from ChainForSingleGoroutine", func() { + expected := []common.MetricsCollector{ + &metrics.BaseInfoCollector{}, + } + convey.So(len(common.ChainForSingleGoroutine), convey.ShouldEqual, len(expected)) + for i, collector := range common.ChainForSingleGoroutine { + convey.So(reflect.TypeOf(collector), convey.ShouldEqual, reflect.TypeOf(expected[i])) + } + }) + + convey.Convey("Should not affect ChainForMultiGoroutine", func() { + expected := 
[]common.MetricsCollector{ + &metrics.NetworkCollector{}, + &metrics.RoceCollector{}, + } + convey.So(len(common.ChainForMultiGoroutine), convey.ShouldEqual, len(expected)) + for i, collector := range common.ChainForMultiGoroutine { + convey.So(reflect.TypeOf(collector), convey.ShouldEqual, reflect.TypeOf(expected[i])) + } + }) + }) + }) +} + +func TestUnRegisterChain(t *testing.T) { + convey.Convey("TestUnRegisterChain", t, func() { + // Initialize a chain with some collectors + chain := []common.MetricsCollector{ + &metrics.HccsCollector{}, + &metrics.BaseInfoCollector{}, + &metrics.NetworkCollector{}, + } + + convey.Convey("When unRegisterChain is called with BaseInfoCollector type", func() { + unRegisterChain(reflect.TypeOf(&metrics.BaseInfoCollector{}), &chain) + convey.Convey("Should remove BaseInfoCollector from the chain", func() { + expected := []common.MetricsCollector{ + &metrics.HccsCollector{}, + &metrics.NetworkCollector{}, + } + convey.So(len(chain), convey.ShouldEqual, len(expected)) + for i, collector := range chain { + convey.So(reflect.TypeOf(collector), convey.ShouldEqual, reflect.TypeOf(expected[i])) + } + }) + }) + }) +} diff --git a/mind-cluster/component/npu-exporter/collector/container/isula/isula_api.pb.go b/mind-cluster/component/npu-exporter/collector/container/isula/isula_api.pb.go new file mode 100644 index 0000000..5ee3c7f --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/container/isula/isula_api.pb.go @@ -0,0 +1,870 @@ +// +//Copyright 2018 The Kubernetes Authors. +//Copyright (c) Huawei Technologies Co., Ltd. 2019. All rights reserved. +//modify descripe: remove unused options for example: +//remove import "github.com/gogo/protobuf/gogoproto/gogo.proto" +// +//Licensed under the Apache License, Version 2.0 (the "License"); +//you may not use this file except in compliance with the License. 
+//You may obtain a copy of the License at +// +//http://www.apache.org/licenses/LICENSE-2.0 +// +//Unless required by applicable law or agreed to in writing, software +//distributed under the License is distributed on an "AS IS" BASIS, +//WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//See the License for the specific language governing permissions and +//limitations under the License. + +// To regenerate api.pb.go run hack/update-generated-runtime.sh + +// Code generated by protoc-gen-go. DO NOT EDIT. +// versions: +// protoc-gen-go v1.28.1 +// protoc v3.13.0 +// source: isula_api.proto + +package isula + +import ( + protoreflect "google.golang.org/protobuf/reflect/protoreflect" + protoimpl "google.golang.org/protobuf/runtime/protoimpl" + reflect "reflect" + sync "sync" +) + +const ( + // Verify that this generated code is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) + // Verify that runtime/protoimpl is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) +) + +type ContainerState int32 + +const ( + ContainerState_CONTAINER_CREATED ContainerState = 0 + ContainerState_CONTAINER_RUNNING ContainerState = 1 + ContainerState_CONTAINER_EXITED ContainerState = 2 + ContainerState_CONTAINER_UNKNOWN ContainerState = 3 +) + +// Enum value maps for ContainerState. 
+var ( + ContainerState_name = map[int32]string{ + 0: "CONTAINER_CREATED", + 1: "CONTAINER_RUNNING", + 2: "CONTAINER_EXITED", + 3: "CONTAINER_UNKNOWN", + } + ContainerState_value = map[string]int32{ + "CONTAINER_CREATED": 0, + "CONTAINER_RUNNING": 1, + "CONTAINER_EXITED": 2, + "CONTAINER_UNKNOWN": 3, + } +) + +func (x ContainerState) Enum() *ContainerState { + p := new(ContainerState) + *p = x + return p +} + +func (x ContainerState) String() string { + return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) +} + +func (ContainerState) Descriptor() protoreflect.EnumDescriptor { + return file_isula_api_proto_enumTypes[0].Descriptor() +} + +func (ContainerState) Type() protoreflect.EnumType { + return &file_isula_api_proto_enumTypes[0] +} + +func (x ContainerState) Number() protoreflect.EnumNumber { + return protoreflect.EnumNumber(x) +} + +// Deprecated: Use ContainerState.Descriptor instead. +func (ContainerState) EnumDescriptor() ([]byte, []int) { + return file_isula_api_proto_rawDescGZIP(), []int{0} +} + +// ImageSpec is an internal representation of an image. Currently, it wraps the +// value of a Container's Image field (e.g. imageID or imageDigest), but in the +// future it will include more detailed information about the different image types. +type ImageSpec struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Image string `protobuf:"bytes,1,opt,name=image,proto3" json:"image,omitempty"` + // Unstructured key-value map holding arbitrary metadata. + // ImageSpec Annotations can be used to help the runtime target specific + // images in multi-arch images. 
+ Annotations map[string]string `protobuf:"bytes,2,rep,name=annotations,proto3" json:"annotations,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` +} + +func (x *ImageSpec) Reset() { + *x = ImageSpec{} + if protoimpl.UnsafeEnabled { + mi := &file_isula_api_proto_msgTypes[0] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ImageSpec) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ImageSpec) ProtoMessage() {} + +func (x *ImageSpec) ProtoReflect() protoreflect.Message { + mi := &file_isula_api_proto_msgTypes[0] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ImageSpec.ProtoReflect.Descriptor instead. +func (*ImageSpec) Descriptor() ([]byte, []int) { + return file_isula_api_proto_rawDescGZIP(), []int{0} +} + +func (x *ImageSpec) GetImage() string { + if x != nil { + return x.Image + } + return "" +} + +func (x *ImageSpec) GetAnnotations() map[string]string { + if x != nil { + return x.Annotations + } + return nil +} + +// ContainerMetadata holds all necessary information for building the container +// name. The container runtime is encouraged to expose the metadata in its user +// interface for better user experience. E.g., runtime can construct a unique +// container name based on the metadata. Note that (name, attempt) is unique +// within a sandbox for the entire lifetime of the sandbox. +type ContainerMetadata struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + // Name of the container. Same as the container name in the PodSpec. + Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"` + // Attempt number of creating the container. Default: 0. 
+ Attempt uint32 `protobuf:"varint,2,opt,name=attempt,proto3" json:"attempt,omitempty"` +} + +func (x *ContainerMetadata) Reset() { + *x = ContainerMetadata{} + if protoimpl.UnsafeEnabled { + mi := &file_isula_api_proto_msgTypes[1] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ContainerMetadata) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ContainerMetadata) ProtoMessage() {} + +func (x *ContainerMetadata) ProtoReflect() protoreflect.Message { + mi := &file_isula_api_proto_msgTypes[1] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ContainerMetadata.ProtoReflect.Descriptor instead. +func (*ContainerMetadata) Descriptor() ([]byte, []int) { + return file_isula_api_proto_rawDescGZIP(), []int{1} +} + +func (x *ContainerMetadata) GetName() string { + if x != nil { + return x.Name + } + return "" +} + +func (x *ContainerMetadata) GetAttempt() uint32 { + if x != nil { + return x.Attempt + } + return 0 +} + +// ContainerStateValue is the wrapper of ContainerState. +type ContainerStateValue struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + // State of the container. 
+ State ContainerState `protobuf:"varint,1,opt,name=state,proto3,enum=runtime.v1alpha2.ContainerState" json:"state,omitempty"` +} + +func (x *ContainerStateValue) Reset() { + *x = ContainerStateValue{} + if protoimpl.UnsafeEnabled { + mi := &file_isula_api_proto_msgTypes[2] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ContainerStateValue) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ContainerStateValue) ProtoMessage() {} + +func (x *ContainerStateValue) ProtoReflect() protoreflect.Message { + mi := &file_isula_api_proto_msgTypes[2] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ContainerStateValue.ProtoReflect.Descriptor instead. +func (*ContainerStateValue) Descriptor() ([]byte, []int) { + return file_isula_api_proto_rawDescGZIP(), []int{2} +} + +func (x *ContainerStateValue) GetState() ContainerState { + if x != nil { + return x.State + } + return ContainerState_CONTAINER_CREATED +} + +// ContainerFilter is used to filter containers. +// All those fields are combined with 'AND' +type ContainerFilter struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + // ID of the container. + Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"` + // State of the container. + State *ContainerStateValue `protobuf:"bytes,2,opt,name=state,proto3" json:"state,omitempty"` + // ID of the PodSandbox. + PodSandboxId string `protobuf:"bytes,3,opt,name=pod_sandbox_id,json=podSandboxId,proto3" json:"pod_sandbox_id,omitempty"` + // LabelSelector to select matches. + // Only api.MatchLabels is supported for now and the requirements + // are ANDed. MatchExpressions is not supported yet. 
+ LabelSelector map[string]string `protobuf:"bytes,4,rep,name=label_selector,json=labelSelector,proto3" json:"label_selector,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` +} + +func (x *ContainerFilter) Reset() { + *x = ContainerFilter{} + if protoimpl.UnsafeEnabled { + mi := &file_isula_api_proto_msgTypes[3] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ContainerFilter) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ContainerFilter) ProtoMessage() {} + +func (x *ContainerFilter) ProtoReflect() protoreflect.Message { + mi := &file_isula_api_proto_msgTypes[3] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ContainerFilter.ProtoReflect.Descriptor instead. +func (*ContainerFilter) Descriptor() ([]byte, []int) { + return file_isula_api_proto_rawDescGZIP(), []int{3} +} + +func (x *ContainerFilter) GetId() string { + if x != nil { + return x.Id + } + return "" +} + +func (x *ContainerFilter) GetState() *ContainerStateValue { + if x != nil { + return x.State + } + return nil +} + +func (x *ContainerFilter) GetPodSandboxId() string { + if x != nil { + return x.PodSandboxId + } + return "" +} + +func (x *ContainerFilter) GetLabelSelector() map[string]string { + if x != nil { + return x.LabelSelector + } + return nil +} + +type ListContainersRequest struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Filter *ContainerFilter `protobuf:"bytes,1,opt,name=filter,proto3" json:"filter,omitempty"` +} + +func (x *ListContainersRequest) Reset() { + *x = ListContainersRequest{} + if protoimpl.UnsafeEnabled { + mi := &file_isula_api_proto_msgTypes[4] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + 
ms.StoreMessageInfo(mi) + } +} + +func (x *ListContainersRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ListContainersRequest) ProtoMessage() {} + +func (x *ListContainersRequest) ProtoReflect() protoreflect.Message { + mi := &file_isula_api_proto_msgTypes[4] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ListContainersRequest.ProtoReflect.Descriptor instead. +func (*ListContainersRequest) Descriptor() ([]byte, []int) { + return file_isula_api_proto_rawDescGZIP(), []int{4} +} + +func (x *ListContainersRequest) GetFilter() *ContainerFilter { + if x != nil { + return x.Filter + } + return nil +} + +// Container provides the runtime information for a container, such as ID, hash, +// state of the container. +type Container struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + // ID of the container, used by the container runtime to identify + // a container. + Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"` + // ID of the sandbox to which this container belongs. + PodSandboxId string `protobuf:"bytes,2,opt,name=pod_sandbox_id,json=podSandboxId,proto3" json:"pod_sandbox_id,omitempty"` + // Metadata of the container. + Metadata *ContainerMetadata `protobuf:"bytes,3,opt,name=metadata,proto3" json:"metadata,omitempty"` + // Spec of the image. + Image *ImageSpec `protobuf:"bytes,4,opt,name=image,proto3" json:"image,omitempty"` + // Reference to the image in use. For most runtimes, this should be an + // image ID. + ImageRef string `protobuf:"bytes,5,opt,name=image_ref,json=imageRef,proto3" json:"image_ref,omitempty"` + // State of the container. 
+ State ContainerState `protobuf:"varint,6,opt,name=state,proto3,enum=runtime.v1alpha2.ContainerState" json:"state,omitempty"` + // Creation time of the container in nanoseconds. + CreatedAt int64 `protobuf:"varint,7,opt,name=created_at,json=createdAt,proto3" json:"created_at,omitempty"` + // Key-value pairs that may be used to scope and select individual resources. + Labels map[string]string `protobuf:"bytes,8,rep,name=labels,proto3" json:"labels,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` + // Unstructured key-value map holding arbitrary metadata. + // Annotations MUST NOT be altered by the runtime; the value of this field + // MUST be identical to that of the corresponding ContainerConfig used to + // instantiate this Container. + Annotations map[string]string `protobuf:"bytes,9,rep,name=annotations,proto3" json:"annotations,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` +} + +func (x *Container) Reset() { + *x = Container{} + if protoimpl.UnsafeEnabled { + mi := &file_isula_api_proto_msgTypes[5] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *Container) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*Container) ProtoMessage() {} + +func (x *Container) ProtoReflect() protoreflect.Message { + mi := &file_isula_api_proto_msgTypes[5] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use Container.ProtoReflect.Descriptor instead. 
+func (*Container) Descriptor() ([]byte, []int) { + return file_isula_api_proto_rawDescGZIP(), []int{5} +} + +func (x *Container) GetId() string { + if x != nil { + return x.Id + } + return "" +} + +func (x *Container) GetPodSandboxId() string { + if x != nil { + return x.PodSandboxId + } + return "" +} + +func (x *Container) GetMetadata() *ContainerMetadata { + if x != nil { + return x.Metadata + } + return nil +} + +func (x *Container) GetImage() *ImageSpec { + if x != nil { + return x.Image + } + return nil +} + +func (x *Container) GetImageRef() string { + if x != nil { + return x.ImageRef + } + return "" +} + +func (x *Container) GetState() ContainerState { + if x != nil { + return x.State + } + return ContainerState_CONTAINER_CREATED +} + +func (x *Container) GetCreatedAt() int64 { + if x != nil { + return x.CreatedAt + } + return 0 +} + +func (x *Container) GetLabels() map[string]string { + if x != nil { + return x.Labels + } + return nil +} + +func (x *Container) GetAnnotations() map[string]string { + if x != nil { + return x.Annotations + } + return nil +} + +type ListContainersResponse struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + // List of containers. 
+ Containers []*Container `protobuf:"bytes,1,rep,name=containers,proto3" json:"containers,omitempty"` +} + +func (x *ListContainersResponse) Reset() { + *x = ListContainersResponse{} + if protoimpl.UnsafeEnabled { + mi := &file_isula_api_proto_msgTypes[6] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ListContainersResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ListContainersResponse) ProtoMessage() {} + +func (x *ListContainersResponse) ProtoReflect() protoreflect.Message { + mi := &file_isula_api_proto_msgTypes[6] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ListContainersResponse.ProtoReflect.Descriptor instead. +func (*ListContainersResponse) Descriptor() ([]byte, []int) { + return file_isula_api_proto_rawDescGZIP(), []int{6} +} + +func (x *ListContainersResponse) GetContainers() []*Container { + if x != nil { + return x.Containers + } + return nil +} + +var File_isula_api_proto protoreflect.FileDescriptor + +var file_isula_api_proto_rawDesc = []byte{ + 0x0a, 0x0f, 0x69, 0x73, 0x75, 0x6c, 0x61, 0x5f, 0x61, 0x70, 0x69, 0x2e, 0x70, 0x72, 0x6f, 0x74, + 0x6f, 0x12, 0x10, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, + 0x68, 0x61, 0x32, 0x22, 0xb1, 0x01, 0x0a, 0x09, 0x49, 0x6d, 0x61, 0x67, 0x65, 0x53, 0x70, 0x65, + 0x63, 0x12, 0x14, 0x0a, 0x05, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, + 0x52, 0x05, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x12, 0x4e, 0x0a, 0x0b, 0x61, 0x6e, 0x6e, 0x6f, 0x74, + 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x2c, 0x2e, 0x72, + 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, + 0x49, 0x6d, 0x61, 0x67, 0x65, 0x53, 0x70, 0x65, 0x63, 0x2e, 0x41, 
0x6e, 0x6e, 0x6f, 0x74, 0x61, + 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x52, 0x0b, 0x61, 0x6e, 0x6e, 0x6f, + 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x1a, 0x3e, 0x0a, 0x10, 0x41, 0x6e, 0x6e, 0x6f, 0x74, + 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x12, 0x10, 0x0a, 0x03, 0x6b, + 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x6b, 0x65, 0x79, 0x12, 0x14, 0x0a, + 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x22, 0x41, 0x0a, 0x11, 0x43, 0x6f, 0x6e, 0x74, 0x61, + 0x69, 0x6e, 0x65, 0x72, 0x4d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x12, 0x12, 0x0a, 0x04, + 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x6e, 0x61, 0x6d, 0x65, + 0x12, 0x18, 0x0a, 0x07, 0x61, 0x74, 0x74, 0x65, 0x6d, 0x70, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, + 0x0d, 0x52, 0x07, 0x61, 0x74, 0x74, 0x65, 0x6d, 0x70, 0x74, 0x22, 0x4d, 0x0a, 0x13, 0x43, 0x6f, + 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x53, 0x74, 0x61, 0x74, 0x65, 0x56, 0x61, 0x6c, 0x75, + 0x65, 0x12, 0x36, 0x0a, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0e, + 0x32, 0x20, 0x2e, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, + 0x68, 0x61, 0x32, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x53, 0x74, 0x61, + 0x74, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x22, 0xa3, 0x02, 0x0a, 0x0f, 0x43, 0x6f, + 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x46, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x12, 0x0e, 0x0a, + 0x02, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x3b, 0x0a, + 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x25, 0x2e, 0x72, + 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, + 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x53, 0x74, 0x61, 0x74, 0x65, 0x56, 0x61, + 0x6c, 
0x75, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x12, 0x24, 0x0a, 0x0e, 0x70, 0x6f, + 0x64, 0x5f, 0x73, 0x61, 0x6e, 0x64, 0x62, 0x6f, 0x78, 0x5f, 0x69, 0x64, 0x18, 0x03, 0x20, 0x01, + 0x28, 0x09, 0x52, 0x0c, 0x70, 0x6f, 0x64, 0x53, 0x61, 0x6e, 0x64, 0x62, 0x6f, 0x78, 0x49, 0x64, + 0x12, 0x5b, 0x0a, 0x0e, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x5f, 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, + 0x6f, 0x72, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x34, 0x2e, 0x72, 0x75, 0x6e, 0x74, 0x69, + 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, 0x43, 0x6f, 0x6e, 0x74, + 0x61, 0x69, 0x6e, 0x65, 0x72, 0x46, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x2e, 0x4c, 0x61, 0x62, 0x65, + 0x6c, 0x53, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x52, 0x0d, + 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x53, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x1a, 0x40, 0x0a, + 0x12, 0x4c, 0x61, 0x62, 0x65, 0x6c, 0x53, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x45, 0x6e, + 0x74, 0x72, 0x79, 0x12, 0x10, 0x0a, 0x03, 0x6b, 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, + 0x52, 0x03, 0x6b, 0x65, 0x79, 0x12, 0x14, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, + 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x22, + 0x52, 0x0a, 0x15, 0x4c, 0x69, 0x73, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, + 0x73, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x39, 0x0a, 0x06, 0x66, 0x69, 0x6c, 0x74, + 0x65, 0x72, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x21, 0x2e, 0x72, 0x75, 0x6e, 0x74, 0x69, + 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, 0x43, 0x6f, 0x6e, 0x74, + 0x61, 0x69, 0x6e, 0x65, 0x72, 0x46, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x52, 0x06, 0x66, 0x69, 0x6c, + 0x74, 0x65, 0x72, 0x22, 0xb5, 0x04, 0x0a, 0x09, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, + 0x72, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, + 0x64, 0x12, 0x24, 0x0a, 0x0e, 0x70, 0x6f, 
0x64, 0x5f, 0x73, 0x61, 0x6e, 0x64, 0x62, 0x6f, 0x78, + 0x5f, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0c, 0x70, 0x6f, 0x64, 0x53, 0x61, + 0x6e, 0x64, 0x62, 0x6f, 0x78, 0x49, 0x64, 0x12, 0x3f, 0x0a, 0x08, 0x6d, 0x65, 0x74, 0x61, 0x64, + 0x61, 0x74, 0x61, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x23, 0x2e, 0x72, 0x75, 0x6e, 0x74, + 0x69, 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, 0x43, 0x6f, 0x6e, + 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x4d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x52, 0x08, + 0x6d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x12, 0x31, 0x0a, 0x05, 0x69, 0x6d, 0x61, 0x67, + 0x65, 0x18, 0x04, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1b, 0x2e, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, + 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, 0x49, 0x6d, 0x61, 0x67, 0x65, + 0x53, 0x70, 0x65, 0x63, 0x52, 0x05, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x12, 0x1b, 0x0a, 0x09, 0x69, + 0x6d, 0x61, 0x67, 0x65, 0x5f, 0x72, 0x65, 0x66, 0x18, 0x05, 0x20, 0x01, 0x28, 0x09, 0x52, 0x08, + 0x69, 0x6d, 0x61, 0x67, 0x65, 0x52, 0x65, 0x66, 0x12, 0x36, 0x0a, 0x05, 0x73, 0x74, 0x61, 0x74, + 0x65, 0x18, 0x06, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x20, 0x2e, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, + 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x61, + 0x69, 0x6e, 0x65, 0x72, 0x53, 0x74, 0x61, 0x74, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, + 0x12, 0x1d, 0x0a, 0x0a, 0x63, 0x72, 0x65, 0x61, 0x74, 0x65, 0x64, 0x5f, 0x61, 0x74, 0x18, 0x07, + 0x20, 0x01, 0x28, 0x03, 0x52, 0x09, 0x63, 0x72, 0x65, 0x61, 0x74, 0x65, 0x64, 0x41, 0x74, 0x12, + 0x3f, 0x0a, 0x06, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x73, 0x18, 0x08, 0x20, 0x03, 0x28, 0x0b, 0x32, + 0x27, 0x2e, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, + 0x61, 0x32, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x2e, 0x4c, 0x61, 0x62, + 0x65, 0x6c, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x52, 0x06, 0x6c, 0x61, 0x62, 
0x65, 0x6c, 0x73, + 0x12, 0x4e, 0x0a, 0x0b, 0x61, 0x6e, 0x6e, 0x6f, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x18, + 0x09, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x2c, 0x2e, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x2e, + 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, + 0x65, 0x72, 0x2e, 0x41, 0x6e, 0x6e, 0x6f, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x45, 0x6e, + 0x74, 0x72, 0x79, 0x52, 0x0b, 0x61, 0x6e, 0x6e, 0x6f, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, + 0x1a, 0x39, 0x0a, 0x0b, 0x4c, 0x61, 0x62, 0x65, 0x6c, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x12, + 0x10, 0x0a, 0x03, 0x6b, 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x6b, 0x65, + 0x79, 0x12, 0x14, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, + 0x52, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x1a, 0x3e, 0x0a, 0x10, 0x41, + 0x6e, 0x6e, 0x6f, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x12, + 0x10, 0x0a, 0x03, 0x6b, 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x6b, 0x65, + 0x79, 0x12, 0x14, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, + 0x52, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x22, 0x55, 0x0a, 0x16, 0x4c, + 0x69, 0x73, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x73, 0x52, 0x65, 0x73, + 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x3b, 0x0a, 0x0a, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, + 0x65, 0x72, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x1b, 0x2e, 0x72, 0x75, 0x6e, 0x74, + 0x69, 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, 0x43, 0x6f, 0x6e, + 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x52, 0x0a, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, + 0x72, 0x73, 0x2a, 0x6b, 0x0a, 0x0e, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x53, + 0x74, 0x61, 0x74, 0x65, 0x12, 0x15, 0x0a, 0x11, 0x43, 0x4f, 0x4e, 0x54, 0x41, 0x49, 0x4e, 0x45, + 0x52, 0x5f, 0x43, 
0x52, 0x45, 0x41, 0x54, 0x45, 0x44, 0x10, 0x00, 0x12, 0x15, 0x0a, 0x11, 0x43, + 0x4f, 0x4e, 0x54, 0x41, 0x49, 0x4e, 0x45, 0x52, 0x5f, 0x52, 0x55, 0x4e, 0x4e, 0x49, 0x4e, 0x47, + 0x10, 0x01, 0x12, 0x14, 0x0a, 0x10, 0x43, 0x4f, 0x4e, 0x54, 0x41, 0x49, 0x4e, 0x45, 0x52, 0x5f, + 0x45, 0x58, 0x49, 0x54, 0x45, 0x44, 0x10, 0x02, 0x12, 0x15, 0x0a, 0x11, 0x43, 0x4f, 0x4e, 0x54, + 0x41, 0x49, 0x4e, 0x45, 0x52, 0x5f, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x10, 0x03, 0x32, + 0x77, 0x0a, 0x0e, 0x52, 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x53, 0x65, 0x72, 0x76, 0x69, 0x63, + 0x65, 0x12, 0x65, 0x0a, 0x0e, 0x4c, 0x69, 0x73, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, + 0x65, 0x72, 0x73, 0x12, 0x27, 0x2e, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x2e, 0x76, 0x31, + 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, 0x4c, 0x69, 0x73, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x61, + 0x69, 0x6e, 0x65, 0x72, 0x73, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x28, 0x2e, 0x72, + 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, + 0x4c, 0x69, 0x73, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x73, 0x52, 0x65, + 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x42, 0x0a, 0x5a, 0x08, 0x2e, 0x2f, 0x3b, 0x69, + 0x73, 0x75, 0x6c, 0x61, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, +} + +var ( + file_isula_api_proto_rawDescOnce sync.Once + file_isula_api_proto_rawDescData = file_isula_api_proto_rawDesc +) + +func file_isula_api_proto_rawDescGZIP() []byte { + file_isula_api_proto_rawDescOnce.Do(func() { + file_isula_api_proto_rawDescData = protoimpl.X.CompressGZIP(file_isula_api_proto_rawDescData) + }) + return file_isula_api_proto_rawDescData +} + +var file_isula_api_proto_enumTypes = make([]protoimpl.EnumInfo, 1) +var file_isula_api_proto_msgTypes = make([]protoimpl.MessageInfo, 11) +var file_isula_api_proto_goTypes = []interface{}{ + (ContainerState)(0), // 0: runtime.v1alpha2.ContainerState + (*ImageSpec)(nil), // 1: 
runtime.v1alpha2.ImageSpec + (*ContainerMetadata)(nil), // 2: runtime.v1alpha2.ContainerMetadata + (*ContainerStateValue)(nil), // 3: runtime.v1alpha2.ContainerStateValue + (*ContainerFilter)(nil), // 4: runtime.v1alpha2.ContainerFilter + (*ListContainersRequest)(nil), // 5: runtime.v1alpha2.ListContainersRequest + (*Container)(nil), // 6: runtime.v1alpha2.Container + (*ListContainersResponse)(nil), // 7: runtime.v1alpha2.ListContainersResponse + nil, // 8: runtime.v1alpha2.ImageSpec.AnnotationsEntry + nil, // 9: runtime.v1alpha2.ContainerFilter.LabelSelectorEntry + nil, // 10: runtime.v1alpha2.Container.LabelsEntry + nil, // 11: runtime.v1alpha2.Container.AnnotationsEntry +} +var file_isula_api_proto_depIdxs = []int32{ + 8, // 0: runtime.v1alpha2.ImageSpec.annotations:type_name -> runtime.v1alpha2.ImageSpec.AnnotationsEntry + 0, // 1: runtime.v1alpha2.ContainerStateValue.state:type_name -> runtime.v1alpha2.ContainerState + 3, // 2: runtime.v1alpha2.ContainerFilter.state:type_name -> runtime.v1alpha2.ContainerStateValue + 9, // 3: runtime.v1alpha2.ContainerFilter.label_selector:type_name -> runtime.v1alpha2.ContainerFilter.LabelSelectorEntry + 4, // 4: runtime.v1alpha2.ListContainersRequest.filter:type_name -> runtime.v1alpha2.ContainerFilter + 2, // 5: runtime.v1alpha2.Container.metadata:type_name -> runtime.v1alpha2.ContainerMetadata + 1, // 6: runtime.v1alpha2.Container.image:type_name -> runtime.v1alpha2.ImageSpec + 0, // 7: runtime.v1alpha2.Container.state:type_name -> runtime.v1alpha2.ContainerState + 10, // 8: runtime.v1alpha2.Container.labels:type_name -> runtime.v1alpha2.Container.LabelsEntry + 11, // 9: runtime.v1alpha2.Container.annotations:type_name -> runtime.v1alpha2.Container.AnnotationsEntry + 6, // 10: runtime.v1alpha2.ListContainersResponse.containers:type_name -> runtime.v1alpha2.Container + 5, // 11: runtime.v1alpha2.RuntimeService.ListContainers:input_type -> runtime.v1alpha2.ListContainersRequest + 7, // 12: 
runtime.v1alpha2.RuntimeService.ListContainers:output_type -> runtime.v1alpha2.ListContainersResponse + 12, // [12:13] is the sub-list for method output_type + 11, // [11:12] is the sub-list for method input_type + 11, // [11:11] is the sub-list for extension type_name + 11, // [11:11] is the sub-list for extension extendee + 0, // [0:11] is the sub-list for field type_name +} + +func init() { file_isula_api_proto_init() } +func file_isula_api_proto_init() { + if File_isula_api_proto != nil { + return + } + if !protoimpl.UnsafeEnabled { + file_isula_api_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { + value, ok := v.(*ImageSpec) + if !ok { + return nil + } + + switch v := value; i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_isula_api_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} { + value, ok := v.(*ContainerMetadata) + if !ok { + return nil + } + + switch v := value; i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_isula_api_proto_msgTypes[2].Exporter = func(v interface{}, i int) interface{} { + value, ok := v.(*ContainerStateValue) + if !ok { + return nil + } + + switch v := value; i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_isula_api_proto_msgTypes[3].Exporter = func(v interface{}, i int) interface{} { + value, ok := v.(*ContainerFilter) + if !ok { + return nil + } + + switch v := value; i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_isula_api_proto_msgTypes[4].Exporter = func(v interface{}, i int) interface{} { + value, ok := v.(*ListContainersRequest) + if !ok { + return nil + } + + switch v := value; i { + case 0: + return &v.state + case 1: + return 
&v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_isula_api_proto_msgTypes[5].Exporter = func(v interface{}, i int) interface{} { + value, ok := v.(*Container) + if !ok { + return nil + } + + switch v := value; i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_isula_api_proto_msgTypes[6].Exporter = func(v interface{}, i int) interface{} { + value, ok := v.(*ListContainersResponse) + if !ok { + return nil + } + + switch v := value; i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + } + type x struct{} + out := protoimpl.TypeBuilder{ + File: protoimpl.DescBuilder{ + GoPackagePath: reflect.TypeOf(x{}).PkgPath(), + RawDescriptor: file_isula_api_proto_rawDesc, + NumEnums: 1, + NumMessages: 11, + NumExtensions: 0, + NumServices: 1, + }, + GoTypes: file_isula_api_proto_goTypes, + DependencyIndexes: file_isula_api_proto_depIdxs, + EnumInfos: file_isula_api_proto_enumTypes, + MessageInfos: file_isula_api_proto_msgTypes, + }.Build() + File_isula_api_proto = out.File + file_isula_api_proto_rawDesc = nil + file_isula_api_proto_goTypes = nil + file_isula_api_proto_depIdxs = nil +} diff --git a/mind-cluster/component/npu-exporter/collector/container/isula/isula_api.proto b/mind-cluster/component/npu-exporter/collector/container/isula/isula_api.proto new file mode 100644 index 0000000..3f1f9f9 --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/container/isula/isula_api.proto @@ -0,0 +1,118 @@ +/* +Copyright 2018 The Kubernetes Authors. +Copyright (c) Huawei Technologies Co., Ltd. 2019. All rights reserved. + modify descripe: remove unused options for example: + remove import "github.com/gogo/protobuf/gogoproto/gogo.proto" + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// To regenerate api.pb.go run hack/update-generated-runtime.sh +syntax = 'proto3'; + +package runtime.v1alpha2; +option go_package = "./;isula"; + +// Runtime service defines the public APIs for remote container runtimes +service RuntimeService { + // ListContainers lists all containers by filters. + rpc ListContainers(ListContainersRequest) returns (ListContainersResponse) {} +} + +// ImageSpec is an internal representation of an image. Currently, it wraps the +// value of a Container's Image field (e.g. imageID or imageDigest), but in the +// future it will include more detailed information about the different image types. +message ImageSpec { + string image = 1; + // Unstructured key-value map holding arbitrary metadata. + // ImageSpec Annotations can be used to help the runtime target specific + // images in multi-arch images. + map<string, string> annotations = 2; +} + +// ContainerMetadata holds all necessary information for building the container +// name. The container runtime is encouraged to expose the metadata in its user +// interface for better user experience. E.g., runtime can construct a unique +// container name based on the metadata. Note that (name, attempt) is unique +// within a sandbox for the entire lifetime of the sandbox. +message ContainerMetadata { + // Name of the container. Same as the container name in the PodSpec. + string name = 1; + // Attempt number of creating the container. Default: 0. 
+ uint32 attempt = 2; +} + +enum ContainerState { + CONTAINER_CREATED = 0; + CONTAINER_RUNNING = 1; + CONTAINER_EXITED = 2; + CONTAINER_UNKNOWN = 3; +} + +// ContainerStateValue is the wrapper of ContainerState. +message ContainerStateValue { + // State of the container. + ContainerState state = 1; +} + +// ContainerFilter is used to filter containers. +// All those fields are combined with 'AND' +message ContainerFilter { + // ID of the container. + string id = 1; + // State of the container. + ContainerStateValue state = 2; + // ID of the PodSandbox. + string pod_sandbox_id = 3; + // LabelSelector to select matches. + // Only api.MatchLabels is supported for now and the requirements + // are ANDed. MatchExpressions is not supported yet. + map<string, string> label_selector = 4; +} + +message ListContainersRequest { + ContainerFilter filter = 1; +} + +// Container provides the runtime information for a container, such as ID, hash, +// state of the container. +message Container { + // ID of the container, used by the container runtime to identify + // a container. + string id = 1; + // ID of the sandbox to which this container belongs. + string pod_sandbox_id = 2; + // Metadata of the container. + ContainerMetadata metadata = 3; + // Spec of the image. + ImageSpec image = 4; + // Reference to the image in use. For most runtimes, this should be an + // image ID. + string image_ref = 5; + // State of the container. + ContainerState state = 6; + // Creation time of the container in nanoseconds. + int64 created_at = 7; + // Key-value pairs that may be used to scope and select individual resources. + map<string, string> labels = 8; + // Unstructured key-value map holding arbitrary metadata. + // Annotations MUST NOT be altered by the runtime; the value of this field + // MUST be identical to that of the corresponding ContainerConfig used to + // instantiate this Container. + map<string, string> annotations = 9; +} + +message ListContainersResponse { + // List of containers. 
+ repeated Container containers = 1; +} diff --git a/mind-cluster/component/npu-exporter/collector/container/isula/isula_api_grpc.pb.go b/mind-cluster/component/npu-exporter/collector/container/isula/isula_api_grpc.pb.go new file mode 100644 index 0000000..a503e15 --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/container/isula/isula_api_grpc.pb.go @@ -0,0 +1,107 @@ +// Code generated by protoc-gen-go-grpc. DO NOT EDIT. +// versions: +// - protoc-gen-go-grpc v1.2.0 +// - protoc v3.13.0 +// source: isula_api.proto + +package isula + +import ( + context "context" + grpc "google.golang.org/grpc" + codes "google.golang.org/grpc/codes" + status "google.golang.org/grpc/status" +) + +// This is a compile-time assertion to ensure that this generated file +// is compatible with the grpc package it is being compiled against. +// Requires gRPC-Go v1.32.0 or later. +const _ = grpc.SupportPackageIsVersion7 + +// RuntimeServiceClient is the client API for RuntimeService service. +// +// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream. +type RuntimeServiceClient interface { + // ListContainers lists all containers by filters. + ListContainers(ctx context.Context, in *ListContainersRequest, opts ...grpc.CallOption) (*ListContainersResponse, error) +} + +type runtimeServiceClient struct { + cc grpc.ClientConnInterface +} + +func NewRuntimeServiceClient(cc grpc.ClientConnInterface) RuntimeServiceClient { + return &runtimeServiceClient{cc} +} + +func (c *runtimeServiceClient) ListContainers(ctx context.Context, in *ListContainersRequest, opts ...grpc.CallOption) (*ListContainersResponse, error) { + out := new(ListContainersResponse) + err := c.cc.Invoke(ctx, "/runtime.v1alpha2.RuntimeService/ListContainers", in, out, opts...) + if err != nil { + return nil, err + } + return out, nil +} + +// RuntimeServiceServer is the server API for RuntimeService service. 
+// All implementations must embed UnimplementedRuntimeServiceServer +// for forward compatibility +type RuntimeServiceServer interface { + // ListContainers lists all containers by filters. + ListContainers(context.Context, *ListContainersRequest) (*ListContainersResponse, error) + mustEmbedUnimplementedRuntimeServiceServer() +} + +// UnimplementedRuntimeServiceServer must be embedded to have forward compatible implementations. +type UnimplementedRuntimeServiceServer struct { +} + +func (UnimplementedRuntimeServiceServer) ListContainers(context.Context, *ListContainersRequest) (*ListContainersResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method ListContainers not implemented") +} +func (UnimplementedRuntimeServiceServer) mustEmbedUnimplementedRuntimeServiceServer() {} + +// UnsafeRuntimeServiceServer may be embedded to opt out of forward compatibility for this service. +// Use of this interface is not recommended, as added methods to RuntimeServiceServer will +// result in compilation errors. 
+type UnsafeRuntimeServiceServer interface { + mustEmbedUnimplementedRuntimeServiceServer() +} + +func RegisterRuntimeServiceServer(s grpc.ServiceRegistrar, srv RuntimeServiceServer) { + s.RegisterService(&RuntimeService_ServiceDesc, srv) +} + +func _RuntimeService_ListContainers_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(ListContainersRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(RuntimeServiceServer).ListContainers(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/runtime.v1alpha2.RuntimeService/ListContainers", + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(RuntimeServiceServer).ListContainers(ctx, req.(*ListContainersRequest)) + } + return interceptor(ctx, in, info, handler) +} + +// RuntimeService_ServiceDesc is the grpc.ServiceDesc for RuntimeService service. +// It's only intended for direct use with grpc.RegisterService, +// and not to be introspected or modified (even as a copy) +var RuntimeService_ServiceDesc = grpc.ServiceDesc{ + ServiceName: "runtime.v1alpha2.RuntimeService", + HandlerType: (*RuntimeServiceServer)(nil), + Methods: []grpc.MethodDesc{ + { + MethodName: "ListContainers", + Handler: _RuntimeService_ListContainers_Handler, + }, + }, + Streams: []grpc.StreamDesc{}, + Metadata: "isula_api.proto", +} diff --git a/mind-cluster/component/npu-exporter/collector/container/isula/isula_container.go b/mind-cluster/component/npu-exporter/collector/container/isula/isula_container.go new file mode 100644 index 0000000..e31fea9 --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/container/isula/isula_container.go @@ -0,0 +1,39 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2024. All rights reserved. 
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package isula is for monitoring isula's NPU allocation +package isula + +// Config represents the container's environment variables +type Config struct { + Env []string `json:"Env,omitempty" platform:"linux"` +} + +// DeviceInfo represents device info +type DeviceInfo struct { + PathInContainer string `json:"PathInContainer,omitempty" platform:"linux"` +} + +// HostConfig represents host config content +type HostConfig struct { + Devices []DeviceInfo `json:"Devices,omitempty" platform:"linux"` + Privileged bool `json:"Privileged,omitempty" platform:"linux"` +} + +// ContainerJson represents container JSON content +type ContainerJson struct { + Config *Config `json:"Config,omitempty" platform:"linux"` + HostConfig *HostConfig `json:"HostConfig,omitempty" platform:"linux"` +} diff --git a/mind-cluster/component/npu-exporter/collector/container/isula/isulad.pb.go b/mind-cluster/component/npu-exporter/collector/container/isula/isulad.pb.go new file mode 100644 index 0000000..5e4f83f --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/container/isula/isulad.pb.go @@ -0,0 +1,278 @@ +// ####################################################################### +// ##- Copyright (c) Huawei Technologies Co., Ltd. 2019. All rights reserved. +// # - iSulad licensed under the Mulan PSL v2. +// # - You can use this software according to the terms and conditions of the Mulan PSL v2. 
+// # - You may obtain a copy of Mulan PSL v2 at: +// # - http://license.coscl.org.cn/MulanPSL2 +// # - THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR +// # - IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR +// # - PURPOSE. +// # - See the Mulan PSL v2 for more details. +// ##- @Description: generate grpc +// ##- @Author: wujing +// ##- @Create: 2019-04-25 +// ####################################################################### + +// Code generated by protoc-gen-go. DO NOT EDIT. +// versions: +// protoc-gen-go v1.28.1 +// protoc v3.13.0 +// source: isulad.proto + +package isula + +import ( + protoreflect "google.golang.org/protobuf/reflect/protoreflect" + protoimpl "google.golang.org/protobuf/runtime/protoimpl" + reflect "reflect" + sync "sync" +) + +const ( + // Verify that this generated code is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) + // Verify that runtime/protoimpl is sufficiently up-to-date. 
+ _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) +) + +type InspectContainerRequest struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"` + Bformat bool `protobuf:"varint,2,opt,name=bformat,proto3" json:"bformat,omitempty"` + Timeout int32 `protobuf:"varint,3,opt,name=timeout,proto3" json:"timeout,omitempty"` +} + +func (x *InspectContainerRequest) Reset() { + *x = InspectContainerRequest{} + if protoimpl.UnsafeEnabled { + mi := &file_isulad_proto_msgTypes[0] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *InspectContainerRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*InspectContainerRequest) ProtoMessage() {} + +func (x *InspectContainerRequest) ProtoReflect() protoreflect.Message { + mi := &file_isulad_proto_msgTypes[0] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use InspectContainerRequest.ProtoReflect.Descriptor instead. 
+func (*InspectContainerRequest) Descriptor() ([]byte, []int) { + return file_isulad_proto_rawDescGZIP(), []int{0} +} + +func (x *InspectContainerRequest) GetId() string { + if x != nil { + return x.Id + } + return "" +} + +func (x *InspectContainerRequest) GetBformat() bool { + if x != nil { + return x.Bformat + } + return false +} + +func (x *InspectContainerRequest) GetTimeout() int32 { + if x != nil { + return x.Timeout + } + return 0 +} + +type InspectContainerResponse struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + ContainerJSON string `protobuf:"bytes,1,opt,name=ContainerJSON,proto3" json:"ContainerJSON,omitempty"` + Cc uint32 `protobuf:"varint,2,opt,name=cc,proto3" json:"cc,omitempty"` + Errmsg string `protobuf:"bytes,3,opt,name=errmsg,proto3" json:"errmsg,omitempty"` +} + +func (x *InspectContainerResponse) Reset() { + *x = InspectContainerResponse{} + if protoimpl.UnsafeEnabled { + mi := &file_isulad_proto_msgTypes[1] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *InspectContainerResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*InspectContainerResponse) ProtoMessage() {} + +func (x *InspectContainerResponse) ProtoReflect() protoreflect.Message { + mi := &file_isulad_proto_msgTypes[1] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use InspectContainerResponse.ProtoReflect.Descriptor instead. 
// GetContainerJSON returns the ContainerJSON field, or "" when the receiver is nil.
func (x *InspectContainerResponse) GetContainerJSON() string {
	if x != nil {
		return x.ContainerJSON
	}
	return ""
}

// GetCc returns the Cc field, or 0 when the receiver is nil.
func (x *InspectContainerResponse) GetCc() uint32 {
	if x != nil {
		return x.Cc
	}
	return 0
}

// GetErrmsg returns the Errmsg field, or "" when the receiver is nil.
func (x *InspectContainerResponse) GetErrmsg() string {
	if x != nil {
		return x.Errmsg
	}
	return ""
}
// file_isulad_proto_rawDescGZIP returns the raw descriptor data in gzip-compressed form.
// The compression is guarded by file_isulad_proto_rawDescOnce, so it runs at most once;
// the compressed bytes overwrite file_isulad_proto_rawDescData and are what every
// later call returns.
func file_isulad_proto_rawDescGZIP() []byte {
	file_isulad_proto_rawDescOnce.Do(func() {
		file_isulad_proto_rawDescData = protoimpl.X.CompressGZIP(file_isulad_proto_rawDescData)
	})
	return file_isulad_proto_rawDescData
}
!protoimpl.UnsafeEnabled { + file_isulad_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { + value, ok := v.(*InspectContainerRequest) + if !ok { + return nil + } + + switch v := value; i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_isulad_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} { + value, ok := v.(*InspectContainerResponse) + if !ok { + return nil + } + + switch v := value; i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + } + type x struct{} + out := protoimpl.TypeBuilder{ + File: protoimpl.DescBuilder{ + GoPackagePath: reflect.TypeOf(x{}).PkgPath(), + RawDescriptor: file_isulad_proto_rawDesc, + NumEnums: 0, + NumMessages: 2, + NumExtensions: 0, + NumServices: 1, + }, + GoTypes: file_isulad_proto_goTypes, + DependencyIndexes: file_isulad_proto_depIdxs, + MessageInfos: file_isulad_proto_msgTypes, + }.Build() + File_isulad_proto = out.File + file_isulad_proto_rawDesc = nil + file_isulad_proto_goTypes = nil + file_isulad_proto_depIdxs = nil +} diff --git a/mind-cluster/component/npu-exporter/collector/container/isula/isulad.proto b/mind-cluster/component/npu-exporter/collector/container/isula/isulad.proto new file mode 100644 index 0000000..af5f85c --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/container/isula/isulad.proto @@ -0,0 +1,35 @@ +// ####################################################################### +// ##- Copyright (c) Huawei Technologies Co., Ltd. 2019. All rights reserved. +// # - iSulad licensed under the Mulan PSL v2. +// # - You can use this software according to the terms and conditions of the Mulan PSL v2. 
syntax = "proto3";
option optimize_for = CODE_SIZE;

package containers;
// Generated Go code is emitted into the current directory under package name "isula".
option go_package = "./;isula";

// ContainerService exposes the single unary RPC declared below.
service ContainerService {
	// Inspect takes an InspectContainerRequest and returns an InspectContainerResponse.
	rpc Inspect(InspectContainerRequest) returns (InspectContainerResponse);
}

// InspectContainerRequest is the input of ContainerService.Inspect.
message InspectContainerRequest {
	// NOTE(review): presumably the ID of the container to inspect - confirm against iSulad.
	string id = 1;
	bool bformat = 2;
	// NOTE(review): the time unit is not stated in this file - confirm against iSulad.
	int32 timeout = 3;
}

// InspectContainerResponse is the output of ContainerService.Inspect.
message InspectContainerResponse {
	// NOTE(review): presumably the JSON-encoded container description - confirm against iSulad.
	string ContainerJSON = 1;
	// NOTE(review): looks like a result/status code - confirm against iSulad.
	uint32 cc = 2;
	string errmsg = 3;
}
// Inspect invokes the unary RPC "/containers.ContainerService/Inspect" on the wrapped
// connection, passing opts through to Invoke. It returns the populated response, or a
// nil response together with the error reported by Invoke.
func (c *containerServiceClient) Inspect(ctx context.Context, in *InspectContainerRequest, opts ...grpc.CallOption) (*InspectContainerResponse, error) {
	out := new(InspectContainerResponse)
	err := c.cc.Invoke(ctx, "/containers.ContainerService/Inspect", in, out, opts...)
	if err != nil {
		return nil, err
	}
	return out, nil
}
// _ContainerService_Inspect_Handler is the handler registered for the "Inspect" method in
// ContainerService_ServiceDesc. It decodes the request with dec and then either calls
// srv.Inspect directly (no interceptor) or hands the call to interceptor together with
// the method info and a handler closure that performs the same srv.Inspect call.
// srv must implement ContainerServiceServer; the type assertion panics otherwise.
func _ContainerService_Inspect_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
	in := new(InspectContainerRequest)
	if err := dec(in); err != nil {
		return nil, err
	}
	if interceptor == nil {
		return srv.(ContainerServiceServer).Inspect(ctx, in)
	}
	info := &grpc.UnaryServerInfo{
		Server:     srv,
		FullMethod: "/containers.ContainerService/Inspect",
	}
	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
		return srv.(ContainerServiceServer).Inspect(ctx, req.(*InspectContainerRequest))
	}
	return interceptor(ctx, in, info, handler)
}
// Endpoint types accepted in CntNpuMonitorOpts.EndpointType. MakeDevicesParser maps
// them to a runtime namespace: Containerd and Isula use namespaceK8s, Dockerd uses
// namespaceMoby.
const (
	// EndpointTypeContainerd K8S + Containerd (value 0)
	EndpointTypeContainerd = iota
	// EndpointTypeDockerd Docker with or without K8S (value 1, implicit iota)
	EndpointTypeDockerd
	// EndpointTypeIsula K8S + isula; pinned explicitly to 2, the same value iota would yield here
	EndpointTypeIsula = 2
)
func(s string) bool { + return strings.Contains(s, ascend) + } +) + +// CntNpuMonitorOpts contains setting options for monitoring containers +type CntNpuMonitorOpts struct { + EndpointType int // containerd or docker + CriEndpoint string // CRI server address + UseCriBackup bool // whether try to use cri backup address + OciEndpoint string // OCI server, now is containerd address + UseOciBackup bool // whether try to use oci backup address +} + +// MakeDevicesParser evaluates option settings and make an instance according to it +func MakeDevicesParser(opts CntNpuMonitorOpts) *DevicesParser { + runtimeOperator := &RuntimeOperatorTool{ + UseCriBackup: opts.UseCriBackup, + UseOciBackup: opts.UseOciBackup, + CriEndpoint: opts.CriEndpoint, + OciEndpoint: opts.OciEndpoint, + } + parser := &DevicesParser{ + RuntimeOperator: runtimeOperator, + } + + switch opts.EndpointType { + case EndpointTypeContainerd: + runtimeOperator.Namespace = namespaceK8s + case EndpointTypeDockerd: + runtimeOperator.Namespace = namespaceMoby + case EndpointTypeIsula: + runtimeOperator.Namespace = namespaceK8s + default: + logger.Errorf("invalid type value %d", opts.EndpointType) + } + + return parser +} + +// DevicesInfo the container device information struct +type DevicesInfo struct { + // container id + ID string + // container name, the format is: PodNameSpace_PodName_ContainerName + Name string + Devices []int +} + +// DevicesInfos the device information storage map +type DevicesInfos = map[string]DevicesInfo + +// DevicesParser the parser which parse device info +type DevicesParser struct { + // instances + result chan DevicesInfos + err chan error + // configuration + RuntimeOperator RuntimeOperator + Timeout time.Duration +} + +// Init initializes connection to containerd daemon and to CRI server or dockerd daemon based on name fetcher setting +func (dp *DevicesParser) Init() error { + if err := dp.RuntimeOperator.Init(); err != nil { + return contactError(err, "connecting to container 
runtime failed") + } + dp.result = make(chan DevicesInfos, 1) + dp.err = make(chan error, 1) + return nil +} + +// RecvResult exposes the channel used for receiving devices info analyzing result +func (dp *DevicesParser) RecvResult() <-chan DevicesInfos { + return dp.result +} + +// RecvErr exposes the channel used for receiving errors occurred during analyzing +func (dp *DevicesParser) RecvErr() <-chan error { + return dp.err +} + +// Close closes all connections and channels established during initializing +func (dp *DevicesParser) Close() { + _ = dp.RuntimeOperator.Close() +} + +func (dp *DevicesParser) parseDevices(ctx context.Context, c *CommonContainer, rs chan<- DevicesInfo) error { + if dp.RuntimeOperator.GetContainerType() == IsulaContainer { + return dp.parseDeviceInIsula(ctx, c, rs) + } + + return dp.parseDevicesInContainerd(ctx, c, rs) +} + +func (dp *DevicesParser) parseDevicesInContainerd(ctx context.Context, c *CommonContainer, + rs chan<- DevicesInfo) error { + if rs == nil { + return errors.New("empty result channel") + } + deviceInfo := DevicesInfo{} + defer func(di *DevicesInfo) { + rs <- *di + }(&deviceInfo) + + spec, err := dp.RuntimeOperator.GetContainerInfoByID(ctx, c.Id) + if err != nil { + return contactError(err, fmt.Sprintf("cannot get container devices by container id (%s)", c.Id)) + } + if spec.Linux == nil || spec.Linux.Resources == nil || len(spec.Linux.Resources.Devices) > maxDevicesNum { + return contactError(errors.New("device error"), + fmt.Sprintf("devices in container is too much (%v) or empty", maxDevicesNum)) + } + if spec.Process == nil || len(spec.Process.Env) > maxEnvNum { + return contactError(errors.New("env error"), fmt.Sprintf("env in container is too much (%v) or empty", + maxEnvNum)) + } + + envs := spec.Process.Env + for i := len(envs) - 1; i >= 0; i-- { + e := envs[i] + if strings.Contains(e, api.AscendDeviceInfo) { + deviceInfo, err = dp.getDevicesWithAscendRuntime(e, c) + return err + } + } + + deviceInfo, err = 
dp.getDevicesWithoutAscendRuntime(spec, c) + return err +} + +func (dp *DevicesParser) getDevicesWithoutAscendRuntime(spec v1.Spec, c *CommonContainer) (DevicesInfo, error) { + deviceInfo := DevicesInfo{} + devicesIDs, err := filterNPUDevices(spec) + if err != nil { + logger.Debugf("filter npu devices failed by container id (%s), err is %v", c.Id, err) + return DevicesInfo{}, nil + } + logger.Debugf("filter npu devices %v in container (%s)", devicesIDs, c.Id) + + if len(devicesIDs) != 0 { + if deviceInfo, err = makeUpDeviceInfo(c); err == nil { + deviceInfo.Devices = devicesIDs + return deviceInfo, nil + } else { + logger.Errorf("makeUpDeviceInfo failed: %s", err) + } + return DevicesInfo{}, err + } + + return DevicesInfo{}, nil +} + +func (dp *DevicesParser) getDevicesWithAscendRuntime(ascendDevEnv string, c *CommonContainer) (DevicesInfo, error) { + logger.Debugf("get device info by env (%s) in %s", ascendDevEnv, c.Id) + devInfo := strings.Split(ascendDevEnv, "=") + if len(devInfo) != ascendEnvPart { + return DevicesInfo{}, fmt.Errorf("an invalid %s env(%s)", api.AscendDeviceInfo, ascendDevEnv) + } + devicesIDs := dp.parseDiffEnvFmt(devInfo[1], c.Id) + if len(devicesIDs) == 0 { + return DevicesInfo{}, nil + } + + deviceInfo, err := makeUpDeviceInfo(c) + if err != nil { + hwlog.RunLog.Error(err) + return DevicesInfo{}, err + } + deviceInfo.Devices = devicesIDs + return deviceInfo, nil +} + +func (dp *DevicesParser) parseDiffEnvFmt(devices, containerID string) []int { + if len(devices) > maxEnvLength { + return []int{} + } + if ascendStyle(devices) { + return dp.getDeviceIDsByAscendStyle(devices, containerID) + } + if commaMinusStyle(devices) { + return dp.getDeviceIDsByCommaMinusStyle(devices, containerID) + } + if minusStyle(devices) { + return dp.getDeviceIDsByMinusStyle(devices, containerID) + } + return dp.getDeviceIDsByCommaStyle(devices, containerID) +} + +func (dp *DevicesParser) getDeviceIDsByCommaStyle(devices, containerID string) []int { + devList := 
strings.Split(devices, comma) + devicesIDs := make([]int, 0, len(devList)) + for _, devID := range devList { + id, err := strconv.Atoi(devID) + if err != nil { + logger.Errorf("container (%s) has an invalid device ID (%v) in %s, error is %s", containerID, + devID, api.AscendDeviceInfo, err) + continue + } + devicesIDs = append(devicesIDs, id) + } + return devicesIDs +} + +func (dp *DevicesParser) getDeviceIDsByAscendStyle(devices, containerID string) []int { + devList := strings.Split(devices, comma) + deviceIDs := make([]int, 0, len(devList)) + for _, subDevice := range devList { + deviceName := strings.Split(subDevice, minus) + if len(deviceName) != ascendEnvPart { + logger.Errorf(envErrDescribe(containerID, "", api.AscendDeviceInfo, nil)) + continue + } + id, err := strconv.Atoi(deviceName[1]) + if err != nil { + logger.Errorf(envErrDescribe(containerID, deviceName[1], api.AscendDeviceInfo, err)) + continue + } + deviceIDs = append(deviceIDs, id) + } + return deviceIDs +} + +func (dp *DevicesParser) getDeviceIDsByMinusStyle(devices, containerID string) []int { + deviceIDs := make([]int, 0) + devIDRange := strings.Split(devices, minus) + if len(devIDRange) != ascendEnvPart { + logger.Errorf(envErrDescribe(containerID, "range", api.AscendDeviceInfo, nil)) + return deviceIDs + } + minDevID, err := strconv.Atoi(devIDRange[0]) + if err != nil { + logger.Errorf(envErrDescribe(containerID, devIDRange[0], api.AscendDeviceInfo, err)) + return deviceIDs + } + maxDevID, err := strconv.Atoi(devIDRange[1]) + if err != nil { + logger.Errorf(envErrDescribe(containerID, devIDRange[1], api.AscendDeviceInfo, err)) + return deviceIDs + } + if minDevID > maxDevID { + logger.Errorf(envErrDescribe(containerID, "", + api.AscendDeviceInfo, errors.New("min id bigger than max id"))) + return deviceIDs + } + if maxDevID > math.MaxInt16 { + logger.Errorf(envErrDescribe(containerID, "", api.AscendDeviceInfo, errors.New("max id invalid"))) + return deviceIDs + } + for deviceID := minDevID; 
deviceID <= maxDevID; deviceID++ { + deviceIDs = append(deviceIDs, deviceID) + } + return deviceIDs +} + +func (dp *DevicesParser) getDeviceIDsByCommaMinusStyle(devices, containerID string) []int { + var deviceIDs []int + devList := strings.Split(devices, comma) + for _, subDevices := range devList { + if minusStyle(subDevices) { + deviceIDs = append(deviceIDs, dp.getDeviceIDsByMinusStyle(subDevices, containerID)...) + continue + } + deviceIDs = append(deviceIDs, dp.getDeviceIDsByCommaStyle(subDevices, containerID)...) + } + return deviceIDs +} + +func (dp *DevicesParser) getDevWithoutAscendRuntimeInIsula(containerInfo isula.ContainerJson, + c *CommonContainer) (DevicesInfo, error) { + deviceInfo := DevicesInfo{} + devicesIDs, err := filterNPUDevicesInIsula(containerInfo) + if err != nil { + logger.Debugf("filter npu devices failed by container id (%s), err is %v", c.Id, err) + return DevicesInfo{}, nil + } + logger.Debugf("filter npu devices %v in container (%s)", devicesIDs, c.Id) + + if len(devicesIDs) == 0 { + return DevicesInfo{}, nil + } + + deviceInfo, err = makeUpDeviceInfo(c) + if err != nil { + hwlog.RunLog.Error(err) + return DevicesInfo{}, err + } + deviceInfo.Devices = devicesIDs + return deviceInfo, nil +} + +func (dp *DevicesParser) parseDeviceInIsula(ctx context.Context, c *CommonContainer, rs chan<- DevicesInfo) error { + if rs == nil { + return errors.New("empty result channel") + } + + deviceInfo := DevicesInfo{} + defer func(di *DevicesInfo) { + rs <- *di + }(&deviceInfo) + + if len(c.Id) > maxCgroupPath { + return fmt.Errorf("the containerId (%s) is too long", c.Id) + } + containerInfo, err := dp.RuntimeOperator.GetIsulaContainerInfoByID(ctx, c.Id) + if err != nil { + return contactError(err, fmt.Sprintf("getting config of container(%s) fail", c.Id)) + } + if containerInfo.HostConfig == nil || containerInfo.Config == nil { + return errors.New("empty container info") + } + + envs := containerInfo.Config.Env + for i := len(envs) - 1; i >= 0; i-- 
{ + e := envs[i] + if strings.Contains(e, api.AscendDeviceInfo) { + deviceInfo, err = dp.getDevicesWithAscendRuntime(e, c) + return err + } + } + + deviceInfo, err = dp.getDevWithoutAscendRuntimeInIsula(containerInfo, c) + return err +} + +func (dp *DevicesParser) collect(ctx context.Context, r <-chan DevicesInfo, ct int32) (DevicesInfos, error) { + if r == nil { + return nil, errors.New("receiving channel is empty") + } + if ct < 0 { + return nil, nil + } + + results := make(map[string]DevicesInfo, ct) + for { + select { + case info, ok := <-r: + if !ok { + return nil, nil + } + if info.ID != "" { + results[info.ID] = info + } + if ct -= 1; ct <= 0 { + return results, nil + } + case <-ctx.Done(): + hwlog.RunLog.Error("ctx is timeout") + dp.err <- ErrFromContext + return nil, nil + } + } +} + +func (dp *DevicesParser) doParse(resultOut chan<- DevicesInfos) { + var result DevicesInfos = nil + defer func(rslt DevicesInfos) { + if resultOut != nil { + resultOut <- rslt + close(resultOut) + } + }(result) + + ctx := context.Background() + containers, err := dp.RuntimeOperator.GetContainers(ctx) + if err != nil { + dp.err <- err + return + } + + l := len(containers) + if l == 0 || l > maxContainers { + logger.Debugf("get %d containers from cri interface, return empty data", l) + dp.result <- make(DevicesInfos) + return + } + + r := make(chan DevicesInfo) + defer close(r) + wg := sync.WaitGroup{} + wg.Add(l) + + for _, container := range containers { + go func(container *CommonContainer, c context.Context) { + if err := dp.parseDevices(c, container, r); err != nil { + dp.err <- err + } + wg.Done() + }(container, ctx) + } + ctx, cancelFn := context.WithTimeout(ctx, withDefault(dp.Timeout, parsingNpuDefaultTimeout)) + defer cancelFn() + result, err = dp.collect(ctx, r, int32(l)) + if err != nil { + logger.Errorf("collect info error: %v", err) + } + + if result != nil { + dp.result <- result + } + wg.Wait() +} + +// FetchAndParse triggers the asynchronous process of querying 
// contactError returns an error whose message is "<err>-><msg>".
// The returned error wraps err, so errors.Is and errors.As still see the original
// cause. When err is nil there is nothing to wrap and an error carrying only msg is
// returned.
func contactError(err error, msg string) error {
	if err == nil {
		return errors.New(msg)
	}
	return fmt.Errorf("%w->%s", err, msg)
}
// do not monitor privileged container + continue + } + if *dev.Minor > math.MaxInt32 { + return nil, fmt.Errorf("get wrong device ID (%v)", dev.Minor) + } + major := strconv.FormatInt(*dev.Major, base) + if dev.Type == charDevice && contains(majorIDs, major) { + devIDs = append(devIDs, int(*dev.Minor)) + } + } + + return devIDs, nil +} + +// filterNPUDevicesInIsula get id of device from containerJson(containerInfo) +func filterNPUDevicesInIsula(containerInfo isula.ContainerJson) ([]int, error) { + privileged := containerInfo.HostConfig.Privileged + if privileged { + return nil, errors.New("it's a privileged container and skip it") + } + + devIDs := make([]int, 0, sliceLen8) + devices := containerInfo.HostConfig.Devices + for _, dev := range devices { + Id, err := getDevIdFromPath(api.DevicePathPattern, dev.PathInContainer) + if err != nil { + logger.Warn(err) + continue + } + devIDs = append(devIDs, Id) + } + + return devIDs, nil +} + +func getDevIdFromPath(pattern, path string) (int, error) { + if match, err := regexp.MatchString(pattern, path); err != nil || !match { + return -1, fmt.Errorf("unexpected path of device: %s or match error: %v", path, err) + } + number := regexp.MustCompile(`\d+`) + IdStr := number.FindString(path) + Id, err := strconv.Atoi(IdStr) + if err != nil { + return -1, fmt.Errorf("unexpected device ID (%v)", IdStr) + } + if Id > math.MaxInt32 { + return -1, fmt.Errorf("get wrong device ID (%v)", Id) + } + return Id, nil +} diff --git a/mind-cluster/component/npu-exporter/collector/container/parser_test.go b/mind-cluster/component/npu-exporter/collector/container/parser_test.go new file mode 100644 index 0000000..f2975b9 --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/container/parser_test.go @@ -0,0 +1,1027 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package container provides utilities for container monitoring and testing. +package container + +import ( + "context" + "errors" + "os" + "testing" + "time" + + "github.com/agiledragon/gomonkey/v2" + "github.com/smartystreets/goconvey/convey" + + "ascend-common/common-utils/utils" + "huawei.com/npu-exporter/v6/collector/container/isula" + "huawei.com/npu-exporter/v6/collector/container/v1" + "huawei.com/npu-exporter/v6/utils/logger" +) + +const ( + // Test endpoint constants + testContainerdEndpoint = "unix:///run/containerd.sock" + testDockerEndpoint = "unix:///run/docker.sock" + + device0 = 0 + device1 = 1 + device2 = 2 + device3 = 3 + testDeviceRange = "0-2" + testDeviceComma = "0,1,2" + testDeviceCommaRange = "0-1,2-3" + testAscendDevice0 = "Ascend-0" + testAscendDevices = "Ascend-0,Ascend-1" + testMixedDevices = "0-1,3" + + // Test error constants + testOriginalError = "original error" + testErrorMessage = "test message" + testContactedError = "original error->test message" + + // Test path constants + testDevicePattern = "/dev/npu([0-9]+)" + + // Test duration constants + testZeroDuration = 0 +) + +func TestMakeDevicesParser(t *testing.T) { + testCases := []struct { + name string + opts CntNpuMonitorOpts + expected *DevicesParser + }{ + {name: "should create parser when options are valid for containerd", + opts: CntNpuMonitorOpts{CriEndpoint: testContainerdEndpoint, EndpointType: EndpointTypeContainerd, + OciEndpoint: testContainerdEndpoint, UseOciBackup: false, UseCriBackup: false}, + expected: &DevicesParser{RuntimeOperator: 
&RuntimeOperatorTool{UseOciBackup: false, UseCriBackup: false, + CriEndpoint: testContainerdEndpoint, OciEndpoint: testContainerdEndpoint}, Timeout: testZeroDuration}}, + {name: "should create parser when options are valid for docker", + opts: CntNpuMonitorOpts{CriEndpoint: testDockerEndpoint, EndpointType: EndpointTypeDockerd, + OciEndpoint: testDockerEndpoint, UseOciBackup: true, UseCriBackup: false}, + expected: &DevicesParser{RuntimeOperator: &RuntimeOperatorTool{UseOciBackup: true, UseCriBackup: true, + CriEndpoint: testDockerEndpoint, OciEndpoint: testDockerEndpoint}, Timeout: testZeroDuration}}, + {name: "should create parser when options are valid for isula", + opts: CntNpuMonitorOpts{CriEndpoint: testContainerdEndpoint, EndpointType: EndpointTypeIsula, + OciEndpoint: testContainerdEndpoint, UseOciBackup: true, UseCriBackup: true}, + expected: &DevicesParser{RuntimeOperator: &RuntimeOperatorTool{UseOciBackup: true, UseCriBackup: true, + CriEndpoint: testContainerdEndpoint, OciEndpoint: testContainerdEndpoint}, Timeout: testZeroDuration}}, + } + + for _, tc := range testCases { + convey.Convey(tc.name, t, func() { + result := MakeDevicesParser(tc.opts) + convey.So(result, convey.ShouldNotBeNil) + convey.So(result.RuntimeOperator, convey.ShouldNotBeNil) + convey.So(result.Timeout, convey.ShouldEqual, tc.expected.Timeout) + }) + } +} + +func TestDevicesParserInit(t *testing.T) { + convey.Convey("TestDevicesParserInit", t, func() { + convey.Convey("should initialize successfully when runtime operator init succeeds", func() { + dp := &DevicesParser{ + RuntimeOperator: &RuntimeOperatorTool{}, + } + + patches := gomonkey.ApplyMethodReturn(dp.RuntimeOperator, "Init", nil) + defer patches.Reset() + + err := dp.Init() + convey.So(err, convey.ShouldBeNil) + }) + + convey.Convey("should return error when initialization fails", func() { + dp := &DevicesParser{ + RuntimeOperator: &RuntimeOperatorTool{}, + } + patches := gomonkey.ApplyMethodReturn(dp.RuntimeOperator, 
"Init", errors.New("init failed")) + defer patches.Reset() + err := dp.Init() + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldContainSubstring, "init failed") + }) + }) +} + +func TestDevicesParserRecvResult(t *testing.T) { + convey.Convey("TestDevicesParserRecvResult", t, func() { + convey.Convey("should return result channel when initialized", func() { + dp := &DevicesParser{ + result: make(chan DevicesInfos, 1), + } + resultChan := dp.RecvResult() + convey.So(resultChan, convey.ShouldNotBeNil) + }) + }) +} + +func TestDevicesParserRecvErr(t *testing.T) { + convey.Convey("TestDevicesParserRecvErr", t, func() { + convey.Convey("should return error channel when initialized", func() { + dp := &DevicesParser{ + err: make(chan error, 1), + } + errChan := dp.RecvErr() + convey.So(errChan, convey.ShouldNotBeNil) + }) + }) +} + +func TestDevicesParserClose(t *testing.T) { + convey.Convey("TestDevicesParserClose", t, func() { + convey.Convey("should close runtime operator when called", func() { + mockOperator := &RuntimeOperatorTool{} + dp := &DevicesParser{ + RuntimeOperator: mockOperator, + } + + visited := false + patches := gomonkey.ApplyMethod(mockOperator, "Close", func(*RuntimeOperatorTool) error { + visited = true + return nil + }) + defer patches.Reset() + + dp.Close() + convey.So(visited, convey.ShouldBeTrue) + }) + }) +} + +func TestDevicesParserParseDevices(t *testing.T) { + convey.Convey("TestDevicesParserParseDevices", t, func() { + convey.Convey("should parse isula devices when container type is isula", func() { + dp := &DevicesParser{} + mockOperator := &RuntimeOperatorTool{} + dp.RuntimeOperator = mockOperator + + patches := gomonkey.ApplyMethodReturn(mockOperator, "GetContainerType", IsulaContainer). 
+ ApplyFuncReturn((*DevicesParser).parseDeviceInIsula, nil) + defer patches.Reset() + + ctx := context.Background() + container := &CommonContainer{Id: "test-container"} + resultChan := make(chan DevicesInfo, 1) + err := dp.parseDevices(ctx, container, resultChan) + convey.So(err, convey.ShouldBeNil) + }) + + convey.Convey("should parse containerd devices when container type is not isula", func() { + dp := &DevicesParser{} + mockOperator := &RuntimeOperatorTool{} + dp.RuntimeOperator = mockOperator + + patches := gomonkey.ApplyMethodReturn(mockOperator, "GetContainerType", DefaultContainer). + ApplyFuncReturn((*DevicesParser).parseDevicesInContainerd, nil) + defer patches.Reset() + + ctx := context.Background() + container := &CommonContainer{Id: "test-container"} + resultChan := make(chan DevicesInfo, 1) + err := dp.parseDevices(ctx, container, resultChan) + convey.So(err, convey.ShouldBeNil) + }) + }) +} + +func TestDevicesParserParseDevicesInContainerd(t *testing.T) { + convey.Convey("TestDevicesParserParseDevicesInContainerd", t, func() { + convey.Convey("should return error when result channel is nil", func() { + dp := &DevicesParser{} + ctx := context.Background() + container := &CommonContainer{Id: "test-container"} + + err := dp.parseDevicesInContainerd(ctx, container, nil) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldContainSubstring, "empty result channel") + }) + + convey.Convey("should return error when get container info fails", func() { + dp := &DevicesParser{} + mockOperator := &RuntimeOperatorTool{} + dp.RuntimeOperator = mockOperator + + patches := gomonkey.ApplyMethod(mockOperator, "GetContainerInfoByID", + func(*RuntimeOperatorTool, context.Context, string) (v1.Spec, error) { + return v1.Spec{}, errors.New("get container info failed") + }) + defer patches.Reset() + + ctx := context.Background() + container := &CommonContainer{Id: "test-container"} + resultChan := make(chan DevicesInfo, 1) + + err := 
dp.parseDevicesInContainerd(ctx, container, resultChan) + convey.So(err, convey.ShouldNotBeNil) + }) + }) +} + +func TestDevicesParserGetDevicesWithoutAscendRuntime(t *testing.T) { + convey.Convey("TestDevicesParserGetDevicesWithoutAscendRuntime", t, func() { + convey.Convey("should return devices when filter succeeds", func() { + dp := &DevicesParser{} + + patches := gomonkey.ApplyFuncReturn(filterNPUDevices, []int{device0, device1, device2}, nil) + defer patches.Reset() + + patches.ApplyFuncReturn(makeUpDeviceInfo, DevicesInfo{ID: "test", Name: "test-name"}, nil) + + spec := v1.Spec{} + container := &CommonContainer{Id: "test-container"} + + result, err := dp.getDevicesWithoutAscendRuntime(spec, container) + convey.So(err, convey.ShouldBeNil) + convey.So(result.Devices, convey.ShouldResemble, []int{device0, device1, device2}) + }) + + convey.Convey("should return empty when filter fails", func() { + dp := &DevicesParser{} + + patches := gomonkey.ApplyFuncReturn(filterNPUDevices, nil, errors.New("filter failed")) + defer patches.Reset() + + spec := v1.Spec{} + container := &CommonContainer{Id: "test-container"} + + result, err := dp.getDevicesWithoutAscendRuntime(spec, container) + convey.So(err, convey.ShouldBeNil) + convey.So(result, convey.ShouldResemble, DevicesInfo{}) + }) + }) +} + +func TestDevicesParserGetDevicesWithAscendRuntime(t *testing.T) { + convey.Convey("TestDevicesParserGetDevicesWithAscendRuntime", t, func() { + convey.Convey("should return error when env format is invalid", func() { + dp := &DevicesParser{} + ascendDevEnv := "invalid-env" + container := &CommonContainer{Id: "test-container"} + + result, err := dp.getDevicesWithAscendRuntime(ascendDevEnv, container) + convey.So(err, convey.ShouldNotBeNil) + convey.So(result, convey.ShouldResemble, DevicesInfo{}) + }) + + convey.Convey("should return devices when env format is valid", func() { + dp := &DevicesParser{} + ascendDevEnv := "ASCEND_VISIBLE_DEVICES=0,1,2" + container := 
&CommonContainer{Id: "test-container"} + + patches := gomonkey.ApplyFunc(makeUpDeviceInfo, func(*CommonContainer) (DevicesInfo, error) { + return DevicesInfo{ID: "test", Name: "test-name"}, nil + }) + defer patches.Reset() + + result, err := dp.getDevicesWithAscendRuntime(ascendDevEnv, container) + convey.So(err, convey.ShouldBeNil) + convey.So(result.Devices, convey.ShouldResemble, []int{device0, device1, device2}) + }) + }) +} + +func TestDevicesParserGetDevWithoutAscendRuntimeInIsula(t *testing.T) { + convey.Convey("TestDevicesParserGetDevWithoutAscendRuntimeInIsula", t, func() { + convey.Convey("should return devices when filter succeeds", func() { + dp := &DevicesParser{} + containerInfo := isula.ContainerJson{} + container := &CommonContainer{Id: "test-container"} + + patches := gomonkey.ApplyFuncReturn(filterNPUDevicesInIsula, []int{device0, device1, device2}, nil) + defer patches.Reset() + + patches.ApplyFuncReturn(makeUpDeviceInfo, DevicesInfo{ID: "test", Name: "test-name"}, nil) + + result, err := dp.getDevWithoutAscendRuntimeInIsula(containerInfo, container) + convey.So(err, convey.ShouldBeNil) + convey.So(result.Devices, convey.ShouldResemble, []int{device0, device1, device2}) + }) + + convey.Convey("should return empty when filter fails", func() { + dp := &DevicesParser{} + containerInfo := isula.ContainerJson{} + container := &CommonContainer{Id: "test-container"} + + patches := gomonkey.ApplyFuncReturn(filterNPUDevicesInIsula, nil, errors.New("filter failed")) + defer patches.Reset() + + result, err := dp.getDevWithoutAscendRuntimeInIsula(containerInfo, container) + convey.So(err, convey.ShouldBeNil) + convey.So(result, convey.ShouldResemble, DevicesInfo{}) + }) + }) +} + +func TestDevicesParserParseDeviceInIsula(t *testing.T) { + convey.Convey("TestDevicesParserParseDeviceInIsula", t, func() { + convey.Convey("should return error when result channel is nil", func() { + dp := &DevicesParser{} + ctx := context.Background() + container := 
&CommonContainer{Id: "test-container"} + + err := dp.parseDeviceInIsula(ctx, container, nil) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldContainSubstring, "empty result channel") + }) + + convey.Convey("should return error when container id is too long", func() { + dp := &DevicesParser{} + ctx := context.Background() + longId := string(make([]byte, maxCgroupPath+1)) + container := &CommonContainer{Id: longId} + resultChan := make(chan DevicesInfo, 1) + + err := dp.parseDeviceInIsula(ctx, container, resultChan) + convey.So(err, convey.ShouldNotBeNil) + }) + }) +} + +func TestDevicesParserCollect(t *testing.T) { + convey.Convey("TestDevicesParserCollect", t, func() { + convey.Convey("should return error when receiving channel is nil", func() { + dp := &DevicesParser{} + ctx := context.Background() + + result, err := dp.collect(ctx, nil, 1) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldContainSubstring, "receiving channel is empty") + convey.So(result, convey.ShouldBeNil) + }) + + convey.Convey("should return nil when count is negative", func() { + dp := &DevicesParser{} + ctx := context.Background() + resultChan := make(chan DevicesInfo) + + result, err := dp.collect(ctx, resultChan, -1) + convey.So(err, convey.ShouldBeNil) + convey.So(result, convey.ShouldBeNil) + }) + }) +} + +func TestDevicesParserDoParse(t *testing.T) { + convey.Convey("TestDevicesParserDoParse", t, func() { + const time100ms = 100 * time.Millisecond + convey.Convey("should handle error when get containers fails", func() { + dp := &DevicesParser{ + err: make(chan error, 1), + } + mockOperator := &RuntimeOperatorTool{} + dp.RuntimeOperator = mockOperator + + patches := gomonkey.ApplyMethod(mockOperator, "GetContainers", + func(*RuntimeOperatorTool, context.Context) ([]*CommonContainer, error) { + return nil, errors.New("get containers failed") + }) + defer patches.Reset() + + resultChan := make(chan DevicesInfos, 1) + 
dp.doParse(resultChan) + + select { + case err := <-dp.err: + convey.So(err, convey.ShouldNotBeNil) + case <-time.After(time100ms): + convey.So("timeout", convey.ShouldEqual, "should receive error") + } + }) + }) +} + +func TestDevicesParserFetchAndParse(t *testing.T) { + const time10ms = 10 * time.Millisecond + convey.Convey("TestDevicesParserFetchAndParse", t, func() { + convey.Convey("should return early when err channel is nil", func() { + dp := &DevicesParser{ + err: nil, + } + visited := make(chan bool, 1) + patches := gomonkey.ApplyPrivateMethod(dp, "doParse", + func(*DevicesParser, chan<- DevicesInfos) error { + visited <- true + return nil + }) + defer patches.Reset() + + dp.FetchAndParse(nil) + time.Sleep(time10ms) + convey.So(len(visited), convey.ShouldEqual, 0) + }) + + convey.Convey("should start parsing when initialized", func() { + dp := &DevicesParser{ + err: make(chan error, 1), + RuntimeOperator: &RuntimeOperatorTool{}, + } + visited := make(chan bool, 1) + patches := gomonkey.ApplyPrivateMethod(dp, "doParse", + func(*DevicesParser, chan<- DevicesInfos) error { + visited <- true + return nil + }) + defer patches.Reset() + + dp.FetchAndParse(nil) + time.Sleep(time10ms) + convey.So(len(visited), convey.ShouldEqual, 1) + }) + }) +} + +func TestDevicesParserGetDeviceIDsByMinusStyle(t *testing.T) { + convey.Convey("TestDevicesParserGetDeviceIDsByMinusStyle", t, func() { + testCases := []struct { + name string + devices string + expected []int + }{ + {name: "should return empty slice when devices string is invalid", devices: "invalid-devices", expected: []int{}}, + {name: "should return empty slice when min device ID is invalid", devices: "invalid-5", expected: []int{}}, + {name: "should return empty slice when max device ID is invalid", devices: "0-invalid", expected: []int{}}, + {name: "should return empty slice when min ID is bigger than max ID", devices: "5-3", expected: []int{}}, + {name: "should return empty slice when max ID is too large", 
devices: "0-99999", expected: []int{}}, + {name: "should return device IDs when range is valid", devices: "0-2", expected: []int{0, 1, 2}}, + {name: "should return single device ID when min equals max", devices: "1-1", expected: []int{1}}, + } + for _, tc := range testCases { + convey.Convey(tc.name, func() { + dp := &DevicesParser{} + result := dp.getDeviceIDsByMinusStyle(tc.devices, "test-container") + convey.So(result, convey.ShouldResemble, tc.expected) + }) + } + }) +} + +func TestGetNPUMajorID(t *testing.T) { + testCases := builderTestGetNPUMajorIDCases() + for _, tc := range testCases { + convey.Convey(tc.name, t, func() { + _, cleanup := tc.setup(t) + defer cleanup() + result, err := getNPUMajorID() + if tc.hasError { + convey.So(err, convey.ShouldNotBeNil) + } else { + convey.So(err, convey.ShouldBeNil) + } + convey.So(result, convey.ShouldResemble, tc.expected) + }) + } +} + +type TestGetNPUMajorIDCase struct { + name string + setup func(*testing.T) (*gomonkey.Patches, func()) + expected []string + hasError bool +} + +func builderTestGetNPUMajorIDCases() []TestGetNPUMajorIDCase { + testCases := []TestGetNPUMajorIDCase{{name: "should return error when path check fails", + setup: func(*testing.T) (*gomonkey.Patches, func()) { + patches := gomonkey.ApplyFuncReturn(utils.CheckPath, "", errors.New("path check failed")) + return patches, func() { patches.Reset() } + }, expected: nil, hasError: true}, + {name: "should return error when file open fails", + setup: func(*testing.T) (*gomonkey.Patches, func()) { + p1 := gomonkey.ApplyFuncReturn(utils.CheckPath, "/proc/devices", nil) + p1.ApplyFuncReturn(os.Open, nil, errors.New("file open failed")) + return p1, func() { p1.Reset() } + }, expected: []string{}, hasError: true}, + {name: "should return empty slice when no NPU devices found", + setup: func(t *testing.T) (*gomonkey.Patches, func()) { + tmpFile, clean, err := mkTemp("1 mem\n2 pty\n") + if err != nil { + t.Fatalf("failed to create temp file: %v", err) + } 
+ p1 := gomonkey.ApplyFuncReturn(utils.CheckPath, tmpFile, nil) + return p1, func() { clean(); p1.Reset() } + }, expected: []string{}, hasError: false}, + {name: "should return major IDs when NPU devices found", + setup: func(t *testing.T) (*gomonkey.Patches, func()) { + tmpFile, clean, err := mkTemp("195 devdrv-cdev\n196 devdrv-cdev\n") + if err != nil { + t.Fatalf("failed to create temp file: %v", err) + } + p1 := gomonkey.ApplyFuncReturn(utils.CheckPath, tmpFile, nil) + return p1, func() { clean(); p1.Reset() } + }, expected: []string{"195", "196"}, hasError: false}, + {name: "should return major IDs when mixed devices found", + setup: func(t *testing.T) (*gomonkey.Patches, func()) { + tmpFile, clean, err := mkTemp("1 mem\n195 devdrv-cdev\n2 pty\n196 devdrv-cdev\n") + if err != nil { + t.Fatalf("failed to create temp file: %v", err) + } + p1 := gomonkey.ApplyFuncReturn(utils.CheckPath, tmpFile, nil) + return p1, func() { clean(); p1.Reset() } + }, expected: []string{"195", "196"}, hasError: false}, + } + return testCases +} + +func TestNpuMajor(t *testing.T) { + convey.Convey("TestNpuMajor", t, func() { + convey.Convey("should return cached major IDs", func() { + patches := gomonkey.ApplyFuncReturn(getNPUMajorID, []string{"123", "456"}, nil) + defer patches.Reset() + + result := npuMajor() + convey.So(result, convey.ShouldNotBeNil) + }) + }) +} + +func TestFilterNPUDevices(t *testing.T) { + convey.Convey("TestFilterNPUDevices", t, func() { + const mockMajorID = 236 + convey.Convey("should return error when spec is empty", func() { + spec := v1.Spec{} + result, err := filterNPUDevices(spec) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldContainSubstring, "empty spec info") + convey.So(result, convey.ShouldBeNil) + }) + + convey.Convey("should return devices when spec is valid", func() { + spec := v1.Spec{ + Linux: &v1.Linux{ + Resources: &v1.LinuxResources{ + Devices: []v1.LinuxDeviceCgroup{{Type: "c", Major: int64Ptr(mockMajorID), 
Minor: int64Ptr(0)}}, + }, + }, + } + patches := gomonkey.ApplyFuncReturn(npuMajor, []string{"236"}) + defer patches.Reset() + + result, err := filterNPUDevices(spec) + convey.So(err, convey.ShouldBeNil) + convey.So(result, convey.ShouldNotBeNil) + }) + }) +} + +// mkTemp creates a temporary file with the given content and returns the file name, +// a cleanup function, and an error. The file is closed before returning. +func mkTemp(content string) (string, func(), error) { + f, err := os.CreateTemp("", "test_*") + if err != nil { + return "", func() {}, err + } + if _, err = f.WriteString(content); err != nil { + clean(f) + return "", func() {}, err + } + if _, err = f.Seek(0, 0); err != nil { + clean(f) + return "", func() {}, err + } + name := f.Name() + return name, func() { clean(f) }, nil +} + +func clean(f *os.File) { + if f == nil { + return + } + if err := f.Close(); err != nil { + logger.Errorf("an error occurred where close file [%v],err :%v", f.Name(), err) + } + if err := os.Remove(f.Name()); err != nil { + logger.Errorf("an error occurred where remove file [%v],err :%v", f.Name(), err) + } +} + +func TestFilterNPUDevicesInIsula(t *testing.T) { + convey.Convey("TestFilterNPUDevicesInIsula", t, func() { + convey.Convey("should return error when container is privileged", func() { + containerInfo := isula.ContainerJson{ + HostConfig: &isula.HostConfig{ + Privileged: true, + }, + } + + result, err := filterNPUDevicesInIsula(containerInfo) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldContainSubstring, "privileged container") + convey.So(result, convey.ShouldBeNil) + }) + + convey.Convey("should return devices when container is not privileged", func() { + containerInfo := isula.ContainerJson{ + HostConfig: &isula.HostConfig{ + Privileged: false, + Devices: []isula.DeviceInfo{ + { + PathInContainer: "/dev/npu0", + }, + }, + }, + } + + patches := gomonkey.ApplyFuncReturn(getDevIdFromPath, 0, nil) + defer patches.Reset() + + 
result, err := filterNPUDevicesInIsula(containerInfo) + convey.So(err, convey.ShouldBeNil) + convey.So(result, convey.ShouldNotBeNil) + }) + }) +} + +// Helper function for creating int64 pointers +func int64Ptr(v int64) *int64 { + return &v +} + +func TestParseDiffEnvFmt(t *testing.T) { + convey.Convey("TestParseDiffEnvFmt", t, func() { + dp := &DevicesParser{} + testCases := []struct { + name string + devices string + containerID string + expected []int + }{ + {name: "should parse comma style devices when valid", + devices: testDeviceComma, + containerID: "test-container", + expected: []int{device0, device1, device2}, + }, + {name: "should parse minus style devices when valid", + devices: testDeviceRange, + containerID: "test-container", + expected: []int{device0, device1, device2}, + }, + {name: "should parse ascend style devices when valid", + devices: testAscendDevices, + containerID: "test-container", + expected: []int{device0, device1}, + }, + {name: "should parse comma minus style devices when valid", + devices: testDeviceCommaRange, + containerID: "test-container", + expected: []int{device0, device1, device2, device3}, + }, + {name: "should return empty slice when devices are empty", + devices: "", + containerID: "test-container", + expected: []int{}, + }, + } + + for _, tc := range testCases { + convey.Convey(tc.name, func() { + result := dp.parseDiffEnvFmt(tc.devices, tc.containerID) + convey.So(result, convey.ShouldResemble, tc.expected) + }) + } + }) +} + +func TestGetDeviceIDsByCommaStyle(t *testing.T) { + convey.Convey("TestGetDeviceIDsByCommaStyle", t, func() { + dp := &DevicesParser{} + testCases := []struct { + name string + devices string + containerID string + expected []int + }{ + {name: "should parse comma separated devices when valid", + devices: "0,1,2,3", + containerID: "test-container", + expected: []int{device0, device1, device2, device3}, + }, + {name: "should parse single device when valid", + devices: "0", + containerID: 
"test-container", + expected: []int{device0}, + }, + {name: "should return empty slice when devices are empty", + devices: "", + containerID: "test-container", + expected: []int{}, + }, + {name: "should parse devices with spaces when valid", + devices: testDeviceComma, + containerID: "test-container", + expected: []int{device0, device1, device2}, + }, + } + + for _, tc := range testCases { + convey.Convey(tc.name, func() { + result := dp.getDeviceIDsByCommaStyle(tc.devices, tc.containerID) + convey.So(result, convey.ShouldResemble, tc.expected) + }) + } + }) +} + +func TestGetDeviceIDsByAscendStyle(t *testing.T) { + convey.Convey("TestGetDeviceIDsByAscendStyle", t, func() { + dp := &DevicesParser{} + + testCases := []struct { + name string + devices string + containerID string + expected []int + }{ + { + name: "should parse ascend devices when valid", + devices: "Ascend-0,Ascend-1,Ascend-2", + containerID: "test-container", + expected: []int{device0, device1, device2}, + }, + { + name: "should parse single ascend device when valid", + devices: testAscendDevice0, + containerID: "test-container", + expected: []int{0}, + }, + { + name: "should return empty slice when devices are empty", + devices: "", + containerID: "test-container", + expected: []int{}, + }, + { + name: "should parse mixed case ascend devices when valid", + devices: "ascend-0,ASCEND-1", + containerID: "test-container", + expected: []int{device0, device1}, + }, + } + + for _, tc := range testCases { + convey.Convey(tc.name, func() { + result := dp.getDeviceIDsByAscendStyle(tc.devices, tc.containerID) + convey.So(result, convey.ShouldResemble, tc.expected) + }) + } + }) +} + +func TestGetDeviceIDsByMinusStyle(t *testing.T) { + convey.Convey("TestGetDeviceIDsByMinusStyle", t, func() { + dp := &DevicesParser{} + + testCases := []struct { + name string + devices string + containerID string + expected []int + }{ + { + name: "should parse range devices when valid", + devices: "0-3", + containerID: 
"test-container", + expected: []int{device0, device1, device2, device3}, + }, + { + name: "should parse single device range when valid", + devices: "0-0", + containerID: "test-container", + expected: []int{device0}, + }, + { + name: "should return empty slice when devices are empty", + devices: "", + containerID: "test-container", + expected: []int{}, + }, + } + + for _, tc := range testCases { + convey.Convey(tc.name, func() { + result := dp.getDeviceIDsByMinusStyle(tc.devices, tc.containerID) + convey.So(result, convey.ShouldResemble, tc.expected) + }) + } + }) +} + +func TestGetDeviceIDsByCommaMinusStyle(t *testing.T) { + convey.Convey("TestGetDeviceIDsByCommaMinusStyle", t, func() { + dp := &DevicesParser{} + + testCases := []struct { + name string + devices string + containerID string + expected []int + }{ + { + name: "should parse comma minus devices when valid", + devices: testDeviceCommaRange, + containerID: "test-container", + expected: []int{device0, device1, device2, device3}, + }, + { + name: "should parse single range when valid", + devices: testDeviceRange, + containerID: "test-container", + expected: []int{device0, device1, device2}, + }, + { + name: "should return nil when devices are empty", + devices: "", + containerID: "test-container", + expected: nil, + }, + { + name: "should parse mixed ranges when valid", + devices: testMixedDevices, + containerID: "test-container", + expected: []int{device0, device1, device3}, + }, + } + + for _, tc := range testCases { + convey.Convey(tc.name, func() { + result := dp.getDeviceIDsByCommaMinusStyle(tc.devices, tc.containerID) + convey.So(result, convey.ShouldResemble, tc.expected) + }) + } + }) +} + +func TestContains(t *testing.T) { + convey.Convey("TestContains", t, func() { + testCases := []struct { + name string + slice []string + target string + expected bool + }{ + { + name: "should return true when target exists in slice", + slice: []string{"a", "b", "c"}, + target: "b", + expected: true, + }, + { + 
name: "should return false when target does not exist in slice", + slice: []string{"a", "b", "c"}, + target: "d", + expected: false, + }, + { + name: "should return false when slice is empty", + slice: []string{}, + target: "a", + expected: false, + }, + { + name: "should return false when slice is nil", + slice: nil, + target: "a", + expected: false, + }, + { + name: "should return false when target is empty string", + slice: []string{"a", "b", "c"}, + target: "", + expected: false, + }, + } + + for _, tc := range testCases { + convey.Convey(tc.name, func() { + result := contains(tc.slice, tc.target) + convey.So(result, convey.ShouldEqual, tc.expected) + }) + } + }) +} + +func TestContactError(t *testing.T) { + convey.Convey("TestContactError", t, func() { + testCases := []struct { + name string + err error + msg string + expected string + }{ + { + name: "should concatenate error with message when both provided", + err: errors.New(testOriginalError), + msg: testErrorMessage, + expected: testContactedError, + }, + } + + for _, tc := range testCases { + convey.Convey(tc.name, func() { + result := contactError(tc.err, tc.msg) + convey.So(result.Error(), convey.ShouldEqual, tc.expected) + }) + } + }) +} + +func TestGetDevIdFromPath(t *testing.T) { + convey.Convey("TestGetDevIdFromPath", t, func() { + testCases := []struct { + name string + pattern string + path string + expected int + hasError bool + }{ + {name: "should extract device id when path is valid", + pattern: testDevicePattern, + path: "/dev/npu0", + expected: 0, + hasError: false, + }, + {name: "should extract device id when path has multiple digits", + pattern: testDevicePattern, + path: "/dev/npu123", + expected: 123, + hasError: false, + }, + {name: "should return error when device path is invalid", + pattern: testDevicePattern, + path: "/dev/cpu0", + expected: 0, + hasError: true, + }, + {name: "should return error when path is empty", + pattern: testDevicePattern, + path: "", + expected: 0, + hasError: 
true, + }, + } + + for _, tc := range testCases { + convey.Convey(tc.name, func() { + result, err := getDevIdFromPath(tc.pattern, tc.path) + if tc.hasError { + convey.So(err, convey.ShouldNotBeNil) + } else { + convey.So(err, convey.ShouldBeNil) + convey.So(result, convey.ShouldEqual, tc.expected) + } + }) + } + }) +} + +func TestWithDefault(t *testing.T) { + convey.Convey("TestWithDefault", t, func() { + const time0s = 0 + const time3s = 3 * time.Second + const time5s = 5 * time.Second + testCases := []struct { + name string + v time.Duration + d time.Duration + expected time.Duration + }{ + {name: "should return default when duration is zero", + v: time0s, + d: time5s, + expected: time5s, + }, + {name: "should return value when duration is non-zero", + v: time3s, + d: time5s, + expected: time3s, + }, + {name: "should return value when duration is negative", + v: -1 * time.Second, + d: time5s, + expected: -1 * time.Second, + }, + } + + for _, tc := range testCases { + convey.Convey(tc.name, func() { + result := withDefault(tc.v, tc.d) + convey.So(result, convey.ShouldEqual, tc.expected) + }) + } + }) +} diff --git a/mind-cluster/component/npu-exporter/collector/container/runtime_ops.go b/mind-cluster/component/npu-exporter/collector/container/runtime_ops.go new file mode 100644 index 0000000..daab834 --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/container/runtime_ops.go @@ -0,0 +1,413 @@ +/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package container for monitoring containers' npu allocation +package container + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "strings" + "syscall" + + "ascend-common/common-utils/hwlog" + "ascend-common/common-utils/utils" + "google.golang.org/grpc" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/metadata" + "google.golang.org/grpc/status" + criv1 "k8s.io/cri-api/pkg/apis/runtime/v1" + "k8s.io/cri-api/pkg/apis/runtime/v1alpha2" + + "huawei.com/npu-exporter/v6/collector/container/isula" + "huawei.com/npu-exporter/v6/collector/container/v1" + "huawei.com/npu-exporter/v6/utils/logger" +) + +const ( + labelK8sPodNamespace = "io.kubernetes.pod.namespace" + labelK8sPodName = "io.kubernetes.pod.name" + labelContainerName = "io.kubernetes.container.name" + + // DefaultIsuladAddr default isulad sock address + DefaultIsuladAddr = "unix:///run/isulad.sock" + // DefaultDockerShim default docker shim sock address + DefaultDockerShim = "unix:///run/dockershim.sock" + // DefaultCRIDockerd default cri-dockerd sock address + DefaultCRIDockerd = "unix:///run/cri-dockerd.sock" + // DefaultContainerdAddr default containerd sock address + DefaultContainerdAddr = "unix:///run/containerd/containerd.sock" + // DefaultDockerAddr default docker containerd sock address + DefaultDockerAddr = "unix:///run/docker/containerd/docker-containerd.sock" + defaultDockerOnEuler = "unix:///run/docker/containerd/containerd.sock" + grpcHeader = "containerd-namespace" + unixPre = "unix://" + + // IsulaContainer represents isula container type + IsulaContainer = "isula" + // DefaultContainer represents default container type + DefaultContainer = "docker-containerd" + excludePermissions = 0002 + + criV1alpha2 = "runtime.v1alpha2.RuntimeService" +) + +// CommonContainer wraps some common container attributes of isulad and containerd +type CommonContainer struct { + Id
string + Labels map[string]string +} + +// RuntimeOperator wraps operations against container runtime +type RuntimeOperator interface { + Init() error + Close() error + GetContainers(ctx context.Context) ([]*CommonContainer, error) + GetContainerInfoByID(ctx context.Context, id string) (v1.Spec, error) + GetIsulaContainerInfoByID(ctx context.Context, id string) (isula.ContainerJson, error) + GetContainerType() string +} + +// RuntimeOperatorTool implements RuntimeOperator interface +type RuntimeOperatorTool struct { + criConn *grpc.ClientConn + conn *grpc.ClientConn + criClient interface{} + client interface{} + // CriEndpoint CRI server endpoint + CriEndpoint string + // OciEndpoint containerd Server endpoint + OciEndpoint string + // Namespace the namespace of containerd + Namespace string + // UseCriBackup whether to retry with the cri-dockerd address when CriEndpoint cannot be connected + UseCriBackup bool + // UseOciBackup whether to retry with a default containerd address when OciEndpoint cannot be connected + UseOciBackup bool +} + +// Init raises the uid to root if needed (restoring it on return), checks the socket paths and creates the CRI and OCI clients +func (operator *RuntimeOperatorTool) Init() error { + start := syscall.Getuid() + logger.Debugf("the init uid is:%d", start) + if start != 0 { + err := syscall.Setuid(0) + if err != nil { + return fmt.Errorf("raise uid failed: %w", err) + } + logger.Debugf("raise uid to:%d", 0) + defer func() { + if err = syscall.Setuid(start); err != nil { + logger.Errorf("recover uid failed: %v", err) + return // the uid was not restored, so skip the success log below + } + logger.Debugf("recover uid to:%d", start) + }() + } + if err := sockCheck(operator); err != nil { + hwlog.RunLog.Error("check socket path failed") + return err + } + + if err := operator.initCriClient(); err != nil { + return fmt.Errorf("init CRI client failed, %w", err) + } + + if err := operator.initOciClient(); err != nil { + return fmt.Errorf("init OCI client failed, %w", err) + } + return nil +} + +func (operator *RuntimeOperatorTool) initCriClient() error { + criConn, err := GetConnection(operator.CriEndpoint) + if err != nil || criConn == nil { + msg := fmt.Sprintf("connecting to CRI server
failed: %v", err) + if operator.UseCriBackup { + logger.Warnf("%v, will use cri-dockerd address to try again", msg) + if utils.IsExist(strings.TrimPrefix(DefaultCRIDockerd, unixPre)) { + criConn, err = GetConnection(DefaultCRIDockerd) + } + } else { + logger.Warn(msg) + } + } + if err != nil { + return fmt.Errorf("connecting to CRI server failed: %v", err) + } + if operator.CriEndpoint == DefaultIsuladAddr { + operator.criClient = isula.NewRuntimeServiceClient(criConn) + } else { + operator.criClient = v1alpha2.NewRuntimeServiceClient(criConn) + } + operator.criConn = criConn + return nil +} + +func (operator *RuntimeOperatorTool) initOciClient() error { + conn, err := GetConnection(operator.OciEndpoint) + if err != nil || conn == nil { + msg := fmt.Sprintf("failed to get OCI connection: %v", err) + if operator.UseOciBackup { + logger.Warnf("%v, will use backup address to try again", msg) + if utils.IsExist(strings.TrimPrefix(DefaultContainerdAddr, unixPre)) { + conn, err = GetConnection(DefaultContainerdAddr) + + } else if utils.IsExist(strings.TrimPrefix(defaultDockerOnEuler, unixPre)) { + conn, err = GetConnection(defaultDockerOnEuler) + } + } else { + logger.Warn(msg) + } + } + if err != nil { + return fmt.Errorf("connecting to OCI server failed: %v", err) + } + if operator.OciEndpoint == DefaultIsuladAddr { + operator.client = isula.NewContainerServiceClient(conn) + } else { + operator.client = v1.NewContainersClient(conn) + } + operator.conn = conn + return nil +} + +func sockCheck(operator *RuntimeOperatorTool) error { + absPath, err := utils.CheckPath(strings.TrimPrefix(operator.CriEndpoint, unixPre)) + if err != nil { + return err + } + if err := utils.DoCheckOwnerAndPermission(absPath, excludePermissions, 0); err != nil { + return err + } + + absPath, err = utils.CheckPath(strings.TrimPrefix(operator.OciEndpoint, unixPre)) + if err != nil { + return err + } + if err := utils.DoCheckOwnerAndPermission(absPath, excludePermissions, 0); err != nil { + return 
err + } + return nil +} + +// Close closes the OCI and CRI connections of the container runtime operator. +// The OCI close error is returned first; otherwise the CRI close error, if any. +func (operator *RuntimeOperatorTool) Close() error { + // attempt both closes even if the first one fails, + // otherwise an OCI close error would leave criConn open + ociErr := operator.conn.Close() + criErr := operator.criConn.Close() + if ociErr != nil { + return ociErr + } + return criErr +} + +// GetContainers returns the ID and labels of the running containers reported by the CRI runtime +func (operator *RuntimeOperatorTool) GetContainers(ctx context.Context) ([]*CommonContainer, error) { + if utils.IsNil(operator.criClient) || operator.criConn == nil { + return nil, errors.New("criClient is empty") + } + if client, ok := operator.criClient.(v1alpha2.RuntimeServiceClient); ok { + containers, err := getContainersByContainerdV1alpha2(ctx, client) + if isUnimplementedError(err, criV1alpha2) { + v1Client := criv1.NewRuntimeServiceClient(operator.criConn) + return getContainersByContainerdV1(ctx, v1Client) + } + return containers, err + } + if client, ok := operator.criClient.(isula.RuntimeServiceClient); ok { + return getContainersByIsulad(ctx, client) + } + + logger.Errorf("client %v is unexpected", operator.criClient) + return nil, errors.New("unexpected client type") +} + +func isUnimplementedError(err error, serviceName string) bool { + if err == nil { + return false + } + st, ok := status.FromError(err) + if ok { + return st.Code() == codes.Unimplemented && strings.Contains(st.Message(), serviceName) + } + errStr := err.Error() + if strings.Contains(errStr, "code = Unimplemented") && + strings.Contains(errStr, "desc = ") && strings.Contains(errStr, serviceName) { + return true + } + return false +} + +// GetContainerInfoByID uses the OCI client to fetch the spec of the container with the given id +func (operator *RuntimeOperatorTool) GetContainerInfoByID(ctx context.Context, id string) (v1.Spec, error) { + if utils.IsNil(operator.client) || operator.conn == nil { + return v1.Spec{}, errors.New("oci client is empty") + } + + s := v1.Spec{} + if client, ok := operator.client.(v1.ContainersClient); ok { + resp, err := client.Get(setGrpcNamespaceHeader(ctx,
operator.Namespace), &v1.GetContainerRequest{ + Id: id, + }) + if err != nil { + hwlog.RunLog.Error("get call OCI get method failed") + return v1.Spec{}, err + } + if err = json.Unmarshal(resp.Container.Spec.Value, &s); err != nil { + hwlog.RunLog.Error("unmarshal OCI response failed") + return v1.Spec{}, err + } + return s, nil + } + + return s, errors.New("unexpected containerd client") +} + +// GetIsulaContainerInfoByID returns the isula container info of the container with the given id +func (operator *RuntimeOperatorTool) GetIsulaContainerInfoByID(ctx context.Context, + id string) (isula.ContainerJson, error) { + containerJsonInfo := isula.ContainerJson{} + if utils.IsNil(operator.client) || operator.conn == nil { + return containerJsonInfo, errors.New("oci client is empty") + } + + if client, ok := operator.client.(isula.ContainerServiceClient); ok { + resp, err := client.Inspect(setGrpcNamespaceHeader(ctx, operator.Namespace), &isula.InspectContainerRequest{ + Id: id, + }) + if err != nil { + hwlog.RunLog.Error("call isula OCI Inspect method failed") + return containerJsonInfo, err + } + if err = json.Unmarshal([]byte(resp.ContainerJSON), &containerJsonInfo); err != nil { + logger.Errorf("unmarshal err: %v", err) + return containerJsonInfo, err + } + return containerJsonInfo, nil + } + + return containerJsonInfo, errors.New("unexpected isula client") +} + +// GetContainerType returns IsulaContainer when the OCI endpoint is the isulad address, DefaultContainer otherwise +func (operator *RuntimeOperatorTool) GetContainerType() string { + if operator.OciEndpoint == DefaultIsuladAddr { + return IsulaContainer + } + return DefaultContainer +} + +type nsKey struct{} + +func setGrpcNamespaceHeader(ctx context.Context, namespace string) context.Context { + ctx = context.WithValue(ctx, nsKey{}, namespace) // keep the derived context: WithValue does not modify ctx in place + ns := metadata.Pairs(grpcHeader, namespace) + md, ok := metadata.FromOutgoingContext(ctx) + if !ok { + md = ns + } else { + md = metadata.Join(ns, md) + } + return metadata.NewOutgoingContext(ctx, md) +} + +func getContainersByContainerdV1alpha2(ctx context.Context, + client
v1alpha2.RuntimeServiceClient) ([]*CommonContainer, error) { + var allContainers []*CommonContainer + request := genContainerRequestV1alpha2() + r, err := client.ListContainers(ctx, request) + if err != nil { + hwlog.RunLog.Warn(err) + return nil, err + } + for _, container := range r.Containers { + allContainers = append(allContainers, &CommonContainer{ + Id: container.Id, + Labels: container.Labels, + }) + } + return allContainers, nil +} + +func getContainersByContainerdV1(ctx context.Context, client criv1.RuntimeServiceClient) ([]*CommonContainer, error) { + var allContainers []*CommonContainer + request := genContainerRequestV1() + r, err := client.ListContainers(ctx, request) + if err != nil { + hwlog.RunLog.Error(err) + return nil, err + } + for _, container := range r.Containers { + allContainers = append(allContainers, &CommonContainer{ + Id: container.Id, + Labels: container.Labels, + }) + } + return allContainers, nil +} + +func getContainersByIsulad(ctx context.Context, client isula.RuntimeServiceClient) ([]*CommonContainer, error) { + var allContainers []*CommonContainer + request := genIsulaRequest() + r, err := client.ListContainers(ctx, request) + if err != nil { + hwlog.RunLog.Error(err) + return nil, err + } + for _, container := range r.Containers { + allContainers = append(allContainers, &CommonContainer{ + Id: container.Id, + Labels: container.Labels, + }) + } + return allContainers, nil +} + +func genContainerRequestV1alpha2() *v1alpha2.ListContainersRequest { + filter := &v1alpha2.ContainerFilter{} + st := &v1alpha2.ContainerStateValue{} + st.State = v1alpha2.ContainerState_CONTAINER_RUNNING + filter.State = st + request := &v1alpha2.ListContainersRequest{ + Filter: filter, + } + return request +} + +func genContainerRequestV1() *criv1.ListContainersRequest { + filter := &criv1.ContainerFilter{} + st := &criv1.ContainerStateValue{} + st.State = criv1.ContainerState_CONTAINER_RUNNING + filter.State = st + request := 
&criv1.ListContainersRequest{ + Filter: filter, + } + return request +} + +func genIsulaRequest() *isula.ListContainersRequest { + filter := &isula.ContainerFilter{} + st := &isula.ContainerStateValue{} + st.State = isula.ContainerState_CONTAINER_RUNNING + filter.State = st + request := &isula.ListContainersRequest{ + Filter: filter, + } + return request +} diff --git a/mind-cluster/component/npu-exporter/collector/container/runtime_ops_test.go b/mind-cluster/component/npu-exporter/collector/container/runtime_ops_test.go new file mode 100644 index 0000000..2bc135c --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/container/runtime_ops_test.go @@ -0,0 +1,568 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package container provides utilities for container monitoring and testing. 
+package container + +import ( + "context" + "errors" + "fmt" + "testing" + + "github.com/agiledragon/gomonkey/v2" + "github.com/smartystreets/goconvey/convey" + "google.golang.org/grpc" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" + criv1 "k8s.io/cri-api/pkg/apis/runtime/v1" + "k8s.io/cri-api/pkg/apis/runtime/v1alpha2" + + "ascend-common/common-utils/utils" + "huawei.com/npu-exporter/v6/collector/container/isula" + "huawei.com/npu-exporter/v6/collector/container/v1" +) + +const ( + // Test constants for runtime operations + testNamespace = "test-namespace" + + // Test error messages + testInitCriError = "init CRI client failed" + testInitOciError = "init OCI client failed" + testSockCheckError = "socket check failed" + testCriClientEmptyError = "criClient is empty" + testOciClientEmptyError = "oci client is empty" + testUnexpectedClientError = "unexpected client type" + testUnexpectedContainerdClientError = "unexpected containerd client" + testUnexpectedIsulaClientError = "unexpected isula client" + testCriV1alpha2 = "runtime.v1alpha2.RuntimeService" + testCriV1 = "runtime.v1.RuntimeService" +) + +func TestRuntimeOperatorToolInit(t *testing.T) { + r := &RuntimeOperatorTool{ + CriEndpoint: testContainerdEndpoint, + OciEndpoint: testContainerdEndpoint, + } + convey.Convey("should initialize successfully when all components succeed", t, func() { + operator := r + patches := gomonkey.ApplyFuncReturn(sockCheck, nil) + defer patches.Reset() + patches.ApplyFuncReturn((*RuntimeOperatorTool).initCriClient, nil) + patches.ApplyFuncReturn((*RuntimeOperatorTool).initOciClient, nil) + err := operator.Init() + convey.So(err, convey.ShouldBeNil) + }) + convey.Convey("should return error when socket check fails", t, func() { + operator := r + patches := gomonkey.ApplyFuncReturn(sockCheck, errors.New(testSockCheckError)) + defer patches.Reset() + err := operator.Init() + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), 
convey.ShouldContainSubstring, testSockCheckError) + }) + convey.Convey("should return error when CRI client init fails", t, func() { + operator := r + patches := gomonkey.ApplyFuncReturn(sockCheck, nil) + defer patches.Reset() + patches.ApplyFuncReturn((*RuntimeOperatorTool).initCriClient, errors.New(testInitCriError)) + patches.ApplyFuncReturn((*RuntimeOperatorTool).initOciClient, nil) + err := operator.Init() + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldContainSubstring, testInitCriError) + }) + convey.Convey("should return error when OCI client init fails", t, func() { + operator := r + patches := gomonkey.ApplyFuncReturn(sockCheck, nil) + defer patches.Reset() + patches.ApplyFuncReturn((*RuntimeOperatorTool).initCriClient, nil) + patches.ApplyFuncReturn((*RuntimeOperatorTool).initOciClient, errors.New(testInitOciError)) + err := operator.Init() + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldContainSubstring, testInitOciError) + }) +} + +func TestRuntimeOperatorToolInitCriClient(t *testing.T) { + convey.Convey("TestRuntimeOperatorToolInitCriClient", t, func() { + convey.Convey("should initialize CRI client successfully for containerd", func() { + operator := &RuntimeOperatorTool{ + CriEndpoint: testContainerdEndpoint, + UseOciBackup: false, + UseCriBackup: false, + } + + patches := gomonkey.ApplyFuncReturn(GetConnection, &grpc.ClientConn{}, nil) + defer patches.Reset() + + err := operator.initCriClient() + convey.So(err, convey.ShouldBeNil) + }) + + convey.Convey("should initialize CRI client successfully for isulad", func() { + operator := &RuntimeOperatorTool{ + CriEndpoint: DefaultIsuladAddr, + UseOciBackup: false, + UseCriBackup: false, + } + + patches := gomonkey.ApplyFuncReturn(GetConnection, &grpc.ClientConn{}, nil) + defer patches.Reset() + + err := operator.initCriClient() + convey.So(err, convey.ShouldBeNil) + }) + + convey.Convey("should return error when connection fails and no backup", 
func() { + operator := &RuntimeOperatorTool{ + CriEndpoint: testContainerdEndpoint, + UseOciBackup: false, + UseCriBackup: false, + } + + patches := gomonkey.ApplyFuncReturn(GetConnection, nil, errors.New("connection failed")) + defer patches.Reset() + + err := operator.initCriClient() + convey.So(err, convey.ShouldNotBeNil) + }) + }) +} + +func TestRuntimeOperatorToolInitOciClient(t *testing.T) { + testCases := buildInitOciClientTestCases() + for _, tc := range testCases { + convey.Convey(tc.name, t, func() { + operator, patches := tc.setup() + if patches != nil { + defer patches.Reset() + } + err := operator.initOciClient() + if tc.hasError { + convey.So(err, convey.ShouldNotBeNil) + } else { + convey.So(err, convey.ShouldBeNil) + } + }) + } +} + +type initOciClientTestCase struct { + name string + setup func() (*RuntimeOperatorTool, *gomonkey.Patches) + hasError bool +} + +func buildInitOciClientTestCases() []initOciClientTestCase { + return []initOciClientTestCase{ + {name: "should initialize OCI client successfully for containerd", + setup: func() (*RuntimeOperatorTool, *gomonkey.Patches) { + op := &RuntimeOperatorTool{OciEndpoint: testContainerdEndpoint, UseOciBackup: false} + p := gomonkey.ApplyFuncReturn(GetConnection, &grpc.ClientConn{}, nil) + return op, p + }, + hasError: false}, + {name: "should initialize OCI client successfully for isulad", + setup: func() (*RuntimeOperatorTool, *gomonkey.Patches) { + op := &RuntimeOperatorTool{OciEndpoint: DefaultIsuladAddr, UseOciBackup: false} + p := gomonkey.ApplyFuncReturn(GetConnection, &grpc.ClientConn{}, nil) + return op, p + }, + hasError: false}, + {name: "should return error when connection fails and no backup", + setup: func() (*RuntimeOperatorTool, *gomonkey.Patches) { + op := &RuntimeOperatorTool{OciEndpoint: testContainerdEndpoint, UseOciBackup: false} + p := gomonkey.ApplyFuncReturn(GetConnection, nil, errors.New("connection failed")) + return op, p + }, + hasError: true}, + {name: "should return error 
when OCI endpoint is empty", + setup: func() (*RuntimeOperatorTool, *gomonkey.Patches) { + op := &RuntimeOperatorTool{OciEndpoint: "", UseOciBackup: false} + return op, nil + }, + hasError: true}, + {name: "should try backup when primary connection fails", + setup: func() (*RuntimeOperatorTool, *gomonkey.Patches) { + op := &RuntimeOperatorTool{OciEndpoint: testContainerdEndpoint, UseOciBackup: true} + p := gomonkey.ApplyFunc(GetConnection, func(endpoint string) (*grpc.ClientConn, error) { + if endpoint == testContainerdEndpoint { + return nil, errors.New("primary failed") + } + return nil, errors.New("backup failed") + }) + return op, p + }, + hasError: true}, + {name: "should return error when all connections fail", + setup: func() (*RuntimeOperatorTool, *gomonkey.Patches) { + op := &RuntimeOperatorTool{OciEndpoint: testContainerdEndpoint, UseOciBackup: true} + p := gomonkey.ApplyFuncReturn(GetConnection, nil, errors.New("all failed")) + return op, p + }, + hasError: true}, + } +} + +func TestSockCheck(t *testing.T) { + convey.Convey("TestSockCheck", t, func() { + convey.Convey("should pass when socket paths are valid", func() { + operator := &RuntimeOperatorTool{ + CriEndpoint: testContainerdEndpoint, + OciEndpoint: testContainerdEndpoint, + } + + patches := gomonkey.ApplyFuncReturn(utils.CheckPath, "/run/containerd.sock", nil) + defer patches.Reset() + patches.ApplyFuncReturn(utils.DoCheckOwnerAndPermission, nil) + + err := sockCheck(operator) + convey.So(err, convey.ShouldBeNil) + }) + + convey.Convey("should return error when CRI endpoint check fails", func() { + operator := &RuntimeOperatorTool{ + CriEndpoint: testContainerdEndpoint, + OciEndpoint: testContainerdEndpoint, + } + + patches := gomonkey.ApplyFuncReturn(utils.CheckPath, "", errors.New("path check failed")) + defer patches.Reset() + + err := sockCheck(operator) + convey.So(err, convey.ShouldNotBeNil) + }) + + convey.Convey("should return error when CRI endpoint permission check fails", func() { + 
operator := &RuntimeOperatorTool{ + CriEndpoint: testContainerdEndpoint, + OciEndpoint: testContainerdEndpoint, + } + + patches := gomonkey.ApplyFuncReturn(utils.CheckPath, "/run/containerd.sock", nil) + defer patches.Reset() + patches.ApplyFuncReturn(utils.DoCheckOwnerAndPermission, errors.New("permission check failed")) + + err := sockCheck(operator) + convey.So(err, convey.ShouldNotBeNil) + }) + }) +} + +func TestRuntimeOperatorToolClose(t *testing.T) { + convey.Convey("TestRuntimeOperatorToolClose", t, func() { + convey.Convey("should close connections successfully", func() { + operator := &RuntimeOperatorTool{ + conn: &grpc.ClientConn{}, + criConn: &grpc.ClientConn{}, + } + + patches := gomonkey.ApplyFunc((*grpc.ClientConn).Close, func(*grpc.ClientConn) error { + return nil + }) + defer patches.Reset() + + err := operator.Close() + convey.So(err, convey.ShouldBeNil) + }) + + convey.Convey("should return error when OCI connection close fails", func() { + operator := &RuntimeOperatorTool{ + conn: &grpc.ClientConn{}, + criConn: &grpc.ClientConn{}, + } + + patches := gomonkey.ApplyFunc((*grpc.ClientConn).Close, func(*grpc.ClientConn) error { + return errors.New("close failed") + }) + defer patches.Reset() + + err := operator.Close() + convey.So(err, convey.ShouldNotBeNil) + }) + }) +} + +func TestRuntimeOperatorToolGetContainers(t *testing.T) { + convey.Convey("TestRuntimeOperatorToolGetContainers", t, func() { + convey.Convey("should return error when CRI client is empty", func() { + operator := &RuntimeOperatorTool{} + + patches := gomonkey.ApplyFuncReturn(utils.IsNil, true) + defer patches.Reset() + + containers, err := operator.GetContainers(context.Background()) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldEqual, testCriClientEmptyError) + convey.So(containers, convey.ShouldBeNil) + }) + + convey.Convey("should return error when CRI connection is nil", func() { + operator := &RuntimeOperatorTool{ + criClient: "mock-client", + 
} + + patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) + defer patches.Reset() + + containers, err := operator.GetContainers(context.Background()) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldEqual, testCriClientEmptyError) + convey.So(containers, convey.ShouldBeNil) + }) + + convey.Convey("should return error when client type is unexpected", func() { + operator := &RuntimeOperatorTool{ + criClient: "unexpected", + criConn: &grpc.ClientConn{}, + } + + patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) + defer patches.Reset() + + containers, err := operator.GetContainers(context.Background()) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldEqual, testUnexpectedClientError) + convey.So(containers, convey.ShouldBeNil) + }) + }) +} + +func TestIsUnimplementedError(t *testing.T) { + tests := []struct { + name string + err error + serviceName string + want bool + }{ + { + name: "nil error returns false", + err: nil, + serviceName: testCriV1alpha2, + want: false, + }, + { + name: "non-grpc error returns false", + err: errors.New("unknown service " + testCriV1alpha2), + serviceName: testCriV1alpha2, + want: false, + }, + { + name: "mismatched code returns false", + err: status.Error(codes.NotFound, "unknown service "+testCriV1alpha2), + serviceName: testCriV1alpha2, + want: false, + }, + { + name: "mismatched message returns false", + err: status.Error(codes.Unimplemented, "unknown service "+testCriV1), + serviceName: testCriV1alpha2, + want: false, + }, + { + name: "matched unimplemented error returns true", + err: status.Error(codes.Unimplemented, "unknown service "+testCriV1alpha2), + serviceName: testCriV1alpha2, + want: true, + }, + { + name: "real grpc error format returns true", + err: fmt.Errorf("rpc error: code = Unimplemented desc = unknown service " + testCriV1alpha2), + serviceName: testCriV1alpha2, + want: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) 
{ + if got := isUnimplementedError(tt.err, tt.serviceName); got != tt.want { + t.Errorf("isUnimplementedError() = %v, want %v (err: %v)", got, tt.want, tt.err) + } + }) + } +} + +func TestRuntimeOperatorToolGetContainerInfoByID(t *testing.T) { + convey.Convey("TestRuntimeOperatorToolGetContainerInfoByID", t, func() { + convey.Convey("should return error when OCI client is empty", func() { + operator := &RuntimeOperatorTool{} + patches := gomonkey.ApplyFuncReturn(utils.IsNil, true) + defer patches.Reset() + spec, err := operator.GetContainerInfoByID(context.Background(), testContainerID) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldEqual, testOciClientEmptyError) + convey.So(spec, convey.ShouldResemble, v1.Spec{}) + }) + convey.Convey("should return error when OCI connection is nil", func() { + operator := &RuntimeOperatorTool{client: "mock-client"} + patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) + defer patches.Reset() + spec, err := operator.GetContainerInfoByID(context.Background(), testContainerID) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldEqual, testOciClientEmptyError) + convey.So(spec, convey.ShouldResemble, v1.Spec{}) + }) + convey.Convey("should return error when client type is unexpected", func() { + operator := &RuntimeOperatorTool{client: "unexpected", conn: &grpc.ClientConn{}} + patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) + defer patches.Reset() + spec, err := operator.GetContainerInfoByID(context.Background(), testContainerID) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldEqual, testUnexpectedContainerdClientError) + convey.So(spec, convey.ShouldResemble, v1.Spec{}) + }) + convey.Convey("should return error when GetContainer call fails", func() { + operator := &RuntimeOperatorTool{client: "mock-containers-client", conn: &grpc.ClientConn{}} + patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) + defer patches.Reset() + spec, err 
:= operator.GetContainerInfoByID(context.Background(), testContainerID) + convey.So(err, convey.ShouldNotBeNil) + convey.So(spec, convey.ShouldResemble, v1.Spec{}) + }) + convey.Convey("should return error when JSON unmarshal fails", func() { + operator := &RuntimeOperatorTool{client: "mock-containers-client", conn: &grpc.ClientConn{}} + patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) + defer patches.Reset() + spec, err := operator.GetContainerInfoByID(context.Background(), testContainerID) + convey.So(err, convey.ShouldNotBeNil) + convey.So(spec, convey.ShouldResemble, v1.Spec{}) + }) + + }) +} + +func TestRuntimeOperatorToolGetIsulaContainerInfoByID(t *testing.T) { + convey.Convey("TestRuntimeOperatorToolGetIsulaContainerInfoByID", t, func() { + convey.Convey("should return error when OCI client is empty", func() { + operator := &RuntimeOperatorTool{} + patches := gomonkey.ApplyFuncReturn(utils.IsNil, true) + defer patches.Reset() + containerInfo, err := operator.GetIsulaContainerInfoByID(context.Background(), testContainerID) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldEqual, testOciClientEmptyError) + convey.So(containerInfo, convey.ShouldResemble, isula.ContainerJson{}) + }) + convey.Convey("should return error when OCI connection is nil", func() { + operator := &RuntimeOperatorTool{client: "mock-client"} + patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) + defer patches.Reset() + containerInfo, err := operator.GetIsulaContainerInfoByID(context.Background(), testContainerID) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldEqual, testOciClientEmptyError) + convey.So(containerInfo, convey.ShouldResemble, isula.ContainerJson{}) + }) + convey.Convey("should return error when client type is unexpected", func() { + operator := &RuntimeOperatorTool{client: "unexpected", conn: &grpc.ClientConn{}} + patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) + defer patches.Reset() + 
containerInfo, err := operator.GetIsulaContainerInfoByID(context.Background(), testContainerID) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldEqual, testUnexpectedIsulaClientError) + convey.So(containerInfo, convey.ShouldResemble, isula.ContainerJson{}) + }) + convey.Convey("should return error when Inspect call fails", func() { + operator := &RuntimeOperatorTool{client: "mock-isula-client", conn: &grpc.ClientConn{}} + patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) + defer patches.Reset() + containerInfo, err := operator.GetIsulaContainerInfoByID(context.Background(), testContainerID) + convey.So(err, convey.ShouldNotBeNil) + convey.So(containerInfo, convey.ShouldResemble, isula.ContainerJson{}) + }) + convey.Convey("should return error when JSON unmarshal fails", func() { + operator := &RuntimeOperatorTool{client: "mock-isula-client", conn: &grpc.ClientConn{}} + patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) + defer patches.Reset() + containerInfo, err := operator.GetIsulaContainerInfoByID(context.Background(), testContainerID) + convey.So(err, convey.ShouldNotBeNil) + convey.So(containerInfo, convey.ShouldResemble, isula.ContainerJson{}) + }) + + }) +} + +func TestRuntimeOperatorToolGetContainerType(t *testing.T) { + convey.Convey("TestRuntimeOperatorToolGetContainerType", t, func() { + convey.Convey("should return isula when endpoint is isulad", func() { + operator := &RuntimeOperatorTool{ + OciEndpoint: DefaultIsuladAddr, + } + + containerType := operator.GetContainerType() + convey.So(containerType, convey.ShouldEqual, IsulaContainer) + }) + + convey.Convey("should return default when endpoint is not isulad", func() { + operator := &RuntimeOperatorTool{ + OciEndpoint: testContainerdEndpoint, + } + + containerType := operator.GetContainerType() + convey.So(containerType, convey.ShouldEqual, DefaultContainer) + }) + }) +} + +func TestSetGrpcNamespaceHeader(t *testing.T) { + 
convey.Convey("TestSetGrpcNamespaceHeader", t, func() { + convey.Convey("should set namespace header when context has no metadata", func() { + ctx := context.Background() + result := setGrpcNamespaceHeader(ctx, testNamespace) + convey.So(result, convey.ShouldNotBeNil) + }) + + convey.Convey("should set namespace header when context has existing metadata", func() { + ctx := context.Background() + ctx = context.WithValue(ctx, "test", "value") + result := setGrpcNamespaceHeader(ctx, testNamespace) + convey.So(result, convey.ShouldNotBeNil) + }) + }) +} + +func TestGenContainerRequestV1alpha2(t *testing.T) { + convey.Convey("TestGenContainerRequestV1alpha2", t, func() { + convey.Convey("should generate valid container request", func() { + request := genContainerRequestV1alpha2() + convey.So(request, convey.ShouldNotBeNil) + convey.So(request.Filter, convey.ShouldNotBeNil) + convey.So(request.Filter.State, convey.ShouldNotBeNil) + convey.So(request.Filter.State.State, convey.ShouldEqual, v1alpha2.ContainerState_CONTAINER_RUNNING) + }) + }) +} + +func TestGenContainerRequestV1(t *testing.T) { + convey.Convey("TestGenContainerRequestV1", t, func() { + convey.Convey("should generate valid container request", func() { + request := genContainerRequestV1() + convey.So(request, convey.ShouldNotBeNil) + convey.So(request.Filter, convey.ShouldNotBeNil) + convey.So(request.Filter.State, convey.ShouldNotBeNil) + convey.So(request.Filter.State.State, convey.ShouldEqual, criv1.ContainerState_CONTAINER_RUNNING) + }) + }) +} + +func TestGenIsulaRequest(t *testing.T) { + convey.Convey("TestGenIsulaRequest", t, func() { + convey.Convey("should generate valid isula request", func() { + request := genIsulaRequest() + convey.So(request, convey.ShouldNotBeNil) + convey.So(request.Filter, convey.ShouldNotBeNil) + convey.So(request.Filter.State, convey.ShouldNotBeNil) + convey.So(request.Filter.State.State, convey.ShouldEqual, isula.ContainerState_CONTAINER_RUNNING) + }) + }) +} diff --git 
a/mind-cluster/component/npu-exporter/collector/container/utils.go b/mind-cluster/component/npu-exporter/collector/container/utils.go
new file mode 100644
index 0000000..b5ff57e
--- /dev/null
+++ b/mind-cluster/component/npu-exporter/collector/container/utils.go
@@ -0,0 +1,140 @@
+/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+// Package container for monitoring containers' npu allocation
+package container
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"net"
+	"net/url"
+	"strings"
+	"time"
+
+	"google.golang.org/grpc"
+
+	"ascend-common/common-utils/hwlog"
+	"ascend-common/common-utils/utils"
+	"huawei.com/npu-exporter/v6/utils/logger"
+)
+
+const (
+	defaultTimeout = 5 * time.Second
+	unixPrefix     = "unix"
+	// MaxLenDNS configName max len
+	MaxLenDNS = 512
+	// MinLenDNS configName min len
+	MinLenDNS     = 1
+	maxContainers = 1024
+	maxCgroupPath = 2048
+
+	maxDevicesNum = 100000
+	maxEnvNum     = 10000
+)
+
+// CgroupVersion is the cgroups mode of the host system
+type CgroupVersion int
+
+// GetConnection dials the gRPC endpoint of a container runtime and returns the
+// connection. Only "unix://" endpoints are accepted (see getAddressAndDialer).
+// The dial blocks until the connection is up or defaultTimeout has elapsed.
+func GetConnection(endPoint string) (*grpc.ClientConn, error) {
+	if endPoint == "" {
+		return nil, errors.New("endpoint is not set")
+	}
+	// The endpoint is only ever logged after utils.MaskPrefix; build that form once.
+	maskedEndpoint := utils.MaskPrefix(strings.TrimPrefix(endPoint, unixPrefix+"://"))
+	logger.Debugf("connect using endpoint '%s' with '%s' timeout", maskedEndpoint, defaultTimeout)
+	addr, dialer, err := getAddressAndDialer(endPoint)
+	if err != nil {
+		hwlog.RunLog.Error(err)
+		return nil, err
+	}
+	ctx, cancelFn := context.WithTimeout(context.Background(), defaultTimeout)
+	defer cancelFn()
+	// NOTE(review): grpc.WithInsecure is deprecated in newer grpc-go releases; confirm the
+	// pinned grpc version before switching to WithTransportCredentials(insecure.NewCredentials()).
+	conn, err := grpc.DialContext(ctx, addr, grpc.WithInsecure(), grpc.WithBlock(), grpc.WithContextDialer(dialer))
+	if err != nil {
+		return nil, err
+	}
+	logger.Debugf("connected successfully using endpoint: %s", maskedEndpoint)
+	return conn, nil
+}
+
+// parseSocketEndpoint splits endpoint into protocol and address. The address is
+// the URL path for "unix" and the host part for "tcp"; any other scheme yields an
+// error together with the scheme that was found.
+func parseSocketEndpoint(endpoint string) (string, string, error) {
+	u, err := url.Parse(endpoint)
+	if err != nil {
+		return "", "", err
+	}
+
+	switch u.Scheme {
+	case "unix":
+		return "unix", u.Path, nil
+	case "tcp":
+		return "tcp", u.Host, nil
+	default:
+		return u.Scheme, "", fmt.Errorf("protocol %q not supported", u.Scheme)
+	}
+}
+
+// getAddressAndDialer returns the address parsed from the given socket endpoint and dialer
+func getAddressAndDialer(endpoint string) (string, func(ctx context.Context, addr string) (net.Conn, error), error) {
+	prefix, addr, err := parseSocketEndpoint(endpoint)
+	if err != nil {
+		return "", nil, err
+	}
+	if prefix != unixPrefix {
+		return "", nil, errors.New("only support unix socket")
+	}
+	return addr, dial, nil
+}
+
+// dial return the context dialer
+func dial(ctx context.Context, addr string) (net.Conn, error) {
+	return (&net.Dialer{}).DialContext(ctx, unixPrefix, addr)
+}
+
+// validDNSRe checks that dnsContent is between MinLenDNS and MaxLenDNS bytes long.
+// Despite the name, only the length is validated; no pattern is matched.
+func validDNSRe(dnsContent string) error {
+	if len(dnsContent) < MinLenDNS || len(dnsContent) > MaxLenDNS {
+		return errors.New("param len invalid")
+	}
+	return nil
+}
+
+// makeUpDeviceInfo builds the DevicesInfo of a container: ID is the container id and
+// Name is "namespace_pod_container" taken from the container's labels. A nil
+// container, or a label that fails validDNSRe, results in an error and a zero DevicesInfo.
+func makeUpDeviceInfo(c *CommonContainer) (DevicesInfo, error) {
+	if c == nil {
+		return DevicesInfo{}, errors.New("container is nil")
+	}
+	ns := c.Labels[labelK8sPodNamespace]
+	podName := c.Labels[labelK8sPodName]
+	containerName := c.Labels[labelContainerName]
+	for _, v := range []string{ns, podName, containerName} {
+		if err := validDNSRe(v); err != nil {
+			return DevicesInfo{}, err
+		}
+	}
+
+	return DevicesInfo{ID: c.Id, Name: ns + "_" + podName + "_" + containerName}, nil
+}
diff --git a/mind-cluster/component/npu-exporter/collector/container/utils_test.go b/mind-cluster/component/npu-exporter/collector/container/utils_test.go
new file mode 100644
index 0000000..32e6716
--- /dev/null
+++ b/mind-cluster/component/npu-exporter/collector/container/utils_test.go
@@ -0,0 +1,329 @@
+/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/ + +// package container test methods in utils +package container + +import ( + "context" + "errors" + "net" + "testing" + + "github.com/agiledragon/gomonkey/v2" + "github.com/smartystreets/goconvey/convey" + "google.golang.org/grpc" + + "ascend-common/common-utils/hwlog" + "huawei.com/npu-exporter/v6/utils/logger" +) + +const ( + testContainerID = "container123" + testPodNamespace = "default" + testPodName = "test-pod" + testContainerName = "test-container" + testUnixSocket = "unix:///test.sock" + testInvalidEndpoint = "invalid://endpoint" + testDialError = "dial error" + testGrpcDialError = "grpc dial error" + testInvalidEndpointError = "invalid endpoint" + testEndpointNotSetError = "endpoint is not set" + testDNSContent = "test-dns" + testMinDNSContent = "a" + testEmptyDNSContent = "" + testTarget = "test" + testUnixScheme = "unix" + testTcpScheme = "tcp" + testUnixAddr = "/tmp/test.sock" + testTcpAddr = "localhost:8080" + testInvalidURL = "://invalid" + testEmptyNamespace = "" + testEmptyPodName = "" + testEmptyContainerName = "" +) + +func init() { + logger.HwLogConfig = &hwlog.LogConfig{ + OnlyToStdout: true, + } + logger.InitLogger("Prometheus") +} + +func TestGetConnection(t *testing.T) { + convey.Convey("TestGetConnection", t, func() { + convey.Convey("should return error when endpoint is empty", func() { + testEmptyEndpoint() + }) + convey.Convey("should return error when endpoint is invalid", func() { + testInvalidEndpointFunc() + }) + convey.Convey("should return error when grpc dial context fails", func() { + testGrpcDialErrorFunc() + }) + convey.Convey("should return connection when successful", func() { + testSuccessfulConnection() + }) + }) +} + +func testEmptyEndpoint() { + conn, err := GetConnection("") + convey.So(conn, convey.ShouldBeNil) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldContainSubstring, testEndpointNotSetError) +} + +func testInvalidEndpointFunc() { + patches := 
gomonkey.ApplyFuncReturn(getAddressAndDialer, "", nil, errors.New(testInvalidEndpointError)) + defer patches.Reset() + conn, err := GetConnection(testInvalidEndpoint) + convey.So(conn, convey.ShouldBeNil) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldContainSubstring, testInvalidEndpointError) +} + +func testGrpcDialErrorFunc() { + patches := gomonkey.ApplyFunc(getAddressAndDialer, + func(endpoint string) (string, func(ctx context.Context, addr string) (net.Conn, error), error) { + return testTarget, func(ctx context.Context, addr string) (net.Conn, error) { + return nil, errors.New(testDialError) + }, nil + }) + defer patches.Reset() + patches.ApplyFuncReturn(grpc.DialContext, nil, errors.New(testGrpcDialError)) + conn, err := GetConnection(testUnixSocket) + convey.So(conn, convey.ShouldBeNil) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldContainSubstring, testGrpcDialError) +} + +func testSuccessfulConnection() { + mockConn := &grpc.ClientConn{} + patches := gomonkey.ApplyFunc(getAddressAndDialer, + func(endpoint string) (string, func(ctx context.Context, addr string) (net.Conn, error), error) { + return testTarget, func(ctx context.Context, addr string) (net.Conn, error) { + return nil, nil + }, nil + }) + defer patches.Reset() + patches.ApplyFuncReturn(grpc.DialContext, mockConn, nil) + conn, err := GetConnection(testUnixSocket) + convey.So(conn, convey.ShouldEqual, mockConn) + convey.So(err, convey.ShouldBeNil) +} + +func TestParseSocketEndpoint(t *testing.T) { + testCases := []struct { + name string + endpoint string + expectedScheme string + expectedAddr string + expectedError bool + }{ + {name: "should parse unix endpoint when valid", endpoint: "unix:///tmp/test.sock", + expectedScheme: testUnixScheme, expectedAddr: testUnixAddr, expectedError: false}, + {name: "should parse tcp endpoint when valid", endpoint: "tcp://localhost:8080", + expectedScheme: testTcpScheme, expectedAddr: testTcpAddr, 
expectedError: false}, + {name: "should return error when scheme is invalid", endpoint: "http://localhost:8080", + expectedScheme: "http", expectedAddr: "", expectedError: true}, + {name: "should return error when url is invalid", endpoint: testInvalidURL, + expectedScheme: "", expectedAddr: "", expectedError: true}, + } + + for _, tc := range testCases { + convey.Convey(tc.name, t, func() { + scheme, addr, err := parseSocketEndpoint(tc.endpoint) + convey.So(scheme, convey.ShouldEqual, tc.expectedScheme) + convey.So(addr, convey.ShouldEqual, tc.expectedAddr) + if tc.expectedError { + convey.So(err, convey.ShouldNotBeNil) + } else { + convey.So(err, convey.ShouldBeNil) + } + }) + } +} + +func TestGetAddressAndDialer(t *testing.T) { + convey.Convey("TestGetAddressAndDialer", t, func() { + testCases := []struct { + name string + endpoint string + expectedAddr string + expectedError bool + }{ + { + name: "should return address when unix endpoint is valid", + endpoint: "unix:///tmp/test.sock", + expectedAddr: "/tmp/test.sock", + expectedError: false, + }, + { + name: "should return error when scheme is invalid", + endpoint: "tcp://localhost:8080", + expectedAddr: "", + expectedError: true, + }, + { + name: "should return error when parse fails", + endpoint: "://invalid", + expectedAddr: "", + expectedError: true, + }, + } + + for _, tc := range testCases { + convey.Convey(tc.name, func() { + addr, dialer, err := getAddressAndDialer(tc.endpoint) + convey.So(addr, convey.ShouldEqual, tc.expectedAddr) + if tc.expectedError { + convey.So(dialer, convey.ShouldBeNil) + convey.So(err, convey.ShouldNotBeNil) + } else { + convey.So(dialer, convey.ShouldNotBeNil) + convey.So(err, convey.ShouldBeNil) + } + }) + } + }) +} + +func TestDial(t *testing.T) { + convey.Convey("should call net.Dialer.DialContext when dialing", t, func() { + var dialerCalled bool + patches := gomonkey.ApplyMethod(&net.Dialer{}, "DialContext", + func(d *net.Dialer, ctx context.Context, network, address 
string) (net.Conn, error) { + dialerCalled = true + return nil, errors.New("mock dial error") + }) + defer patches.Reset() + ctx := context.Background() + conn, err := dial(ctx, "/tmp/test.sock") + convey.So(conn, convey.ShouldBeNil) + convey.So(err, convey.ShouldNotBeNil) + convey.So(dialerCalled, convey.ShouldBeTrue) + }) +} + +func TestValidDNSRe(t *testing.T) { + convey.Convey("TestValidDNSRe", t, func() { + testCases := []struct { + name string + dnsContent string + expectedError bool + }{ + {name: "should pass validation when dns content has valid length", + dnsContent: testDNSContent, expectedError: false}, + {name: "should return error when dns content is empty", + dnsContent: testEmptyDNSContent, expectedError: true}, + {name: "should return error when dns content is too long", + dnsContent: string(make([]byte, MaxLenDNS+1)), expectedError: true}, + {name: "should pass validation when dns content has minimum valid length", + dnsContent: testMinDNSContent, expectedError: false}, + {name: "should pass validation when dns content has maximum valid length", + dnsContent: string(make([]byte, MaxLenDNS)), expectedError: false}, + } + + for _, tc := range testCases { + convey.Convey(tc.name, func() { + err := validDNSRe(tc.dnsContent) + if tc.expectedError { + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldContainSubstring, "param len invalid") + } else { + convey.So(err, convey.ShouldBeNil) + } + }) + } + }) +} + +func TestMakeUpDeviceInfo(t *testing.T) { + testCases := getMakeUpDeviceInfoTestCases() + for _, tc := range testCases { + convey.Convey(tc.name, t, func() { + deviceInfo, err := makeUpDeviceInfo(tc.container) + validateMakeUpDeviceInfoResult(deviceInfo, err, tc) + }) + } +} + +func getMakeUpDeviceInfoTestCases() []struct { + name string + container *CommonContainer + expectedError bool + expectedName string +} { + return []struct { + name string + container *CommonContainer + expectedError bool + expectedName string + }{ + 
{name: "should return valid device info when container has all labels", + container: createValidContainer(), expectedError: false, expectedName: "default_test-pod_test-container"}, + {name: "should return error when container has invalid namespace length", + container: createContainerWithEmptyNamespace(), expectedError: true, expectedName: ""}, + {name: "should return error when container has invalid pod name length", + container: createContainerWithEmptyPodName(), expectedError: true, expectedName: ""}, + {name: "should return error when container has invalid container name length", + container: createContainerWithEmptyContainerName(), expectedError: true, expectedName: ""}, + {name: "should return error when container has too long namespace", + container: createContainerWithLongNamespace(), expectedError: true, expectedName: ""}, + } +} + +func createValidContainer() *CommonContainer { + return &CommonContainer{Id: testContainerID, Labels: map[string]string{ + labelK8sPodNamespace: testPodNamespace, labelK8sPodName: testPodName, + labelContainerName: testContainerName}} +} +func createContainerWithEmptyNamespace() *CommonContainer { + return &CommonContainer{Id: testContainerID, Labels: map[string]string{ + labelK8sPodNamespace: testEmptyNamespace, labelK8sPodName: testPodName, + labelContainerName: testContainerName}} +} +func createContainerWithEmptyPodName() *CommonContainer { + return &CommonContainer{Id: testContainerID, Labels: map[string]string{ + labelK8sPodNamespace: testPodNamespace, labelK8sPodName: testEmptyPodName, + labelContainerName: testContainerName}} +} +func createContainerWithEmptyContainerName() *CommonContainer { + return &CommonContainer{Id: testContainerID, Labels: map[string]string{ + labelK8sPodNamespace: testPodNamespace, labelK8sPodName: testPodName, + labelContainerName: testEmptyContainerName}} +} + +func createContainerWithLongNamespace() *CommonContainer { + return &CommonContainer{Id: testContainerID, Labels: map[string]string{ + 
labelK8sPodNamespace: string(make([]byte, MaxLenDNS+1)), + labelK8sPodName: testPodName, labelContainerName: testContainerName}} +} + +func validateMakeUpDeviceInfoResult(deviceInfo DevicesInfo, err error, tc struct { + name string + container *CommonContainer + expectedError bool + expectedName string +}) { + if tc.expectedError { + convey.So(err, convey.ShouldNotBeNil) + convey.So(deviceInfo, convey.ShouldResemble, DevicesInfo{}) + } else { + convey.So(err, convey.ShouldBeNil) + convey.So(deviceInfo.ID, convey.ShouldEqual, tc.container.Id) + convey.So(deviceInfo.Name, convey.ShouldEqual, tc.expectedName) + } +} diff --git a/mind-cluster/component/npu-exporter/collector/container/v1/containerd.pb.go b/mind-cluster/component/npu-exporter/collector/container/v1/containerd.pb.go new file mode 100644 index 0000000..46762f3 --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/container/v1/containerd.pb.go @@ -0,0 +1,310 @@ +// Code generated by protoc-gen-go. DO NOT EDIT. +// source: containerd.proto +// protoc:3.13.0 +// protoc-gen-go 1.3.5 + +package v1 + +import ( + "context" + "fmt" + "math" + + "github.com/golang/protobuf/proto" + "github.com/golang/protobuf/ptypes/any" + "google.golang.org/grpc" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" +) + +// Reference imports to suppress errors if they are not otherwise used. +var _ = fmt.Errorf +var _ = math.Inf +var _ = proto.Marshal + +// This is a compile-time assertion to ensure that this generated file +// is compatible with the proto package it is being compiled against. +// A compilation error at this line likely means your copy of the +// proto package needs to be updated. 
+const _ = proto.ProtoPackageIsVersion3 // please upgrade the proto package + +type Container struct { + // ID the container id + Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"` + // Labels the container labels + Labels map[string]string `protobuf:"bytes,2,rep,name=labels,proto3" json:"labels,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` + // Image the container image + Image string `protobuf:"bytes,3,opt,name=image,proto3" json:"image,omitempty"` + // Spec runtime specific. + Spec *any.Any `protobuf:"bytes,5,opt,name=spec,proto3" json:"spec,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` +} + +// Reset reset the object +func (m *Container) Reset() { *m = Container{} } + +// String +func (m *Container) String() string { return proto.CompactTextString(m) } + +// ProtoMessage +func (*Container) ProtoMessage() {} + +// Descriptor +func (*Container) Descriptor() ([]byte, []int) { + return fileDescriptor_29bcc067d8d1b7d0, []int{0} +} + +// XXX_Unmarshal +func (m *Container) XXX_Unmarshal(b []byte) error { + return xxx_messageInfo_Container.Unmarshal(m, b) +} + +// XXX_Marshal +func (m *Container) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + return xxx_messageInfo_Container.Marshal(b, m, deterministic) +} + +// XXX_Merge +func (m *Container) XXX_Merge(src proto.Message) { + xxx_messageInfo_Container.Merge(m, src) +} + +// XXX_Size +func (m *Container) XXX_Size() int { + return xxx_messageInfo_Container.Size(m) +} + +// XXX_DiscardUnknown +func (m *Container) XXX_DiscardUnknown() { + xxx_messageInfo_Container.DiscardUnknown(m) +} + +var xxx_messageInfo_Container proto.InternalMessageInfo + +// GetId +func (m *Container) GetId() string { + if m != nil { + return m.Id + } + return "" +} + +// GetLabels +func (m *Container) GetLabels() map[string]string { + if m != nil { + return m.Labels + } + return nil +} 
+ +// GetImage +func (m *Container) GetImage() string { + if m != nil { + return m.Image + } + return "" +} + +// GetSpec +func (m *Container) GetSpec() *any.Any { + if m != nil { + return m.Spec + } + return nil +} + +type GetContainerRequest struct { + Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` +} + +func (m *GetContainerRequest) Reset() { *m = GetContainerRequest{} } +func (m *GetContainerRequest) String() string { return proto.CompactTextString(m) } +func (*GetContainerRequest) ProtoMessage() {} +func (*GetContainerRequest) Descriptor() ([]byte, []int) { + return fileDescriptor_29bcc067d8d1b7d0, []int{1} +} + +func (m *GetContainerRequest) XXX_Unmarshal(b []byte) error { + return xxx_messageInfo_GetContainerRequest.Unmarshal(m, b) +} +func (m *GetContainerRequest) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + return xxx_messageInfo_GetContainerRequest.Marshal(b, m, deterministic) +} +func (m *GetContainerRequest) XXX_Merge(src proto.Message) { + xxx_messageInfo_GetContainerRequest.Merge(m, src) +} +func (m *GetContainerRequest) XXX_Size() int { + return xxx_messageInfo_GetContainerRequest.Size(m) +} +func (m *GetContainerRequest) XXX_DiscardUnknown() { + xxx_messageInfo_GetContainerRequest.DiscardUnknown(m) +} + +var xxx_messageInfo_GetContainerRequest proto.InternalMessageInfo + +func (m *GetContainerRequest) GetId() string { + if m != nil { + return m.Id + } + return "" +} + +type GetContainerResponse struct { + Container *Container `protobuf:"bytes,1,opt,name=container,proto3" json:"container,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` +} + +func (m *GetContainerResponse) Reset() { *m = GetContainerResponse{} } +func (m *GetContainerResponse) String() string { return proto.CompactTextString(m) } +func (*GetContainerResponse) 
ProtoMessage() {} +func (*GetContainerResponse) Descriptor() ([]byte, []int) { + return fileDescriptor_29bcc067d8d1b7d0, []int{2} +} + +func (m *GetContainerResponse) XXX_Unmarshal(b []byte) error { + return xxx_messageInfo_GetContainerResponse.Unmarshal(m, b) +} +func (m *GetContainerResponse) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + return xxx_messageInfo_GetContainerResponse.Marshal(b, m, deterministic) +} +func (m *GetContainerResponse) XXX_Merge(src proto.Message) { + xxx_messageInfo_GetContainerResponse.Merge(m, src) +} +func (m *GetContainerResponse) XXX_Size() int { + return xxx_messageInfo_GetContainerResponse.Size(m) +} +func (m *GetContainerResponse) XXX_DiscardUnknown() { + xxx_messageInfo_GetContainerResponse.DiscardUnknown(m) +} + +var xxx_messageInfo_GetContainerResponse proto.InternalMessageInfo + +func (m *GetContainerResponse) GetContainer() *Container { + if m != nil { + return m.Container + } + return nil +} + +func init() { + proto.RegisterType((*Container)(nil), "containerd.services.containers.v1.Container") + proto.RegisterMapType((map[string]string)(nil), "containerd.services.containers.v1.Container.LabelsEntry") + proto.RegisterType((*GetContainerRequest)(nil), "containerd.services.containers.v1.GetContainerRequest") + proto.RegisterType((*GetContainerResponse)(nil), "containerd.services.containers.v1.GetContainerResponse") +} + +func init() { + proto.RegisterFile("containerd.proto", fileDescriptor_29bcc067d8d1b7d0) +} + +var fileDescriptor_29bcc067d8d1b7d0 = []byte{ + // 327 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x94, 0x51, 0x4f, 0x4b, 0xc3, 0x30, + 0x14, 0xa7, 0xad, 0x1b, 0xec, 0x15, 0x64, 0xc4, 0x1d, 0xea, 0x4e, 0x73, 0x20, 0xf4, 0xa0, 0xa9, + 0xab, 0xa0, 0x53, 0x4f, 0x2a, 0x32, 0x10, 0x0f, 0xd2, 0xa3, 0xb7, 0xb6, 0x7b, 0xce, 0x62, 0x96, + 0xd4, 0x24, 0xad, 0xf6, 0xee, 0x87, 0xf5, 0x63, 0xc8, 0xd2, 0xad, 0x4e, 0x11, 0x74, 0xb7, 0xf7, + 0x5e, 0x7f, 0x7f, 
0x1b, 0xe8, 0xa6, 0x82, 0xeb, 0x38, 0xe3, 0x28, 0xa7, 0x34, 0x97, 0x42, 0x0b, + 0xb2, 0xb7, 0x76, 0x51, 0x28, 0xcb, 0x2c, 0x45, 0x45, 0x9b, 0x9b, 0xa2, 0xe5, 0xa8, 0xbf, 0x3b, + 0x13, 0x62, 0xc6, 0x30, 0x30, 0x84, 0xa4, 0x78, 0x0c, 0x62, 0x5e, 0xd5, 0xec, 0xe1, 0x87, 0x05, + 0x9d, 0xeb, 0x15, 0x98, 0x6c, 0x83, 0x9d, 0x4d, 0x3d, 0x6b, 0x60, 0xf9, 0x9d, 0xc8, 0xce, 0xa6, + 0xe4, 0x1e, 0xda, 0x2c, 0x4e, 0x90, 0x29, 0xcf, 0x1e, 0x38, 0xbe, 0x1b, 0x8e, 0xe9, 0x9f, 0x66, + 0xb4, 0x51, 0xa3, 0x77, 0x86, 0x7a, 0xc3, 0xb5, 0xac, 0xa2, 0xa5, 0x0e, 0xe9, 0x41, 0x2b, 0x9b, + 0xc7, 0x33, 0xf4, 0x1c, 0x63, 0x52, 0x2f, 0xc4, 0x87, 0x2d, 0x95, 0x63, 0xea, 0xb5, 0x06, 0x96, + 0xef, 0x86, 0x3d, 0x5a, 0xe7, 0xa5, 0xab, 0xbc, 0xf4, 0x92, 0x57, 0x91, 0x41, 0xf4, 0xcf, 0xc0, + 0x5d, 0x93, 0x25, 0x5d, 0x70, 0x9e, 0xb1, 0x5a, 0x26, 0x5e, 0x8c, 0x0b, 0x83, 0x32, 0x66, 0x05, + 0x7a, 0x76, 0x6d, 0x60, 0x96, 0x73, 0x7b, 0x6c, 0x0d, 0xf7, 0x61, 0x67, 0x82, 0xba, 0x89, 0x17, + 0xe1, 0x4b, 0x81, 0x4a, 0xff, 0xec, 0x3c, 0x4c, 0xa0, 0xf7, 0x1d, 0xa6, 0x72, 0xc1, 0x15, 0x92, + 0x5b, 0xe8, 0x34, 0x45, 0x0d, 0xdc, 0x0d, 0x0f, 0x36, 0xf9, 0x1d, 0xd1, 0x17, 0x3d, 0x7c, 0xb7, + 0x00, 0x9a, 0x0f, 0x8a, 0x94, 0xe0, 0x4c, 0x50, 0x93, 0x93, 0x7f, 0xc8, 0xfd, 0xd2, 0xa0, 0x7f, + 0xba, 0x31, 0xaf, 0xae, 0x74, 0x75, 0xf4, 0x40, 0x9f, 0x8a, 0xf8, 0x15, 0x33, 0x9a, 0x8a, 0x79, + 0xc0, 0xf3, 0xe2, 0x10, 0xdf, 0x72, 0x21, 0x35, 0xca, 0x20, 0x15, 0x8c, 0x61, 0xaa, 0xc5, 0x62, + 0x5a, 0xd2, 0x2e, 0xca, 0x51, 0xd2, 0x36, 0x4f, 0x72, 0xfc, 0x19, 0x00, 0x00, 0xff, 0xff, 0x30, + 0xcc, 0x1c, 0x74, 0x87, 0x02, 0x00, 0x00, +} + +// Reference imports to suppress errors if they are not otherwise used. +var _ context.Context +var _ grpc.ClientConnInterface + +// This is a compile-time assertion to ensure that this generated file +// is compatible with the grpc package it is being compiled against. +const _ = grpc.SupportPackageIsVersion6 + +// ContainersClient is the client API for Containers service. 
+// +// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://godoc.org/google.golang.org/grpc#ClientConn.NewStream. +type ContainersClient interface { + Get(ctx context.Context, in *GetContainerRequest, opts ...grpc.CallOption) (*GetContainerResponse, error) +} + +type containersClient struct { + cc grpc.ClientConnInterface +} + +func NewContainersClient(cc grpc.ClientConnInterface) ContainersClient { + return &containersClient{cc} +} + +func (c *containersClient) Get(ctx context.Context, in *GetContainerRequest, opts ...grpc.CallOption) (*GetContainerResponse, error) { + out := new(GetContainerResponse) + err := c.cc.Invoke(ctx, "/containerd.services.containers.v1.Containers/Get", in, out, opts...) + if err != nil { + return nil, err + } + return out, nil +} + +// ContainersServer is the server API for Containers service. +type ContainersServer interface { + Get(context.Context, *GetContainerRequest) (*GetContainerResponse, error) +} + +// UnimplementedContainersServer can be embedded to have forward compatible implementations. 
+type UnimplementedContainersServer struct { +} + +func (*UnimplementedContainersServer) Get(context.Context, *GetContainerRequest) (*GetContainerResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method Get not implemented") +} + +func RegisterContainersServer(s *grpc.Server, srv ContainersServer) { + s.RegisterService(&_Containers_desc, srv) +} + +func _Containers_Get_Method(srv interface{}, ctx context.Context, desc func(interface{}) error, itcpt grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(GetContainerRequest) + if err := desc(in); err != nil { + return nil, err + } + if itcpt == nil { + return srv.(ContainersServer).Get(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/containerd.services.containers.v1.Containers/Get", + } + handler := func(ctx context.Context, request interface{}) (interface{}, error) { + return srv.(ContainersServer).Get(ctx, request.(*GetContainerRequest)) + } + return itcpt(ctx, in, info, handler) +} + +var _Containers_desc = grpc.ServiceDesc{ + ServiceName: "containerd.services.containers.v1.Containers", + HandlerType: (*ContainersServer)(nil), + Methods: []grpc.MethodDesc{ + { + MethodName: "Get", + Handler: _Containers_Get_Method, + }, + }, + Streams: []grpc.StreamDesc{}, + Metadata: "containerd.proto", +} diff --git a/mind-cluster/component/npu-exporter/collector/container/v1/containerd.proto b/mind-cluster/component/npu-exporter/collector/container/v1/containerd.proto new file mode 100644 index 0000000..48a4a4b --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/container/v1/containerd.proto @@ -0,0 +1,62 @@ +syntax = "proto3"; + +package containerd.services.containers.v1; + + +import "google/protobuf/any.proto"; +import "google/protobuf/timestamp.proto"; + +option go_package = "huawei.com/npu-exporter/v6/collector/container;v1"; + +// Containers provides metadata storage for containers used in the execution +// service. 
+service Containers {
+    rpc Get(GetContainerRequest) returns (GetContainerResponse);
+}
+
+message Container {
+    // ID is the user-specified identifier.
+    string id = 1;
+
+    // Labels provides an area to include arbitrary data on containers.
+    map<string, string> labels = 2;
+
+    // Image contains the reference of the image used to build the container.
+    string image = 3;
+
+    message Runtime {
+        // Name is the name of the runtime.
+        string name = 1;
+        // Options runtime initialization options.
+        google.protobuf.Any options = 2;
+    }
+    // Runtime specifies runtime.
+    Runtime runtime = 4;
+
+    // Spec is the runtime-specific container spec.
+    google.protobuf.Any spec = 5;
+
+    // Snapshotter is the snapshotter name used for rootfs
+    string snapshotter = 6;
+
+    // SnapshotKey the snapshot key to use for the container's root filesystem.
+    string snapshot_key = 7;
+
+    // CreatedAt is the create time of container.
+    google.protobuf.Timestamp created_at = 8 ;
+
+    // UpdatedAt is the last update of container.
+    google.protobuf.Timestamp updated_at = 9 ;
+
+    // Extensions allow clients to attach zero or more blobs directly to the container.
+    map<string, google.protobuf.Any> extensions = 10 ;
+}
+
+message GetContainerRequest {
+    string id = 1;
+}
+
+message GetContainerResponse {
+    Container container = 1 ;
+}
+
diff --git a/mind-cluster/component/npu-exporter/collector/container/v1/spec.go b/mind-cluster/component/npu-exporter/collector/container/v1/spec.go
new file mode 100644
index 0000000..2efa216
--- /dev/null
+++ b/mind-cluster/component/npu-exporter/collector/container/v1/spec.go
@@ -0,0 +1,59 @@
+/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+// Package v1 implement the containerd client
+package v1
+
+// Spec is the base configuration for the container.
+type Spec struct {
+	// Linux is platform-specific configuration for Linux based containers.
+	Linux *Linux `json:"linux,omitempty" platform:"linux"`
+	// Process holds the process settings of the container (only Env is declared here).
+	Process *Process `json:"process,omitempty" platform:"linux"`
+}
+
+// Process is the subset of the container process configuration declared by this package.
+type Process struct {
+	// Env is the "env" string list of the container process.
+	Env []string `json:"env,omitempty" platform:"linux"`
+}
+
+// Linux contains platform-specific configuration for Linux based containers.
+type Linux struct {
+	// Resources contain cgroup information for handling resource constraints
+	// for the container
+	Resources *LinuxResources `json:"resources,omitempty"`
+	// NOTE(review): no Devices field follows; device nodes are not decoded by this type.
+}
+
+// LinuxResources has container runtime resource constraints
+type LinuxResources struct {
+	// Devices configures the device allowlist.
+	Devices []LinuxDeviceCgroup `json:"devices,omitempty"`
+}
+
+// LinuxDeviceCgroup represents a device rule for the devices specified to
+// the device controller
+type LinuxDeviceCgroup struct {
+	// Allow or deny
+	Allow bool `json:"allow"`
+	// Device type, block, char, etc.
+	Type string `json:"type,omitempty"`
+	// Major is the device's major number.
+	Major *int64 `json:"major,omitempty"`
+	// Minor is the device's minor number.
+	Minor *int64 `json:"minor,omitempty"`
+	// Cgroup access permissions format, rwm.
+	Access string `json:"access,omitempty"`
+}
diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_ddr.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_ddr.go
new file mode 100644
index 0000000..53a7645
--- /dev/null
+++ b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_ddr.go
@@ -0,0 +1,142 @@
+/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+// Package metrics for general collector
+package metrics
+
+import (
+	"time"
+
+	"github.com/prometheus/client_golang/prometheus"
+
+	"ascend-common/api"
+	"ascend-common/common-utils/hwlog"
+	"ascend-common/devmanager/common"
+	colcommon "huawei.com/npu-exporter/v6/collector/common"
+	"huawei.com/npu-exporter/v6/collector/container"
+	"huawei.com/npu-exporter/v6/utils/logger"
+)
+
+var (
+	descTotalMemory = colcommon.BuildDesc("npu_chip_info_total_memory", "the npu total memory")
+	descUsedMemory  = colcommon.BuildDesc("npu_chip_info_used_memory", "the npu used memory")
+
+	notSupportedDdrDevices = map[string]bool{
+		api.Ascend910B:  true,
+		api.Ascend910A3: true,
+	}
+)
+
+type ddrCache struct {
+	chip      colcommon.HuaWeiAIChip
+	timestamp time.Time
+	// extInfo the memoryInfo of the chip
+	extInfo *common.MemoryInfo
+}
+
+// DdrCollector collects the DDR memory information of the chips
+type DdrCollector struct {
+	colcommon.MetricsCollectorAdapter
+}
+
+// IsSupported reports whether DDR metrics are available for the current device type
+func (c *DdrCollector) IsSupported(n *colcommon.NpuCollector) bool {
+	isSupport := !notSupportedDdrDevices[n.Dmgr.GetDevType()]
+	logForUnSupportDevice(isSupport, n.Dmgr.GetDevType(), colcommon.GetCacheKey(c),
+		"there is no DDR module. DDR information cannot be queried.")
+	return isSupport
+}
+
+// Describe sends the total and used memory descriptors to ch
+func (c *DdrCollector) Describe(ch chan<- *prometheus.Desc) {
+	ch <- descTotalMemory
+	ch <- descUsedMemory
+}
+
+// CollectToCache queries the memory info of each chip and stores it in the collector cache
+func (c *DdrCollector) CollectToCache(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) {
+
+	for _, chip := range chipList {
+		logicID := chip.LogicID
+		mem, err := n.Dmgr.GetDeviceMemoryInfo(logicID)
+		if err != nil {
+			logErrMetricsWithLimit(colcommon.DomainForDDR, logicID, err)
+			continue
+		}
+		hwlog.ResetErrCnt(colcommon.DomainForDDR, logicID)
+
+		c.LocalCache.Store(chip.PhyId, ddrCache{chip: chip, timestamp: time.Now(), extInfo: mem})
+	}
+	colcommon.UpdateCache[ddrCache](n, colcommon.GetCacheKey(c), &c.LocalCache)
+
+}
+
+// UpdatePrometheus emits the total and used memory metrics built from the cached data
+func (c *DdrCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector,
+	containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) {
+
+	updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache ddrCache, cardLabel []string) {
+		extInfo := cache.extInfo
+		if extInfo == nil {
+			return
+		}
+		memorySize := extInfo.MemorySize
+		memoryAvailable := extInfo.MemoryAvailable
+
+		doUpdateMetric(ch, cache.timestamp, memorySize, cardLabel, descTotalMemory)
+		doUpdateMetric(ch, cache.timestamp, memorySize-memoryAvailable, cardLabel, descUsedMemory)
+
+		// the container-level metrics below are skipped for a valid vNPU device
+		vDevActivityInfo := chipWithVnpu.VDevActivityInfo
+		if vDevActivityInfo != nil && common.IsValidVDevID(vDevActivityInfo.VDevID) {
+			return
+		}
+
+		containerNameArray := getContainerNameArray(geenContainerInfo(&chipWithVnpu, containerMap))
+		if !c.Is910Series && len(containerNameArray) == colcommon.ContainerNameLen {
+			doUpdateMetric(ch, cache.timestamp, memorySize, cardLabel, npuCtrTotalMemory)
+			doUpdateMetric(ch, cache.timestamp, memorySize-memoryAvailable, cardLabel, npuCtrUsedMemory)
+		}
+	}
+
+	updateFrame[ddrCache](colcommon.GetCacheKey(c), n, containerMap, chips, updateSingleChip)
+}
+
+// UpdateTelegraf writes the cached total and used memory into fieldsMap and returns it
+func (c *DdrCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *colcommon.NpuCollector,
+	containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} {
+
+	caches := colcommon.GetInfoFromCache[ddrCache](n, colcommon.GetCacheKey(c))
+	for _, chip := range chips {
+		cache, ok := caches[chip.PhyId]
+		if !ok {
+			logger.Debugf("cacheKey(%v) not found", chip.PhyId)
+			continue
+		}
+		fieldMap := getFieldMap(fieldsMap, cache.chip.LogicID)
+
+		memoryInfo := cache.extInfo
+		if memoryInfo == nil {
+			logger.Debugf("info in cache is nil,cacheKey(%v)", chip.PhyId)
+			continue
+		}
+		memorySize := memoryInfo.MemorySize
+		memoryAvailable := memoryInfo.MemoryAvailable
+
+		doUpdateTelegraf(fieldMap, descTotalMemory, memorySize, "")
+		doUpdateTelegraf(fieldMap, descUsedMemory, memorySize-memoryAvailable, "")
+
+	}
+	return fieldsMap
+}
diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hbm.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hbm.go
new file mode 100644
index 0000000..d9f5601
--- /dev/null
+++ b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hbm.go
@@ -0,0 +1,228 @@
+/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package metrics for general collector +package metrics + +import ( + "time" + + "github.com/prometheus/client_golang/prometheus" + + "ascend-common/api" + "ascend-common/devmanager" + "ascend-common/devmanager/common" + colcommon "huawei.com/npu-exporter/v6/collector/common" + "huawei.com/npu-exporter/v6/collector/container" +) + +var ( + descHbmUsedMemory = colcommon.BuildDesc("npu_chip_info_hbm_used_memory", "the npu hbm used memory") + descHbmTotalMemory = colcommon.BuildDesc("npu_chip_info_hbm_total_memory", "the npu hbm total memory") + descHbmUtilization = colcommon.BuildDesc("npu_chip_info_hbm_utilization", "the npu hbm utilization") + descHbmTemperature = colcommon.BuildDesc("npu_chip_info_hbm_temperature", "the npu hbm temperature") + descHbmBWUtil = colcommon.BuildDesc("npu_chip_info_hbm_bandwidth_utilization", "the npu hbm bandwidth util rate") + + descEccEnableFlag = colcommon.BuildDesc("npu_chip_info_hbm_ecc_enable_flag", + "whether HBM ecc detection is enabled") + descEccSingleBitErrorCnt = colcommon.BuildDesc("npu_chip_info_hbm_ecc_single_bit_error_cnt", + "HBM Single Bit Error Count") + descEccDoubleBitErrorCnt = colcommon.BuildDesc("npu_chip_info_hbm_ecc_double_bit_error_cnt", + "HBM Double Bit Error Count") + + descEccTotalSingleBitErrorCnt = colcommon.BuildDesc("npu_chip_info_hbm_ecc_total_single_bit_error_cnt", + "HBM Single Bit Aggregate Total Err Cnt") + descEccTotalDoubleBitErrorCnt = colcommon.BuildDesc("npu_chip_info_hbm_ecc_total_double_bit_error_cnt", + "HBM Double Bit Aggregate Total Err Cnt") + descEccSingleBitIoslatedPagesCnt
= colcommon.BuildDesc("npu_chip_info_hbm_ecc_single_bit_isolated_pages_cnt", + "HBM Single Bit Isolated Pages Count") + descEccDoubleBitIoslatedPagesCnt = colcommon.BuildDesc("npu_chip_info_hbm_ecc_double_bit_isolated_pages_cnt", + "HBM Double Bit Isolated Pages Count") +) + +var ( + supportedHbmDevices = map[string]bool{ + api.Ascend910A: true, + api.Ascend910B: true, + api.Ascend910A3: true, + } +) + +type hbmCache struct { + chip colcommon.HuaWeiAIChip + timestamp time.Time + // extInfo holds the hbm info and the hbm ecc info of the chip + extInfo *common.HbmAggregateInfo + // hbmUtilization is the utilization rate queried with common.HbmUtilization + hbmUtilization uint32 +} + +// HbmCollector collects hbm info, hbm utilization and hbm ecc info +type HbmCollector struct { + colcommon.MetricsCollectorAdapter +} + +// IsSupported reports whether the device type is listed in supportedHbmDevices; the result is also handed to logForUnSupportDevice +func (c *HbmCollector) IsSupported(n *colcommon.NpuCollector) bool { + isSupport := supportedHbmDevices[n.Dmgr.GetDevType()] + logForUnSupportDevice(isSupport, n.Dmgr.GetDevType(), colcommon.GetCacheKey(c), "") + return isSupport +} + +// Describe describes all the metrics that will be exposed. +func (c *HbmCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- descHbmUsedMemory + ch <- descHbmTotalMemory + ch <- descHbmUtilization + ch <- descHbmTemperature + ch <- descHbmBWUtil + + ch <- descEccEnableFlag + ch <- descEccSingleBitErrorCnt + ch <- descEccDoubleBitErrorCnt + ch <- descEccTotalSingleBitErrorCnt + ch <- descEccTotalDoubleBitErrorCnt + ch <- descEccSingleBitIoslatedPagesCnt + ch <- descEccDoubleBitIoslatedPagesCnt +} + +// CollectToCache stores the hbm data of every chip in chipList into the local cache, then calls colcommon.UpdateCache with the collector's cache key +func (c *HbmCollector) CollectToCache(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) { + for _, chip := range chipList { + getAllHBMEccInfo(c, chip.LogicID, n.Dmgr, &chip) + } + colcommon.UpdateCache[hbmCache](n, colcommon.GetCacheKey(c), &c.LocalCache) +} + +// UpdatePrometheus updates the prometheus metrics.
+func (c *HbmCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) { + + updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache hbmCache, cardLabel []string) { + extInfo := cache.extInfo + if extInfo == nil { + return + } + timestamp := cache.timestamp + doUpdateMetricWithValidateNum(ch, timestamp, float64(cache.hbmUtilization), cardLabel, descHbmUtilization) + + c.updateHbmInfo(ch, cache, cardLabel, containerMap, chipWithVnpu) + + eccInfo := extInfo.ECCInfo + updateHbmEccInfo(ch, eccInfo, timestamp, cardLabel) + } + + updateFrame[hbmCache](colcommon.GetCacheKey(c), n, containerMap, chips, updateSingleChip) +} + +// UpdateTelegraf writes the cached hbm utilization, hbm info and ecc info of each chip into fieldsMap and returns it. +func (c *HbmCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *colcommon.NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} { + caches := colcommon.GetInfoFromCache[hbmCache](n, colcommon.GetCacheKey(c)) + for _, chip := range chips { + cache, ok := caches[chip.PhyId] + if !ok { + continue + } + fieldMap := getFieldMap(fieldsMap, cache.chip.LogicID) + + extInfo := cache.extInfo + if extInfo == nil { + continue + } + + doUpdateTelegrafWithValidateNum(fieldMap, descHbmUtilization, float64(cache.hbmUtilization), "") + + hbmInfo := extInfo.HbmInfo + if hbmInfo != nil { + doUpdateTelegraf(fieldMap, descHbmUsedMemory, hbmInfo.Usage, "") + doUpdateTelegraf(fieldMap, descHbmTotalMemory, hbmInfo.MemorySize, "") + doUpdateTelegraf(fieldMap, descHbmTemperature, hbmInfo.Temp, "") + doUpdateTelegraf(fieldMap, descHbmBWUtil, hbmInfo.BandWidthUtilRate, "") + } + + eccInfo := extInfo.ECCInfo + if eccInfo != nil { + doUpdateTelegraf(fieldMap, descEccEnableFlag, eccInfo.EnableFlag, "") + doUpdateTelegraf(fieldMap, descEccSingleBitErrorCnt, eccInfo.SingleBitErrorCnt, "") + doUpdateTelegraf(fieldMap,
descEccDoubleBitErrorCnt, eccInfo.DoubleBitErrorCnt, "") + doUpdateTelegraf(fieldMap, descEccTotalSingleBitErrorCnt, eccInfo.TotalSingleBitErrorCnt, "") + doUpdateTelegraf(fieldMap, descEccTotalDoubleBitErrorCnt, eccInfo.TotalDoubleBitErrorCnt, "") + doUpdateTelegraf(fieldMap, descEccSingleBitIoslatedPagesCnt, eccInfo.SingleBitIsolatedPagesCnt, "") + doUpdateTelegraf(fieldMap, descEccDoubleBitIoslatedPagesCnt, eccInfo.DoubleBitIsolatedPagesCnt, "") + + } + } + return fieldsMap + +} + +func getAllHBMEccInfo(c *HbmCollector, logicID int32, dmgr devmanager.DeviceInterface, chip *colcommon.HuaWeiAIChip) { + // query hbm info, hbm utilization and hbm ecc info; every error goes to handleErr and the result is cached regardless + hbmInfo := &common.HbmAggregateInfo{} + var utilizationRate uint32 + var err error + hbmInfo.HbmInfo, err = dmgr.GetDeviceHbmInfo(logicID) + handleErr(err, colcommon.DomainForHBM, logicID) + + utilizationRate, err = dmgr.GetDeviceUtilizationRate(logicID, common.HbmUtilization) + handleErr(err, colcommon.DomainForHbmUtilization, logicID) + + hbmInfo.ECCInfo, err = dmgr.GetDeviceEccInfo(logicID, common.DcmiDeviceTypeHBM) + handleErr(err, colcommon.DomainForHBMECC, logicID) + c.LocalCache.Store(chip.PhyId, hbmCache{ + chip: *chip, + timestamp: time.Now(), + extInfo: hbmInfo, + hbmUtilization: utilizationRate}, + ) +} + +func updateHbmEccInfo(ch chan<- prometheus.Metric, eccInfo *common.ECCInfo, timestamp time.Time, cardLabel []string) { + if eccInfo == nil { + return + } + doUpdateMetric(ch, timestamp, eccInfo.EnableFlag, cardLabel, descEccEnableFlag) + doUpdateMetric(ch, timestamp, eccInfo.SingleBitErrorCnt, cardLabel, descEccSingleBitErrorCnt) + doUpdateMetric(ch, timestamp, eccInfo.DoubleBitErrorCnt, cardLabel, descEccDoubleBitErrorCnt) + doUpdateMetric(ch, timestamp, eccInfo.TotalSingleBitErrorCnt, cardLabel, descEccTotalSingleBitErrorCnt) + doUpdateMetric(ch, timestamp, eccInfo.TotalDoubleBitErrorCnt, cardLabel, descEccTotalDoubleBitErrorCnt) + doUpdateMetric(ch, timestamp, eccInfo.SingleBitIsolatedPagesCnt, cardLabel, descEccSingleBitIoslatedPagesCnt) +
doUpdateMetric(ch, timestamp, eccInfo.DoubleBitIsolatedPagesCnt, cardLabel, descEccDoubleBitIoslatedPagesCnt) +} + +func (c *HbmCollector) updateHbmInfo(ch chan<- prometheus.Metric, cache hbmCache, cardLabel []string, + containerMap map[int32]container.DevicesInfo, chipWithVnpu colcommon.HuaWeiAIChip) { + hbmInfo := cache.extInfo + if hbmInfo == nil || hbmInfo.HbmInfo == nil { + return + } + timestamp := cache.timestamp + doUpdateMetric(ch, timestamp, hbmInfo.Usage, cardLabel, descHbmUsedMemory) + doUpdateMetric(ch, timestamp, hbmInfo.MemorySize, cardLabel, descHbmTotalMemory) + doUpdateMetric(ch, timestamp, hbmInfo.Temp, cardLabel, descHbmTemperature) + doUpdateMetric(ch, timestamp, hbmInfo.BandWidthUtilRate, cardLabel, descHbmBWUtil) + + // the container-level metrics below are not reported for a valid vnpu device + vDevActivityInfo := chipWithVnpu.VDevActivityInfo + if vDevActivityInfo != nil && common.IsValidVDevID(vDevActivityInfo.VDevID) { + return + } + + containerNameArray := getContainerNameArray(geenContainerInfo(&chipWithVnpu, containerMap)) + if c.Is910Series && len(containerNameArray) == colcommon.ContainerNameLen { + doUpdateMetric(ch, timestamp, hbmInfo.MemorySize, cardLabel, npuCtrTotalMemory) + doUpdateMetric(ch, timestamp, hbmInfo.Usage, cardLabel, npuCtrUsedMemory) + } +} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hbm_test.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hbm_test.go new file mode 100644 index 0000000..4bf59cd --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hbm_test.go @@ -0,0 +1,115 @@ +/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package metrics for general collector +package metrics + +import ( + "testing" + "time" + + "github.com/agiledragon/gomonkey/v2" + "github.com/prometheus/client_golang/prometheus" + "github.com/smartystreets/goconvey/convey" + + "ascend-common/devmanager/common" + colcommon "huawei.com/npu-exporter/v6/collector/common" +) + +type TestCase struct { + name string + initFunc func() + expectMetricLen int +} + +const ( + expectMetricLen4 = 4 + expectMetricLen6 = 6 + vdevId = 132 + maxMetrics = 10 + mockNs = "mockNs" + mockPodName = "mockPodName" +) + +func TestUpdateHbmInfo(t *testing.T) { + collector := HbmCollector{} + ch := make(chan int, maxMetrics) + defer close(ch) + cache := buildHbmCache() + chipWithVnpu := &colcommon.HuaWeiAIChip{} + cases := buildTestCases(&collector, chipWithVnpu, &cache) + patch := gomonkey.NewPatches() + patch.ApplyFunc(doUpdateMetric, func(_ chan<- prometheus.Metric, _ time.Time, _ interface{}, _ []string, + desc *prometheus.Desc) { + ch <- 0 + }) + patch.ApplyFuncReturn(geenContainerInfo, nil) + patch.ApplyFuncReturn(getContainerNameArray, []string{mockNs, mockPodName, mockContainerName}) + defer patch.Reset() + + for _, c := range cases { + convey.Convey(c.name, t, func() { + ch = make(chan int, maxMetrics) + c.initFunc() + collector.updateHbmInfo(nil, cache, nil, nil, *chipWithVnpu) + convey.So(len(ch), convey.ShouldEqual, c.expectMetricLen) + }) + } +} + +func buildTestCases(collector *HbmCollector, chipWithVnpu *colcommon.HuaWeiAIChip, cache *hbmCache) []TestCase { + cases := []TestCase{ + {name: "when npu is not 910 series 
", initFunc: func() {}, expectMetricLen: expectMetricLen4}, + {name: "when vnpu is nil and with container info", initFunc: func() { + collector.Is910Series = true + }, expectMetricLen: expectMetricLen6}, + {name: "when chip is vnpu", initFunc: func() { + chipWithVnpu.VDevActivityInfo = &common.VDevActivityInfo{ + VDevID: vdevId, + } + }, expectMetricLen: expectMetricLen4}, + {name: "when extInfo.HbmInfo is nil", initFunc: func() { cache.extInfo.HbmInfo = nil }, expectMetricLen: 0}, + {name: "when extInfo is nil", initFunc: func() { cache.extInfo = nil }, expectMetricLen: 0}, + } + return cases +} + +func buildHbmCache() hbmCache { + cache := hbmCache{ + chip: colcommon.HuaWeiAIChip{ + PhyId: 0, + }, + hbmUtilization: 0, + timestamp: time.Now(), + extInfo: &common.HbmAggregateInfo{ + HbmInfo: &common.HbmInfo{ + BandWidthUtilRate: 0, + Frequency: 0, + MemorySize: 0, + Temp: 0, + Usage: 0, + }, + ECCInfo: &common.ECCInfo{ + EnableFlag: 0, + SingleBitErrorCnt: 0, + DoubleBitErrorCnt: 0, + TotalSingleBitErrorCnt: 0, + TotalDoubleBitErrorCnt: 0, + SingleBitIsolatedPagesCnt: 0, + DoubleBitIsolatedPagesCnt: 0, + }, + }, + } + return cache +} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hccs.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hccs.go new file mode 100644 index 0000000..1ecc3a9 --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hccs.go @@ -0,0 +1,312 @@ +/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package metrics for general collector +package metrics + +import ( + "fmt" + "strconv" + "time" + + "github.com/prometheus/client_golang/prometheus" + + "ascend-common/api" + "ascend-common/devmanager/common" + colcommon "huawei.com/npu-exporter/v6/collector/common" + "huawei.com/npu-exporter/v6/collector/container" + "huawei.com/npu-exporter/v6/utils/logger" +) + +var ( + hccsTxDescs []*prometheus.Desc + hccsRxDescs []*prometheus.Desc + hccsErrDescs []*prometheus.Desc + hccsBWTxDescs []*prometheus.Desc + hccsBWRxDescs []*prometheus.Desc + hccsBWProfilingTime *prometheus.Desc = nil + hccsBWTotalTx *prometheus.Desc = nil + hccsBWTotalRx *prometheus.Desc = nil + + supportedHccsDevices = map[string]bool{ + api.Ascend910B: true, + api.Ascend910A3: true, + } +) + +const ( + // MaxHccsNum is the max hccs num; it sizes the per-link descriptor slices and bounds every per-link loop in this file + MaxHccsNum int = 8 + // hccs info begin index, 1 or 2 + num1 = 1 + num2 = 2 +) + +// init builds one descriptor per link index below MaxHccsNum plus the three aggregate bandwidth descriptors +func init() { + for i := 0; i < MaxHccsNum; i++ { + index := strconv.Itoa(i) + colcommon.BuildDescSlice(&hccsTxDescs, api.Prefix+"tx_cnt_"+index, + "transmitted message count for "+api.Hccs+" "+index) + colcommon.BuildDescSlice(&hccsRxDescs, api.Prefix+"rx_cnt_"+index, + "received message count for "+api.Hccs+" "+index) + colcommon.BuildDescSlice(&hccsErrDescs, api.Prefix+"crc_err_cnt_"+index, + "crc error count for "+api.Hccs+" "+index) + colcommon.BuildDescSlice(&hccsBWTxDescs, api.BwPrefix+"tx_"+index, + "single-link transmission data bandwidth for "+api.Hccs+" "+index) + colcommon.BuildDescSlice(&hccsBWRxDescs, api.BwPrefix+"rx_"+index, + "single-link receive data bandwidth for "+api.Hccs+" "+index) + } + hccsBWProfilingTime = colcommon.BuildDesc(api.BwPrefix+"profiling_time", + "sampling interval for "+api.Hccs+" bandwidth") + hccsBWTotalTx = colcommon.BuildDesc(api.BwPrefix+"total_tx", "total sent data bandwidth") + hccsBWTotalRx =
colcommon.BuildDesc(api.BwPrefix+"total_rx", "total received data bandwidth") +} + +type hccsCache struct { + chip colcommon.HuaWeiAIChip + timestamp time.Time + // hccsStat hccs info of npu chip + hccsStat *common.HccsStatisticInfo + + // hccsBW hccs bandwidth info of npu chip + hccsBW *common.HccsBandwidthInfo +} + +// HccsCollector collects hccs statistic and bandwidth info +type HccsCollector struct { + colcommon.MetricsCollectorAdapter + hccsBeginIndex int + + // realGetStatisticInfoFunc is the statistic query chosen by PreCollect; it stays nil when neither interface answered + realGetStatisticInfoFunc func(logicID int32) (*common.HccsStatisticInfo, error) +} + +// IsSupported reports whether the device type is listed in supportedHccsDevices +func (c *HccsCollector) IsSupported(n *colcommon.NpuCollector) bool { + isSupport := supportedHccsDevices[n.Dmgr.GetDevType()] + logForUnSupportDevice(isSupport, n.Dmgr.GetDevType(), colcommon.GetCacheKey(c), "") + return isSupport +} + +// Describe sends every per-link descriptor and the three aggregate bandwidth descriptors to ch +func (c *HccsCollector) Describe(ch chan<- *prometheus.Desc) { + for _, desc := range hccsTxDescs { + ch <- desc + } + for _, desc := range hccsRxDescs { + ch <- desc + } + for _, desc := range hccsErrDescs { + ch <- desc + } + for _, desc := range hccsBWTxDescs { + ch <- desc + } + for _, desc := range hccsBWRxDescs { + ch <- desc + } + ch <- hccsBWProfilingTime + ch <- hccsBWTotalTx + ch <- hccsBWTotalRx +} + +// CollectToCache caches the hccs statistic and bandwidth info of each chip; without a usable query function the statistic counters are filled with common.FailedValue +func (c *HccsCollector) CollectToCache(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) { + for _, chip := range chipList { + logicID := chip.LogicID + var hccsStatisticInfo *common.HccsStatisticInfo + var err error + if c.realGetStatisticInfoFunc != nil { + hccsStatisticInfo, err = c.realGetStatisticInfoFunc(logicID) + } else { + hccsStatisticInfo = buildFailedHccsInfo() + err = fmt.Errorf("realGetStatisticInfoFunc is nil when get hccs info, " + + "maybe both GetHccsStatisticInfoInU64 and GetHccsStatisticInfo are unavailable") + } + handleErr(err, colcommon.DomainForHccs, logicID) + +
hccsBandwidthInfo, err := n.Dmgr.GetHccsBandwidthInfo(logicID) + handleErr(err, colcommon.DomainForHccsBW, logicID) + c.LocalCache.Store(chip.PhyId, hccsCache{ + chip: chip, + timestamp: time.Now(), + hccsStat: hccsStatisticInfo, + hccsBW: hccsBandwidthInfo}, + ) + } + + colcommon.UpdateCache[hccsCache](n, colcommon.GetCacheKey(c), &c.LocalCache) +} + +// PreCollect uses the first chip of chipList to pick the first link index for the board type and to probe which statistic query interface works +func (c *HccsCollector) PreCollect(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) { + if len(chipList) == 0 { + return + } + chipOne := chipList[0] + devType := n.Dmgr.GetDevType() + if devType == api.Ascend910B || common.IsA900A3SuperPod(chipOne.MainBoardId) || + common.Is800IA3Chip(chipOne.MainBoardId) { + // A2 or A900A3 SuperPod or 800IA3 begin at 1st bit + c.hccsBeginIndex = num1 + } else if common.IsA9000A3SuperPod(chipOne.MainBoardId) { + // A9000A3SuperPod begin at 2nd bit + c.hccsBeginIndex = num2 + } else { + logger.LogfWithOptions(logger.ErrorLevel, logger.LogOptions{Domain: api.Hccs, ID: "0"}, + "not support main board id:%d", chipOne.MainBoardId) + } + + // Try both query interfaces up to retryTimes times, waiting retryInterval between attempts + const retryTimes = 3 + const retryInterval = 2 * time.Second + var success bool + var err1, err2 error + for i := 0; i < retryTimes; i++ { + _, err1 = n.Dmgr.GetHccsStatisticInfoInU64(chipOne.LogicID) + if err1 == nil { + logger.Infof("get hccs statistic info by subCmd(5) succeeded, will use subCmd(5) to get hccs info") + c.realGetStatisticInfoFunc = n.Dmgr.GetHccsStatisticInfoInU64 + success = true + break + } + _, err2 = n.Dmgr.GetHccsStatisticInfo(chipOne.LogicID) + if err2 == nil { + logger.Infof("get hccs statistic info by subCmd(3) succeeded, will use subCmd(3) to get hccs info") + c.realGetStatisticInfoFunc = n.Dmgr.GetHccsStatisticInfo + success = true + break + } + if i < retryTimes-1 { // nothing left to wait for after the final attempt + time.Sleep(retryInterval) + } + } + // If still failed after retries, set to nil and log error + if !success { + logger.Errorf("get hccs statistic info failed after trying both subCmd(5)
and subCmd(3) with %d retries, "+ + "err1: %v, err2: %v", retryTimes, err1, err2) + c.realGetStatisticInfoFunc = nil + } +} + +// UpdatePrometheus emits the cached hccs statistic and bandwidth metrics of each chip +func (c *HccsCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) { + updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache hccsCache, cardLabel []string) { + timestamp := cache.timestamp + promUpdateHccsStatisticInfo(ch, cache, c, timestamp, cardLabel) + promUpdateHccsBwInfo(ch, cache, c, timestamp, cardLabel) + } + updateFrame[hccsCache](colcommon.GetCacheKey(c), n, containerMap, chips, updateSingleChip) +} + +// promUpdateHccsBwInfo emits per-link tx/rx bandwidth from hccsBeginIndex up to MaxHccsNum, then the profiling time and total tx/rx bandwidth +func promUpdateHccsBwInfo(ch chan<- prometheus.Metric, cache hccsCache, c *HccsCollector, + timestamp time.Time, cardLabel []string) { + bandwidthInfo := cache.hccsBW + if bandwidthInfo == nil { + return + } + if c.hccsBeginIndex < 0 { + logger.Errorf("invalid %sBeginIndex %v", api.Hccs, c.hccsBeginIndex) + return + } + for i := c.hccsBeginIndex; i < MaxHccsNum; i++ { + doUpdateMetric(ch, timestamp, bandwidthInfo.TxBandwidth[i], cardLabel, hccsBWTxDescs[i]) + doUpdateMetric(ch, timestamp, bandwidthInfo.RxBandwidth[i], cardLabel, hccsBWRxDescs[i]) + } + doUpdateMetric(ch, timestamp, bandwidthInfo.ProfilingTime, cardLabel, hccsBWProfilingTime) + doUpdateMetric(ch, timestamp, bandwidthInfo.TotalTxbw, cardLabel, hccsBWTotalTx) + doUpdateMetric(ch, timestamp, bandwidthInfo.TotalRxbw, cardLabel, hccsBWTotalRx) +} + +// promUpdateHccsStatisticInfo emits per-link tx, rx and crc error counts from hccsBeginIndex up to MaxHccsNum +func promUpdateHccsStatisticInfo(ch chan<- prometheus.Metric, cache hccsCache, c *HccsCollector, + timestamp time.Time, cardLabel []string) { + statisticInfo := cache.hccsStat + if statisticInfo == nil { + return + } + if c.hccsBeginIndex < 0 { + logger.Errorf("invalid %sBeginIndex %v", api.Hccs, c.hccsBeginIndex) + return + } + for i := c.hccsBeginIndex; i < MaxHccsNum; i++ { + doUpdateMetric(ch, timestamp, statisticInfo.TxCnt[i], cardLabel, hccsTxDescs[i]) +
doUpdateMetric(ch, timestamp, statisticInfo.RxCnt[i], cardLabel, hccsRxDescs[i]) + doUpdateMetric(ch, timestamp, statisticInfo.CrcErrCnt[i], cardLabel, hccsErrDescs[i]) + } +} + +// UpdateTelegraf writes the cached hccs statistic and bandwidth values of each chip into fieldsMap and returns it +func (c *HccsCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *colcommon.NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} { + caches := colcommon.GetInfoFromCache[hccsCache](n, colcommon.GetCacheKey(c)) + for _, chip := range chips { + cache, ok := caches[chip.PhyId] + if !ok { + continue + } + fieldMap := getFieldMap(fieldsMap, cache.chip.LogicID) + + telegrafUpdateHccsStatisticInfo(cache, c, fieldMap) + telegrafUpdateHccsBwInfo(cache, c, fieldMap) + } + + return fieldsMap +} + +// telegrafUpdateHccsBwInfo writes per-link tx/rx bandwidth from hccsBeginIndex up to MaxHccsNum, then the profiling time and total tx/rx bandwidth, into fieldMap +func telegrafUpdateHccsBwInfo(cache hccsCache, c *HccsCollector, fieldMap map[string]interface{}) { + bandwidthInfo := cache.hccsBW + if bandwidthInfo == nil || c.hccsBeginIndex < 0 { + return + } + for i := c.hccsBeginIndex; i < MaxHccsNum; i++ { + doUpdateTelegraf(fieldMap, hccsBWTxDescs[i], bandwidthInfo.TxBandwidth[i], "") + doUpdateTelegraf(fieldMap, hccsBWRxDescs[i], bandwidthInfo.RxBandwidth[i], "") + } + doUpdateTelegraf(fieldMap, hccsBWProfilingTime, bandwidthInfo.ProfilingTime, "") + doUpdateTelegraf(fieldMap, hccsBWTotalTx, bandwidthInfo.TotalTxbw, "") + doUpdateTelegraf(fieldMap, hccsBWTotalRx, bandwidthInfo.TotalRxbw, "") +} + +// telegrafUpdateHccsStatisticInfo writes per-link tx, rx and crc error counts from hccsBeginIndex up to MaxHccsNum into fieldMap +func telegrafUpdateHccsStatisticInfo(cache hccsCache, c *HccsCollector, fieldMap map[string]interface{}) { + statisticInfo := cache.hccsStat + if statisticInfo == nil || c.hccsBeginIndex < 0 { + return + } + for i := c.hccsBeginIndex; i < MaxHccsNum; i++ { + doUpdateTelegraf(fieldMap, hccsTxDescs[i], statisticInfo.TxCnt[i], "") + doUpdateTelegraf(fieldMap, hccsRxDescs[i], statisticInfo.RxCnt[i], "") + doUpdateTelegraf(fieldMap, hccsErrDescs[i], statisticInfo.CrcErrCnt[i], "") + } +} + +// buildFailedHccsInfo returns statistic info whose MaxHccsNum per-link counters are all common.FailedValue +func
buildFailedHccsInfo() *common.HccsStatisticInfo { + errorResult := &common.HccsStatisticInfo{ + TxCnt: make([]uint64, MaxHccsNum), + RxCnt: make([]uint64, MaxHccsNum), + CrcErrCnt: make([]uint64, MaxHccsNum), + } + for i := 0; i < MaxHccsNum; i++ { + errorResult.TxCnt[i] = common.FailedValue + errorResult.RxCnt[i] = common.FailedValue + errorResult.CrcErrCnt[i] = common.FailedValue + } + return errorResult +} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hccs_test.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hccs_test.go new file mode 100644 index 0000000..4b596df --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hccs_test.go @@ -0,0 +1,150 @@ +/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License.
+*/ + +// Package metrics for general collector +package metrics + +import ( + "testing" + + "github.com/agiledragon/gomonkey/v2" + "github.com/smartystreets/goconvey/convey" + + "ascend-common/api" + "ascend-common/devmanager/common" + colcommon "huawei.com/npu-exporter/v6/collector/common" +) + +const ( + mockLogicID int32 = 0 + mockMainBoardId uint32 = 100 + errorMsgWith8001 string = "error code 8001 occurred" + errorMsgWithout8001 string = "error code 8002 occurred" + singleChipList int = 1 + unsupportedBoardId uint32 = 999 +) + +type preCollectTestCase struct { + name string + chipList []colcommon.HuaWeiAIChip + devType string + mainBoardId uint32 + isA900A3SuperPod bool + isA9000A3SuperPod bool + is800IA3Chip bool + getStatInfoErr error + expectedBeginIndex int + expectedFuncSet bool +} + +func TestPreCollect(t *testing.T) { + n := mockNewNpuCollector() + testCases := buildPreCollectTestCases() + + for _, tc := range testCases { + convey.Convey(tc.name, t, func() { + patches := gomonkey.NewPatches() + defer patches.Reset() + + setupPatches(patches, n, tc) + collector := &HccsCollector{} + collector.PreCollect(n, tc.chipList) + verifyPreCollectResult(collector, tc) + }) + } +} + +func buildPreCollectTestCases() []preCollectTestCase { + cases := []preCollectTestCase{ + {name: "should return early when chipList is empty", + chipList: []colcommon.HuaWeiAIChip{}, + expectedBeginIndex: 0, + expectedFuncSet: false}, + {name: "should not set beginIndex when mainBoardId is not supported", + chipList: createMockChipList(singleChipList, unsupportedBoardId), + devType: api.Ascend910A3, + mainBoardId: unsupportedBoardId, + getStatInfoErr: nil, + expectedBeginIndex: 0, + expectedFuncSet: true}, + } + cases = append(cases, buildBeginIndexCases()...) 
+ return cases +} + +func buildBeginIndexCases() []preCollectTestCase { + return []preCollectTestCase{ + {name: "should set beginIndex to num1 when devType is Ascend910B", + chipList: createMockChipList(singleChipList, mockMainBoardId), + devType: api.Ascend910B, + mainBoardId: mockMainBoardId, + getStatInfoErr: nil, + expectedBeginIndex: num1, + expectedFuncSet: true}, + {name: "should set beginIndex to num1 when IsA900A3SuperPod returns true", + chipList: createMockChipList(singleChipList, mockMainBoardId), + devType: api.Ascend910A3, + mainBoardId: mockMainBoardId, + isA900A3SuperPod: true, + getStatInfoErr: nil, + expectedBeginIndex: num1, + expectedFuncSet: true}, + {name: "should set beginIndex to num1 when Is800IA3Chip returns true", + chipList: createMockChipList(singleChipList, mockMainBoardId), + devType: api.Ascend910A3, + mainBoardId: mockMainBoardId, + is800IA3Chip: true, + getStatInfoErr: nil, + expectedBeginIndex: num1, + expectedFuncSet: true}, + {name: "should set beginIndex to num2 when IsA9000A3SuperPod returns true", + chipList: createMockChipList(singleChipList, mockMainBoardId), + devType: api.Ascend910A3, + mainBoardId: mockMainBoardId, + isA9000A3SuperPod: true, + getStatInfoErr: nil, + expectedBeginIndex: num2, + expectedFuncSet: true}, + } +} + +func createMockChipList(count int, mainBoardId uint32) []colcommon.HuaWeiAIChip { + if count == 0 { + return []colcommon.HuaWeiAIChip{} + } + return []colcommon.HuaWeiAIChip{ + { + LogicID: mockLogicID, + MainBoardId: mainBoardId, + }, + } +} + +func setupPatches(patches *gomonkey.Patches, n *colcommon.NpuCollector, tc preCollectTestCase) { + patches.ApplyMethodReturn(n.Dmgr, "GetDevType", tc.devType) + patches.ApplyFuncReturn(common.IsA900A3SuperPod, tc.isA900A3SuperPod) + patches.ApplyFuncReturn(common.IsA9000A3SuperPod, tc.isA9000A3SuperPod) + patches.ApplyFuncReturn(common.Is800IA3Chip, tc.is800IA3Chip) + patches.ApplyMethodReturn(n.Dmgr, "GetHccsStatisticInfoInU64", + 
&common.HccsStatisticInfo{}, tc.getStatInfoErr) +} + +func verifyPreCollectResult(collector *HccsCollector, tc preCollectTestCase) { + convey.So(collector.hccsBeginIndex, convey.ShouldEqual, tc.expectedBeginIndex) + if tc.expectedFuncSet { + convey.So(collector.realGetStatisticInfoFunc, convey.ShouldNotBeNil) + } else { + convey.So(collector.realGetStatisticInfoFunc, convey.ShouldBeNil) + } +} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_network.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_network.go new file mode 100644 index 0000000..018a370 --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_network.go @@ -0,0 +1,190 @@ +/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package metrics for general collector +package metrics + +import ( + "time" + + "github.com/prometheus/client_golang/prometheus" + + "ascend-common/common-utils/hwlog" + "ascend-common/devmanager/common" + "ascend-common/devmanager/hccn" + colcommon "huawei.com/npu-exporter/v6/collector/common" + "huawei.com/npu-exporter/v6/collector/container" +) + +var ( + // bandwidth + descBandwidthTx = colcommon.BuildDesc("npu_chip_info_bandwidth_tx", + "the npu interface transport speed, unit is 'MB/s'") + descBandwidthRx = colcommon.BuildDesc("npu_chip_info_bandwidth_rx", + "the npu interface receive speed, unit is 'MB/s'") + + // linkspeed + npuChipLinkSpeed = colcommon.BuildDesc("npu_chip_link_speed", + "the npu interface receive link speed, unit is 'Mb/s'") + + // linkupNum + npuChipLinkUpNum = colcommon.BuildDesc("npu_chip_link_up_num", "the npu interface receive link-up num") + + // linkstatus + descLinkStatus = colcommon.BuildDesc("npu_chip_info_link_status", "the npu link status") +) + +type netInfoCache struct { + chip colcommon.HuaWeiAIChip + timestamp time.Time + extInfo *common.NpuNetInfo +} + +// NetworkCollector collects the network info +type NetworkCollector struct { + colcommon.MetricsCollectorAdapter +} + +// IsSupported check if the collector is supported +func (c *NetworkCollector) IsSupported(n *colcommon.NpuCollector) bool { + isSupport := n.Dmgr.IsTrainingCard() + logForUnSupportDevice(isSupport, n.Dmgr.GetDevType(), colcommon.GetCacheKey(c), + "only training card supports network related info") + return isSupport +} + +// Describe description of the metric +func (c *NetworkCollector) Describe(ch chan<- *prometheus.Desc) { + // bandwidth + ch <- descBandwidthTx + ch <- descBandwidthRx + // linkspeed + ch <- npuChipLinkSpeed + // linkupNum + ch <- npuChipLinkUpNum + // linkstatus + ch <- descLinkStatus +} + +// CollectToCache collect the metric to cache +func (c *NetworkCollector) CollectToCache(n *colcommon.NpuCollector, chipList 
[]colcommon.HuaWeiAIChip) { + for _, chip := range chipList { + netInfo := collectNetworkInfo(chip.PhyId) + c.LocalCache.Store(chip.PhyId, netInfoCache{chip: chip, timestamp: time.Now(), extInfo: &netInfo}) + } + colcommon.UpdateCache[netInfoCache](n, colcommon.GetCacheKey(c), &c.LocalCache) +} + +// UpdatePrometheus reports the cached network info of each chip to ch: bandwidth tx/rx, link speed, +// link-up count and link status. Each group is reported only when validateNotNilForEveryElement accepts it. +func (c *NetworkCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) { + + updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache netInfoCache, cardLabel []string) { + netInfo := cache.extInfo + if netInfo == nil { + return + } + // named timestamp rather than time so the imported time package is not shadowed in this closure + timestamp := cache.timestamp + if validateNotNilForEveryElement(netInfo.BandwidthInfo) { + doUpdateMetricWithValidateNum(ch, timestamp, netInfo.BandwidthInfo.TxValue, cardLabel, descBandwidthTx) + doUpdateMetricWithValidateNum(ch, timestamp, netInfo.BandwidthInfo.RxValue, cardLabel, descBandwidthRx) + } + if validateNotNilForEveryElement(netInfo.LinkSpeedInfo) { + doUpdateMetricWithValidateNum(ch, timestamp, netInfo.LinkSpeedInfo.Speed, cardLabel, npuChipLinkSpeed) + } + if validateNotNilForEveryElement(netInfo.LinkStatInfo) { + doUpdateMetricWithValidateNum(ch, timestamp, netInfo.LinkStatInfo.LinkUPNum, cardLabel, npuChipLinkUpNum) + } + if validateNotNilForEveryElement(netInfo.LinkStatusInfo) { + doUpdateMetricWithValidateNum(ch, timestamp, + float64(hccn.GetLinkStatusCode(netInfo.LinkStatusInfo.LinkState)), cardLabel, descLinkStatus) + } + } + updateFrame[netInfoCache](colcommon.GetCacheKey(c), n, containerMap, chips, updateSingleChip) +} + +// UpdateTelegraf update telegraf metrics +func (c *NetworkCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *colcommon.NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} { + + caches := colcommon.GetInfoFromCache[netInfoCache](n, colcommon.GetCacheKey(c)) + for _, chip 
:= range chips { + cache, ok := caches[chip.PhyId] + if !ok { + continue + } + fieldMap := getFieldMap(fieldsMap, cache.chip.LogicID) + extInfo := cache.extInfo + if extInfo == nil { + continue + } + if validateNotNilForEveryElement(extInfo.BandwidthInfo) { + doUpdateTelegrafWithValidateNum(fieldMap, descBandwidthTx, extInfo.BandwidthInfo.TxValue, "") + doUpdateTelegrafWithValidateNum(fieldMap, descBandwidthRx, extInfo.BandwidthInfo.RxValue, "") + } + if validateNotNilForEveryElement(extInfo.LinkSpeedInfo) { + doUpdateTelegrafWithValidateNum(fieldMap, npuChipLinkSpeed, extInfo.LinkSpeedInfo.Speed, "") + } + if validateNotNilForEveryElement(extInfo.LinkStatInfo) { + doUpdateTelegrafWithValidateNum(fieldMap, npuChipLinkUpNum, extInfo.LinkStatInfo.LinkUPNum, "") + } + if validateNotNilForEveryElement(extInfo.LinkStatusInfo) { + doUpdateTelegrafWithValidateNum(fieldMap, descLinkStatus, + float64(hccn.GetLinkStatusCode(extInfo.LinkStatusInfo.LinkState)), "") + } + } + return fieldsMap +} + +func collectNetworkInfo(phyID int32) common.NpuNetInfo { + newNetInfo := common.NpuNetInfo{} + + newNetInfo.LinkStatusInfo = &common.LinkStatusInfo{} + if linkState, err := hccn.GetNPULinkStatus(phyID); err == nil { + newNetInfo.LinkStatusInfo.LinkState = linkState + hwlog.ResetErrCnt(colcommon.DomainForLinkState, phyID) + } else { + logErrMetricsWithLimit(colcommon.DomainForLinkState, phyID, err) + newNetInfo.LinkStatusInfo.LinkState = colcommon.Abnormal + } + + if tx, rx, err := hccn.GetNPUInterfaceTraffic(phyID); err == nil { + newNetInfo.BandwidthInfo = &common.BandwidthInfo{} + newNetInfo.BandwidthInfo.RxValue = rx + newNetInfo.BandwidthInfo.TxValue = tx + hwlog.ResetErrCnt(colcommon.DomainForBandwidth, phyID) + } else { + newNetInfo.BandwidthInfo = nil + logErrMetricsWithLimit(colcommon.DomainForBandwidth, phyID, err) + } + if linkUpNum, err := hccn.GetNPULinkUpNum(phyID); err == nil { + newNetInfo.LinkStatInfo = &common.LinkStatInfo{} + newNetInfo.LinkStatInfo.LinkUPNum = 
float64(linkUpNum) + hwlog.ResetErrCnt(colcommon.DomainForLinkStat, phyID) + } else { + newNetInfo.LinkStatInfo = nil + logErrMetricsWithLimit(colcommon.DomainForLinkStat, phyID, err) + } + + if speed, err := hccn.GetNPULinkSpeed(phyID); err == nil { + newNetInfo.LinkSpeedInfo = &common.LinkSpeedInfo{} + newNetInfo.LinkSpeedInfo.Speed = float64(speed) + hwlog.ResetErrCnt(colcommon.DomainForLinkSpeed, phyID) + } else { + newNetInfo.LinkSpeedInfo = nil + logErrMetricsWithLimit(colcommon.DomainForLinkSpeed, phyID, err) + } + + return newNetInfo +} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_npu.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_npu.go new file mode 100644 index 0000000..975ffcf --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_npu.go @@ -0,0 +1,453 @@ +/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package metrics for general collector +package metrics + +import ( + "math" + "strconv" + "strings" + "time" + + "github.com/prometheus/client_golang/prometheus" + + "ascend-common/api" + "ascend-common/devmanager" + "ascend-common/devmanager/common" + colcommon "huawei.com/npu-exporter/v6/collector/common" + "huawei.com/npu-exporter/v6/collector/container" + "huawei.com/npu-exporter/v6/utils/logger" +) + +var ( + errorCodeDescs []*prometheus.Desc + cardLabelForProcess = append(colcommon.CardLabel, "process_id", "container_id") + cardLabelForContainer []string + cardLabelForSN []string + cardLabelForNpuName = make([]string, len(colcommon.CardLabel)) +) + +var ( + machineInfoNPUDesc = colcommon.BuildDescWithLabel("machine_npu_nums", "Amount of npu installed on the machine.", nil) + + descUtil = colcommon.BuildDesc("npu_chip_info_utilization", "the ai core utilization") + descOverUtil = colcommon.BuildDesc("npu_chip_info_overall_utilization", "the overall utilization of npu") + descVectorUtil = colcommon.BuildDesc("npu_chip_info_vector_utilization", "the vector ai core utilization") + descTemp = colcommon.BuildDesc("npu_chip_info_temperature", "the npu temperature") + descPower = colcommon.BuildDesc("npu_chip_info_power", "the npu power") + descVoltage = colcommon.BuildDesc("npu_chip_info_voltage", "the npu voltage") + + descAICoreFreq = colcommon.BuildDesc("npu_chip_info_aicore_current_freq", + "the npu ai core current frequency, unit is 'MHz'") + descHealthStatus = colcommon.BuildDesc("npu_chip_info_health_status", "the npu health status") + descDevProcessNum = colcommon.BuildDesc("npu_chip_info_process_info_num", + "the npu process num") + + descDevProcessInfo = colcommon.BuildDescWithLabel("npu_chip_info_process_info", + "the npu process info, unit is 'MB'. 
if process run on host, container_id and container_name will be empty", + cardLabelForProcess) + + // net status + descNetworkStatus = colcommon.BuildDesc("npu_chip_info_network_status", "the npu network health status") + + // container (vnpu not support this metrics), only report to prometheus + npuCtrUtilization = colcommon.BuildDesc("container_npu_utilization", + "npu ai core utilization in container, unit is '%'") + npuCtrTotalMemory = colcommon.BuildDesc("container_npu_total_memory", + "npu total memory in container, unit is 'MB'") + npuCtrUsedMemory = colcommon.BuildDesc("container_npu_used_memory", + "the npu used memory in container, unit is 'MB'") + + npuCtrInfo *prometheus.Desc = nil + descNpuName *prometheus.Desc = nil + descNPUSerialNumber *prometheus.Desc = nil +) + +// init builds the descriptors whose label sets are derived from colcommon.CardLabel: +// the error code series, npu_container_info, npu_chip_info_serial_number and npu_chip_info_name +func init() { + + colcommon.BuildDescSlice(&errorCodeDescs, "npu_chip_info_error_code", "the npu error code") + for i := 1; i < common.MaxErrorCodeLen; i++ { + colcommon.BuildDescSlice(&errorCodeDescs, "npu_chip_info_error_code_"+strconv.Itoa(i), "the npu error code") + } + + // append onto a fresh copy: index 0 is overwritten below, and appending straight onto colcommon.CardLabel + // would share its backing array (and leak the write into it) whenever that slice has spare capacity + cardLabelForContainer = append(append([]string{}, colcommon.CardLabel...), "containerID", "containerName") + cardLabelForContainer[0] = "npuID" + npuCtrInfo = colcommon.BuildDescWithLabel("npu_container_info", "the container name and deviceID relationship", + cardLabelForContainer) + + // same reason as above: keep the serial number label set independent of colcommon.CardLabel + cardLabelForSN = append(append([]string{}, colcommon.CardLabel...), "serial_number") + // NPU SN related metrics + descNPUSerialNumber = colcommon.BuildDescWithLabel("npu_chip_info_serial_number", + "the npu serial number information", cardLabelForSN) + + copy(cardLabelForNpuName, colcommon.CardLabel) + cardLabelForNpuName[1] = "name" + descNpuName = colcommon.BuildDescWithLabel("npu_chip_info_name", "the Ascend npu name with value '1'", + cardLabelForNpuName) +} + +type chipCache struct { + chip colcommon.HuaWeiAIChip + timestamp time.Time + + // the healthy status of the AI chip + HealthStatus string `json:"health_status"` + // the all error codes of the chip + ErrorCodes []int64 
`json:"error_codes"` + // the utilization of the chip + Utilization int `json:"utilization"` + // the overall utilization of the chip + OverallUtilization int `json:"overall_utilization"` + // the vector utilization of the chip + VectorUtilization int `json:"vector_utilization"` + // the temperature of the chip + Temperature int `json:"temperature"` + // the work power of the chip + Power float32 `json:"power"` + // the work voltage of the chip + Voltage float32 `json:"voltage"` + // the AI core current frequency of the chip + AICoreCurrentFreq uint32 `json:"aicore_current_freq"` + // NetHealthStatus chip network health status + NetHealthStatus string `json:"net_health_status"` + // DevProcessInfo chip process info + DevProcessInfo *common.DevProcessInfo +} + +// BaseInfoCollector collects the base info of the chip +type BaseInfoCollector struct { + colcommon.MetricsCollectorAdapter +} + +// Describe sends the descriptors of the metrics reported by BaseInfoCollector to ch +func (c *BaseInfoCollector) Describe(ch chan<- *prometheus.Desc) { + // base info + ch <- machineInfoNPUDesc + ch <- descUtil + ch <- descVectorUtil + ch <- descOverUtil + ch <- descTemp + ch <- descPower + ch <- descVoltage + ch <- descHealthStatus + ch <- descNpuName + ch <- descAICoreFreq + ch <- descNPUSerialNumber + ch <- descDevProcessInfo + // the process count is emitted by updateProcessInfoForPrometheus, so it has to be described as well + ch <- descDevProcessNum + // status + ch <- descNetworkStatus + // container + ch <- npuCtrInfo + ch <- npuCtrUtilization + ch <- npuCtrTotalMemory + ch <- npuCtrUsedMemory + + // error code + for _, desc := range errorCodeDescs { + ch <- desc + } +} + +// CollectToCache collects the base info of the chip +func (c *BaseInfoCollector) CollectToCache(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) { + for _, chip := range chipList { + logicID := chip.LogicID + + dmgr := n.Dmgr + + freq, err := dmgr.GetDeviceFrequency(logicID, common.AICoreCurrentFreq) + if err != nil { + freq = common.UnRetError + } + temp, err := dmgr.GetDeviceTemperature(logicID) + if err != nil { + temp = common.RetError + } + vol, 
err := dmgr.GetDeviceVoltage(logicID) + if err != nil { + vol = common.UnRetError + } + + _, errCodes, err := dmgr.GetDeviceAllErrorCode(logicID) + if err != nil { + errCodes = make([]int64, 0) + } + + cache := &chipCache{ + chip: chip, + AICoreCurrentFreq: freq, + Temperature: int(temp), + Voltage: vol, + HealthStatus: getHealth(logicID, dmgr), + ErrorCodes: errCodes, + } + collectPower(logicID, dmgr, cache) + collectUtil(logicID, dmgr, cache) + setNetHealthStatus(logicID, dmgr, cache) + setProcessInfo(logicID, dmgr, cache) + + cache.timestamp = time.Now() + c.LocalCache.Store(chip.PhyId, *cache) + } + colcommon.UpdateCache[chipCache](n, colcommon.GetCacheKey(c), &c.LocalCache) +} + +func collectPower(logicID int32, dmgr devmanager.DeviceInterface, chip *chipCache) { + if dmgr.GetDevType() == api.Ascend310P { + cardPower, err := dmgr.GetMcuPowerInfo(chip.chip.CardId) + handleErr(err, colcommon.DomainForMcuPower, chip.chip.CardId) + // Ascend310P use cardPower to replace chipPower + chip.Power = cardPower + } else { + power, err := dmgr.GetDevicePowerInfo(logicID) + handleErr(err, colcommon.DomainForChipPower, logicID) + chip.Power = power + } +} + +// UpdatePrometheus updates the base info of the chip +func (c *BaseInfoCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) { + + updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache chipCache, cardLabel []string) { + containerInfo := geenContainerInfo(&chipWithVnpu, containerMap) + timestamp := cache.timestamp + doUpdateMetricWithValidateNum(ch, timestamp, float64(cache.Power), cardLabel, descPower) + doUpdateMetricWithValidateNum(ch, timestamp, float64(cache.Voltage), cardLabel, descVoltage) + doUpdateMetricWithValidateNum(ch, timestamp, float64(cache.AICoreCurrentFreq), cardLabel, descAICoreFreq) + doUpdateMetricWithValidateNum(ch, timestamp, float64(cache.Temperature), cardLabel, descTemp) + 
doUpdateMetricWithValidateNum(ch, timestamp, float64(cache.Utilization), cardLabel, descUtil) + doUpdateMetricWithValidateNum(ch, timestamp, float64(cache.OverallUtilization), cardLabel, descOverUtil) + doUpdateMetricWithValidateNum(ch, timestamp, float64(cache.VectorUtilization), cardLabel, descVectorUtil) + doUpdateMetricWithValidateNum(ch, timestamp, 1, cardLabel, descNpuName) + doUpdateMetricWithValidateNum(ch, timestamp, float64(getHealthCode(cache.HealthStatus)), cardLabel, descHealthStatus) + doUpdateMetricWithValidateNum(ch, timestamp, float64(getHealthCode(cache.NetHealthStatus)), + cardLabel, descNetworkStatus) + + updateContainerInfo(ch, containerInfo, cardLabel, &cache, chipWithVnpu) + + updateProcessInfoForPrometheus(ch, &cache, containerInfo, timestamp, cardLabel) + updateErrorCodesInfo(ch, &cache, timestamp, cardLabel) + // Update NPU serial number info + if cache.chip.ElabelInfo != nil { + snLabel := append(cardLabel, cache.chip.ElabelInfo.SerialNumber) + doUpdateMetricWithValidateNum(ch, timestamp, 1, snLabel, descNPUSerialNumber) + } + } + updateFrame[chipCache](colcommon.GetCacheKey(c), n, containerMap, chips, updateSingleChip) + + ch <- prometheus.MustNewConstMetric(machineInfoNPUDesc, prometheus.GaugeValue, float64(len(chips))) +} + +func updateContainerInfo(ch chan<- prometheus.Metric, containerInfo container.DevicesInfo, + cardLabel []string, chip *chipCache, chipWithVnpu colcommon.HuaWeiAIChip) { + containerName := getContainerNameArray(containerInfo) + if len(containerName) != colcommon.ContainerNameLen { + return + } + // based on chipType , container_npu_total_memory、container_npu_used_memory reported in hbm or ddr group + doUpdateMetric(ch, chip.timestamp, 1, append(cardLabel, containerInfo.ID, strings.Join(containerName, "_")), + npuCtrInfo) + + // vnpu not support this metrics + vDevActivityInfo := chipWithVnpu.VDevActivityInfo + if vDevActivityInfo != nil && common.IsValidVDevID(vDevActivityInfo.VDevID) { + return + } + + 
doUpdateMetricWithValidateNum(ch, chip.timestamp, float64(chip.Utilization), cardLabel, npuCtrUtilization) +} + +func updateErrorCodesInfo(ch chan<- prometheus.Metric, chip *chipCache, timestamp time.Time, cardLabel []string) { + if len(chip.ErrorCodes) > common.MaxErrorCodeLen { + logger.Warnf("Error code number is larger than %v, only the first %v will be reported, "+ + "all errorCode is: %v", common.MaxErrorCodeLen, common.MaxErrorCodeLen, chip.ErrorCodes) + } + for i := 0; i < len(chip.ErrorCodes) && i < len(errorCodeDescs); i++ { + doUpdateMetricWithValidateNum(ch, timestamp, float64(chip.ErrorCodes[i]), cardLabel, errorCodeDescs[i]) + } +} + +func updateProcessInfoForPrometheus(ch chan<- prometheus.Metric, chip *chipCache, + containerInfo container.DevicesInfo, timestamp time.Time, cardLabel []string) { + devProcessInfo := chip.DevProcessInfo + if devProcessInfo == nil { + return + } + doUpdateMetric(ch, timestamp, devProcessInfo.ProcNum, cardLabel, descDevProcessNum) + + containerID := "" + containerName := "" + cNameArray := getContainerNameArray(containerInfo) + if len(cNameArray) == colcommon.ContainerNameLen { + containerID = containerInfo.ID + containerName = strings.Join(cNameArray, "_") + } + + newCardLabel := make([]string, len(cardLabel)) + copy(newCardLabel, cardLabel) + // containerName in process info is namespace_podName_containerName + newCardLabel[len(newCardLabel)-1] = containerName + + if devProcessInfo.ProcNum == 0 { + doUpdateMetric(ch, timestamp, 0, append(newCardLabel, "", containerID), descDevProcessInfo) + return + } + + for i := int32(0); i < devProcessInfo.ProcNum; i++ { + procInfo := devProcessInfo.DevProcArray[i] + doUpdateMetric(ch, timestamp, procInfo.MemUsage, + append(newCardLabel, strconv.FormatInt(int64(procInfo.Pid), colcommon.Base), containerID), descDevProcessInfo) + } +} + +// UpdateTelegraf updates the base info of the chip +func (c *BaseInfoCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n 
*colcommon.NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} { + caches := colcommon.GetInfoFromCache[chipCache](n, colcommon.GetCacheKey(c)) + for _, chip := range chips { + cache, ok := caches[chip.PhyId] + if !ok { + continue + } + fieldMap := getFieldMap(fieldsMap, cache.chip.LogicID) + + doUpdateTelegrafWithValidateNum(fieldMap, descTemp, float64(cache.Temperature), "") + doUpdateTelegrafWithValidateNum(fieldMap, descPower, float64(cache.Power), "") + doUpdateTelegrafWithValidateNum(fieldMap, descVoltage, float64(cache.Voltage), "") + doUpdateTelegrafWithValidateNum(fieldMap, descAICoreFreq, float64(cache.AICoreCurrentFreq), "") + doUpdateTelegrafWithValidateNum(fieldMap, descUtil, float64(cache.Utilization), "") + doUpdateTelegrafWithValidateNum(fieldMap, descVectorUtil, float64(cache.VectorUtilization), "") + doUpdateTelegrafWithValidateNum(fieldMap, descOverUtil, float64(cache.OverallUtilization), "") + doUpdateTelegrafWithValidateNum(fieldMap, descHealthStatus, float64(getHealthCode(cache.HealthStatus)), "") + doUpdateTelegrafWithValidateNum(fieldMap, descNetworkStatus, float64(getHealthCode(cache.NetHealthStatus)), "") + doUpdateTelegraf(fieldMap, descNpuName, chip.ChipInfo.Name, "") + + updateProcessInfoForTelegraf(&cache, fieldMap) + updateErrorCode(&cache, fieldMap) + // Update NPU serial number info + if cache.chip.ElabelInfo != nil { + doUpdateTelegraf(fieldMap, descNPUSerialNumber, cache.chip.ElabelInfo.SerialNumber, "") + } + + } + + if fieldsMap[colcommon.GeneralDevTagKey] == nil { + fieldsMap[colcommon.GeneralDevTagKey] = make(map[string]interface{}) + } + doUpdateTelegraf(fieldsMap[colcommon.GeneralDevTagKey], machineInfoNPUDesc, len(chips), "") + return fieldsMap +} + +func updateErrorCode(chip *chipCache, fieldMap map[string]interface{}) { + if len(errorCodeDescs) == 0 { + return + } + descErrorCode := errorCodeDescs[0] + for i := 0; i < len(chip.ErrorCodes); i++ { + 
extInfo := "" + if i != 0 { + extInfo = "_" + strconv.Itoa(i) + } + doUpdateTelegrafWithValidateNum(fieldMap, descErrorCode, float64(chip.ErrorCodes[i]), extInfo) + } +} + +// updateProcessInfoForTelegraf writes the process count and the memory usage of every process of the chip +// into fieldMap; it does nothing when no process info is cached, like updateProcessInfoForPrometheus +func updateProcessInfoForTelegraf(chip *chipCache, fieldMap map[string]interface{}) { + devProcessInfo := chip.DevProcessInfo + // guard against a missing process info instead of dereferencing a nil pointer + if devProcessInfo == nil { + return + } + doUpdateTelegraf(fieldMap, descDevProcessNum, devProcessInfo.ProcNum, "") + if devProcessInfo.ProcNum == 0 { + doUpdateTelegraf(fieldMap, descDevProcessInfo, 0, "") + return + } + for i := int32(0); i < devProcessInfo.ProcNum; i++ { + procInfo := devProcessInfo.DevProcArray[i] + doUpdateTelegraf(fieldMap, descDevProcessInfo, procInfo.MemUsage, "_"+strconv.Itoa(int(procInfo.Pid))) + } +} + +func collectUtil(logicID int32, dmgr devmanager.DeviceInterface, chip *chipCache) { + util, err := dmgr.GetDeviceUtilizationRate(logicID, common.AICore) + handleErr(err, colcommon.DomainForAICoreUtilization, logicID) + chip.Utilization = int(util) + + overAllUtil, err := dmgr.GetDeviceUtilizationRate(logicID, common.Overall) + handleErr(err, colcommon.DomainForOverallUtilization, logicID) + chip.OverallUtilization = int(overAllUtil) + + vecUtil, err := dmgr.GetDeviceUtilizationRate(logicID, common.VectorCore) + handleErr(err, colcommon.DomainForVectorCoreUtilization, logicID) + chip.VectorUtilization = int(vecUtil) +} + +func setNetHealthStatus(logicID int32, dmgr devmanager.DeviceInterface, chip *chipCache) { + chip.NetHealthStatus = colcommon.Abnormal + if !dmgr.IsTrainingCard() { + return + } + + netCode, err := dmgr.GetDeviceNetWorkHealth(logicID) + logger.Debugf("chip %d network healthy code is %d", logicID, netCode) + if err != nil { + netCode = math.MaxUint32 + } + chip.NetHealthStatus = getNetworkHealthy(netCode) +} + +func getNetworkHealthy(netCode uint32) string { + if netCode == math.MaxUint32 { + return colcommon.Abnormal + } + + if netCode == common.NetworkInit || netCode == common.NetworkSuccess { + return colcommon.Healthy + } + + return colcommon.UnHealthy +} + +func 
getHealth(logicID int32, dmgr devmanager.DeviceInterface) string { + health, err := dmgr.GetDeviceHealth(logicID) + if err != nil || health != 0 { + return colcommon.UnHealthy + } + return colcommon.Healthy +} + +func getHealthCode(health string) int { + if health == colcommon.Abnormal { + return common.RetError + } + + if colcommon.Healthy == health { + return 1 + } + return 0 +} + +func setProcessInfo(logicID int32, dmgr devmanager.DeviceInterface, hwChip *chipCache) { + productTypes := dmgr.GetProductTypeArray() + info, err := dmgr.GetDevProcessInfo(logicID) + if err != nil { + if len(productTypes) == 1 && productTypes[0] == common.Atlas200ISoc { + logger.Debugf("process info is not supported on %s", common.Atlas200ISoc) + hwChip.DevProcessInfo = &common.DevProcessInfo{} + return + } + handleErr(err, colcommon.DomainForProcess, logicID) + info = &common.DevProcessInfo{} + } + hwChip.DevProcessInfo = info +} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_optical.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_optical.go new file mode 100644 index 0000000..ca49804 --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_optical.go @@ -0,0 +1,200 @@ +/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package metrics for general collector +package metrics + +import ( + "time" + + "github.com/prometheus/client_golang/prometheus" + + "ascend-common/common-utils/hwlog" + "ascend-common/devmanager/common" + "ascend-common/devmanager/hccn" + colcommon "huawei.com/npu-exporter/v6/collector/common" + "huawei.com/npu-exporter/v6/collector/container" +) + +const ( + txPower0 = "Tx_Power0" + txPower1 = "Tx_Power1" + txPower2 = "Tx_Power2" + txPower3 = "Tx_Power3" + + rxPower0 = "Rx_Power0" + rxPower1 = "Rx_Power1" + rxPower2 = "Rx_Power2" + rxPower3 = "Rx_Power3" + + notPresent = "not present" + present = "present" + temperature = "temperature" + voltage = "Vcc" +) + +var ( + + // optical + descOpticalState = colcommon.BuildDesc("npu_chip_optical_state", "the npu interface receive optical-state") + descOpticalVcc = colcommon.BuildDesc("npu_chip_optical_vcc", "the npu interface receive optical-vcc") + descOpticalTemp = colcommon.BuildDesc("npu_chip_optical_temp", "the npu interface receive optical-temperature") + descOpticalTxPower0 = colcommon.BuildDesc("npu_chip_optical_tx_power_0", "npu interface receive optical-tx-power-0") + descOpticalTxPower1 = colcommon.BuildDesc("npu_chip_optical_tx_power_1", "npu interface receive optical-tx-power-1") + descOpticalTxPower2 = colcommon.BuildDesc("npu_chip_optical_tx_power_2", "npu interface receive optical-tx-power-2") + descOpticalTxPower3 = colcommon.BuildDesc("npu_chip_optical_tx_power_3", "npu interface receive optical-tx-power-3") + + descOpticalRxPower0 = colcommon.BuildDesc("npu_chip_optical_rx_power_0", "npu interface receive optical-rx-power-0") + descOpticalRxPower1 = colcommon.BuildDesc("npu_chip_optical_rx_power_1", "npu interface receive optical-rx-power-1") + descOpticalRxPower2 = colcommon.BuildDesc("npu_chip_optical_rx_power_2", "npu interface receive optical-rx-power-2") + descOpticalRxPower3 = colcommon.BuildDesc("npu_chip_optical_rx_power_3", "npu interface receive optical-rx-power-3") +) + +type 
opticalCache struct { + chip colcommon.HuaWeiAIChip + timestamp time.Time + // extInfo indicates the optical module information + extInfo *common.OpticalInfo +} + +// OpticalCollector collect the optical metrics +type OpticalCollector struct { + colcommon.MetricsCollectorAdapter +} + +// IsSupported judge whether the collector is supported +func (c *OpticalCollector) IsSupported(n *colcommon.NpuCollector) bool { + isSupport := n.Dmgr.IsTrainingCard() + logForUnSupportDevice(isSupport, n.Dmgr.GetDevType(), colcommon.GetCacheKey(c), + "only training card supports network related info") + return isSupport +} + +// Describe description of the metric +func (c *OpticalCollector) Describe(ch chan<- *prometheus.Desc) { + // optical + ch <- descOpticalState + ch <- descOpticalTxPower0 + ch <- descOpticalTxPower1 + ch <- descOpticalTxPower2 + ch <- descOpticalTxPower3 + ch <- descOpticalRxPower0 + ch <- descOpticalRxPower1 + ch <- descOpticalRxPower2 + ch <- descOpticalRxPower3 + ch <- descOpticalVcc + ch <- descOpticalTemp +} + +// CollectToCache collect the metric to cache +func (c *OpticalCollector) CollectToCache(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) { + for _, chip := range chipList { + opticalInfo, err := hccn.GetNPUOpticalInfo(chip.PhyId) + if err != nil { + logErrMetricsWithLimit(colcommon.DomainForOptical, chip.PhyId, err) + continue + } + hwlog.ResetErrCnt(colcommon.DomainForOptical, chip.PhyId) + info := getMainOptInfo(opticalInfo) + c.LocalCache.Store(chip.PhyId, opticalCache{chip: chip, timestamp: time.Now(), extInfo: info}) + } + colcommon.UpdateCache[opticalCache](n, colcommon.GetCacheKey(c), &c.LocalCache) +} + +// UpdatePrometheus update prometheus metrics +func (c *OpticalCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) { + + updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache opticalCache, cardLabel 
[]string) { + opticalInfo := cache.extInfo + if opticalInfo == nil { + return + } + timestamp := cache.timestamp + doUpdateMetricWithValidateNum(ch, timestamp, opticalInfo.OpticalState, cardLabel, descOpticalState) + doUpdateMetricWithValidateNum(ch, timestamp, opticalInfo.OpticalVcc, cardLabel, descOpticalVcc) + doUpdateMetricWithValidateNum(ch, timestamp, opticalInfo.OpticalTemp, cardLabel, descOpticalTemp) + + doUpdateMetricWithValidateNum(ch, timestamp, opticalInfo.OpticalTxPower0, cardLabel, descOpticalTxPower0) + doUpdateMetricWithValidateNum(ch, timestamp, opticalInfo.OpticalTxPower1, cardLabel, descOpticalTxPower1) + doUpdateMetricWithValidateNum(ch, timestamp, opticalInfo.OpticalTxPower2, cardLabel, descOpticalTxPower2) + doUpdateMetricWithValidateNum(ch, timestamp, opticalInfo.OpticalTxPower3, cardLabel, descOpticalTxPower3) + + doUpdateMetricWithValidateNum(ch, timestamp, opticalInfo.OpticalRxPower0, cardLabel, descOpticalRxPower0) + doUpdateMetricWithValidateNum(ch, timestamp, opticalInfo.OpticalRxPower1, cardLabel, descOpticalRxPower1) + doUpdateMetricWithValidateNum(ch, timestamp, opticalInfo.OpticalRxPower2, cardLabel, descOpticalRxPower2) + doUpdateMetricWithValidateNum(ch, timestamp, opticalInfo.OpticalRxPower3, cardLabel, descOpticalRxPower3) + } + + updateFrame[opticalCache](colcommon.GetCacheKey(c), n, containerMap, chips, updateSingleChip) + +} + +// UpdateTelegraf update telegraf metrics +func (c *OpticalCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *colcommon.NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} { + + caches := colcommon.GetInfoFromCache[opticalCache](n, colcommon.GetCacheKey(c)) + for _, chip := range chips { + cache, ok := caches[chip.PhyId] + if !ok { + continue + } + fieldMap := getFieldMap(fieldsMap, cache.chip.LogicID) + + extInfo := cache.extInfo + if extInfo == nil { + continue + } + 
doUpdateTelegrafWithValidateNum(fieldMap, descOpticalState, extInfo.OpticalState, "") + doUpdateTelegrafWithValidateNum(fieldMap, descOpticalVcc, extInfo.OpticalVcc, "") + doUpdateTelegrafWithValidateNum(fieldMap, descOpticalTemp, extInfo.OpticalTemp, "") + + doUpdateTelegrafWithValidateNum(fieldMap, descOpticalTxPower0, extInfo.OpticalTxPower0, "") + doUpdateTelegrafWithValidateNum(fieldMap, descOpticalTxPower1, extInfo.OpticalTxPower1, "") + doUpdateTelegrafWithValidateNum(fieldMap, descOpticalTxPower2, extInfo.OpticalTxPower2, "") + doUpdateTelegrafWithValidateNum(fieldMap, descOpticalTxPower3, extInfo.OpticalTxPower3, "") + + doUpdateTelegrafWithValidateNum(fieldMap, descOpticalRxPower0, extInfo.OpticalRxPower0, "") + doUpdateTelegrafWithValidateNum(fieldMap, descOpticalRxPower1, extInfo.OpticalRxPower1, "") + doUpdateTelegrafWithValidateNum(fieldMap, descOpticalRxPower2, extInfo.OpticalRxPower2, "") + doUpdateTelegrafWithValidateNum(fieldMap, descOpticalRxPower3, extInfo.OpticalRxPower3, "") + } + return fieldsMap +} + +func getMainOptInfo(opticalInfo map[string]string) *common.OpticalInfo { + mainOpticalInfo := common.OpticalInfo{} + mainOpticalInfo.OpticalTxPower0 = hccn.GetFloatDataFromStr(opticalInfo[txPower0], txPower0) + mainOpticalInfo.OpticalTxPower1 = hccn.GetFloatDataFromStr(opticalInfo[txPower1], txPower1) + mainOpticalInfo.OpticalTxPower2 = hccn.GetFloatDataFromStr(opticalInfo[txPower2], txPower2) + mainOpticalInfo.OpticalTxPower3 = hccn.GetFloatDataFromStr(opticalInfo[txPower3], txPower3) + mainOpticalInfo.OpticalRxPower0 = hccn.GetFloatDataFromStr(opticalInfo[rxPower0], rxPower0) + mainOpticalInfo.OpticalRxPower1 = hccn.GetFloatDataFromStr(opticalInfo[rxPower1], rxPower1) + mainOpticalInfo.OpticalRxPower2 = hccn.GetFloatDataFromStr(opticalInfo[rxPower2], rxPower2) + mainOpticalInfo.OpticalRxPower3 = hccn.GetFloatDataFromStr(opticalInfo[rxPower3], rxPower3) + mainOpticalInfo.OpticalVcc = hccn.GetFloatDataFromStr(opticalInfo[voltage], voltage) + 
mainOpticalInfo.OpticalTemp = hccn.GetFloatDataFromStr(opticalInfo[temperature], temperature) + var optState float64 + if opticalInfo[present] == present { + optState = 1.0 + } else if opticalInfo[present] == notPresent { + optState = 0.0 + } else { + optState = common.RetError + } + mainOpticalInfo.OpticalState = optState + + return &mainOpticalInfo +} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_pcie.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_pcie.go new file mode 100644 index 0000000..f68f95b --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_pcie.go @@ -0,0 +1,234 @@ +/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package metrics for general collector +package metrics + +import ( + "time" + + "github.com/prometheus/client_golang/prometheus" + + "ascend-common/api" + "ascend-common/common-utils/hwlog" + "ascend-common/devmanager/common" + colcommon "huawei.com/npu-exporter/v6/collector/common" + "huawei.com/npu-exporter/v6/collector/container" + "huawei.com/npu-exporter/v6/utils/logger" +) + +const ( + pcieBwType = "pcie_bw_type" + avgPcieBw = "avgPcieBw" + minPcieBw = "minPcieBw" + maxPcieBw = "maxPcieBw" + + avgPostfix = "_avgPcieBw" + minPostfix = "_minPcieBw" + maxPostfix = "_maxPcieBw" +) + +var ( + pcieBwLabel = append(colcommon.CardLabel, pcieBwType) + + descRxPBW = colcommon.BuildDescWithLabel("npu_chip_info_pcie_rx_p_bw", + "the npu write bw to remote‘s speed, unit is 'MB/ms'", pcieBwLabel) + + descRxNpBW = colcommon.BuildDescWithLabel("npu_chip_info_pcie_rx_np_bw", + "the npu read bw's speed from remote, unit is 'MB/ms'", pcieBwLabel) + + descRxCplBW = colcommon.BuildDescWithLabel("npu_chip_info_pcie_rx_cpl_bw", + "the npu reply remote read operate cpl's speed, unit is 'MB/ms'", pcieBwLabel) + + descTxPBW = colcommon.BuildDescWithLabel("npu_chip_info_pcie_tx_p_bw", + "the npu receive remote write operate's speed, unit is 'MB/ms'", pcieBwLabel) + + descTxNpBW = colcommon.BuildDescWithLabel("npu_chip_info_pcie_tx_np_bw", + "the npu receive remote read operate's speed, unit is 'MB/ms'", pcieBwLabel) + + descTxCplBW = colcommon.BuildDescWithLabel("npu_chip_info_pcie_tx_cpl_bw", + "the npu read cpl's responese bw speed from remote, unit is 'MB/ms'", pcieBwLabel) +) +var ( + supportedPcieDevices = map[string]bool{ + api.Ascend910B: true, + } +) + +type pcieCache struct { + chip colcommon.HuaWeiAIChip + timestamp time.Time + // extInfo pcie transport and receive bandwidth, have six metrics + extInfo *common.PCIEBwStat +} + +// PcieCollector collect pcie info +type PcieCollector struct { + colcommon.MetricsCollectorAdapter +} + +// IsSupported check whether the 
collector is supported +func (c *PcieCollector) IsSupported(n *colcommon.NpuCollector) bool { + // only 910A2 supports pcie info + isSupport := supportedPcieDevices[n.Dmgr.GetDevType()] + logForUnSupportDevice(isSupport, n.Dmgr.GetDevType(), colcommon.GetCacheKey(c), "") + return isSupport +} + +// Describe description of the metric +func (c *PcieCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- descRxPBW + ch <- descTxPBW + ch <- descRxNpBW + ch <- descTxNpBW + ch <- descRxCplBW + ch <- descTxCplBW +} + +// CollectToCache collect the metric to cache +func (c *PcieCollector) CollectToCache(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) { + for _, chip := range chipList { + pcieBwInfo, err := n.Dmgr.GetPCIEBandwidth(chip.LogicID, common.ProfilingTime) + if err != nil { + logErrMetricsWithLimit(colcommon.DomainForPcieBandwidth, chip.LogicID, err) + continue + } + hwlog.ResetErrCnt(colcommon.DomainForPcieBandwidth, chip.LogicID) + c.LocalCache.Store(chip.PhyId, pcieCache{chip: chip, timestamp: time.Now(), extInfo: &pcieBwInfo}) + } + colcommon.UpdateCache[pcieCache](n, colcommon.GetCacheKey(c), &c.LocalCache) +} + +// UpdatePrometheus update prometheus metrics +func (c *PcieCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) { + + updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache pcieCache, cardLabel []string) { + pcieBwInfo := cache.extInfo + if pcieBwInfo == nil { + return + } + + if cache.chip.VDevActivityInfo != nil && common.IsValidVDevID(cache.chip.VDevActivityInfo.VDevID) { + logger.Debug("vnpu does not supports pcie info query") + return + } + + timestamp := cache.timestamp + + updateAvgPcieBwInfo(ch, timestamp, pcieBwInfo, cardLabel) + updateMinPcieBwInfo(ch, timestamp, pcieBwInfo, cardLabel) + updateMaxPcieBwInfo(ch, timestamp, pcieBwInfo, cardLabel) + } + + updateFrame[pcieCache](colcommon.GetCacheKey(c), 
n, containerMap, chips, updateSingleChip) + +} + +// UpdateTelegraf update telegraf metrics +func (c *PcieCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *colcommon.NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} { + + caches := colcommon.GetInfoFromCache[pcieCache](n, colcommon.GetCacheKey(c)) + for _, chip := range chips { + cache, ok := caches[chip.PhyId] + if !ok { + continue + } + fieldMap := getFieldMap(fieldsMap, cache.chip.LogicID) + + extInfo := cache.extInfo + if extInfo == nil { + continue + } + doUpdateTelegraf(fieldMap, descTxPBW, extInfo.PcieTxPBw.PcieAvgBw, avgPostfix) + doUpdateTelegraf(fieldMap, descTxNpBW, extInfo.PcieTxNPBw.PcieAvgBw, avgPostfix) + doUpdateTelegraf(fieldMap, descTxCplBW, extInfo.PcieTxCPLBw.PcieAvgBw, avgPostfix) + doUpdateTelegraf(fieldMap, descRxPBW, extInfo.PcieRxPBw.PcieAvgBw, avgPostfix) + doUpdateTelegraf(fieldMap, descRxNpBW, extInfo.PcieRxNPBw.PcieAvgBw, avgPostfix) + doUpdateTelegraf(fieldMap, descRxCplBW, extInfo.PcieRxCPLBw.PcieAvgBw, avgPostfix) + + doUpdateTelegraf(fieldMap, descTxPBW, extInfo.PcieTxPBw.PcieMinBw, minPostfix) + doUpdateTelegraf(fieldMap, descTxNpBW, extInfo.PcieTxNPBw.PcieMinBw, minPostfix) + doUpdateTelegraf(fieldMap, descTxCplBW, extInfo.PcieTxCPLBw.PcieMinBw, minPostfix) + doUpdateTelegraf(fieldMap, descRxPBW, extInfo.PcieRxPBw.PcieMinBw, minPostfix) + doUpdateTelegraf(fieldMap, descRxNpBW, extInfo.PcieRxNPBw.PcieMinBw, minPostfix) + doUpdateTelegraf(fieldMap, descRxCplBW, extInfo.PcieRxCPLBw.PcieMinBw, minPostfix) + + doUpdateTelegraf(fieldMap, descTxPBW, extInfo.PcieTxPBw.PcieMaxBw, maxPostfix) + doUpdateTelegraf(fieldMap, descTxNpBW, extInfo.PcieTxNPBw.PcieMaxBw, maxPostfix) + doUpdateTelegraf(fieldMap, descTxCplBW, extInfo.PcieTxCPLBw.PcieMaxBw, maxPostfix) + doUpdateTelegraf(fieldMap, descRxPBW, extInfo.PcieRxPBw.PcieMaxBw, maxPostfix) + doUpdateTelegraf(fieldMap, descRxNpBW, 
// pcieBwLabelVal returns a new slice holding cardLabels followed by pcieBwType.
// The result is always a fresh allocation: appending directly to cardLabels would write into
// the caller's backing array whenever it has spare capacity, so two results built from the
// same cardLabels could end up sharing (and overwriting) their last element.
func pcieBwLabelVal(cardLabels []string, pcieBwType string) []string {
	labels := make([]string, 0, len(cardLabels)+1)
	labels = append(labels, cardLabels...)
	return append(labels, pcieBwType)
}
prometheus.NewMetricWithTimestamp(timestamp, + metricWithPcieBw(cardLabel, descTxCplBW, float64(pcieBwInfo.PcieTxCPLBw.PcieMinBw), minPcieBw)) + ch <- prometheus.NewMetricWithTimestamp(timestamp, + metricWithPcieBw(cardLabel, descRxPBW, float64(pcieBwInfo.PcieRxPBw.PcieMinBw), minPcieBw)) + ch <- prometheus.NewMetricWithTimestamp(timestamp, + metricWithPcieBw(cardLabel, descRxNpBW, float64(pcieBwInfo.PcieRxNPBw.PcieMinBw), minPcieBw)) + ch <- prometheus.NewMetricWithTimestamp(timestamp, + metricWithPcieBw(cardLabel, descRxCplBW, float64(pcieBwInfo.PcieRxCPLBw.PcieMinBw), minPcieBw)) +} + +func updateMaxPcieBwInfo(ch chan<- prometheus.Metric, timestamp time.Time, pcieBwInfo *common.PCIEBwStat, + cardLabel []string) { + ch <- prometheus.NewMetricWithTimestamp(timestamp, + metricWithPcieBw(cardLabel, descTxPBW, float64(pcieBwInfo.PcieTxPBw.PcieMaxBw), maxPcieBw)) + ch <- prometheus.NewMetricWithTimestamp(timestamp, + metricWithPcieBw(cardLabel, descTxNpBW, float64(pcieBwInfo.PcieTxNPBw.PcieMaxBw), maxPcieBw)) + ch <- prometheus.NewMetricWithTimestamp(timestamp, + metricWithPcieBw(cardLabel, descTxCplBW, float64(pcieBwInfo.PcieTxCPLBw.PcieMaxBw), maxPcieBw)) + ch <- prometheus.NewMetricWithTimestamp(timestamp, + metricWithPcieBw(cardLabel, descRxPBW, float64(pcieBwInfo.PcieRxPBw.PcieMaxBw), maxPcieBw)) + ch <- prometheus.NewMetricWithTimestamp(timestamp, + metricWithPcieBw(cardLabel, descRxNpBW, float64(pcieBwInfo.PcieRxNPBw.PcieMaxBw), maxPcieBw)) + ch <- prometheus.NewMetricWithTimestamp(timestamp, + metricWithPcieBw(cardLabel, descRxCplBW, float64(pcieBwInfo.PcieRxCPLBw.PcieMaxBw), maxPcieBw)) +} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_roce.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_roce.go new file mode 100644 index 0000000..b1d307c --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_roce.go @@ -0,0 +1,263 @@ +/* Copyright(C) 2025-2025. 
// Keys used by getMainStatInfo to read individual counters out of the statistics map
// that RoceCollector.CollectToCache obtains from hccn.GetNPUStatInfo.
const (
	// mac counters
	macRxMacPauseNum = "mac_rx_mac_pause_num"
	macTxMacPauseNum = "mac_tx_mac_pause_num"
	macRxPfcPktNum   = "mac_rx_pfc_pkt_num"
	macTxPfcPktNum   = "mac_tx_pfc_pkt_num"
	macRxBadPktNum   = "mac_rx_bad_pkt_num"
	macTxBadPktNum   = "mac_tx_bad_pkt_num"
	// roce packet counters
	roCERxAllPktNum = "roce_rx_all_pkt_num"
	roCETxAllPktNum = "roce_tx_all_pkt_num"
	roCERxErrPktNum = "roce_rx_err_pkt_num"
	roCETxErrPktNum = "roce_tx_err_pkt_num"
	roCERxCnpPktNum = "roce_rx_cnp_pkt_num"
	roCETxCnpPktNum = "roce_tx_cnp_pkt_num"
	// mac bad octet counters
	macRxBadOctNum = "mac_rx_bad_oct_num"
	macTxBadOctNum = "mac_tx_bad_oct_num"
	// roce error and retry counters
	roCEUnexpectedAckNum   = "roce_unexpected_ack_num"
	roCEOutOfOrderNum      = "roce_out_of_order_num"
	roCEVerificationErrNum = "roce_verification_err_num"
	roCEQpStatusErrNum     = "roce_qp_status_err_num"
	roCENewPktRtyNum       = "roce_new_pkt_rty_num"
	roCEEcnDBNum           = "roce_ecn_db_num"
	// mac fcs error counter
	macRXFcsErrPktNum = "mac_rx_fcs_err_pkt_num"
)
mac-rx-pause-num") + descMacTxPauseNum = colcommon.BuildDesc("npu_chip_mac_tx_pause_num", "npu interface receive mac-tx-pause-num") + descMacRxPfcPktNum = colcommon.BuildDesc("npu_chip_mac_rx_pfc_pkt_num", "npu interface receive mac-rx-pfc-pkt-num") + descMacTxPfcPktNum = colcommon.BuildDesc("npu_chip_mac_tx_pfc_pkt_num", "npu interface receive mac-tx-pfc-pkt-num") + descMacRxBadPktNum = colcommon.BuildDesc("npu_chip_mac_rx_bad_pkt_num", "npu interface receive mac-rx-bad-pkt-num") + descMacTxBadPktNum = colcommon.BuildDesc("npu_chip_mac_tx_bad_pkt_num", "npu interface receive mac-tx-bad-pkt-num") + descMacTxBadOctNum = colcommon.BuildDesc("npu_chip_mac_tx_bad_oct_num", "npu interface receive mac-tx-bad-oct-num") + descMacRxBadOctNum = colcommon.BuildDesc("npu_chip_mac_rx_bad_oct_num", "npu interface receive mac-rx-bad-oct-num") + + descRxFCSNum = colcommon.BuildDesc("npu_chip_info_rx_fcs_num", "the npu network fcs receive number") + descRxECNNum = colcommon.BuildDesc("npu_chip_info_rx_ecn_num", "the npu network ecn receive number") + + // roce + descRoceRxAllPktNum = colcommon.BuildDesc("npu_chip_roce_rx_all_pkt_num", "npu interface receive roce-rx-all-pkt-num") + descRoceTxAllPktNum = colcommon.BuildDesc("npu_chip_roce_tx_all_pkt_num", "npu interface receive roce-tx-all-pkt-num") + descRoceRxErrPktNum = colcommon.BuildDesc("npu_chip_roce_rx_err_pkt_num", "npu interface receive roce-rx-err-pkt-num") + descRoceTxErrPktNum = colcommon.BuildDesc("npu_chip_roce_tx_err_pkt_num", "npu interface receive roce-tx-err-pkt-num") + descRoceRxCnpPktNum = colcommon.BuildDesc("npu_chip_roce_rx_cnp_pkt_num", "npu interface receive roce-rx-cnp-pkt-num") + descRoceTxCnpPktNum = colcommon.BuildDesc("npu_chip_roce_tx_cnp_pkt_num", "npu interface receive roce-tx-cnp-pkt-num") + + descRoceNewPktRtyNum = colcommon.BuildDesc("npu_chip_roce_new_pkt_rty_num", + "npu interface receive roce-new-pkt-rty-num") + descRoceOutOfOrderNum = colcommon.BuildDesc("npu_chip_roce_out_of_order_num", + 
"the npu interface receive roce-out-of-order-num") + descRoceQpStatusErrNum = colcommon.BuildDesc("npu_chip_roce_qp_status_err_num", + "the npu interface receive roce-qp-status-err-num") + descRoceUnexpectedAcktNum = colcommon.BuildDesc("npu_chip_roce_unexpected_ack_num", + "the npu interface receive roce-unexpected-ack-num") + descRoceVerificationErrNum = colcommon.BuildDesc("npu_chip_roce_verification_err_num", + "the npu interface receive roce-verification-err-num") +) + +type roceCache struct { + chip colcommon.HuaWeiAIChip + timestamp time.Time + // extInfo the statistics about packets + extInfo *common.StatInfo +} + +// RoceCollector collect roce info +type RoceCollector struct { + colcommon.MetricsCollectorAdapter +} + +// IsSupported check whether the collector is supported +func (c *RoceCollector) IsSupported(n *colcommon.NpuCollector) bool { + isSupport := n.Dmgr.IsTrainingCard() + logForUnSupportDevice(isSupport, n.Dmgr.GetDevType(), colcommon.GetCacheKey(c), + "only training card supports network related info") + return isSupport +} + +// Describe description of the metric +func (c *RoceCollector) Describe(ch chan<- *prometheus.Desc) { + + // mac + ch <- descMacRxPauseNum + ch <- descMacTxPauseNum + ch <- descMacRxPfcPktNum + ch <- descMacTxPfcPktNum + ch <- descMacRxBadPktNum + ch <- descMacTxBadPktNum + ch <- descMacTxBadOctNum + ch <- descMacRxBadOctNum + ch <- descRxFCSNum + + // roce + ch <- descRoceRxAllPktNum + ch <- descRoceTxAllPktNum + ch <- descRoceRxErrPktNum + ch <- descRoceTxErrPktNum + ch <- descRoceRxCnpPktNum + ch <- descRoceTxCnpPktNum + ch <- descRoceNewPktRtyNum + ch <- descRoceUnexpectedAcktNum + ch <- descRoceOutOfOrderNum + ch <- descRoceVerificationErrNum + ch <- descRoceQpStatusErrNum + ch <- descRxECNNum + +} + +// CollectToCache collect the metric to cache +func (c *RoceCollector) CollectToCache(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) { + for _, chip := range chipList { + statInfo, err := 
hccn.GetNPUStatInfo(chip.DeviceID) + if err != nil { + logErrMetricsWithLimit(colcommon.DomainForRoce, chip.LogicID, err) + return + } + hwlog.ResetErrCnt(colcommon.DomainForRoce, chip.LogicID) + c.LocalCache.Store(chip.PhyId, roceCache{chip: chip, timestamp: time.Now(), extInfo: getMainStatInfo(statInfo)}) + } + colcommon.UpdateCache[roceCache](n, colcommon.GetCacheKey(c), &c.LocalCache) + +} + +// UpdatePrometheus update prometheus metrics +func (c *RoceCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) { + + updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache roceCache, cardLabel []string) { + statInfo := cache.extInfo + if statInfo == nil { + return + } + updateStatInfoOfMac(ch, cache.timestamp, statInfo, cardLabel) + updateStatInfoOfRoCE(ch, cache.timestamp, statInfo, cardLabel) + } + updateFrame[roceCache](colcommon.GetCacheKey(c), n, containerMap, chips, updateSingleChip) + +} + +// UpdateTelegraf update telegraf metrics +func (c *RoceCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *colcommon.NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} { + + caches := colcommon.GetInfoFromCache[roceCache](n, colcommon.GetCacheKey(c)) + for _, chip := range chips { + cache, ok := caches[chip.PhyId] + if !ok { + continue + } + fieldMap := getFieldMap(fieldsMap, cache.chip.LogicID) + + extInfo := cache.extInfo + if extInfo == nil { + continue + } + doUpdateTelegraf(fieldMap, descMacRxPauseNum, extInfo.MacRxPauseNum, "") + doUpdateTelegraf(fieldMap, descMacTxPauseNum, extInfo.MacTxPauseNum, "") + doUpdateTelegraf(fieldMap, descMacRxPfcPktNum, extInfo.MacRxPfcPktNum, "") + doUpdateTelegraf(fieldMap, descMacTxPfcPktNum, extInfo.MacTxPfcPktNum, "") + doUpdateTelegraf(fieldMap, descMacRxBadPktNum, extInfo.MacRxBadPktNum, "") + 
doUpdateTelegraf(fieldMap, descMacTxBadPktNum, extInfo.MacTxBadPktNum, "") + doUpdateTelegraf(fieldMap, descMacTxBadOctNum, extInfo.MacTxBadOctNum, "") + doUpdateTelegraf(fieldMap, descMacRxBadOctNum, extInfo.MacRxBadOctNum, "") + doUpdateTelegraf(fieldMap, descRxFCSNum, extInfo.MacRXFcsErrPktNum, "") + + doUpdateTelegraf(fieldMap, descRoceRxAllPktNum, extInfo.RoceRxAllPktNum, "") + doUpdateTelegraf(fieldMap, descRoceTxAllPktNum, extInfo.RoceTxAllPktNum, "") + doUpdateTelegraf(fieldMap, descRoceRxErrPktNum, extInfo.RoceRxErrPktNum, "") + doUpdateTelegraf(fieldMap, descRoceTxErrPktNum, extInfo.RoceTxErrPktNum, "") + doUpdateTelegraf(fieldMap, descRoceRxCnpPktNum, extInfo.RoceRxCnpPktNum, "") + doUpdateTelegraf(fieldMap, descRoceTxCnpPktNum, extInfo.RoceTxCnpPktNum, "") + doUpdateTelegraf(fieldMap, descRoceNewPktRtyNum, extInfo.RoceNewPktRtyNum, "") + doUpdateTelegraf(fieldMap, descRoceUnexpectedAcktNum, extInfo.RoceUnexpectedAckNum, "") + doUpdateTelegraf(fieldMap, descRoceOutOfOrderNum, extInfo.RoceOutOfOrderNum, "") + doUpdateTelegraf(fieldMap, descRoceVerificationErrNum, extInfo.RoceVerificationErrNum, "") + doUpdateTelegraf(fieldMap, descRoceQpStatusErrNum, extInfo.RoceQpStatusErrNum, "") + doUpdateTelegraf(fieldMap, descRxECNNum, extInfo.RoceEcnDBNum, "") + } + return fieldsMap +} +func getMainStatInfo(statInfo map[string]int) *common.StatInfo { + mainStatInfo := common.StatInfo{} + mainStatInfo.MacRxPauseNum = float64(statInfo[macRxMacPauseNum]) + mainStatInfo.MacTxPauseNum = float64(statInfo[macTxMacPauseNum]) + mainStatInfo.MacRxPfcPktNum = float64(statInfo[macRxPfcPktNum]) + mainStatInfo.MacTxPfcPktNum = float64(statInfo[macTxPfcPktNum]) + mainStatInfo.MacRxBadPktNum = float64(statInfo[macRxBadPktNum]) + mainStatInfo.MacTxBadPktNum = float64(statInfo[macTxBadPktNum]) + mainStatInfo.RoceRxAllPktNum = float64(statInfo[roCERxAllPktNum]) + mainStatInfo.RoceTxAllPktNum = float64(statInfo[roCETxAllPktNum]) + mainStatInfo.RoceRxErrPktNum = 
float64(statInfo[roCERxErrPktNum]) + mainStatInfo.RoceTxErrPktNum = float64(statInfo[roCETxErrPktNum]) + mainStatInfo.RoceRxCnpPktNum = float64(statInfo[roCERxCnpPktNum]) + mainStatInfo.RoceTxCnpPktNum = float64(statInfo[roCETxCnpPktNum]) + mainStatInfo.MacRxBadOctNum = float64(statInfo[macRxBadOctNum]) + mainStatInfo.MacTxBadOctNum = float64(statInfo[macTxBadOctNum]) + mainStatInfo.RoceUnexpectedAckNum = float64(statInfo[roCEUnexpectedAckNum]) + mainStatInfo.RoceOutOfOrderNum = float64(statInfo[roCEOutOfOrderNum]) + mainStatInfo.RoceVerificationErrNum = float64(statInfo[roCEVerificationErrNum]) + mainStatInfo.RoceQpStatusErrNum = float64(statInfo[roCEQpStatusErrNum]) + mainStatInfo.RoceNewPktRtyNum = float64(statInfo[roCENewPktRtyNum]) + mainStatInfo.RoceEcnDBNum = float64(statInfo[roCEEcnDBNum]) + mainStatInfo.MacRXFcsErrPktNum = float64(statInfo[macRXFcsErrPktNum]) + + return &mainStatInfo +} + +func updateStatInfoOfMac(ch chan<- prometheus.Metric, ts time.Time, statInfo *common.StatInfo, cardLabel []string) { + doUpdateMetric(ch, ts, statInfo.MacRxPauseNum, cardLabel, descMacRxPauseNum) + doUpdateMetric(ch, ts, statInfo.MacTxPauseNum, cardLabel, descMacTxPauseNum) + doUpdateMetric(ch, ts, statInfo.MacRxPfcPktNum, cardLabel, descMacRxPfcPktNum) + doUpdateMetric(ch, ts, statInfo.MacTxPfcPktNum, cardLabel, descMacTxPfcPktNum) + doUpdateMetric(ch, ts, statInfo.MacRxBadPktNum, cardLabel, descMacRxBadPktNum) + doUpdateMetric(ch, ts, statInfo.MacTxBadPktNum, cardLabel, descMacTxBadPktNum) + doUpdateMetric(ch, ts, statInfo.MacTxBadOctNum, cardLabel, descMacTxBadOctNum) + doUpdateMetric(ch, ts, statInfo.MacRxBadOctNum, cardLabel, descMacRxBadOctNum) + doUpdateMetric(ch, ts, statInfo.MacRXFcsErrPktNum, cardLabel, descRxFCSNum) +} + +func updateStatInfoOfRoCE(ch chan<- prometheus.Metric, ts time.Time, statInfo *common.StatInfo, cardLabel []string) { + doUpdateMetric(ch, ts, statInfo.RoceRxAllPktNum, cardLabel, descRoceRxAllPktNum) + doUpdateMetric(ch, ts, 
statInfo.RoceTxAllPktNum, cardLabel, descRoceTxAllPktNum) + doUpdateMetric(ch, ts, statInfo.RoceRxErrPktNum, cardLabel, descRoceRxErrPktNum) + doUpdateMetric(ch, ts, statInfo.RoceTxErrPktNum, cardLabel, descRoceTxErrPktNum) + doUpdateMetric(ch, ts, statInfo.RoceRxCnpPktNum, cardLabel, descRoceRxCnpPktNum) + doUpdateMetric(ch, ts, statInfo.RoceTxCnpPktNum, cardLabel, descRoceTxCnpPktNum) + doUpdateMetric(ch, ts, statInfo.RoceNewPktRtyNum, cardLabel, descRoceNewPktRtyNum) + doUpdateMetric(ch, ts, statInfo.RoceUnexpectedAckNum, cardLabel, descRoceUnexpectedAcktNum) + doUpdateMetric(ch, ts, statInfo.RoceOutOfOrderNum, cardLabel, descRoceOutOfOrderNum) + doUpdateMetric(ch, ts, statInfo.RoceVerificationErrNum, cardLabel, descRoceVerificationErrNum) + doUpdateMetric(ch, ts, statInfo.RoceQpStatusErrNum, cardLabel, descRoceQpStatusErrNum) + doUpdateMetric(ch, ts, statInfo.RoceEcnDBNum, cardLabel, descRxECNNum) + doUpdateMetric(ch, ts, statInfo.RoceEcnDBNum, cardLabel, descRxECNNum) +} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_sio.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_sio.go new file mode 100644 index 0000000..918469c --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_sio.go @@ -0,0 +1,120 @@ +/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
// sioCache is the per-chip entry that SioCollector.CollectToCache stores in the local cache
type sioCache struct {
	// chip is the chip the entry was collected for
	chip colcommon.HuaWeiAIChip
	// timestamp is the time the entry was stored
	timestamp time.Time
	// extInfo sio status between dies, only support super pod
	extInfo *common.SioCrcErrStatisticInfo
}
colcommon.UpdateCache[sioCache](n, colcommon.GetCacheKey(c), &c.LocalCache) +} + +// UpdatePrometheus update prometheus metrics +func (c *SioCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) { + + updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache sioCache, cardLabel []string) { + extInfo := cache.extInfo + if extInfo == nil { + return + } + doUpdateMetric(ch, cache.timestamp, extInfo.TxErrCnt, cardLabel, descSioCrcTxErrCnt) + doUpdateMetric(ch, cache.timestamp, extInfo.RxErrCnt, cardLabel, descSioCrcRxErrCnt) + } + updateFrame[sioCache](colcommon.GetCacheKey(c), n, containerMap, chips, updateSingleChip) +} + +// UpdateTelegraf update telegraf metrics +func (c *SioCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *colcommon.NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} { + + caches := colcommon.GetInfoFromCache[sioCache](n, colcommon.GetCacheKey(c)) + for _, chip := range chips { + cache, ok := caches[chip.PhyId] + if !ok { + continue + } + fieldMap := getFieldMap(fieldsMap, cache.chip.LogicID) + + extInfo := cache.extInfo + if extInfo == nil { + continue + } + + doUpdateTelegraf(fieldMap, descSioCrcTxErrCnt, extInfo.TxErrCnt, "") + doUpdateTelegraf(fieldMap, descSioCrcRxErrCnt, extInfo.RxErrCnt, "") + } + return fieldsMap +} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_version.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_version.go new file mode 100644 index 0000000..8cb32bd --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_version.go @@ -0,0 +1,56 @@ +/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. 
// VersionCollector collect the exporter version info
type VersionCollector struct {
	common.MetricsCollectorAdapter
}
+} + +// UpdateTelegraf update telegraf metric +func (c *VersionCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *common.NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []common.HuaWeiAIChip) map[string]map[string]interface{} { + + if fieldsMap[common.GeneralDevTagKey] == nil { + fieldsMap[common.GeneralDevTagKey] = make(map[string]interface{}) + } + doUpdateTelegraf(fieldsMap[common.GeneralDevTagKey], versionInfoDesc, versions.BuildVersion, "") + return fieldsMap +} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_vnpu.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_vnpu.go new file mode 100644 index 0000000..5117ec9 --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_vnpu.go @@ -0,0 +1,169 @@ +/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package metrics for general collector +package metrics + +import ( + "strconv" + "time" + + "github.com/prometheus/client_golang/prometheus" + + "ascend-common/api" + "ascend-common/devmanager/common" + colcommon "huawei.com/npu-exporter/v6/collector/common" + "huawei.com/npu-exporter/v6/collector/container" + "huawei.com/npu-exporter/v6/utils/logger" +) + +var ( + cardLabelForVNpuName = make([]string, len(colcommon.CardLabel)) + podAiCoreUtilizationRate *prometheus.Desc = nil + podTotalMemory *prometheus.Desc = nil + podUsedMemory *prometheus.Desc = nil +) + +var ( + supportedVnpuDevices = map[string]bool{ + api.Ascend310P: true, + } +) + +const ( + vNpuUUID = "v_dev_id" + aiCoreCnt = "aicore_count" + isVirtual = "is_virtual" +) + +func init() { + // copy CardLabel before appending so the index writes below can never reach its backing array + cardLabelForVNpuName = append(append([]string(nil), colcommon.CardLabel...), isVirtual) + cardLabelForVNpuName[2] = vNpuUUID + cardLabelForVNpuName[3] = aiCoreCnt + + podAiCoreUtilizationRate = colcommon.BuildDescWithLabel("vnpu_pod_aicore_utilization", + "the vnpu aicore utilization rate, unit is '%'", cardLabelForVNpuName) + podTotalMemory = colcommon.BuildDescWithLabel("vnpu_pod_total_memory", + "the vnpu total memory on pod, unit is 'KB'", cardLabelForVNpuName) + podUsedMemory = colcommon.BuildDescWithLabel("vnpu_pod_used_memory", + "the vnpu used memory on pod, unit is 'KB'", cardLabelForVNpuName) +} + +// VnpuCollector collect vnpu info +type VnpuCollector struct { + colcommon.MetricsCollectorAdapter +} + +// IsSupported reports whether the device type from n.Dmgr.GetDevType is listed in supportedVnpuDevices +func (c *VnpuCollector) IsSupported(n *colcommon.NpuCollector) bool { + isSupport := supportedVnpuDevices[n.Dmgr.GetDevType()] + logForUnSupportDevice(isSupport, n.Dmgr.GetDevType(), colcommon.GetCacheKey(c), "") + return isSupport +} + +// Describe sends the three vnpu metric descriptors to ch +func (c *VnpuCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- podAiCoreUtilizationRate + ch <- podTotalMemory + ch <- podUsedMemory +} + +// CollectToCache collect the metric to cache +func
(c *VnpuCollector) CollectToCache(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) { + for _, chip := range chipList { + cache := &chipCache{ + chip: chip, + } + cache.timestamp = time.Now() + c.LocalCache.Store(chip.PhyId, *cache) + } + colcommon.UpdateCache[chipCache](n, colcommon.GetCacheKey(c), &c.LocalCache) +} + +// UpdatePrometheus update prometheus metrics +func (c *VnpuCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) { + + updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache chipCache, cardLabel []string) { + if chipWithVnpu.VDevActivityInfo == nil { + return + } + vDevActivityInfo := chipWithVnpu.VDevActivityInfo + if !common.IsValidVDevID(vDevActivityInfo.VDevID) { + return + } + containerName := getContainerNameArray(containerMap[int32(vDevActivityInfo.VDevID)]) + if len(containerName) != colcommon.ContainerNameLen { + return + } + cardLabel = getPodDisplayInfo(&chipWithVnpu, containerName) + doUpdateMetric(ch, cache.timestamp, vDevActivityInfo.VDevAiCoreRate, cardLabel, podAiCoreUtilizationRate) + doUpdateMetric(ch, cache.timestamp, vDevActivityInfo.VDevTotalMem, cardLabel, podTotalMemory) + doUpdateMetric(ch, cache.timestamp, vDevActivityInfo.VDevUsedMem, cardLabel, podUsedMemory) + } + + updateFrame[chipCache](colcommon.GetCacheKey(c), n, containerMap, chips, updateSingleChip) + +} + +// UpdateTelegraf update telegraf metrics +func (c *VnpuCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *colcommon.NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} { + + caches := colcommon.GetInfoFromCache[chipCache](n, colcommon.GetCacheKey(c)) + for _, chip := range chips { + cache, ok := caches[chip.PhyId] + if !ok { + continue + } + + vDevActivityInfo := chip.VDevActivityInfo + if vDevActivityInfo == nil || 
!common.IsValidVDevID(vDevActivityInfo.VDevID) { + continue + } + + devTagKey := strconv.Itoa(int(cache.chip.LogicID)) + "_" + strconv.Itoa(int(vDevActivityInfo.VDevID)) + + if fieldsMap[devTagKey] == nil { + fieldsMap[devTagKey] = make(map[string]interface{}) + } + + doUpdateTelegraf(fieldsMap[devTagKey], podAiCoreUtilizationRate, vDevActivityInfo.VDevAiCoreRate, "") + doUpdateTelegraf(fieldsMap[devTagKey], podTotalMemory, vDevActivityInfo.VDevTotalMem, "") + doUpdateTelegraf(fieldsMap[devTagKey], podUsedMemory, vDevActivityInfo.VDevUsedMem, "") + } + return fieldsMap +} + +func getPodDisplayInfo(chip *colcommon.HuaWeiAIChip, containerName []string) []string { + if len(containerName) != colcommon.ContainerNameLen { + logger.Errorf("container name length %v is not %v", len(containerName), colcommon.ContainerNameLen) + return nil + } + + chipInfo := common.DeepCopyChipInfo(chip.ChipInfo) + vDevActivityInfo := common.DeepCopyVDevActivityInfo(chip.VDevActivityInfo) + + return []string{ + strconv.Itoa(int(chip.DeviceID)), + common.GetNpuName(chipInfo), + strconv.Itoa(int(vDevActivityInfo.VDevID)), + strconv.FormatFloat(vDevActivityInfo.VDevAiCore, 'f', colcommon.DecimalPlaces, colcommon.BitSize), + containerName[colcommon.NameSpaceIdx], + containerName[colcommon.PodNameIdx], + containerName[colcommon.ConNameIdx], + strconv.FormatBool(vDevActivityInfo.IsVirtualDev), + } +} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_vnpu_test.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_vnpu_test.go new file mode 100644 index 0000000..d57ade0 --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_vnpu_test.go @@ -0,0 +1,202 @@ +/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package metrics for general collector +package metrics + +import ( + "strconv" + "testing" + + "github.com/agiledragon/gomonkey/v2" + "github.com/prometheus/client_golang/prometheus" + "github.com/smartystreets/goconvey/convey" + + "ascend-common/api" + "ascend-common/devmanager/common" + colcommon "huawei.com/npu-exporter/v6/collector/common" + "huawei.com/npu-exporter/v6/collector/container" +) + +const ( + vnpuMetricNum = 3 + validVnpuID = 100 + invalidVnpuID = 1 +) + +// TestVnpuCollectorIsSupported test VnpuCollector IsSupported +func TestVnpuCollectorIsSupported(t *testing.T) { + n := mockNewNpuCollector() + cases := []testCase{ + buildTestCase("VnpuCollector: testIsSupported on Ascend310P", &VnpuCollector{}, api.Ascend310P, true), + buildTestCase("VnpuCollector: testIsSupported on other type", &VnpuCollector{}, "OTHER", false), + } + + for _, c := range cases { + patches := gomonkey.NewPatches() + convey.Convey(c.name, t, func() { + defer patches.Reset() + patches.ApplyMethodReturn(n.Dmgr, "GetDevType", c.deviceType) + isSupported := c.collectorType.IsSupported(n) + convey.So(isSupported, convey.ShouldEqual, c.expectValue) + }) + } +} + +func TestVnpuCollectorDescribe(t *testing.T) { + collector := &VnpuCollector{} + convey.Convey("TestVnpuCollectorDescribe", t, func() { + ch := make(chan *prometheus.Desc, vnpuMetricNum) + collector.Describe(ch) + convey.So(len(ch), convey.ShouldEqual, vnpuMetricNum) + close(ch) + }) +} + +func TestVnpuCollectorCollectToCache(t *testing.T) { + collector := &VnpuCollector{} + n := mockNewNpuCollector() + testChips := 
[]colcommon.HuaWeiAIChip{{PhyId: 0}} + + convey.Convey("TestVnpuCollectorCollectToCache", t, func() { + collector.CollectToCache(n, testChips) + cacheInfo := colcommon.GetInfoFromCache[chipCache](n, colcommon.GetCacheKey(collector)) + convey.So(cacheInfo, convey.ShouldNotBeNil) + }) +} + +func TestVnpuCollectorUpdatePrometheus(t *testing.T) { + collector := &VnpuCollector{} + n := mockNewNpuCollector() + containerMap := mockContainerInfo() + + testChips := []colcommon.HuaWeiAIChip{{PhyId: 0}} + collector.CollectToCache(n, testChips) + chip := createValidVnpuChip() + testCases := []struct { + name string + preHandleFunc func() + expectValue int + }{ + {name: "TestVnpuCollectorUpdatePrometheus_effective virtual device scenarios", + preHandleFunc: func() {}, + expectValue: vnpuMetricNum, + }, + {name: "TestVnpuCollectorUpdatePrometheus_there is no container info", + preHandleFunc: func() { + containerMap = map[int32]container.DevicesInfo{} + }, + expectValue: 0, + }, + {name: "TestVnpuCollectorUpdatePrometheus_the vdevid is invalid", + preHandleFunc: func() { + chip.VDevActivityInfo.VDevID = invalidVnpuID + }, + expectValue: 0, + }, + {name: "TestVnpuCollectorUpdatePrometheus_there is no vdev info", + preHandleFunc: func() { + chip.VDevActivityInfo = nil + }, + expectValue: 0, + }, + } + ch := make(chan prometheus.Metric, vnpuMetricNum) + defer close(ch) + for _, tt := range testCases { + convey.Convey(tt.name, t, func() { + tt.preHandleFunc() + collector.UpdatePrometheus(ch, n, containerMap, []colcommon.HuaWeiAIChip{chip}) + convey.So(len(ch), convey.ShouldEqual, tt.expectValue) + //clean ch + for { + if len(ch) == 0 { + break + } + <-ch + } + }) + } +} + +func mockContainerInfo() map[int32]container.DevicesInfo { + containerMap := map[int32]container.DevicesInfo{ + validVnpuID: { + Devices: []int{0}, + ID: strconv.Itoa(validVnpuID), + Name: "nsName_podName_ctrName", + }, + } + return containerMap +} + +func TestVnpuCollectorUpdateTelegraf(t *testing.T) { + collector 
:= &VnpuCollector{} + n := mockNewNpuCollector() + containerMap := mockContainerInfo() + testChips := []colcommon.HuaWeiAIChip{{PhyId: 0}} + collector.CollectToCache(n, testChips) + chip := createValidVnpuChip() + convey.Convey("TestVnpuCollectorUpdateTelegraf", t, func() { + convey.Convey("effective virtual device scenarios", func() { + chipsWithVnpu := []colcommon.HuaWeiAIChip{chip} + newFieldMaps := collector.UpdateTelegraf(make(map[string]map[string]interface{}), n, containerMap, chipsWithVnpu) + convey.So(len(newFieldMaps), convey.ShouldEqual, 1) + convey.So(len(newFieldMaps["0_100"]), convey.ShouldEqual, vnpuMetricNum) + }) + convey.Convey("there is no container info", func() { + chip.VDevActivityInfo = nil + chipsWithVnpu := []colcommon.HuaWeiAIChip{chip} + containerMap = map[int32]container.DevicesInfo{} + newFieldMaps := collector.UpdateTelegraf(make(map[string]map[string]interface{}), n, containerMap, chipsWithVnpu) + convey.So(len(newFieldMaps), convey.ShouldEqual, 0) + }) + + }) +} + +func TestGetPodDisplayInfo(t *testing.T) { + const num8 = 8 + convey.Convey("TestGetPodDisplayInfo", t, func() { + chip := createValidVnpuChip() + convey.Convey("valid container information", func() { + containerNames := []string{"namespace", "pod-name", "container-name"} + labels := getPodDisplayInfo(&chip, containerNames) + convey.Convey("should return 8 metrics", func() { + convey.So(len(labels), convey.ShouldEqual, num8) + convey.So(labels[len(labels)-1], convey.ShouldEqual, "true") + }) + }) + + convey.Convey("invalid container information", func() { + containerNames := []string{"short"} + labels := getPodDisplayInfo(&chip, containerNames) + convey.Convey("should return nil", func() { + convey.So(labels, convey.ShouldBeNil) + }) + }) + }) +} + +func createValidVnpuChip() colcommon.HuaWeiAIChip { + chip := createChip() + chip.VDevActivityInfo = &common.VDevActivityInfo{ + VDevID: validVnpuID, + VDevAiCore: 1, + VDevTotalMem: 1, + VDevUsedMem: 1, + IsVirtualDev: true, + 
} + return chip +} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_test.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_test.go new file mode 100644 index 0000000..7524c68 --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/metrics/collector_test.go @@ -0,0 +1,548 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package metrics for general collector +package metrics + +import ( + "strconv" + "sync" + "testing" + "time" + + "github.com/agiledragon/gomonkey/v2" + "github.com/prometheus/client_golang/prometheus" + "github.com/smartystreets/goconvey/convey" + + "ascend-common/api" + "ascend-common/common-utils/hwlog" + "ascend-common/devmanager" + "ascend-common/devmanager/common" + "ascend-common/devmanager/hccn" + colcommon "huawei.com/npu-exporter/v6/collector/common" + "huawei.com/npu-exporter/v6/collector/container" + "huawei.com/npu-exporter/v6/utils/logger" +) + +const ( + maxMetricsCount = 2000 + num5 = 5 + mockContainerName = "mockContainerName" + maxChipNum int32 = 8 +) + +var ( + collectorChain []colcommon.MetricsCollector +) + +// TestDescribe test Describe +func TestDescribe(t *testing.T) { + + convey.Convey("test prometheus desc ", t, func() { + ch := make(chan *prometheus.Desc, maxMetricsCount) + for _, c := range collectorChain { + c.Describe(ch) + } + t.Logf("Describe len(ch):%v", len(ch)) + convey.So(ch, 
convey.ShouldNotBeEmpty) + }) +} + +type testCase struct { + name string + collectorType colcommon.MetricsCollector + deviceType string + expectValue bool +} + +func buildTestCase(name string, collectorType colcommon.MetricsCollector, deviceType string, + expectValue bool) testCase { + return testCase{ + name: name, + collectorType: collectorType, + deviceType: deviceType, + expectValue: expectValue, + } +} + +// testIsSupported test IsSupported +func TestIsSupported(t *testing.T) { + n := mockNewNpuCollector() + cases := []testCase{ + buildTestCase("DdrCollector: testIsSupported on Ascend310", &DdrCollector{}, api.Ascend310, true), + buildTestCase("DdrCollector: testIsSupported on Ascend310P", &DdrCollector{}, api.Ascend310P, true), + buildTestCase("DdrCollector: testIsSupported on Ascend910", &DdrCollector{}, api.Ascend910, true), + buildTestCase("DdrCollector: testIsSupported on Ascend910B", &DdrCollector{}, api.Ascend910B, false), + buildTestCase("DdrCollector: testIsSupported on Ascend910A3", &DdrCollector{}, api.Ascend910A3, false), + + buildTestCase("HccsCollector: testIsSupported on Ascend310", &HccsCollector{}, api.Ascend310, false), + buildTestCase("HccsCollector: testIsSupported on Ascend310P", &HccsCollector{}, api.Ascend310P, false), + buildTestCase("HccsCollector: testIsSupported on Ascend910", &HccsCollector{}, api.Ascend910, false), + buildTestCase("HccsCollector: testIsSupported on Ascend910B", &HccsCollector{}, api.Ascend910B, true), + buildTestCase("HccsCollector: testIsSupported on Ascend910A3", &HccsCollector{}, api.Ascend910A3, true), + + buildTestCase("SioCollector: testIsSupported on Ascend310", &SioCollector{}, api.Ascend310, false), + buildTestCase("SioCollector: testIsSupported on Ascend310P", &SioCollector{}, api.Ascend310P, false), + buildTestCase("SioCollector: testIsSupported on Ascend910", &SioCollector{}, api.Ascend910, false), + buildTestCase("SioCollector: testIsSupported on Ascend910B", &SioCollector{}, api.Ascend910B, false), + 
buildTestCase("SioCollector: testIsSupported on Ascend910A3", &SioCollector{}, api.Ascend910A3, true), + + buildTestCase("VnpuCollector: testIsSupported on Ascend310", &VnpuCollector{}, api.Ascend310, false), + buildTestCase("VnpuCollector: testIsSupported on Ascend310P", &VnpuCollector{}, api.Ascend310P, true), + buildTestCase("VnpuCollector: testIsSupported on Ascend910", &VnpuCollector{}, api.Ascend910, false), + buildTestCase("VnpuCollector: testIsSupported on Ascend910B", &VnpuCollector{}, api.Ascend910B, false), + buildTestCase("VnpuCollector: testIsSupported on Ascend910A3", &VnpuCollector{}, api.Ascend910A3, false), + } + + for _, c := range cases { + patches := gomonkey.NewPatches() + convey.Convey(c.name, t, func() { + defer patches.Reset() + patches.ApplyMethodReturn(n.Dmgr, "GetDevType", c.deviceType) + isSupported := c.collectorType.IsSupported(n) + convey.So(isSupported, convey.ShouldEqual, c.expectValue) + }) + } +} + +// TestIsSupported2 test IsSupported +func TestIsSupported2(t *testing.T) { + n := mockNewNpuCollector() + convey.Convey("TestIsSupported ", t, func() { + for _, c := range collectorChain { + c.IsSupported(n) + } + }) + +} + +// TestCollectToCache test CollectToCache +func TestCollectToCache(t *testing.T) { + n := mockNewNpuCollector() + + convey.Convey("TestCollectToCache", t, func() { + + patches := gomonkey.NewPatches() + defer patches.Reset() + patches.ApplyMethodReturn(n.Dmgr, "GetDeviceMemoryInfo", mockMemoryInfo(), nil) + patches.ApplyMethodReturn(n.Dmgr, "GetDeviceHbmInfo", mockHbmAggregateInfo().HbmInfo, nil) + patches.ApplyMethodReturn(n.Dmgr, "GetDeviceEccInfo", mockHbmAggregateInfo().ECCInfo, nil) + patches.ApplyMethodReturn(n.Dmgr, "GetHccsStatisticInfo", mockHccsStaticsInfo(), nil) + patches.ApplyMethodReturn(n.Dmgr, "GetHccsStatisticInfoInU64", mockHccsStaticsInfo(), nil) + patches.ApplyMethodReturn(n.Dmgr, "GetHccsBandwidthInfo", mockHccsBWInfo(), nil) + patches.ApplyMethodReturn(n.Dmgr, "GetPCIEBandwidth", 
mockPcieInfo(), nil) + patches.ApplyMethodReturn(n.Dmgr, "GetSioInfo", mockSioInfo(), nil) + patches.ApplyFuncReturn(hccn.GetNPULinkStatus, "UP", nil) + patches.ApplyFuncReturn(hccn.GetNPUInterfaceTraffic, float64(0), float64(0), nil) + patches.ApplyFuncReturn(hccn.GetNPULinkUpNum, 0, nil) + patches.ApplyFuncReturn(hccn.GetNPULinkSpeed, 0, nil) + patches.ApplyFuncReturn(hccn.GetNPUOpticalInfo, mockOpticalInfo(), nil) + patches.ApplyFuncReturn(hccn.GetNPUStatInfo, mockRoceInfoMap(), nil) + patches.ApplyMethodReturn(n.Dmgr, "GetDeviceFrequency", uint32(0), nil) + patches.ApplyMethodReturn(n.Dmgr, "GetDeviceTemperature", int32(0), nil) + patches.ApplyMethodReturn(n.Dmgr, "GetDeviceVoltage", float32(0), nil) + patches.ApplyMethodReturn(n.Dmgr, "GetDeviceAllErrorCode", int32(1), []int64{0}, nil) + patches.ApplyMethodReturn(n.Dmgr, "GetDeviceHealth", uint32(0), nil) + patches.ApplyMethodReturn(n.Dmgr, "GetDevicePowerInfo", float32(0), nil) + patches.ApplyMethodReturn(n.Dmgr, "GetDeviceUtilizationRate", uint32(0), nil) + patches.ApplyMethodReturn(n.Dmgr, "GetDevProcessInfo", mockProcessInfo(), nil) + + chips := mockGetNPUChipList() + for _, c := range collectorChain { + c.PreCollect(n, chips) + c.CollectToCache(n, chips) + } + + convey.So(colcommon.GetInfoFromCache[ddrCache](n, colcommon.GetCacheKey(&DdrCollector{})), + convey.ShouldNotBeEmpty) + convey.So(colcommon.GetInfoFromCache[hbmCache](n, colcommon.GetCacheKey(&HbmCollector{})), + convey.ShouldNotBeEmpty) + convey.So(colcommon.GetInfoFromCache[hccsCache](n, colcommon.GetCacheKey(&HccsCollector{})), + convey.ShouldNotBeEmpty) + convey.So(colcommon.GetInfoFromCache[netInfoCache](n, colcommon.GetCacheKey(&NetworkCollector{})), + convey.ShouldNotBeEmpty) + convey.So(colcommon.GetInfoFromCache[chipCache](n, colcommon.GetCacheKey(&BaseInfoCollector{})), + convey.ShouldNotBeEmpty) + convey.So(colcommon.GetInfoFromCache[opticalCache](n, colcommon.GetCacheKey(&OpticalCollector{})), + convey.ShouldNotBeEmpty) + 
convey.So(colcommon.GetInfoFromCache[pcieCache](n, colcommon.GetCacheKey(&PcieCollector{})), + convey.ShouldNotBeEmpty) + convey.So(colcommon.GetInfoFromCache[roceCache](n, colcommon.GetCacheKey(&RoceCollector{})), + convey.ShouldNotBeEmpty) + convey.So(colcommon.GetInfoFromCache[sioCache](n, colcommon.GetCacheKey(&SioCollector{})), + convey.ShouldNotBeEmpty) + + }) +} + +// TestUpdatePrometheus test UpdatePrometheus +func TestUpdatePrometheus(t *testing.T) { + n := mockNewNpuCollector() + + convey.Convey("TestUpdatePrometheus", t, func() { + + ch := make(chan prometheus.Metric, maxMetricsCount) + + patches := gomonkey.NewPatches() + defer patches.Reset() + containerInfos := mockGetContainerNPUInfo() + chips := mockGetNPUChipList() + + mockDdrCache(n, chips, colcommon.GetCacheKey(&DdrCollector{})) + mockHbmCache(n, chips, colcommon.GetCacheKey(&HbmCollector{})) + mockHccsCache(n, chips, colcommon.GetCacheKey(&HccsCollector{})) + mockNetInfoCache(n, chips, colcommon.GetCacheKey(&NetworkCollector{})) + mockChipCache(n, chips, colcommon.GetCacheKey(&BaseInfoCollector{})) + mockOpticalCache(n, chips, colcommon.GetCacheKey(&OpticalCollector{})) + mockPcieCache(n, chips, colcommon.GetCacheKey(&PcieCollector{})) + mockRoceCache(n, chips, colcommon.GetCacheKey(&RoceCollector{})) + mockSioCache(n, chips, colcommon.GetCacheKey(&SioCollector{})) + + for _, c := range collectorChain { + c.UpdatePrometheus(ch, n, containerInfos, chips) + } + + t.Logf("TestUpdatePrometheus len(ch):%v", len(ch)) + convey.So(ch, convey.ShouldNotBeEmpty) + }) +} + +// TestUpdateTelegraf test UpdateTelegraf; the Convey label matches the function under test +func TestUpdateTelegraf(t *testing.T) { + n := mockNewNpuCollector() + + convey.Convey("TestUpdateTelegraf", t, func() { + + patches := gomonkey.NewPatches() + defer patches.Reset() + containerInfos := mockGetContainerNPUInfo() + chips := mockGetNPUChipList() + + mockDdrCache(n, chips, colcommon.GetCacheKey(&DdrCollector{})) + mockHbmCache(n, chips, colcommon.GetCacheKey(&HbmCollector{})) +
mockHccsCache(n, chips, colcommon.GetCacheKey(&HccsCollector{})) + mockNetInfoCache(n, chips, colcommon.GetCacheKey(&NetworkCollector{})) + mockChipCache(n, chips, colcommon.GetCacheKey(&BaseInfoCollector{})) + mockOpticalCache(n, chips, colcommon.GetCacheKey(&OpticalCollector{})) + mockPcieCache(n, chips, colcommon.GetCacheKey(&PcieCollector{})) + mockRoceCache(n, chips, colcommon.GetCacheKey(&RoceCollector{})) + mockSioCache(n, chips, colcommon.GetCacheKey(&SioCollector{})) + fieldsMap := make(map[string]map[string]interface{}) + + for _, c := range collectorChain { + c.UpdateTelegraf(fieldsMap, n, containerInfos, chips) + } + + t.Logf("fieldsMap len(ch):%v", len(fieldsMap)) + convey.So(fieldsMap, convey.ShouldNotBeEmpty) + }) +} + +func mockRoceCache(n *colcommon.NpuCollector, chips []colcommon.HuaWeiAIChip, cacheKey string) { + localCache := sync.Map{} + for _, chip := range chips { + localCache.Store(chip.PhyId, roceCache{chip: chip, timestamp: time.Now(), + extInfo: getMainStatInfo(mockRoceInfoMap())}) + } + colcommon.UpdateCache[roceCache](n, cacheKey, &localCache) +} + +func mockRoceInfoMap() map[string]int { + return map[string]int{ + macRxMacPauseNum: 0, + macTxMacPauseNum: 0, + macRxPfcPktNum: 0, + macTxPfcPktNum: 0, + macRxBadPktNum: 0, + macTxBadPktNum: 0, + roCERxAllPktNum: 0, + roCETxAllPktNum: 0, + roCERxErrPktNum: 0, + roCETxErrPktNum: 0, + roCERxCnpPktNum: 0, + roCETxCnpPktNum: 0, + macRxBadOctNum: 0, + macTxBadOctNum: 0, + roCEUnexpectedAckNum: 0, + roCEOutOfOrderNum: 0, + roCEVerificationErrNum: 0, + roCEQpStatusErrNum: 0, + roCENewPktRtyNum: 0, + roCEEcnDBNum: 0, + macRXFcsErrPktNum: 0, + } +} + +func mockDdrCache(n *colcommon.NpuCollector, chips []colcommon.HuaWeiAIChip, cacheKey string) { + localCache := sync.Map{} + for _, chip := range chips { + localCache.Store(chip.PhyId, ddrCache{chip: chip, timestamp: time.Now(), extInfo: mockMemoryInfo()}) + } + colcommon.UpdateCache[ddrCache](n, cacheKey, &localCache) +} + +func mockHccsCache(n 
*colcommon.NpuCollector, chips []colcommon.HuaWeiAIChip, cacheKey string) { + localCache := sync.Map{} + for _, chip := range chips { + localCache.Store(chip.PhyId, hccsCache{chip: chip, timestamp: time.Now(), + hccsStat: mockHccsStaticsInfo(), hccsBW: mockHccsBWInfo()}) + } + colcommon.UpdateCache[hccsCache](n, cacheKey, &localCache) +} + +func mockHccsBWInfo() *common.HccsBandwidthInfo { + return &common.HccsBandwidthInfo{ + ProfilingTime: 0, + RxBandwidth: []float64{0, 0, 0, 0, 0, 0, 0, 0}, + TxBandwidth: []float64{0, 0, 0, 0, 0, 0, 0, 0}, + TotalRxbw: 0, + TotalTxbw: 0, + } +} + +func mockHccsStaticsInfo() *common.HccsStatisticInfo { + return &common.HccsStatisticInfo{ + TxCnt: []uint64{0, 0, 0, 0, 0, 0, 0, 0}, + RxCnt: []uint64{0, 0, 0, 0, 0, 0, 0, 0}, + CrcErrCnt: []uint64{0, 0, 0, 0, 0, 0, 0, 0}, + } +} + +func mockSioCache(n *colcommon.NpuCollector, chips []colcommon.HuaWeiAIChip, cacheKey string) { + localCache := sync.Map{} + for _, chip := range chips { + localCache.Store(chip.PhyId, sioCache{chip: chip, timestamp: time.Now(), extInfo: mockSioInfo()}) + } + colcommon.UpdateCache[sioCache](n, cacheKey, &localCache) +} + +func mockSioInfo() *common.SioCrcErrStatisticInfo { + return &common.SioCrcErrStatisticInfo{ + TxErrCnt: 0, + RxErrCnt: 0, + } +} +func mockPcieCache(n *colcommon.NpuCollector, chips []colcommon.HuaWeiAIChip, cacheKey string) { + localCache := sync.Map{} + for _, chip := range chips { + pcieInfo := mockPcieInfo() + localCache.Store(chip.PhyId, pcieCache{chip: chip, timestamp: time.Now(), extInfo: &pcieInfo}) + } + colcommon.UpdateCache[pcieCache](n, cacheKey, &localCache) +} + +func mockPcieInfo() common.PCIEBwStat { + return common.PCIEBwStat{ + PcieRxPBw: common.PcieStatValue{PcieMinBw: int32(0), PcieMaxBw: int32(0), PcieAvgBw: int32(0)}, + PcieRxNPBw: common.PcieStatValue{PcieMinBw: int32(0), PcieMaxBw: int32(0), PcieAvgBw: int32(0)}, + PcieRxCPLBw: common.PcieStatValue{PcieMinBw: int32(0), PcieMaxBw: int32(0), PcieAvgBw: int32(0)}, + 
PcieTxPBw: common.PcieStatValue{PcieMinBw: int32(0), PcieMaxBw: int32(0), PcieAvgBw: int32(0)}, + PcieTxNPBw: common.PcieStatValue{PcieMinBw: int32(0), PcieMaxBw: int32(0), PcieAvgBw: int32(0)}, + PcieTxCPLBw: common.PcieStatValue{PcieMinBw: int32(0), PcieMaxBw: int32(0), PcieAvgBw: int32(0)}, + } +} + +func mockOpticalCache(n *colcommon.NpuCollector, chips []colcommon.HuaWeiAIChip, cacheKey string) { + localCache := sync.Map{} + for _, chip := range chips { + localCache.Store(chip.PhyId, opticalCache{chip: chip, timestamp: time.Now(), + extInfo: getMainOptInfo(mockOpticalInfo())}) + } + colcommon.UpdateCache[opticalCache](n, cacheKey, &localCache) +} + +func mockOpticalInfo() map[string]string { + return map[string]string{ + txPower0: "1 mW", + txPower1: "1 mW", + txPower2: "1 mW", + txPower3: "1 mW", + rxPower0: "1 mW", + rxPower1: "1 mW", + rxPower2: "1 mW", + rxPower3: "1 mW", + voltage: "1 mV", + temperature: "50 C", + present: "1.0", + } +} + +func mockHbmCache(n *colcommon.NpuCollector, chips []colcommon.HuaWeiAIChip, cacheKey string) { + localCache := sync.Map{} + for _, chip := range chips { + localCache.Store(chip.PhyId, hbmCache{chip: chip, timestamp: time.Now(), extInfo: mockHbmAggregateInfo(), + hbmUtilization: 0}, + ) + } + colcommon.UpdateCache[hbmCache](n, cacheKey, &localCache) +} + +func mockNetInfoCache(n *colcommon.NpuCollector, chips []colcommon.HuaWeiAIChip, cacheKey string) { + localCache := sync.Map{} + for _, chip := range chips { + localCache.Store(chip.PhyId, netInfoCache{chip: chip, timestamp: time.Now(), extInfo: mockNetInfo()}) + } + colcommon.UpdateCache[netInfoCache](n, cacheKey, &localCache) +} + +func mockNetInfo() *common.NpuNetInfo { + return &common.NpuNetInfo{ + LinkStatusInfo: &common.LinkStatusInfo{LinkState: "0"}, + BandwidthInfo: &common.BandwidthInfo{RxValue: 0, TxValue: 0}, + LinkStatInfo: &common.LinkStatInfo{LinkUPNum: 0}, + LinkSpeedInfo: &common.LinkSpeedInfo{Speed: 0}, + } +} + +func mockChipCache(n 
*colcommon.NpuCollector, chips []colcommon.HuaWeiAIChip, cacheKey string) { + localCache := sync.Map{} + for _, chip := range chips { + localCache.Store(chip.PhyId, chipCache{chip: chip, timestamp: time.Now(), + HealthStatus: "Healthy", + ErrorCodes: []int64{0}, + Utilization: 0, + OverallUtilization: 0, + VectorUtilization: 0, + Temperature: 0, + Power: 0, + Voltage: 0, + AICoreCurrentFreq: 0, + NetHealthStatus: "Healthy", + DevProcessInfo: mockProcessInfo(), + }) + } + colcommon.UpdateCache[chipCache](n, cacheKey, &localCache) +} + +func mockProcessInfo() *common.DevProcessInfo { + return &common.DevProcessInfo{ + ProcNum: 1, + DevProcArray: []common.DevProcInfo{{Pid: 0, MemUsage: 0}}, + } +} + +func mockMemoryInfo() *common.MemoryInfo { + return &common.MemoryInfo{ + MemorySize: 0, + MemoryAvailable: 0, + Frequency: 0, + Utilization: 0, + } +} + +func mockHbmAggregateInfo() *common.HbmAggregateInfo { + return &common.HbmAggregateInfo{ + HbmInfo: &common.HbmInfo{ + MemorySize: 1, + Frequency: 1, + Usage: 1, + Temp: 1, + BandWidthUtilRate: 1, + }, + ECCInfo: &common.ECCInfo{ + EnableFlag: 1, + }, + } +} + +func mockNewNpuCollector() *colcommon.NpuCollector { + tc := newNpuCollectorTestCase{ + cacheTime: time.Duration(num5) * time.Second, + updateTime: time.Duration(num5) * time.Second, + deviceParser: &container.DevicesParser{}, + dmgr: &devmanager.DeviceManager{}, + } + c := colcommon.NewNpuCollector(tc.cacheTime, tc.updateTime, tc.deviceParser, tc.dmgr) + return c +} + +type newNpuCollectorTestCase struct { + cacheTime time.Duration + updateTime time.Duration + deviceParser *container.DevicesParser + dmgr *devmanager.DeviceManager +} + +func mockGetNPUChipList() []colcommon.HuaWeiAIChip { + chips := make([]colcommon.HuaWeiAIChip, 0) + for id := int32(0); id < maxChipNum; id++ { + chip := colcommon.HuaWeiAIChip{ + CardId: id, + PhyId: id, + DeviceID: id, + LogicID: id, + ChipInfo: &common.ChipInfo{ + Name: api.Ascend910, + Type: "Ascend", + Version: "V1", + }, + 
} + + chips = append(chips, chip) + } + return chips +} + +func mockGetContainerNPUInfo() map[int32]container.DevicesInfo { + containsInfo := make(map[int32]container.DevicesInfo) + for id := int32(0); id < maxChipNum; id++ { + + containerInfo := container.DevicesInfo{ + ID: strconv.Itoa(int(id)), + Name: mockContainerName, + Devices: []int{int(id)}, + } + containsInfo[id] = containerInfo + } + return containsInfo +} + +func init() { + logger.HwLogConfig = &hwlog.LogConfig{ + OnlyToStdout: true, + } + logger.InitLogger("Prometheus") + + initChain() +} + +func initChain() { + collectorChain = []colcommon.MetricsCollector{ + &HccsCollector{}, + &BaseInfoCollector{}, + &SioCollector{}, + &VersionCollector{}, + &HbmCollector{}, + &DdrCollector{}, + &VnpuCollector{}, + &PcieCollector{}, + &NetworkCollector{}, + &RoceCollector{}, + &OpticalCollector{}, + } +} + +func createChip() colcommon.HuaWeiAIChip { + return colcommon.HuaWeiAIChip{ + CardId: 0, + PhyId: 0, + DeviceID: 0, + LogicID: 0, + ChipInfo: &common.ChipInfo{ + Name: api.Ascend910, + Type: "Ascend", + Version: "V1", + }, + } +} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/common_utils.go b/mind-cluster/component/npu-exporter/collector/metrics/common_utils.go new file mode 100644 index 0000000..7a0697d --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/metrics/common_utils.go @@ -0,0 +1,193 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package metrics offer common utils for collector +package metrics + +import ( + "math" + "reflect" + "strconv" + "strings" + "time" + + "github.com/prometheus/client_golang/prometheus" + + "ascend-common/common-utils/hwlog" + "ascend-common/devmanager/common" + colcommon "huawei.com/npu-exporter/v6/collector/common" + "huawei.com/npu-exporter/v6/collector/container" + "huawei.com/npu-exporter/v6/utils" + "huawei.com/npu-exporter/v6/utils/logger" +) + +func validateNum(num float64) bool { + if num == -1 || num == math.MaxUint32 || float32(num) == math.MaxUint32 { + return false + } + + return true +} + +func doUpdateTelegrafWithValidateNum(fieldMap map[string]interface{}, desc *prometheus.Desc, + value float64, extInfo string) { + if validateNum(value) { + doUpdateTelegraf(fieldMap, desc, value, extInfo) + } +} + +func doUpdateTelegraf(fieldMap map[string]interface{}, desc *prometheus.Desc, value interface{}, extInfo string) { + fieldMap[utils.GetDescName(desc)+extInfo] = value +} + +func doUpdateMetricWithValidateNum(ch chan<- prometheus.Metric, timestamp time.Time, value float64, + cardLabel []string, desc *prometheus.Desc) { + if validateNum(value) { + doUpdateMetric(ch, timestamp, value, cardLabel, desc) + } +} +func doUpdateMetric(ch chan<- prometheus.Metric, timestamp time.Time, value interface{}, + cardLabel []string, desc *prometheus.Desc) { + var finalValue float64 + + switch value.(type) { + case int: + finalValue = float64(value.(int)) + case int32: + finalValue = float64(value.(int32)) + case int64: + finalValue = float64(value.(int64)) + case uint32: + finalValue = float64(value.(uint32)) + case uint64: + finalValue = float64(value.(uint64)) + case float32: + finalValue = float64(value.(float32)) + case float64: + finalValue = value.(float64) + default: + logger.Errorf("invalid param in function doUpdateMetric,"+ + "metrics name is (%v), 
value type is (%T),value is (%v)", utils.GetDescName(desc), value, value) + } + // collect failed, set value to -1 + if finalValue == common.FailedValue { + finalValue = common.FailedMetricValue + } + ch <- prometheus.NewMetricWithTimestamp(timestamp, + prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, finalValue, cardLabel...)) +} + +func getContainerInfoWithDefault(cNameArray []string) (containerName, namespaceValue, podNameValue string) { + if len(cNameArray) == colcommon.ContainerNameLen { + namespaceValue = cNameArray[colcommon.NameSpaceIdx] + podNameValue = cNameArray[colcommon.PodNameIdx] + containerName = cNameArray[colcommon.ConNameIdx] + } + return containerName, namespaceValue, podNameValue +} + +func geenGeneralCardLabel(chip *colcommon.HuaWeiAIChip, containerMap map[int32]container.DevicesInfo) []string { + + containerInfo := geenContainerInfo(chip, containerMap) + + containerName, namespaceValue, podNameValue := getContainerInfoWithDefault(getContainerNameArray(containerInfo)) + cardLabel := collectCardLabelValue(chip, namespaceValue, podNameValue, containerName) + return cardLabel +} + +func geenContainerInfo(chip *colcommon.HuaWeiAIChip, containerMap map[int32]container.DevicesInfo) container.DevicesInfo { + deviceID := chip.DeviceID + if chip.VDevActivityInfo != nil && chip.VDevActivityInfo.IsVirtualDev { + deviceID = int32(chip.VDevActivityInfo.VDevID) + } + containerInfo, ok := containerMap[deviceID] + if !ok { + containerInfo = container.DevicesInfo{} + } + return containerInfo +} +func collectCardLabelValue(chip *colcommon.HuaWeiAIChip, namespaceValue, podNameValue, containerName string) []string { + + return []string{strconv.FormatInt(int64(chip.DeviceID), colcommon.Base), common.GetNpuName(chip.ChipInfo), chip.VDieID, + chip.PCIeBusInfo, namespaceValue, podNameValue, containerName} +} + +func getContainerNameArray(devInfo container.DevicesInfo) []string { + if devInfo.Name == "" { + return nil + } + + return 
strings.Split(devInfo.Name, "_") +} + +func getFieldMap(fieldsMap map[string]map[string]interface{}, devTagKey int32) map[string]interface{} { + devTagKeyStr := strconv.Itoa(int(devTagKey)) + if fieldsMap[devTagKeyStr] == nil { + fieldsMap[devTagKeyStr] = make(map[string]interface{}) + } + return fieldsMap[devTagKeyStr] +} + +func handleErr(err error, domain string, logicID int32) { + if err != nil { + logErrMetricsWithLimit(domain, logicID, err) + } else { + hwlog.ResetErrCnt(domain, logicID) + } +} + +func logErrMetricsWithLimit(metric string, logicID int32, err error) { + logger.LogfWithOptions(logger.ErrorLevel, logger.LogOptions{ + Domain: metric, + ID: logicID}, + "logicID(%d),%v", logicID, err) +} + +func validateNotNilForEveryElement(objs ...interface{}) bool { + for _, v := range objs { + val := reflect.ValueOf(v) + if val.Kind() != reflect.Ptr { + return false + } + if val.IsNil() { + return false + } + } + return true +} +func logForUnSupportDevice(isSupport bool, devType string, group string, extInfo string) { + if !isSupport { + logger.Infof("devType %v does not support [%v], %v", devType, group, extInfo) + } +} + +func updateFrame[T any](cacheKey string, n *colcommon.NpuCollector, containerMap map[int32]container.DevicesInfo, + chips []colcommon.HuaWeiAIChip, callBack func(chipWithVnpu colcommon.HuaWeiAIChip, cache T, cardLabel []string)) { + + caches := colcommon.GetInfoFromCache[T](n, cacheKey) + if len(caches) == 0 { + logger.Debugf("cacheKey(%v) not found", cacheKey) + return + } + for _, chip := range chips { + cardLabel := geenGeneralCardLabel(&chip, containerMap) + cache, ok := caches[chip.PhyId] + if !ok { + logger.Warnf("cacheKey(%v) not found, chip.PhyId(%v)", cacheKey, chip.PhyId) + continue + } + + callBack(chip, cache, cardLabel) + } +} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/common_utils_test.go b/mind-cluster/component/npu-exporter/collector/metrics/common_utils_test.go new file mode 100644 index 
0000000..9cb88bd --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/metrics/common_utils_test.go @@ -0,0 +1,165 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package metrics offer common utils for collector +package metrics + +import ( + "math" + "testing" + "time" + + "github.com/prometheus/client_golang/prometheus" + "github.com/smartystreets/goconvey/convey" + + "ascend-common/devmanager/common" + colcommon "huawei.com/npu-exporter/v6/collector/common" +) + +const ( + invalidNum = -1 + num100 = 100 +) + +// TestValidateNum test numerical verification +func TestValidateNum(t *testing.T) { + convey.Convey("TestValidateNum", t, func() { + convey.Convey("return true when the num is valid", func() { + convey.So(validateNum(0), convey.ShouldBeTrue) + convey.So(validateNum(num100), convey.ShouldBeTrue) + }) + + convey.Convey("return false when the num is invalid", func() { + convey.So(validateNum(invalidNum), convey.ShouldBeFalse) + convey.So(validateNum(math.MaxUint32), convey.ShouldBeFalse) + }) + }) +} + +// TestDoUpdateTelegraf test update telegraf +func TestDoUpdateTelegraf(t *testing.T) { + convey.Convey("TestDoUpdateTelegraf", t, func() { + fieldMap := make(map[string]interface{}) + desc := prometheus.NewDesc("test_metric", "", nil, nil) + + convey.Convey("update when num is valid", func() { + doUpdateTelegrafWithValidateNum(fieldMap, desc, num100, "_suffix") + 
convey.So(fieldMap["test_metric_suffix"], convey.ShouldEqual, num100) + }) + + convey.Convey("don't update when num is invalid", func() { + doUpdateTelegrafWithValidateNum(fieldMap, desc, -1, "_suffix") + convey.So(fieldMap, convey.ShouldBeEmpty) + }) + }) +} + +// TestDoUpdateMetric test update prometheus +func TestDoUpdateMetric(t *testing.T) { + const ( + num10 = 10 + num100 = 100 + negaNum = -5 + floatNum = 3.14 + ) + convey.Convey("TestDoUpdateMetric", t, func() { + ch := make(chan prometheus.Metric, 1) + desc := prometheus.NewDesc("test_metric", "", []string{"label"}, nil) + + convey.Convey("convert the various numeric types correctly", func() { + testCases := []struct { + input interface{} + expected float64 + }{ + {int(num10), num10}, + {int32(negaNum), negaNum}, + {uint64(num100), num100}, + {float32(floatNum), floatNum}, + } + + for _, tc := range testCases { + doUpdateMetric(ch, time.Now(), tc.input, []string{"label"}, desc) + m := <-ch + convey.So(m, convey.ShouldNotBeEmpty) + } + }) + }) +} + +// TestContainerInfo test container information processing +func TestContainerInfo(t *testing.T) { + convey.Convey("TestContainerInfo", t, func() { + convey.Convey("correctly split the array of container names", func() { + testCases := []struct { + input []string + expected []string + }{ + {[]string{"ns", "pod", "container"}, []string{"container", "ns", "pod"}}, + {[]string{"short"}, []string{"", "", ""}}, + } + + for _, tc := range testCases { + c, ns, pod := getContainerInfoWithDefault(tc.input) + convey.So([]string{c, ns, pod}, convey.ShouldResemble, tc.expected) + } + }) + }) +} + +// TestCardLabel test card label generation +func TestCardLabel(t *testing.T) { + convey.Convey("TestCardLabel", t, func() { + chip := &colcommon.HuaWeiAIChip{ + DeviceID: 0, + ChipInfo: &common.ChipInfo{Name: "1", Type: "1", Version: "1"}, + VDieID: "die1", + PCIeBusInfo: "0000:00:01.0", + } + + expected := []string{ + "0", + "1-1-1", + "die1", + "0000:00:01.0", + "test-ns", + 
"test-pod", + "test-container", + } + + convey.Convey("correctly generate an array of tags", func() { + labels := collectCardLabelValue(chip, "test-ns", "test-pod", "test-container") + convey.So(labels, convey.ShouldResemble, expected) + }) + }) +} + +// TestNilValidation test null pointer validation +func TestNilValidation(t *testing.T) { + convey.Convey("TestNilValidation", t, func() { + var nilPtr *int + val := 10 + + convey.Convey("all non null pointers should return true", func() { + convey.So(validateNotNilForEveryElement(&val), convey.ShouldBeTrue) + }) + + convey.Convey("a null pointer should return false", func() { + convey.So(validateNotNilForEveryElement(nilPtr), convey.ShouldBeFalse) + }) + + convey.Convey("non pointer types should return false", func() { + convey.So(validateNotNilForEveryElement(val), convey.ShouldBeFalse) + }) + }) +} diff --git a/mind-cluster/component/npu-exporter/collector/testdata/prometheus_metrics b/mind-cluster/component/npu-exporter/collector/testdata/prometheus_metrics new file mode 100644 index 0000000..8f51362 --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/testdata/prometheus_metrics @@ -0,0 +1,166 @@ +# HELP machine_npu_nums Amount of npu installed on the machine. 
+# TYPE machine_npu_nums gauge +machine_npu_nums 8 +# HELP npu_chip_info_aicore_current_freq the npu ai core current frequency, unit is 'MHz' +# TYPE npu_chip_info_aicore_current_freq gauge +npu_chip_info_aicore_current_freq{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_aicore_current_freq{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_aicore_current_freq{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_aicore_current_freq{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_aicore_current_freq{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_aicore_current_freq{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_aicore_current_freq{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_aicore_current_freq{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +# HELP npu_chip_info_bandwidth_rx the npu interface receive speed, unit is 'MB/s' +# TYPE npu_chip_info_bandwidth_rx gauge +npu_chip_info_bandwidth_rx{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_bandwidth_rx{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 
+npu_chip_info_bandwidth_rx{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_bandwidth_rx{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_bandwidth_rx{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_bandwidth_rx{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_bandwidth_rx{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_bandwidth_rx{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +# HELP npu_chip_info_bandwidth_tx the npu interface transport speed, unit is 'MB/s' +# TYPE npu_chip_info_bandwidth_tx gauge +npu_chip_info_bandwidth_tx{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_bandwidth_tx{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_bandwidth_tx{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_bandwidth_tx{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_bandwidth_tx{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_bandwidth_tx{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 
+npu_chip_info_bandwidth_tx{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_bandwidth_tx{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +# HELP npu_chip_info_error_code the npu error code +# TYPE npu_chip_info_error_code gauge +npu_chip_info_error_code{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_error_code{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_error_code{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_error_code{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_error_code{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_error_code{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_error_code{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_error_code{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +# HELP npu_chip_info_hbm_total_memory the npu hbm total memory +# TYPE npu_chip_info_hbm_total_memory gauge +npu_chip_info_hbm_total_memory{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_hbm_total_memory{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 
+npu_chip_info_hbm_total_memory{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_hbm_total_memory{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_hbm_total_memory{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_hbm_total_memory{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_hbm_total_memory{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_hbm_total_memory{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +# HELP npu_chip_info_hbm_used_memory the npu hbm used memory +# TYPE npu_chip_info_hbm_used_memory gauge +npu_chip_info_hbm_used_memory{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_hbm_used_memory{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_hbm_used_memory{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_hbm_used_memory{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_hbm_used_memory{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_hbm_used_memory{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 
+npu_chip_info_hbm_used_memory{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_hbm_used_memory{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +# HELP npu_chip_info_health_status the npu health status +# TYPE npu_chip_info_health_status gauge +npu_chip_info_health_status{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 +npu_chip_info_health_status{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 +npu_chip_info_health_status{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 +npu_chip_info_health_status{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 +npu_chip_info_health_status{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 +npu_chip_info_health_status{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 +npu_chip_info_health_status{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 +npu_chip_info_health_status{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 +# HELP npu_chip_info_link_status the npu link status +# TYPE npu_chip_info_link_status gauge +npu_chip_info_link_status{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_link_status{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 
1606402000 +npu_chip_info_link_status{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_link_status{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_link_status{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_link_status{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_link_status{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_link_status{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +# HELP npu_chip_info_name the Ascend npu name with value '1' +# TYPE npu_chip_info_name gauge +npu_chip_info_name{container_name="",id="0",name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 +npu_chip_info_name{container_name="",id="1",name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 +npu_chip_info_name{container_name="",id="2",name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 +npu_chip_info_name{container_name="",id="3",name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 +npu_chip_info_name{container_name="",id="4",name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 +npu_chip_info_name{container_name="",id="5",name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 +npu_chip_info_name{container_name="",id="6",name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 
+npu_chip_info_name{container_name="",id="7",name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 +# HELP npu_chip_info_network_status the npu network health status +# TYPE npu_chip_info_network_status gauge +npu_chip_info_network_status{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_network_status{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_network_status{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_network_status{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_network_status{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_network_status{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_network_status{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_network_status{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +# HELP npu_chip_info_power the npu power +# TYPE npu_chip_info_power gauge +npu_chip_info_power{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_power{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_power{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 
+npu_chip_info_power{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_power{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_power{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_power{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_power{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +# HELP npu_chip_info_temperature the npu temperature +# TYPE npu_chip_info_temperature gauge +npu_chip_info_temperature{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_temperature{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_temperature{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_temperature{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_temperature{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_temperature{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_temperature{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_temperature{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 
1606402000 +# HELP npu_chip_info_total_memory the npu total memory +# TYPE npu_chip_info_total_memory gauge +npu_chip_info_total_memory{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_total_memory{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_total_memory{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_total_memory{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_total_memory{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_total_memory{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_total_memory{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_total_memory{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +# HELP npu_chip_info_used_memory the npu used memory +# TYPE npu_chip_info_used_memory gauge +npu_chip_info_used_memory{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_used_memory{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_used_memory{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_used_memory{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 
+npu_chip_info_used_memory{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_used_memory{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_used_memory{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_used_memory{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +# HELP npu_chip_info_utilization the ai core utilization +# TYPE npu_chip_info_utilization gauge +npu_chip_info_utilization{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_utilization{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_utilization{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_utilization{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_utilization{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_utilization{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_utilization{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_utilization{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +# HELP npu_chip_info_voltage the npu voltage +# TYPE npu_chip_info_voltage gauge 
+npu_chip_info_voltage{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_voltage{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_voltage{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_voltage{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_voltage{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_voltage{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_voltage{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_voltage{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +# HELP npu_exporter_version_info exporter version with value '1' +# TYPE npu_exporter_version_info gauge +npu_exporter_version_info{exporterVersion=""} 1 diff --git a/mind-cluster/component/npu-exporter/collector/testdata/prometheus_metrics2 b/mind-cluster/component/npu-exporter/collector/testdata/prometheus_metrics2 new file mode 100644 index 0000000..bd501ee --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/testdata/prometheus_metrics2 @@ -0,0 +1,6 @@ +# HELP machine_npu_nums Amount of npu installed on the machine. 
+# TYPE machine_npu_nums gauge +machine_npu_nums 0 +# HELP npu_exporter_version_info exporter version with value '1' +# TYPE npu_exporter_version_info gauge +npu_exporter_version_info{exporterVersion=""} 1 diff --git a/mind-cluster/component/npu-exporter/go.mod b/mind-cluster/component/npu-exporter/go.mod new file mode 100644 index 0000000..0d84960 --- /dev/null +++ b/mind-cluster/component/npu-exporter/go.mod @@ -0,0 +1,63 @@ +module huawei.com/npu-exporter/v6 + +go 1.18 + +require ( + ascend-common v0.0.0 + github.com/agiledragon/gomonkey/v2 v2.8.0 + github.com/golang/protobuf v1.5.3 + github.com/influxdata/telegraf v1.26.3 + github.com/prometheus/client_golang v1.15.0 + github.com/smartystreets/goconvey v1.6.4 + github.com/stretchr/testify v1.8.2 + google.golang.org/grpc v1.57.2 + google.golang.org/protobuf v1.30.0 + k8s.io/cri-api v0.25.13 +) + +require ( + github.com/BurntSushi/toml v1.2.1 // indirect + github.com/alecthomas/participle v0.4.1 // indirect + github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137 // indirect + github.com/awnumar/memcall v0.1.2 // indirect + github.com/awnumar/memguard v0.22.3 // indirect + github.com/benbjohnson/clock v1.3.3 // indirect + github.com/beorn7/perks v1.0.1 // indirect + github.com/blues/jsonata-go v1.5.4 // indirect + github.com/cespare/xxhash/v2 v2.2.0 // indirect + github.com/coreos/go-semver v0.3.1 // indirect + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/fatih/color v1.15.0 // indirect + github.com/fsnotify/fsnotify v1.6.0 // indirect + github.com/gobwas/glob v0.2.3 // indirect + github.com/gogo/protobuf v1.3.2 // indirect + github.com/golang/snappy v0.0.4 // indirect + github.com/google/go-cmp v0.6.0 // indirect + github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 // indirect + github.com/gosnmp/gosnmp v1.35.0 // indirect + github.com/influxdata/toml v0.0.0-20190415235208-270119a8ce65 // indirect + github.com/jtolds/gls v4.20.0+incompatible // indirect + 
github.com/mattn/go-colorable v0.1.13 // indirect + github.com/mattn/go-isatty v0.0.17 // indirect + github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect + github.com/naoina/go-stringutil v0.1.0 // indirect + github.com/philhofer/fwd v1.1.2 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/prometheus/client_model v0.3.0 // indirect + github.com/prometheus/common v0.42.0 // indirect + github.com/prometheus/procfs v0.9.0 // indirect + github.com/prometheus/prometheus v0.42.0 // indirect + github.com/rogpeppe/go-internal v1.11.0 // indirect + github.com/sleepinggenius2/gosmi v0.4.4 // indirect + github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d // indirect + github.com/tinylib/msgp v1.1.8 // indirect + golang.org/x/crypto v0.31.0 // indirect + golang.org/x/net v0.25.0 // indirect + golang.org/x/sys v0.28.0 // indirect + golang.org/x/text v0.21.0 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20230525234030-28d5490b6b19 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect + k8s.io/apimachinery v0.26.2 // indirect +) + +replace ascend-common => ../ascend-common diff --git a/mind-cluster/component/npu-exporter/go.sum b/mind-cluster/component/npu-exporter/go.sum new file mode 100644 index 0000000..d638dd1 --- /dev/null +++ b/mind-cluster/component/npu-exporter/go.sum @@ -0,0 +1,561 @@ +cloud.google.com/go v0.110.1 h1:oDJ19Fu9TX9Xs06iyCw4yifSqZ7JQ8BeuVHcTmWQlOA= +cloud.google.com/go/bigquery v1.51.1 h1:qI/8vkBbzLkv0BJmzE7ajA6uZqQC+C31MAwgb+vJe2U= +cloud.google.com/go/compute v1.19.1 h1:am86mquDUgjGNWxiGn+5PGLbmgiWXlE/yNWpIpNvuXY= +cloud.google.com/go/compute/metadata v0.2.3 h1:mg4jlk7mCAj6xXp9UJ4fjI9VUI5rubuGBW5aJ7UnBMY= +cloud.google.com/go/iam v1.0.0 h1:hlQJMovyJJwYjZcTohUH4o1L8Z8kYz+E+W/zktiLCBc= +cloud.google.com/go/monitoring v1.13.0 h1:2qsrgXGVoRXpP7otZ14eE1I568zAa92sJSDPyOJvwjM= +cloud.google.com/go/pubsub v1.30.1 h1:RdzTlwhswvROjPIoTfnSJ9tEp0LY2S5ATX90anOw7E8= +cloud.google.com/go/storage 
v1.29.0 h1:6weCgzRvMg7lzuUurI4697AqIRPU1SvzHhynwpW31jI= +code.cloudfoundry.org/clock v1.0.0 h1:kFXWQM4bxYvdBw2X8BbBeXwQNgfoWv1vqAk2ZZyBN2o= +collectd.org v0.5.0 h1:y4uFSAuOmeVhG3GCRa3/oH+ysePfO/+eGJNfd0Qa3d8= +github.com/Azure/azure-amqp-common-go/v4 v4.0.0 h1:mV5O74KYmonn0ZXtwfMjGUtZ9Z+Hv7AIFVS1s03sRvo= +github.com/Azure/azure-event-hubs-go/v3 v3.4.0 h1:LtH0nHkXivyV/GajOu5ZFC5sb/5KZ8j+9U8UsfHVTOo= +github.com/Azure/azure-kusto-go v0.8.0 h1:AeO6VBRGzB1BhmWeheSyN+WSrx+1wmhHm47vzptitdw= +github.com/Azure/azure-pipeline-go v0.2.3 h1:7U9HBg1JFK3jHl5qmo4CTZKFTVgMwdFHMVtCdfBE21U= +github.com/Azure/azure-sdk-for-go v65.0.0+incompatible h1:HzKLt3kIwMm4KeJYTdx9EbjRYTySD/t8i1Ee/W5EGXw= +github.com/Azure/azure-sdk-for-go/sdk/azcore v0.21.1 h1:qoVeMsc9/fh/yhxVaA0obYjVH/oI/ihrOoMwsLS9KSA= +github.com/Azure/azure-sdk-for-go/sdk/azidentity v0.13.2 h1:mM/yraAumqMMIYev6zX0oxHqX6hreUs5wXf76W47r38= +github.com/Azure/azure-sdk-for-go/sdk/internal v0.9.1 h1:sLZ/Y+P/5RRtsXWylBjB5lkgixYfm0MQPiwrSX//JSo= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/monitor/armmonitor v0.4.1 h1:P6UDRqlbywdpvhpVZeiB5p+DuhMTrVD4xfvPW55bs8M= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources v0.3.1 h1:EXTDtCSTfPauGawsG+Ae/W46B1PkrgzuKNrcFqy4ljM= +github.com/Azure/azure-sdk-for-go/sdk/storage/azblob v0.3.0 h1:Px2UA+2RvSSvv+RvJNuUB6n7rs5Wsel4dXLe90Um2n4= +github.com/Azure/azure-storage-blob-go v0.15.0 h1:rXtgp8tN1p29GvpGgfJetavIG0V7OgcSXPpwp3tx6qk= +github.com/Azure/azure-storage-queue-go v0.0.0-20191125232315-636801874cdd h1:b3wyxBl3vvr15tUAziPBPK354y+LSdfPCpex5oBttHo= +github.com/Azure/go-amqp v0.18.0 h1:95bTiJq0oxjK1RUlt5T3HF/THj6jWTRZpSXMPSOJLz8= +github.com/Azure/go-ansiterm v0.0.0-20210617225240-d185dfc1b5a1 h1:UQHMgLO+TxOElx5B5HZ4hJQsoJ/PvUvKRhJHDQXO8P8= +github.com/Azure/go-autorest v14.2.0+incompatible h1:V5VMDjClD3GiElqLWO7mz2MxNAK/vTfRHdAubSIPRgs= +github.com/Azure/go-autorest/autorest v0.11.28 h1:ndAExarwr5Y+GaHE6VCaY1kyS/HwwGGyuimVhWsHOEM= 
+github.com/Azure/go-autorest/autorest/adal v0.9.23 h1:Yepx8CvFxwNKpH6ja7RZ+sKX+DWYNldbLiALMC3BTz8= +github.com/Azure/go-autorest/autorest/azure/auth v0.5.12 h1:wkAZRgT/pn8HhFyzfe9UnqOjJYqlembgCTi72Bm/xKk= +github.com/Azure/go-autorest/autorest/azure/cli v0.4.5 h1:0W/yGmFdTIT77fvdlGZ0LMISoLHFJ7Tx4U0yeB+uFs4= +github.com/Azure/go-autorest/autorest/date v0.3.0 h1:7gUk1U5M/CQbp9WoqinNzJar+8KY+LPI6wiWrP/myHw= +github.com/Azure/go-autorest/autorest/to v0.4.0 h1:oXVqrxakqqV1UZdSazDOPOLvOIz+XA683u8EctwboHk= +github.com/Azure/go-autorest/autorest/validation v0.3.1 h1:AgyqjAd94fwNAoTjl/WQXg4VvFeRFpO+UhNyRXqF1ac= +github.com/Azure/go-autorest/logger v0.2.1 h1:IG7i4p/mDa2Ce4TRyAO8IHnVhAVF3RFU+ZtXWSmf4Tg= +github.com/Azure/go-autorest/tracing v0.6.0 h1:TYi4+3m5t6K48TGI9AUdb+IzbnSxvnvUMfuitfgcfuo= +github.com/Azure/go-ntlmssp v0.0.0-20220621081337-cb9428e4ac1e h1:NeAW1fUYUEWhft7pkxDf6WoUvEZJ/uOKsvtpjLnn8MU= +github.com/AzureAD/microsoft-authentication-library-for-go v0.4.0 h1:WVsrXCnHlDDX8ls+tootqRE87/hL9S/g4ewig9RsD/c= +github.com/BurntSushi/toml v1.2.1 h1:9F2/+DoOYIOksmaJFPw1tGFy1eDnIJXg+UHjuD8lTak= +github.com/BurntSushi/toml v1.2.1/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ= +github.com/ClickHouse/clickhouse-go v1.5.4 h1:cKjXeYLNWVJIx2J1K6H2CqyRmfwVJVY1OV1coaaFcI0= +github.com/Masterminds/goutils v1.1.1 h1:5nUrii3FMTL5diU80unEVvNevw1nH4+ZV4DSLVJLSYI= +github.com/Masterminds/semver v1.5.0 h1:H65muMkzWKEuNDnfl9d70GUjFniHKHRbFPGBuZ3QEww= +github.com/Masterminds/sprig v2.22.0+incompatible h1:z4yfnGrZ7netVz+0EDJ0Wi+5VZCSYp4Z0m2dk6cEM60= +github.com/Mellanox/rdmamap v0.0.0-20191106181932-7c3c4763a6ee h1:atI/FFjXh6hIVlPE1Jup9m8N4B9q/OSbMUe2EBahs+w= +github.com/Microsoft/go-winio v0.6.0 h1:slsWYD/zyx7lCXoZVlvQrj0hPTM1HI4+v1sIda2yDvg= +github.com/Shopify/sarama v1.38.1 h1:lqqPUPQZ7zPqYlWpTh+LQ9bhYNu2xJL6k1SJN4WVe2A= +github.com/aerospike/aerospike-client-go/v5 v5.11.0 h1:z3ZmDSm3I10VMXXIIrsFCFq3IenwFqTCnLNyvnFVzrk= +github.com/agiledragon/gomonkey/v2 v2.8.0 
h1:u2K2nNGyk0ippzklz1CWalllEB9ptD+DtSXeCX5O000= +github.com/agiledragon/gomonkey/v2 v2.8.0/go.mod h1:ap1AmDzcVOAz1YpeJ3TCzIgstoaWLA6jbbgxfB4w2iY= +github.com/alecthomas/go-thrift v0.0.0-20170109061633-7914173639b2/go.mod h1:CxCgO+NdpMdi9SsTlGbc0W+/UNxO3I0AabOEJZ3w61w= +github.com/alecthomas/kong v0.2.1/go.mod h1:+inYUSluD+p4L8KdviBSgzcqEjUQOfC5fQDRFuc36lI= +github.com/alecthomas/participle v0.4.1 h1:P2PJWzwrSpuCWXKnzqvw0b0phSfH1kJo4p2HvLynVsI= +github.com/alecthomas/participle v0.4.1/go.mod h1:T8u4bQOSMwrkTWOSyt8/jSFPEnRtd0FKFMjVfYBlqPs= +github.com/alecthomas/repr v0.0.0-20181024024818-d37bc2a10ba1/go.mod h1:xTS7Pm1pD1mvyM075QCDSRqH6qRLXylzS24ZTpRiSzQ= +github.com/alecthomas/repr v0.0.0-20210301060118-828286944d6a/go.mod h1:2kn6fqh/zIyPLmm3ugklbEi5hg5wS435eygvNfaDQL8= +github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= +github.com/alecthomas/units v0.0.0-20190924025748-f65c72e2690d/go.mod h1:rBZYJk541a8SKzHPHnH3zbiI+7dagKZ0cgpgrD7Fyho= +github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137 h1:s6gZFSlWYmbqAuRjVTiNNhvNRfY2Wxp9nhfyel4rklc= +github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137/go.mod h1:OMCwj8VM1Kc9e19TLln2VL61YJF0x1XFtfdL4JdbSyE= +github.com/aliyun/alibaba-cloud-sdk-go v1.62.193 h1:Cwd5cNwrQqtOzOJ1vqswYe3amU3vOz3v0wQF8WizmXI= +github.com/amir/raidman v0.0.0-20170415203553-1ccc43bfb9c9 h1:FXrPTd8Rdlc94dKccl7KPmdmIbVh/OjelJ8/vgMRzcQ= +github.com/andybalholm/brotli v1.0.5 h1:8uQZIdzKmjc/iuPu7O2ioW48L81FgatrcpfFmiq/cCs= +github.com/antchfx/jsonquery v1.3.1 h1:kh3599hMLpygvcxoENcj99eCvnS++JjRX10LjNYhK58= +github.com/antchfx/xmlquery v1.3.15 h1:aJConNMi1sMha5G8YJoAIF5P+H+qG1L73bSItWHo8Tw= +github.com/antchfx/xpath v1.2.5-0.20230505064641-588960cceeac h1:Et7H7mEPWuivbFEXi3dWa8hobnvF380TS2mq7JmgjEI= +github.com/apache/arrow/go/arrow v0.0.0-20211112161151-bc219186db40 h1:q4dksr6ICHXqG5hm0ZW5IHyeEJXoIJSOZeBLmWPNeIQ= +github.com/apache/arrow/go/v12 v12.0.0 
h1:xtZE63VWl7qLdB0JObIXvvhGjoVNrQ9ciIHG2OK5cmc= +github.com/apache/iotdb-client-go v0.12.2-0.20220722111104-cd17da295b46 h1:28HyUQcr8ZCyCAatR0gkf9PuLr52U2T+66tx5Th0nxI= +github.com/apache/thrift v0.18.1 h1:lNhK/1nqjbwbiOPDBPFJVKxgDEGSepKuTh6OLiXW8kg= +github.com/aristanetworks/glog v0.0.0-20191112221043-67e8567f59f3 h1:Bmjk+DjIi3tTAU0wxGaFbfjGUqlxxSXARq9A96Kgoos= +github.com/aristanetworks/goarista v0.0.0-20190325233358-a123909ec740 h1:FD4/ikKOFxwP8muWDypbmBWc634+YcAs3eBrYAmRdZY= +github.com/armon/go-metrics v0.4.1 h1:hR91U9KYmb6bLBYLQjyM+3j+rcd/UhE+G78SFnF8gJA= +github.com/awnumar/memcall v0.1.2 h1:7gOfDTL+BJ6nnbtAp9+HQzUFjtP1hEseRQq8eP055QY= +github.com/awnumar/memcall v0.1.2/go.mod h1:S911igBPR9CThzd/hYQQmTc9SWNu3ZHIlCGaWsWsoJo= +github.com/awnumar/memguard v0.22.3 h1:b4sgUXtbUjhrGELPbuC62wU+BsPQy+8lkWed9Z+pj0Y= +github.com/awnumar/memguard v0.22.3/go.mod h1:mmGunnffnLHlxE5rRgQc3j+uwPZ27eYb61ccr8Clz2Y= +github.com/aws/aws-sdk-go-v2 v1.18.0 h1:882kkTpSFhdgYRKVZ/VCgf7sd0ru57p2JCxz4/oN5RY= +github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.4.10 h1:dK82zF6kkPeCo8J1e+tGx4JdvDIQzj7ygIoLg8WMuGs= +github.com/aws/aws-sdk-go-v2/config v1.18.8 h1:lDpy0WM8AHsywOnVrOHaSMfpaiV2igOw8D7svkFkXVA= +github.com/aws/aws-sdk-go-v2/credentials v1.13.20 h1:oZCEFcrMppP/CNiS8myzv9JgOzq2s0d3v3MXYil/mxQ= +github.com/aws/aws-sdk-go-v2/feature/dynamodb/attributevalue v1.2.0 h1:8kvinmbIDObqsWegKP0JjeanYPiA4GUVpAtciNWE+jw= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.13.2 h1:jOzQAesnBFDmz93feqKnsTHsXrlwWORNZMFHMV+WLFU= +github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.7.1 h1:p9Dys1g2YdaqMalnp6AwCA+tpMMdJNGw5YYKP/u3sUk= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.1.32 h1:dpbVNUjczQ8Ae3QKHbpHBpfvaVkRdesxpTOe9pTouhU= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.4.26 h1:QH2kOS3Ht7x+u0gHCh06CXL/h6G8LQJFpZfFBYBNboo= +github.com/aws/aws-sdk-go-v2/internal/ini v1.3.28 h1:KeTxcGdNnQudb46oOl4d90f2I33DF/c6q3RnZAmvQdQ= +github.com/aws/aws-sdk-go-v2/service/cloudwatch 
v1.25.9 h1:7jgW378oM948BxuOBarXeeaKSrRaCj7didsdeSwYGGo= +github.com/aws/aws-sdk-go-v2/service/cloudwatchlogs v1.20.9 h1:sXs+JjIwgKA27t+5O8YgXl0cmZpEmctyDVO5y6cMdqA= +github.com/aws/aws-sdk-go-v2/service/dynamodb v1.17.3 h1:2oB4ikNEMLaPtu6lbNFJyTSayBILvrOfa2VfOffcuvU= +github.com/aws/aws-sdk-go-v2/service/dynamodbstreams v1.4.0 h1:QbFWJr2SAyVYvyoOHvJU6sCGLnqNT94ZbWElJMEI1JY= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.9.10 h1:dpiPHgmFstgkLG07KaYAewvuptq5kvo52xn7tVSrtrQ= +github.com/aws/aws-sdk-go-v2/service/internal/endpoint-discovery v1.7.23 h1:5AwQnYQT3ZX/N7hPTAx4ClWyucaiqr2esQRMNbJIby0= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.9.26 h1:uUt4XctZLhl9wBE1L8lobU3bVN8SNUP7T+olb0bWBO4= +github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.9.0 h1:0BOlTqnNnrEO04oYKzDxMMe68t107pmIotn18HtVonY= +github.com/aws/aws-sdk-go-v2/service/kinesis v1.17.8 h1:9Kk24woetm1Tm4cAZNoJStJW1VQAeh92lLD9XZ4176g= +github.com/aws/aws-sdk-go-v2/service/s3 v1.19.0 h1:5mRAms4TjSTOGYsqKYte5kHr1PzpMJSyLThjF3J+hw0= +github.com/aws/aws-sdk-go-v2/service/sso v1.12.8 h1:5cb3D6xb006bPTqEfCNaEA6PPEfBXxxy4NNeX/44kGk= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.14.8 h1:NZaj0ngZMzsubWZbrEFSB4rgSQRbFq38Sd6KBxHuOIU= +github.com/aws/aws-sdk-go-v2/service/sts v1.18.9 h1:Qf1aWwnsNkyAoqDqmdM3nHwN78XQjec27LjM6b9vyfI= +github.com/aws/aws-sdk-go-v2/service/timestreamwrite v1.16.0 h1:HHVOprdnZxhM6F5JgljW8nCklfwUyOlbd/wuca6vORA= +github.com/aws/smithy-go v1.13.5 h1:hgz0X/DX0dGqTYpGALqXJoRKRj5oQ7150i5FdTePzO8= +github.com/awslabs/kinesis-aggregation/go v0.0.0-20210630091500-54e17340d32f h1:Pf0BjJDga7C98f0vhw+Ip5EaiE07S3lTKpIYPNS0nMo= +github.com/benbjohnson/clock v1.3.3 h1:g+rSsSaAzhHJYcIQE78hJ3AhyjjtQvleKDjlhdBnIhc= +github.com/benbjohnson/clock v1.3.3/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod 
h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/blues/jsonata-go v1.5.4 h1:XCsXaVVMrt4lcpKeJw6mNJHqQpWU751cnHdCFUq3xd8= +github.com/blues/jsonata-go v1.5.4/go.mod h1:uns2jymDrnI7y+UFYCqsRTEiAH22GyHnNXrkupAVFWI= +github.com/bmatcuk/doublestar/v3 v3.0.0 h1:TQtVPlDnAYwcrVNB2JiGuMc++H5qzWZd9PhkNo5WyHI= +github.com/bufbuild/protocompile v0.4.0 h1:LbFKd2XowZvQ/kajzguUp2DC9UEIQhIq77fZZlaQsNA= +github.com/caio/go-tdigest v3.1.0+incompatible h1:uoVMJ3Q5lXmVLCCqaMGHLBWnbGoN6Lpu7OAUPR60cds= +github.com/cenkalti/backoff v2.2.1+incompatible h1:tNowT99t7UNflLxfYYSlKYsBpXdEet03Pg2g16Swow4= +github.com/cenkalti/backoff/v4 v4.2.0 h1:HN5dHm3WBOgndBH6E8V0q2jIYIR3s9yglV8k/+MN3u4= +github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= +github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/cisco-ie/nx-telemetry-proto v0.0.0-20230117155933-f64c045c77df h1:GmrltUp5Qf5XhT+LmqMDizsgm/6VHTSxPWRdrq21yRo= +github.com/cloudflare/golz4 v0.0.0-20150217214814-ef862a3cdc58 h1:F1EaeKL/ta07PY/k9Os/UFtwERei2/XzGemhpGnBKNg= +github.com/containerd/containerd v1.6.18 h1:qZbsLvmyu+Vlty0/Ex5xc0z2YtKpIsb5n45mAMI+2Ns= +github.com/coocood/freecache v1.2.3 h1:lcBwpZrwBZRZyLk/8EMyQVXRiFl663cCuMOrjCALeto= +github.com/coreos/go-semver v0.3.1 h1:yi21YpKnrx1gt5R+la8n5WgS0kCrsPp33dmEyHReZr4= +github.com/coreos/go-semver v0.3.1/go.mod h1:irMmmIw/7yzSRPWryHsK7EYSg09caPQL03VsM8rvUec= +github.com/couchbase/go-couchbase v0.1.1 h1:ClFXELcKj/ojyoTYbsY34QUrrYCBi/1G749sXSCkdhk= +github.com/couchbase/gomemcached v0.1.3 h1:HIc5qMYNbuhB7zNaiEtj61DCYkquAwrQlf64q7JzdEY= +github.com/couchbase/goutils v0.1.0 h1:0WLlKJilu7IBm98T8nS9+J36lBFVLRUSIUtyD/uWpAE= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 
+github.com/denisenkom/go-mssqldb v0.12.3 h1:pBSGx9Tq67pBOTLmxNuirNTeB8Vjmf886Kx+8Y+8shw= +github.com/devigned/tab v0.1.1 h1:3mD6Kb1mUOYeLpJvTVSDwSg5ZsfSxfvxGRTxRsJsITA= +github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78= +github.com/digitalocean/go-libvirt v0.0.0-20220811165305-15feff002086 h1:FTREXo+EVmU9nOCaQ46PvH0hs1Rt2/diCoTAtxzDxrA= +github.com/dimchansky/utfbom v1.1.1 h1:vV6w1AhK4VMnhBno/TPVCoK9U/LP0PkLCS9tbxHdi/U= +github.com/djherbis/times v1.5.0 h1:79myA211VwPhFTqUk8xehWrsEO+zcIZj0zT8mXPVARU= +github.com/docker/distribution v2.8.2+incompatible h1:T3de5rq0dB1j30rp0sA2rER+m322EBzniBPB6ZIzuh8= +github.com/docker/docker v23.0.4+incompatible h1:Kd3Bh9V/rO+XpTP/BLqM+gx8z7+Yb0AA2Ibj+nNo4ek= +github.com/docker/go-connections v0.4.0 h1:El9xVISelRB7BuFusrZozjnkIM5YnzCViNKohAFqRJQ= +github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= +github.com/doclambda/protobufquery v0.0.0-20220727165953-0da287796ee9 h1:677nbAF3nq56BEZ2R/VMl0wROQqJo4vJ/ZWuzm+vsUU= +github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= +github.com/dynatrace-oss/dynatrace-metric-utils-go v0.5.0 h1:wHGPJSXvwKQVf/XfhjUPyrhpcPKWNy8F3ikH+eiwoBg= +github.com/eapache/go-resiliency v1.3.0 h1:RRL0nge+cWGlxXbUzJ7yMcq6w2XBEr19dCN6HECGaT0= +github.com/eapache/go-xerial-snappy v0.0.0-20230111030713-bf00bc1b83b6 h1:8yY/I9ndfrgrXUbOGObLHKBR4Fl3nZXwM2c7OYTT8hM= +github.com/eapache/queue v1.1.0 h1:YOEu7KNc61ntiQlcEeUIoDTJ2o8mQznoNvUhiigpIqc= +github.com/eclipse/paho.golang v0.10.0 h1:oUGPjRwWcZQRgDD9wVDV7y7i7yBSxts3vcvcNJo8B4Q= +github.com/eclipse/paho.mqtt.golang v1.4.2 h1:66wOzfUHSSI1zamx7jR6yMEI5EuHnT1G6rNA5PM12m4= +github.com/emicklei/go-restful/v3 v3.10.1 h1:rc42Y5YTp7Am7CS630D7JmhRjq4UlEUuEKfrDac4bSQ= +github.com/fatih/color v1.15.0 h1:kOqh6YHBtK8aywxGerMG2Eq3H6Qgoqeo13Bk2Mv/nBs= +github.com/fatih/color v1.15.0/go.mod h1:0h5ZqXfHYED7Bhv2ZJamyIOUej9KtShiJESRwBDUSsw= 
+github.com/form3tech-oss/jwt-go v3.2.5+incompatible h1:/l4kBbb4/vGSsdtB5nUe8L7B9mImVMaBPw9L/0TBHU8= +github.com/fsnotify/fsnotify v1.6.0 h1:n+5WquG0fcWoWp6xPWfHdbskMCQaFnG6PfBrh1Ky4HY= +github.com/fsnotify/fsnotify v1.6.0/go.mod h1:sl3t1tCWJFWoRz9R8WJCbQihKKwmorjAbSClcnxKAGw= +github.com/gabriel-vasile/mimetype v1.4.0 h1:Cn9dkdYsMIu56tGho+fqzh7XmvY2YyGU0FnbhiOsEro= +github.com/go-asn1-ber/asn1-ber v1.5.4 h1:vXT6d/FNDiELJnLb6hGNa309LMsrCoYFvpwHDF0+Y1A= +github.com/go-ldap/ldap/v3 v3.4.4 h1:qPjipEpt+qDa6SI/h1fzuGWoRUY+qqQ9sOZq67/PYUs= +github.com/go-logfmt/logfmt v0.6.0 h1:wGYYu3uicYdqXVgoYbvnkrPVXkuLM1p1ifugDMEdRi4= +github.com/go-logr/logr v1.2.3 h1:2DntVwHkVopvECVRSlL5PSo9eG+cAkDCuckLubN+rq0= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= +github.com/go-openapi/jsonpointer v0.19.6 h1:eCs3fxoIi3Wh6vtgmLTOjdhSpiqphQ+DaPn38N2ZdrE= +github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE= +github.com/go-openapi/swag v0.22.3 h1:yMBqmnQ0gyZvEb/+KzuWZOXgllrXT4SADYbvDaXHv/g= +github.com/go-redis/redis/v7 v7.4.1 h1:PASvf36gyUpr2zdOUS/9Zqc80GbM+9BDyiJSJDDOrTI= +github.com/go-redis/redis/v8 v8.11.5 h1:AcZZR7igkdvfVmQTPnu9WE37LRrO/YrBH5zWyjDC0oI= +github.com/go-sql-driver/mysql v1.6.0 h1:BCTh4TKNUYmOmMUcQ3IipzF5prigylS7XXjEkfCHuOE= +github.com/go-stack/stack v1.8.1 h1:ntEHSVwIt7PNXNpgPmVfMrNhLtgjlmnZha2kOpuRiDw= +github.com/go-stomp/stomp v2.1.4+incompatible h1:D3SheUVDOz9RsjVWkoh/1iCOwD0qWjyeTZMUZ0EXg2Y= +github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y= +github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8= +github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU= +github.com/gofrs/uuid v4.2.0+incompatible h1:yyYWMnhkhrKwwr8gAOcOCYxOOscHgDS9yZgBrnJfGa0= +github.com/gofrs/uuid/v5 v5.0.0 h1:p544++a97kEL+svbcFbCQVM9KFu0Yo25UoISXGNNH9M= 
+github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= +github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/golang-jwt/jwt v3.2.1+incompatible h1:73Z+4BJcrTC+KczS6WvTPvRGOp1WmfEP4Q1lOd9Z/+c= +github.com/golang-jwt/jwt/v4 v4.5.0 h1:7cYmW1XlMY7h7ii7UhUyChSgS5wUJEnm9uZVTGqOWzg= +github.com/golang-sql/civil v0.0.0-20190719163853-cb61b32ac6fe h1:lXe2qZdvpiX5WZkZR4hgp4KJVfY3nMkvmwbVkpv1rVY= +github.com/golang-sql/sqlexp v0.1.0 h1:ZCD6MBpcuOVfGVqsEmY5/4FtYiKz6tSyUv9LPEDei6A= +github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= +github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.5/go.mod h1:6O5/vntMXwX2lRkT1hjjk0nAC1IDOTvTlVgjlRvqsdk= +github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= +github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= +github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/google/flatbuffers v23.3.3+incompatible h1:5PJI/WbJkaMTvpGxsHVKG/LurN/KnWXNyGpwSCDgen0= +github.com/google/gnostic v0.6.9 h1:ZK/5VhkoX835RikCHpSUJV9a+S3e1zLh59YnyWeBW+0= +github.com/google/gnxi v0.0.0-20221016143401-2aeceb5a2901 h1:xlsMG0I0F6Ou3a4zRWu3cThivTt2N2V1cZafIloTBTU= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/go-github/v32 v32.1.0 h1:GWkQOdXqviCPx7Q7Fj+KyPoGm4SwHRh8rheoPhd27II= +github.com/google/go-querystring v1.1.0 h1:AnCroh3fv4ZBgVIf1Iwtovgjaw/GiKJo8M8yD/fhyJ8= 
+github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= +github.com/google/s2a-go v0.1.3 h1:FAgZmpLl/SXurPEZyCMPBIiiYeTbqfjlbdnCNTAkbGE= +github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= +github.com/googleapis/enterprise-certificate-proxy v0.2.3 h1:yk9/cqRKtT9wXZSsRH9aurXEpJX+U6FLtpYTdC3R06k= +github.com/googleapis/gax-go/v2 v2.8.0 h1:UBtEZqx1bjXtOQ5BVTkuYghXrr3N4V123VKJK67vJZc= +github.com/gopcua/opcua v0.3.7 h1:iGjLW3D+ztnjtZQPKsJ0nwibHyDw1m11NfqOU8KSFQ8= +github.com/gophercloud/gophercloud v1.2.0 h1:1oXyj4g54KBg/kFtCdMM6jtxSzeIyg8wv4z1HoGPp1E= +github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 h1:EGx4pi6eqNxGaHF6qqu48+N2wcFQ5qg5FXgOdqsJ5d8= +github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= +github.com/gorilla/mux v1.8.0 h1:i40aqfkR1h2SlN9hojwV5ZA91wcXFOvkdNIeFDP5koI= +github.com/gorilla/websocket v1.5.0 h1:PPwGk2jz7EePpoHN/+ClbZu8SPxiqlu12wZP/3sWmnc= +github.com/gosnmp/gosnmp v1.35.0 h1:EuWWNPxTCdAUx2/NbQcSa3WdNxjzpy4Phv57b4MWpJM= +github.com/gosnmp/gosnmp v1.35.0/go.mod h1:2AvKZ3n9aEl5TJEo/fFmf/FGO4Nj4cVeEc5yuk88CYc= +github.com/grid-x/modbus v0.0.0-20211113184042-7f2251c342c9 h1:Q7e9kXS3sRbTjsNDKazbcbDSGAKjFdk096M3qYbwNpE= +github.com/grid-x/serial v0.0.0-20211107191517-583c7356b3aa h1:Rsn6ARgNkXrsXJIzhkE4vQr5Gbx2LvtEMv4BJOK4LyU= +github.com/gwos/tcg/sdk v0.0.0-20220621192633-df0eac0a1a4c h1:pVr0TkSFnMP4BWSsEak/4bxD8/K+foJ9V8DGyZ6PIDE= +github.com/hailocab/go-hostpool v0.0.0-20160125115350-e80d13ce29ed h1:5upAirOpQc1Q53c0bnx2ufif5kANL7bfZWcc6VJWJd8= +github.com/harlow/kinesis-consumer v0.3.6-0.20211204214318-c2b9f79d7ab6 h1:38nI+nE+oUmLmlNjuByhvnmuBrcQVLNkOJhSSM4eJv0= +github.com/hashicorp/consul/api v1.20.0 h1:9IHTjNVSZ7MIwjlW3N3a7iGiykCMDpxZu8jsxFJh0yc= +github.com/hashicorp/errwrap v1.1.0 h1:OxrOeh75EUXMY8TBjag2fzXGZ40LB6IKw45YeGUDY2I= +github.com/hashicorp/go-cleanhttp v0.5.2 
h1:035FKYIWjmULyFRBKPs8TBQoi0x6d9G4xc9neXJWAZQ= +github.com/hashicorp/go-hclog v1.4.0 h1:ctuWFGrhFha8BnnzxqeRGidlEcQkDyL5u8J8t5eA11I= +github.com/hashicorp/go-immutable-radix v1.3.1 h1:DKHmCUm2hRBK510BaiZlwvpD40f8bJFeZnpfm2KLowc= +github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo= +github.com/hashicorp/go-rootcerts v1.0.2 h1:jzhAVGtqPKbwpyCPELlgNWhE1znq+qwJtW5Oi2viEzc= +github.com/hashicorp/go-uuid v1.0.3 h1:2gKiV6YVmrJ1i2CKKa9obLvRieoRGviZFL26PcT/Co8= +github.com/hashicorp/golang-lru v0.6.0 h1:uL2shRDx7RTrOrTCUZEGP/wJUFiUI8QT6E7z5o8jga4= +github.com/hashicorp/packer-plugin-sdk v0.3.1 h1:Gr/mnihsdUcPfGiruFL93BQkiFh3EFPwyxxTWkwvRsQ= +github.com/hashicorp/serf v0.10.1 h1:Z1H2J60yRKvfDYAOZLd2MU0ND4AH/WDz7xYHDWQsIPY= +github.com/huandu/xstrings v1.3.2 h1:L18LIDzqlW6xN2rEkpdV8+oL/IXWJ1APd+vsdYy4Wdw= +github.com/imdario/mergo v0.3.13 h1:lFzP57bqS/wsqKssCGmtLAb8A0wKjLGrve2q3PPVcBk= +github.com/influxdata/go-syslog/v3 v3.0.0 h1:jichmjSZlYK0VMmlz+k4WeOQd7z745YLsvGMqwtYt4I= +github.com/influxdata/influxdb-observability/common v0.3.3 h1:fzsgJKiV/bucNPRYggLE1F6UgpePQaYh72Lqj1rPEmI= +github.com/influxdata/influxdb-observability/influx2otel v0.3.3 h1:KWesgMC0sqRLfvPZXnCzJauCZ82XoHtKTFJVKmEk63M= +github.com/influxdata/influxdb-observability/otel2influx v0.3.3 h1:zdesvjHJYXccZ4vd6hP6vXwbd6YbAj7AGMhOjk9pt0k= +github.com/influxdata/line-protocol/v2 v2.2.1 h1:EAPkqJ9Km4uAxtMRgUubJyqAr6zgWM0dznKMLRauQRE= +github.com/influxdata/tail v1.0.1-0.20210707231403-b283181d1fa7 h1:0rQOs1VHLVFpAAOIR0mJEvVOIaMYFgYdreeVbgI9sII= +github.com/influxdata/telegraf v1.26.3 h1:wawD3VTdnPDbHnJ1RBGgCf0YB7vlxREZ70rvEepHdGs= +github.com/influxdata/telegraf v1.26.3/go.mod h1:w+VUZ4NRDzfhRmhEdBbbNZBNT7E8qRkLiL73j/pD0ug= +github.com/influxdata/toml v0.0.0-20190415235208-270119a8ce65 h1:vvyMtD5LTJc1W9sQKjDkAWdcg0478CszSdzlHtiAXCY= +github.com/influxdata/toml v0.0.0-20190415235208-270119a8ce65/go.mod h1:zApaNFpP/bTpQItGZNNUMISDMDAnTXu9UqJ4yT3ocz8= +github.com/influxdata/wlog 
v0.0.0-20160411224016-7c63b0a71ef8 h1:W2IgzRCb0L9VzMujq/QuTaZUKcH8096jWwP519mHN6Q= +github.com/intel/iaevents v1.1.0 h1:FzxMBfXk/apG2EUXUCfaq3gUQ+q+TgZ1HNMjjUILUGE= +github.com/jackc/chunkreader/v2 v2.0.1 h1:i+RDz65UE+mmpjTfyz0MoVTnzeYxroil2G82ki7MGG8= +github.com/jackc/pgconn v1.13.0 h1:3L1XMNV2Zvca/8BYhzcRFS70Lr0WlDg16Di6SFGAbys= +github.com/jackc/pgio v1.0.0 h1:g12B9UwVnzGhueNavwioyEEpAmqMe1E/BN9ES+8ovkE= +github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM= +github.com/jackc/pgproto3/v2 v2.3.1 h1:nwj7qwf0S+Q7ISFfBndqeLwSwxs+4DPsbRFjECT1Y4Y= +github.com/jackc/pgservicefile v0.0.0-20200714003250-2b9c44734f2b h1:C8S2+VttkHFdOOCXJe+YGfa4vHYwlt4Zx+IVXQ97jYg= +github.com/jackc/pgtype v1.12.0 h1:Dlq8Qvcch7kiehm8wPGIW0W3KsCCHJnRacKW0UM8n5w= +github.com/jackc/pgx/v4 v4.17.1 h1:tASdE79tX9LOQu3MMvioWT6YaZkf58ZhmLHhV4sv5WM= +github.com/jackc/puddle v1.3.0 h1:eHK/5clGOatcjX3oWGBO/MpxpbHzSwud5EWTSCI+MX0= +github.com/jaegertracing/jaeger v1.38.0 h1:rDQ36TnSxUX4gTskMQzEdpieS0BGYdfXXnUJmGnNMGw= +github.com/james4k/rcon v0.0.0-20120923215419-8fbb8268b60a h1:JxcWget6X/VfBMKxPIc28Jel37LGREut2fpV+ObkwJ0= +github.com/jcmturner/aescts/v2 v2.0.0 h1:9YKLH6ey7H4eDBXW8khjYslgyqG2xZikXP0EQFKrle8= +github.com/jcmturner/dnsutils/v2 v2.0.0 h1:lltnkeZGL0wILNvrNiVCR6Ro5PGU/SeBvVO/8c/iPbo= +github.com/jcmturner/gofork v1.7.6 h1:QH0l3hzAU1tfT3rZCnW5zXl+orbkNMMRGJfdJjHVETg= +github.com/jcmturner/gokrb5/v8 v8.4.3 h1:iTonLeSJOn7MVUtyMT+arAn5AKAPrkilzhGw8wE/Tq8= +github.com/jcmturner/rpc/v2 v2.0.3 h1:7FXXj8Ti1IaVFpSAziCZWNzbNuZmnvw/i6CqLNdWfZY= +github.com/jeremywohl/flatten/v2 v2.0.0-20211013061545-07e4a09fb8e4 h1:eA9wi6ZzpIRobvXkn/S2Lyw1hr2pc71zxzOPl7Xjs4w= +github.com/jhump/protoreflect v1.15.1 h1:HUMERORf3I3ZdX05WaQ6MIpd/NJ434hTp5YiKgfCL6c= +github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg= +github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= +github.com/josharian/native v1.0.0 
h1:Ts/E8zCSEsG17dUqv7joXJFybuMLjQfWE04tsBODTxk= +github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA= +github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= +github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo= +github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU= +github.com/karrick/godirwalk v1.16.2 h1:eY2INUWoB2ZfpF/kXasyjWJ3Ncuof6qZuNWYZFN3kAI= +github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 h1:Z9n2FFNUXsshfwJMBgNA0RU6/i7WVaAegv3PtuIHPMs= +github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= +github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/klauspost/asmfmt v1.3.2 h1:4Ri7ox3EwapiOjCki+hw14RyKk201CN4rzyCJRFLpK4= +github.com/klauspost/compress v1.16.5 h1:IFV2oUNUzZaz+XyusxpLzpzS8Pt5rh0Z16For/djlyI= +github.com/klauspost/cpuid/v2 v2.2.4 h1:acbojRNwl3o09bUq+yDCtZFc1aiwaAAxtcn8YkZXnvk= +github.com/knadh/koanf v1.5.0 h1:q2TSd/3Pyc/5yP9ldIrSdIz26MCcyNQzW0pEAugLPNs= +github.com/kolo/xmlrpc v0.0.0-20220921171641-a4b6fa1dd06b h1:udzkj9S/zlT5X367kqJis0QP7YMxobob6zhzq6Yre00= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/leodido/ragel-machinery v0.0.0-20181214104525-299bdde78165 h1:bCiVCRCs1Heq84lurVinUPy19keqGEe4jh5vtK37jcg= +github.com/linkedin/goavro/v2 v2.12.0 h1:rIQQSj8jdAUlKQh6DttK8wCRv4t4QO09g1C4aBWXslg= +github.com/logzio/azure-monitor-metrics-receiver v1.0.0 h1:TAzhIZL2ueyyc81qIw8FGg4nUbts4Hvc3oOxSobY1IA= +github.com/lufia/plan9stats v0.0.0-20220913051719-115f729f3c8c h1:VtwQ41oftZwlMnOEbMWQtSEUgU64U4s+GHk7hZK+jtY= +github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0VQdvPDY= 
+github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= +github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= +github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= +github.com/mattn/go-ieproxy v0.0.1 h1:qiyop7gCflfhwCzGyeT0gro3sF9AIg9HU98JORTkqfI= +github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= +github.com/mattn/go-isatty v0.0.17 h1:BTarxUcIeDqL27Mc+vyvdWYSL28zpIhv3RoTdsLMPng= +github.com/mattn/go-isatty v0.0.17/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= +github.com/matttproud/golang_protobuf_extensions v1.0.4 h1:mmDVorXM7PCGKw94cs5zkfA9PSy5pEvNWRP0ET0TIVo= +github.com/matttproud/golang_protobuf_extensions v1.0.4/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4= +github.com/mdlayher/apcupsd v0.0.0-20220319200143-473c7b5f3c6a h1:JOlLsLUQnokTyWWwEvOVoKH3XUl6oDMP8jisO54l6J8= +github.com/mdlayher/genetlink v1.2.0 h1:4yrIkRV5Wfk1WfpWTcoOlGmsWgQj3OtQN9ZsbrE+XtU= +github.com/mdlayher/netlink v1.6.0 h1:rOHX5yl7qnlpiVkFWoqccueppMtXzeziFjWAjLg6sz0= +github.com/mdlayher/socket v0.2.3 h1:XZA2X2TjdOwNoNPVPclRCURoX/hokBY8nkTmRZFEheM= +github.com/microsoft/ApplicationInsights-Go v0.4.4 h1:G4+H9WNs6ygSCe6sUyxRc2U81TI5Es90b2t/MwX5KqY= +github.com/miekg/dns v1.1.51 h1:0+Xg7vObnhrz/4ZCZcZh7zPXlmU0aveS2HDBd0m0qSo= +github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 h1:AMFGa4R4MiIpspGNG7Z948v4n35fFGB3RR3G/ry4FWs= +github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3 h1:+n/aFZefKZp7spd8DFdX7uMikMLXX4oubIzJF4kv/wI= +github.com/minio/highwayhash v1.0.2 h1:Aak5U0nElisjDCfPSG79Tgzkn2gl66NxOMspRrKnA/g= +github.com/mitchellh/copystructure v1.2.0 h1:vpKXTN4ewci03Vljg/q9QvCGUDttBOGBIa15WveJJGw= +github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y= +github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= +github.com/mitchellh/mapstructure v1.5.0 
h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= +github.com/mitchellh/reflectwalk v1.0.2 h1:G2LzWKi524PWgd3mLHV8Y5k7s6XUvT0Gef6zxSIeXaQ= +github.com/moby/ipvs v1.1.0 h1:ONN4pGaZQgAx+1Scz5RvWV4Q7Gb+mvfRh3NsPS+1XQQ= +github.com/moby/patternmatcher v0.5.0 h1:YCZgJOeULcxLw1Q+sVR636pmS7sPEn1Qo2iAN6M7DBo= +github.com/moby/sys/sequential v0.5.0 h1:OPvI35Lzn9K04PBbCLW0g4LcFAJgHsvXsRyewg5lXtc= +github.com/moby/term v0.0.0-20221128092401-c43b287e0e0f h1:J/7hjLaHLD7epG0m6TBMGmp4NQ+ibBYLfeyJWdAIFLA= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= +github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= +github.com/montanaflynn/stats v0.6.6 h1:Duep6KMIDpY4Yo11iFsvyqJDyfzLF9+sndUKT+v64GQ= +github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A= +github.com/multiplay/go-ts3 v1.1.0 h1:OWOjRxBCRds+FbpyM1JKSscRbbmYr/IIrh6V78CM5Xw= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/naoina/go-stringutil v0.1.0 h1:rCUeRUHjBjGTSHl0VC00jUPLz8/F9dDzYI70Hzifhks= +github.com/naoina/go-stringutil v0.1.0/go.mod h1:XJ2SJL9jCtBh+P9q5btrd/Ylo8XwT/h1USek5+NqSA0= +github.com/nats-io/jwt/v2 v2.3.0 h1:z2mA1a7tIf5ShggOFlR1oBPgd6hGqcDYsISxZByUzdI= +github.com/nats-io/nats-server/v2 v2.9.9 h1:bmj0RhvHOc8+z5/RuhI38GqPwtkFAHQuU3e99FVA/TI= +github.com/nats-io/nats.go v1.24.0 h1:CRiD8L5GOQu/DcfkmgBcTTIQORMwizF+rPk6T0RaHVQ= +github.com/nats-io/nkeys v0.3.0 h1:cgM5tL53EvYRU+2YLXIK0G2mJtK12Ft9oeooSZMA2G8= +github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw= +github.com/netsampler/goflow2 v1.3.3 h1:uheCMgWwbaHnVdsvc2bqbdQe93E73pVF77WGu/kPE7U= +github.com/newrelic/newrelic-telemetry-sdk-go v0.8.1 h1:6OX5VXMuj2salqNBc41eXKz6K+nV6OB/hhlGnAKCbwU= +github.com/nsqio/go-nsq v1.1.0 h1:PQg+xxiUjA7V+TLdXw7nVrJ5Jbl3sN86EhGCQj4+FYE= +github.com/olivere/elastic v6.2.37+incompatible 
h1:UfSGJem5czY+x/LqxgeCBgjDn6St+z8OnsCuxwD3L0U= +github.com/open-telemetry/opentelemetry-collector-contrib/pkg/pdatautil v0.73.0 h1:b62Oq3dniQm3eg8OcnBnlZCyZ4O85iyKPFuCIeYNCKk= +github.com/openconfig/gnmi v0.9.1 h1:hVOdLTaRjdy68oCGJbkf2vrmnUoQ5xbINqBOAMix4xM= +github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= +github.com/opencontainers/image-spec v1.1.0-rc2 h1:2zx/Stx4Wc5pIPDvIxHXvXtQFW/7XWJGmnM7r3wg034= +github.com/opencontainers/runc v1.1.5 h1:L44KXEpKmfWDcS02aeGm8QNTFXTo2D+8MYGDIJ/GDEs= +github.com/opensearch-project/opensearch-go/v2 v2.2.0 h1:6RicCBiqboSVtLMjSiKgVQIsND4I3sxELg9uwWe/TKM= +github.com/opentracing/opentracing-go v1.2.1-0.20220228012449-10b1cf09e00b h1:FfH+VrHHk6Lxt9HdVS0PXzSXFyS2NbZKXv33FYPol0A= +github.com/p4lang/p4runtime v1.3.0 h1:3fUhHj0JtsGcL2Bh0uxpACdBJBDqpZyLgj93tqKzoJY= +github.com/pborman/ansi v1.0.0 h1:OqjHMhvlSuCCV5JT07yqPuJPQzQl+WXsiZ14gZsqOrQ= +github.com/philhofer/fwd v1.1.2 h1:bnDivRJ1EWPjUIRXV5KfORO897HTbpFAQddBdE8t7Gw= +github.com/philhofer/fwd v1.1.2/go.mod h1:qkPdfjR2SIEbspLqpe1tO4n5yICnr2DY7mqEx2tUTP0= +github.com/pierrec/lz4/v4 v4.1.17 h1:kV4Ip+/hUBC+8T6+2EgburRtkE9ef4nbY3f4dFhGjMc= +github.com/pion/dtls/v2 v2.2.6 h1:yXMxKr0Skd+Ub6A8UqXTRLSywskx93ooMRHsQUtd+Z4= +github.com/pion/logging v0.2.2 h1:M9+AIj/+pxNsDfAT64+MAVgJO0rsyLnoJKCqf//DoeY= +github.com/pion/transport/v2 v2.0.2 h1:St+8o+1PEzPT51O9bv+tH/KYYLMNR5Vwm5Z3Qkjsywg= +github.com/pion/udp/v2 v2.0.1 h1:xP0z6WNux1zWEjhC7onRA3EwwSliXqu1ElUZAQhUP54= +github.com/pkg/browser v0.0.0-20210911075715-681adbf594b8 h1:KoWmjvw+nsYOo29YJK9vDA65RGE3NrOnUtO7a+RF9HU= +github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/power-devops/perfstat 
v0.0.0-20220216144756-c35f1ee13d7c h1:NRoLoZvkBTKvR5gQLgA3e0hqjkY9u1wm+iOL45VN/qI= +github.com/prometheus-community/pro-bing v0.1.0 h1:zjzLGhfNPP0bP1OlzGB+SJcguOViw7df12LPg2vUJh8= +github.com/prometheus/client_golang v1.15.0 h1:5fCgGYogn0hFdhyhLbw7hEsWxufKtY9klyvdNfFlFhM= +github.com/prometheus/client_golang v1.15.0/go.mod h1:e9yaBhRPU2pPNsZwE+JdQl0KEt1N9XgF6zxWmaC0xOk= +github.com/prometheus/client_model v0.3.0 h1:UBgGFHqYdG/TPFD1B1ogZywDqEkwp3fBMvqdiQ7Xew4= +github.com/prometheus/client_model v0.3.0/go.mod h1:LDGWKZIo7rky3hgvBe+caln+Dr3dPggB5dvjtD7w9+w= +github.com/prometheus/common v0.42.0 h1:EKsfXEYo4JpWMHH5cg+KOUWeuJSov1Id8zGR8eeI1YM= +github.com/prometheus/common v0.42.0/go.mod h1:xBwqVerjNdUDjgODMpudtOMwlOwf2SaTr1yjz4b7Zbc= +github.com/prometheus/procfs v0.9.0 h1:wzCHvIvM5SxWqYvwgVL7yJY8Lz3PKn49KQtpgMYJfhI= +github.com/prometheus/procfs v0.9.0/go.mod h1:+pB4zwohETzFnmlpe6yd2lSc+0/46IYZRB/chUwxUZY= +github.com/prometheus/prometheus v0.42.0 h1:G769v8covTkOiNckXFIwLx01XE04OE6Fr0JPA0oR2nI= +github.com/prometheus/prometheus v0.42.0/go.mod h1:Pfqb/MLnnR2KK+0vchiaH39jXxvLMBk+3lnIGP4N7Vk= +github.com/rabbitmq/amqp091-go v1.8.0 h1:GBFy5PpLQ5jSVVSYv8ecHGqeX7UTLYR4ItQbDCss9MM= +github.com/rcrowley/go-metrics v0.0.0-20201227073835-cf1acfcdf475 h1:N/ElC8H3+5XpJzTSTfLsJV/mx9Q9g7kxmchpfZyxgzM= +github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE= +github.com/riemann/riemann-go-client v0.5.1-0.20211206220514-f58f10cdce16 h1:bGXoxRwUpPTCaQ86DRE+3wqE9vh3aH8W0HH5L/ygOFM= +github.com/robbiet480/go.nut v0.0.0-20220219091450-bd8f121e1fa1 h1:YmFqprZILGlF/X3tvMA4Rwn3ySxyE3hGUajBHkkaZbM= +github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= +github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA= +github.com/safchain/ethtool v0.3.0 h1:gimQJpsI6sc1yIqP/y8GYgiXn/NjgvpM0RNoWLVVmP0= +github.com/samuel/go-zookeeper v0.0.0-20200724154423-2164a8ac840e 
h1:CGjiMQ0wMH4wtNWrlj6kiTbkPt2F3rbYnhGX6TWLfco= +github.com/shirou/gopsutil/v3 v3.23.3 h1:Syt5vVZXUDXPEXpIBt5ziWsJ4LdSAAxF4l/xZeQgSEE= +github.com/shoenig/go-m1cpu v0.1.4 h1:SZPIgRM2sEF9NJy50mRHu9PKGwxyyTTJIWvCtgVbozs= +github.com/showwin/speedtest-go v1.4.2 h1:3YjBajURQTJCv/rVwJsd5UtCYlaiqCihg5NhPxJapk8= +github.com/signalfx/com_signalfx_metrics_protobuf v0.0.3 h1:32k2QLgsKhcEs55q4REPKyIadvid5FPy2+VMgvbmKJ0= +github.com/signalfx/gohistogram v0.0.0-20160107210732-1ccfd2ff5083 h1:WsShHmu12ZztYPfh9b+I+VjYD1o8iOHhB67WZCMEEE8= +github.com/signalfx/golib/v3 v3.3.50 h1:TTBpfzsO00F8ep6rhLgBmRIPUpRqBenacezjE4xCweI= +github.com/signalfx/sapm-proto v0.12.0 h1:OtOe+Jm8L61Ml8K6X8a89zc8/RlaaMRElCImeGKR/Ew= +github.com/sirupsen/logrus v1.9.0 h1:trlNQbNUG3OdDrDil03MCb1H2o9nJ1x4/5LYw7byDE0= +github.com/sleepinggenius2/gosmi v0.4.4 h1:xgu+Mt7CptuB10IPt3SVXBAA9tARToT4B9xGzjjxQX8= +github.com/sleepinggenius2/gosmi v0.4.4/go.mod h1:l8OniPmd3bJzw0MXP2/qh7AhP/e+bTY2CNivIhsnDT0= +github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d h1:zE9ykElWQ6/NYmHa3jpm/yHnI4xSofP+UP6SpjHcSeM= +github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc= +github.com/smartystreets/goconvey v1.6.4 h1:fv0U8FUIMPNf1L9lnHLvLhgicrIVChEkdzIKYqbNC9s= +github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA= +github.com/snowflakedb/gosnowflake v1.6.13 h1:r8iozak/p3P2jYfjF3EbeteqMMzPWjwmVrdENJDW6EI= +github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0 h1:1zr/of2m5FGMsad5YfcqgdqdWrIhu+EBEJRhR1U7z/c= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/testify v1.2.2/go.mod 
h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ8= +github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +github.com/testcontainers/testcontainers-go v0.18.0 h1:8RXrcIQv5xX/uBOSmZd297gzvA7F0yuRA37/918o7Yg= +github.com/thomasklein94/packer-plugin-libvirt v0.3.4 h1:K+NkHFcZuiUTp4ZiDdBhWRMZiSMdsXwGuzyg4THKDAU= +github.com/tidwall/gjson v1.14.4 h1:uo0p8EbA09J7RQaflQ1aBRffTR7xedD2bcIVSYxLnkM= +github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= +github.com/tidwall/pretty v1.2.0 h1:RWIZEg2iJ8/g6fDDYzMpobmaoGh5OLl4AXtGUGPcqCs= +github.com/tinylib/msgp v1.1.8 h1:FCXC1xanKO4I8plpHGH2P7koL/RzZs12l/+r7vakfm0= +github.com/tinylib/msgp v1.1.8/go.mod h1:qkpG+2ldGg4xRFmx+jfTvZPxfGFhi64BcnL9vkCm/Tw= +github.com/tklauser/go-sysconf v0.3.11 h1:89WgdJhk5SNwJfu+GKyYveZ4IaJ7xAkecBo+KdJV0CM= +github.com/tklauser/numcpus v0.6.0 h1:kebhY2Qt+3U6RNK7UqpYNA+tJ23IBEGKkB7JQBfDYms= +github.com/uber/jaeger-client-go v2.30.0+incompatible h1:D6wyKGCecFaSRUpo8lCVbaOOb6ThwMmTEbhRwtKR97o= +github.com/uber/jaeger-lib v2.4.1+incompatible h1:td4jdvLcExb4cBISKIpHuGoVXh+dVKhn2Um6rjCsSsg= +github.com/vapourismo/knx-go v0.0.0-20220829185957-fb5458a5389d h1:BJMc7MNW/p80cCkC46JimNuowOWCnSSW5IHjtUrXzNk= +github.com/vishvananda/netlink v1.2.1-beta.2 h1:Llsql0lnQEbHj0I1OuKyp8otXp0r3q0mPkuhwHfStVs= +github.com/vishvananda/netns v0.0.4 h1:Oeaw1EM2JMxD51g9uhtC0D7erkIjgmj8+JZc26m1YX8= +github.com/vjeantet/grok v1.0.1 h1:2rhIR7J4gThTgcZ1m2JY4TrJZNgjn985U28kT2wQrJ4= +github.com/vmware/govmomi v0.28.1-0.20220921224932-b4b508abf208 h1:IDVzGQ2aczmTEfTos4hzmFw20tGQ4zZsVnel9C6VEpA= 
+github.com/wavefronthq/wavefront-sdk-go v0.13.0 h1:3s9maJmzI4orW+hiVBfCNp/SIu8ISXi6rtewmDGzheE= +github.com/wvanbergen/kafka v0.0.0-20171203153745-e2edea948ddf h1:TOV5PC6fIWwFOFra9xJfRXZcL2pLhMI8oNuDugNxg9Q= +github.com/wvanbergen/kazoo-go v0.0.0-20180202103751-f72d8611297a h1:ILoU84rj4AQ3q6cjQvtb9jBjx4xzR/Riq/zYhmDQiOk= +github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= +github.com/xdg-go/pbkdf2 v1.0.0 h1:Su7DPu48wXMwC3bs7MCNG+z4FhcyEuz5dlvchbq0B0c= +github.com/xdg-go/scram v1.1.2 h1:FHX5I5B4i4hKRVRBCFRxq1iQRej7WO3hhBuJf+UUySY= +github.com/xdg-go/stringprep v1.0.4 h1:XLI/Ng3O1Atzq0oBs3TWm+5ZVgkq2aqdlvP9JtoZ6c8= +github.com/xdg/scram v1.0.5 h1:TuS0RFmt5Is5qm9Tm2SoD89OPqe4IRiFtyFY4iwWXsw= +github.com/xdg/stringprep v1.0.3 h1:cmL5Enob4W83ti/ZHuZLuKD/xqJfus4fVPwE+/BDm+4= +github.com/youmark/pkcs8 v0.0.0-20201027041543-1326539a0a0a h1:fZHgsYlfvtyqToslyjUt3VOPF4J7aK/3MPcK7xp3PDk= +github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +github.com/yuin/gopher-lua v0.0.0-20200816102855-ee81675732da h1:NimzV1aGyq29m5ukMK0AMWEhFaL/lrEOaephfuoiARg= +github.com/yusufpapurcu/wmi v1.2.2 h1:KBNDSne4vP5mbSWnJbO+51IMOXJB67QiYCSBrubbPRg= +github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0= +go.mongodb.org/mongo-driver v1.11.2 h1:+1v2rDQUWNcGW7/7E0Jvdz51V38XXxJfhzbV17aNHCw= +go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0= +go.opentelemetry.io/collector v0.73.0 h1:oEBFtf5WcXiIPGXcjOM5gSQ3GNh/3d6pHf0IThhGmfw= +go.opentelemetry.io/collector/component v0.73.0 h1:ka24yVJoVETCru+l5Fm85xGc2y0HwvGfYwyRe7qmjq0= +go.opentelemetry.io/collector/confmap v0.73.0 h1:tC8x8sDk7JQ3QcbosqrxLe756sYcg4iUdTXsx7Ie4CM= +go.opentelemetry.io/collector/consumer v0.73.0 h1:gy89oaG198A7KGbXIsMIdN4lWVQqqSdx6dsBCfzLujU= 
+go.opentelemetry.io/collector/featuregate v0.73.0 h1:hpHKXmRiJqMLefIzXwIuqDo9df2HcI/66IAKLo+g7nc= +go.opentelemetry.io/collector/pdata v1.0.0-rcv0011 h1:7lT0vseP89mHtUpvgmWYRvQZ0eY+SHbVsnXY20xkoMg= +go.opentelemetry.io/collector/semconv v0.73.0 h1:gF4f6z1q8YfWzzo/gPKysjFmmM4Pv4nC2bWrTPxTPaE= +go.opentelemetry.io/otel v1.14.0 h1:/79Huy8wbf5DnIPhemGB+zEPVwnN6fuQybr/SRXa6hM= +go.opentelemetry.io/otel/metric v0.37.0 h1:pHDQuLQOZwYD+Km0eb657A25NaRzy0a+eLyKfDXedEs= +go.opentelemetry.io/otel/sdk v1.14.0 h1:PDCppFRDq8A1jL9v6KMI6dYesaq+DFcDZvjsoGvxGzY= +go.opentelemetry.io/otel/sdk/metric v0.37.0 h1:haYBBtZZxiI3ROwSmkZnI+d0+AVzBWeviuYQDeBWosU= +go.opentelemetry.io/otel/trace v1.14.0 h1:wp2Mmvj41tDsyAJXiWDWpfNsOiIyd38fy85pyKcFq/M= +go.uber.org/atomic v1.10.0 h1:9qC72Qh0+3MqyJbAn8YU5xVq1frD8bn3JtD2oXtafVQ= +go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= +go.uber.org/zap v1.24.0 h1:FiJd5l1UOLj0wCgbSE0rwwXHzEdAZS6hiiSnxJN/D60= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.31.0 h1:ihbySMvVjLAeSH1IbfcRTkD/iNscyz8rGzjF/E5hV6U= +golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= +golang.org/x/exp v0.0.0-20230307190834-24139beb5833 h1:SChBja7BCQewoTAU7IgvucQKMIXrEpFxNMs0spT3/5s= +golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.7.0/go.mod 
h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.17.0 h1:zY54UmvipHiNd+pm+m0x9KhZ9hl1/7QNMyxXbc6ICqA= +golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.3.0/go.mod h1:MBQ8lrhLObU/6UmLb4fmbmk5OcyYmqtbGd/9yIeKjEE= +golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac= +golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= +golang.org/x/oauth2 v0.7.0 h1:qe6s0zUXlPX80/dITx3440hWZ7GwMwgDDyrSGTPJG/g= +golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 
+golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191001151750-bb3f8db39f24/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.3.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA= +golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.3.0/go.mod h1:q750SLmJuPmVoN1blW3UFBPREJfb1KmY3vwxfr+nFDA= +golang.org/x/term v0.27.0 h1:WP60Sv1nlK1T6SupCHbXzSaN0b9wUmsPoRS9b61A23Q= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.5.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo= +golang.org/x/text 
v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= +golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.4.0/go.mod h1:UE5sM2OK9E/d67R0ANs2xJizIymRP5gJU295PvKXxjQ= +golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d h1:vU5i/LfpvrRCpgM/VPfJLg5KjxD3E+hfT1SH+d9zLwg= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 h1:H2TDz8ibqkAF6YGhCdN3jS9O0/s90v0rJh3X/OLHEUk= +golang.zx2c4.com/wireguard v0.0.0-20211209221555-9c9e7e272434 h1:3zl8RkJNQ8wfPRomwv/6DBbH2Ut6dgMaWTxM0ZunWnE= +golang.zx2c4.com/wireguard/wgctrl v0.0.0-20211230205640-daad0b7ba671 h1:tJAYx7pB6b5bNqi7XatStqFT2zFAxhXcGDq1R6FqqjU= +google.golang.org/api v0.121.0 h1:8Oopoo8Vavxx6gt+sgs8s8/X60WBAtKQq6JqnkF+xow= +google.golang.org/appengine v1.6.7 h1:FZR1q0exgwxzPzp/aF+VccGrSfxfPpkBqjIIEq3ru6c= +google.golang.org/genproto v0.0.0-20230526161137-0005af68ea54 
h1:9NWlQfY2ePejTmfwUH1OWwmznFa+0kKcHGPDvcPza9M= +google.golang.org/genproto/googleapis/api v0.0.0-20230525234035-dd9d682886f9 h1:m8v1xLLLzMe1m5P+gCTF8nJB9epwZQUBERm20Oy1poQ= +google.golang.org/genproto/googleapis/rpc v0.0.0-20230525234030-28d5490b6b19 h1:0nDDozoAU19Qb2HwhXadU8OcsiO/09cnTqhUtq2MEOM= +google.golang.org/genproto/googleapis/rpc v0.0.0-20230525234030-28d5490b6b19/go.mod h1:66JfowdXAEgad5O9NnYcsNPLCPZJD++2L9X0PCMODrA= +google.golang.org/grpc v1.57.2 h1:uw37EN34aMFFXB2QPW7Tq6tdTbind1GpRxw5aOX3a5k= +google.golang.org/grpc v1.57.2/go.mod h1:Sd+9RMTACXwmub0zcNY2c4arhtrbBYD1AUHI/dt16Mo= +google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= +google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= +google.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cng= +google.golang.org/protobuf v1.30.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/fatih/pool.v2 v2.0.0 h1:xIFeWtxifuQJGk/IEPKsTduEKcKvPmhoiVDGpC40nKg= +gopkg.in/fsnotify.v1 v1.4.7 h1:xOHLXZwVvI9hhs+cLKq5+I5onOuwQLhQwiu63xxlHs4= +gopkg.in/gorethink/gorethink.v3 v3.0.5 h1:e2Uc/Xe+hpcVQFsj6MuHlYog3r0JYpnTzwDj/y2O4MU= +gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= +gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA= +gopkg.in/olivere/elastic.v5 v5.0.86 h1:xFy6qRCGAmo5Wjx96srho9BitLhZl2fcnpuidPwduXM= +gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= +gopkg.in/tomb.v2 v2.0.0-20161208151619-d5d1b5820637 h1:yiW+nvdHb9LVqSHQBXfZCieqV4fzYhNBql77zY0ykqs= +gopkg.in/yaml.v2 v2.2.2/go.mod 
h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +honnef.co/go/tools v0.2.2 h1:MNh1AVMyVX23VUHE2O27jm6lNj3vjO5DexS4A1xvnzk= +k8s.io/api v0.26.2 h1:dM3cinp3PGB6asOySalOZxEG4CZ0IAdJsrYZXE/ovGQ= +k8s.io/apimachinery v0.26.2 h1:da1u3D5wfR5u2RpLhE/ZtZS2P7QvDgLZTi9wrNZl/tQ= +k8s.io/apimachinery v0.26.2/go.mod h1:ats7nN1LExKHvJ9TmwootT00Yz05MuYqPXEXaVeOy5I= +k8s.io/client-go v0.26.2 h1:s1WkVujHX3kTp4Zn4yGNFK+dlDXy1bAAkIl+cFAiuYI= +k8s.io/cri-api v0.25.13 h1:FaVci3+y5COQuyAFWUckdfOxRpD+m0cnaW2q0OPVm1Q= +k8s.io/cri-api v0.25.13/go.mod h1:yKsLus3raCZ+WbR2m5hS+3hUs5BgSldj2CFJTWyx48M= +k8s.io/klog v1.0.0 h1:Pt+yjF5aB1xDSVbau4VsWe+dQNzA0qv1LlXdC2dF6Q8= +k8s.io/klog/v2 v2.90.1 h1:m4bYOKall2MmOiRaR1J+We67Do7vm9KiQVlT96lnHUw= +k8s.io/kube-openapi v0.0.0-20230303024457-afdc3dddf62d h1:VcFq5n7wCJB2FQMCIHfC+f+jNcGgNMar1uKd6rVlifU= +k8s.io/utils v0.0.0-20230308161112-d77c459e9343 h1:m7tbIjXGcGIAtpmQr7/NAi7RsWoW3E7Zcm4jI1HicTc= +layeh.com/radius v0.0.0-20221205141417-e7fbddd11d68 h1:2NDro2Jzkrqfngy/sA5GVnChs7fx8EzcQKFi/lI2cfg= +lukechampine.com/uint128 v1.2.0 h1:mBi/5l91vocEN8otkC5bDLhi2KdCticRiwbdB0O+rjI= +modernc.org/cc/v3 v3.40.0 h1:P3g79IUS/93SYhtoeaHW+kRCIrYaxJ27MFPv+7kaTOw= +modernc.org/ccgo/v3 v3.16.13 h1:Mkgdzl46i5F/CNR/Kj80Ri59hC8TKAhZrYSaqvkwzUw= +modernc.org/libc v1.22.3 h1:D/g6O5ftAfavceqlLOFwaZuA5KYafKwmr30A6iSqoyY= +modernc.org/mathutil v1.5.0 h1:rV0Ko/6SfM+8G+yKiyI830l3Wuz1zRutdslNoQ0kfiQ= +modernc.org/memory v1.5.0 h1:N+/8c5rE6EqugZwHii4IFsaJ7MUhoWX07J5tC/iI5Ds= +modernc.org/opt v0.1.3 h1:3XOZf2yznlhC+ibLltsDGzABUGVx8J6pnFMS3E4dcq4= +modernc.org/sqlite v1.21.0 h1:4aP4MdUf15i3R3M2mx6Q90WHKz3nZLoz96zlB6tNdow= +modernc.org/strutil v1.1.3 
h1:fNMm+oJklMGYfU9Ylcywl0CO5O6nTfaowNsh2wpPjzY= +modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y= +sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= +sigs.k8s.io/structured-merge-diff/v4 v4.2.3 h1:PRbqxJClWWYMNV1dhaG4NsibJbArud9kFxnAMREiWFE= +sigs.k8s.io/yaml v1.3.0 h1:a2VclLzOGrwOHDiV8EfBGhvjHvP46CtW5j6POvhYGGo= diff --git a/mind-cluster/component/npu-exporter/platforms/inputs/all/npu.go b/mind-cluster/component/npu-exporter/platforms/inputs/all/npu.go new file mode 100644 index 0000000..1318957 --- /dev/null +++ b/mind-cluster/component/npu-exporter/platforms/inputs/all/npu.go @@ -0,0 +1,20 @@ +//go:build !custom || inputs || inputs.npu + +/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package all this for register +package all + +import _ "github.com/influxdata/telegraf/plugins/inputs/npu" // register plugin diff --git a/mind-cluster/component/npu-exporter/platforms/inputs/npu/README.md b/mind-cluster/component/npu-exporter/platforms/inputs/npu/README.md new file mode 100644 index 0000000..72fc73e --- /dev/null +++ b/mind-cluster/component/npu-exporter/platforms/inputs/npu/README.md @@ -0,0 +1,107 @@ +# npu plugin of telegraf +## 使用介绍 +该插件代码可根据以下两种方法来使用(选择其一即可): + +### 1、源码集成使用(适合未安装Telegraf的情况) +对应官方文档:https://docs.influxdata.com/telegraf/v1.26/configure_plugins/external_plugins/write_external_plugin/ +#### **编译步骤:** +拉取telegraf v1.26.0分支源码 +```shell +git clone -b v1.26.0 https://github.com/influxdata/telegraf.git +``` +拉取插件源码 +```shell +git clone -b [latest_tag] https://gitcode.com/Ascend/mind-cluster.git +# [latest_tag]此tag请自行修改,建议采用仓库的最新标签,否则可能导致引用函数失效 +``` +将插件代码集成到telegraf源码中(其中路径按实际修改) +```shell +cp -r mind-cluster/component/npu-exporter/platforms/inputs/npu telegraf/plugins/inputs +``` +将插件注册到telegraf(其中路径按实际修改) +```shell +cp -r mind-cluster/component/npu-exporter/platforms/inputs/all/npu.go telegraf/plugins/inputs/all +``` +将telegraf源码中的Makefile里的“CGO_ENABLED=0”改为“CGO_ENABLED=1” +```shell +cd telegraf +sed -i s"/CGO_ENABLED=0/CGO_ENABLED=1/" Makefile +``` + +将如下内容加入到telegraf源码的go.mod的文件里 +注意:[latest_tag]请自行修改为commitID/分支名称/tag名称中的一种,建议采用仓库的最新标签,否则可能导致引用函数失效 +```go.mod +require huawei.com/npu-exporter/v6 v6.0.0-RC1 + +replace huawei.com/npu-exporter/v6 => gitcode.com/Ascend/mind-cluster.git/component/npu-exporter/v6 [latest_tag] +replace ascend-common => gitcode.com/Ascend/mind-cluster.git/component/ascend-common [latest_tag] +``` + +然后执行 +```shell +go mod tidy +``` +接着编译telegraf +```shell +make all +``` +运行前请先创建日志目录:(该日志是插件调用底层api将记录的日志) +```shell +mkdir -m 750 /var/log/mindx-dl/npu-exporter +``` +源码集成时,该日志可通过hwlog.LogConfig{}结构体来配置,该结构体的详细信息如下 +```go +type LogConfig struct { + // log file path, default 
"/var/log/mindx-dl/npu-exporter/npu-plugin.log" in npu plugin + LogFileName string + // only write to std out, default value: false + OnlyToStdout bool + // only write to file, default value: false + OnlyToFile bool + // log level, -1-debug, 0-info, 1-warning, 2-error 3-critical default value: 0 + LogLevel int + // size of a single log file (MB), default value: 2MB in npu plugin + FileMaxSize int + // MaxLineLength Max length of each log line, default value: 256 + MaxLineLength int + // maximum number of backup log files, set as 2 in npu plugin + MaxBackups int + // maximum number of days for backup log files, default value: 2 + MaxAge int + // whether backup files need to be compressed, default value: false + IsCompress bool + // expiration time for log cache, default value: 1s + ExpiredTime int + // Size of log cache space, default: 2048 + CacheSize int +} +``` +#### **使用示例:** +使用插件中提供的配置文件运行telegraf +```shell +./telegraf --config path_to_plugins/inputs/npu/sample.conf +``` + +### 2、二进制集成,使用telegraf的execd机制(适合已安装Telegraf的情况) +对应官方文档:https://docs.influxdata.com/telegraf/v1.26/configure_plugins/external_plugins/shim/ + +从[MindCluster社区](https://www.hiascend.com/developer/download/community/result?module=cluster)获取npu-exporter软件包,并从中解压出npu-exporter二进制文件 + +### 使用 +运行前请先创建日志目录:(该日志是插件调用底层api将记录的日志) +```shell +mkdir -m 750 /var/log/mindx-dl/npu-exporter +``` +先编写配置文件,如test.conf +``` +[[inputs.execd]] + command = ["path_to_npu_plugin/npu-exporter", "-platform=Telegraf"] + signal = "none" + +[[outputs.file]] + files=["stdout"] +``` +然后运行telegraf +```shell +./telegraf --config path_to_config_file/test.conf +``` \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/platforms/inputs/npu/npu.go b/mind-cluster/component/npu-exporter/platforms/inputs/npu/npu.go new file mode 100644 index 0000000..4c200e0 --- /dev/null +++ b/mind-cluster/component/npu-exporter/platforms/inputs/npu/npu.go @@ -0,0 +1,104 @@ +/* Copyright(C) 2021-2023. 
Huawei Technologies Co.,Ltd. All rights reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+// Package npu implements the Telegraf "npu" input plugin, which reports collected NPU fields to an accumulator
+package npu
+
+import (
+	_ "embed"
+	"strings"
+
+	"github.com/influxdata/telegraf"
+	"github.com/influxdata/telegraf/plugins/inputs"
+
+	"ascend-common/api"
+	"huawei.com/npu-exporter/v6/collector/common"
+	"huawei.com/npu-exporter/v6/collector/container"
+	"huawei.com/npu-exporter/v6/utils/logger"
+)
+
+//go:embed sample.conf
+var sampleConfig string
+
+const (
+	num2 = 2 // minimum number of "_"-separated key parts for a key that carries a vdev id
+)
+
+// WatchNPU is the Telegraf input registered under the name "npu"; it wraps the shared NPU collector
+type WatchNPU struct { // NOTE(review): sample.conf sets npu_log_level but no toml-tagged field exists here - confirm Telegraf accepts it
+	collector *common.NpuCollector // set from common.Collector by the factory registered in init
+}
+
+// SampleConfig returns the content of the embedded sample.conf
+func (*WatchNPU) SampleConfig() string {
+	return sampleConfig
+}
+
+// Gather runs the three collector chains once and reports the resulting fields to acc as "ascend"; always returns nil
+func (npu *WatchNPU) Gather(acc telegraf.Accumulator) error {
+
+	fieldsMap := make(map[string]map[string]interface{})
+	const devName = "ascend"
+
+	devTagValue := ""
+	if cardType := npu.collector.Dmgr.GetDevType(); cardType == api.Ascend910A3 || cardType == api.Ascend910B ||
+		cardType == api.Ascend910A { // 910A, 910B and 910A3 all share the lower-cased Ascend910 tag value
+		devTagValue = strings.ToLower(api.Ascend910)
+	} else {
+		devTagValue = strings.ToLower(cardType) // any other card type is tagged with its own lower-cased name
+	}
+	logger.DynamicConfigure(logger.Config{Acc: acc}) // hands the current accumulator to the logger configuration
+
+	containerMap := common.GetContainerNPUInfo(npu.collector)
+	chips := common.GetChipListWithVNPU(npu.collector)
+
+	fieldsMap = npu.gatherChain(fieldsMap, common.ChainForSingleGoroutine, containerMap, chips)
+	fieldsMap = npu.gatherChain(fieldsMap, common.ChainForMultiGoroutine, containerMap, chips)
+	fieldsMap = npu.gatherChain(fieldsMap, common.ChainForCustomPlugin, containerMap, chips)
+
+	generalFields := fieldsMap[common.GeneralDevTagKey] // nil when no collector stored anything under the general key
+	acc.AddFields(devName, generalFields, map[string]string{"device": devTagValue})
+
+	// drop the general entry so the per-key loop below does not report it a second time
+	delete(fieldsMap, common.GeneralDevTagKey)
+	for key, fields := range fieldsMap {
+
+		ids := strings.Split(key, "_") // first part is appended to the device tag, second part (if any) is the vdev id
+		devTag := map[string]string{"device": devTagValue + "-" + ids[0]}
+		if len(ids) >= num2 {
+			devTag["vdev_id"] = ids[1]
+		}
+
+		acc.AddFields(devName, fields, devTag)
+	}
+
+	return nil
+}
+
+func (npu *WatchNPU) gatherChain(fieldsMap map[string]map[string]interface{}, chain []common.MetricsCollector,
+	containerMap map[int32]container.DevicesInfo, chips []common.HuaWeiAIChip) map[string]map[string]interface{} {
+
+	for _, collector := range chain { // thread fieldsMap through every collector of the chain, in order
+		fieldsMap = collector.UpdateTelegraf(fieldsMap, npu.collector, containerMap, chips)
+	}
+	return fieldsMap
+}
+
+func init() {
+	inputs.Add("npu", func() telegraf.Input { // register the plugin factory under the name "npu"
+		return &WatchNPU{
+			collector: common.Collector, // read when the factory runs, not when init runs
+		}
+	})
+}
diff --git a/mind-cluster/component/npu-exporter/platforms/inputs/npu/npu_test.go b/mind-cluster/component/npu-exporter/platforms/inputs/npu/npu_test.go
new file mode 100644
index 0000000..c8adef4
--- /dev/null
+++ b/mind-cluster/component/npu-exporter/platforms/inputs/npu/npu_test.go
@@ -0,0 +1,174 @@
+/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+// Package npu (test file) exercises WatchNPU.Gather and WatchNPU.gatherChain with patched collaborators
+package npu
+
+import (
+	"fmt"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/agiledragon/gomonkey/v2"
+	"github.com/influxdata/telegraf"
+	"github.com/smartystreets/goconvey/convey"
+
+	"ascend-common/api"
+	"ascend-common/common-utils/hwlog"
+	"ascend-common/devmanager"
+	"huawei.com/npu-exporter/v6/collector/common"
+	"huawei.com/npu-exporter/v6/collector/container"
+	"huawei.com/npu-exporter/v6/collector/metrics"
+	"huawei.com/npu-exporter/v6/utils/logger"
+)
+
+const (
+	num5 = 5 // converted with time.Duration(num5), i.e. 5ns - NOTE(review): confirm a larger unit was not intended
+)
+
+func init() {
+	logger.HwLogConfig = &hwlog.LogConfig{
+		OnlyToStdout: true,
+	}
+	logger.InitLogger("Prometheus")
+	initChain() // install the test collector chain before any test runs
+}
+
+func initChain() {
+	common.ChainForSingleGoroutine = []common.MetricsCollector{ // overwrites the package-level chain with one collector
+		&metrics.VersionCollector{},
+	}
+}
+
+func mockNewNpuCollector() *common.NpuCollector { // builds a collector from a zero-value parser and device manager
+	tc := newNpuCollectorTestCase{
+		cacheTime:    time.Duration(num5),
+		updateTime:   time.Duration(num5),
+		deviceParser: &container.DevicesParser{},
+		dmgr:         &devmanager.DeviceManager{},
+	}
+	c := common.NewNpuCollector(tc.cacheTime, tc.updateTime, tc.deviceParser, tc.dmgr)
+	return c
+}
+
+// TestGather checks the "device" tag Gather emits: 910A3 is reported as Ascend910, 310P keeps its own name
+func TestGather(t *testing.T) {
+	tests := []struct {
+		name        string
+		deviceType  string
+		expectedTag string
+	}{
+		{name: api.Ascend910A3,
+			deviceType:  api.Ascend910A3,
+			expectedTag: api.Ascend910,
+		},
+		{name: api.Ascend310P,
+			deviceType:  api.Ascend310P,
+			expectedTag: api.Ascend310P,
+		},
+	}
+	npu := &WatchNPU{
+		collector: mockNewNpuCollector(),
+	}
+	acc := &MockAccumulator{} // shared by all cases, so entries recorded by earlier cases stay in acc.fields
+
+	for _, tt := range tests {
+		convey.Convey(tt.name, t, func() {
+			patches := gomonkey.NewPatches()
+			defer patches.Reset()
+
+			patches.ApplyMethodReturn(npu.collector.Dmgr, "GetDevType", tt.deviceType)
+			patches.ApplyFuncReturn(common.GetContainerNPUInfo, nil)
+			patches.ApplyFuncReturn(common.GetChipListWithVNPU, nil)
+			patches.ApplyMethodReturn(common.ChainForSingleGoroutine[0], "UpdateTelegraf",
+				map[string]map[string]interface{}{
+					common.GeneralDevTagKey: {"npu_exporter_version_info": "7.0.0"},
+					"0":                     {"npu_chip_info_power": "1"},   // key without "_": device tag only
+					"1_100":                 {"npu_chip_info_voltage": "1"}, // key with "_": Gather adds a vdev_id tag
+				})
+
+			err := npu.Gather(acc)
+			convey.So(err, convey.ShouldBeNil)
+			convey.So(acc.fields["ascend,device="+strings.ToLower(tt.expectedTag)], convey.ShouldNotBeEmpty)
+		})
+	}
+}
+
+// TestGatherChain runs gatherChain with an unpatched VersionCollector and expects one top-level key in the result
+func TestGatherChain(t *testing.T) {
+	npu := &WatchNPU{}
+	fieldsMap := make(map[string]map[string]interface{})
+	chain := []common.MetricsCollector{&metrics.VersionCollector{}}
+
+	convey.Convey("TestGatherChain", t, func() {
+		result := npu.gatherChain(fieldsMap, chain, nil, nil)
+		logger.Infof("result:%v", result)
+		convey.So(len(result), convey.ShouldEqual, 1)
+	})
+}
+
+type newNpuCollectorTestCase struct {
+	cacheTime    time.Duration
+	updateTime   time.Duration
+	deviceParser *container.DevicesParser
+	dmgr         *devmanager.DeviceManager
+}
+
+// MockAccumulator is a telegraf.Accumulator test double: AddFields records its input, every other method is a no-op
+type MockAccumulator struct {
+	fields map[string]map[string]interface{} // keyed by "<measurement>,<k=v>[,<k=v>...]"; allocated on first AddFields
+}
+
+func (m *MockAccumulator) AddFields(measurement string, fields map[string]interface{}, tags map[string]string,
+	t ...time.Time) {
+	if m.fields == nil {
+		m.fields = make(map[string]map[string]interface{})
+	}
+	pairs := make([]string, 0, len(tags))
+	for k, v := range tags { // NOTE(review): map order is random, so keys built from more than one tag are not stable
+		pairs = append(pairs, fmt.Sprintf("%s=%v", k, v))
+	}
+	metricKey := measurement + "," + strings.Join(pairs, ",")
+	m.fields[metricKey] = fields // a later call producing the same key replaces the earlier entry
+}
+
+func (m *MockAccumulator) AddGauge(measurement string, fields map[string]interface{}, tags map[string]string,
+	t ...time.Time) { // no-op
+}
+
+func (m *MockAccumulator) AddCounter(measurement string, fields map[string]interface{}, tags map[string]string,
+	t ...time.Time) { // no-op
+}
+
+func (m *MockAccumulator) AddSummary(measurement string, fields map[string]interface{}, tags map[string]string,
+	t ...time.Time) { // no-op
+}
+
+func (m *MockAccumulator) AddHistogram(measurement string, fields map[string]interface{}, tags map[string]string,
+	t ...time.Time) { // no-op
+}
+
+func (m *MockAccumulator) AddMetric(metric telegraf.Metric) { // no-op
+}
+
+func (m *MockAccumulator) SetPrecision(precision time.Duration) { // no-op
+}
+
+func (m *MockAccumulator) AddError(err error) { // no-op: the error is dropped
+}
+
+func (m *MockAccumulator) WithTracking(maxTracked int) telegraf.TrackingAccumulator {
+	return nil // the mock provides no tracking accumulator
+}
diff --git a/mind-cluster/component/npu-exporter/platforms/inputs/npu/sample.conf b/mind-cluster/component/npu-exporter/platforms/inputs/npu/sample.conf
new file mode 100644
index 0000000..11fe998
--- /dev/null
+++ b/mind-cluster/component/npu-exporter/platforms/inputs/npu/sample.conf
@@ -0,0 +1,9 @@
+[agent]
+  interval="20s"
+  flush_interval="20s"
+
+[[inputs.npu]]
+  npu_log_level = 1
+
+[[outputs.file]]
+  files=["stdout"]
\ No newline at end of file
diff --git a/mind-cluster/component/npu-exporter/platforms/prom/prometheus_collector.go b/mind-cluster/component/npu-exporter/platforms/prom/prometheus_collector.go
new file mode 100644
index 0000000..088eeb9
--- /dev/null
+++ b/mind-cluster/component/npu-exporter/platforms/prom/prometheus_collector.go
@@ -0,0 +1,103 @@
+/* Copyright(C) 2021-2025. Huawei Technologies Co.,Ltd. All rights reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package prometheus for prometheus collector +package prom + +import ( + "github.com/prometheus/client_golang/prometheus" + + "huawei.com/npu-exporter/v6/collector/common" + "huawei.com/npu-exporter/v6/collector/container" + "huawei.com/npu-exporter/v6/utils" + "huawei.com/npu-exporter/v6/utils/logger" +) + +// CollectorForPrometheus Entry point for collecting and converting +type CollectorForPrometheus struct { + collector *common.NpuCollector +} + +// NewPrometheusCollector create an instance of prometheus Collector +func NewPrometheusCollector(collector *common.NpuCollector) *CollectorForPrometheus { + promCollector := &CollectorForPrometheus{ + collector: collector, + } + return promCollector +} + +// Describe desc metrics of prometheus +func (*CollectorForPrometheus) Describe(ch chan<- *prometheus.Desc) { + if ch == nil { + logger.Error("ch is nil ") + return + } + const cacheSize = 100 + tempCh := make(chan *prometheus.Desc, cacheSize) + done := make(chan bool) + + go func() { + seenMetrics := make(map[string]struct{}) + for desc := range tempCh { + if desc == nil { + continue + } + descKey := utils.GetDescName(desc) + if _, exists := seenMetrics[descKey]; exists { + logger.Warnf("duplicate metric description detected, keeping first declaration, ignoring duplicate: %s", desc) + continue + } + seenMetrics[descKey] = struct{}{} + ch <- desc + } + // tempCh closed + done <- true + }() + + describeChain(tempCh, common.ChainForSingleGoroutine) + describeChain(tempCh, common.ChainForMultiGoroutine) + describeChain(tempCh, common.ChainForCustomPlugin) + + 
close(tempCh) + + <-done +} + +func describeChain(ch chan<- *prometheus.Desc, chain []common.MetricsCollector) { + for _, collector := range chain { + if collector != nil { + collector.Describe(ch) + } + } +} + +// Collect update metrics of prometheus +func (n *CollectorForPrometheus) Collect(ch chan<- prometheus.Metric) { + containerMap := common.GetContainerNPUInfo(n.collector) + chips := common.GetChipListWithVNPU(n.collector) + collectChain(ch, n, containerMap, chips, common.ChainForSingleGoroutine) + collectChain(ch, n, containerMap, chips, common.ChainForMultiGoroutine) + collectChain(ch, n, containerMap, chips, common.ChainForCustomPlugin) +} + +func collectChain(ch chan<- prometheus.Metric, n *CollectorForPrometheus, containerMap map[int32]container.DevicesInfo, + chips []common.HuaWeiAIChip, chain []common.MetricsCollector) { + if ch == nil { + logger.Error("ch is nil") + return + } + for _, collector := range chain { + collector.UpdatePrometheus(ch, n.collector, containerMap, chips) + } +} diff --git a/mind-cluster/component/npu-exporter/platforms/prom/prometheus_collector_test.go b/mind-cluster/component/npu-exporter/platforms/prom/prometheus_collector_test.go new file mode 100644 index 0000000..331ca66 --- /dev/null +++ b/mind-cluster/component/npu-exporter/platforms/prom/prometheus_collector_test.go @@ -0,0 +1,159 @@ +/* +Copyright(C) 2021-2025. Huawei Technologies Co.,Ltd. All rights reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package prometheus for prometheus collector +package prom + +import ( + "strconv" + "testing" + "time" + + "github.com/agiledragon/gomonkey/v2" + "github.com/prometheus/client_golang/prometheus" + "github.com/smartystreets/goconvey/convey" + + "ascend-common/common-utils/hwlog" + "ascend-common/devmanager" + "huawei.com/npu-exporter/v6/collector/common" + "huawei.com/npu-exporter/v6/collector/container" + "huawei.com/npu-exporter/v6/collector/metrics" + "huawei.com/npu-exporter/v6/utils/logger" +) + +const ( + maxMetricsCount = 2000 + num5 = 5 + mockContainerName = "mockContainerName" + maxChipNum int32 = 8 +) + +func TestDescribe(t *testing.T) { + + convey.Convey("test prometheus desc ", t, func() { + collector := NewPrometheusCollector(nil) + + convey.Convey("test prometheus desc when ch is nil", func() { + collector.Describe(nil) + }) + convey.Convey("test prometheus desc when ch is not nil", func() { + ch := make(chan *prometheus.Desc, maxMetricsCount) + collector.Describe(ch) + t.Logf("Describe len(ch):%v", len(ch)) + + convey.So(ch, convey.ShouldNotBeEmpty) + }) + + }) +} + +func TestCollect(t *testing.T) { + convey.Convey("test prometheus collect ", t, func() { + npuCollector := mockNewNpuCollector() + collector := NewPrometheusCollector(npuCollector) + + convey.Convey("test prometheus collect when ch is nil", func() { + collector.Collect(nil) + }) + convey.Convey("test prometheus collect when ch is not nil", func() { + // the mocks must be installed before Collect runs, otherwise they never take effect + ch := make(chan prometheus.Metric, maxMetricsCount) + + patches := gomonkey.NewPatches() + defer patches.Reset() + patches.ApplyFuncReturn(common.GetChipListWithVNPU, mockGetNPUChipList()) + patches.ApplyFuncReturn(common.GetContainerNPUInfo, mockGetContainerNPUInfo()) + + collector.Collect(ch) + t.Logf("Collect len(ch):%v", len(ch)) + convey.So(ch, convey.ShouldNotBeEmpty) + }) + }) +} + +func mockNewNpuCollector() *common.NpuCollector { + tc := newNpuCollectorTestCase{ + cacheTime: time.Duration(num5), + updateTime: time.Duration(num5), +
deviceParser: &container.DevicesParser{}, + dmgr: &devmanager.DeviceManager{}, + } + c := common.NewNpuCollector(tc.cacheTime, tc.updateTime, tc.deviceParser, tc.dmgr) + return c +} + +type newNpuCollectorTestCase struct { + cacheTime time.Duration + updateTime time.Duration + deviceParser *container.DevicesParser + dmgr *devmanager.DeviceManager +} + +func mockGetNPUChipList() []common.HuaWeiAIChip { + chips := make([]common.HuaWeiAIChip, 0) + for id := int32(0); id < maxChipNum; id++ { + chip := common.HuaWeiAIChip{ + CardId: id, + PhyId: id, + DeviceID: id, + LogicID: id, + } + + chips = append(chips, chip) + } + return chips +} + +func mockGetContainerNPUInfo() map[int32]container.DevicesInfo { + containsInfo := make(map[int32]container.DevicesInfo) + for id := int32(0); id < maxChipNum; id++ { + + containerInfo := container.DevicesInfo{ + ID: strconv.Itoa(int(id)), + Name: mockContainerName, + Devices: []int{int(id)}, + } + containsInfo[id] = containerInfo + } + return containsInfo +} + +func init() { + logger.HwLogConfig = &hwlog.LogConfig{ + OnlyToStdout: true, + } + logger.InitLogger("Prometheus") + + initChain() +} + +func initChain() { + common.ChainForSingleGoroutine = []common.MetricsCollector{ + &metrics.HccsCollector{}, + &metrics.BaseInfoCollector{}, + &metrics.SioCollector{}, + &metrics.VersionCollector{}, + &metrics.HbmCollector{}, + &metrics.DdrCollector{}, + &metrics.VnpuCollector{}, + &metrics.PcieCollector{}, + } + common.ChainForMultiGoroutine = []common.MetricsCollector{ + &metrics.NetworkCollector{}, + &metrics.RoceCollector{}, + &metrics.OpticalCollector{}, + } +} diff --git a/mind-cluster/component/npu-exporter/plugins/README.md b/mind-cluster/component/npu-exporter/plugins/README.md new file mode 100644 index 0000000..5690dac --- /dev/null +++ b/mind-cluster/component/npu-exporter/plugins/README.md @@ -0,0 +1,388 @@ +## 自定义插件开发说明 + +用户可参考提供的demo,或将代码拷贝到plugins目录下,重新编译部署,下面对demo中各文件进行说明 + +- `dcmi.go` 
、`dcmi_interface_api.h`:用户自定义NPU指标的接口声明与cgo实现,用于对接驱动dcmi接口,具体可参考demo实现,全部dcmi接口需参考驱动的dcmi接口文档。 +- `custom_metrics.go` 实现`MetricsCollector`的接口,用于指标采集与上报,需要实现下面的接口,具体可参考demo实现: + - Describe:prometheus上报指标前,需要先定义指标,该接口用于prometheus的指标定义 + - CollectToCache: 指标采集方法,每个采集周期都会执行,从外部获取数据,并传入到内部缓存中 + - UpdatePrometheus: 按照prometheus的格式,将缓存中的数据返回 + - UpdateTelegraf:按照telegraf的格式,将缓存中的数据返回。 + - IsSupported:检测当前环境,判断是否支持当前设备的检测。 + - PreCollect:正式开始采集前执行一次,可用于设备初始化。可以为空。 + - PostCollect:采集结束后执行一次,可用于数据的回收。可以为空。 +- `register.go`,提供插件注册函数,在npu-exporter启动时完成插件注册并完成dcmi接口初始化,**RegisterPlugin函数签名不要修改**,自定义插件通过`AddPluginCollector`接口注册,指标名称需要与`pluginConfiguration.json`中的指标组名称保持一致 + +对于插件指标组内定义的指标名称,不要与现有代码中已定义的插件指标(当前NPU指标、插件指标)重名 + +自定义插件采集时间超过10s后,npu-exporter会打印日志,提示插件采集时间过长,执行下一个插件采集。 + +### 编译部署 + +插件开发完后,执行npu-exporter代码目录下的`build/build.sh`完成编译,需要提前准备go开发环境。 + +编译完成后,会在output目录下生成新的二进制文件与相关配置文件,根据需要打开或关闭相应开关,根据安装部署章节的安装指导,重新制作镜像部署即可 + + + +`dcmi.go` + +```go +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package plugins this for dcmi interface +package plugins + +// #cgo LDFLAGS: -ldl +/* + #include <stddef.h> + #include <dlfcn.h> + #include <stdlib.h> + #include <stdio.h> + + #include "dcmi_interface_api.h" + + static void *dcmiHandle; + #define SO_NOT_FOUND -99999 + #define FUNCTION_NOT_FOUND -99998 + #define SUCCESS 0 + #define ERROR_UNKNOWN -99997 + #define CALL_FUNC(name,...)
if(name##_func==NULL){return FUNCTION_NOT_FOUND;}return name##_func(__VA_ARGS__); + + static int (*dcmi_get_device_health_func)(int card_id, int device_id, unsigned int *health); + int dcmi_get_device_health(int card_id, int device_id, unsigned int *health){ + CALL_FUNC(dcmi_get_device_health,card_id,device_id,health) + } + + // load .so files and functions + static int dcmiLoad_dl(const char* dcmiLibPath){ + if (dcmiLibPath == NULL) { + fprintf (stderr,"lib path is null\n"); + return SO_NOT_FOUND; + } + dcmiHandle = dlopen(dcmiLibPath,RTLD_LAZY | RTLD_GLOBAL); + if (dcmiHandle == NULL){ + fprintf (stderr,"%s\n",dlerror()); + return SO_NOT_FOUND; + } + + dcmi_get_device_health_func = dlsym(dcmiHandle,"dcmi_get_device_health"); + + return SUCCESS; + } + + static int dcmiShutDown(void){ + if (dcmiHandle == NULL) { + return SUCCESS; + } + return (dlclose(dcmiHandle) ? ERROR_UNKNOWN : SUCCESS); + } +*/ +import "C" +import ( + "fmt" + + "unsafe" + + "ascend-common/common-utils/utils" + "ascend-common/devmanager/common" +) + +const ( + dcmiLibraryName = "libdcmi.so" +) + +// DcLoad load dcmi symbol +func DcLoad() error { + dcmiLibPath, err := utils.GetDriverLibPath(dcmiLibraryName) + if err != nil { + return err + } + cDcmiTemplateName := C.CString(dcmiLibPath) + defer C.free(unsafe.Pointer(cDcmiTemplateName)) + if retCode := C.dcmiLoad_dl(cDcmiTemplateName); retCode != C.SUCCESS { + return fmt.Errorf("dcmi lib load failed, error code: %d", int32(retCode)) + } + return nil +} + +// DcShutDown clean the dynamically loaded resource +func DcShutDown() error { + if retCode := C.dcmiShutDown(); retCode != C.SUCCESS { + return fmt.Errorf("dcmi shut down failed, error code: %d", int32(retCode)) + } + + return nil +} + +// DcGetDeviceHealth get device health +func DcGetDeviceHealth(cardID, deviceID int32) (int32, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.RetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + 
var health C.uint + if retCode := C.dcmi_get_device_health(C.int(cardID), C.int(deviceID), + &health); int32(retCode) != common.Success { + return common.RetError, fmt.Errorf("get device (cardID: %d, deviceID: %d) health state failed, ret "+ + "code: %d, health code: %d", cardID, deviceID, int32(retCode), int64(health)) + } + if common.IsGreaterThanOrEqualInt32(int64(health)) { + return common.RetError, fmt.Errorf("get wrong health state , device (cardID: %d, deviceID: %d) "+ + "health: %d", cardID, deviceID, int64(health)) + } + return int32(health), nil +} + +``` + + + +`dcmi_interface_api.h` + +```c++ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __DCMI_INTERFACE_API_H__ +#define __DCMI_INTERFACE_API_H__ + +#ifdef __cplusplus +#if __cplusplus +extern "C" { +#endif +#endif /* __cplusplus */ + +#define DCMIDLLEXPORT static + +DCMIDLLEXPORT int dcmi_get_device_health(int card_id, int device_id, unsigned int *health); + +#ifdef __cplusplus +#if __cplusplus +} +#endif +#endif /* __cplusplus */ + +#endif /* __DCMI_INTERFACE_API_H__ */ +``` + + + +`custom_metrics.go` + +```go +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package plugins for custom metrics +package plugins + +import ( + "strings" + "sync" + "time" + + "github.com/prometheus/client_golang/prometheus" + + "huawei.com/npu-exporter/v6/collector/common" + "huawei.com/npu-exporter/v6/collector/container" + "huawei.com/npu-exporter/v6/utils/logger" +) + +var ( + PluginInfoDesc = prometheus.NewDesc("plugin_info", "exporter custom plugin info", + []string{"plugin_label"}, nil) + + PluginNpuInfoDesc = prometheus.NewDesc("npu_plugin_info", "exporter custom npu plugin info", + []string{"npu_plugin_label"}, nil) +) + +const ( + pluginInfoKey = "pluginInfoKey" + pluginInfoValue = 1.11111 + pluginLabel = "pluginLabel" + npuPluginLabel = "npuPluginInfoKey" + npuPluginInfoKey = "npuPluginInfoKey" + pluginName = "MyPlugin" +) + +// PluginInfoCollector collect custom plugin info +type PluginInfoCollector struct { + common.MetricsCollectorAdapter + Cache sync.Map +} + +// Describe description of the metric +func (c *PluginInfoCollector) Describe(ch chan<- *prometheus.Desc) { + // add desc + logger.Debug("PluginInfoCollector Describe") + ch <- PluginInfoDesc + ch <- PluginNpuInfoDesc +} + +// CollectToCache collect the metric to cache +func (c *PluginInfoCollector) CollectToCache(n *common.NpuCollector, chipList []common.HuaWeiAIChip) { + // collect metric to cache + logger.Debug("PluginInfoCollector CollectToCache") + c.Cache.Store(pluginInfoKey, pluginInfoValue) + health, err := DcGetDeviceHealth(0, 0) + if err != nil { + logger.Error(err) + return + } + c.Cache.Store(npuPluginInfoKey, health) +} + +// UpdatePrometheus update 
prometheus metric +func (c *PluginInfoCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *common.NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []common.HuaWeiAIChip) { + logger.Debug("PluginInfoCollector UpdatePrometheus") + // get metric from cache + pluginCache, _ := c.Cache.Load(pluginInfoKey) + npuPluginCache, _ := c.Cache.Load(npuPluginInfoKey) + // update plugin info + ch <- prometheus.NewMetricWithTimestamp(time.Now(), + prometheus.MustNewConstMetric(PluginInfoDesc, prometheus.GaugeValue, pluginCache.(float64), pluginLabel)) + // update npu plugin info + value := float64(npuPluginCache.(int32)) + ch <- prometheus.NewMetricWithTimestamp(time.Now(), + prometheus.MustNewConstMetric(PluginNpuInfoDesc, prometheus.GaugeValue, value, npuPluginLabel)) + +} + +// UpdateTelegraf update telegraf metric +func (c *PluginInfoCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *common.NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []common.HuaWeiAIChip) map[string]map[string]interface{} { + logger.Debug("PluginInfoCollector UpdateTelegraf") + // get metric from cache + pluginCache, _ := c.Cache.Load(pluginInfoKey) + npuPluginCache, _ := c.Cache.Load(npuPluginInfoKey) + // update plugin info + if fieldsMap[common.GeneralDevTagKey] == nil { + fieldsMap[common.GeneralDevTagKey] = make(map[string]interface{}) + } + doUpdateTelegraf(fieldsMap[common.GeneralDevTagKey], PluginInfoDesc, pluginCache.(float64), "") + // update npu plugin info + const NpuLogicID = "1" + value := float64(npuPluginCache.(int32)) + if fieldsMap[NpuLogicID] == nil { + fieldsMap[NpuLogicID] = make(map[string]interface{}) + } + doUpdateTelegraf(fieldsMap[NpuLogicID], PluginNpuInfoDesc, value, "") + return fieldsMap +} + +// PreCollect pre handle before collect +func (c *PluginInfoCollector) PreCollect(n *common.NpuCollector, chipList []common.HuaWeiAIChip) { + logger.Debug("PluginInfoCollector PreCollect") +} + +// PostCollect post 
handle after collect +func (c *PluginInfoCollector) PostCollect(n *common.NpuCollector) { + logger.Debug("PluginInfoCollector PostCollect") +} + +// IsSupported Check whether the current hardware supports this metric +func (c *PluginInfoCollector) IsSupported(n *common.NpuCollector) bool { + logger.Debug("PluginInfoCollector IsSupported") + return true +} + +// getDescName parse metrics name from prometheus.Desc object +func getDescName(desc *prometheus.Desc) string { + str := desc.String() + startIndex := strings.Index(str, "fqName: ") + len("fqName: ") + readfqName := str[startIndex:] + + endIndex := strings.Index(readfqName, ",") + if endIndex != -1 { + readfqName = readfqName[:endIndex] + } + + readfqName = strings.Trim(readfqName, "\"") + return readfqName +} + +func doUpdateTelegraf(fieldMap map[string]interface{}, desc *prometheus.Desc, value interface{}, extInfo string) { + fieldMap[getDescName(desc)+extInfo] = value +} + + +``` + + + +`register.go` + +```go +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package plugins for custom metrics +package plugins + +import ( + "huawei.com/npu-exporter/v6/collector/config" + "huawei.com/npu-exporter/v6/utils/logger" +) + +// RegisterPlugin registers the demo plugin collector and, only if that succeeds, loads the dcmi library +func RegisterPlugin() { + err := config.AddPluginCollector(pluginName, &PluginInfoCollector{}) + if err != nil { + logger.Errorf("add plugin failed: %v\n", err) + return + } + logger.Infof("add plugin ok: %v\n", pluginName) + err = DcLoad() + if err != nil { + logger.Errorf("dcmi init failed: %v\n", err) + } +} + +``` + diff --git a/mind-cluster/component/npu-exporter/plugins/collector_for_text_file.go b/mind-cluster/component/npu-exporter/plugins/collector_for_text_file.go new file mode 100644 index 0000000..db462a4 --- /dev/null +++ b/mind-cluster/component/npu-exporter/plugins/collector_for_text_file.go @@ -0,0 +1,358 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License.
+*/ + +// Package plugins for custom metrics +package plugins + +import ( + "encoding/json" + "fmt" + "os" + "sort" + "strings" + "sync" + "time" + + "github.com/prometheus/client_golang/prometheus" + + "ascend-common/common-utils/hwlog" + "ascend-common/common-utils/utils" + "huawei.com/npu-exporter/v6/collector/common" + "huawei.com/npu-exporter/v6/collector/config" + "huawei.com/npu-exporter/v6/collector/container" + npuutils "huawei.com/npu-exporter/v6/utils" + "huawei.com/npu-exporter/v6/utils/logger" +) + +var ( + metricDesc *prometheus.Desc + labelKeys []string // a list of tag keys extracted from the datalist + jsonFilePath string + isSupported bool + currentVersion versionInfo +) + +const ( + size100k = 100 * 1024 + maxLabelSize = 10 + num1000 = 1000 + maxDataListSize = 128 + maxMetricNameSize = 128 + maxDescSize = 1024 + fileMetricsDisabledMsg = "file metrics collection will be disabled" + skipCurrentCollectionMsg = "will skip current collection and report cached metrics" + excludedPermission = 0111 // file should not have any execute permission +) + +type versionInfo struct { + name string + desc string + version string +} + +// TextMetricData represents the JSON structure +type TextMetricData struct { + Version string `json:"version"` + Desc string `json:"desc"` + Name string `json:"name"` + Timestamp int64 `json:"timestamp"` + DataList []DataItem `json:"data_list"` +} + +// DataItem represents each item in data_list +type DataItem struct { + Label map[string]string `json:"label"` + Value float64 `json:"value"` +} + +// InitTextMetricsDesc init text metric +func InitTextMetricsDesc(filePath string) { + if filePath == "" { + return + } + paths := strings.Split(filePath, ",") + if len(paths) > 1 { + logger.Warnf("multiple file paths detected in filePath: %s, only the first file will be used", filePath) + jsonFilePath = strings.TrimSpace(paths[0]) + } else { + jsonFilePath = filePath + } + if utils.IsDir(jsonFilePath) { + logger.Errorf("file path %s is a 
directory, only support specify file path", filePath) + return + } + fileData, err := waitForFile(jsonFilePath, time.Minute) + if err != nil { + logger.Warnf("read json file %s failed, %s: %v", jsonFilePath, fileMetricsDisabledMsg, err) + return + } + var metricsData TextMetricData + if err := json.Unmarshal(fileData, &metricsData); err != nil { + logger.Warnf("unmarshal json file %s failed, %s: %v, "+ + "Possible causes:\n1. The file is not in JSON format\n2. File size is more than 100KB ", jsonFilePath, fileMetricsDisabledMsg, err) + return + } + + if err := isDataOk(&metricsData); err != nil { + logger.Warnf("%v, %s", err, fileMetricsDisabledMsg) + return + } + + desc := metricsData.Desc + labelKeys = make([]string, 0, len(metricsData.DataList[0].Label)) + for key := range metricsData.DataList[0].Label { + labelKeys = append(labelKeys, key) + } + sort.Strings(labelKeys) + logger.Infof("init text metric succeeded, metricName: %v, version: %v, desc: %v, labels: %v", + metricsData.Name, metricsData.Version, desc, labelKeys) + + metricDesc = prometheus.NewDesc(metricsData.Name, desc, labelKeys, nil) + isSupported = true + currentVersion = versionInfo{ + name: metricsData.Name, + desc: desc, + version: metricsData.Version, + } + err = config.AddPluginCollector("text", &TextMetricsInfoCollector{}) + if err != nil { + logger.Errorf("%v", err) + } +} + +func isDataOk(metricsData *TextMetricData) error { + if len(metricsData.DataList) == 0 { + return fmt.Errorf("dataList is empty in json file %s", jsonFilePath) + } + if len(metricsData.DataList) > maxDataListSize { + return fmt.Errorf("size of dataList(%d) is more than max allowed dataList size(%d) in json file %s", + len(metricsData.DataList), maxDataListSize, jsonFilePath) + } + if len(metricsData.DataList[0].Label) > maxLabelSize { + return fmt.Errorf("size of first item's Label(%d) is more than max allowed label size(%d) in json file %s", + len(metricsData.DataList[0].Label), maxLabelSize, jsonFilePath) + } + if 
metricsData.Name == "" { + return fmt.Errorf("name field is empty in json file %s", jsonFilePath) + } + if len(metricsData.Name) > maxMetricNameSize { + return fmt.Errorf("length of metric name should not larger than %d, but current is %d", + maxMetricNameSize, len(metricsData.Name)) + } + if metricsData.Desc == "" { + return fmt.Errorf("desc field is empty in json file %s", jsonFilePath) + } + if len(metricsData.Desc) > maxDescSize { + return fmt.Errorf("length of metric desc should not larger than %d, but current is %d", + maxDescSize, len(metricsData.Desc)) + } + if metricsData.Version == "" { + return fmt.Errorf("version field is empty in json file %s", jsonFilePath) + } + // only support 1.0 version currently + if metricsData.Version != "1.0" { + return fmt.Errorf("version should be 1.0, but current is %s", metricsData.Version) + } + if metricsData.Timestamp <= 0 { + return fmt.Errorf("timestamp field is empty or not correct in json file %s", jsonFilePath) + } + return nil +} + +// waitForFile wait for file to exist +func waitForFile(filePath string, timeout time.Duration) ([]byte, error) { + const tickerDuration = 100 + deadline := time.Now().Add(timeout) + ticker := time.NewTicker(tickerDuration * time.Millisecond) + defer ticker.Stop() + once := sync.Once{} + + for { + fileData, err := utils.ReadLimitBytes(filePath, size100k) + err2 := checkFile(filePath) + if err2 != nil { + hwlog.RunLog.Errorf("check file err, %s: %v", filePath, err2) + } + if err2 != nil && !os.IsNotExist(err2) { + return nil, err2 + } + + if err == nil && err2 == nil && len(fileData) > 0 { + logger.Infof("successfully read json file %s", filePath) + return fileData, nil + } + if os.IsNotExist(err) || len(fileData) == 0 { + once.Do(func() { + logger.Warnf("file [%v] is not exist or file is empty, will wait 1 minute", filePath) + }) + if time.Now().After(deadline) { + return nil, fmt.Errorf("file %s does not exist or file is empty after waiting %v", filePath, timeout) + } + select { + 
case <-ticker.C: + continue + } + } + return nil, err + } +} + +func checkFile(filePath string) error { + absFilePath, err := utils.CheckPath(filePath) + if err != nil { + return err + } + if err = utils.DoCheckOwnerAndPermission(absFilePath, excludedPermission, 0); err != nil { + logger.Errorf("file permission should not included %04o: %v", excludedPermission, err) + return err + } + return nil +} + +// TextMetricsInfoCollector collect custom plugin info +type TextMetricsInfoCollector struct { + common.MetricsCollectorAdapter + Cache sync.Map +} + +// Describe description of the metric +func (c *TextMetricsInfoCollector) Describe(ch chan<- *prometheus.Desc) { + // add desc + if metricDesc != nil { + ch <- metricDesc + } +} + +// CollectToCache collect the metric to cache +func (c *TextMetricsInfoCollector) CollectToCache(n *common.NpuCollector, chipList []common.HuaWeiAIChip) { + // collect metric to cache + logger.Debugf("TextMetricsInfoCollector CollectToCache") + + fileData, err := utils.ReadLimitBytes(jsonFilePath, size100k) + if err != nil { + logger.LogfWithOptions(logger.WarnLevel, logger.LogOptions{Domain: "textMetrics", ID: "readFileErr"}, + "read json file %s failed: %v", jsonFilePath, err) + return + } + hwlog.ResetErrCnt("textMetrics", "readFileErr") + + var metricsData TextMetricData + if err := json.Unmarshal(fileData, &metricsData); err != nil { + logger.LogfWithOptions(logger.WarnLevel, logger.LogOptions{Domain: "textMetrics", ID: "unmarshalFileErr"}, + "unmarshal json file %s failed: %v", jsonFilePath, err) + return + } + hwlog.ResetErrCnt("textMetrics", "unmarshalFileErr") + + if err := isDataOk(&metricsData); err != nil { + logger.LogfWithOptions(logger.WarnLevel, logger.LogOptions{Domain: "textMetrics", ID: "dataNotOk"}, + "%v, %s", err, skipCurrentCollectionMsg) + return + } + hwlog.ResetErrCnt("textMetrics", "dataNotOk") + + if versionChanged(metricsData) { + logger.LogfWithOptions(logger.ErrorLevel, logger.LogOptions{Domain: "textMetrics", 
ID: "versionChanged"}, + "json file base info changed, old: %v, new: %v", currentVersion, + versionInfo{name: metricsData.Name, desc: metricsData.Desc, version: metricsData.Version}) + return + } + hwlog.ResetErrCnt("textMetrics", "versionChanged") + + c.Cache.Store(common.GetCacheKey(c), metricsData) +} + +func versionChanged(data TextMetricData) bool { + if currentVersion.name != data.Name || currentVersion.desc != data.Desc || + currentVersion.version != data.Version { + return true + } + return false +} + +// UpdatePrometheus update prometheus metric +func (c *TextMetricsInfoCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *common.NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []common.HuaWeiAIChip) { + logger.Debug("TextMetricsInfoCollector UpdatePrometheus") + if metricDesc == nil { + logger.Warnf("metricDesc is not initialized, skip UpdatePrometheus") + return + } + cacheKey := common.GetCacheKey(c) + data, ok := c.Cache.Load(cacheKey) + if !ok { + logger.Debugf("cache key %s not found", cacheKey) + return + } + + textMetricsData, ok := data.(TextMetricData) + if !ok { + logger.Warnf("cache data type mismatch for key %s", cacheKey) + return + } + + timestamp := time.Unix(0, textMetricsData.Timestamp*num1000) + + for _, item := range textMetricsData.DataList { + labelValues := make([]string, len(labelKeys)) + for i, key := range labelKeys { + if value, ok := item.Label[key]; ok { + labelValues[i] = value + } else { + labelValues[i] = "" + } + } + + ch <- prometheus.NewMetricWithTimestamp(timestamp, + prometheus.MustNewConstMetric(metricDesc, prometheus.GaugeValue, item.Value, labelValues...)) + } +} + +// UpdateTelegraf update telegraf metric +func (c *TextMetricsInfoCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *common.NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []common.HuaWeiAIChip) map[string]map[string]interface{} { + logger.Debug("TextMetricsInfoCollector UpdateTelegraf") 
+ + if metricDesc == nil { + logger.Warnf("metricDesc is not initialized, skip UpdateTelegraf") + return fieldsMap + } + + cacheKey := common.GetCacheKey(c) + data, ok := c.Cache.Load(cacheKey) + if !ok { + logger.Debugf("cache key %s not found", cacheKey) + return fieldsMap + } + + textMetricData, ok := data.(TextMetricData) + if !ok { + logger.Warnf("cache data type mismatch for key %s", cacheKey) + return fieldsMap + } + + for _, item := range textMetricData.DataList { + if fieldsMap[common.GeneralDevTagKey] == nil { + fieldsMap[common.GeneralDevTagKey] = make(map[string]interface{}) + } + npuutils.DoUpdateTelegraf(fieldsMap[common.GeneralDevTagKey], metricDesc, item.Value, "") + } + + return fieldsMap +} + +// IsSupported Check whether the current hardware supports this metric +func (c *TextMetricsInfoCollector) IsSupported(n *common.NpuCollector) bool { + return isSupported +} diff --git a/mind-cluster/component/npu-exporter/plugins/register.go b/mind-cluster/component/npu-exporter/plugins/register.go new file mode 100644 index 0000000..e9b5f41 --- /dev/null +++ b/mind-cluster/component/npu-exporter/plugins/register.go @@ -0,0 +1,21 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package plugins for custom metrics +package plugins + +// RegisterPlugin register plugin collector +func RegisterPlugin() { + +} diff --git a/mind-cluster/component/npu-exporter/utils/logger/general_logger.go b/mind-cluster/component/npu-exporter/utils/logger/general_logger.go new file mode 100644 index 0000000..3f1e19c --- /dev/null +++ b/mind-cluster/component/npu-exporter/utils/logger/general_logger.go @@ -0,0 +1,76 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package logger for general collector +package logger + +import ( + "context" + "fmt" + + "ascend-common/common-utils/hwlog" +) + +const ( + maxLogLineLength = 1024 + defaultLogFile = "/var/log/mindx-dl/npu-exporter/npu-exporter.log" +) + +type generalLogger struct { +} + +// dynamicConfigure configures the logger +func (c *generalLogger) dynamicConfigure(Config) { +} + +// log logs with specified level +func (c *generalLogger) log(ctx context.Context, level Level, args ...interface{}) { + fn, ok := logFuncs[level] + if !ok { + hwlog.RunLog.Warnf("unknown log level: %v", level) + return + } + + fn(hwlog.DeepIncrease(ctx), args...) 
+} + +// logf logs with specified level and format +func (c *generalLogger) logf(ctx context.Context, level Level, format string, args ...interface{}) { + fn, ok := logfFuncs[level] + if !ok { + hwlog.RunLog.Warnf("unknown log level: %v", level) + return + } + + fn(hwlog.DeepIncrease(ctx), format, args...) +} + +func (c *generalLogger) logfWithOptions(ctx context.Context, level Level, opts LogOptions, format string, + args ...interface{}) { + + if opts.MaxCounts == 0 { + opts.MaxCounts = hwlog.ProblemOccurMaxNumbers + } + + if needPrint, extraErrLog := hwlog.IsNeedPrintWithSpecifiedCounts(opts.Domain, opts.ID, opts.MaxCounts); needPrint { + format = fmt.Sprintf("%s %s", format, extraErrLog) + fn, ok := logfFuncs[level] + if !ok { + hwlog.RunLog.Warnf("unknown log level: %v", level) + return + } + + fn(hwlog.DeepIncrease(ctx), format, args...) + } +} diff --git a/mind-cluster/component/npu-exporter/utils/logger/logger.go b/mind-cluster/component/npu-exporter/utils/logger/logger.go new file mode 100644 index 0000000..723e070 --- /dev/null +++ b/mind-cluster/component/npu-exporter/utils/logger/logger.go @@ -0,0 +1,174 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package logger for general collector +package logger + +import ( + "context" + "errors" + "fmt" + + "github.com/influxdata/telegraf" + + "ascend-common/common-utils/hwlog" +) + +// the method mapping table (avoid rebuilding with every call) +var ( + logFuncs = map[Level]logFunc{} + logfFuncs = map[Level]logfFunc{} +) + +const ( + // DebugLevel Debug level + DebugLevel Level = iota - 1 + // InfoLevel Info level + InfoLevel + // WarnLevel Warn level + WarnLevel + // ErrorLevel Error level + ErrorLevel + + // PrometheusPlatform Prometheus platform + PrometheusPlatform = "Prometheus" + // TelegrafPlatform Telegraf platform + TelegrafPlatform = "Telegraf" +) + +// HwLogConfig default log file +var HwLogConfig = &hwlog.LogConfig{ + LogFileName: defaultLogFile, + ExpiredTime: hwlog.DefaultExpiredTime, + CacheSize: hwlog.DefaultCacheSize, + MaxLineLength: maxLogLineLength, +} + +// Level log level +type Level int + +// logFunc log function +type logFunc func(ctx context.Context, args ...interface{}) + +// logfFunc logf function +type logfFunc func(ctx context.Context, format string, args ...interface{}) + +var ( + // logger Unified log printer + logger UnifiedLogger +) + +// InitLogger initialize the log manager +func InitLogger(platform string) error { + + if platform == TelegrafPlatform { + logger = &telegrafLogger{} + HwLogConfig.LogFileName = defaultTelegrafLogPath + HwLogConfig.OnlyToFile = true + } else if platform == PrometheusPlatform { + logger = &generalLogger{} + } else { + return errors.New("platform is not supported:" + platform) + } + + if err := hwlog.InitRunLogger(HwLogConfig, context.Background()); err != nil { + fmt.Printf("hwlog init failed, error is %v\n", err) + return err + } + + logFuncs = map[Level]logFunc{ + DebugLevel: hwlog.RunLog.DebugWithCtx, + InfoLevel: hwlog.RunLog.InfoWithCtx, + WarnLevel: hwlog.RunLog.WarnWithCtx, + ErrorLevel: hwlog.RunLog.ErrorWithCtx, + } + + logfFuncs = map[Level]logfFunc{ + DebugLevel: 
hwlog.RunLog.DebugfWithCtx, + InfoLevel: hwlog.RunLog.InfofWithCtx, + WarnLevel: hwlog.RunLog.WarnfWithCtx, + ErrorLevel: hwlog.RunLog.ErrorfWithCtx, + } + return nil +} + +// LogOptions options for log +type LogOptions struct { + Domain string + ID interface{} + MaxCounts int +} + +// Config config for telegraf +type Config struct { + Acc telegraf.Accumulator +} + +// UnifiedLogger unified logger interface +type UnifiedLogger interface { + dynamicConfigure(Config) + log(ctx context.Context, level Level, args ...interface{}) + logf(ctx context.Context, level Level, format string, args ...interface{}) + logfWithOptions(ctx context.Context, level Level, opts LogOptions, format string, args ...interface{}) +} + +// Debug print log info with debug level +func Debug(args ...interface{}) { + logger.log(hwlog.DeepIncrease(context.Background()), DebugLevel, args...) +} + +// Info print log info with info level +func Info(args ...interface{}) { + logger.log(hwlog.DeepIncrease(context.Background()), InfoLevel, args...) +} + +// Warn print log info with warn level +func Warn(args ...interface{}) { + logger.log(hwlog.DeepIncrease(context.Background()), WarnLevel, args...) +} + +// Error print log info with error level +func Error(args ...interface{}) { + logger.log(hwlog.DeepIncrease(context.Background()), ErrorLevel, args...) +} + +// Debugf print log info with debug level +func Debugf(format string, args ...interface{}) { + logger.logf(hwlog.DeepIncrease(context.Background()), DebugLevel, format, args...) +} + +// Infof print log info with info level +func Infof(format string, args ...interface{}) { + logger.logf(hwlog.DeepIncrease(context.Background()), InfoLevel, format, args...) +} + +// Warnf print log info with warn level +func Warnf(format string, args ...interface{}) { + logger.logf(hwlog.DeepIncrease(context.Background()), WarnLevel, format, args...) 
+} + +// Errorf print log info with error level +func Errorf(format string, args ...interface{}) { + logger.logf(hwlog.DeepIncrease(context.Background()), ErrorLevel, format, args...) +} + +// LogfWithOptions print log info with error level +func LogfWithOptions(level Level, opts LogOptions, format string, args ...interface{}) { + logger.logfWithOptions(hwlog.DeepIncrease(context.Background()), level, opts, format, args...) +} + +// DynamicConfigure configure the logger +func DynamicConfigure(config Config) { + logger.dynamicConfigure(config) +} diff --git a/mind-cluster/component/npu-exporter/utils/logger/logger_test.go b/mind-cluster/component/npu-exporter/utils/logger/logger_test.go new file mode 100644 index 0000000..a08ad4b --- /dev/null +++ b/mind-cluster/component/npu-exporter/utils/logger/logger_test.go @@ -0,0 +1,119 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package logger for general collector +package logger + +import ( + "errors" + "testing" + + "ascend-common/common-utils/hwlog" +) + +// TestInitLogger tests the InitLogger function +func TestInitLogger(t *testing.T) { + tests := []struct { + name string + platform string + expected error + }{ + { + name: "Telegraf Platform", + platform: TelegrafPlatform, + expected: nil, + }, + { + name: "Prometheus Platform", + platform: PrometheusPlatform, + expected: nil, + }, + { + name: "Unsupported Platform", + platform: "Unsupported", + expected: errors.New("platform is not supported:Unsupported"), + }, + } + + HwLogConfig.LogLevel = 0 + HwLogConfig.MaxBackups = hwlog.DefaultMaxBackups + HwLogConfig.LogFileName = defaultLogFile + HwLogConfig.MaxAge = hwlog.DefaultMinSaveAge + + var noExistLevel Level = 5 + var args = "mock" + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := InitLogger(tt.platform) + if tt.expected == nil && err != nil { + t.Errorf("InitLogger(%s) = %v, want %v", tt.platform, err, tt.expected) + } else if tt.expected != nil && err.Error() != tt.expected.Error() { + t.Errorf("InitLogger(%s) = %v, want %v", tt.platform, err, tt.expected) + } + + logger.log(nil, DebugLevel, args) + logger.log(nil, InfoLevel, args) + logger.log(nil, WarnLevel, args) + logger.log(nil, noExistLevel, args) + logger.logfWithOptions(nil, DebugLevel, LogOptions{}, "test logf with options %s", "arg") + + logger.logf(nil, DebugLevel, args) + logger.logf(nil, InfoLevel, args) + logger.logf(nil, WarnLevel, args) + logger.logf(nil, noExistLevel, args) + logger.logfWithOptions(nil, DebugLevel, LogOptions{}, "test logf with options %s", "arg") + + }) + } +} + +func TestLoggerMethods(t *testing.T) { + + tests := []struct { + name string + method func(...interface{}) + level Level + args []interface{} + }{ + {"test Debug", Debug, DebugLevel, []interface{}{"debug message"}}, + {"test Info", Info, InfoLevel, []interface{}{"info message"}}, + {"test Warn", Warn, 
WarnLevel, []interface{}{"warn message"}}, + {"test Error", Error, ErrorLevel, []interface{}{"error message"}}, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + test.method(test.args...) + }) + } + + testsF := []struct { + name string + method func(string, ...interface{}) + level Level + format string + args []interface{} + }{ + {"test Debugf", Debugf, DebugLevel, "debug message %d", []interface{}{1}}, + {"test Infof", Infof, InfoLevel, "info message %d", []interface{}{1}}, + {"test Warnf", Warnf, WarnLevel, "warn message %d", []interface{}{1}}, + {"test Errorf", Errorf, ErrorLevel, "error message %d", []interface{}{1}}, + } + + for _, test := range testsF { + t.Run(test.name, func(t *testing.T) { + test.method(test.format, test.args...) + }) + } +} diff --git a/mind-cluster/component/npu-exporter/utils/logger/telegraf_logger.go b/mind-cluster/component/npu-exporter/utils/logger/telegraf_logger.go new file mode 100644 index 0000000..56c2ac5 --- /dev/null +++ b/mind-cluster/component/npu-exporter/utils/logger/telegraf_logger.go @@ -0,0 +1,82 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package logger for general collector +package logger + +import ( + "context" + "errors" + "fmt" + "strings" + + "github.com/influxdata/telegraf" + + "ascend-common/common-utils/hwlog" +) + +var defaultTelegrafLogPath = "/var/log/mindx-dl/npu-exporter/npu-plugin.log" +var dangerousChars = map[string]string{ + "\n": "\\n", + "\r": "\\r", + "\t": "\\t", +} + +type telegrafLogger struct { + acc telegraf.Accumulator +} + +// dynamicConfigure configures the logger +func (c *telegrafLogger) dynamicConfigure(config Config) { + c.acc = config.Acc +} + +// log logs with specified level +func (c *telegrafLogger) log(ctx context.Context, level Level, args ...interface{}) { + c.logf(hwlog.DeepIncrease(ctx), level, "%s", args...) +} + +// logf logs with specified level and format +func (c *telegrafLogger) logf(ctx context.Context, level Level, format string, args ...interface{}) { + sanitized := format + for char, replacement := range dangerousChars { + sanitized = strings.ReplaceAll(sanitized, char, replacement) + } + if level < InfoLevel || c.acc == nil { + fn, ok := logfFuncs[level] + if !ok { + hwlog.RunLog.Warnf("unknown log level: %v", level) + return + } + + fn(hwlog.DeepIncrease(ctx), sanitized, args...) + return + } + + c.acc.AddError(errors.New(fmt.Sprintf(sanitized, args...))) +} + +// LogfWithOptions print log info with options +func (c *telegrafLogger) logfWithOptions(ctx context.Context, level Level, opts LogOptions, format string, + args ...interface{}) { + + if opts.MaxCounts == 0 { + opts.MaxCounts = hwlog.ProblemOccurMaxNumbers + } + + if needPrint, extraErrLog := hwlog.IsNeedPrintWithSpecifiedCounts(opts.Domain, opts.ID, opts.MaxCounts); needPrint { + format = fmt.Sprintf("%s %s", format, extraErrLog) + c.logf(hwlog.DeepIncrease(ctx), level, format, args...) 
+ } +} diff --git a/mind-cluster/component/npu-exporter/utils/utils.go b/mind-cluster/component/npu-exporter/utils/utils.go new file mode 100644 index 0000000..b5da97c --- /dev/null +++ b/mind-cluster/component/npu-exporter/utils/utils.go @@ -0,0 +1,52 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package utils for common utils +package utils + +import ( + "strings" + + "github.com/prometheus/client_golang/prometheus" +) + +// GetDescName parse metrics name from prometheus.Desc object +func GetDescName(desc *prometheus.Desc) string { + if desc == nil { + return "" + } + str := desc.String() + startIndex := strings.Index(str, "fqName: ") + if startIndex == -1 { + return "" + } + readfqName := str[startIndex+len("fqName: "):] + + endIndex := strings.Index(readfqName, ",") + if endIndex == -1 { + return "" + } + readfqName = readfqName[:endIndex] + + readfqName = strings.Trim(readfqName, "\"") + return readfqName +} + +// DoUpdateTelegraf update telegraf +func DoUpdateTelegraf(fieldMap map[string]interface{}, desc *prometheus.Desc, value interface{}, extInfo string) { + if fieldMap == nil { + return + } + fieldMap[GetDescName(desc)+extInfo] = value +} diff --git a/mind-cluster/component/npu-exporter/utils/utils_test.go b/mind-cluster/component/npu-exporter/utils/utils_test.go new file mode 100644 index 0000000..1a91d29 --- /dev/null +++ b/mind-cluster/component/npu-exporter/utils/utils_test.go @@ 
-0,0 +1,103 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package utils for common utils +package utils + +import ( + "testing" + + "github.com/agiledragon/gomonkey/v2" + "github.com/prometheus/client_golang/prometheus" + "github.com/smartystreets/goconvey/convey" +) + +const ( + emptyString = "" + testMetricName = "test_metric" + testMetricName2 = "another_metric" + invalidDescStr = "invalid description" + noCommaDescStr = "fqName: test_metric" + normalDescStr = `fqName: "test_metric", help: "test help"` + normalDescStr2 = `fqName: another_metric, help: "another help"` + noQuoteDescStr = `fqName: test_metric, help: "test help"` + testHelp = "test help" +) + +func TestGetDescName(t *testing.T) { + convey.Convey("should return empty string when desc is nil", t, testGetDescNameNil) + convey.Convey("should return empty string when desc.String does not contain fqName prefix", t, + testGetDescNameNoFqName) + convey.Convey("should return empty string when desc.String does not contain comma", t, + testGetDescNameNoComma) + convey.Convey("should return metric name when desc.String contains valid format", t, + testGetDescNameValidFormat) +} + +func testGetDescNameNil() { + result := GetDescName(nil) + convey.So(result, convey.ShouldEqual, emptyString) +} + +func testGetDescNameNoFqName() { + desc := prometheus.NewDesc(testMetricName, testHelp, nil, nil) + patch := gomonkey.ApplyMethodReturn(desc, 
"String", invalidDescStr) + defer patch.Reset() + + result := GetDescName(desc) + convey.So(result, convey.ShouldEqual, emptyString) +} + +func testGetDescNameNoComma() { + desc := prometheus.NewDesc(testMetricName, testHelp, nil, nil) + patch := gomonkey.ApplyMethodReturn(desc, "String", noCommaDescStr) + defer patch.Reset() + + result := GetDescName(desc) + convey.So(result, convey.ShouldEqual, emptyString) +} + +func testGetDescNameValidFormat() { + testCases := []struct { + name string + descStr string + expected string + }{ + { + name: "should return metric name when desc.String contains normal format with quotes", + descStr: normalDescStr, + expected: testMetricName, + }, + { + name: "should return metric name when desc.String contains normal format without quotes", + descStr: noQuoteDescStr, + expected: testMetricName, + }, + { + name: "should return correct metric name when desc.String contains another metric", + descStr: normalDescStr2, + expected: testMetricName2, + }, + } + + for _, tc := range testCases { + desc := prometheus.NewDesc(testMetricName, testHelp, nil, nil) + patch := gomonkey.ApplyMethodReturn(desc, "String", tc.descStr) + + result := GetDescName(desc) + convey.So(result, convey.ShouldEqual, tc.expected) + + patch.Reset() + } +} diff --git a/mind-cluster/component/npu-exporter/versions/version.go b/mind-cluster/component/npu-exporter/versions/version.go new file mode 100644 index 0000000..63dba00 --- /dev/null +++ b/mind-cluster/component/npu-exporter/versions/version.go @@ -0,0 +1,23 @@ +/* Copyright(C) 2021. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package versions record the program version +package versions + +var ( + // BuildVersion record the program build version + BuildVersion string + // BuildName record the program build name + BuildName string +) From ee5a36fcf808181782fdccdbecf70b01b68b3038 Mon Sep 17 00:00:00 2001 From: daniel1210 <8622091+daniel1210@user.noreply.gitee.com> Date: Tue, 20 Jan 2026 18:51:34 +0800 Subject: [PATCH 02/10] fix go.mod Signed-off-by: daniel1210 <8622091+daniel1210@user.noreply.gitee.com> Signed-off-by: ashergaga <1214443299@qq.com> --- go.mod | 1 - 1 file changed, 1 deletion(-) diff --git a/go.mod b/go.mod index 6ea6f74..d04534c 100644 --- a/go.mod +++ b/go.mod @@ -7,7 +7,6 @@ require ( github.com/Project-HAMi/HAMi v0.0.0 github.com/fsnotify/fsnotify v1.7.0 google.golang.org/grpc v1.63.2 - huawei.com/npu-exporter/v6 v6.0.0-RC3.b001 k8s.io/api v0.29.3 k8s.io/apimachinery v0.29.3 k8s.io/klog/v2 v2.120.1 From bc9ed1428b54667e72fe5452eb1f5a001085c7a1 Mon Sep 17 00:00:00 2001 From: daniel1210 <8622091+daniel1210@user.noreply.gitee.com> Date: Wed, 21 Jan 2026 15:06:10 +0800 Subject: [PATCH 03/10] remove mind-cluster Signed-off-by: daniel1210 <8622091+daniel1210@user.noreply.gitee.com> Signed-off-by: ashergaga <1214443299@qq.com> --- .../component/ascend-common/README.md | 8 - .../ascend-common/api/ascend-operator/LICENSE | 201 -- .../api/ascend-operator/README.md | 164 -- .../apis/batch/v1/ascendjob_types.go | 85 - .../apis/batch/v1/constants.go | 53 - .../ascend-operator/apis/batch/v1/defaults.go | 137 - .../ascend-operator/apis/batch/v1/register.go | 52 - 
.../apis/batch/v1/zz_generated.deepcopy.go | 137 - .../apis/batch/v1/zz_generated.defaults.go | 53 - .../client/clientset/versioned/clientset.go | 114 - .../clientset/versioned/scheme/register.go | 39 - .../versioned/typed/batch/v1/client.go | 110 - .../clientset/versioned/typed/batch/v1/job.go | 221 -- .../externalversions/batch/interface.go | 49 - .../externalversions/batch/v1/interface.go | 48 - .../externalversions/batch/v1/job.go | 99 - .../informers/externalversions/factory.go | 207 -- .../informers/externalversions/generic.go | 71 - .../internalinterfaces/factory_interfaces.go | 40 - .../listers/batch/v1/expansion_generated.go | 26 - .../client/listers/batch/v1/job.go | 108 - .../component/ascend-common/api/consts.go | 222 -- .../ascend-common/api/default_name.go | 188 -- .../ascend-common/api/publicfault.go | 32 - .../ascend-common/api/slownet/fault_net.go | 77 - .../ascend-common/api/superpoddevice.go | 36 - .../component/ascend-common/api/type.go | 30 - .../common-utils/cache/lrucache.go | 394 --- .../common-utils/cache/lrucache_test.go | 304 --- .../ascend-common/common-utils/hwlog/api.go | 310 --- .../common-utils/hwlog/api_test.go | 165 -- .../common-utils/hwlog/hwlog_adaptor.go | 174 -- .../common-utils/hwlog/hwlog_adaptor_test.go | 126 - .../common-utils/hwlog/log_limiter.go | 156 -- .../common-utils/hwlog/logger.go | 242 -- .../common-utils/hwlog/logger_test.go | 217 -- .../ascend-common/common-utils/hwlog/rolog.go | 447 ---- .../common-utils/hwlog/rolog_test.go | 687 ----- .../ascend-common/common-utils/hwlog/types.go | 49 - .../ascend-common/common-utils/hwlog/utils.go | 98 - .../common-utils/hwlog/utils_test.go | 38 - .../common-utils/limiter/limit_handler.go | 226 -- .../limiter/limit_handler_test.go | 119 - .../common-utils/limiter/limit_listener.go | 161 -- .../limiter/limit_listener_test.go | 125 - .../common-utils/limiter/limit_writer.go | 64 - .../common-utils/limiter/limit_writer_test.go | 37 - .../common-utils/rand/rand_linux.go | 71 - 
.../common-utils/rand/rand_linux_test.go | 54 - .../ascend-common/common-utils/rand/random.go | 28 - .../common-utils/rand/random_test.go | 32 - .../ascend-common/common-utils/utils/env.go | 35 - .../common-utils/utils/env_test.go | 51 - .../ascend-common/common-utils/utils/file.go | 176 -- .../common-utils/utils/file_check.go | 240 -- .../common-utils/utils/file_check_test.go | 194 -- .../common-utils/utils/file_test.go | 169 -- .../common-utils/utils/file_watcher.go | 85 - .../common-utils/utils/file_watcher_test.go | 81 - .../common-utils/utils/interface.go | 29 - .../common-utils/utils/interface_test.go | 36 - .../common-utils/utils/ip_utils.go | 98 - .../common-utils/utils/ip_utils_test.go | 182 -- .../ascend-common/common-utils/utils/path.go | 382 --- .../common-utils/utils/path_test.go | 232 -- .../common-utils/utils/pwd_util.go | 75 - .../common-utils/utils/pwd_util_test.go | 59 - .../ascend-common/common-utils/utils/slice.go | 129 - .../common-utils/utils/slice_test.go | 536 ---- .../common-utils/utils/strings.go | 75 - .../common-utils/utils/strings_test.go | 84 - .../ascend-common/devmanager/a310mgr.go | 25 - .../ascend-common/devmanager/a310pmgr.go | 35 - .../ascend-common/devmanager/a910mgr.go | 31 - .../devmanager/common/constants.go | 272 -- .../ascend-common/devmanager/common/types.go | 435 ---- .../ascend-common/devmanager/common/utils.go | 305 --- .../devmanager/common/utils_test.go | 163 -- .../devmanager/dcmi/constants.go | 78 - .../ascend-common/devmanager/dcmi/dcmi.go | 2213 ----------------- .../devmanager/dcmi/dcmi_interface_api.h | 596 ----- .../ascend-common/devmanager/devmanager.go | 1197 --------- .../devmanager/devmanager_910a3_mock.go | 30 - .../devmanager/devmanager_910a3_mock_err.go | 43 - .../devmanager/devmanager_hccs_test.go | 166 -- .../devmanager/devmanager_mock.go | 370 --- .../devmanager/devmanager_mock_err.go | 369 --- .../devmanager/devmanager_test.go | 78 - .../devmanager/hccn/hccn_tool.go | 335 --- 
.../devmanager/hccn/hccn_tool_test.go | 49 - mind-cluster/component/ascend-common/go.mod | 55 - mind-cluster/component/ascend-common/go.sum | 492 ---- .../component/npu-exporter/.gitignore | 1 - mind-cluster/component/npu-exporter/LICENSE | 201 -- mind-cluster/component/npu-exporter/README.md | 42 - .../component/npu-exporter/build/Dockerfile | 21 - .../npu-exporter/build/Dockerfile-310P-1usoc | 31 - .../component/npu-exporter/build/build.sh | 80 - .../component/npu-exporter/build/build_ch.sh | 74 - .../build/metricConfiguration.json | 13 - .../build/npu-exporter-310P-1usoc.yaml | 167 -- .../npu-exporter/build/npu-exporter.yaml | 140 -- .../build/pluginConfiguration.json | 4 - .../npu-exporter/build/run_for_310P_1usoc.sh | 32 - .../component/npu-exporter/build/test.sh | 75 - .../npu-exporter/cmd/npu-exporter/main.go | 545 ---- .../common/collector_for_container.go | 109 - .../common/collector_for_container_test.go | 137 - .../collector/common/constants.go | 140 -- .../collector/common/metrics_collector.go | 192 -- .../common/metrics_collector_test.go | 231 -- .../collector/common/npu_collector.go | 423 ---- .../collector/common/npu_collector_test.go | 547 ---- .../npu-exporter/collector/common/types.go | 50 - .../collector/config/metrics_config.go | 208 -- .../collector/config/metrics_config_test.go | 216 -- .../collector/container/isula/isula_api.pb.go | 870 ------- .../collector/container/isula/isula_api.proto | 118 - .../container/isula/isula_api_grpc.pb.go | 107 - .../container/isula/isula_container.go | 39 - .../collector/container/isula/isulad.pb.go | 278 --- .../collector/container/isula/isulad.proto | 35 - .../container/isula/isulad_grpc.pb.go | 105 - .../collector/container/parser.go | 630 ----- .../collector/container/parser_test.go | 1027 -------- .../collector/container/runtime_ops.go | 413 --- .../collector/container/runtime_ops_test.go | 568 ----- .../npu-exporter/collector/container/utils.go | 133 - .../collector/container/utils_test.go | 329 --- 
.../collector/container/v1/containerd.pb.go | 310 --- .../collector/container/v1/containerd.proto | 62 - .../collector/container/v1/spec.go | 59 - .../collector/metrics/collector_for_ddr.go | 142 -- .../collector/metrics/collector_for_hbm.go | 228 -- .../metrics/collector_for_hbm_test.go | 115 - .../collector/metrics/collector_for_hccs.go | 312 --- .../metrics/collector_for_hccs_test.go | 150 -- .../metrics/collector_for_network.go | 190 -- .../collector/metrics/collector_for_npu.go | 453 ---- .../metrics/collector_for_optical.go | 200 -- .../collector/metrics/collector_for_pcie.go | 234 -- .../collector/metrics/collector_for_roce.go | 263 -- .../collector/metrics/collector_for_sio.go | 120 - .../metrics/collector_for_version.go | 56 - .../collector/metrics/collector_for_vnpu.go | 169 -- .../metrics/collector_for_vnpu_test.go | 202 -- .../collector/metrics/collector_test.go | 548 ---- .../collector/metrics/common_utils.go | 193 -- .../collector/metrics/common_utils_test.go | 165 -- .../collector/testdata/prometheus_metrics | 166 -- .../collector/testdata/prometheus_metrics2 | 6 - mind-cluster/component/npu-exporter/go.mod | 63 - mind-cluster/component/npu-exporter/go.sum | 561 ----- .../npu-exporter/platforms/inputs/all/npu.go | 20 - .../platforms/inputs/npu/README.md | 107 - .../npu-exporter/platforms/inputs/npu/npu.go | 104 - .../platforms/inputs/npu/npu_test.go | 174 -- .../platforms/inputs/npu/sample.conf | 9 - .../platforms/prom/prometheus_collector.go | 103 - .../prom/prometheus_collector_test.go | 159 -- .../component/npu-exporter/plugins/README.md | 388 --- .../plugins/collector_for_text_file.go | 358 --- .../npu-exporter/plugins/register.go | 21 - .../utils/logger/general_logger.go | 76 - .../npu-exporter/utils/logger/logger.go | 174 -- .../npu-exporter/utils/logger/logger_test.go | 119 - .../utils/logger/telegraf_logger.go | 82 - .../component/npu-exporter/utils/utils.go | 52 - .../npu-exporter/utils/utils_test.go | 103 - 
.../npu-exporter/versions/version.go | 23 - 170 files changed, 32586 deletions(-) delete mode 100644 mind-cluster/component/ascend-common/README.md delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/LICENSE delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/README.md delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/ascendjob_types.go delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/constants.go delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/defaults.go delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/register.go delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/zz_generated.deepcopy.go delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/zz_generated.defaults.go delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/clientset.go delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/scheme/register.go delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/typed/batch/v1/client.go delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/typed/batch/v1/job.go delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/interface.go delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/v1/interface.go delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/v1/job.go delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/factory.go delete mode 100644 
mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/generic.go delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/internalinterfaces/factory_interfaces.go delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/listers/batch/v1/expansion_generated.go delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/listers/batch/v1/job.go delete mode 100644 mind-cluster/component/ascend-common/api/consts.go delete mode 100644 mind-cluster/component/ascend-common/api/default_name.go delete mode 100644 mind-cluster/component/ascend-common/api/publicfault.go delete mode 100644 mind-cluster/component/ascend-common/api/slownet/fault_net.go delete mode 100644 mind-cluster/component/ascend-common/api/superpoddevice.go delete mode 100644 mind-cluster/component/ascend-common/api/type.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/cache/lrucache.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/cache/lrucache_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/api.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/api_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/hwlog_adaptor.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/hwlog_adaptor_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/log_limiter.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/logger.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/logger_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/rolog.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/rolog_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/types.go delete 
mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/utils.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/utils_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/limiter/limit_handler.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/limiter/limit_handler_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/limiter/limit_listener.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/limiter/limit_listener_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/limiter/limit_writer.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/limiter/limit_writer_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/rand/rand_linux.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/rand/rand_linux_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/rand/random.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/rand/random_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/env.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/env_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/file.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/file_check.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/file_check_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/file_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/file_watcher.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/file_watcher_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/interface.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/interface_test.go delete 
mode 100644 mind-cluster/component/ascend-common/common-utils/utils/ip_utils.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/ip_utils_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/path.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/path_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/pwd_util.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/pwd_util_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/slice.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/slice_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/strings.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/strings_test.go delete mode 100644 mind-cluster/component/ascend-common/devmanager/a310mgr.go delete mode 100644 mind-cluster/component/ascend-common/devmanager/a310pmgr.go delete mode 100644 mind-cluster/component/ascend-common/devmanager/a910mgr.go delete mode 100644 mind-cluster/component/ascend-common/devmanager/common/constants.go delete mode 100644 mind-cluster/component/ascend-common/devmanager/common/types.go delete mode 100644 mind-cluster/component/ascend-common/devmanager/common/utils.go delete mode 100644 mind-cluster/component/ascend-common/devmanager/common/utils_test.go delete mode 100644 mind-cluster/component/ascend-common/devmanager/dcmi/constants.go delete mode 100644 mind-cluster/component/ascend-common/devmanager/dcmi/dcmi.go delete mode 100644 mind-cluster/component/ascend-common/devmanager/dcmi/dcmi_interface_api.h delete mode 100644 mind-cluster/component/ascend-common/devmanager/devmanager.go delete mode 100644 mind-cluster/component/ascend-common/devmanager/devmanager_910a3_mock.go delete mode 100644 mind-cluster/component/ascend-common/devmanager/devmanager_910a3_mock_err.go delete mode 100644 
mind-cluster/component/ascend-common/devmanager/devmanager_hccs_test.go delete mode 100644 mind-cluster/component/ascend-common/devmanager/devmanager_mock.go delete mode 100644 mind-cluster/component/ascend-common/devmanager/devmanager_mock_err.go delete mode 100644 mind-cluster/component/ascend-common/devmanager/devmanager_test.go delete mode 100644 mind-cluster/component/ascend-common/devmanager/hccn/hccn_tool.go delete mode 100644 mind-cluster/component/ascend-common/devmanager/hccn/hccn_tool_test.go delete mode 100644 mind-cluster/component/ascend-common/go.mod delete mode 100644 mind-cluster/component/ascend-common/go.sum delete mode 100644 mind-cluster/component/npu-exporter/.gitignore delete mode 100644 mind-cluster/component/npu-exporter/LICENSE delete mode 100644 mind-cluster/component/npu-exporter/README.md delete mode 100644 mind-cluster/component/npu-exporter/build/Dockerfile delete mode 100644 mind-cluster/component/npu-exporter/build/Dockerfile-310P-1usoc delete mode 100644 mind-cluster/component/npu-exporter/build/build.sh delete mode 100644 mind-cluster/component/npu-exporter/build/build_ch.sh delete mode 100644 mind-cluster/component/npu-exporter/build/metricConfiguration.json delete mode 100644 mind-cluster/component/npu-exporter/build/npu-exporter-310P-1usoc.yaml delete mode 100644 mind-cluster/component/npu-exporter/build/npu-exporter.yaml delete mode 100644 mind-cluster/component/npu-exporter/build/pluginConfiguration.json delete mode 100644 mind-cluster/component/npu-exporter/build/run_for_310P_1usoc.sh delete mode 100644 mind-cluster/component/npu-exporter/build/test.sh delete mode 100644 mind-cluster/component/npu-exporter/cmd/npu-exporter/main.go delete mode 100644 mind-cluster/component/npu-exporter/collector/common/collector_for_container.go delete mode 100644 mind-cluster/component/npu-exporter/collector/common/collector_for_container_test.go delete mode 100644 mind-cluster/component/npu-exporter/collector/common/constants.go delete mode 
100644 mind-cluster/component/npu-exporter/collector/common/metrics_collector.go delete mode 100644 mind-cluster/component/npu-exporter/collector/common/metrics_collector_test.go delete mode 100644 mind-cluster/component/npu-exporter/collector/common/npu_collector.go delete mode 100644 mind-cluster/component/npu-exporter/collector/common/npu_collector_test.go delete mode 100644 mind-cluster/component/npu-exporter/collector/common/types.go delete mode 100644 mind-cluster/component/npu-exporter/collector/config/metrics_config.go delete mode 100644 mind-cluster/component/npu-exporter/collector/config/metrics_config_test.go delete mode 100644 mind-cluster/component/npu-exporter/collector/container/isula/isula_api.pb.go delete mode 100644 mind-cluster/component/npu-exporter/collector/container/isula/isula_api.proto delete mode 100644 mind-cluster/component/npu-exporter/collector/container/isula/isula_api_grpc.pb.go delete mode 100644 mind-cluster/component/npu-exporter/collector/container/isula/isula_container.go delete mode 100644 mind-cluster/component/npu-exporter/collector/container/isula/isulad.pb.go delete mode 100644 mind-cluster/component/npu-exporter/collector/container/isula/isulad.proto delete mode 100644 mind-cluster/component/npu-exporter/collector/container/isula/isulad_grpc.pb.go delete mode 100644 mind-cluster/component/npu-exporter/collector/container/parser.go delete mode 100644 mind-cluster/component/npu-exporter/collector/container/parser_test.go delete mode 100644 mind-cluster/component/npu-exporter/collector/container/runtime_ops.go delete mode 100644 mind-cluster/component/npu-exporter/collector/container/runtime_ops_test.go delete mode 100644 mind-cluster/component/npu-exporter/collector/container/utils.go delete mode 100644 mind-cluster/component/npu-exporter/collector/container/utils_test.go delete mode 100644 mind-cluster/component/npu-exporter/collector/container/v1/containerd.pb.go delete mode 100644 
mind-cluster/component/npu-exporter/collector/container/v1/containerd.proto delete mode 100644 mind-cluster/component/npu-exporter/collector/container/v1/spec.go delete mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_ddr.go delete mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_hbm.go delete mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_hbm_test.go delete mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_hccs.go delete mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_hccs_test.go delete mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_network.go delete mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_npu.go delete mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_optical.go delete mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_pcie.go delete mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_roce.go delete mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_sio.go delete mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_version.go delete mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_vnpu.go delete mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_vnpu_test.go delete mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_test.go delete mode 100644 mind-cluster/component/npu-exporter/collector/metrics/common_utils.go delete mode 100644 mind-cluster/component/npu-exporter/collector/metrics/common_utils_test.go delete mode 100644 mind-cluster/component/npu-exporter/collector/testdata/prometheus_metrics delete mode 100644 mind-cluster/component/npu-exporter/collector/testdata/prometheus_metrics2 delete mode 100644 
mind-cluster/component/npu-exporter/go.mod delete mode 100644 mind-cluster/component/npu-exporter/go.sum delete mode 100644 mind-cluster/component/npu-exporter/platforms/inputs/all/npu.go delete mode 100644 mind-cluster/component/npu-exporter/platforms/inputs/npu/README.md delete mode 100644 mind-cluster/component/npu-exporter/platforms/inputs/npu/npu.go delete mode 100644 mind-cluster/component/npu-exporter/platforms/inputs/npu/npu_test.go delete mode 100644 mind-cluster/component/npu-exporter/platforms/inputs/npu/sample.conf delete mode 100644 mind-cluster/component/npu-exporter/platforms/prom/prometheus_collector.go delete mode 100644 mind-cluster/component/npu-exporter/platforms/prom/prometheus_collector_test.go delete mode 100644 mind-cluster/component/npu-exporter/plugins/README.md delete mode 100644 mind-cluster/component/npu-exporter/plugins/collector_for_text_file.go delete mode 100644 mind-cluster/component/npu-exporter/plugins/register.go delete mode 100644 mind-cluster/component/npu-exporter/utils/logger/general_logger.go delete mode 100644 mind-cluster/component/npu-exporter/utils/logger/logger.go delete mode 100644 mind-cluster/component/npu-exporter/utils/logger/logger_test.go delete mode 100644 mind-cluster/component/npu-exporter/utils/logger/telegraf_logger.go delete mode 100644 mind-cluster/component/npu-exporter/utils/utils.go delete mode 100644 mind-cluster/component/npu-exporter/utils/utils_test.go delete mode 100644 mind-cluster/component/npu-exporter/versions/version.go diff --git a/mind-cluster/component/ascend-common/README.md b/mind-cluster/component/ascend-common/README.md deleted file mode 100644 index fa7f1b8..0000000 --- a/mind-cluster/component/ascend-common/README.md +++ /dev/null @@ -1,8 +0,0 @@ -# AscendCommon - -# 组件介绍 -提供公共代码给其他组件使用,组件包括NPU-Exporter等。 - -# 说明 - -1. 
编译NPU-Exporter等组件时,AscendCommon要放在同一目录下 \ No newline at end of file diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/LICENSE b/mind-cluster/component/ascend-common/api/ascend-operator/LICENSE deleted file mode 100644 index 261eeb9..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. 
- - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. 
- - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. 
- - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/README.md b/mind-cluster/component/ascend-common/api/ascend-operator/README.md deleted file mode 100644 index 20c2f61..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/README.md +++ /dev/null @@ -1,164 +0,0 @@ -# ascend-aperator-apis - -## 介绍 - -ascend-aperator-apis旨在为用户提供AscendJob API,及其Clientsets, Listers、Informers。使用户能轻松对AscendJob进行CRUD操作。 - -## 接口说明 - -1. 创建clientsets - - ```go - NewForConfig(c *rest.Config)(*Clientset, error) - ``` - - | Parameters | Input/Output | Parameter Type | Description | - | ---------- | ------------ | -------------- | ------------------------------------------------------------ | - | c | Input | *rest.Config | 客户端配置文件,由k8s提供的接口生成。包括cluster host、证书等信息 | - | - | Output | *clientsets | Client集合,包括AscendJob client和discovery client | - | - | Output | error | 错误信息 | - -2. 创建AscendJob - - ```go - Create(ctx context.Context, job *v1.AscendJob, opts metav1.CreateOptions)(*v1.AscendJob, error) - ``` - - | Parameters | Input/Output | Parameter Type | Description | - | ---------- | ------------ | -------------------- | ----------------- | - | ctx | Input | context.Context | 上下文,协程控制 | - | job | Input | *v1.AscendJob | AscendJob对象指针 | - | opts | Input | metav1.CreateOptions | 创建选项 | - | - | Output | *v1.AscendJob | AscendJob对象指针 | - | - | Output | error | 错误信息 | - -3. 
获取AscendJob - - ```go - Get(ctx context.Context, name string, opts metav1.GetOptions)(*v1.AscendJob, error) - ``` - - | Parameters | Input/Output | Parameter Type | Description | - | ---------- | ------------ | -------------------- | ----------------- | - | ctx | Input | context.Context | 上下文,协程控制 | - | name | Input | string | AscendJob名称 | - | opts | Input | metav1.GetOptions | 获取选项 | - | - | Output | *v1.AscendJob | AscendJob对象指针 | - | - | Output | error | 错误信息 | - -4. 列举AscendJob - - ```go - List(ctx context.Context, opts metav1.ListOptions)(*v1.AscendJobList, error) - ``` - - | Parameters | Input/Output | Parameter Type | Description | - | ---------- | ------------ | -------------------- | --------------------- | - | ctx | Input | context.Context | 上下文,协程控制 | - | opts | Input | metav1.ListOptions | 列举选项 | - | - | Output | *v1.AscendJob | AscendJobList对象指针 | - | - | Output | error | 错误信息 | - -5. 观察AscendJob - - ```go - Watch((ctx context.Context, opts metav1.ListOptions)(watch.Interface, error) - ``` - - | Parameters | Input/Output | Parameter Type | Description | - | ---------- | ------------ | ------------------ | ---------------- | - | ctx | Input | context.Context | 上下文,协程控制 | - | opts | Input | metav1.ListOptions | 列举选项 | - | - | Output | watch.Interface | watch类接口 | - | - | Output | error | 错误信息 | - -6. 更新AscendJob - - ```go - Update(ctx context.Context, job *v1.AscendJob, opts metav1.UpdateOptions)(*v1.AscendJob, error) - ``` - - | Parameters | Input/Output | Parameter Type | Description | - | ---------- | ------------ | -------------------- | ----------------- | - | ctx | Input | context.Context | 上下文,协程控制 | - | job | Input | *v1.AscendJob | AscendJob对象指针 | - | opts | Input | metav1.UpdateOptions | 更新选项 | - | - | Output | *v1.AscendJob | AscendJob对象指针 | - | - | Output | error | 错误信息 | - -7. 
更新AscendJob状态 - - ```go - UpdateStatus(ctx context.Context, job *v1.AscendJob, opts metav1.UpdateOptions)(*v1.AscendJob, error) - ``` - - | Parameters | Input/Output | Parameter Type | Description | - | ---------- | ------------ | -------------------- | ----------------- | - | ctx | Input | context.Context | 上下文,协程控制 | - | job | Input | *v1.AscendJob | AscendJob对象指针 | - | opts | Input | metav1.UpdateOptions | 更新选项 | - | - | Output | *v1.AscendJob | AscendJob对象指针 | - | - | Output | error | 错误信息 | - -8. 补丁AscendJob - - ```go - Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions, subresources ...string) (*v1.AscendJob, error) - ``` - - | Parameters | Input/Output | Parameter Type | Description | - | ------------ | ------------ | --------------- | ----------------- | - | ctx | Input | context.Context | 上下文,协程控制 | - | name | Input | string | AscendJob名称 | - | pt | Input | types.PatchType | patch类型 | - | data | Input | []byte | patch信息 | - | subresources | Input | ...string | 子信息 | - | - | Output | *v1.AscendJob | AscendJob对象指针 | - | - | Output | error | 错误信息 | - -9. 删除AscendJob - - ```go - Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error - ``` - - | Parameters | Input/Output | Parameter Type | Description | - | ---------- | ------------ | -------------------- | ---------------- | - | ctx | Input | context.Context | 上下文,协程控制 | - | name | Input | string | AscendJob名称 | - | opts | Input | metav1.DeleteOptions | 删除选项 | - | - | Output | error | 错误信息 | - -10. 批量删除AscendJob - - ```go - DeleteCollection(ctx context.Context,opts metav1.DeleteOptions, listOpts metav1.ListOptions) error - ``` - - | Parameters | Input/Output | Parameter Type | Description | - | ---------- | ------------ | -------------------- | ---------------- | - | ctx | Input | context.Context | 上下文,协程控制 | - | opts | Input | metav1.DeleteOptions | 删除选项 | - | listOpts | Input | metav1.ListOptions | 列举选项 | - | - | Output | error | 错误信息 | - -11. 
创建informerFactory - - ```go - NewSharedInformerFactory(client versioned.Interface, defaultResync time.Duration) sharedInformerFactory - ``` - - | Parameters | Input/Output | Parameter Type | Description | - | ------------- | ------------ | --------------------- | ------------------ | - | client | Input | versioned.Interface | client类接口 | - | defaultResync | Input | time.Duration | 默认的重新同步时间 | - | - | Output | sharedInformerFactory | informer类接口 | - -12. 创建informer - - ```go - sharedInformerFactory.Batch().V1().Jobs().Informer() - ``` - - - diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/ascendjob_types.go b/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/ascendjob_types.go deleted file mode 100644 index 7bd1d65..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/ascendjob_types.go +++ /dev/null @@ -1,85 +0,0 @@ -/* -Copyright 2023 Huawei Technologies Co., Ltd. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// Package v1 is used to define AscendJob object and its initialization. -package v1 - -import ( - commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -// AscendJob is the Schema for the AscendJob API -type AscendJob struct { - // Standard Kubernetes type metadata. - metav1.TypeMeta `json:",inline"` - - // +optional - metav1.ObjectMeta `json:"metadata,omitempty"` - - // Specification of the desired state of the AscendJob. 
- // +optional - Spec AscendJobSpec `json:"spec,omitempty"` - - // Most recently observed status of the AscendJob. - // Populated by the system. - // Read-only. - // +optional - Status commonv1.JobStatus `json:"status,omitempty"` -} - -// AscendJobSpec defines the desired state of AscendJob -type AscendJobSpec struct { - // RunPolicy encapsulates various runtime policies of the distributed training - // job, for example how to clean up resources and how long the job can stay - // active. - // +kubebuilder:validation:Optional - RunPolicy commonv1.RunPolicy `json:"runPolicy"` - - // SuccessPolicy defines the policy to mark the AscendJob as succeeded. - // Default to "", using the default rules. - // +optional - SuccessPolicy *SuccessPolicy `json:"successPolicy,omitempty"` - - // SchedulerName defines the job scheduler with gang-scheduling enabled - SchedulerName string `json:"schedulerName,omitempty"` - - /* A map of ReplicaType (type) to ReplicaSpec (value). Specifies the ML cluster configuration. - For example, - { - "Scheduler": ReplacaSpec, - "Worker": ReplicaSpec, - } - */ - ReplicaSpecs map[commonv1.ReplicaType]*commonv1.ReplicaSpec `json:"replicaSpecs"` -} - -// AscendJobList contains a list of AscendJob -type AscendJobList struct { - metav1.TypeMeta `json:",inline"` - metav1.ListMeta `json:"metadata,omitempty"` - Items []AscendJob `json:"items"` -} - -// SuccessPolicy is the success policy. 
-type SuccessPolicy string - -const ( - // SuccessPolicyDefault is the default policy of success - SuccessPolicyDefault SuccessPolicy = "" - // SuccessPolicyAllWorkers is the 'ALLWorkers' policy of success - SuccessPolicyAllWorkers SuccessPolicy = "AllWorkers" -) diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/constants.go b/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/constants.go deleted file mode 100644 index 9341682..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/constants.go +++ /dev/null @@ -1,53 +0,0 @@ -/* -Copyright 2023 Huawei Technologies Co., Ltd. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package v1 - -import ( - "github.com/kubeflow/common/pkg/apis/common/v1" -) - -const ( - // GroupName is the group name used in this package. - GroupName = "mindxdl.gitee.com" - - // FrameworkKey the key of the laebl - FrameworkKey = "framework" - - // DefaultPort is default value of the port. 
- DefaultPort = 2222 - - // MindSporeFrameworkName is the name of ML Framework - MindSporeFrameworkName = "mindspore" - // MindSporeReplicaTypeScheduler is the type for Scheduler of distribute ML - MindSporeReplicaTypeScheduler v1.ReplicaType = "Scheduler" - - // PytorchFrameworkName is the name of ML Framework - PytorchFrameworkName = "pytorch" - // PytorchReplicaTypeMaster is the type for Scheduler of distribute ML - PytorchReplicaTypeMaster v1.ReplicaType = "Master" - - // TensorflowFrameworkName is the name of ML Framework - TensorflowFrameworkName = "tensorflow" - // TensorflowReplicaTypeChief is the type for Scheduler of distribute ML - TensorflowReplicaTypeChief v1.ReplicaType = "Chief" - - // ReplicaTypeWorker this is also used for non-distributed AscendJob - ReplicaTypeWorker v1.ReplicaType = "Worker" - - // DefaultRestartPolicy is default RestartPolicy for MSReplicaSpec. - DefaultRestartPolicy = v1.RestartPolicyNever -) diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/defaults.go b/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/defaults.go deleted file mode 100644 index 4d5c124..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/defaults.go +++ /dev/null @@ -1,137 +0,0 @@ -/* -Copyright 2023 Huawei Technologies Co., Ltd. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package v1 - -import ( - "errors" - "fmt" - "strings" - - commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" - "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/runtime" - - "ascend-common/api" -) - -// Int32 is a helper routine that allocates a new int32 value -// to store v and returns a pointer to it. -func Int32(v int32) *int32 { - return &v -} - -// addDefaultingFuncs is used to register default funcs -func addDefaultingFuncs(scheme *runtime.Scheme) error { - return RegisterDefaults(scheme) -} - -// setDefaultPort sets the default ports for mindxdl container. -func setDefaultPort(spec *v1.PodSpec) { - index := 0 - for i, container := range spec.Containers { - if container.Name == api.DefaultContainerName { - index = i - break - - } - } - hasASJobPort := false - for _, port := range spec.Containers[index].Ports { - if port.Name == api.DefaultPortName { - hasASJobPort = true - break - } - } - if !hasASJobPort { - spec.Containers[index].Ports = append(spec.Containers[index].Ports, v1.ContainerPort{ - Name: api.DefaultPortName, - ContainerPort: DefaultPort, - }) - } -} - -func setDefaultReplicas(spec *commonv1.ReplicaSpec) { - if spec.Replicas == nil { - spec.Replicas = Int32(1) - } - if spec.RestartPolicy == "" { - spec.RestartPolicy = DefaultRestartPolicy - } -} - -// setTypeNamesToCamelCase sets the name of all replica types from any case to correct case. -func setTypeNamesToCamelCase(job *AscendJob) { - setTypeNameToCamelCase(job, MindSporeReplicaTypeScheduler) - setTypeNameToCamelCase(job, ReplicaTypeWorker) - setTypeNameToCamelCase(job, PytorchReplicaTypeMaster) - setTypeNameToCamelCase(job, TensorflowReplicaTypeChief) -} - -// setTypeNameToCamelCase sets the name of the replica type from any case to correct case. -// E.g. from ps to PS; from WORKER to Worker. 
-func setTypeNameToCamelCase(job *AscendJob, typ commonv1.ReplicaType) { - for t := range job.Spec.ReplicaSpecs { - if strings.EqualFold(string(t), string(typ)) && t != typ { - spec := job.Spec.ReplicaSpecs[t] - delete(job.Spec.ReplicaSpecs, t) - job.Spec.ReplicaSpecs[typ] = spec - return - } - } -} - -// SetDefaultsAscendJob sets any unspecified values to defaults. -func SetDefaultsAscendJob(job *AscendJob) { - // Set default cleanpod policy to Running. - if job == nil { - return - } - - if job.Spec.RunPolicy.CleanPodPolicy == nil { - running := commonv1.CleanPodPolicyNone - job.Spec.RunPolicy.CleanPodPolicy = &running - } - // Set default success policy to "". - if job.Spec.SuccessPolicy == nil { - defaultPolicy := SuccessPolicyDefault - job.Spec.SuccessPolicy = &defaultPolicy - } - - // Update the key of replicaSpecs to camel case. - setTypeNamesToCamelCase(job) - - for rt, spec := range job.Spec.ReplicaSpecs { - // Set default replicas to 1. - setDefaultReplicas(spec) - // Set default port to ml container. - if rt == MindSporeReplicaTypeScheduler || rt == PytorchReplicaTypeMaster || rt == TensorflowReplicaTypeChief { - setDefaultPort(&spec.Template.Spec) - } - } -} - -// GetJobFramework get framework name of ascendjob -func GetJobFramework(job *AscendJob) (string, error) { - if job == nil || job.Labels == nil { - return "", errors.New("job or job labels is nil") - } - frame, ok := job.Labels[FrameworkKey] - if !ok { - return "", fmt.Errorf("job<%s-%s> label framework is not set", job.Namespace, job.Name) - } - return frame, nil -} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/register.go b/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/register.go deleted file mode 100644 index 5813e39..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/register.go +++ /dev/null @@ -1,52 +0,0 @@ -/* -Copyright 2023 Huawei Technologies Co., Ltd. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package v1 - -import ( - "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/schema" -) - -var ( - // SchemeGroupVersion is the group version used to register these objects. - SchemeGroupVersion = schema.GroupVersion{Group: GroupName, Version: "v1"} - // SchemeBuilder points to a list of functions added to Scheme. - SchemeBuilder = runtime.NewSchemeBuilder(addKnownTypes) - // AddToScheme adds the types in this group-version to the given scheme. - AddToScheme = SchemeBuilder.AddToScheme -) - -// Resource takes an unqualified resource and returns a Group-qualified GroupResource. -func Resource(resource string) schema.GroupResource { - return SchemeGroupVersion.WithResource(resource).GroupResource() -} - -// addKnownTypes adds the set of types defined in this package to the supplied scheme. 
-func addKnownTypes(scheme *runtime.Scheme) error { - scheme.AddKnownTypes(SchemeGroupVersion, - &AscendJob{}, - &AscendJobList{}, - ) - - v1.AddToGroupVersion(scheme, SchemeGroupVersion) - return nil -} - -func init() { - SchemeBuilder.Register(addDefaultingFuncs) -} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/zz_generated.deepcopy.go b/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/zz_generated.deepcopy.go deleted file mode 100644 index 695038b..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/zz_generated.deepcopy.go +++ /dev/null @@ -1,137 +0,0 @@ -//go:build !ignore_autogenerated -// +build !ignore_autogenerated - -/* -Copyright 2023 Huawei Technologies Co., Ltd. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// Code generated by controller-gen. DO NOT EDIT. - -package v1 - -import ( - commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" - "k8s.io/apimachinery/pkg/runtime" -) - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *AscendJob) DeepCopyInto(out *AscendJob) { - if in == nil { - return - } - *out = *in - out.TypeMeta = in.TypeMeta - in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) - in.Spec.DeepCopyInto(&out.Spec) - in.Status.DeepCopyInto(&out.Status) -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MSJob. 
-func (in *AscendJob) DeepCopy() *AscendJob { - if in == nil { - return nil - } - out := new(AscendJob) - in.DeepCopyInto(out) - return out -} - -// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. -func (in *AscendJob) DeepCopyObject() runtime.Object { - if in == nil { - return nil - } - if c := in.DeepCopy(); c != nil { - return c - } - return nil -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *AscendJobList) DeepCopyInto(out *AscendJobList) { - if in == nil { - return - } - *out = *in - out.TypeMeta = in.TypeMeta - in.ListMeta.DeepCopyInto(&out.ListMeta) - if in.Items != nil { - in, out := &in.Items, &out.Items - *out = make([]AscendJob, len(*in)) - for i := range *in { - (*in)[i].DeepCopyInto(&(*out)[i]) - } - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MSJobList. -func (in *AscendJobList) DeepCopy() *AscendJobList { - if in == nil { - return nil - } - out := new(AscendJobList) - in.DeepCopyInto(out) - return out -} - -// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. -func (in *AscendJobList) DeepCopyObject() runtime.Object { - if in == nil { - return nil - } - if c := in.DeepCopy(); c != nil { - return c - } - return nil -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
-func (in *AscendJobSpec) DeepCopyInto(out *AscendJobSpec) { - if in == nil { - return - } - *out = *in - in.RunPolicy.DeepCopyInto(&out.RunPolicy) - if in.SuccessPolicy != nil { - in, out := &in.SuccessPolicy, &out.SuccessPolicy - *out = new(SuccessPolicy) - **out = **in - } - if in.ReplicaSpecs != nil { - in, out := &in.ReplicaSpecs, &out.ReplicaSpecs - *out = make(map[commonv1.ReplicaType]*commonv1.ReplicaSpec, len(*in)) - for key, val := range *in { - var outVal *commonv1.ReplicaSpec - if val == nil { - (*out)[key] = nil - } else { - in, out := &val, &outVal - *out = new(commonv1.ReplicaSpec) - (*in).DeepCopyInto(*out) - } - (*out)[key] = outVal - } - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MSJobSpec. -func (in *AscendJobSpec) DeepCopy() *AscendJobSpec { - if in == nil { - return nil - } - out := new(AscendJobSpec) - in.DeepCopyInto(out) - return out -} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/zz_generated.defaults.go b/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/zz_generated.defaults.go deleted file mode 100644 index e9b774a..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/zz_generated.defaults.go +++ /dev/null @@ -1,53 +0,0 @@ -//go:build !ignore_autogenerated -// +build !ignore_autogenerated - -/* -Copyright 2023 Huawei Technologies Co., Ltd. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -// Code generated by defaulter-gen. DO NOT EDIT. - -package v1 - -import ( - "errors" - - runtime "k8s.io/apimachinery/pkg/runtime" -) - -// RegisterDefaults adds defaulters functions to the given scheme. -// Public to allow building arbitrary schemes. -// All generated defaulters are covering - they call all nested defaulters. -func RegisterDefaults(scheme *runtime.Scheme) error { - if scheme == nil { - return errors.New("scheme is nil") - } - scheme.AddTypeDefaultingFunc(&AscendJob{}, func(obj interface{}) { SetObjectDefaults_AscendJob(obj.(*AscendJob)) }) - scheme.AddTypeDefaultingFunc(&AscendJobList{}, func(obj interface{}) { SetObjectDefaults_AscendJobList(obj.(*AscendJobList)) }) - return nil -} - -func SetObjectDefaults_AscendJob(in *AscendJob) { - SetDefaultsAscendJob(in) -} - -func SetObjectDefaults_AscendJobList(in *AscendJobList) { - if in == nil { - return - } - for i := range in.Items { - a := &in.Items[i] - SetObjectDefaults_AscendJob(a) - } -} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/clientset.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/clientset.go deleted file mode 100644 index 0d4add4..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/clientset.go +++ /dev/null @@ -1,114 +0,0 @@ -/* -Copyright 2023 Huawei Technologies Co., Ltd. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -// Package versioned is used to define the ClientSet interface and struct, and its initialization. -package versioned - -import ( - "fmt" - "net/http" - - "k8s.io/client-go/discovery" - "k8s.io/client-go/rest" - "k8s.io/client-go/util/flowcontrol" - - "ascend-common/api/ascend-operator/client/clientset/versioned/typed/batch/v1" -) - -// Interface is the interface definition for Clientset. -type Interface interface { - Discovery() discovery.DiscoveryInterface - BatchV1() v1.BatchV1Interface -} - -// Clientset contains the clients for groups. Each group has exactly one -// version included in a Clientset. -type Clientset struct { - *discovery.DiscoveryClient - batchV1 *v1.BatchV1Client -} - -// BatchV1 retrieves the BatchV1alpha1Client -func (c *Clientset) BatchV1() v1.BatchV1Interface { - if c == nil { - return nil - } - return c.batchV1 -} - -// Discovery retrieves the DiscoveryClient -func (c *Clientset) Discovery() discovery.DiscoveryInterface { - if c == nil { - return nil - } - return c.DiscoveryClient -} - -// NewForConfig creates a new Clientset for the given config. -// If config's RateLimiter is not set and QPS and Burst are acceptable, -// NewForConfig will generate a rate-limiter in configShallowCopy. -// NewForConfig is equivalent to NewForConfigAndClient(c, httpClient), -// where httpClient was generated with rest.HTTPClientFor(c). -func NewForConfig(c *rest.Config) (*Clientset, error) { - configShallowCopy := *c - - // share the transport between all clients - httpClient, err := rest.HTTPClientFor(&configShallowCopy) - if err != nil { - return nil, err - } - - return NewForConfigAndClient(&configShallowCopy, httpClient) -} - -// NewForConfigAndClient creates a new Clientset for the given config and http client. -// Note the http client provided takes precedence over the configured transport values. -// If config's RateLimiter is not set and QPS and Burst are acceptable, -// NewForConfigAndClient will generate a rate-limiter in configShallowCopy. 
-func NewForConfigAndClient(c *rest.Config, httpClient *http.Client) (*Clientset, error) { - if c == nil || httpClient == nil { - return nil, fmt.Errorf("nil pointer") - } - configShallowCopy := *c - if configShallowCopy.RateLimiter == nil && configShallowCopy.QPS > 0 { - if configShallowCopy.Burst <= 0 { - return nil, fmt.Errorf("burst is required to be greater than 0 " + - "when RateLimiter is not set and QPS is set to greater than 0") - } - configShallowCopy.RateLimiter = flowcontrol.NewTokenBucketRateLimiter(configShallowCopy.QPS, configShallowCopy.Burst) - } - - var cs Clientset - var err error - cs.batchV1, err = v1.NewForConfigAndClient(&configShallowCopy, httpClient) - if err != nil { - return nil, err - } - cs.DiscoveryClient, err = discovery.NewDiscoveryClientForConfigAndClient(&configShallowCopy, httpClient) - if err != nil { - return nil, err - } - return &cs, nil -} - -// New creates a new Clientset for the given RESTClient. -func New(c rest.Interface) *Clientset { - var cs Clientset - cs.batchV1 = v1.New(c) - - cs.DiscoveryClient = discovery.NewDiscoveryClient(c) - return &cs -} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/scheme/register.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/scheme/register.go deleted file mode 100644 index 58a99b0..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/scheme/register.go +++ /dev/null @@ -1,39 +0,0 @@ -/* -Copyright 2023 Huawei Technologies Co., Ltd. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// Package scheme is used to add runtime.Scheme -package scheme - -import ( - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/serializer" - utilruntime "k8s.io/apimachinery/pkg/util/runtime" - - "ascend-common/api/ascend-operator/apis/batch/v1" -) - -// RuntimeScheme is a Scheme object instance. -var RuntimeScheme = runtime.NewScheme() - -// Codecs is a CodecFactory object instance. -var Codecs = serializer.NewCodecFactory(RuntimeScheme) - -// ParameterCodec is a parameterCodec object instance. -var ParameterCodec = runtime.NewParameterCodec(RuntimeScheme) - -func init() { - utilruntime.Must(v1.AddToScheme(RuntimeScheme)) -} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/typed/batch/v1/client.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/typed/batch/v1/client.go deleted file mode 100644 index 7dd8264..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/typed/batch/v1/client.go +++ /dev/null @@ -1,110 +0,0 @@ -/* -Copyright 2023 Huawei Technologies Co., Ltd. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -*/ - -// Package v1 is used to define some client- and job-related interfaces, initialization operations, -// and method implementations. -package v1 - -import ( - "errors" - "net/http" - - "k8s.io/client-go/rest" - - "ascend-common/api/ascend-operator/apis/batch/v1" - "ascend-common/api/ascend-operator/client/clientset/versioned/scheme" -) - -// BatchV1Interface is a batch client interface. -type BatchV1Interface interface { - RESTClient() rest.Interface - JobsGetter -} - -// BatchV1Client is a client structure. -type BatchV1Client struct { - restClient rest.Interface -} - -// Jobs returns a JobInterface object instance. -func (c *BatchV1Client) Jobs(namespace string) JobInterface { - if c == nil { - return nil - } - return newJobs(c, namespace) -} - -// RESTClient returns a RESTClient that is used to communicate -// with API server by this client implementation. -func (c *BatchV1Client) RESTClient() rest.Interface { - if c == nil { - return nil - } - return c.restClient -} - -// NewForConfig creates a new BatchV1alpha1Client for the given config. -// NewForConfig is equivalent to NewForConfigAndClient(c, httpClient), -// where httpClient was generated with rest.HTTPClientFor(c). 
-func NewForConfig(c *rest.Config) (*BatchV1Client, error) { - if c == nil { - return nil, errors.New(nilPointError) - } - config := *c - if err := setConfigDefaults(&config); err != nil { - return nil, err - } - httpClient, err := rest.HTTPClientFor(&config) - if err != nil { - return nil, err - } - return NewForConfigAndClient(&config, httpClient) -} - -func setConfigDefaults(config *rest.Config) error { - gv := v1.SchemeGroupVersion - config.GroupVersion = &gv - config.APIPath = "/apis" - config.NegotiatedSerializer = scheme.Codecs.WithoutConversion() - - if config.UserAgent == "" { - config.UserAgent = rest.DefaultKubernetesUserAgent() - } - - return nil -} - -// NewForConfigAndClient creates a new BatchV1alpha1Client for the given config and http client. -// Note the http client provided takes precedence over the configured transport values. -func NewForConfigAndClient(c *rest.Config, h *http.Client) (*BatchV1Client, error) { - if c == nil || h == nil { - return nil, errors.New(nilPointError) - } - config := *c - if err := setConfigDefaults(&config); err != nil { - return nil, err - } - client, err := rest.RESTClientForConfigAndClient(&config, h) - if err != nil { - return nil, err - } - return &BatchV1Client{restClient: client}, nil -} - -// New creates a new BatchV1alpha1Client for the given RESTClient. -func New(c rest.Interface) *BatchV1Client { - return &BatchV1Client{restClient: c} -} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/typed/batch/v1/job.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/typed/batch/v1/job.go deleted file mode 100644 index a6527ad..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/typed/batch/v1/job.go +++ /dev/null @@ -1,221 +0,0 @@ -/* -Copyright 2023 Huawei Technologies Co., Ltd. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package v1 - -import ( - "context" - "errors" - "time" - - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" - "k8s.io/apimachinery/pkg/watch" - "k8s.io/client-go/rest" - - "ascend-common/api" - "ascend-common/api/ascend-operator/apis/batch/v1" - "ascend-common/api/ascend-operator/client/clientset/versioned/scheme" -) - -const ( - nilPointError = "nil pointer" -) - -// JobsGetter has a method to return a JobInterface. -// A group's client should implement this interface. -type JobsGetter interface { - Jobs(namespace string) JobInterface -} - -// JobInterface has methods to work with Job resources. 
-type JobInterface interface { - Create(ctx context.Context, job *v1.AscendJob, opts metav1.CreateOptions) (*v1.AscendJob, error) - Update(ctx context.Context, job *v1.AscendJob, opts metav1.UpdateOptions) (*v1.AscendJob, error) - UpdateStatus(ctx context.Context, job *v1.AscendJob, opts metav1.UpdateOptions) (*v1.AscendJob, error) - Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error - DeleteCollection(ctx context.Context, opts metav1.DeleteOptions, listOpts metav1.ListOptions) error - Get(ctx context.Context, name string, opts metav1.GetOptions) (*v1.AscendJob, error) - List(ctx context.Context, opts metav1.ListOptions) (*v1.AscendJobList, error) - Watch(ctx context.Context, opts metav1.ListOptions) (watch.Interface, error) - Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions, - subresources ...string) (result *v1.AscendJob, err error) - // JobExpansion -} - -// jobs implements JobInterface -type jobs struct { - client rest.Interface - ns string -} - -func (j *jobs) Create(ctx context.Context, job *v1.AscendJob, opts metav1.CreateOptions) (*v1.AscendJob, error) { - if j == nil { - return nil, errors.New(nilPointError) - } - result := &v1.AscendJob{} - err := j.client.Post(). - Namespace(j.ns). - Resource(api.AscendJobsLowerCase). - VersionedParams(&opts, scheme.ParameterCodec). - Body(job). - Do(ctx). - Into(result) - return result, err -} - -func (j *jobs) Update(ctx context.Context, job *v1.AscendJob, opts metav1.UpdateOptions) (*v1.AscendJob, - error) { - if j == nil || job == nil { - return nil, errors.New(nilPointError) - } - result := &v1.AscendJob{} - err := j.client.Put(). - Namespace(j.ns). - Resource(api.AscendJobsLowerCase). - Name(job.Name). - VersionedParams(&opts, scheme.ParameterCodec). - Body(job). - Do(ctx). 
- Into(result) - return result, err -} - -func (j *jobs) UpdateStatus(ctx context.Context, job *v1.AscendJob, opts metav1.UpdateOptions) (*v1.AscendJob, - error) { - if j == nil || job == nil { - return nil, errors.New(nilPointError) - } - result := &v1.AscendJob{} - err := j.client.Put(). - Namespace(j.ns). - Resource(api.AscendJobsLowerCase). - Name(job.Name). - SubResource("status"). - VersionedParams(&opts, scheme.ParameterCodec). - Body(job). - Do(ctx). - Into(result) - return result, err -} - -func (j *jobs) Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error { - if j == nil { - return errors.New(nilPointError) - } - return j.client.Delete(). - Namespace(j.ns). - Resource(api.AscendJobsLowerCase). - Name(name). - Body(&opts). - Do(ctx). - Error() -} - -func (j *jobs) DeleteCollection(ctx context.Context, opts metav1.DeleteOptions, listOpts metav1.ListOptions) error { - if j == nil { - return errors.New(nilPointError) - } - var timeout time.Duration - if listOpts.TimeoutSeconds != nil { - timeout = time.Duration(*listOpts.TimeoutSeconds) * time.Second - } - return j.client.Delete(). - Namespace(j.ns). - Resource(api.AscendJobsLowerCase). - VersionedParams(&listOpts, scheme.ParameterCodec). - Timeout(timeout). - Body(&opts). - Do(ctx). - Error() -} - -func (j *jobs) Get(ctx context.Context, name string, opts metav1.GetOptions) (*v1.AscendJob, error) { - if j == nil { - return nil, errors.New(nilPointError) - } - result := &v1.AscendJob{} - err := j.client.Get(). - Namespace(j.ns). - Resource(api.AscendJobsLowerCase). - Name(name). - VersionedParams(&opts, scheme.ParameterCodec). - Do(ctx). 
- Into(result) - return result, err -} - -func (j *jobs) List(ctx context.Context, opts metav1.ListOptions) (*v1.AscendJobList, error) { - if j == nil { - return nil, errors.New(nilPointError) - } - var timeout time.Duration - if opts.TimeoutSeconds != nil { - timeout = time.Duration(*opts.TimeoutSeconds) * time.Second - } - result := &v1.AscendJobList{} - err := j.client.Get(). - Namespace(j.ns). - Resource(api.AscendJobsLowerCase). - VersionedParams(&opts, scheme.ParameterCodec). - Timeout(timeout). - Do(ctx). - Into(result) - return result, err -} - -func (j *jobs) Watch(ctx context.Context, opts metav1.ListOptions) (watch.Interface, error) { - if j == nil { - return nil, errors.New(nilPointError) - } - var timeout time.Duration - if opts.TimeoutSeconds != nil { - timeout = time.Duration(*opts.TimeoutSeconds) * time.Second - } - opts.Watch = true - return j.client.Get(). - Namespace(j.ns). - Resource(api.AscendJobsLowerCase). - VersionedParams(&opts, scheme.ParameterCodec). - Timeout(timeout). - Watch(ctx) -} - -func (j *jobs) Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions, - subresources ...string) (*v1.AscendJob, error) { - if j == nil { - return nil, errors.New(nilPointError) - } - result := &v1.AscendJob{} - err := j.client.Patch(pt). - Namespace(j.ns). - Resource(api.AscendJobsLowerCase). - Name(name). - SubResource(subresources...). - VersionedParams(&opts, scheme.ParameterCodec). - Body(data). - Do(ctx). 
- Into(result) - return result, err -} - -// newJobs returns a Jobs -func newJobs(c *BatchV1Client, namespace string) *jobs { - return &jobs{ - client: c.RESTClient(), - ns: namespace, - } -} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/interface.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/interface.go deleted file mode 100644 index 78b5d12..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/interface.go +++ /dev/null @@ -1,49 +0,0 @@ -/* -Copyright 2023 Huawei Technologies Co., Ltd. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// Package batch is used to define interfaces. -package batch - -import ( - "ascend-common/api/ascend-operator/client/informers/externalversions/batch/v1" - "ascend-common/api/ascend-operator/client/informers/externalversions/internalinterfaces" -) - -// Interface provides access to each of this group's versions. -type Interface interface { - // V1 provides access to shared informers for resources in V1alpha1. - V1() v1.Interface -} - -type group struct { - factory internalinterfaces.SharedInformerFactory - namespace string - tweakListOptions internalinterfaces.TweakListOptionsFunc -} - -// New returns a new Interface. 
-func New(f internalinterfaces.SharedInformerFactory, namespace string, - tweakListOptions internalinterfaces.TweakListOptionsFunc) Interface { - return &group{factory: f, namespace: namespace, tweakListOptions: tweakListOptions} -} - -// V1 returns a new v1alpha1.Interface. -func (g *group) V1() v1.Interface { - if g == nil { - return nil - } - return v1.New(g.factory, g.namespace, g.tweakListOptions) -} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/v1/interface.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/v1/interface.go deleted file mode 100644 index a4f0466..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/v1/interface.go +++ /dev/null @@ -1,48 +0,0 @@ -/* -Copyright 2023 Huawei Technologies Co., Ltd. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// Package v1 is used to implement job informer-related methods. -package v1 - -import ( - "ascend-common/api/ascend-operator/client/informers/externalversions/internalinterfaces" -) - -// Interface provides access to all the informers in this group version. -type Interface interface { - // Jobs returns a JobInformer. - Jobs() JobInformer -} - -type version struct { - factory internalinterfaces.SharedInformerFactory - namespace string - tweakListOptions internalinterfaces.TweakListOptionsFunc -} - -// New returns a new Interface. 
-func New(f internalinterfaces.SharedInformerFactory, namespace string, - tweakListOptions internalinterfaces.TweakListOptionsFunc) Interface { - return &version{factory: f, namespace: namespace, tweakListOptions: tweakListOptions} -} - -// Jobs returns a JobInformer. -func (v *version) Jobs() JobInformer { - if v == nil { - return nil - } - return &jobInformer{factory: v.factory, namespace: v.namespace, tweakListOptions: v.tweakListOptions} -} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/v1/job.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/v1/job.go deleted file mode 100644 index e5f0b1c..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/v1/job.go +++ /dev/null @@ -1,99 +0,0 @@ -/* -Copyright 2023 Huawei Technologies Co., Ltd. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package v1 - -import ( - "context" - "time" - - "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/watch" - "k8s.io/client-go/tools/cache" - - batchv1 "ascend-common/api/ascend-operator/apis/batch/v1" - "ascend-common/api/ascend-operator/client/clientset/versioned" - "ascend-common/api/ascend-operator/client/informers/externalversions/internalinterfaces" - batchlister "ascend-common/api/ascend-operator/client/listers/batch/v1" -) - -// JobInformer provides access to a shared informer and lister for -// Jobs. -type JobInformer interface { - Informer() cache.SharedIndexInformer - Lister() batchlister.JobLister -} - -type jobInformer struct { - factory internalinterfaces.SharedInformerFactory - tweakListOptions internalinterfaces.TweakListOptionsFunc - namespace string -} - -// NewJobInformer constructs a new informer for Job type. -// Always prefer using an informer factory to get a shared informer instead of getting an independent -// one. This reduces memory footprint and number of connections to the server. -func NewJobInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, - indexers cache.Indexers) cache.SharedIndexInformer { - return NewFilteredJobInformer(client, namespace, resyncPeriod, indexers, nil) -} - -// NewFilteredJobInformer constructs a new informer for Job type. -// Always prefer using an informer factory to get a shared informer instead of getting an independent -// one. This reduces memory footprint and number of connections to the server. 
-func NewFilteredJobInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, - indexers cache.Indexers, tweakListOptions internalinterfaces.TweakListOptionsFunc) cache.SharedIndexInformer { - return cache.NewSharedIndexInformer( - &cache.ListWatch{ - ListFunc: func(options v1.ListOptions) (runtime.Object, error) { - if tweakListOptions != nil { - tweakListOptions(&options) - } - return client.BatchV1().Jobs(namespace).List(context.TODO(), options) - }, - WatchFunc: func(options v1.ListOptions) (watch.Interface, error) { - if tweakListOptions != nil { - tweakListOptions(&options) - } - return client.BatchV1().Jobs(namespace).Watch(context.TODO(), options) - }, - }, - &batchv1.AscendJob{}, - resyncPeriod, - indexers, - ) -} - -func (f *jobInformer) defaultInformer(client versioned.Interface, - resyncPeriod time.Duration) cache.SharedIndexInformer { - return NewFilteredJobInformer(client, f.namespace, resyncPeriod, cache.Indexers{ - cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}, f.tweakListOptions) -} - -func (f *jobInformer) Informer() cache.SharedIndexInformer { - if f == nil || f.factory == nil { - return nil - } - return f.factory.InformerFor(&batchv1.AscendJob{}, f.defaultInformer) -} - -func (f *jobInformer) Lister() batchlister.JobLister { - if f == nil { - return nil - } - return batchlister.NewJobLister(f.Informer().GetIndexer()) -} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/factory.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/factory.go deleted file mode 100644 index 5fec15f..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/factory.go +++ /dev/null @@ -1,207 +0,0 @@ -/* -Copyright 2023 Huawei Technologies Co., Ltd. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package externalversions - -import ( - "reflect" - "sync" - "time" - - "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/schema" - "k8s.io/client-go/tools/cache" - - "ascend-common/api/ascend-operator/client/clientset/versioned" - "ascend-common/api/ascend-operator/client/informers/externalversions/batch" - "ascend-common/api/ascend-operator/client/informers/externalversions/internalinterfaces" -) - -// SharedInformerFactory provides shared informers for resources in all known -// API group versions. -type SharedInformerFactory interface { - internalinterfaces.SharedInformerFactory - ForResource(resource schema.GroupVersionResource) (GenericInformer, error) - WaitForCacheSync(stopCh <-chan struct{}) map[reflect.Type]bool - - Batch() batch.Interface -} - -// SharedInformerOption defines the functional option type for SharedInformerFactory. -type SharedInformerOption func(*sharedInformerFactory) *sharedInformerFactory - -type sharedInformerFactory struct { - client versioned.Interface - namespace string - tweakListOptions internalinterfaces.TweakListOptionsFunc - lock sync.Mutex - defaultResync time.Duration - customResync map[reflect.Type]time.Duration - - informers map[reflect.Type]cache.SharedIndexInformer - // startedInformers is used for tracking which informers have been started. - // This allows Start() to be called multiple times safely. - startedInformers map[reflect.Type]bool -} - -// WithCustomResyncConfig sets a custom resync period for the specified informer types. 
-func WithCustomResyncConfig(resyncConfig map[v1.Object]time.Duration) SharedInformerOption { - return func(factory *sharedInformerFactory) *sharedInformerFactory { - if factory == nil { - return factory - } - - if factory.customResync == nil { - factory.customResync = make(map[reflect.Type]time.Duration) - } - - for k, v := range resyncConfig { - factory.customResync[reflect.TypeOf(k)] = v - } - return factory - } -} - -// WithTweakListOptions sets a custom filter on all listers of the configured SharedInformerFactory. -func WithTweakListOptions(tweakListOptions internalinterfaces.TweakListOptionsFunc) SharedInformerOption { - return func(factory *sharedInformerFactory) *sharedInformerFactory { - if factory == nil { - return nil - } - factory.tweakListOptions = tweakListOptions - return factory - } -} - -// WithNamespace limits the SharedInformerFactory to the specified namespace. -func WithNamespace(namespace string) SharedInformerOption { - return func(factory *sharedInformerFactory) *sharedInformerFactory { - if factory == nil { - return nil - } - factory.namespace = namespace - return factory - } -} - -// NewSharedInformerFactory constructs a new instance of sharedInformerFactory for all namespaces. -func NewSharedInformerFactory(client versioned.Interface, defaultResync time.Duration) SharedInformerFactory { - return NewSharedInformerFactoryWithOptions(client, defaultResync) -} - -// NewSharedInformerFactoryWithOptions constructs a new instance of a SharedInformerFactory with additional options. 
-func NewSharedInformerFactoryWithOptions(client versioned.Interface, defaultResync time.Duration, - options ...SharedInformerOption) SharedInformerFactory { - factory := &sharedInformerFactory{ - client: client, - namespace: v1.NamespaceAll, - defaultResync: defaultResync, - informers: make(map[reflect.Type]cache.SharedIndexInformer), - startedInformers: make(map[reflect.Type]bool), - customResync: make(map[reflect.Type]time.Duration), - } - - // Apply all options - for _, opt := range options { - factory = opt(factory) - } - - return factory -} - -// Start initializes all requested informers. -func (f *sharedInformerFactory) Start(stopCh <-chan struct{}) { - if f == nil { - return - } - f.lock.Lock() - defer f.lock.Unlock() - - if f.startedInformers == nil { - f.startedInformers = make(map[reflect.Type]bool) - } - - for informerType, informer := range f.informers { - if !f.startedInformers[informerType] { - go informer.Run(stopCh) - f.startedInformers[informerType] = true - } - } -} - -// WaitForCacheSync waits for all started informers' cache were synced. -func (f *sharedInformerFactory) WaitForCacheSync(stopCh <-chan struct{}) map[reflect.Type]bool { - informers := func() map[reflect.Type]cache.SharedIndexInformer { - if f == nil { - return nil - } - f.lock.Lock() - defer f.lock.Unlock() - - informers := map[reflect.Type]cache.SharedIndexInformer{} - for informerType, informer := range f.informers { - if f.startedInformers[informerType] { - informers[informerType] = informer - } - } - return informers - }() - - res := map[reflect.Type]bool{} - for informType, informer := range informers { - res[informType] = cache.WaitForCacheSync(stopCh, informer.HasSynced) - } - return res -} - -// InternalInformerFor returns the SharedIndexInformer for obj using an internal -// client. 
-func (f *sharedInformerFactory) InformerFor(obj runtime.Object, - newFunc internalinterfaces.NewInformerFunc) cache.SharedIndexInformer { - if f == nil { - return nil - } - - f.lock.Lock() - defer f.lock.Unlock() - - informerType := reflect.TypeOf(obj) - informer, exists := f.informers[informerType] - if exists { - return informer - } - - resyncPeriod, exists := f.customResync[informerType] - if !exists { - resyncPeriod = f.defaultResync - } - - informer = newFunc(f.client, resyncPeriod) - if f.informers == nil { - f.informers = make(map[reflect.Type]cache.SharedIndexInformer) - } - f.informers[informerType] = informer - - return informer -} - -func (f *sharedInformerFactory) Batch() batch.Interface { - if f == nil { - return nil - } - return batch.New(f, f.namespace, f.tweakListOptions) -} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/generic.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/generic.go deleted file mode 100644 index 95db6d0..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/generic.go +++ /dev/null @@ -1,71 +0,0 @@ -/* -Copyright 2023 Huawei Technologies Co., Ltd. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package externalversions - -import ( - "errors" - "fmt" - - "k8s.io/apimachinery/pkg/runtime/schema" - "k8s.io/client-go/tools/cache" - - "ascend-common/api" - "ascend-common/api/ascend-operator/apis/batch/v1" -) - -// GenericInformer is type of SharedIndexInformer which will locate and delegate to other -// sharedInformers based on type -type GenericInformer interface { - Informer() cache.SharedIndexInformer - Lister() cache.GenericLister -} - -type genericInformer struct { - informer cache.SharedIndexInformer - resource schema.GroupResource -} - -// Informer returns the SharedIndexInformer. -func (f *genericInformer) Informer() cache.SharedIndexInformer { - if f == nil { - return nil - } - return f.informer -} - -// Lister returns the GenericLister. -func (f *genericInformer) Lister() cache.GenericLister { - if f == nil { - return nil - } - return cache.NewGenericLister(f.Informer().GetIndexer(), f.resource) -} - -// ForResource gives generic access to a shared informer of the matching type -// extend this to unknown resources with a client pool -func (f *sharedInformerFactory) ForResource(resource schema.GroupVersionResource) (GenericInformer, error) { - if f == nil { - return nil, errors.New("nil pointer") - } - switch resource { - case v1.SchemeGroupVersion.WithResource(api.AscendJobsLowerCase): - return &genericInformer{resource: resource.GroupResource(), informer: f.Batch().V1().Jobs().Informer()}, nil - default: - } - - return nil, fmt.Errorf("no informer found for %v", resource) -} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/internalinterfaces/factory_interfaces.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/internalinterfaces/factory_interfaces.go deleted file mode 100644 index 5602b78..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/internalinterfaces/factory_interfaces.go +++ 
/dev/null @@ -1,40 +0,0 @@ -/* -Copyright 2019 Bloomberg Finance LP. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// Package internalinterfaces is used to define informer-related interfaces. -package internalinterfaces - -import ( - "time" - - "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/client-go/tools/cache" - - "ascend-common/api/ascend-operator/client/clientset/versioned" -) - -// NewInformerFunc takes versioned.Interface and time.Duration to return a SharedIndexInformer. -type NewInformerFunc func(versioned.Interface, time.Duration) cache.SharedIndexInformer - -// SharedInformerFactory a small interface to allow for adding an informer without an import cycle -type SharedInformerFactory interface { - Start(stopCh <-chan struct{}) - InformerFor(obj runtime.Object, newFunc NewInformerFunc) cache.SharedIndexInformer -} - -// TweakListOptionsFunc is a function that transforms a v1.ListOptions. -type TweakListOptionsFunc func(*v1.ListOptions) diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/listers/batch/v1/expansion_generated.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/listers/batch/v1/expansion_generated.go deleted file mode 100644 index 9ed431c..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/client/listers/batch/v1/expansion_generated.go +++ /dev/null @@ -1,26 +0,0 @@ -/* -Copyright 2024 Huawei Technologies Co., Ltd. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// Package v1 is used to define job-related interfaces. -package v1 - -// JobListerExpansion allows custom methods to be added to -// JobLister. -type JobListerExpansion interface{} - -// JobNamespaceListerExpansion allows custom methods to be added to -// JobNamespaceLister. -type JobNamespaceListerExpansion interface{} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/listers/batch/v1/job.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/listers/batch/v1/job.go deleted file mode 100644 index 084a913..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/client/listers/batch/v1/job.go +++ /dev/null @@ -1,108 +0,0 @@ -/* -Copyright 2024 Huawei Technologies Co., Ltd. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package v1 - -import ( - "errors" - - k8serr "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/labels" - "k8s.io/client-go/tools/cache" - - "ascend-common/api/ascend-operator/apis/batch/v1" -) - -// JobLister helps list Jobs. -// All objects returned here must be treated as read-only. -type JobLister interface { - // List lists all Jobs in the indexer. - // Objects returned here must be treated as read-only. - List(selector labels.Selector) (ret []*v1.AscendJob, err error) - // Jobs returns an object that can list and get Jobs. - Jobs(namespace string) JobNamespaceLister - JobListerExpansion -} - -// jobLister implements the JobLister interface. -type jobLister struct { - indexer cache.Indexer -} - -// NewJobLister returns a new JobLister. -func NewJobLister(indexer cache.Indexer) JobLister { - return &jobLister{indexer: indexer} -} - -// List lists all Jobs in the indexer. -func (s *jobLister) List(selector labels.Selector) ([]*v1.AscendJob, error) { - if s == nil { - return nil, errors.New("nil pointer") - } - var ret []*v1.AscendJob - err := cache.ListAll(s.indexer, selector, func(m interface{}) { - ret = append(ret, m.(*v1.AscendJob)) - }) - return ret, err -} - -// Jobs returns an object that can list and get Jobs. -func (s *jobLister) Jobs(namespace string) JobNamespaceLister { - if s == nil { - return nil - } - return jobNamespaceLister{indexer: s.indexer, namespace: namespace} -} - -// JobNamespaceLister helps list and get Jobs. -// All objects returned here must be treated as read-only. -type JobNamespaceLister interface { - // List lists all Jobs in the indexer for a given namespace. - // Objects returned here must be treated as read-only. - List(selector labels.Selector) (ret []*v1.AscendJob, err error) - // Get retrieves the Job from the indexer for a given namespace and name. - // Objects returned here must be treated as read-only. 
- Get(name string) (*v1.AscendJob, error) - JobNamespaceListerExpansion -} - -// jobNamespaceLister implements the JobNamespaceLister -// interface. -type jobNamespaceLister struct { - indexer cache.Indexer - namespace string -} - -// List lists all Jobs in the indexer for a given namespace. -func (s jobNamespaceLister) List(selector labels.Selector) ([]*v1.AscendJob, error) { - var ret []*v1.AscendJob - err := cache.ListAllByNamespace(s.indexer, s.namespace, selector, func(m interface{}) { - ret = append(ret, m.(*v1.AscendJob)) - }) - return ret, err -} - -// Get retrieves the Job from the indexer for a given namespace and name. -func (s jobNamespaceLister) Get(name string) (*v1.AscendJob, error) { - obj, exists, err := s.indexer.GetByKey(s.namespace + "/" + name) - if err != nil { - return nil, err - } - if !exists { - return nil, k8serr.NewNotFound(v1.Resource("job"), name) - } - return obj.(*v1.AscendJob), nil -} diff --git a/mind-cluster/component/ascend-common/api/consts.go b/mind-cluster/component/ascend-common/api/consts.go deleted file mode 100644 index 01881ce..0000000 --- a/mind-cluster/component/ascend-common/api/consts.go +++ /dev/null @@ -1,222 +0,0 @@ -// Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. 
- -// Package api common const -package api - -// Env -const ( - NodeNameEnv = "NODE_NAME" - - // PtWorldSizeEnv the total number of npu used for the task for PyTorch - PtWorldSizeEnv = "WORLD_SIZE" - // PtLocalWorldSizeEnv number of npu used per pod for PyTorch - PtLocalWorldSizeEnv = "LOCAL_WORLD_SIZE" - // PtLocalRankEnv logic id List of npu used by pod for PyTorch - PtLocalRankEnv = "LOCAL_RANK" - - // TfWorkerSizeEnv the total number of npu used for the task for TensorFlow - TfWorkerSizeEnv = "CM_WORKER_SIZE" - // TfLocalWorkerEnv number of npu used per pod for TensorFlow - TfLocalWorkerEnv = "CM_LOCAL_WORKER" - - // MsWorkerNumEnv the total number of npu used for the task for MindSpore - MsWorkerNumEnv = "MS_WORKER_NUM" - // MsLocalWorkerEnv number of npu used per pod for MindSpore - MsLocalWorkerEnv = "MS_LOCAL_WORKER" -) - -// NameSpace -const ( - DLNamespace = "mindx-dl" - ClusterNS = "cluster-system" - KubeNS = "kube-system" -) - -// Node -const ( - // NPUChipMemoryLabel label value is npu chip memory - NPUChipMemoryLabel = "mind-cluster/npu-chip-memory" - - // NodeSNAnnotation annotation value is node sn - NodeSNAnnotation = "product-serial-number" - // BaseDevInfoAnno annotation value is device base info - BaseDevInfoAnno = "baseDeviceInfos" - - // AcceleratorTypeKey the node label key of accelerator type - AcceleratorTypeKey = "accelerator-type" - // AcceleratorTypeModule910A3SuperPod for 910A3-SuperPod hardware - AcceleratorTypeModule910A3SuperPod = "module-a3-16-super-pod" -) - -// Pod -const ( - // PodUsedHardwareTypeAnno annotation value is the hardware type that real used in pod - PodUsedHardwareTypeAnno = "mind-cluster/hardware-type" - // PodRankIndexAnno annotation value is rank index of the pod - PodRankIndexAnno = "hccl/rankIndex" - // SuperPodIDAnno annotation key of the super pod id - SuperPodIDAnno = "super-pod-id" - - // Hotswitch Annotations - - // InHotSwitchFlowKey in hot switch flow key - InHotSwitchFlowKey = "inHotSwitchFlow" - // 
InHotSwitchFlowValue in hot switch flow true - InHotSwitchFlowValue = "true" - // BackupNewPodNameKey backup new pod name key - BackupNewPodNameKey = "backupNewPodName" - // BackupSourcePodNameKey backup source pod name key - BackupSourcePodNameKey = "backupSourcePodName" - // NeedOperatorOpeKey need operator ope key - NeedOperatorOpeKey = "needOperatorOpe" - // NeedVolcanoOpeKey need volcano ope key - NeedVolcanoOpeKey = "needVolcanoOpe" - // OpeTypeDelete ope type delete - OpeTypeDelete = "delete" - // OpeTypeCreate ope type create - OpeTypeCreate = "create" - // PodTypeKey pod type key - PodTypeKey = "podType" - // PodTypeBackup pod type backup - PodTypeBackup = "backup" - // DefaultRetryTimes default retry times - DefaultRetryTimes = 3 - // MasterPodRank master pod rank - MasterPodRank = "0" -) - -// PodGroup -const ( - // AtlasTaskLabel label value task kind, eg. ascend-910, ascend-{xxx}b - AtlasTaskLabel = "ring-controller.atlas" -) - -// ConfigMap -const ( - // DeviceInfoCMDataKey device-info-cm data key, record device info - DeviceInfoCMDataKey = "DeviceInfoCfg" - // SwitchInfoCMDataKey device-info-cm data key, record switch info - SwitchInfoCMDataKey = "SwitchInfoCfg" - // NodeInfoCMDataKey node-info-cm data key, record node info - NodeInfoCMDataKey = "NodeInfo" - // PubFaultCMDataKey public fault cm data key, record public fault info - PubFaultCMDataKey = "PublicFault" - - // CIMCMLabelKey cm label key, who uses these cms - CIMCMLabelKey = "mx-consumer-cim" - // PubFaultCMLabelKey public fault cm label key - PubFaultCMLabelKey = "mc-consumer-publicfault" -) - -const ( - // FaultJobCmName fault job cm name - FaultJobCmName = "fault-job-info" -) - -const ( - // PodScheduleLabel pod schedule label - PodScheduleLabel = "pod-rescheduling" - // ProcessScheduleLabel process schedule label - ProcessScheduleLabel = "process-recover-enable" - // RecoverStrategyKey recover strategy key in job annotation - RecoverStrategyKey = "recover-strategy" -) - -// process 
schedule strategy -const ( - // RecoverStrategy recover strategy - RecoverStrategy = "recover" - // RetryStrategy retry strategy - RetryStrategy = "retry" - // InPlaceStrategy recover in place strategy - InPlaceStrategy = "recover-in-place" - // DumpStrategy dump strategy - DumpStrategy = "dump" - // ExitStrategy exit strategy - ExitStrategy = "exit" - // ElasticTraining elastic-training strategy - ElasticTraining = "elastic-training" -) - -// process schedule common env -const ( - // ProcessRecoverEnv process recover env - ProcessRecoverEnv = "PROCESS_RECOVER" - // ElasticRecoverEnv elastic process recover env - ElasticRecoverEnv = "ELASTIC_PROCESS_RECOVER_ENABLE" - // EnableRestartEnv enable restart env - EnableRestartEnv = "ENABLE_RESTART_FAULT_PROCESS" -) - -// process schedule pytorch env -const ( - // HighAvailableEnv high available env - HighAvailableEnv = "HIGH_AVAILABILITY" - // PtCloseWatchDogKey pt close watch dog key - PtCloseWatchDogKey = "HCCL_ASYNC_ERROR_HANDLING" - // PtCloseWatchDogValue pt close watch dog value - PtCloseWatchDogValue = "0" -) - -// process schedule ms env -const ( - // MsRecoverEnv ms recover env - MsRecoverEnv = "MS_ENABLE_TFT" - // EnableMS enable ms - EnableMS = "MINDIO_FOR_MINDSPORE" - // MsDumpStrategy ms dump strategy - MsDumpStrategy = "TTP:1" - // MsUceStrategy ms uce strategy - MsUceStrategy = "UCE:1" - // MsArfStrategy ms arf strategy - MsArfStrategy = "ARF:1" - // MsHcceStrategy ms hcce strategy - MsHcceStrategy = "HCCE:1" - // MsRscStrategy ms rsc strategy - MsRscStrategy = "RSC:1" - // MsCloseWatchDogKey ms close watch dog key - MsCloseWatchDogKey = "MS_ENABLE_THM" - // MsCloseWatchDogValue ms close watch dog value - MsCloseWatchDogValue = `{HCCL_WATCHDOG:0}` -) - -const ( - //EnableFunc Enable Func - EnableFunc = "on" - // EnableFlag enable flag - EnableFlag = "1" - // PytorchFramework framework - PytorchFramework = "pytorch" - // MindSporeFramework framework - MindSporeFramework = "mindspore" -) - -const ( - // 
RescheduleInPlaceKey reschedule in place key - RescheduleInPlaceKey = "reschedule-in-place" - // RescheduleInPlaceValue reschedule in place value - RescheduleInPlaceValue = "true" -) - -const ( - // DeviceResetTimeout device reset timeout - DeviceResetTimeout = "deviceResetTimeout" - // DefaultDeviceResetTimeout default device reset timeout is 60 seconds - DefaultDeviceResetTimeout = 60 - // MinDeviceResetTimeout min device reset timeout is 10 seconds - MinDeviceResetTimeout = 10 - // MaxDeviceResetTimeout max device reset timeout is 600 seconds - MaxDeviceResetTimeout = 600 -) - -const ( - // SubHealthyStrategy config in pod group label for subHealthy fault strategy - SubHealthyStrategy = "subHealthyStrategy" - // SubHealthyHotSwitch strategy name of hot switch - SubHealthyHotSwitch = "hotSwitch" -) - -const ( - // MinAvailableKey decide minAvailable of task - MinAvailableKey = "huawei.com/schedule_minAvailable" -) diff --git a/mind-cluster/component/ascend-common/api/default_name.go b/mind-cluster/component/ascend-common/api/default_name.go deleted file mode 100644 index 7f0ae6c..0000000 --- a/mind-cluster/component/ascend-common/api/default_name.go +++ /dev/null @@ -1,188 +0,0 @@ -// Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. 
- -// Package api common brand moniker -package api - -// common -const ( - // Pod910DeviceAnno annotation value is for generating 910 hccl rank table - Pod910DeviceAnno = "ascend.kubectl.kubernetes.io/ascend-910-configuration" - - // ResourceNamePrefix pre resource name - ResourceNamePrefix = "huawei.com/" - // PodRealAlloc pod annotation key, means pod real mount device - PodRealAlloc = "AscendReal" - - // PodAnnotationAscendReal pod annotation ascend real - PodAnnotationAscendReal = "huawei.com/AscendReal" - - // Ascend brand name - Ascend = "Ascend" - // AscendJob job kind is AscendJob - AscendJob = "AscendJob" - // AscendJobsLowerCase for ascend jobs lowercase - AscendJobsLowerCase = "ascendjobs" - - // AscendOperator ascend-Operator - AscendOperator = "ascend-Operator" -) - -// common 910 -const ( - // Ascend910 for 910 chip - Ascend910 = "Ascend910" - // Ascend910Lowercase for 910 chip lowercase - Ascend910Lowercase = "ascend910" - // HuaweiAscend910 ascend 910 chip with prefix - HuaweiAscend910 = "huawei.com/Ascend910" - // Ascend910MinuxPrefix name prefix of ascend 910 chip - Ascend910MinuxPrefix = "Ascend910-" - // Ascend910MinuxCase minus type of ascend 910 chip - Ascend910MinuxCase = "ascend-910" - // Ascend910No 910 chip number - Ascend910No = "910" -) - -// common 910 A1 -const ( - // Ascend910A ascend 910A chip - Ascend910A = "Ascend910" - // Ascend910APattern regular expression for 910A - Ascend910APattern = `^910` -) - -// common 910 A2 -const ( - // Ascend910B ascend 910B chip - Ascend910B = "Ascend910B" - // Ascend910BPattern regular expression for 910B - Ascend910BPattern = `^(910B\d{1}|A2G\d{1})` -) - -// common 910 A3 -const ( - // Ascend910A3 ascend Ascend910A3 chip - Ascend910A3 = "Ascend910A3" -) - -// common 310 -const ( - // Ascend310 ascend 310 chip - Ascend310 = "Ascend310" - // Ascend310Lowercase ascend 310 chip lowercase - Ascend310Lowercase = "ascend310" - // Ascend310No 310 chip number - Ascend310No = "310" - // HuaweiAscend310 
ascend 310 chip with prefix - HuaweiAscend310 = "huawei.com/Ascend310" - // Ascend310MinuxPrefix name prefix of ascend 310 chip - Ascend310MinuxPrefix = "Ascend310-" -) - -// common 310B -const ( - // Ascend310B ascend 310B chip - Ascend310B = "Ascend310B" - // Ascend310BNo 310B chip number - Ascend310BNo = "310B" -) - -// common 310P -const ( - // Ascend310P ascend 310P chip - Ascend310P = "Ascend310P" - // Ascend310PLowercase ascend 310P chip lowercase - Ascend310PLowercase = "ascend310P" - // Ascend310PNo 310P chip number - Ascend310PNo = "310P" - // Ascend310PPattern regular expression for 310P - Ascend310PPattern = `^(310P\d{0,1}|I2\d{0,1})` - // HuaweiAscend310P ascend 310P chip with prefix - HuaweiAscend310P = "huawei.com/Ascend310P" - // Ascend310PMinuxPrefix name prefix of ascend 310P chip - Ascend310PMinuxPrefix = "Ascend310P-" -) - -// device plugin -const ( - // Use310PMixedInsert use 310P Mixed insert - Use310PMixedInsert = "use310PMixedInsert" - // Ascend310PMix dp use310PMixedInsert parameter usage - Ascend310PMix = "ascend310P-V, ascend310P-VPro, ascend310P-IPro" - // A300IA2Label the value of the A300I A2 node label - A300IA2Label = "card-910b-infer" - // A300IDuoLabel the value of the A300I Duo node label - A300IDuoLabel = "card-300i-duo" - //UseAscendDocker UseAscendDocker parameter - UseAscendDocker = "useAscendDocker" -) - -// docker runtime -const ( - // AscendDockerRuntime ascend-docker-runtime - AscendDockerRuntime = "ascend-docker-runtime" - // AscendDockerHook ascend-docker-hook - AscendDockerHook = "ascend-docker-hook" - // AscendDockerDestroy ascend-docker-destroy - AscendDockerDestroy = "ascend-docker-destroy" - // AscendDockerCli ascend-docker-cli - AscendDockerCli = "ascend-docker-cli" - - // AscendDockerRuntimeEnv env variable - AscendDockerRuntimeEnv = "ASCEND_DOCKER_RUNTIME" - // AscendVisibleDevicesEnv env variable - AscendVisibleDevicesEnv = "ASCEND_VISIBLE_DEVICES" - // AscendRuntimeOptionsEnv env variable - 
AscendRuntimeOptionsEnv = "ASCEND_RUNTIME_OPTIONS" - // AscendRuntimeMountsEnv env variable - AscendRuntimeMountsEnv = "ASCEND_RUNTIME_MOUNTS" - // AscendAllowLinkEnv env variable - AscendAllowLinkEnv = "ASCEND_ALLOW_LINK" - // AscendVnpuSpescEnv env variable - AscendVnpuSpescEnv = "ASCEND_VNPU_SPECS" - - // RunTimeLogDir dir path of runtime - RunTimeLogDir = "/var/log/ascend-docker-runtime/" - // HookRunLogPath run log path of hook - HookRunLogPath = "/var/log/ascend-docker-runtime/hook-run.log" - // InstallHelperRunLogPath run log path of install helper - InstallHelperRunLogPath = "/var/log/ascend-docker-runtime/install-helper-run.log" - // RunTimeRunLogPath run log path of runtime - RunTimeRunLogPath = "/var/log/ascend-docker-runtime/runtime-run.log" - - // RunTimeDConfigPath config path - RunTimeDConfigPath = "/etc/ascend-docker-runtime.d" -) - -// npu exporter -const ( - // DevicePathPattern device path pattern - DevicePathPattern = `^/dev/davinci\d+$` - // HccsBWProfilingTimeStr preset parameter name - HccsBWProfilingTimeStr = "hccsBWProfilingTime" - // Hccs log options domain value - Hccs = "hccs" - // Prefix pre statistic info - Prefix = "npu_chip_info_hccs_statistic_info_" - // BwPrefix pre bandwidth info - BwPrefix = "npu_chip_info_hccs_bandwidth_info_" - // AscendDeviceInfo - AscendDeviceInfo = "ASCEND_VISIBLE_DEVICES" -) - -const ( - // AscendJobKind is the kind name - AscendJobKind = "AscendJob" - // DefaultContainerName the default container name for AscendJob. - DefaultContainerName = "ascend" - // DefaultPortName is name of the port used to communicate between other process. - DefaultPortName = "ascendjob-port" - // ControllerName is the name of controller,used in log. 
- ControllerName = "ascendjob-controller" - // OperatorName name of operator - OperatorName = "ascend-operator" - // LogModuleName name of log module - LogModuleName = "hwlog" - // OperatorLogFilePath Operator log file name - OperatorLogFilePath = "/var/log/mindx-dl/ascend-operator/ascend-operator.log" -) diff --git a/mind-cluster/component/ascend-common/api/publicfault.go b/mind-cluster/component/ascend-common/api/publicfault.go deleted file mode 100644 index 8561145..0000000 --- a/mind-cluster/component/ascend-common/api/publicfault.go +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. - -// Package api structs for public fault -package api - -// PubFaultInfo struct for public fault input -type PubFaultInfo struct { - Id string `json:"id"` - TimeStamp int64 `json:"timestamp"` - Version string `json:"version"` - Resource string `json:"resource"` - Faults []Fault `json:"faults"` -} - -// Fault public fault cm item Fault -type Fault struct { - FaultId string `json:"faultId"` - FaultType string `json:"faultType"` - FaultCode string `json:"faultCode"` - FaultTime int64 `json:"faultTime"` - Assertion string `json:"assertion"` - FaultLocation map[string]string `json:"faultLocation"` - Influence []Influence `json:"influence"` - Description string `json:"description"` -} - -// Influence public fault cm item Influence -type Influence struct { - NodeName string `json:"nodeName"` - NodeSN string `json:"nodeSN"` - DeviceIds []int32 `json:"deviceIds"` -} diff --git a/mind-cluster/component/ascend-common/api/slownet/fault_net.go b/mind-cluster/component/ascend-common/api/slownet/fault_net.go deleted file mode 100644 index eacde6a..0000000 --- a/mind-cluster/component/ascend-common/api/slownet/fault_net.go +++ /dev/null @@ -1,77 +0,0 @@ -// Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. 
- -// Package slownet for net fault detect common -package slownet - -import ( - "fmt" - "os" - "path/filepath" - - "ascend-common/common-utils/hwlog" - "ascend-common/common-utils/utils" -) - -const ( - rasNetRootPathKey = "RAS_NET_ROOT_PATH" - netFaultSubPath = "cluster" - detectConf = "cathelper.conf" -) - -// GetRasNetRootPath get ras net fault detect root path from env -func GetRasNetRootPath() (string, error) { - rootPath := os.Getenv(rasNetRootPathKey) - if len(rootPath) == 0 { - return "", fmt.Errorf("env %s not exists, please config it before starting", rasNetRootPathKey) - } - if !utils.IsDir(rootPath) { - return "", fmt.Errorf("env %s=%s, which is not dir", rasNetRootPathKey, rootPath) - } - safeRootPath, err := utils.CheckPath(rootPath) - if err != nil { - return "", fmt.Errorf("env %s=%s, which is invalid, err: %v", rasNetRootPathKey, rootPath, err) - } - return safeRootPath, nil -} - -// GetPingListFilePath get ping list task info file for ping mesh -func GetPingListFilePath(superPodId, serverIndex string) (string, error) { - rootPath, err := GetRasNetRootPath() - if err != nil { - return "", err - } - return filepath.Join(rootPath, netFaultSubPath, fmt.Sprintf("super-pod-%s", superPodId), - fmt.Sprintf("ping_list_%s.json", serverIndex)), nil -} - -// GetSuperPodInfoFilePath get super pod info file path -func GetSuperPodInfoFilePath(superPodID, superPodPrefix string) (string, error) { - rootPath, err := GetRasNetRootPath() - if err != nil { - hwlog.RunLog.Errorf("get ras net root path failed, err : %v", err) - return "", err - } - superPodPathName := fmt.Sprintf("%s-%s", superPodPrefix, superPodID) - fileName := fmt.Sprintf("%s.json", superPodPathName) - filePath := filepath.Join(rootPath, netFaultSubPath, superPodPathName, fileName) - if _, errInfo := utils.CheckPath(filePath); errInfo != nil { - hwlog.RunLog.Errorf("file path is invalid, err: %v", errInfo) - return "", errInfo - } - return filePath, nil -} - -// GetConfigPathForDetect the config 
path for network fault detect so -func GetConfigPathForDetect(superPodId string) (string, error) { - rasNetRootPath, err := GetRasNetRootPath() - if err != nil { - hwlog.RunLog.Errorf("get ras net root path failed, err: %v", err) - return "", err - } - confPath := filepath.Join(rasNetRootPath, netFaultSubPath, fmt.Sprintf("super-pod-%s", superPodId), detectConf) - if _, errInfo := utils.CheckPath(confPath); errInfo != nil { - hwlog.RunLog.Errorf("file path is invalid, err: %v", errInfo) - return "", errInfo - } - return confPath, nil -} diff --git a/mind-cluster/component/ascend-common/api/superpoddevice.go b/mind-cluster/component/ascend-common/api/superpoddevice.go deleted file mode 100644 index 4039dcb..0000000 --- a/mind-cluster/component/ascend-common/api/superpoddevice.go +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. - -// Package api structs for SuperPodDevice -package api - -import "k8s.io/apimachinery/pkg/util/sets" - -// NpuBaseInfo is the base info of npu -type NpuBaseInfo struct { - IP string - SuperDeviceID uint32 -} - -// NodeDevice node device info -type NodeDevice struct { - NodeName string - ServerID string - ServerType string `json:"-"` - DeviceMap map[string]string // key: dev phyID, value: superPod device id -} - -// SuperPodDevice super node device info, key is superPodID, value is NodeDevice -type SuperPodDevice struct { - Version string - SuperPodID string - NodeDeviceMap map[string]*NodeDevice -} - -// SuperPodFaultInfos super pod fault info -type SuperPodFaultInfos struct { - SdIds []string - FaultNodes sets.String - NodeNames []string - FaultTimes int64 - JobId string `json:"JobId,omitempty"` -} diff --git a/mind-cluster/component/ascend-common/api/type.go b/mind-cluster/component/ascend-common/api/type.go deleted file mode 100644 index 9a2cde1..0000000 --- a/mind-cluster/component/ascend-common/api/type.go +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright (c) Huawei Technologies 
Co., Ltd. 2025-2025. All rights reserved. - -// Package api common const -package api - -// ResetCmInfo is the reset config info of a task -type ResetCmInfo struct { - RankList []*DevFaultnfo - UpdateTime int64 - RetryTime int - FaultFlushing bool - GracefulExit int - RestartFaultProcess bool -} - -// DevFaultnfo is the device info of a task -type DevFaultnfo struct { - RankId int - FaultInfo -} - -// FaultInfo is the fault info of device -type FaultInfo struct { - LogicId int32 - Status string - Policy string - InitialPolicy string - ErrorCode []int64 - ErrorCodeHex string -} diff --git a/mind-cluster/component/ascend-common/common-utils/cache/lrucache.go b/mind-cluster/component/ascend-common/common-utils/cache/lrucache.go deleted file mode 100644 index 0c0d420..0000000 --- a/mind-cluster/component/ascend-common/common-utils/cache/lrucache.go +++ /dev/null @@ -1,394 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package cache implement a memory-based LRU local cache -package cache - -import ( - "container/list" - "errors" - "fmt" - "math" - "sync" - "time" -) - -const ( - segmentCount = 16 - int64One int64 = 1 - int64Zero int64 = 0 - negInt64One int64 = -1 - intTwo = 2 - hashInit uint32 = 2166136261 - prime32 uint32 = 16777619 - twentyYears time.Duration = 20 * 365 * 24 * time.Hour -) - -var ( - notInitErr = errors.New("not initializes") - paraErr = errors.New("parameter error") -) - -type cacheEle struct { - key string - data interface{} - expireTime int64 -} - -type lruCache struct { - maxSize int - elemIndex map[string]*list.Element - *list.List - mu sync.Mutex -} - -// ConcurrencyLRUCache is a memory-based LRU local cache, default total 16 segment to improve concurrent performance -// LRU is not real least recently used for the total cache,but just for each buket -// we just need a proper method to clear cache -type ConcurrencyLRUCache struct { - segment int - cacheBuket [segmentCount]*lruCache -} - -// Set create or update an element using key -// key: The identity of an element -// value: new value of the element -// expireTime: expire time, positive int64 or -1 which means never overdue -func (cl *ConcurrencyLRUCache) Set(key string, value interface{}, expireTime time.Duration) error { - if cl == nil || cl.cacheBuket[0] == nil { - return notInitErr - } - if expireTime < time.Duration(negInt64One) || expireTime > twentyYears { - return paraErr - } - cacheIndex := cl.index(key) - if cacheIndex < 0 || cacheIndex >= segmentCount { - return errors.New("index out of valid value") - } - return cl.cacheBuket[cacheIndex].setValue(key, value, expireTime) -} - -// Get get the value of a cached element by key. 
If key do not exist, this function will return nil and an error msg -// key: The identity of an element -// return: -// value: the cached value, nil if key do not exist -// err: error info, nil if value is not nil -func (cl *ConcurrencyLRUCache) Get(key string) (interface{}, error) { - if cl == nil || cl.cacheBuket[0] == nil { - return nil, notInitErr - } - cacheIndex := cl.index(key) - if cacheIndex < 0 || cacheIndex >= segmentCount { - return nil, errors.New("index out of valid value") - } - return cl.cacheBuket[cacheIndex].getValue(key) -} - -// Delete delete the value by key, no error returned -func (cl *ConcurrencyLRUCache) Delete(key string) { - if cl == nil || cl.cacheBuket[0] == nil { - return - } - cacheIndex := cl.index(key) - if cacheIndex < 0 || cacheIndex >= segmentCount { - return - } - cl.cacheBuket[cacheIndex].delValue(key) -} - -// SetIfNX if the key not exist or expired, will set the new value to cache and return true ,otherwise return false -func (cl *ConcurrencyLRUCache) SetIfNX(key string, value interface{}, expireTime time.Duration) bool { - if cl == nil || cl.cacheBuket[0] == nil { - return false - } - if expireTime < time.Duration(negInt64One) || expireTime > twentyYears { - return false - } - cacheIndex := cl.index(key) - if cacheIndex < 0 || cacheIndex >= segmentCount { - return false - } - return cl.cacheBuket[cacheIndex].setIfNotExist(key, value, expireTime) -} - -// INCR add one to the value(must int64) of the key , if the key not exist, initialize with 0 and then add one -func (cl *ConcurrencyLRUCache) INCR(key string, expireTime time.Duration) (int64, error) { - if err := validate(cl, expireTime); err != nil { - return 0, err - } - cacheIndex := cl.index(key) - if cacheIndex < 0 || cacheIndex >= segmentCount { - return 0, errors.New("index out of valid value") - } - return cl.cacheBuket[cacheIndex].increment(key, expireTime) -} - -// DECR minus one to the value(must int64) of the key,if the key not exist, initialize with 0 and then 
minus one -func (cl *ConcurrencyLRUCache) DECR(key string, expireTime time.Duration) (int64, error) { - if err := validate(cl, expireTime); err != nil { - return 0, err - } - cacheIndex := cl.index(key) - if cacheIndex < 0 || cacheIndex >= segmentCount { - return 0, errors.New("index out of valid value") - } - return cl.cacheBuket[cacheIndex].decrement(key, expireTime) -} - -func validate(cl *ConcurrencyLRUCache, expireTime time.Duration) error { - if cl == nil || cl.cacheBuket[0] == nil { - return paraErr - } - if expireTime <= 0 && expireTime != time.Duration(negInt64One) { - return paraErr - } - return nil -} - -// index calculate the key hashcode and index the right buket -func (cl *ConcurrencyLRUCache) index(key string) int { - var hash = hashInit - for i := 0; i < len(key); i++ { - hash *= prime32 - hash ^= uint32(key[i]) - } - return int(hash & (uint32(cl.segment) - 1)) -} - -// New create an instance of ConcurrencyLRUCache -// maxEntries the cache size, will to convert to (n/16+n%16>0?1:0)*16 -func New(maxEntries int) *ConcurrencyLRUCache { - if maxEntries <= 0 { - return nil - } - size := maxEntries / segmentCount - remain := maxEntries % segmentCount - if remain > 0 { - size += 1 - } - var cache [segmentCount]*lruCache - for i := 0; i < segmentCount; i++ { - cache[i] = &lruCache{ - maxSize: size, - elemIndex: make(map[string]*list.Element, segmentCount), - List: list.New(), - mu: sync.Mutex{}, - } - } - return &ConcurrencyLRUCache{ - segment: segmentCount, - cacheBuket: cache, - } -} - -func (c *lruCache) setValue(key string, value interface{}, expireTime time.Duration) error { - if c == nil || c.elemIndex == nil { - return errors.New("not initializes") - } - c.mu.Lock() - defer c.mu.Unlock() - v, ok := c.elemIndex[key] - if !ok { - // if the cache not exist - c.setInner(key, value, expireTime) - return nil - } - ele, ok := v.Value.(*cacheEle) - if !ok { - c.safeDeleteByKey(key, v) - return errors.New("cacheElement convert failed") - } - c.MoveToFront(v) 
- pkgElement(ele, value, expireTime) - return nil -} - -func pkgElement(ele *cacheEle, value interface{}, expireTime time.Duration) { - ele.data = value - if expireTime == time.Duration(negInt64One) { - ele.expireTime = negInt64One - return - } - ele.expireTime = time.Now().UnixNano() + int64(expireTime) -} - -func (c *lruCache) getValue(key string) (interface{}, error) { - if c == nil || c.elemIndex == nil { - return nil, errors.New("not initializes") - } - c.mu.Lock() - defer c.mu.Unlock() - v, ok := c.elemIndex[key] - if !ok { - return nil, errors.New("no value found") - } - c.MoveToFront(v) - ele, ok := v.Value.(*cacheEle) - if !ok { - c.safeDeleteByKey(key, v) - return nil, errors.New("cacheElement convert failed") - } - if ele.expireTime != negInt64One && time.Now().UnixNano() > ele.expireTime { - // if cache expired - c.safeDeleteByKey(key, v) - return nil, errors.New("the key was expired") - } - return ele.data, nil -} - -// Delete delete an element -func (c *lruCache) delValue(key string) { - if c == nil || c.elemIndex == nil { - return - } - c.mu.Lock() - defer c.mu.Unlock() - if v, ok := c.elemIndex[key]; ok { - c.safeDeleteByKey(key, v) - } -} - -func (c *lruCache) setIfNotExist(key string, value interface{}, expireTime time.Duration) bool { - if c == nil || c.elemIndex == nil { - return false - } - c.mu.Lock() - defer c.mu.Unlock() - v, ok := c.elemIndex[key] - if !ok { - // if the cache not exist - c.setInner(key, value, expireTime) - return true - } - ele, ok := v.Value.(*cacheEle) - if !ok { - c.safeDeleteByKey(key, v) - return false - } - c.MoveToFront(v) - if ele.expireTime == negInt64One || time.Now().UnixNano() < ele.expireTime { - return false - } - // if cache expired - pkgElement(ele, value, expireTime) - return true -} - -func (c *lruCache) increment(key string, expireTime time.Duration) (int64, error) { - if c == nil || c.elemIndex == nil { - return 0, notInitErr - } - c.mu.Lock() - defer c.mu.Unlock() - v, ok := c.elemIndex[key] - if !ok { 
- c.setInner(key, int64One, expireTime) - return int64One, nil - } - ele, ok := v.Value.(*cacheEle) - if !ok { - c.safeDeleteByKey(key, v) - c.setInner(key, int64One, expireTime) - return int64One, nil - } - c.MoveToFront(v) - if ele.expireTime == negInt64One || time.Now().UnixNano() < ele.expireTime { - newValue, ok := ele.data.(int64) - if !ok || newValue == math.MaxInt64 { - return 0, fmt.Errorf("the cache value is not valid, ok:%v", ok) - } - newValue++ - pkgElement(ele, newValue, expireTime) - return newValue, nil - } - // if cache expired - pkgElement(ele, int64One, expireTime) - return int64One, nil -} - -func (c *lruCache) decrement(key string, expireTime time.Duration) (int64, error) { - if c == nil || c.elemIndex == nil { - return 0, notInitErr - } - c.mu.Lock() - defer c.mu.Unlock() - v, ok := c.elemIndex[key] - if !ok { - // if the cache not exist - c.setInner(key, negInt64One, expireTime) - return negInt64One, nil - } - ele, ok := v.Value.(*cacheEle) - if !ok { - c.safeDeleteByKey(key, v) - c.setInner(key, negInt64One, expireTime) - return negInt64One, nil - } - c.MoveToFront(v) - if ele.expireTime == negInt64One || time.Now().UnixNano() < ele.expireTime { - newValue, ok := ele.data.(int64) - if !ok || newValue == math.MinInt64 { - return 0, fmt.Errorf("the cache value is not valid, ok:%v", ok) - } - newValue-- - pkgElement(ele, newValue, expireTime) - return newValue, nil - } - // if cache expired - pkgElement(ele, negInt64One, expireTime) - return negInt64One, nil -} - -func (c *lruCache) setInner(key string, value interface{}, expireTime time.Duration) { - if c == nil { - return - } - if c.Len()+1 > c.maxSize { - c.safeRemoveOldest() - } - newElem := &cacheEle{ - key: key, - data: value, - expireTime: negInt64One, - } - if expireTime != time.Duration(negInt64One) { - newElem.expireTime = time.Now().UnixNano() + int64(expireTime) - } - e := c.PushFront(newElem) - c.elemIndex[key] = e -} - -func (c *lruCache) safeDeleteByKey(key string, v 
*list.Element) { - if c == nil { - return - } - c.List.Remove(v) - delete(c.elemIndex, key) -} - -func (c *lruCache) safeRemoveOldest() { - if c == nil { - return - } - v := c.List.Back() - if v == nil { - return - } - c.List.Remove(v) - ele, ok := v.Value.(*cacheEle) - if !ok { - return - } - delete(c.elemIndex, ele.key) -} diff --git a/mind-cluster/component/ascend-common/common-utils/cache/lrucache_test.go b/mind-cluster/component/ascend-common/common-utils/cache/lrucache_test.go deleted file mode 100644 index a8b5ea0..0000000 --- a/mind-cluster/component/ascend-common/common-utils/cache/lrucache_test.go +++ /dev/null @@ -1,304 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package cache implement a memory-based LRU local cache -package cache - -import ( - "container/list" - "fmt" - "math" - "sync" - "testing" - "time" - - "github.com/smartystreets/goconvey/convey" -) - -const ( - cacheTime = 500 - goRoutineCount = 10 -) - -func TestSet(t *testing.T) { - cache := New(1) - convey.Convey("test lru cacheTime", t, func() { - cache.Set("testkey1", "1", cacheTime*time.Millisecond) - v, err := cache.Get("testkey1") - convey.So(err, convey.ShouldEqual, nil) - convey.So(v, convey.ShouldEqual, "1") - time.Sleep(cacheTime * time.Millisecond) - v, err = cache.Get("testkey1") - convey.So(v, convey.ShouldEqual, nil) - convey.So(err, convey.ShouldNotEqual, nil) - }) - convey.Convey("test set twice", t, func() { - cache.Set("testkey1", "1", time.Minute) - v, err := cache.Get("testkey1") - convey.So(err, convey.ShouldEqual, nil) - convey.So(v, convey.ShouldEqual, "1") - cache.Set("testkey1", "2", time.Minute) - v, err = cache.Get("testkey1") - convey.So(err, convey.ShouldEqual, nil) - convey.So(v, convey.ShouldEqual, "2") - }) - convey.Convey("SET failed", t, func() { - c := &lruCache{} - err := c.setValue("test", "1", time.Minute) - convey.So(err.Error(), convey.ShouldEqual, "not initializes") - _, err = c.getValue("test") - convey.So(err.Error(), convey.ShouldEqual, "not initializes") - }) - convey.Convey("SET not expired", t, func() { - cache.Set("testkey2", "1", time.Second) - err := cache.Set("testkey2", "1", time.Duration(negInt64One)) - convey.So(err, convey.ShouldEqual, nil) - v, err := cache.Get("testkey2") - convey.So(err, convey.ShouldEqual, nil) - convey.So(v, convey.ShouldEqual, "1") - }) - convey.Convey("SET parameter error", t, func() { - err := cache.Set("testkey2", "1", -time.Second) - convey.So(err.Error(), convey.ShouldEqual, "parameter error") - }) -} - -func TestDelete(t *testing.T) { - cache := New(1) - convey.Convey("test lru delete", t, func() { - cache.Set("testkey1", "1", time.Minute) - v, err := 
cache.Get("testkey1") - convey.So(err, convey.ShouldEqual, nil) - convey.So(v, convey.ShouldEqual, "1") - cache.Delete("testkey1") - v, err = cache.Get("testkey1") - convey.So(v, convey.ShouldEqual, nil) - convey.So(err, convey.ShouldNotEqual, nil) - }) - convey.Convey("Delete no thing", t, func() { - c := &lruCache{} - c.delValue("test") - }) -} - -func TestSetIfNX(t *testing.T) { - cache := New(1) - convey.Convey("SetIfNX set parameter error", t, func() { - r := cache.SetIfNX("testkey1", "1", -time.Millisecond) - convey.So(r, convey.ShouldEqual, false) - }) - convey.Convey("SetIfNX set success", t, func() { - r := cache.SetIfNX("testkey1", "1", cacheTime*time.Millisecond) - convey.So(r, convey.ShouldEqual, true) - }) - convey.Convey("SetIfNX set success failed", t, func() { - r := cache.SetIfNX("testkey1", "1", cacheTime*time.Millisecond) - convey.So(r, convey.ShouldEqual, false) - }) - time.Sleep(cacheTime * time.Millisecond) - convey.Convey("SetIfNX set success", t, func() { - r := cache.SetIfNX("testkey1", "1", time.Second) - convey.So(r, convey.ShouldEqual, true) - }) - convey.Convey("SetIfNX expireTime -1", t, func() { - r := cache.SetIfNX("testkey", "1", time.Duration(negInt64One)) - convey.So(r, convey.ShouldEqual, true) - r = cache.SetIfNX("testkey", "1", time.Duration(negInt64One)) - convey.So(r, convey.ShouldEqual, false) - }) - -} - -func TestSetIfNXConcurrencyTest(t *testing.T) { - cache := New(1) - convey.Convey("SetIfNX concurrency test", t, func() { - var count = 0 - count = testSetIfNX(cache, count) - convey.So(count, convey.ShouldEqual, 1) - }) -} - -func testSetIfNX(cache *ConcurrencyLRUCache, count int) int { - l := sync.Mutex{} - wg := sync.WaitGroup{} - wg.Add(goRoutineCount) - for i := 0; i < goRoutineCount; i++ { - go func() { - r := cache.SetIfNX("testkey2", "1", time.Second) - if r { - l.Lock() - count++ - l.Unlock() - } - wg.Done() - }() - } - wg.Wait() - return count -} - -func TestINCRConcurrencyTest(t *testing.T) { - cache := New(1) - 
convey.Convey("INCR concurrency test", t, func() { - max := testIncr(cache) - convey.So(max, convey.ShouldEqual, goRoutineCount) - }) -} - -func testIncr(cache *ConcurrencyLRUCache) int64 { - var max = int64Zero - l := sync.Mutex{} - wg := sync.WaitGroup{} - wg.Add(goRoutineCount) - for i := 0; i < goRoutineCount; i++ { - go func() { - r, err := cache.INCR("testkey1", time.Second) - if err != nil { - return - } - l.Lock() - if r > max { - max = r - } - l.Unlock() - wg.Done() - }() - } - wg.Wait() - return max -} - -func TestDECRConcurrencyTest(t *testing.T) { - cache := New(1) - cache.Set("testkey1", int64(goRoutineCount), time.Minute) - convey.Convey("INCR concurrency test", t, func() { - min := testDecr(cache) - convey.So(min, convey.ShouldEqual, 0) - }) -} - -func testDecr(cache *ConcurrencyLRUCache) int64 { - var min = int64(math.MaxInt) - l := sync.Mutex{} - wg := sync.WaitGroup{} - wg.Add(goRoutineCount) - for i := 0; i < goRoutineCount; i++ { - go func() { - r, err := cache.DECR("testkey1", time.Second) - if err != nil { - return - } - l.Lock() - if r < min { - min = r - } - l.Unlock() - wg.Done() - }() - } - wg.Wait() - return min -} - -func TestINCR(t *testing.T) { - cache := New(1) - convey.Convey("not initializes", t, func() { - c := &lruCache{} - _, err := c.increment("test", time.Minute) - convey.So(err, convey.ShouldEqual, notInitErr) - }) - convey.Convey("parameter error", t, func() { - _, err := cache.INCR("testkey", -time.Minute) - convey.So(err, convey.ShouldEqual, paraErr) - }) - convey.Convey("INCR success", t, func() { - r, err := cache.INCR("testkey", time.Minute) - convey.So(r, convey.ShouldEqual, 1) - convey.So(err, convey.ShouldEqual, nil) - r, err = cache.INCR("testkey", time.Minute) - convey.So(r, convey.ShouldEqual, intTwo) - }) - - convey.Convey("INCR success when exits", t, func() { - cache.Set("testkey1", int64Zero, cacheTime*time.Millisecond) - r, err := cache.INCR("testkey1", cacheTime*time.Millisecond) - convey.So(r, 
convey.ShouldEqual, 1) - convey.So(err, convey.ShouldEqual, nil) - time.Sleep(cacheTime * time.Millisecond) - r, err = cache.INCR("testkey1", time.Minute) - convey.So(r, convey.ShouldEqual, 1) - }) -} - -func TestDECR(t *testing.T) { - cache := New(1) - convey.Convey("not initializes", t, func() { - c := &lruCache{} - _, err := c.decrement("test", time.Minute) - convey.So(err, convey.ShouldEqual, notInitErr) - }) - convey.Convey("parameter error", t, func() { - _, err := cache.DECR("testkey1", -time.Minute) - convey.So(err, convey.ShouldEqual, paraErr) - }) - convey.Convey("SetIfNX set success", t, func() { - r, err := cache.DECR("testkey1", time.Minute) - convey.So(r, convey.ShouldEqual, negInt64One) - convey.So(err, convey.ShouldEqual, nil) - cache.Set("testkey1", int64One, time.Minute) - r, err = cache.DECR("testkey1", time.Minute) - convey.So(r, convey.ShouldEqual, 0) - convey.So(err, convey.ShouldEqual, nil) - }) - convey.Convey("Decr success when exits", t, func() { - cache.Set("testkey2", int64One, cacheTime*time.Millisecond) - r, err := cache.DECR("testkey2", cacheTime*time.Millisecond) - convey.So(r, convey.ShouldEqual, 0) - convey.So(err, convey.ShouldEqual, nil) - time.Sleep(cacheTime * time.Millisecond) - r, err = cache.DECR("testkey2", time.Minute) - convey.So(err, convey.ShouldEqual, nil) - convey.So(r, convey.ShouldEqual, negInt64One) - }) -} - -func TestLRU(t *testing.T) { - convey.Convey("not initializes", t, func() { - c := &lruCache{ - maxSize: intTwo, - elemIndex: make(map[string]*list.Element, segmentCount), - List: list.New(), - mu: sync.Mutex{}, - } - c.setValue("test", "1", time.Minute) - c.setValue("test1", "1", time.Minute) - c.setValue("test2", "1", time.Minute) - _, err := c.getValue("test") - convey.So(err.Error(), convey.ShouldEqual, "no value found") - }) -} - -func BenchmarkSetIfNx(b *testing.B) { - cache := New(1) - for n := 0; n < b.N; n++ { - cache.SetIfNX(fmt.Sprintf("key%d", n), "xx", time.Second) - } -} - -func BenchmarkINCR(b 
*testing.B) { - cache := New(1) - for n := 0; n < b.N; n++ { - cache.INCR("sdds", time.Second) - } -} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/api.go b/mind-cluster/component/ascend-common/common-utils/hwlog/api.go deleted file mode 100644 index 65de3e7..0000000 --- a/mind-cluster/component/ascend-common/common-utils/hwlog/api.go +++ /dev/null @@ -1,310 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package hwlog provides the capability of processing Huawei log rules. 
-package hwlog - -import ( - "context" - "fmt" - "io" - "log" - "os" - "path" -) - -const ( - logDebugLv = iota - 1 - logInfoLv - logWarnLv - logErrorLv - logCriticalLv -) - -type logger struct { - lgDebug *log.Logger - lgInfo *log.Logger - lgWarn *log.Logger - lgError *log.Logger - lgCritical *log.Logger - lgCtrl *LogLimiter - lgLevel int - lgMaxLine int -} - -func (lg *logger) initLogWriter(w io.Writer) { - lg.lgDebug = log.New(w, "[DEBUG] ", log.Ldate|log.Lmicroseconds) - lg.lgInfo = log.New(w, "[INFO] ", log.Ldate|log.Lmicroseconds) - lg.lgWarn = log.New(w, "[WARN] ", log.Ldate|log.Lmicroseconds) - lg.lgError = log.New(w, "[ERROR] ", log.Ldate|log.Lmicroseconds) - lg.lgCritical = log.New(w, "[Critical] ", log.Ldate|log.Lmicroseconds) -} - -func (lg *logger) setLoggerLevel(lv int) { - if lv < minLogLevel || lv > maxLogLevel { - lg.lgLevel = 0 - return - } - lg.lgLevel = lv -} - -func (lg *logger) setLoggerMaxLine(lml int) { - if lml <= 0 || lml > maxEachLineLen { - lg.lgMaxLine = defaultMaxEachLineLen - return - } - lg.lgMaxLine = lml -} - -func (lg *logger) setLoggerWriter(config *LogConfig) { - rollLogger := &Logs{ - FileName: config.LogFileName, - Capacity: config.FileMaxSize, // megabytes - SaveVolume: config.MaxBackups, - SaveTime: config.MaxAge, // days - } - logWriter := &LogLimiter{ - Logs: rollLogger, - ExpiredTime: config.ExpiredTime, // seconds - CacheSize: config.CacheSize, - } - if config.OnlyToStdout { - lg.initLogWriter(os.Stdout) - return - } - if config.OnlyToFile { - lg.initLogWriter(logWriter) - return - } - writer := io.MultiWriter(os.Stdout, logWriter) - lg.initLogWriter(writer) - lg.lgCtrl = logWriter -} - -func (lg *logger) setLogger(config *LogConfig) error { - if err := validateLogConfigFiled(config); err != nil { - return err - } - lg.setLoggerWriter(config) - lg.setLoggerLevel(config.LogLevel) - lg.setLoggerMaxLine(config.MaxLineLength) - msg := fmt.Sprintf("%s's logger init success", path.Base(config.LogFileName)) - // skip change 
file mode and fs notify - if config.OnlyToStdout { - msg = fmt.Sprintf("%s, only to stdout", msg) - return nil - } - lg.Info(msg) - if err := os.Chmod(config.LogFileName, LogFileMode); err != nil { - lg.Errorf("change file mode failed: %v", err) - return fmt.Errorf("set log file mode failed") - } - return nil -} - -func (lg *logger) isInit() bool { - return lg.lgDebug != nil && lg.lgInfo != nil && lg.lgWarn != nil && lg.lgError != nil && lg.lgCritical != nil -} - -// Debug record debug not format -func (lg *logger) Debug(args ...interface{}) { - lg.DebugWithCtx(nil, args...) -} - -// Debugf record debug -func (lg *logger) Debugf(format string, args ...interface{}) { - lg.DebugfWithCtx(nil, format, args...) -} - -// DebugWithCtx record Debug not format -func (lg *logger) DebugWithCtx(ctx context.Context, args ...interface{}) { - if lg.lgLevel > logDebugLv { - return - } - if lg.validate() { - printHelper(lg.lgDebug, fmt.Sprint(args...), lg.lgMaxLine, ctx) - } -} - -// DebugfWithCtx record Debug format -func (lg *logger) DebugfWithCtx(ctx context.Context, format string, args ...interface{}) { - if lg.lgLevel > logDebugLv { - return - } - if lg.validate() { - printHelper(lg.lgDebug, fmt.Sprintf(format, args...), lg.lgMaxLine, ctx) - } -} - -// Info record info not format -func (lg *logger) Info(args ...interface{}) { - lg.InfoWithCtx(nil, args...) -} - -// Infof record info -func (lg *logger) Infof(format string, args ...interface{}) { - lg.InfofWithCtx(nil, format, args...) 
-} - -// InfoWithCtx record Info not format with context, if you have no ctx, please use the method with not ctx -func (lg *logger) InfoWithCtx(ctx context.Context, args ...interface{}) { - if lg.lgLevel > logInfoLv { - return - } - if lg.validate() { - printHelper(lg.lgInfo, fmt.Sprint(args...), lg.lgMaxLine, ctx) - } -} - -// InfofWithCtx record Info format with context, if you have no ctx, please use the method with not ctx -func (lg *logger) InfofWithCtx(ctx context.Context, format string, args ...interface{}) { - if lg.lgLevel > logInfoLv { - return - } - if lg.validate() { - printHelper(lg.lgInfo, fmt.Sprintf(format, args...), lg.lgMaxLine, ctx) - } -} - -// Warn record warn not format -func (lg *logger) Warn(args ...interface{}) { - lg.WarnWithCtx(nil, args...) -} - -// Warnf record warn -func (lg *logger) Warnf(format string, args ...interface{}) { - lg.WarnfWithCtx(nil, format, args...) -} - -// WarnWithCtx record Warn not format with context, if you have no ctx, please use the method with not ctx -func (lg *logger) WarnWithCtx(ctx context.Context, args ...interface{}) { - if lg.lgLevel > logWarnLv { - return - } - if lg.validate() { - printHelper(lg.lgWarn, fmt.Sprint(args...), lg.lgMaxLine, ctx) - } -} - -// WarnfWithCtx record Warn format with context, if you have no ctx, please use the method with not ctx -func (lg *logger) WarnfWithCtx(ctx context.Context, format string, args ...interface{}) { - if lg.lgLevel > logWarnLv { - return - } - if lg.validate() { - printHelper(lg.lgWarn, fmt.Sprintf(format, args...), lg.lgMaxLine, ctx) - } -} - -// WarnfWithLimit record warn for default times (default 3),domain is for logType of msg, -// id is a unique identifier of this logType, you can reset the counter by call ResetErrCnt -func (lg *logger) WarnfWithLimit(domain string, id interface{}, format string, args ...interface{}) { - if needPrint, extraErrLog := IsNeedPrintWithSpecifiedCounts(domain, id, ProblemOccurMaxNumbers); needPrint { - format = 
fmt.Sprintf("%s %s", format, extraErrLog) - lg.WarnfWithCtx(nil, format, args...) - } -} - -// Error record error not format -func (lg *logger) Error(args ...interface{}) { - lg.ErrorWithCtx(nil, args...) -} - -// Errorf record error -func (lg *logger) Errorf(format string, args ...interface{}) { - lg.ErrorfWithCtx(nil, format, args...) -} - -// ErrorfWithLimit record error for default times (default 3),domain is for logType of msg, -// id is a unique identifier of this logType, you can reset the counter by call ResetErrCnt -func (lg *logger) ErrorfWithLimit(domain string, id interface{}, format string, args ...interface{}) { - if needPrint, extraErrLog := IsNeedPrintWithSpecifiedCounts(domain, id, ProblemOccurMaxNumbers); needPrint { - format = fmt.Sprintf("%s %s", format, extraErrLog) - lg.ErrorfWithCtx(nil, format, args...) - } -} - -// ErrorfWithSpecifiedCounts record error for specified times,domain is for logType of msg, -// id is a unique identifier of this logType,maxCounts is for max print counts, -// you can reset the counter by call ResetErrCnt -func (lg *logger) ErrorfWithSpecifiedCounts(domain string, id interface{}, maxCounts int, - format string, args ...interface{}) { - if needPrint, extraErrLog := IsNeedPrintWithSpecifiedCounts(domain, id, maxCounts); needPrint { - format = fmt.Sprintf("%s %s", format, extraErrLog) - lg.ErrorfWithCtx(nil, format, args...) 
- } -} - -// ErrorWithCtx record Error not format with context, if you have no ctx, please use the method with not ctx -func (lg *logger) ErrorWithCtx(ctx context.Context, args ...interface{}) { - if lg.lgLevel > logErrorLv { - return - } - if lg.validate() { - printHelper(lg.lgError, fmt.Sprint(args...), lg.lgMaxLine, ctx) - } -} - -// ErrorfWithCtx record Error format with context, if you have no ctx, please use the method with not ctx -func (lg *logger) ErrorfWithCtx(ctx context.Context, format string, args ...interface{}) { - if lg.lgLevel > logErrorLv { - return - } - if lg.validate() { - printHelper(lg.lgError, fmt.Sprintf(format, args...), lg.lgMaxLine, ctx) - } -} - -// Critical record critical not format -func (lg *logger) Critical(args ...interface{}) { - lg.CriticalWithCtx(nil, args...) -} - -// Criticalf record Critical log format -func (lg *logger) Criticalf(format string, args ...interface{}) { - lg.CriticalfWithCtx(nil, format, args...) -} - -// CriticalWithCtx record Critical not format with context, if you have no ctx, please use the method with not ctx -func (lg *logger) CriticalWithCtx(ctx context.Context, args ...interface{}) { - if lg.lgLevel > logCriticalLv { - return - } - if lg.validate() { - printHelper(lg.lgCritical, fmt.Sprint(args...), lg.lgMaxLine, ctx) - } -} - -// CriticalfWithCtx record Critical format with context, if you have no ctx, please use the method with not ctx -func (lg *logger) CriticalfWithCtx(ctx context.Context, format string, args ...interface{}) { - if lg.lgLevel > logCriticalLv { - return - } - if lg.validate() { - printHelper(lg.lgCritical, fmt.Sprintf(format, args...), lg.lgMaxLine, ctx) - } -} - -func (lg *logger) validate() bool { - if lg == nil || !lg.isInit() { - fmt.Println("Fatal function's logger is nil") - return false - } - return true -} - -// FlushMem writes the contents of the memory to the disk -func (lg *logger) FlushMem() error { - return lg.lgCtrl.Flush() -} diff --git 
a/mind-cluster/component/ascend-common/common-utils/hwlog/api_test.go b/mind-cluster/component/ascend-common/common-utils/hwlog/api_test.go deleted file mode 100644 index ecdcef6..0000000 --- a/mind-cluster/component/ascend-common/common-utils/hwlog/api_test.go +++ /dev/null @@ -1,165 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package hwlog test file -package hwlog - -import ( - "fmt" - "io/fs" - "os" - "path" - "path/filepath" - "testing" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" - - "ascend-common/common-utils/utils" -) - -func TestNewLogger(t *testing.T) { - convey.Convey("test api", t, func() { - convey.Convey("test setLogger func", func() { - lgConfig := &LogConfig{ - OnlyToStdout: true, - } - lg := new(logger) - err := lg.setLogger(lgConfig) - convey.So(err, convey.ShouldBeNil) - // test for log file - mockPathCheck := gomonkey.ApplyFunc(utils.CheckPath, func(_ string) (string, error) { - return "", nil - }) - mockMkdir := gomonkey.ApplyFunc(os.Chmod, func(_ string, _ fs.FileMode) error { - return nil - }) - defer mockPathCheck.Reset() - defer mockMkdir.Reset() - lgConfig = &LogConfig{ - LogFileName: path.Join(filepath.Dir(os.Args[0]), "t.log"), - OnlyToFile: true, - MaxBackups: DefaultMaxBackups, - MaxAge: DefaultMinSaveAge, - CacheSize: DefaultCacheSize, - ExpiredTime: DefaultExpiredTime, - } - err = lg.setLogger(lgConfig) - 
convey.So(err, convey.ShouldBeNil) - }) - }) -} - -func TestLoggerPrint(t *testing.T) { - convey.Convey("test api", t, func() { - convey.Convey("test logger print func", func() { - lgConfig := &LogConfig{ - OnlyToStdout: true, - LogLevel: -1, - } - lg := new(logger) - err := lg.setLogger(lgConfig) - convey.So(err, convey.ShouldBeNil) - lg.Debug("test debug") - lg.Debugf("test debugf") - lg.Info("test info") - lg.Infof("test infof") - lg.Warn("test warn") - lg.Warnf("test warnf") - lg.Error("test error") - lg.Errorf("test errorf") - lg.Critical("test critical") - lg.Criticalf("test criticalf") - lg.setLoggerLevel(maxLogLevel + 1) - lg.Debug("test debug") - lg.Debugf("test debugf") - lg.Info("test info") - lg.Infof("test infof") - lg.Warn("test warn") - lg.Warnf("test warnf") - lg.Error("test error") - lg.Errorf("test errorf") - lg.Critical("test critical") - lg.Criticalf("test criticalf") - }) - }) -} -func TestLoggerPrintWithLimit(t *testing.T) { - convey.Convey("test api", t, func() { - convey.Convey("test logger print func with limit", func() { - lgConfig := &LogConfig{ - OnlyToStdout: true, - LogLevel: -1, - } - lg := new(logger) - err := lg.setLogger(lgConfig) - convey.So(err, convey.ShouldBeNil) - domain := "hccs" - logicId := 1 - - errFormat := "collect failed ,err:%v" - collectErr := fmt.Errorf("detail errs,logicId(%d)", logicId) - lg.ErrorfWithLimit(domain, logicId, errFormat, collectErr) - lg.ErrorfWithLimit(domain, logicId, errFormat, collectErr) - lg.ErrorfWithLimit(domain, logicId, errFormat, collectErr) - lg.ErrorfWithLimit(domain, logicId, errFormat, collectErr) - ResetErrCnt(domain, logicId) - lg.ErrorfWithLimit(domain, logicId, errFormat, collectErr) - lg.ErrorfWithLimit(domain, logicId, errFormat, collectErr) - }) - }) -} - -func TestWarnfWithLimit(t *testing.T) { - convey.Convey("test api", t, func() { - convey.Convey("test warn logger print func with limit", func() { - lgConfig := &LogConfig{ - OnlyToStdout: true, - LogLevel: -1, - } - lg := 
new(logger) - err := lg.setLogger(lgConfig) - convey.So(err, convey.ShouldBeNil) - domain := "hccs" - logicId := 1 - - errFormat := "collect failed ,err:%v" - collectErr := fmt.Errorf("detail errs,logicId(%d)", logicId) - lg.WarnfWithLimit(domain, logicId, errFormat, collectErr) - lg.WarnfWithLimit(domain, logicId, errFormat, collectErr) - lg.WarnfWithLimit(domain, logicId, errFormat, collectErr) - lg.WarnfWithLimit(domain, logicId, errFormat, collectErr) - ResetErrCnt(domain, logicId) - lg.WarnfWithLimit(domain, logicId, errFormat, collectErr) - lg.WarnfWithLimit(domain, logicId, errFormat, collectErr) - }) - }) -} - -func TestValidate(t *testing.T) { - convey.Convey("test api", t, func() { - convey.Convey("test validate", func() { - lg := new(logger) - res := lg.validate() - convey.So(res, convey.ShouldBeFalse) - lgConfig := &LogConfig{ - OnlyToStdout: true, - } - err := lg.setLogger(lgConfig) - convey.So(err, convey.ShouldBeNil) - res = lg.validate() - convey.So(res, convey.ShouldBeTrue) - }) - }) -} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/hwlog_adaptor.go b/mind-cluster/component/ascend-common/common-utils/hwlog/hwlog_adaptor.go deleted file mode 100644 index 5e5c567..0000000 --- a/mind-cluster/component/ascend-common/common-utils/hwlog/hwlog_adaptor.go +++ /dev/null @@ -1,174 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package hwlog provides the capability of processing Huawei log rules. -package hwlog - -import ( - "context" - "errors" -) - -// RunLog run logger -var RunLog *logger - -// InitRunLogger initialize run logger -func InitRunLogger(config *LogConfig, ctx context.Context) error { - if config == nil { - return errors.New("run logger config is nil") - } - if RunLog != nil && RunLog.isInit() { - RunLog.Warn("run logger is been initialized") - return nil - } - RunLog = new(logger) - if RunLog == nil { - return errors.New("malloc new logger flied") - } - if err := RunLog.setLogger(config); err != nil { - return err - } - if !RunLog.isInit() { - return errors.New("run logger init failed") - } - return nil -} - -// OpLog operate logger -var OpLog *logger - -// InitOperateLogger initialize operate logger -func InitOperateLogger(config *LogConfig, ctx context.Context) error { - if config == nil { - return errors.New("operate logger config is nil") - } - if OpLog != nil && OpLog.isInit() { - OpLog.Warn("operate logger is been initialized") - return nil - } - OpLog = new(logger) - if OpLog == nil { - return errors.New("malloc new logger flied") - } - if err := OpLog.setLogger(config); err != nil { - return err - } - if !OpLog.isInit() { - return errors.New("operate logger init failed") - } - return nil -} - -// SecLog security logger -var SecLog *logger - -// InitSecurityLogger initialize security logger -func InitSecurityLogger(config *LogConfig, ctx context.Context) error { - if config == nil { - return errors.New("security logger config is nil") - } - if SecLog != nil && SecLog.isInit() { - SecLog.Warn("security logger is been initialized") - return nil - } - SecLog = new(logger) - if SecLog == nil { - return errors.New("malloc new logger flied") - } - if err := SecLog.setLogger(config); err != nil { - return err - } - if !SecLog.isInit() { - return errors.New("security logger init failed") - } - return nil -} - -// UserLog user logger -var UserLog *logger - -// 
InitUserLogger initialize user logger -func InitUserLogger(config *LogConfig, ctx context.Context) error { - if config == nil { - return errors.New("user logger config is nil") - } - if UserLog != nil && UserLog.isInit() { - UserLog.Warn("user logger is been initialized") - return nil - } - UserLog = new(logger) - if UserLog == nil { - return errors.New("malloc new logger flied") - } - if err := UserLog.setLogger(config); err != nil { - return err - } - if !UserLog.isInit() { - return errors.New("user logger init failed") - } - return nil -} - -// DebugLog debug logger -var DebugLog *logger - -// InitDebugLogger initialize debug logger -func InitDebugLogger(config *LogConfig, ctx context.Context) error { - if config == nil { - return errors.New("debug logger config is nil") - } - if DebugLog != nil && DebugLog.isInit() { - DebugLog.Warn("debug logger is been initialized") - return nil - } - DebugLog = new(logger) - if DebugLog == nil { - return errors.New("malloc new logger flied") - } - if err := DebugLog.setLogger(config); err != nil { - return err - } - if !DebugLog.isInit() { - return errors.New("debug logger init failed") - } - return nil -} - -// CustomLogger custom logger -type CustomLogger struct { - *logger -} - -// NewCustomLogger create a new custom logger -func NewCustomLogger(config *LogConfig, ctx context.Context) (*CustomLogger, error) { - if config == nil { - return nil, errors.New("custom logger config is nil") - } - log := new(logger) - if err := log.setLogger(config); err != nil { - return nil, err - } - if !log.isInit() { - return nil, errors.New("logger init failed") - } - return &CustomLogger{logger: log}, nil -} - -// SetCustomLogger set custom logger -func SetCustomLogger(log *logger) *CustomLogger { - if log == nil { - return nil - } - return &CustomLogger{logger: log} -} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/hwlog_adaptor_test.go b/mind-cluster/component/ascend-common/common-utils/hwlog/hwlog_adaptor_test.go 
deleted file mode 100644 index a32e9be..0000000 --- a/mind-cluster/component/ascend-common/common-utils/hwlog/hwlog_adaptor_test.go +++ /dev/null @@ -1,126 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package hwlog test file -package hwlog - -import ( - "context" - "errors" - "testing" - - "github.com/smartystreets/goconvey/convey" -) - -func TestInitRunLogger(t *testing.T) { - convey.Convey("test hwlog adaptor", t, func() { - convey.Convey("test init run log", func() { - ctx, cancel := context.WithCancel(context.TODO()) - err := InitRunLogger(nil, ctx) - convey.So(err, convey.ShouldBeError, errors.New("run logger config is nil")) - lgConfig := &LogConfig{OnlyToStdout: true} - err = InitRunLogger(lgConfig, ctx) - convey.So(err, convey.ShouldBeNil) - // repeat initialize - err = InitRunLogger(lgConfig, ctx) - convey.So(err, convey.ShouldBeNil) - cancel() - }) - }) -} - -func TestNewCustomLogger(t *testing.T) { - convey.Convey("test hwlog adaptor", t, func() { - convey.Convey("test init custom log", func() { - ctx, cancel := context.WithCancel(context.TODO()) - _, err := NewCustomLogger(nil, ctx) - convey.So(err, convey.ShouldBeError, errors.New("custom logger config is nil")) - lgConfig := &LogConfig{OnlyToStdout: true} - _, err = NewCustomLogger(lgConfig, ctx) - convey.So(err, convey.ShouldBeNil) - // repeat initialize - _, err = NewCustomLogger(lgConfig, ctx) - convey.So(err, 
convey.ShouldBeNil) - cancel() - }) - }) -} - -func TestInitOperateLogger(t *testing.T) { - convey.Convey("test hwlog adaptor", t, func() { - convey.Convey("test init operate log", func() { - ctx, cancel := context.WithCancel(context.TODO()) - err := InitOperateLogger(nil, ctx) - convey.So(err, convey.ShouldBeError, errors.New("operate logger config is nil")) - lgConfig := &LogConfig{OnlyToStdout: true} - err = InitOperateLogger(lgConfig, ctx) - convey.So(err, convey.ShouldBeNil) - // repeat initialize - err = InitOperateLogger(lgConfig, ctx) - convey.So(err, convey.ShouldBeNil) - cancel() - }) - }) -} - -func TestInitSecurityLogger(t *testing.T) { - convey.Convey("test hwlog adaptor", t, func() { - convey.Convey("test init security log", func() { - ctx, cancel := context.WithCancel(context.TODO()) - err := InitSecurityLogger(nil, ctx) - convey.So(err, convey.ShouldBeError, errors.New("security logger config is nil")) - lgConfig := &LogConfig{OnlyToStdout: true} - err = InitSecurityLogger(lgConfig, ctx) - convey.So(err, convey.ShouldBeNil) - // repeat initialize - err = InitSecurityLogger(lgConfig, ctx) - convey.So(err, convey.ShouldBeNil) - cancel() - }) - }) -} - -func TestInitUserLogger(t *testing.T) { - convey.Convey("test hwlog adaptor", t, func() { - convey.Convey("test init user log", func() { - ctx, cancel := context.WithCancel(context.TODO()) - err := InitUserLogger(nil, ctx) - convey.So(err, convey.ShouldBeError, errors.New("user logger config is nil")) - lgConfig := &LogConfig{OnlyToStdout: true} - err = InitUserLogger(lgConfig, ctx) - convey.So(err, convey.ShouldBeNil) - // repeat initialize - err = InitUserLogger(lgConfig, ctx) - convey.So(err, convey.ShouldBeNil) - cancel() - }) - }) -} - -func TestInitDebugLogger(t *testing.T) { - convey.Convey("test hwlog adaptor", t, func() { - convey.Convey("test init debug log", func() { - ctx, cancel := context.WithCancel(context.TODO()) - err := InitDebugLogger(nil, ctx) - convey.So(err, convey.ShouldBeError, 
errors.New("debug logger config is nil")) - lgConfig := &LogConfig{OnlyToStdout: true} - err = InitDebugLogger(lgConfig, ctx) - convey.So(err, convey.ShouldBeNil) - // repeat initialize - err = InitDebugLogger(lgConfig, ctx) - convey.So(err, convey.ShouldBeNil) - cancel() - }) - }) -} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/log_limiter.go b/mind-cluster/component/ascend-common/common-utils/hwlog/log_limiter.go deleted file mode 100644 index 88cfb9d..0000000 --- a/mind-cluster/component/ascend-common/common-utils/hwlog/log_limiter.go +++ /dev/null @@ -1,156 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package hwlog provides the capability of processing Huawei log rules. 
-package hwlog - -import ( - "fmt" - "sync" - "time" - - "ascend-common/common-utils/cache" -) - -const ( - // MaxCacheSize indicates the maximum log cache size - MaxCacheSize = 100 * 1024 - // MaxExpiredTime indicates the maximum log cache expired time - MaxExpiredTime = 60 * 60 - // DefaultCacheSize indicates the default log cache size - DefaultCacheSize = 10 * 1024 - // DefaultExpiredTime indicates the default log cache expired time - DefaultExpiredTime = 1 - cutPreLen = 46 - // ProblemOccurMaxNumbers indicates the maximum number of times that the same problem can occur - ProblemOccurMaxNumbers = 3 -) - -var ( - errorMap sync.Map -) - -// LogLimiter encapsulates Logs and provides the log traffic limiting capability -// to prevent too many duplicate logs. -type LogLimiter struct { - // Logs is a log rotate instance - Logs *Logs - logCache *cache.ConcurrencyLRUCache - logMu sync.Mutex - doOnce sync.Once - - logExpiredTime time.Duration - // CacheSize indicates the size of log cache - CacheSize int - // ExpiredTime indicates the expired time of log cache - ExpiredTime int -} - -// Write implements io.Writer. It encapsulates the Write method of Los and uses -// the lru cache to prevent duplicate log writing. -func (l *LogLimiter) Write(d []byte) (int, error) { - if l == nil { - return 0, fmt.Errorf("log limiter pointer does not exist") - } - - l.logMu.Lock() - defer l.logMu.Unlock() - - if l.ExpiredTime == 0 || l.CacheSize == 0 { - return l.Logs.Write(d) - } - - l.doOnce.Do(func() { - l.validateLimiterConf() - l.logCache = cache.New(l.CacheSize) - l.logExpiredTime = time.Duration(int64(l.ExpiredTime) * int64(time.Second)) - }) - - if l.logCache == nil { - l.logCache = cache.New(DefaultCacheSize) - } - if !l.logCache.SetIfNX(string(d[cutPreLen:]), "v", l.logExpiredTime) { - return 0, nil - } - - return l.Logs.Write(d) -} - -// Close implements io.Closer. It encapsulates the Close method of Logs. 
-func (l *LogLimiter) Close() error { - if l == nil { - return fmt.Errorf("log limiter pointer does not exist") - } - - l.logMu.Lock() - defer l.logMu.Unlock() - - return l.Logs.Close() -} - -// Flush encapsulates the Flush method of Logs. -func (l *LogLimiter) Flush() error { - if l == nil { - return fmt.Errorf("log limiter pointer does not exist") - } - - l.logMu.Lock() - defer l.logMu.Unlock() - - return l.Logs.Flush() -} - -// validateLimiterConf verifies the external input parameters in the LogLimiter. -func (l *LogLimiter) validateLimiterConf() { - if l.CacheSize < 0 || l.CacheSize > MaxCacheSize { - l.CacheSize = DefaultCacheSize - } - if l.ExpiredTime < 0 || l.ExpiredTime > MaxExpiredTime { - l.ExpiredTime = DefaultExpiredTime - } -} - -func getKey(domain string, id interface{}) string { - return fmt.Sprintf("%d_%s", id, domain) -} - -// IsNeedPrintWithSpecifiedCounts check whether print the error message, -// if the error message (domain_id as a unique identifier) has been printed -// for problemOccurMaxNumbers times, return false -func IsNeedPrintWithSpecifiedCounts(domain string, id interface{}, problemOccurMaxNumbers int) (bool, string) { - key := getKey(domain, id) - cnt, _ := errorMap.LoadOrStore(key, 0) - intCnt, ok := cnt.(int) - extraErrLog := "" - if !ok { - // the counter type is abnormal, print by default - return true, extraErrLog - } - if intCnt >= problemOccurMaxNumbers { - return false, extraErrLog - } - intCnt += 1 - errorMap.Store(key, intCnt) - if intCnt == problemOccurMaxNumbers { - extraErrLog = fmt.Sprintf(".The error log has been printed for %v times "+ - "and will not be printed any more", problemOccurMaxNumbers) - } - return true, extraErrLog - -} - -// ResetErrCnt reset the error count -func ResetErrCnt(domain string, id interface{}) { - errorMap.Delete(getKey(domain, id)) -} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/logger.go b/mind-cluster/component/ascend-common/common-utils/hwlog/logger.go deleted 
file mode 100644 index f659fbc..0000000 --- a/mind-cluster/component/ascend-common/common-utils/hwlog/logger.go +++ /dev/null @@ -1,242 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package hwlog provides the capability of processing Huawei log rules. -package hwlog - -import ( - "errors" - "fmt" - "os" - "path" - "regexp" - "strings" - - "github.com/fsnotify/fsnotify" - - "ascend-common/common-utils/utils" -) - -const ( - // DefaultFileMaxSize the default maximum size of a single log file is 20 MB - DefaultFileMaxSize = 20 - // DefaultMinSaveAge the minimum storage duration of backup logs is 7 days - DefaultMinSaveAge = 7 - // DefaultMaxSaveAge the maximum storage duration of backup logs is 700 days - DefaultMaxSaveAge = 700 - // DefaultMaxBackups the default number of backup log - DefaultMaxBackups = 30 - // LogFileMode log file mode - LogFileMode os.FileMode = 0640 - // BackupLogFileMode backup log file mode - BackupLogFileMode os.FileMode = 0400 - // LogDirMode log dir mode - LogDirMode = 0750 - backUpLogRegex = `^.+-[0-9]{4}-[0-9]{2}-[0-9T]{5}-[0-9]{2}-[0-9]{2}\.[0-9]{2,4}` - bitsize = 64 - stackDeep = 3 - pathLen = 2 - minLogLevel = -1 - maxLogLevel = 3 - maxEachLineLen = 1048576 - defaultMaxEachLineLen = 256 -) - -// LogConfig log module config -type LogConfig struct { - // log file path - LogFileName string - // only write to std out, default value: false - OnlyToStdout bool - // 
only write to file, default value: false - OnlyToFile bool - // log level, -1-debug, 0-info, 1-warning, 2-error 3-critical default value: 0 - LogLevel int - // size of a single log file (MB), default value: 20MB - FileMaxSize int - // MaxLineLength Max length of each log line, default value: 256 - MaxLineLength int - // maximum number of backup log files, default value: 30 - MaxBackups int - // maximum number of days for backup log files, default value: 7 - MaxAge int - // whether backup files need to be compressed, default value: false - IsCompress bool - // expiration time for log cache, default value: 1s - ExpiredTime int - // Size of log cache space, default: 10240 - CacheSize int -} - -var reg = regexp.MustCompile(backUpLogRegex) - -type validateFunc func(config *LogConfig) error - -func checkDir(fileDir string) error { - if !utils.IsExist(fileDir) { - if err := os.MkdirAll(fileDir, LogDirMode); err != nil { - return fmt.Errorf("create dirs failed") - } - return nil - } - if err := os.Chmod(fileDir, LogDirMode); err != nil { - return fmt.Errorf("change log dir mode failed") - } - return nil -} - -func createFile(filePath string) error { - fileName := path.Base(filePath) - if !utils.IsExist(filePath) { - f, err := os.OpenFile(filePath, os.O_RDWR|os.O_CREATE|os.O_TRUNC, LogFileMode) - if err != nil { - return fmt.Errorf("create file(%s) failed", fileName) - } - defer func() { - if err := f.Close(); err != nil { - fmt.Printf("close file failed: %v\n", err) - return - } - }() - } - return nil -} - -func checkAndCreateLogFile(filePath string) error { - if !utils.IsFile(filePath) { - return fmt.Errorf("config path is not file") - } - fileDir := path.Dir(filePath) - if err := checkDir(fileDir); err != nil { - return err - } - if err := createFile(filePath); err != nil { - return err - } - return nil -} - -func validateLogConfigFileMaxSize(config *LogConfig) error { - if config.FileMaxSize == 0 { - config.FileMaxSize = DefaultFileMaxSize - return nil - } - if 
config.FileMaxSize < 0 || config.FileMaxSize > DefaultFileMaxSize { - return fmt.Errorf("the size of a single log file range is (0, 20] MB") - } - - return nil -} - -func validateLogConfigBackups(config *LogConfig) error { - if config.MaxBackups <= 0 || config.MaxBackups > DefaultMaxBackups { - return fmt.Errorf("the number of backup log file range is (0, 30]") - } - return nil -} - -func validateLogConfigMaxAge(config *LogConfig) error { - fmt.Printf("MaxAge %s", config.MaxAge) - if config.MaxAge < DefaultMinSaveAge || config.MaxAge > DefaultMaxSaveAge { - return fmt.Errorf("the maxage of backup logs range is [7,700]") - } - return nil -} - -func validateLogLevel(config *LogConfig) error { - if config.LogLevel < minLogLevel || config.LogLevel > maxLogLevel { - return fmt.Errorf("the log level range should be [-1, 3]") - } - return nil -} - -func validateMaxLineLength(config *LogConfig) error { - if config.MaxLineLength == 0 { - config.MaxLineLength = defaultMaxEachLineLen - return nil - } - if config.MaxLineLength < 0 || config.MaxLineLength > maxEachLineLen { - return fmt.Errorf("the max length of each log line should be in the range (0, 1048576]") - } - return nil -} - -func getValidateFuncList() []validateFunc { - var funcList []validateFunc - funcList = append(funcList, validateLogConfigFileMaxSize, validateLogConfigBackups, validateMaxLineLength, - validateLogConfigMaxAge, validateLogLevel, validateLogConfigLimiter) - return funcList -} - -func validateLogConfigFiled(config *LogConfig) error { - if config.OnlyToStdout { - return nil - } - if _, err := utils.CheckPath(config.LogFileName); err != nil && err != os.ErrNotExist { - return fmt.Errorf("config log path is not absolute path: %v", err) - } - if strings.Contains(config.LogFileName, "..") || strings.Contains(config.LogFileName, "./") { - return errors.New("log path include invalid char") - } - - if err := checkAndCreateLogFile(config.LogFileName); err != nil { - return err - } - validateFuncList := 
getValidateFuncList() - for _, vaFunc := range validateFuncList { - if err := vaFunc(config); err != nil { - return err - } - } - - return nil -} - -func validateLogConfigLimiter(config *LogConfig) error { - if config.ExpiredTime < 0 || config.ExpiredTime > MaxExpiredTime { - return fmt.Errorf("the expired time of log cache range is [0, 3600], the value 0 disables the limiter") - } - if config.CacheSize < 0 || config.CacheSize > MaxCacheSize { - return fmt.Errorf("the size of log cache range is [0, 102400], the value 0 disables the limiter") - } - return nil -} - -func changeFileMode(l *logger, event fsnotify.Event, logFileFullPath string) { - if l == nil { - fmt.Println("changeFileMode logger is nil") - return - } - var logMode = LogFileMode - logPath := path.Dir(logFileFullPath) - changedFileName := path.Base(event.Name) - if isTargetLog(changedFileName) { - logMode = BackupLogFileMode - } - changedLogFilePath := path.Join(logPath, changedFileName) - if !utils.IsExist(changedLogFilePath) { - return - } - fPath, err := utils.CheckPath(changedLogFilePath) - if err != nil { - l.Errorf("wrong file path: %v", err) - return - } - if errChmod := os.Chmod(fPath, logMode); errChmod != nil { - l.Errorf("set file mode failed, filename: %s", changedFileName) - } -} -func isTargetLog(fileName string) bool { - return reg.MatchString(fileName) -} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/logger_test.go b/mind-cluster/component/ascend-common/common-utils/hwlog/logger_test.go deleted file mode 100644 index f91b663..0000000 --- a/mind-cluster/component/ascend-common/common-utils/hwlog/logger_test.go +++ /dev/null @@ -1,217 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package hwlog test file -package hwlog - -import ( - "errors" - "io/fs" - "os" - "testing" - - "github.com/agiledragon/gomonkey/v2" - "github.com/fsnotify/fsnotify" - "github.com/smartystreets/goconvey/convey" - - "ascend-common/common-utils/utils" -) - -func TestCheckDir(t *testing.T) { - convey.Convey("test logger", t, func() { - convey.Convey("test check dir func", func() { - mockStat := gomonkey.ApplyFunc(os.Stat, func(_ string) (fs.FileInfo, error) { - return nil, os.ErrNotExist - }) - mockMkDir := gomonkey.ApplyFunc(os.MkdirAll, func(_ string, _ fs.FileMode) error { - return nil - }) - defer mockStat.Reset() - defer mockMkDir.Reset() - err := checkDir("log") - convey.So(err, convey.ShouldBeNil) - }) - }) -} - -func TestCreateFile(t *testing.T) { - convey.Convey("test logger", t, func() { - convey.Convey("test create file", func() { - mockExist := gomonkey.ApplyFunc(utils.IsExist, func(_ string) bool { - return false - }) - mockCreate := gomonkey.ApplyFunc(os.Create, func(_ string) (*os.File, error) { - return nil, nil - }) - defer mockExist.Reset() - defer mockCreate.Reset() - err := createFile("log") - convey.So(err, convey.ShouldBeNil) - }) - }) -} - -func TestCheckAndCreateLogFile(t *testing.T) { - convey.Convey("test logger", t, func() { - convey.Convey("test checkAndCreateLogFile func", func() { - mockCreate := gomonkey.ApplyFunc(createFile, func(_ string) error { - return nil - }) - defer mockCreate.Reset() - err := checkAndCreateLogFile("log") - convey.So(err, convey.ShouldBeNil) - }) - }) -} - -func TestValidateLogConfigFileMaxSize(t 
*testing.T) { - convey.Convey("test logger", t, func() { - convey.Convey("test validate max size func", func() { - conf := &LogConfig{} - err := validateLogConfigFileMaxSize(conf) - convey.So(err, convey.ShouldBeNil) - convey.So(conf.FileMaxSize, convey.ShouldEqual, DefaultFileMaxSize) - conf.FileMaxSize = -1 - err = validateLogConfigFileMaxSize(conf) - convey.So(err, convey.ShouldBeError) - conf.FileMaxSize = DefaultFileMaxSize + 1 - err = validateLogConfigFileMaxSize(conf) - convey.So(err, convey.ShouldBeError) - }) - }) -} - -func TestValidateLogConfigBackups(t *testing.T) { - convey.Convey("test logger", t, func() { - convey.Convey("test validate backups func", func() { - conf := &LogConfig{MaxBackups: DefaultMaxBackups} - err := validateLogConfigBackups(conf) - convey.So(err, convey.ShouldBeNil) - conf.MaxBackups = 0 - err = validateLogConfigBackups(conf) - convey.So(err, convey.ShouldBeError) - conf.FileMaxSize = DefaultMaxBackups + 1 - err = validateLogConfigBackups(conf) - convey.So(err, convey.ShouldBeError) - }) - }) -} - -func TestValidateLogConfigMaxAge(t *testing.T) { - convey.Convey("test logger", t, func() { - convey.Convey("test validate max age func", func() { - conf := &LogConfig{MaxAge: DefaultMinSaveAge} - err := validateLogConfigMaxAge(conf) - convey.So(err, convey.ShouldBeNil) - conf.MaxAge = 0 - err = validateLogConfigMaxAge(conf) - convey.So(err, convey.ShouldBeError) - }) - }) -} - -func TestValidateLogLevel(t *testing.T) { - convey.Convey("test logger", t, func() { - convey.Convey("test validate log level func", func() { - conf := &LogConfig{} - err := validateLogLevel(conf) - convey.So(err, convey.ShouldBeNil) - conf.LogLevel = minLogLevel - 1 - err = validateLogLevel(conf) - convey.So(err, convey.ShouldBeError) - conf.LogLevel = maxLogLevel + 1 - err = validateLogLevel(conf) - convey.So(err, convey.ShouldBeError) - }) - }) -} - -func TestValidateMaxLineLength(t *testing.T) { - convey.Convey("test logger", t, func() { - 
convey.Convey("test validate max line length func", func() { - conf := &LogConfig{} - err := validateMaxLineLength(conf) - convey.So(err, convey.ShouldBeNil) - convey.So(conf.MaxLineLength, convey.ShouldEqual, defaultMaxEachLineLen) - conf.MaxLineLength = -1 - err = validateMaxLineLength(conf) - convey.So(err, convey.ShouldNotBeNil) - conf.MaxLineLength = maxEachLineLen + 1 - err = validateMaxLineLength(conf) - convey.So(err, convey.ShouldNotBeNil) - }) - }) -} - -func TestValidateLogConfigFiled(t *testing.T) { - convey.Convey("test logger", t, func() { - convey.Convey("test validate config filed func", func() { - mockCheckPath := gomonkey.ApplyFunc(utils.CheckPath, func(_ string) (string, error) { - return "", nil - }) - mockCheckAndCreate := gomonkey.ApplyFunc(checkAndCreateLogFile, func(_ string) error { - return nil - }) - defer mockCheckPath.Reset() - defer mockCheckAndCreate.Reset() - conf := &LogConfig{ - MaxBackups: DefaultMaxBackups, - MaxAge: DefaultMinSaveAge, - CacheSize: DefaultCacheSize, - ExpiredTime: DefaultExpiredTime, - } - err := validateLogConfigFiled(conf) - convey.So(err, convey.ShouldBeNil) - }) - convey.Convey("test validate config filed func, log file is relative path", func() { - mockCheckPath := gomonkey.ApplyFunc(utils.CheckPath, func(_ string) (string, error) { - return "", nil - }) - mockCheckAndCreate := gomonkey.ApplyFunc(checkAndCreateLogFile, func(_ string) error { - return nil - }) - defer mockCheckPath.Reset() - defer mockCheckAndCreate.Reset() - conf := &LogConfig{ - MaxBackups: DefaultMaxBackups, - MaxAge: DefaultMinSaveAge, - CacheSize: DefaultCacheSize, - ExpiredTime: DefaultExpiredTime, - LogFileName: "../", - } - err := validateLogConfigFiled(conf) - expErr := errors.New("log path include invalid char") - convey.So(err, convey.ShouldResemble, expErr) - }) - }) -} - -func TestChangeFileMode(t *testing.T) { - convey.Convey("test logger", t, func() { - convey.Convey("test changeFileMode func", func() { - changeFileMode(nil, 
fsnotify.Event{}, "log") - mockExist := gomonkey.ApplyFunc(utils.IsExist, func(_ string) bool { - return true - }) - mockChmod := gomonkey.ApplyFunc(os.Chmod, func(_ string, _ fs.FileMode) error { - return nil - }) - defer mockExist.Reset() - defer mockChmod.Reset() - lg := new(logger) - evt := fsnotify.Event{Name: "run-2022-01-01T00-00-00.123.log"} - changeFileMode(lg, evt, "log") - }) - }) -} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/rolog.go b/mind-cluster/component/ascend-common/common-utils/hwlog/rolog.go deleted file mode 100644 index cc07bb2..0000000 --- a/mind-cluster/component/ascend-common/common-utils/hwlog/rolog.go +++ /dev/null @@ -1,447 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package hwlog provides the capability of processing Huawei log rules. -package hwlog - -import ( - "errors" - "fmt" - "io/ioutil" - "os" - "path/filepath" - "sort" - "strings" - "sync" - "time" -) - -const ( - oneDaySeconds = 24 * 60 * 60 - defaultCapacity = 20 - timeFormat = "2006-01-02T15-04-05.000" - kilobytes = 1024 - defaultDirPermission = 0750 - defaultFilePermission = 0600 - defaultBackupPermission = 0400 - maxCapacity = 20 - minSaveVolume = 1 - maxSaveVolume = 30 - maxSaveTime = 700 - minSaveTime = 7 -) - -// Logs is an io.WriteCloser. -type Logs struct { - file *os.File - mutex sync.Mutex - rmOnce sync.Once - - // FileName is the file where logs are written. 
- FileName string `json:"filename" yaml:"filename"` - - // Capacity is the maximum number of bytes before the log file - // is rotated, and the default value is 128 megabytes. - Capacity int `json:"capacity" yaml:"capacity"` - - // SaveTime is the maximum number of days for retaining old log - // files. It calculates the retention time based on the timestamp - // of the old log file name and the current time. - SaveTime int `json:"savetime" yaml:"savetime"` - - // SaveVolume is the maximum number of old log files that can be - // retained. It saves all old files by default. - SaveVolume int `json:"savevolume" yaml:"savevolume"` - - // UTC determines whether to use the local time of the computer - // or the UTC time as the timestamp in the formatted backup file. - LocalOrUTC bool `json:"localorutc" yaml:"localorutc"` - - length int64 - rmCh chan bool -} - -// logFile is a struct that is used to return filename and -// timestamp. -type logFile struct { - fileInfo os.FileInfo - timeStamp time.Time -} - -var ( - // mByte is used to convert capacity into bytes. - mByte = kilobytes * kilobytes -) - -// Write implements io.Writer. If a write would not cause the size of -// the log file to exceed Capacity, the log file is written normally. -// If a write would cause the size of the log file to exceed Capacity, -// but the write length is less than Capacity, the log file is closed, -// renamed to include a timestamp of the current time, and a new log -// is created using the original log file name. If the length of a write -// is greater than the Capacity, an error is returned. 
-func (l *Logs) Write(d []byte) (int, error) { - if l == nil { - return 0, fmt.Errorf("logs pointer does not exist") - } - - l.mutex.Lock() - defer l.mutex.Unlock() - - writeLenth := int64(len(d)) - if writeLenth > l.maxLenth() { - return 0, fmt.Errorf("the write lenth %d is greater than the maximum file size %d", - writeLenth, l.maxLenth(), - ) - } - - if l.file == nil { - if err := l.openOrCreateFile(writeLenth); err != nil { - return 0, err - } - } - fileInfo, err := l.file.Stat() - if err != nil { - return 0, err - } - l.length = fileInfo.Size() - if writeLenth+l.length > l.maxLenth() { - if err := l.roll(); err != nil { - return 0, err - } - } - - n, err := l.file.Write(d) - if err != nil { - return 0, err - } - l.length += int64(n) - return n, err -} - -// Roll causes Logs to close the existing log file and create a new log -// file immediately. The purpose of this function is to provide rotation -// outside the normal rotation rule, e.g. in response to SIGHUP. After -// rotation, the deletion of the old log files is initiated. -func (l *Logs) Roll() error { - if l == nil { - return fmt.Errorf("logs pointer does not exist") - } - - l.mutex.Lock() - defer l.mutex.Unlock() - return l.roll() -} - -// Close implements io.Closer. It closses the current log file. -func (l *Logs) Close() error { - if l == nil { - return fmt.Errorf("logs pointer does not exist") - } - - l.mutex.Lock() - defer l.mutex.Unlock() - - return l.close() -} - -// Flush persist the contents of the current memory. -func (l *Logs) Flush() error { - if l == nil { - return fmt.Errorf("logs pointer does not exist") - } - - l.mutex.Lock() - defer l.mutex.Unlock() - if l.file == nil { - return nil - } - return l.file.Sync() -} - -// maxLenth return the number of bytes of the maximum log size -// before rotating. 
-func (l *Logs) maxLenth() int64 { - if l.Capacity > 0 && l.Capacity < maxCapacity { - return int64(l.Capacity) * int64(mByte) - } - return int64(defaultCapacity * mByte) -} - -// fileName return the name of the log file. -func (l *Logs) fileName() string { - if l.FileName != "" { - return l.FileName - } - logName := filepath.Base(os.Args[0]) + "-mindx-dl.log" - return filepath.Join(os.TempDir(), logName) -} - -// openOrCreateFile opens the log file if it exists and the -// current write would not exceed the Capacity. It will create -// a new file if there is no such file or the write would exceed -// the Capacity. -func (l *Logs) openOrCreateFile(writeLen int64) error { - l.remove() - - name := l.fileName() - message, err := os.Stat(name) - if os.IsNotExist(err) { - return l.create() - } - - if err != nil { - return fmt.Errorf("failed to get log file message: %v", err) - } - - if writeLen+message.Size() >= l.maxLenth() { - return l.roll() - } - - f, err := os.OpenFile(name, os.O_APPEND|os.O_WRONLY, defaultFilePermission) - if err != nil { - return l.create() - } - l.file = f - l.length = message.Size() - return nil -} - -// create creates a new log file for writing, and backs up the -// old log file. The file is closed when this method is invoked -// by default. 
-func (l *Logs) create() error { - if err := os.MkdirAll(l.getDir(), defaultDirPermission); err != nil { - return fmt.Errorf("unable to create directory for new log file: %v", err) - } - - fileName, fileMode := l.fileName(), os.FileMode(defaultFilePermission) - if message, err := os.Stat(fileName); err == nil { - fileMode = message.Mode() - backupName := l.backup() - if err := os.Rename(fileName, backupName); err != nil { - return fmt.Errorf("failed to rename the log file: %v", err) - } - if err := os.Chmod(backupName, defaultBackupPermission); err != nil { - return fmt.Errorf("failed to change backup log file permission: %v", err) - } - } - newFile, err := os.OpenFile(fileName, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, fileMode) - if err != nil { - return fmt.Errorf("unable to open new log file: %v", err) - } - l.length, l.file = 0, newFile - return nil -} - -// backup generates a backup file name based on the original file -// name and inserts a timestamp between the file name and extension. -// The timestamp uses the UTC time by default. -func (l *Logs) backup() string { - prefix, extension := l.getPreAndExt() - return filepath.Join(l.getDir(), fmt.Sprintf("%s%s%s", prefix, l.getTimestamp(), extension)) -} - -// getDir returns the directory for the current filename. -func (l *Logs) getDir() string { - return filepath.Dir(l.fileName()) -} - -// getPreAndExt returns the prefix name and extension name -// from Logs's filename. -func (l *Logs) getPreAndExt() (string, string) { - name := filepath.Base(l.fileName()) - extension := filepath.Ext(name) - prefix := name[:len(name)-len(extension)] + "-" - return prefix, extension -} - -// getTimestamp returns the timestamp of current time, and -// uses UTC time by default. -func (l *Logs) getTimestamp() string { - t := time.Now() - if !l.LocalOrUTC { - t = t.UTC() - } - return t.Format(timeFormat) -} - -// roll rotates the log file, close the existing log file and -// create a new one immediately. 
After rotating, this method -// deletes the old log files according to the configuration. -func (l *Logs) roll() error { - if err := l.close(); err != nil { - return err - } - if err := l.create(); err != nil { - return err - } - l.remove() - return nil -} - -// close closes the file if it is open. -func (l *Logs) close() error { - if l.file == nil { - return nil - } - err := l.file.Sync() - if err != nil { - return err - } - err = l.file.Close() - l.file = nil - return err -} - -// remove delete outdated log files, starting the remove -// goroutine if necessary. -func (l *Logs) remove() { - l.rmOnce.Do(func() { - l.rmCh = make(chan bool, 1) - go l.removeRun() - }) - select { - case l.rmCh <- true: - default: - } -} - -// removeRun manages the deletion of the old log files after -// rotating, which runs in a goroutine. -func (l *Logs) removeRun() { - for range l.rmCh { - if err := l.removeRunOnce(); err != nil { - fmt.Println("failed to remove runonce: ", err) - } - } -} - -// removeRunOnce performs removal of outdated log files. -// Old log files are removed if the number of old files -// exceed the Capacity or the retention time of old files -// is greater than SaveTime. 
-func (l *Logs) removeRunOnce() error { - if l.SaveVolume == 0 && l.SaveTime == 0 { - return nil - } - - if err := checkParam(l.SaveVolume, l.SaveTime); err != nil { - return err - } - - oldFiles, err := l.oldFilesList() - if err != nil { - return err - } - - var removeFiles []logFile - if l.SaveTime > 0 { - delTime := time.Now().Unix() - int64(l.SaveTime)*oneDaySeconds - var remainingFiles []logFile - for _, f := range oldFiles { - if f.timeStamp.Unix() <= delTime { - removeFiles = append(removeFiles, f) - continue - } - remainingFiles = append(remainingFiles, f) - } - oldFiles = remainingFiles - } - - if l.SaveVolume > 0 && l.SaveVolume < len(oldFiles) { - saved := make(map[string]struct{}, len(oldFiles)) - var remainingFiles []logFile - for _, f := range oldFiles { - saved[f.fileInfo.Name()] = struct{}{} - if l.SaveVolume >= len(saved) { - remainingFiles = append(remainingFiles, f) - continue - } - removeFiles = append(removeFiles, f) - } - oldFiles = remainingFiles - } - - for _, f := range removeFiles { - rmError := os.Remove(filepath.Join(l.getDir(), f.fileInfo.Name())) - if rmError != nil { - err = rmError - } - } - return err -} - -// oldFilesList returns the list of backup log files sorted -// by ModTime. These backup log files are stored in the same -// directory as the current log file. 
-func (l *Logs) oldFilesList() ([]logFile, error) { - logFiles, err := ioutil.ReadDir(l.getDir()) - if err != nil { - return nil, fmt.Errorf("unable to open the log file directory: %v", err) - } - - prefix, extension := l.getPreAndExt() - - var oldFiles []logFile - - for _, file := range logFiles { - if file.IsDir() { - continue - } - if timeStamp, err := l.extractTime(file.Name(), prefix, extension); err == nil { - oldFiles = append(oldFiles, logFile{fileInfo: file, timeStamp: timeStamp}) - continue - } - } - sort.Slice(oldFiles, func(i, j int) bool { - if i < 0 || i > len(oldFiles) || j < 0 || j > len(oldFiles) { - return false - } - return oldFiles[i].timeStamp.After(oldFiles[j].timeStamp) - }) - - return oldFiles, nil -} - -// extractTime extracts the formatted time from file name by -// stripping the prefix and extension of the file name. This -// prevents fileName from being confused with time.parse. -func (l *Logs) extractTime(name, prefix, extension string) (time.Time, error) { - if !strings.HasSuffix(name, extension) { - return time.Time{}, errors.New("unmatched extension") - } - - if !strings.HasPrefix(name, prefix) { - return time.Time{}, errors.New("unmatched prefix") - } - - timeStamp := name[len(prefix) : len(name)-len(extension)] - return time.Parse(timeFormat, timeStamp) -} - -// checkParam checks whether the parameters are correct -func checkParam(volume int, time int) error { - if volume != 0 { - if volume < minSaveVolume || volume > maxSaveVolume { - return fmt.Errorf("the value of savevolume is incorrect") - } - } - if time != 0 { - if time < minSaveTime || time > maxSaveTime { - return fmt.Errorf("the value of savetime is incorrect") - } - } - return nil -} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/rolog_test.go b/mind-cluster/component/ascend-common/common-utils/hwlog/rolog_test.go deleted file mode 100644 index 67807bd..0000000 --- a/mind-cluster/component/ascend-common/common-utils/hwlog/rolog_test.go +++ /dev/null 
@@ -1,687 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package hwlog provides the capability of processing Huawei log rules. -package hwlog - -import ( - "encoding/json" - "fmt" - "io/ioutil" - "os" - "path/filepath" - "testing" - "time" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" -) - -const ( - testDirPermission = 0700 - testFilePermission = 0600 - testMByte = 1 - testCapacity = 10 - testCapacity2 = 100 - testCapacity3 = 5 - testSaveTime = 10 - testSaveTime2 = 7 - testSaveVolume = 3 - testSaveVolume2 = 1 - fileCountOne = 1 - fileCountTwo = 2 - fileCountFour = 4 - waitTime = 50 - oneDayHour = 24 - sevenDays = 7 - fourteenDays = 14 - twentyOneDays = 21 - testYear = 2014 - testMonth = 5 - testDay = 4 - testHour = 14 - testMin = 44 - testSec = 33 - testNsec = 555000000 -) - -// TestCreate for test the function of create log file -func TestCreate(t *testing.T) { - convey.Convey("TestCreate", t, func() { - dir := makeTempDir("TestCrate") - defer os.RemoveAll(dir) - l := &Logs{ - FileName: getLogFile(dir), - } - defer l.Close() - - input := []byte("foobarfoobar!") - fileWrite(input, l) - existWithContent(input, getLogFile(dir)) - fileCount(fileCountOne, dir) - }) -} - -// TestOpenFile for test the function of open log file -func TestOpenFile(t *testing.T) { - convey.Convey("TestOpenFile", t, func() { - dir := makeTempDir("TestOpenFile") - defer 
os.RemoveAll(dir) - fileName := getLogFile(dir) - data := []byte("foo!") - err := ioutil.WriteFile(fileName, data, testFilePermission) - convey.So(err, convey.ShouldBeNil) - existWithContent(data, fileName) - - l := &Logs{ - FileName: fileName, - } - defer l.Close() - - b := []byte("boo!") - fileWrite(b, l) - existWithContent(append(data, b...), fileName) - fileCount(fileCountOne, dir) - }) -} - -// TestWriteTooLong for test the processing of the overlong write error -func TestWriteTooLong(t *testing.T) { - convey.Convey("TestWriteTooLong", t, func() { - mByte = testMByte - dir := makeTempDir("TestWriteTooLong") - defer os.RemoveAll(dir) - - l := &Logs{ - FileName: getLogFile(dir), - Capacity: testCapacity3, - } - defer l.Close() - - b := []byte("barrrrrrrrrrrrrrrrr!") - n, err := l.Write(b) - convey.So(err, convey.ShouldNotBeNil) - convey.So(0, convey.ShouldEqual, n) - convey.So(err.Error(), convey.ShouldEqual, fmt.Sprintf( - "the write lenth %d is greater than the maximum file size %d", len(b), l.Capacity)) - _, err = os.Stat(getLogFile(dir)) - convey.So(err, shouldNotBeExist) - }) -} - -// TestMakeLogDir for test the function of make log file directory -func TestMakeLogDir(t *testing.T) { - convey.Convey("TestMakeLogDir", t, func() { - dir := time.Now().Format("TestMakeLogDir" + timeFormat) - dir = filepath.Join(os.TempDir(), dir) - defer os.RemoveAll(dir) - - fileName := getLogFile(dir) - l := &Logs{ - FileName: fileName, - } - defer l.Close() - - b := []byte("boo!") - fileWrite(b, l) - existWithContent(b, getLogFile(dir)) - fileCount(fileCountOne, dir) - }) -} - -// TestDefaultFileName for test default log file name -func TestDefaultFileName(t *testing.T) { - convey.Convey("TestDefaultFileName", t, func() { - dir := os.TempDir() - fileName := filepath.Join(dir, filepath.Base(os.Args[0])+"-mindx-dl.log") - defer os.Remove(fileName) - - l := &Logs{} - defer l.Close() - - b := []byte("boo!") - fileWrite(b, l) - existWithContent(b, fileName) - }) -} - -// 
TestAutoRoll for test the automatic log rolling -func TestAutoRoll(t *testing.T) { - convey.Convey("TestAutoRoll", t, func() { - mByte = testMByte - dir := makeTempDir("TestAutoRoll") - defer os.RemoveAll(dir) - currentTime := time.Now() - - fileName := getLogFile(dir) - l := &Logs{ - FileName: fileName, - Capacity: testCapacity, - } - defer l.Close() - - b := []byte("aoo!") - fileWrite(b, l) - existWithContent(b, fileName) - fileCount(fileCountOne, dir) - - patch1 := gomonkey.ApplyFunc(time.Now, func() time.Time { - time1 := currentTime - return time1.Add(time.Hour * oneDayHour * sevenDays) - }) - defer patch1.Reset() - - b2 := []byte("foooooo!") - fileWrite(b2, l) - existWithContent(b2, fileName) - existWithContent(b, getBackupFile(dir, time.Now())) - fileCount(fileCountTwo, dir) - }) -} - -// TestFirstWriteRoll for test the log rolling on first write -func TestFirstWriteRoll(t *testing.T) { - convey.Convey("TestFirstWriteRoll", t, func() { - mByte = testMByte - dir := makeTempDir("TestFirstWriteRoll") - defer os.RemoveAll(dir) - currentTime := time.Now() - - fileName := getLogFile(dir) - l := &Logs{ - FileName: fileName, - Capacity: testCapacity, - } - defer l.Close() - - start := []byte("boooooo!") - err := ioutil.WriteFile(fileName, start, testFilePermission) - convey.So(err, convey.ShouldBeNil) - patch1 := gomonkey.ApplyFunc(time.Now, func() time.Time { - time1 := currentTime - return time1.Add(time.Hour * oneDayHour * sevenDays) - }) - defer patch1.Reset() - - b := []byte("fooo!") - fileWrite(b, l) - existWithContent(b, fileName) - existWithContent(start, getBackupFile(dir, time.Now())) - fileCount(fileCountTwo, dir) - }) -} - -// TestSaveVolumeCase1 for test the deleting log files that exceed the volume -func TestSaveVolumeCase1(t *testing.T) { - convey.Convey("TestSaveVolumeCase1", t, func() { - mByte = testMByte - dir := makeTempDir("TestSaveVolumeCase1") - defer os.RemoveAll(dir) - currentTime := time.Now() - - fileName := getLogFile(dir) - l := &Logs{ - 
FileName: fileName, - Capacity: testCapacity, - SaveVolume: testSaveVolume2, - } - defer l.Close() - - b := []byte("boo!") - fileWrite(b, l) - existWithContent(b, fileName) - fileCount(fileCountOne, dir) - - patch1 := gomonkey.ApplyFunc(time.Now, func() time.Time { - time1 := currentTime - return time1.Add(time.Hour * oneDayHour * sevenDays) - }) - b2 := []byte("foooooo!") - fileWrite(b2, l) - secondFileName := getBackupFile(dir, time.Now()) - existWithContent(b, secondFileName) - existWithContent(b2, fileName) - fileCount(fileCountTwo, dir) - - patch1.Reset() - patch2 := gomonkey.ApplyFunc(time.Now, func() time.Time { - time2 := currentTime - return time2.Add(time.Hour * oneDayHour * fourteenDays) - }) - defer patch2.Reset() - b3 := []byte("baaaaaar!") - fileWrite(b3, l) - thirdFileName := getBackupFile(dir, time.Now()) - existWithContent(b2, thirdFileName) - existWithContent(b3, fileName) - <-time.After(time.Millisecond * waitTime) - fileCount(fileCountTwo, dir) - existWithContent(b2, thirdFileName) - convey.So(secondFileName, shouldNotExist) - }) -} - -// TestSaveVolumeCase2 for test the deleting log files that exceed the volume when a non-log file exists -func TestSaveVolumeCase2(t *testing.T) { - convey.Convey("TestSaveVolumeCase2", t, func() { - mByte = testMByte - dir := makeTempDir("TestSaveVolumeCase2") - defer os.RemoveAll(dir) - currentTime := time.Now() - - fileName := getLogFile(dir) - l := &Logs{FileName: fileName, Capacity: testCapacity, SaveVolume: testSaveVolume2} - defer l.Close() - - b := []byte("boo!") - fileWrite(b, l) - patch1 := gomonkey.ApplyFunc(time.Now, func() time.Time { - time1 := currentTime - return time1.Add(time.Hour * oneDayHour * sevenDays) - }) - b2 := []byte("baaaaaar!") - fileWrite(b2, l) - secondFileName := getBackupFile(dir, time.Now()) - - patch1.Reset() - patch2 := gomonkey.ApplyFunc(time.Now, func() time.Time { - time2 := currentTime - return time2.Add(time.Hour * oneDayHour * fourteenDays) - }) - notLogFile := 
getLogFile(dir) + ".foo" - err := ioutil.WriteFile(notLogFile, []byte("data"), testFilePermission) - convey.So(err, convey.ShouldBeNil) - notLogFileDir := getBackupFile(dir, time.Now()) - err = os.Mkdir(notLogFileDir, testDirPermission) - convey.So(err, convey.ShouldBeNil) - - patch2.Reset() - patch3 := gomonkey.ApplyFunc(time.Now, func() time.Time { - time3 := currentTime - return time3.Add(time.Hour * oneDayHour * twentyOneDays) - }) - defer patch3.Reset() - thirdFileName := getBackupFile(dir, time.Now()) - b3 := []byte("baaaaaaz!") - fileWrite(b3, l) - existWithContent(b2, thirdFileName) - <-time.After(time.Millisecond * waitTime) - fileCount(fileCountFour, dir) - existWithContent(b3, fileName) - convey.So(secondFileName, shouldNotExist) - convey.So(notLogFile, shouldExist) - convey.So(notLogFileDir, shouldExist) - }) -} - -// TestCleanupExistingBackupFiles fot test the clearing the current backup log files -func TestCleanupExistingBackupFiles(t *testing.T) { - convey.Convey("TestCleanupExistingBackupFiles", t, func() { - mByte = testMByte - dir := makeTempDir("TestCleanupExistingBackupFiles") - defer os.RemoveAll(dir) - currentTime := time.Now() - - data := []byte("data") - backup := getBackupFile(dir, time.Now()) - err := ioutil.WriteFile(backup, data, testFilePermission) - convey.So(err, convey.ShouldBeNil) - - patch1 := gomonkey.ApplyFunc(time.Now, func() time.Time { - time1 := currentTime - return time1.Add(time.Hour * oneDayHour * sevenDays) - }) - backup = getBackupFile(dir, time.Now()) - err = ioutil.WriteFile(backup, data, testFilePermission) - convey.So(err, convey.ShouldBeNil) - fileName := getLogFile(dir) - err = ioutil.WriteFile(fileName, data, testFilePermission) - convey.So(err, convey.ShouldBeNil) - - l := &Logs{ - FileName: fileName, - Capacity: testCapacity, - SaveVolume: testSaveVolume2, - } - defer l.Close() - - patch1.Reset() - patch2 := gomonkey.ApplyFunc(time.Now, func() time.Time { - time2 := currentTime - return time2.Add(time.Hour * 
oneDayHour * fourteenDays) - }) - defer patch2.Reset() - b2 := []byte("foooooo!") - fileWrite(b2, l) - - <-time.After(time.Millisecond * waitTime) - - fileCount(fileCountTwo, dir) - }) -} - -// TestSaveTime for test the deleting log files that exceed the time -func TestSaveTime(t *testing.T) { - convey.Convey("TestSaveTime", t, func() { - mByte = testMByte - dir := makeTempDir("TestSaveTime") - defer os.RemoveAll(dir) - currentTime := time.Now() - - fileName := getLogFile(dir) - l := &Logs{ - FileName: fileName, - Capacity: testCapacity, - SaveTime: testSaveTime2, - } - defer l.Close() - - patch1 := gomonkey.ApplyFunc(time.Now, func() time.Time { - time1 := currentTime - return time1.Add(time.Hour * oneDayHour * sevenDays) - }) - b := []byte("zoo!") - fileWrite(b, l) - existWithContent(b, fileName) - fileCount(fileCountOne, dir) - - patch1.Reset() - patch2 := gomonkey.ApplyFunc(time.Now, func() time.Time { - time2 := currentTime - return time2.Add(time.Hour * oneDayHour * fourteenDays) - }) - b2 := []byte("foooooo!") - fileWrite(b2, l) - existWithContent(b, getBackupFile(dir, time.Now())) - - <-time.After(waitTime * time.Millisecond) - - fileCount(fileCountTwo, dir) - existWithContent(b2, fileName) - existWithContent(b, getBackupFile(dir, time.Now())) - - patch2.Reset() - patch3 := gomonkey.ApplyFunc(time.Now, func() time.Time { - time3 := currentTime - return time3.Add(time.Hour * oneDayHour * twentyOneDays) - }) - defer patch3.Reset() - b3 := []byte("baaaaar!") - fileWrite(b3, l) - existWithContent(b2, getBackupFile(dir, time.Now())) - - <-time.After(waitTime * time.Millisecond) - - fileCount(fileCountTwo, dir) - existWithContent(b3, fileName) - existWithContent(b2, getBackupFile(dir, time.Now())) - }) -} - -// TestOldLogFilesList for test the obtaining the list of old log files -func TestOldLogFilesList(t *testing.T) { - convey.Convey("TestOldLogFilesList", t, func() { - mByte = testMByte - dir := makeTempDir("TestOldLogFiles") - defer os.RemoveAll(dir) - 
currentTime := time.Now() - - fileName := getLogFile(dir) - data := []byte("data") - err := ioutil.WriteFile(fileName, data, testDirPermission) - convey.So(err, convey.ShouldBeNil) - t1, err := time.Parse(timeFormat, currentTime.UTC().Format(timeFormat)) - convey.So(err, convey.ShouldBeNil) - backup := getBackupFile(dir, currentTime) - err = ioutil.WriteFile(backup, data, testDirPermission) - convey.So(err, convey.ShouldBeNil) - - patch := gomonkey.ApplyFunc(time.Now, func() time.Time { - time1 := currentTime - return time1.Add(time.Hour * oneDayHour * sevenDays) - }) - defer patch.Reset() - t2, err := time.Parse(timeFormat, time.Now().UTC().Format(timeFormat)) - convey.So(err, convey.ShouldBeNil) - backup2 := getBackupFile(dir, time.Now()) - err = ioutil.WriteFile(backup2, data, testDirPermission) - convey.So(err, convey.ShouldBeNil) - - l := &Logs{FileName: fileName} - files, err := l.oldFilesList() - convey.So(err, convey.ShouldBeNil) - convey.So(fileCountTwo, convey.ShouldEqual, len(files)) - convey.So(t2, convey.ShouldEqual, files[0].timeStamp) - convey.So(t1, convey.ShouldEqual, files[1].timeStamp) - }) -} - -// TestExtractTime for test obtaining log file timestamp -func TestExtractTime(t *testing.T) { - convey.Convey("TestExtractTime", t, func() { - l := &Logs{FileName: "/var/log/myfoo/foo.log"} - prefix, extention := l.getPreAndExt() - - tests := []struct { - fileName string - want time.Time - wantErr bool - }{ - {"foo-2014-05-04T14-44-33.555.log", time.Date( - testYear, testMonth, testDay, testHour, testMin, testSec, testNsec, time.UTC), false}, - {"foo-2014-05-04T14-44-33.555", time.Time{}, true}, - {"2014-05-04T14-44-33.555.log", time.Time{}, true}, - {"foo.log", time.Time{}, true}, - } - - for _, test := range tests { - got, err := l.extractTime(test.fileName, prefix, extention) - convey.So(got, convey.ShouldEqual, test.want) - convey.So(err != nil, convey.ShouldEqual, test.wantErr) - } - }) -} - -// TestLocalTime for test the situation that current 
time is the local time -func TestLocalTime(t *testing.T) { - convey.Convey("TestLocalTime", t, func() { - mByte = testMByte - dir := makeTempDir("TestLocalTime") - defer os.RemoveAll(dir) - currentTime := time.Now() - - l := &Logs{ - FileName: getLogFile(dir), - Capacity: testCapacity, - LocalOrUTC: true, - } - defer l.Close() - - patch := gomonkey.ApplyFunc(time.Now, func() time.Time { - return currentTime - }) - defer patch.Reset() - b := []byte("boo!") - fileWrite(b, l) - - b2 := []byte("fooooooo!") - fileWrite(b2, l) - existWithContent(b2, getLogFile(dir)) - existWithContent(b, getBackupFileLocal(dir, currentTime)) - }) -} - -// TestRoll for test rolling -func TestRoll(t *testing.T) { - convey.Convey("TestRoll", t, func() { - dir := makeTempDir("TestRotate") - defer os.RemoveAll(dir) - currentTime := time.Now() - - fileName := getLogFile(dir) - l := &Logs{ - FileName: fileName, - SaveVolume: testSaveVolume2, - Capacity: testCapacity2, // megabytes - } - defer l.Close() - - patch1 := gomonkey.ApplyFunc(time.Now, func() time.Time { - time1 := currentTime - return time1.Add(time.Hour * oneDayHour * sevenDays) - }) - b := []byte("boo!") - fileWrite(b, l) - existWithContent(b, fileName) - fileCount(fileCountOne, dir) - - patch1.Reset() - patch2 := gomonkey.ApplyFunc(time.Now, func() time.Time { - time2 := currentTime - return time2.Add(time.Hour * oneDayHour * fourteenDays) - }) - err := l.Roll() - convey.So(err, convey.ShouldBeNil) - - <-time.After(waitTime * time.Millisecond) - - filename2 := getBackupFile(dir, time.Now()) - existWithContent(b, filename2) - existWithContent([]byte{}, fileName) - fileCount(fileCountTwo, dir) - - patch2.Reset() - patch3 := gomonkey.ApplyFunc(time.Now, func() time.Time { - time3 := currentTime - return time3.Add(time.Hour * oneDayHour * twentyOneDays) - }) - defer patch3.Reset() - err = l.Roll() - convey.So(err, convey.ShouldBeNil) - - <-time.After(waitTime * time.Millisecond) - - filename3 := getBackupFile(dir, time.Now()) - 
existWithContent([]byte{}, filename3) - existWithContent([]byte{}, fileName) - fileCount(fileCountTwo, dir) - - b2 := []byte("foooooo!") - fileWrite(b2, l) - existWithContent(b2, fileName) - }) -} - -// TestJson for test JSON conversion -func TestJson(t *testing.T) { - convey.Convey("TestJson", t, func() { - data := []byte(` - { - "filename": "foo", - "capacity": 10, - "savetime": 10, - "savevolume": 3, - "localorutc": true - }`[1:]) - - l := Logs{} - err := json.Unmarshal(data, &l) - convey.So(err, convey.ShouldBeNil) - convey.So("foo", convey.ShouldEqual, l.FileName) - convey.So(testCapacity, convey.ShouldEqual, l.Capacity) - convey.So(testSaveTime, convey.ShouldEqual, l.SaveTime) - convey.So(testSaveVolume, convey.ShouldEqual, l.SaveVolume) - convey.So(true, convey.ShouldEqual, l.LocalOrUTC) - }) -} - -// makeTempDir creates a file in the OS temp directory to keep parallel test -func makeTempDir(name string) string { - dir := time.Now().Format(name + timeFormat) - dir = filepath.Join(os.TempDir(), dir) - err := os.Mkdir(dir, testDirPermission) - convey.So(err, convey.ShouldBeNil) - return dir -} - -// existWithContent checks that the given file exists and has the correct content -func existWithContent(content []byte, dir string) { - info, err := os.Stat(dir) - convey.So(err, convey.ShouldBeNil) - convey.So(int64(len(content)), convey.ShouldEqual, info.Size()) - - b, err := ioutil.ReadFile(dir) - convey.So(err, convey.ShouldBeNil) - convey.So(content, convey.ShouldResemble, b) -} - -// getLogFile returns the log file name in the given directory for the current fake time -func getLogFile(dir string) string { - return filepath.Join(dir, "foobar.log") -} - -func getBackupFile(dir string, t time.Time) string { - return filepath.Join(dir, "foobar-"+t.UTC().Format(timeFormat)+".log") -} - -func getBackupFileLocal(dir string, t time.Time) string { - return filepath.Join(dir, "foobar-"+t.Format(timeFormat)+".log") -} - -// fileCount checks that the number of files in the 
directory is exp. -func fileCount(exp int, dir string) { - files, err := ioutil.ReadDir(dir) - convey.So(err, convey.ShouldBeNil) - convey.So(len(files), convey.ShouldEqual, exp) -} - -func fileWrite(b []byte, l *Logs) { - n, err := l.Write(b) - convey.So(err, convey.ShouldBeNil) - convey.So(len(b), convey.ShouldEqual, n) -} - -func shouldNotBeExist(actual interface{}, expected ...interface{}) string { - err, ok := actual.(error) - if !ok { - return "incorrect parameter type" - } - if os.IsNotExist(err) { - return "" - } - return "File exists, but should not have been created" -} -func shouldNotExist(actual interface{}, expected ...interface{}) string { - path, ok := actual.(string) - if !ok { - return "incorrect parameter type" - } - _, err := os.Stat(path) - if os.IsNotExist(err) { - return "" - } - return fmt.Sprintf("expected to get os.IsNotExist, but instead got %v", err) -} - -func shouldExist(actual interface{}, expected ...interface{}) string { - path, ok := actual.(string) - if !ok { - return "incorrect parameter type" - } - _, err := os.Stat(path) - if err != nil { - return fmt.Sprintf("expected file to exist, but got error from os.Stat: %v", err) - } - return "" -} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/types.go b/mind-cluster/component/ascend-common/common-utils/hwlog/types.go deleted file mode 100644 index e97c80b..0000000 --- a/mind-cluster/component/ascend-common/common-utils/hwlog/types.go +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package hwlog provides the capability of processing Huawei log rules. -package hwlog - -import "errors" - -// ContextKey especially for context value -// to solve problem of "should not use basic type untyped string as key in context.WithValue" -type ContextKey string - -// String the implement of String method -func (c ContextKey) String() string { - return string(c) -} - -const ( - // UserID used for context value key of "ID" - UserID ContextKey = "UserID" - // ReqID used for context value key of "requestID" - ReqID ContextKey = "RequestID" - // extraDeepKey used for context value key of "extraDeepKey" - extraDeepKey ContextKey = "extraDeepKey" -) - -// SelfLogWriter used this to replace some opensource log -type SelfLogWriter struct { -} - -// Write implement the interface of io.writer -func (l *SelfLogWriter) Write(p []byte) (int, error) { - if RunLog == nil { - return -1, errors.New("hwlog is not initialized") - } - RunLog.Info(string(p)) - return len(p), nil -} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/utils.go b/mind-cluster/component/ascend-common/common-utils/hwlog/utils.go deleted file mode 100644 index 40955f4..0000000 --- a/mind-cluster/component/ascend-common/common-utils/hwlog/utils.go +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package hwlog provides the capability of processing Huawei log rules. -package hwlog - -import ( - "bytes" - "context" - "fmt" - "log" - "runtime" - "strings" -) - -// printHelper helper function for log printing -func printHelper(lg *log.Logger, msg string, maxLogLength int, ctx ...context.Context) { - str := getCallerInfo(ctx...) - trimMsg := strings.Replace(msg, "\r", " ", -1) - trimMsg = strings.Replace(trimMsg, "\n", " ", -1) - runeArr := []rune(trimMsg) - if length := len(runeArr); length > maxLogLength { - trimMsg = string(runeArr[:maxLogLength]) - } - lg.Println(str + trimMsg) -} - -// getCallerInfo gets the caller's information -func getCallerInfo(ctx ...context.Context) string { - var deep = stackDeep - var userID interface{} - var traceID interface{} - for _, c := range ctx { - if c == nil { - deep++ - continue - } - userID = c.Value(UserID) - traceID = c.Value(ReqID) - if val := c.Value(extraDeepKey); val != nil { - currentVal, _ := val.(int) // security type assertions, invalid values are automatically zeroed - deep += currentVal - } - } - var funcName string - pc, codePath, codeLine, ok := runtime.Caller(deep) - if ok { - funcName = runtime.FuncForPC(pc).Name() - } - p := strings.Split(codePath, "/") - l := len(p) - if l == pathLen { - funcName = p[l-1] - } else if l > pathLen { - funcName = fmt.Sprintf("%s/%s", p[l-pathLen], p[l-1]) - } - callerPath := fmt.Sprintf("%s:%d", funcName, codeLine) - goroutineID := getGoroutineID() - str := fmt.Sprintf("%-8s%s ", goroutineID, callerPath) - if userID != nil || traceID != nil { - str = fmt.Sprintf("%s{%#v}-{%#v} ", str, userID, traceID) - } - return str -} - -// getCallerGoroutineID gets the goroutineID -func getGoroutineID() string { - b := make([]byte, bitsize, bitsize) - b = b[:runtime.Stack(b, false)] - b = bytes.TrimPrefix(b, []byte("goroutine ")) - b = b[:bytes.IndexByte(b, ' ')] - return 
string(b) -} - -// DeepIncrease increases the stack depth by 1 -func DeepIncrease(ctx context.Context) context.Context { - if ctx == nil { - return context.WithValue(context.Background(), extraDeepKey, 1) - } - - var currentVal int - if val := ctx.Value(extraDeepKey); val != nil { - currentVal, _ = val.(int) // security type assertions, invalid values are automatically zeroed - } - - return context.WithValue(ctx, extraDeepKey, currentVal+1) -} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/utils_test.go b/mind-cluster/component/ascend-common/common-utils/hwlog/utils_test.go deleted file mode 100644 index ca2bda2..0000000 --- a/mind-cluster/component/ascend-common/common-utils/hwlog/utils_test.go +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package hwlog test file -package hwlog - -import ( - "context" - "testing" - - "github.com/smartystreets/goconvey/convey" -) - -func TestUtilsFunc(t *testing.T) { - convey.Convey("test utils", t, func() { - convey.Convey("test utils func", func() { - lg := new(logger) - conf := &LogConfig{OnlyToStdout: true} - userCtx := context.TODO() - userCtx = context.WithValue(userCtx, UserID, 0) - userCtx = context.WithValue(userCtx, ReqID, 0) - err := lg.setLogger(conf) - convey.So(err, convey.ShouldBeNil) - printHelper(lg.lgInfo, "test", defaultMaxEachLineLen) - }) - }) -} diff --git a/mind-cluster/component/ascend-common/common-utils/limiter/limit_handler.go b/mind-cluster/component/ascend-common/common-utils/limiter/limit_handler.go deleted file mode 100644 index fdab9a8..0000000 --- a/mind-cluster/component/ascend-common/common-utils/limiter/limit_handler.go +++ /dev/null @@ -1,226 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package limiter implement a token bucket limiter -package limiter - -import ( - "context" - "errors" - "fmt" - "math" - "net/http" - "regexp" - "strconv" - "strings" - "syscall" - "time" - - "ascend-common/common-utils/cache" - "ascend-common/common-utils/hwlog" - "ascend-common/common-utils/utils" -) - -const ( - kilo = 1000.0 - // DefaultDataLimit default http body limit size - DefaultDataLimit = 1024 * 1024 * 10 - defaultMaxConcurrency = 1024 - maxStringLen = 20 - // DefaultCacheSize default cache size - DefaultCacheSize = 1024 * 100 - arrLen = 2 - // IPReqLimitReg ip request limit regex string - IPReqLimitReg = "^[1-9]\\d{0,2}/[1-9]\\d{0,2}$" -) - -type limitHandler struct { - concurrency chan struct{} - httpHandler http.Handler - log bool - method string - limitBytes int64 - ipExpiredTime time.Duration - ipCache *cache.ConcurrencyLRUCache -} - -// HandlerConfig the configuration of the limitHandler -type HandlerConfig struct { - // PrintLog whether you need print access log, when use gin framework, suggest to set false,otherwise set true - PrintLog bool - // Method only allow setting http method pass - Method string - // LimitBytes set the max http body size - LimitBytes int64 - // TotalConCurrency set the program total concurrent http request - TotalConCurrency int - // IPConCurrency set the signle IP concurrent http request "2/1sec" - IPConCurrency string - // CacheSize the local cacheSize - CacheSize int -} - -// StatusResponseWriter the writer record the http status -type StatusResponseWriter struct { - http.ResponseWriter - http.Hijacker - Status int -} - -// WriteHeader override the WriteHeader method -func (w *StatusResponseWriter) WriteHeader(status int) { - w.ResponseWriter.WriteHeader(status) - w.Status = status -} - -// ServeHTTP implement http.Handler -func (h *limitHandler) ServeHTTP(w http.ResponseWriter, req *http.Request) { - req.Body = http.MaxBytesReader(w, req.Body, h.limitBytes) - ctx := initContext(req) - path := req.URL.Path - 
clientUserAgent := req.UserAgent() - clientIP := utils.ClientIP(req) - if clientIP != "" && h.ipCache != nil { - if !h.ipCache.SetIfNX(fmt.Sprintf("key-%s", clientIP), "v", h.ipExpiredTime) { - hwlog.RunLog.WarnfWithCtx(ctx, "Single IP request reject:%s: %s <%3d> |%15s |%s |%d ", req.Method, - path, http.StatusServiceUnavailable, clientIP, clientUserAgent, syscall.Getuid()) - http.Error(w, "503 too busy", http.StatusServiceUnavailable) - return - } - } - select { - case _, ok := <-h.concurrency: - if !ok { - // channel closed and no need return token - return - } - if h.method != "" && req.Method != h.method { - http.NotFound(w, req) - // recover token to the bucket - h.concurrency <- struct{}{} - return - } - hwlog.RunLog.Debugf("token count:%d", len(h.concurrency)) - start := time.Now() - statusRes := newResponse(w) - h.httpHandler.ServeHTTP(statusRes, req) - stop := time.Since(start) - h.concurrency <- struct{}{} - latency := int(math.Ceil(float64(stop.Nanoseconds()) / kilo / kilo)) - if h.log { - hwlog.RunLog.InfofWithCtx(ctx, "%s %s: %s <%3d> (%dms) |%15s |%s |%d", req.Proto, req.Method, path, - statusRes.Status, latency, clientIP, clientUserAgent, syscall.Getuid()) - } - default: - hwlog.RunLog.WarnfWithCtx(ctx, "Total reject request:%s: %s <%3d> |%15s |%s |%d ", req.Method, path, - http.StatusServiceUnavailable, clientIP, clientUserAgent, syscall.Getuid()) - http.Error(w, "503 too busy", http.StatusServiceUnavailable) - } -} - -func newResponse(w http.ResponseWriter) *StatusResponseWriter { - jk, ok := w.(http.Hijacker) - if !ok { - hwlog.RunLog.Warn("hijack not implement") - } - statusRes := &StatusResponseWriter{ - ResponseWriter: w, - Status: http.StatusOK, - Hijacker: jk, - } - return statusRes -} - -func initContext(req *http.Request) context.Context { - ctx := context.Background() - reqID := req.Header.Get(hwlog.ReqID.String()) - if reqID != "" { - ctx = context.WithValue(context.Background(), hwlog.ReqID, reqID) - } - id := 
req.Header.Get(hwlog.UserID.String()) - if id != "" { - ctx = context.WithValue(ctx, hwlog.UserID, id) - } - return ctx -} - -// NewLimitHandler new a bucket-token limiter -func NewLimitHandler(maxConcur, maxConcurrency int, handler http.Handler, printLog bool) (http.Handler, error) { - return NewLimitHandlerWithMethod(maxConcur, maxConcurrency, handler, printLog, "") -} - -// NewLimitHandlerWithMethod new a bucket-token limiter with specific http method -func NewLimitHandlerWithMethod(maxConcur, maxConcurrency int, handler http.Handler, printLog bool, - httpMethod string) (http.Handler, error) { - if maxConcur < 1 || maxConcur > maxConcurrency { - return nil, errors.New("maxConcurrency parameter error") - } - conchan := make(chan struct{}, maxConcur) - return createHandler(conchan, handler, printLog, httpMethod, DefaultDataLimit), nil -} - -func createHandler(ch chan struct{}, handler http.Handler, printLog bool, - httpMethod string, bodySizeLimit int64) *limitHandler { - h := &limitHandler{ - concurrency: ch, - httpHandler: handler, - log: printLog, - method: httpMethod, - limitBytes: bodySizeLimit, - ipExpiredTime: time.Duration(-1), - } - for i := 0; i < cap(ch); i++ { - h.concurrency <- struct{}{} - } - return h -} - -// NewLimitHandlerV2 new a bucket-token limiter which contains limit request by IP -func NewLimitHandlerV2(handler http.Handler, conf *HandlerConfig) (http.Handler, error) { - if conf == nil { - return nil, errors.New("parameter error") - } - if conf.TotalConCurrency < 1 || conf.TotalConCurrency > defaultMaxConcurrency { - return nil, errors.New("totalConCurrency parameter error") - } - if len(conf.Method) > maxStringLen { - return nil, errors.New("method parameter error") - } - if conf.CacheSize <= 0 { - hwlog.RunLog.Info("use default cache size") - conf.CacheSize = DefaultCacheSize - } - reg := regexp.MustCompile(IPReqLimitReg) - if !reg.Match([]byte(conf.IPConCurrency)) { - return nil, errors.New("IPConCurrency parameter error") - } - conchan 
:= make(chan struct{}, conf.TotalConCurrency) - h := createHandler(conchan, handler, conf.PrintLog, conf.Method, conf.LimitBytes) - arr := strings.Split(conf.IPConCurrency, "/") - if len(arr) != arrLen || arr[0] == "0" { - return nil, errors.New("IPConCurrency parameter error") - } - arr1, err := strconv.ParseInt(arr[1], 0, 0) - if err != nil { - return nil, fmt.Errorf("IPConCurrency parameter(%s) error, parse to int failed: %v", arr[1], err) - } - arr0, err := strconv.ParseInt(arr[0], 0, 0) - if err != nil || arr0 == 0 { - return nil, fmt.Errorf("IPConCurrency parameter(%s) error,parse to int failed: %v", arr[0], err) - } - h.ipExpiredTime = time.Duration(arr1 * int64(time.Second) / arr0) - h.ipCache = cache.New(DefaultCacheSize) - return h, nil - -} diff --git a/mind-cluster/component/ascend-common/common-utils/limiter/limit_handler_test.go b/mind-cluster/component/ascend-common/common-utils/limiter/limit_handler_test.go deleted file mode 100644 index 69dbb8e..0000000 --- a/mind-cluster/component/ascend-common/common-utils/limiter/limit_handler_test.go +++ /dev/null @@ -1,119 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package limiter implement a token bucket limiter -package limiter - -import ( - "context" - "net/http" - "net/url" - "testing" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" - - "ascend-common/common-utils/hwlog" -) - -func init() { - config := hwlog.LogConfig{ - OnlyToStdout: true, - } - hwlog.InitRunLogger(&config, context.TODO()) -} -func TestServeHTTP(t *testing.T) { - convey.Convey("test limitHandler serveHTTP", t, func() { - h, w, r := initVarable() - convey.Convey("header contains reqID and userID,", func() { - mock := gomonkey.ApplyMethodFunc(h.httpHandler, "ServeHTTP", func(http.ResponseWriter, - *http.Request) { - return - }) - defer mock.Reset() - h.ServeHTTP(w.ResponseWriter, r) - convey.So(len(h.concurrency), convey.ShouldEqual, 1) - }) - convey.Convey("token channel close,", func() { - mock := gomonkey.ApplyFunc(http.Error, func(http.ResponseWriter, string, int) { - return - }) - defer mock.Reset() - _, ok := <-h.concurrency - if !ok { - return - } - h.ServeHTTP(w.ResponseWriter, r) - convey.So(len(h.concurrency), convey.ShouldEqual, 0) - }) - }) -} - -func initVarable() (*limitHandler, StatusResponseWriter, *http.Request) { - lh, err := NewLimitHandler(1, len2, http.DefaultServeMux, false) - if err != nil { - return nil, StatusResponseWriter{}, nil - } - v, ok := lh.(*limitHandler) - if !ok { - return nil, StatusResponseWriter{}, nil - } - w := StatusResponseWriter{ - ResponseWriter: nil, - Status: 0, - } - r := &http.Request{ - URL: &url.URL{ - Path: "test.com", - }, - Header: map[string][]string{"userID": {"1"}, "reqID": {"requestIDxxxx"}}, - Method: "GET", - } - return v, w, r -} - -func TestNewLimitHandlerV2(t *testing.T) { - conf := &HandlerConfig{ - PrintLog: false, - Method: "", - LimitBytes: DefaultDataLimit, - TotalConCurrency: defaultMaxConcurrency, - IPConCurrency: "2/1", - CacheSize: DefaultCacheSize, - } - convey.Convey("normal situation,no err return", t, func() { - _, err := 
NewLimitHandlerV2(http.DefaultServeMux, conf) - convey.So(err, convey.ShouldEqual, nil) - }) - convey.Convey("IPConCurrency parameter error", t, func() { - conf.IPConCurrency = "2021/1" - _, err := NewLimitHandlerV2(http.DefaultServeMux, conf) - convey.So(err, convey.ShouldNotEqual, nil) - }) - convey.Convey("cacheSize parameter error", t, func() { - conf.CacheSize = 0 - _, err := NewLimitHandlerV2(http.DefaultServeMux, conf) - convey.So(err, convey.ShouldNotEqual, nil) - }) - convey.Convey("method parameter error", t, func() { - conf.Method = "20/iajsdkjas2jhjdklsjkldjsdfasd1" - _, err := NewLimitHandlerV2(http.DefaultServeMux, conf) - convey.So(err, convey.ShouldNotEqual, nil) - }) - convey.Convey("TotalConCurrency parameter error", t, func() { - conf.TotalConCurrency = 0 - _, err := NewLimitHandlerV2(http.DefaultServeMux, conf) - convey.So(err, convey.ShouldNotEqual, nil) - }) -} diff --git a/mind-cluster/component/ascend-common/common-utils/limiter/limit_listener.go b/mind-cluster/component/ascend-common/common-utils/limiter/limit_listener.go deleted file mode 100644 index b81d511..0000000 --- a/mind-cluster/component/ascend-common/common-utils/limiter/limit_listener.go +++ /dev/null @@ -1,161 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package limiter implement a token bucket limit listener, refer to "golang.org/x/net/netutil" and -// change the acquire method, if acquire failed, return false immediately -package limiter - -import ( - "errors" - "fmt" - "net" - "strings" - "sync" - "time" - - "ascend-common/common-utils/cache" - "ascend-common/common-utils/hwlog" -) - -const ( - maxConnection = 1024 - maxIPConnection = 512 - - largeMaxConnection = 16384 -) - -func commonLimitListener(l net.Listener, totalConnLimit, IPConnLimit, cacheSize int) (net.Listener, error) { - if IPConnLimit < 0 || IPConnLimit > maxIPConnection { - return nil, errors.New("the parameter IPConnLimit is illegal") - } - bucket := make(chan struct{}, totalConnLimit) - ll := &localLimitListener{ - Listener: l, - buckets: bucket, - ipConnLimit: int64(IPConnLimit), - } - if cacheSize > 0 { - ll.ipCache = cache.New(cacheSize) - } - return ll, nil -} - -// LimitListener returns a Listener that accepts at most n connections at the same time -func LimitListener(l net.Listener, totalConnLimit, IPConnLimit, cacheSize int) (net.Listener, error) { - if totalConnLimit < 0 || totalConnLimit > maxConnection { - return nil, errors.New("the parameter totalConnLimit is illegal") - } - return commonLimitListener(l, totalConnLimit, IPConnLimit, cacheSize) -} - -type localLimitListener struct { - net.Listener - buckets chan struct{} - closeOnce sync.Once - ipCache *cache.ConcurrencyLRUCache - ipConnLimit int64 -} - -// acquire acquires the limiting semaphore. 
Returns true if successfully -// accquired, false if the listener is closed or reach the max limit -func (l *localLimitListener) acquire() bool { - select { - case l.buckets <- struct{}{}: - return true - default: - return false - } -} -func (l *localLimitListener) release() { <-l.buckets } - -// Accept implement net.Listener interface -func (l *localLimitListener) Accept() (net.Conn, error) { - c, err := l.Listener.Accept() - if err != nil { - return nil, err - } - // ip connection limit - ip, cacheKey := getIpAndKey(c) - if ip != "" && l.ipCache != nil { - if counts, err := l.ipCache.INCR(cacheKey, -1); err == nil && counts > l.ipConnLimit { - hwlog.RunLog.Warn("ip connections reach max limit, connection will to force closed") - return closeImmediately(c, l.ipCache), nil - } - } - // total tcp connection limit - if l.acquire() { - return &limitListenerConn{Conn: c, release: l.release, ipCache: l.ipCache}, nil - } - hwlog.RunLog.Warn("limit forbidden, connection will to force closed") - return closeImmediately(c, l.ipCache), nil - -} - -func getIpAndKey(c net.Conn) (string, string) { - ipWithPort := c.RemoteAddr().String() - if ipWithPort != "" { - s := strings.Split(ipWithPort, ":") - return s[0], fmt.Sprintf("key-conn-%s", s[0]) - } - return "", "" -} - -func closeImmediately(c net.Conn, lruCache *cache.ConcurrencyLRUCache) net.Conn { - // once the connection reach the max limit, force close the connection - tcpConn, ok := c.(*net.TCPConn) - if ok { - if err := tcpConn.SetLinger(0); err != nil { - hwlog.RunLog.Warnf("Error when setting linger: %s", err) - } - } - - err := c.Close() - if err != nil { - hwlog.RunLog.Warn(err) - } - return &limitListenerConn{Conn: c, release: func() {}, ipCache: lruCache} -} - -// Close implement net.Listener interface -func (l *localLimitListener) Close() error { - err := l.Listener.Close() - l.closeOnce.Do(func() { close(l.buckets) }) - return err -} - -type limitListenerConn struct { - net.Conn - releaseOnce sync.Once - release 
func() - ipCache *cache.ConcurrencyLRUCache -} - -// Close override net.Conn interface -func (l *limitListenerConn) Close() error { - err := l.Conn.Close() - if err != nil { - hwlog.RunLog.Debugf("close grpc connect failed: %v", err) - return fmt.Errorf("close grpc connect failed: %v", err) - } - l.releaseOnce.Do(l.release) - ip, cacheKey := getIpAndKey(l.Conn) - if ip != "" && l.ipCache != nil { - d, err := l.ipCache.DECR(cacheKey, time.Hour) - if err != nil { - hwlog.RunLog.Error(err) - } - hwlog.RunLog.Debugf("decrement ip connections %d", d) - } - return err -} diff --git a/mind-cluster/component/ascend-common/common-utils/limiter/limit_listener_test.go b/mind-cluster/component/ascend-common/common-utils/limiter/limit_listener_test.go deleted file mode 100644 index 631e1bb..0000000 --- a/mind-cluster/component/ascend-common/common-utils/limiter/limit_listener_test.go +++ /dev/null @@ -1,125 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package limiter implement a token bucket limiter -package limiter - -import ( - "errors" - "net" - "testing" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" -) - -const ( - len2 = 2 -) - -func TestLimitListenerAccept(t *testing.T) { - convey.Convey("test Accept function", t, func() { - - limitLor, err := LimitListener(&mockLicener{}, len2, len2, DefaultCacheSize) - if err != nil { - return - } - l, ok := limitLor.(*localLimitListener) - if !ok { - return - } - mock2 := gomonkey.ApplyFunc(getIpAndKey, func(net.Conn) (string, string) { - return "127.0.0.1", "key-127.0.0.1" - }) - defer mock2.Reset() - convey.Convey("acquire token success", func() { - _, err = l.Accept() - convey.So(err, convey.ShouldEqual, nil) - }) - - convey.Convey("accept failed", func() { - mock := gomonkey.ApplyMethodFunc(l.Listener, "Accept", func() (net.Conn, error) { - return nil, errors.New("mock error") - }) - defer mock.Reset() - con, err := l.Accept() - convey.So(err, convey.ShouldNotEqual, nil) - convey.So(con, convey.ShouldEqual, nil) - }) - - convey.Convey("acquire token failed", func() { - mock := gomonkey.ApplyPrivateMethod(l, "acquire", func(*localLimitListener) bool { - return false - }) - defer mock.Reset() - con, err := l.Accept() - convey.So(err, convey.ShouldEqual, nil) - conm, ok := con.(*limitListenerConn) - if !ok { - return - } - convey.So(conm.release, convey.ShouldNotEqual, nil) - }) - - }) -} - -type mockLicener struct { -} - -func (l *mockLicener) Accept() (net.Conn, error) { - return &net.TCPConn{}, nil -} - -func (l *mockLicener) Addr() net.Addr { - return &net.IPAddr{ - IP: []byte("127.0.0.1"), - Zone: "", - } -} - -func (l *mockLicener) Close() error { - return nil -} - -func TestGetIpAndKey(t *testing.T) { - convey.Convey("test getIp function", t, func() { - c := net.TCPConn{} - mock := gomonkey.ApplyMethodFunc(&c, "RemoteAddr", func() net.Addr { - return &net.IPAddr{ - IP: []byte("127.0.0.1"), - Zone: "", - } - }) - 
defer mock.Reset() - ip, _ := getIpAndKey(&c) - convey.So(ip, convey.ShouldNotEqual, "") - }) -} - -func TestLimitListener(t *testing.T) { - convey.Convey("test new listener function success", t, func() { - l, err := LimitListener(&mockLicener{}, maxConnection, maxIPConnection, DefaultDataLimit) - convey.So(l, convey.ShouldNotEqual, nil) - convey.So(err, convey.ShouldEqual, nil) - }) - convey.Convey("test new listener function", t, func() { - _, err := LimitListener(&mockLicener{}, maxConnection+1, maxIPConnection, DefaultDataLimit) - convey.So(err, convey.ShouldNotEqual, nil) - }) - convey.Convey("test new listener function", t, func() { - _, err := LimitListener(&mockLicener{}, maxConnection, maxIPConnection+1, DefaultDataLimit) - convey.So(err, convey.ShouldNotEqual, nil) - }) -} diff --git a/mind-cluster/component/ascend-common/common-utils/limiter/limit_writer.go b/mind-cluster/component/ascend-common/common-utils/limiter/limit_writer.go deleted file mode 100644 index 9117d07..0000000 --- a/mind-cluster/component/ascend-common/common-utils/limiter/limit_writer.go +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package limiter implement a writer limiter -package limiter - -import ( - "bytes" - "errors" - - "ascend-common/common-utils/hwlog" -) - -const defaultLimit = 1024 - -// LimitedWriter limit the size of written data -type LimitedWriter struct { - buffer *bytes.Buffer - limit int - size int -} - -// NewLimitedWriter create a LimitedWriter -func NewLimitedWriter(limit int) *LimitedWriter { - if limit <= 0 { - hwlog.RunLog.Warnf("limit: %v is invalid, set default limit: %v", limit, defaultLimit) - limit = defaultLimit - } - return &LimitedWriter{ - buffer: &bytes.Buffer{}, - limit: limit, - } -} - -// Write write bytes to buffer -func (lw *LimitedWriter) Write(p []byte) (int, error) { - if lw.size+len(p) > lw.limit { - return 0, errors.New("buffer limit exceeded") - } - n, err := lw.buffer.Write(p) - if err == nil { - lw.size += n - } - return n, err -} - -// GetBufferBytes get buffer bytes -func (lw *LimitedWriter) GetBufferBytes() []byte { - if lw.buffer == nil { - return []byte{} - } - return lw.buffer.Bytes() -} diff --git a/mind-cluster/component/ascend-common/common-utils/limiter/limit_writer_test.go b/mind-cluster/component/ascend-common/common-utils/limiter/limit_writer_test.go deleted file mode 100644 index 9a308f3..0000000 --- a/mind-cluster/component/ascend-common/common-utils/limiter/limit_writer_test.go +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package limiter implement a writer limiter -package limiter - -import ( - "io" - "testing" - - "github.com/smartystreets/goconvey/convey" -) - -func TestLimitWriterWrite(t *testing.T) { - convey.Convey("test limiter Writer write function", t, func() { - data := []byte("test") - limitBuffer := NewLimitedWriter(len(data)) - - n, err := limitBuffer.Write(data) - convey.So(err, convey.ShouldBeNil) - convey.So(n, convey.ShouldEqual, len(data)) - n, err = limitBuffer.Write(data) - convey.So(err, convey.ShouldEqual, io.EOF) - convey.So(n, convey.ShouldEqual, 0) - }) -} diff --git a/mind-cluster/component/ascend-common/common-utils/rand/rand_linux.go b/mind-cluster/component/ascend-common/common-utils/rand/rand_linux.go deleted file mode 100644 index 1a97a1b..0000000 --- a/mind-cluster/component/ascend-common/common-utils/rand/rand_linux.go +++ /dev/null @@ -1,71 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package rand implement the security rand -package rand - -import ( - "errors" - "fmt" - "io" - "os" - "runtime" - "sync" - "time" -) - -const ( - maxReadSize = 1<<25 - 1 -) - -// A randomReader satisfies reads by reading the file named name. 
-type randomReader struct { - f io.Reader - mu sync.Mutex -} - -func init() { - Reader = &randomReader{} -} - -func warnBlocked() { - fmt.Println("mindx-security/rand: blocked for 60 seconds waiting to read random data from the kernel") -} - -var supportOs = "linux" - -// Read implements the interface of io.Reader -func (r *randomReader) Read(b []byte) (int, error) { - t := time.AfterFunc(time.Minute, warnBlocked) - defer t.Stop() - if len(b) > maxReadSize { - return 0, errors.New("byte size is too large") - } - r.mu.Lock() - defer r.mu.Unlock() - if runtime.GOOS != supportOs { - return 0, errors.New("not supported") - } - f, err := os.Open("/dev/random") - if err != nil { - return 0, err - } - defer func() { - err = f.Close() - if err != nil { - fmt.Println("close random file failed") - } - }() - return f.Read(b) -} diff --git a/mind-cluster/component/ascend-common/common-utils/rand/rand_linux_test.go b/mind-cluster/component/ascend-common/common-utils/rand/rand_linux_test.go deleted file mode 100644 index b02d9d6..0000000 --- a/mind-cluster/component/ascend-common/common-utils/rand/rand_linux_test.go +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package rand implement the security rand -package rand - -import ( - "testing" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" -) - -const ( - illegalSize = 1 << 25 -) - -func TestInnerRead(t *testing.T) { - convey.Convey("test random read func", t, func() { - reader := &randomReader{} - convey.Convey("read size too large, err returned", func() { - bs := make([]byte, illegalSize, illegalSize) - r, err := reader.Read(bs) - convey.So(err.Error(), convey.ShouldEqual, "byte size is too large") - convey.So(r, convey.ShouldEqual, 0) - }) - convey.Convey("windows,err returned", func() { - mock := gomonkey.ApplyGlobalVar(&supportOs, "windows") - defer mock.Reset() - bs := make([]byte, 1, 1) - r, err := reader.Read(bs) - convey.So(err.Error(), convey.ShouldEqual, "not supported") - convey.So(r, convey.ShouldEqual, 0) - }) - convey.Convey("normal situation,no err returned", func() { - // the length of byte is one, to prevent block when generate random - bs := make([]byte, 1, 1) - r, err := reader.Read(bs) - convey.So(err, convey.ShouldEqual, nil) - convey.So(r, convey.ShouldEqual, 1) - }) - }) -} diff --git a/mind-cluster/component/ascend-common/common-utils/rand/random.go b/mind-cluster/component/ascend-common/common-utils/rand/random.go deleted file mode 100644 index 353d868..0000000 --- a/mind-cluster/component/ascend-common/common-utils/rand/random.go +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package rand implement the security io.Reader -package rand - -import ( - "io" -) - -// Reader rand reader to generate security random bytes -var Reader io.Reader - -// Read is a helper function that calls Reader.Read using io.ReadFull. -func Read(b []byte) (int, error) { - return io.ReadFull(Reader, b) -} diff --git a/mind-cluster/component/ascend-common/common-utils/rand/random_test.go b/mind-cluster/component/ascend-common/common-utils/rand/random_test.go deleted file mode 100644 index 04ce333..0000000 --- a/mind-cluster/component/ascend-common/common-utils/rand/random_test.go +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package rand implement the security rand -package rand - -import ( - "testing" - - "github.com/smartystreets/goconvey/convey" -) - -func TestRead(t *testing.T) { - convey.Convey("package function test,normal situation", t, func() { - // the length of byte is one, to prevent block when generate random - bs := make([]byte, 1, 1) - l, err := Read(bs) - convey.So(err, convey.ShouldEqual, nil) - convey.So(l, convey.ShouldEqual, 1) - }) -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/env.go b/mind-cluster/component/ascend-common/common-utils/utils/env.go deleted file mode 100644 index 4402375..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/env.go +++ /dev/null @@ -1,35 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package utils env function -package utils - -import ( - "fmt" - "os/user" - "strconv" -) - -// GetCurrentUid get current uid -func GetCurrentUid() (uint32, error) { - userInfo, err := user.Current() - if err != nil { - return 0, fmt.Errorf("get current user info failed: %v", err) - } - uid, err := strconv.Atoi(userInfo.Uid) - if err != nil { - return 0, fmt.Errorf("convert uid to int failed: %v", err) - } - return uint32(uid), nil -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/env_test.go b/mind-cluster/component/ascend-common/common-utils/utils/env_test.go deleted file mode 100644 index 95d8983..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/env_test.go +++ /dev/null @@ -1,51 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package utils env test -package utils - -import ( - "fmt" - "os/user" - "testing" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" -) - -func TestGetCurrentUid(t *testing.T) { - convey.Convey("test func GetCurrentUid success", t, func() { - var p1 = gomonkey.ApplyFuncReturn(user.Current, &user.User{Uid: "0"}, nil) - defer p1.Reset() - uid, err := GetCurrentUid() - convey.So(err, convey.ShouldBeNil) - convey.So(uid, convey.ShouldEqual, 0) - }) - convey.Convey("test func GetCurrentUid failed, get current user info failed", t, func() { - var p1 = gomonkey.ApplyFuncReturn(user.Current, nil, testErr) - defer p1.Reset() - uid, err := GetCurrentUid() - expErr := fmt.Errorf("get current user info failed: %v", testErr) - convey.So(err, convey.ShouldResemble, expErr) - convey.So(uid, convey.ShouldEqual, 0) - }) - convey.Convey("test func GetCurrentUid failed, uid is invalid", t, func() { - var p1 = gomonkey.ApplyFuncReturn(user.Current, &user.User{Uid: "invalid uid"}, nil) - defer p1.Reset() - uid, err := GetCurrentUid() - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldContainSubstring, "convert uid to int failed") - convey.So(uid, convey.ShouldEqual, 0) - }) -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/file.go b/mind-cluster/component/ascend-common/common-utils/utils/file.go deleted file mode 100644 index 253e2b5..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/file.go +++ /dev/null @@ -1,176 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package utils provides the util func -package utils - -import ( - "errors" - "fmt" - "io" - "io/ioutil" - "os" - "path/filepath" - "reflect" - "strings" -) - -const ( - // FileMode file privilege - FileMode = 0600 - // Size10M bytes of 10M - Size10M = 10 * 1024 * 1024 - maxSize = 1024 * 1024 * 1024 -) - -// ReadLimitBytes read limit length of contents from file path -func ReadLimitBytes(path string, limitLength int) ([]byte, error) { - if limitLength < 0 || limitLength > maxSize { - return nil, errors.New("the limit length is not valid") - } - - key, err := CheckPath(path) - if err != nil { - return nil, err - } - file, err := os.OpenFile(key, os.O_RDONLY, FileMode) - if err != nil { - return nil, errors.New(fmt.Sprintf("open file with read-only and %04o mode failed", FileMode)) - } - defer file.Close() - buf := make([]byte, limitLength, limitLength) - l, err := file.Read(buf) - if err != nil { - return nil, fmt.Errorf("read file failed: %v", err) - } - return buf[0:l], nil -} - -// LoadFile load file content -func LoadFile(filePath string) ([]byte, error) { - if filePath == "" { - return nil, nil - } - absPath, err := filepath.Abs(filePath) - if err != nil { - return nil, fmt.Errorf("the filePath is invalid: %v", err) - } - if !IsExist(absPath) { - return nil, nil - } - - return ReadLimitBytes(absPath, Size10M) -} - -func closeFile(file *os.File) { - if file == nil { - return - } - if err := file.Close(); err != nil { - return - } - return -} - -// CopyFile copy file -func CopyFile(src, dst string) error { - src, err := CheckPath(src) - if err != nil { - 
return err - } - if IsExist(dst) { - dst, err = CheckPath(dst) - if err != nil { - return err - } - } - - srcFile, err := os.Open(src) - if err != nil { - return err - } - defer closeFile(srcFile) - - srcInfo, err := os.Stat(src) - if err != nil { - return err - } - - dstFile, err := os.OpenFile(dst, os.O_RDWR|os.O_CREATE|os.O_TRUNC, srcInfo.Mode()) - if err != nil { - return err - } - defer closeFile(dstFile) - - if _, err = io.Copy(dstFile, srcFile); err != nil { - return err - } - return os.Chmod(dst, srcInfo.Mode()) -} - -// CopyDir recursively copy files -func CopyDir(src string, dst string) error { - var ( - err error - fds []os.FileInfo = nil - dstInfo os.FileInfo - ) - - if dstInfo, err = os.Stat(src); err != nil { - return err - } - if err = os.MkdirAll(dst, dstInfo.Mode()); err != nil { - return err - } - if subFolder(src, dst) { - return errors.New("the destination directory is a subdirectory of the source directory") - } - if fds, err = ioutil.ReadDir(src); err != nil { - return err - } - for _, fd := range fds { - srcFile := filepath.Join(src, fd.Name()) - dstFile := filepath.Join(dst, fd.Name()) - if fd.IsDir() { - if err = CopyDir(srcFile, dstFile); err != nil { - return err - } - } else { - if err = CopyFile(srcFile, dstFile); err != nil { - return err - } - } - } - return nil -} - -func subFolder(src, dst string) bool { - if src == dst { - return true - } - srcReal, err := filepath.EvalSymlinks(src) - if err != nil { - return false - } - dstReal, err := filepath.EvalSymlinks(dst) - if err != nil { - return false - } - srcList := strings.Split(srcReal, string(os.PathSeparator)) - dstList := strings.Split(dstReal, string(os.PathSeparator)) - if len(srcList) > len(dstList) { - return false - } - return reflect.DeepEqual(srcList, dstList[:len(srcList)]) -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/file_check.go b/mind-cluster/component/ascend-common/common-utils/utils/file_check.go deleted file mode 100644 index 
4134245..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/file_check.go +++ /dev/null @@ -1,240 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package utils provides the util func -package utils - -import ( - "fmt" - "io/fs" - "os" - "path/filepath" - "strings" - "syscall" -) - -const ( - notValidPath = "not-valid-file-path" - maxAllowFileSize int64 = 1024 * 100 // in megabytes - oneMegabytes int64 = 1024 * 1024 - // DefaultWhiteList default white list in string - DefaultWhiteList = "-_./~" - // DefaultStringLength default string max length - DefaultStringLength = 256 - // DefaultPathLength default path max length - DefaultPathLength = 4096 -) - -// RealFileChecker Check whether the file is valid -func RealFileChecker(path string, checkParent, allowLink bool, size int64) (string, error) { - realPath, fileInfo, err := realPathChecker(path, checkParent, allowLink) - if err != nil { - return notValidPath, err - } - if fileInfo.IsDir() { - return notValidPath, fmt.Errorf("invalid dir") - } - if !fileInfo.Mode().IsRegular() { - return notValidPath, fmt.Errorf("invalid regular file") - } - if size > maxAllowFileSize || size < 0 { - return notValidPath, fmt.Errorf("invalid size") - } - if fileInfo.Size() > size*oneMegabytes { - return notValidPath, fmt.Errorf("size too large") - } - return realPath, nil -} - -// RealDirChecker Check whether the directory is valid -func 
RealDirChecker(path string, checkParent, allowLink bool) (string, error) { - realPath, fileInfo, err := realPathChecker(path, checkParent, allowLink) - if err != nil { - return notValidPath, err - } - if !fileInfo.IsDir() { - return notValidPath, fmt.Errorf("is not dir") - } - return realPath, nil -} - -// PathStringChecker Check whether the directory string is valid -func PathStringChecker(path string) (string, error) { - realPath, err := filepath.Abs(path) - if err != nil { - return notValidPath, err - } - if len(realPath) > DefaultPathLength { - return notValidPath, fmt.Errorf("path over max path length") - } - if !stringChecker(realPath, 0, DefaultPathLength) { - return notValidPath, fmt.Errorf("invalid path") - } - if err = pathDepthChecker(realPath, 0); err != nil { - return notValidPath, err - } - return realPath, nil -} - -// VerifyFile verify the file after it is opened. -func VerifyFile(file *os.File, size int64) error { - fileInfo, err := file.Stat() - if err != nil { - return err - } - if size > maxAllowFileSize || size < 0 { - return fmt.Errorf("invalid size") - } - if fileInfo.Size() > size*oneMegabytes { - return fmt.Errorf("file size error %v", fileInfo.Size()) - } - if (fileInfo.Mode() & fs.ModeSymlink) != 0 { - return fmt.Errorf("file is softlink") - } - if st := fileInfo.Sys(); st.(*syscall.Stat_t).Uid != uint32(os.Geteuid()) { - return fmt.Errorf("file owner incorrect") - } - return nil -} - -// SafeChmod after the verification is complete, run the chmod command. 
-func SafeChmod(path string, size int64, mode os.FileMode) error { - file, err := os.Open(path) - if err != nil { - return err - } - defer file.Close() - if err = VerifyFile(file, size); err != nil { - return err - } - if err = file.Chmod(mode); err != nil { - return err - } - return nil -} - -func realPathChecker(path string, checkParent, allowLink bool) (string, os.FileInfo, error) { - realPath, err := filepath.Abs(path) - if err != nil { - return notValidPath, nil, err - } - if len(realPath) > DefaultPathLength { - return notValidPath, nil, fmt.Errorf("path over max path length") - } - if !stringChecker(realPath, 0, DefaultPathLength) { - return notValidPath, nil, fmt.Errorf("invalid path") - } - if err = fileChecker(realPath, true, checkParent, allowLink, 0); err != nil { - return notValidPath, nil, err - } - fileInfo, err := os.Stat(realPath) - if err != nil { - return notValidPath, nil, err - } - return realPath, fileInfo, nil -} - -func fileChecker(path string, allowDir, checkParent, allowLink bool, deep int) error { - const maxDepth int = 99 - if deep > maxDepth { - return fmt.Errorf("over maxDepth %d", maxDepth) - } - fileInfo, err := normalFileCheck(path, allowDir, allowLink) - if err != nil { - return err - } - if err = checkOwnerAndPermission(fileInfo, path); err != nil { - return err - } - if path != "/" && checkParent { - return fileChecker(filepath.Dir(path), true, true, allowLink, deep+1) - } - return nil -} - -func pathDepthChecker(path string, deep int) error { - const maxDepth int = 99 - if deep > maxDepth { - return fmt.Errorf("over maxDepth %d", maxDepth) - } - if path != "/" { - return pathDepthChecker(filepath.Dir(path), deep+1) - } - return nil -} - -func checkOwnerAndPermission(fileInfo os.FileInfo, filePath string) error { - const groupWriteIndex, otherWriteIndex, permLength int = 5, 8, 10 - perm := fileInfo.Mode().Perm().String() - if len(perm) != permLength { - return fmt.Errorf("permission not right %v %v", filePath, perm) - } - for 
index, char := range perm { - if (index == groupWriteIndex || index == otherWriteIndex) && char == 'w' { - return fmt.Errorf("write permission not right %v %v", filePath, perm) - } - } - stat, ok := fileInfo.Sys().(*syscall.Stat_t) - if !ok { - return fmt.Errorf("can not get stat %v", filePath) - } - if !(int(stat.Uid) == 0 || int(stat.Uid) == os.Getuid()) { - return fmt.Errorf("owner not right %v %v", filePath, int(stat.Uid)) - } - return nil -} - -func normalFileCheck(filePath string, allowDir, allowLink bool) (os.FileInfo, error) { - realPath, err := filepath.EvalSymlinks(filePath) - if err != nil || (realPath != filePath && !allowLink) { - return nil, fmt.Errorf("symlinks or not existed, failed %v, %v", filePath, err) - } - fileInfo, err := os.Stat(filePath) - if err != nil { - return nil, fmt.Errorf("get file stat failed %v", err) - } - if allowDir && !fileInfo.Mode().IsRegular() && !fileInfo.IsDir() { - return nil, fmt.Errorf("not regular file/dir %v", filePath) - } - if !allowDir && !fileInfo.Mode().IsRegular() { - return nil, fmt.Errorf("not regular file %v", filePath) - } - if fileInfo.Mode()&os.ModeSetuid != 0 { - return nil, fmt.Errorf("setuid not allowed %v", filePath) - } - if fileInfo.Mode()&os.ModeSetgid != 0 { - return nil, fmt.Errorf("setgid not allowed %v", filePath) - } - return fileInfo, nil -} - -func isValidCode(c rune) bool { - return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || ('0' <= c && c <= '9') -} - -func isInWhiteList(c rune) bool { - return strings.Contains(DefaultWhiteList, string(c)) -} - -func stringChecker(text string, minLength, maxLength int) bool { - if len(text) <= minLength || len(text) >= maxLength { - return false - } - for _, char := range text { - if !isValidCode(char) && !isInWhiteList(char) { - return false - } - } - return true -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/file_check_test.go b/mind-cluster/component/ascend-common/common-utils/utils/file_check_test.go deleted file mode 
100644 index 3c8e065..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/file_check_test.go +++ /dev/null @@ -1,194 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package mindxcheckutils is a check utils package -package utils - -import ( - "os" - "strings" - "testing" -) - -func TestNormalFileCheckRegularFile(t *testing.T) { - tmpDir, filePath, err := createTestFile(t, "test_file.txt") - defer removeTmpDir(t, tmpDir) - err = os.Symlink(filePath, tmpDir+"/syslink") - if err != nil { - t.Fatalf("create symlink failed %q: %s", filePath, err) - } - - if _, err = normalFileCheck(tmpDir, true, false); err != nil { - t.Fatalf("check allow dir failed %q: %s", tmpDir+"/__test__", err) - } - - if _, err = normalFileCheck(tmpDir, false, false); !strings.Contains(err.Error(), "not regular file") { - t.Fatalf("check not allow dir failed %q: %s", tmpDir+"/__test__", err) - } - - if _, err = normalFileCheck("/dev/zero", true, false); !strings.Contains(err.Error(), "not regular file/dir") { - t.Fatalf("check /dev/zero failed %q: %s", tmpDir+"/__test__", err) - } - - if _, err = normalFileCheck(tmpDir+"/syslink", false, false); !strings.Contains(err.Error(), "symlinks") { - t.Fatalf("check symlinks failed %q: %s", tmpDir+"/syslink", err) - } - - if _, err = normalFileCheck(filePath, false, false); err != nil { - t.Fatalf("check failed %q: %s", filePath, err) - } - - if _, err = 
normalFileCheck(tmpDir+"/notexisted", false, false); !strings.Contains(err.Error(), "not existed") { - t.Fatalf("check symlinks failed %q: %s", tmpDir+"/syslink", err) - } -} - -func TestRealFileChecker(t *testing.T) { - tmpDir, filePath, err := createTestFile(t, "test_file.txt") - if err != nil { - t.Fatalf("create file failed %q: %s", filePath, err) - } - defer removeTmpDir(t, tmpDir) - const permission os.FileMode = 0700 - err = os.WriteFile(filePath, []byte("hello\n"), permission) - if err != nil { - t.Fatalf("create file failed %q: %s", filePath, err) - } - if _, err = RealFileChecker(filePath, false, true, 0); err == nil { - t.Fatalf("size check wrong 0 %q: %s", filePath, err) - } - if _, err = RealFileChecker(filePath, false, true, 1); err != nil { - t.Fatalf("size check wrong 1 %q: %s", filePath, err) - } -} - -func TestRealFileCheckerInside(t *testing.T) { - tmpDir, filePath, err := createTestFile(t, "test_file.txt") - if err != nil { - t.Fatalf("create file failed %q: %s", filePath, err) - } - defer removeTmpDir(t, tmpDir) - const permission os.FileMode = 0700 - const deep int = 100 - err = os.WriteFile(filePath, []byte("hello\n"), permission) - if err != nil { - t.Fatalf("create file failed %q: %s", filePath, err) - } - if err = fileChecker(filePath, false, false, false, deep); err == nil { - t.Fatalf("size check wrong 0 %q: %s", filePath, err) - } -} - -func TestRealDirChecker(t *testing.T) { - tmpDir, filePath, err := createTestFile(t, "test_file.txt") - if err != nil { - t.Fatalf("create file failed %q: %s", filePath, err) - } - defer removeTmpDir(t, tmpDir) - if _, err = RealDirChecker(filePath, false, true); err == nil { - t.Fatalf("should be dir 0 %q: %s", filePath, err) - } - if _, err = RealDirChecker(tmpDir, false, true); err != nil { - t.Fatalf("should be dir 1 %q: %s", filePath, err) - } -} - -func TestVerifyFile(t *testing.T) { - tmpDir, filePath, err := createTestFile(t, "test_file.txt") - if err != nil { - t.Fatalf("create file failed %q: 
%s", filePath, err) - } - defer removeTmpDir(t, tmpDir) - err = os.Symlink(filePath, tmpDir+"/syslink") - if err != nil { - t.Fatalf("create symlink failed %q: %s", filePath, err) - } - file, err := os.Open(filePath) - if err != nil { - t.Fatalf("open file failed") - } - defer file.Close() - linkFile, err := os.Open(tmpDir + "/syslink") - if err != nil { - t.Fatalf("open file failed") - } - defer linkFile.Close() - const permission os.FileMode = 0700 - err = os.WriteFile(filePath, []byte("hello\n"), permission) - if err != nil { - t.Fatalf("create file failed %q: %s", filePath, err) - } - if err = VerifyFile(file, 0); err == nil { - t.Fatalf("size check wrong 0 %q: %s", filePath, err) - } - if err = VerifyFile(file, 1); err != nil { - t.Fatalf("size check wrong 1 %q: %s", filePath, err) - } - if err = VerifyFile(linkFile, 1); err != nil && !strings.Contains(err.Error(), "symlinks") { - t.Fatalf("check symlinks failed %q: %s", tmpDir+"/syslink", err) - } -} - -func TestStringChecker(t *testing.T) { - if ok := stringChecker("0123456789abcABC", 0, DefaultStringLength); !ok { - t.Fatalf("failed on regular letters") - } - const testSize = 3 - if ok := stringChecker("123", 0, testSize); ok { - t.Fatalf("failed on max length") - } - if ok := stringChecker("1234", 0, testSize); ok { - t.Fatalf("failed on max length") - } - if ok := stringChecker("12", 0, testSize); !ok { - t.Fatalf("failed on max length") - } - if ok := stringChecker("", 0, testSize); ok { - t.Fatalf("failed on min length") - } - if ok := stringChecker("123", testSize, DefaultStringLength); ok { - t.Fatalf("failed on min length") - } - if ok := stringChecker("123%", 0, DefaultStringLength); ok { - t.Fatalf("failed on strange words") - } - if ok := stringChecker("123.-/~", 0, DefaultStringLength); !ok { - t.Fatalf("failed on strange words") - } -} - -func createTestFile(t *testing.T, fileName string) (string, string, error) { - const fileMode os.FileMode = 0600 - tmpDir := os.TempDir() - const permission 
os.FileMode = 0700 - if os.MkdirAll(tmpDir+"/__test__", permission) != nil { - t.Fatalf("MkdirAll failed %q", tmpDir+"/__test__") - } - f, err := os.Create(tmpDir + "/__test__" + fileName) - if err != nil { - t.Fatalf("create file failed %q: %s", tmpDir+"/__test__", err) - } - defer f.Close() - err = f.Chmod(fileMode) - if err != nil { - t.Fatalf("change file mode failed %q: %s", tmpDir+"/__test__", err) - } - return tmpDir + "/__test__", tmpDir + "/__test__" + fileName, err -} - -func removeTmpDir(t *testing.T, tmpDir string) { - if os.RemoveAll(tmpDir) != nil { - t.Logf("removeall %v", tmpDir) - } -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/file_test.go b/mind-cluster/component/ascend-common/common-utils/utils/file_test.go deleted file mode 100644 index 8f91417..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/file_test.go +++ /dev/null @@ -1,169 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package utils provides the util func -package utils - -import ( - "errors" - "fmt" - "os" - "path/filepath" - "reflect" - "testing" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" -) - -func TestReadLimitBytes(t *testing.T) { - convey.Convey("test ReadLimitBytes func", t, func() { - convey.Convey("should return nil given empty string", func() { - emptyString := "" - const limitLength = 10 - res, err := ReadLimitBytes(emptyString, limitLength) - convey.So(res, convey.ShouldBeNil) - convey.So(err, convey.ShouldBeError) - }) - - convey.Convey("should not return nil given valid path", func() { - const limitLength = 10 - res, err := ReadLimitBytes("../../go.mod", limitLength) - convey.So(res, convey.ShouldNotBeNil) - convey.So(err, convey.ShouldBeNil) - }) - - convey.Convey("should return nil given invalid limit length", func() { - const limitLength = -1 - res, err := ReadLimitBytes("../../go.mod", limitLength) - convey.So(res, convey.ShouldBeNil) - convey.So(err.Error(), convey.ShouldEqual, "the limit length is not valid") - }) - - convey.Convey("should return nil when check path failed", func() { - checkStub := gomonkey.ApplyFunc(CheckPath, func(path string) (string, error) { - return "", errors.New("check failed") - }) - defer checkStub.Reset() - const limitLength = 10 - res, err := ReadLimitBytes("../../go.mod", limitLength) - convey.So(res, convey.ShouldBeNil) - convey.So(err.Error(), convey.ShouldEqual, "check failed") - }) - - convey.Convey("should return nil when read file failed", func() { - var file *os.File - checkStub := gomonkey.ApplyMethod(reflect.TypeOf(file), "Read", - func(_ *os.File, _ []byte) (int, error) { - return 0, errors.New("read file failed") - }) - defer checkStub.Reset() - const limitLength = 10 - res, err := ReadLimitBytes("../../go.mod", limitLength) - convey.So(res, convey.ShouldBeNil) - convey.So(err.Error(), convey.ShouldEqual, "read file failed: read file failed") - }) - }) -} - -func 
TestLoadFile(t *testing.T) { - convey.Convey("test LoadFile func", t, func() { - convey.Convey("should return error given empty path", func() { - res, err := LoadFile("") - convey.So(res, convey.ShouldBeNil) - convey.So(err, convey.ShouldBeNil) - }) - - convey.Convey("should return nil given path not existing", func() { - res, err := LoadFile("xxxx") - convey.So(res, convey.ShouldBeNil) - convey.So(err, convey.ShouldBeNil) - }) - - convey.Convey("should not return nil given valid path", func() { - res, err := LoadFile("../../go.mod") - convey.So(res, convey.ShouldNotBeNil) - convey.So(err, convey.ShouldBeNil) - }) - - convey.Convey("should return nil given invalid path", func() { - absStub := gomonkey.ApplyFunc(filepath.Abs, func(path string) (string, error) { - return "", errors.New("the path is invalid") - }) - defer absStub.Reset() - res, err := LoadFile("../../go.mod") - convey.So(res, convey.ShouldBeNil) - convey.So(err.Error(), convey.ShouldEqual, "the filePath is invalid: the path is invalid") - }) - - convey.Convey("should return nil when read file failed", func() { - readStub := gomonkey.ApplyFunc(ReadLimitBytes, func(path string, limitLength int) ([]byte, error) { - return nil, errors.New("read file failed") - }) - defer readStub.Reset() - res, err := LoadFile("../../go.mod") - convey.So(res, convey.ShouldBeNil) - convey.So(err.Error(), convey.ShouldEqual, "read file failed") - }) - }) -} - -func TestCopyDir(t *testing.T) { - convey.Convey("test CopyDir func", t, func() { - convey.Convey("should return error given empty src path", func() { - err := CopyDir("", "") - convey.So(err, convey.ShouldNotBeNil) - }) - convey.Convey("should return error given file src path", func() { - err := CopyDir("../../go.mod", "") - convey.So(err, convey.ShouldNotBeNil) - }) - convey.Convey("should return nil given dir src path", func() { - err := CopyDir("../utils", "../utils_test") - convey.So(err, convey.ShouldBeNil) - }) - convey.Convey("should return error given file 
dst path", func() { - err := CopyDir("../utils", "../utils_test/file_test.go") - convey.So(err, convey.ShouldNotBeNil) - }) - }) -} - -func TestCopyFile(t *testing.T) { - convey.Convey("test CopyFile func", t, func() { - convey.Convey("should return error given empty src file path", func() { - err := CopyFile("", "../utils_test/file_test.go") - convey.So(err, convey.ShouldNotBeNil) - }) - convey.Convey("should return error given empty dst path", func() { - err := CopyFile("../utils_test/file_test.go", "") - convey.So(err, convey.ShouldNotBeNil) - }) - convey.Convey("should return error given dir scr path", func() { - err := CopyFile("../utils", "../utils_test/file_test.go") - convey.So(err, convey.ShouldNotBeNil) - }) - convey.Convey("should return error given dir dst path", func() { - err := CopyFile("../utils/file_test.go", "../utils_test") - convey.So(err, convey.ShouldNotBeNil) - }) - convey.Convey("should return nil given file scr and dst path", func() { - err := CopyFile("../utils/file_test.go", "../utils_test/file_test.go") - convey.So(err, convey.ShouldBeNil) - }) - }) - if err := os.RemoveAll("../utils_test"); err != nil { - fmt.Print("remove util_test file failed") - } -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/file_watcher.go b/mind-cluster/component/ascend-common/common-utils/utils/file_watcher.go deleted file mode 100644 index 78f4266..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/file_watcher.go +++ /dev/null @@ -1,85 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package utils offer utils for file watcher -package utils - -import ( - "fmt" - "os" - - "github.com/fsnotify/fsnotify" -) - -// FileWatcher struct file watcher -type FileWatcher struct { - watcher *fsnotify.Watcher -} - -// NewFileWatcher new FileWatcher -func NewFileWatcher() (*FileWatcher, error) { - watcher, err := fsnotify.NewWatcher() - if err != nil { - return nil, err - } - return &FileWatcher{watcher: watcher}, nil -} - -// WatchFile add file to watch -func (fw *FileWatcher) WatchFile(filePath string) error { - if _, err := os.Stat(filePath); err != nil { - return err - } - if _, err := PathStringChecker(filePath); err != nil { - return err - } - return fw.watcher.Add(filePath) -} - -// Events get event channel -func (fw *FileWatcher) Events() chan fsnotify.Event { - if fw == nil || fw.watcher == nil { - return nil - } - return fw.watcher.Events -} - -// Errors get error channel -func (fw *FileWatcher) Errors() chan error { - if fw == nil || fw.watcher == nil { - return nil - } - return fw.watcher.Errors -} - -// Close to close the file watcher -func (fw *FileWatcher) Close() error { - if fw == nil || fw.watcher == nil { - return nil - } - return fw.watcher.Close() -} - -// GetFileWatcherChan get eventCh and errCh for file watcher -func GetFileWatcherChan(filePath string) (*FileWatcher, error) { - watcher, err := NewFileWatcher() - if err != nil { - return nil, fmt.Errorf("new file watcher failed, error: %v", err) - } - if err = watcher.WatchFile(filePath); err != nil { - return nil, fmt.Errorf("watch file <%s> failed, error: %v", filePath, err) - 
} - fmt.Printf("watching file <%s>...\n", filePath) - return watcher, nil -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/file_watcher_test.go b/mind-cluster/component/ascend-common/common-utils/utils/file_watcher_test.go deleted file mode 100644 index 32220da..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/file_watcher_test.go +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package utils test for file watcher utils -package utils - -import ( - "errors" - "fmt" - "os" - "testing" - - "github.com/agiledragon/gomonkey/v2" - "github.com/fsnotify/fsnotify" - "github.com/smartystreets/goconvey/convey" -) - -var testErr = errors.New("test error") - -const ( - testFilePath = "./test.txt" - errFilePath = "./not_exist_file.txt" -) - -func TestGetFileWatcherChan(t *testing.T) { - prepareTestFile(t) - defer removeFile() - - p1 := gomonkey.ApplyFuncReturn(PathStringChecker, "", nil) - defer p1.Reset() - convey.Convey("test func GetFileWatcherChan success", t, func() { - _, err := GetFileWatcherChan(testFilePath) - convey.So(err, convey.ShouldBeNil) - }) - convey.Convey("test func GetFileWatcherChan failed, new watcher err", t, func() { - p2 := gomonkey.ApplyFuncReturn(fsnotify.NewWatcher, nil, testErr) - defer p2.Reset() - _, err := GetFileWatcherChan(testFilePath) - expErr := fmt.Errorf("new file watcher failed, error: %v", testErr) - convey.So(err, convey.ShouldResemble, expErr) - }) - convey.Convey("test func GetFileWatcherChan failed, file does not exist", t, func() { - _, err := GetFileWatcherChan(errFilePath) - expErr := fmt.Sprintf("watch file <%s> failed", errFilePath) - convey.So(err.Error(), convey.ShouldContainSubstring, expErr) - }) - convey.Convey("test func GetFileWatcherChan failed, watcher is nil", t, func() { - var watcher = &FileWatcher{} - eventCh := watcher.Events() - convey.So(eventCh, convey.ShouldBeNil) - errCh := watcher.Errors() - convey.So(errCh, convey.ShouldBeNil) - err := watcher.Close() - convey.So(err, convey.ShouldBeNil) - }) -} - -func prepareTestFile(t *testing.T) { - const mode644 = 0644 - err := os.WriteFile(testFilePath, []byte("file context"), mode644) - if err != nil { - t.Error(err) - } -} - -func removeFile() { - if err := os.Remove(testFilePath); err != nil && errors.Is(err, os.ErrNotExist) { - fmt.Printf("remove file %s failed, %v\n", testFilePath, err) - } -} diff --git 
a/mind-cluster/component/ascend-common/common-utils/utils/interface.go b/mind-cluster/component/ascend-common/common-utils/utils/interface.go deleted file mode 100644 index 7ccae4d..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/interface.go +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package utils offer the some utils for certificate handling -package utils - -import "reflect" - -// IsNil check whether the interface is nil, including type or data is nil -func IsNil(i interface{}) bool { - if i == nil { - return true - } - defer func() { - recover() - }() - return reflect.ValueOf(i).IsNil() -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/interface_test.go b/mind-cluster/component/ascend-common/common-utils/utils/interface_test.go deleted file mode 100644 index f2ce878..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/interface_test.go +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package utils offer the some utils for certificate handling -package utils - -import ( - "testing" - - "github.com/smartystreets/goconvey/convey" -) - -func TestIsNil(t *testing.T) { - var a interface{} // type = nil, data = nil - var b interface{} = (*int)(nil) // type is *int , data = nil - var c interface{} = "dd" - convey.Convey("test IsNil func, type and data is both nil", t, func() { - convey.So(a == nil, convey.ShouldEqual, true) - convey.So(b == nil, convey.ShouldEqual, false) - convey.So(c == nil, convey.ShouldEqual, false) - convey.So(IsNil(a), convey.ShouldEqual, true) - convey.So(IsNil(b), convey.ShouldEqual, true) - convey.So(IsNil(c), convey.ShouldEqual, false) - }) -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/ip_utils.go b/mind-cluster/component/ascend-common/common-utils/utils/ip_utils.go deleted file mode 100644 index f3ed96e..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/ip_utils.go +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package utils offer the some utils for certificate handling -package utils - -import ( - "errors" - "net" - "net/http" - "regexp" - "strings" -) - -const ( - domainReg = "^[a-zA-Z0-9][a-zA-Z0-9.-]{1,256}[a-zA-Z0-9]$" -) - -// ClientIP try to get the clientIP -func ClientIP(r *http.Request) string { - // get forward ip fistly - var ip string - xForwardedFor := r.Header.Get("X-Forwarded-For") - forwardSlice := strings.Split(xForwardedFor, ",") - if len(forwardSlice) >= 1 { - if ip = strings.TrimSpace(forwardSlice[0]); ip != "" { - return ip - } - } - // try get ip from "X-Real-Ip" - ip = strings.TrimSpace(r.Header.Get("X-Real-Ip")) - if ip != "" { - return ip - } - var err error - if ip, _, err = net.SplitHostPort(strings.TrimSpace(r.RemoteAddr)); err == nil { - return ip - } - return "" -} - -// CheckDomain check domain which by regex and blacklist -func CheckDomain(domain string, forLocalUsage bool) error { - matched, err := regexp.MatchString(domainReg, domain) - if err != nil { - return err - } - if !matched { - return errors.New("domain does not match allowed regex") - } - if !forLocalUsage { - return nil - } - if IsDigitString(domain) { - return errors.New("domain can not be all digits") - } - if strings.Contains(domain, "localhost") { - return errors.New("domain can not contain localhost") - } - return nil -} - -// IsHostValid check if the host is valid -func IsHostValid(host string) error { - parsedIp := net.ParseIP(host) - if parsedIp != nil { - return IsIPValid(parsedIp) - } - return CheckDomain(host, false) -} - -// IsIPValid check ip valid -func IsIPValid(parsedIp net.IP) error { - if parsedIp == nil { - return errors.New("parse ip is nil") - } - if parsedIp.To4() == nil && parsedIp.To16() == nil { - return errors.New("not a valid ipv4 or ipv6 ip") - } - if parsedIp.IsUnspecified() { - return errors.New("is all zeros ip") - } - if 
parsedIp.IsMulticast() { - return errors.New("is multicast ip") - } - return nil -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/ip_utils_test.go b/mind-cluster/component/ascend-common/common-utils/utils/ip_utils_test.go deleted file mode 100644 index 6ad93ab..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/ip_utils_test.go +++ /dev/null @@ -1,182 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package utils offer the some utils for certificate handling -package utils - -import ( - "net/http" - "testing" - - "github.com/smartystreets/goconvey/convey" -) - -const ( - localhost = "127.0.0.1" - localhostLoop = "0.0.0.0" -) - -func TestClientIP(t *testing.T) { - convey.Convey("test ClientIP func", t, func() { - convey.Convey("get IP from X-Forwarded-For", func() { - ip := ClientIP(mockRequest(map[string][]string{"X-Forwarded-For": {localhost, localhostLoop}})) - convey.So(ip, convey.ShouldEqual, localhost) - }) - convey.Convey("get IP from X-Real-Ip", func() { - ip := ClientIP(mockRequest(map[string][]string{"X-Forwarded-For": {}, - "X-Real-Ip": {localhost}})) - convey.So(ip, convey.ShouldEqual, localhost) - }) - convey.Convey("get IP from RemoteAddr", func() { - ip := ClientIP(mockRequest(map[string][]string{"X-Forwarded-For": {}, - "X-Real-Ip": {}})) - convey.So(ip, convey.ShouldEqual, localhost) - }) - convey.Convey("get IP from RemoteAddr failed", func() { - ip := ClientIP(&http.Request{RemoteAddr: localhost}) - convey.So(ip, convey.ShouldEqual, "") - }) - convey.Convey("get IP failed", func() { - ip := ClientIP(&http.Request{}) - convey.So(ip, convey.ShouldEqual, "") - }) - }) -} - -func mockRequest(header map[string][]string) *http.Request { - return &http.Request{ - Method: "GET", - URL: nil, - Proto: "HTTP", - ProtoMajor: 0, - ProtoMinor: 0, - Header: header, - ContentLength: 0, - Close: false, - Host: "www.test.com", - RemoteAddr: "127.0.0.1:8080", - } -} - -func TestCheckDomain(t *testing.T) { - convey.Convey("CheckDomain function test suite", t, func() { - testDomainFormatValidation() - testLocalUsageConstraints() - testParameterCombinations() - }) -} - -// Test domain format validation -func testDomainFormatValidation() { - convey.Convey("Validate domain format rules", func() { - convey.Convey("Valid domain should pass validation", func() { - err := CheckDomain("example.com", false) - convey.So(err, convey.ShouldBeNil) - }) - - 
convey.Convey("Domain with special characters should be rejected", func() { - err := CheckDomain("example@com", false) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldContainSubstring, "domain does not match allowed regex") - }) - - convey.Convey("Domain starting with hyphen should be rejected", func() { - err := CheckDomain("-example.com", false) - convey.So(err, convey.ShouldNotBeNil) - }) - }) -} - -// Test local usage constraints -func testLocalUsageConstraints() { - convey.Convey("Validate constraints for local usage (forLocalUsage=true)", func() { - convey.Convey("All-digit domain should be rejected", func() { - err := CheckDomain("123456", true) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldContainSubstring, "domain can not be all digits") - }) - - convey.Convey("Domain containing 'localhost' should be rejected", func() { - err := CheckDomain("my-localhost.com", true) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldContainSubstring, "domain can not contain localhost") - }) - - convey.Convey("Valid local domain should pass validation", func() { - err := CheckDomain("local-app.example", true) - convey.So(err, convey.ShouldBeNil) - }) - }) -} - -// Test parameter combinations -func testParameterCombinations() { - convey.Convey("Validate parameter combinations", func() { - convey.Convey("All-digit restriction ignored when forLocalUsage=false", func() { - err := CheckDomain("123456", false) - convey.So(err, convey.ShouldBeNil) - }) - - convey.Convey("DNS check skipped when forLocalUsage=false", func() { - err := CheckDomain("unresolvable.test", false) - convey.So(err, convey.ShouldBeNil) - }) - }) -} - -func TestIsHostValid(t *testing.T) { - tests := []struct { - name string - ip string - wantErr bool - errMsg string - }{ - { - name: "invalid IP format but domain", ip: "not.an.ip", - wantErr: false, - }, - { - name: "valid IPv4", ip: "192.168.1.1", wantErr: false, - }, - { 
- name: "valid IPv6", ip: "2001:0db8:85a3:0000:0000:8a2e:0370:7334", - wantErr: false, - }, - { - name: "unspecified IPv4", ip: "0.0.0.0", - wantErr: true, errMsg: "is all zeros ip", - }, - { - name: "unspecified IPv6", ip: "::", - wantErr: true, errMsg: "is all zeros ip", - }, - { - name: "IPv6 multicast", ip: "ff02::1", - wantErr: true, errMsg: "is multicast ip", - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - err := IsHostValid(tt.ip) - if (err != nil) != tt.wantErr { - t.Errorf("IsIPValid() error = %v, wantErr %v", err, tt.wantErr) - return - } - if err != nil && err.Error() != tt.errMsg { - t.Errorf("IsIPValid() error = %v, wantErrMsg %v", - err.Error(), tt.errMsg) - } - }) - } -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/path.go b/mind-cluster/component/ascend-common/common-utils/utils/path.go deleted file mode 100644 index b3150b9..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/path.go +++ /dev/null @@ -1,382 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package utils provides the util func -package utils - -import ( - "bufio" - "errors" - "fmt" - "io" - "io/fs" - "log" - "os" - "os/exec" - "path" - "path/filepath" - "strings" - "syscall" -) - -const ( - dirMode = 0700 - - rootUID = 0 - maxPathDepth = 20 - maxPathLength = 1024 - // DefaultWriteFileMode default file mode for write permission check - DefaultWriteFileMode = 0022 - - ldSplitLen = 2 - ldLibNameIndex = 0 - ldLibPathIndex = 1 - ldCommand = "/sbin/ldconfig" - ldParam = "--print-cache" - // LdLibPath LD_LIBRARY_PATH - LdLibPath = "LD_LIBRARY_PATH" - grepCommand = "/bin/grep" -) - -// IsDir check whether the path is a directory. -func IsDir(path string) bool { - if path == "" { - return false - } - - if !IsExist(path) { - return path[len(path)-1:] == "/" - } - s, err := os.Stat(path) - if err != nil { - return false - } - return s.IsDir() -} - -// IsFile check whether the path is a file -func IsFile(path string) bool { - if path == "" { - return false - } - return !IsDir(path) -} - -// IsSoftlink check whether the path is softlink -func IsSoftlink(path string) (bool, error) { - file, err := os.Open(path) - if err != nil { - return false, err - } - defer file.Close() - fileInfo, err := file.Stat() - if err != nil { - return false, err - } - if (fileInfo.Mode() & fs.ModeSymlink) != 0 { - return true, nil - } - return false, nil -} - -// IsExist check whether the path exists, If the file is a symbolic link, the returned the final FileInfo -func IsExist(filePath string) bool { - _, err := os.Stat(filePath) - if err == nil { - return true - } - if os.IsExist(err) { - return true - } - return false -} - -// IsLexist check whether the path exists, If the file is a symbolic link, the returned FileInfo -// describes the symbolic link -func IsLexist(filePath string) bool { - _, err := os.Lstat(filePath) - if err == nil { - return true - } - if os.IsExist(err) { - return true - } - return false -} - -// CheckPath validate given path and return resolved 
absolute path -func CheckPath(path string) (string, error) { - if path == "" { - return path, nil - } - origin := path - for !IsLexist(path) { - path = filepath.Dir(path) - if path == "." { - return "", os.ErrNotExist - } - } - absPath, err := filepath.Abs(path) - if err != nil { - return "", fmt.Errorf("get the absolute path failed: %v", err) - } - resoledPath, err := filepath.EvalSymlinks(absPath) - if err != nil { - if strings.Contains(err.Error(), "no such file or directory") { - return "", os.ErrNotExist - } - return "", fmt.Errorf("get the symlinks path failed: %v", err) - } - if absPath != resoledPath { - return "", errors.New("can't support symlinks") - } - // get the original full path - absOrigin, err := filepath.Abs(origin) - if err != nil { - return "", fmt.Errorf("get the absolute path failed: %v", err) - } - return absOrigin, nil -} - -// MakeSureDir create directory. The last element of path should end with slash, or it will be omitted. -func MakeSureDir(path string) error { - dir := filepath.Dir(path) - if IsExist(dir) { - return nil - } - - if err := os.MkdirAll(dir, dirMode); err != nil { - return fmt.Errorf("create directory failed: %v", err) - } - - return nil -} - -// CheckMode check input file mode whether includes invalid mode. -// For example, if read operation of group and other is forbidden, then call CheckMode(inputFileMode, 0044). -// All operations are forbidden for group and other, then call CheckMode(inputFileMode, 0077). 
-// Write operation is forbidden for group and other by default, with calling CheckMode(inputFileMode) -func CheckMode(mode os.FileMode, optional ...os.FileMode) bool { - var targetMode os.FileMode - if len(optional) > 0 { - targetMode = optional[0] - } else { - targetMode = DefaultWriteFileMode - } - checkMode := uint32(mode) & uint32(targetMode) - return checkMode == 0 -} - -// CheckOwnerAndPermission check path owner and permission -func CheckOwnerAndPermission(verifyPath string, mode os.FileMode, uid uint32) (string, error) { - if verifyPath == "" { - return verifyPath, errors.New("empty path") - } - absPath, err := filepath.Abs(verifyPath) - if err != nil { - return "", fmt.Errorf("abs failed %v", err) - } - resoledPath, err := filepath.EvalSymlinks(absPath) - if err != nil { - return "", fmt.Errorf("evalSymlinks failed %v", err) - } - // if symlinks - if absPath != resoledPath { - // check symlinks its self owner - pathInfo, err := os.Lstat(absPath) - if err != nil { - return "", fmt.Errorf("lstat failed, %v", err) - } - stat, ok := pathInfo.Sys().(*syscall.Stat_t) - if !ok || stat.Uid != uid { - return "", errors.New("symlinks owner may not root") - } - } - pathInfo, err := os.Stat(resoledPath) - if err != nil { - return "", fmt.Errorf("stat failed %v", err) - } - stat, ok := pathInfo.Sys().(*syscall.Stat_t) - if !ok || stat.Uid != uid || !CheckMode(pathInfo.Mode(), mode) { - return "", errors.New("check uid or mode failed") - } - return resoledPath, nil -} - -// DoCheckOwnerAndPermission check path owner and permission -func DoCheckOwnerAndPermission(path string, mode os.FileMode, uid uint32) error { - if !IsExist(path) { - return nil - } - pathInfo, err := os.Stat(path) - if err != nil { - return fmt.Errorf("stat failed %v", err) - } - stat, ok := pathInfo.Sys().(*syscall.Stat_t) - if !ok || stat.Uid != uid || !CheckMode(pathInfo.Mode(), mode) { - return fmt.Errorf("check uid or mode failed : %v", path) - } - return nil -} - -func checkAbsPath(libPath 
string) (string, error) { - absLibPath, err := CheckOwnerAndPermission(libPath, DefaultWriteFileMode, rootUID) - if err != nil { - return "", fmt.Errorf("%s: %v", libPath, err) - } - count := 0 - fPath := absLibPath - for { - if count >= maxPathDepth { - break - } - count++ - if fPath == "/" { - return absLibPath, nil - } - fPath = filepath.Dir(fPath) - if _, err := CheckOwnerAndPermission(fPath, DefaultWriteFileMode, rootUID); err != nil { - return "", fmt.Errorf("%s: %v", fPath, err) - } - } - return "", errors.New("absolute path check failed") -} - -func checkLibsPath(libraryPaths []string) (string, error) { - errs := make([]string, 0, len(libraryPaths)) - for _, libraryAbsName := range libraryPaths { - absLibPath, err := checkAbsPath(libraryAbsName) - if err == nil { - return absLibPath, nil - } - errs = append(errs, fmt.Sprintf("%s;", err.Error())) - } - return "", fmt.Errorf("lib path is invalid, %v", errs) -} - -func getLibFromEnv(libraryName string) (string, error) { - ldLibraryPath := os.Getenv(LdLibPath) - if len(ldLibraryPath) > maxPathLength { - return "", fmt.Errorf("invalid library path env") - } - libraryPaths := strings.Split(ldLibraryPath, ":") - targetLibs := make([]string, 0, len(ldLibraryPath)) - for _, libraryPath := range libraryPaths { - libraryAbsName := path.Join(libraryPath, libraryName) - if len(libraryAbsName) > maxPathLength || !IsLexist(libraryAbsName) { - continue - } - targetLibs = append(targetLibs, libraryAbsName) - } - if len(libraryPaths) == 0 { - return "", errors.New("file path no exist or too long") - } - return checkLibsPath(targetLibs) -} - -func trimSpaceTable(data string) string { - data = strings.Replace(data, " ", "", -1) - data = strings.Replace(data, "\t", "", -1) - data = strings.Replace(data, "\n", "", -1) - return data -} - -func parserLibPath(line, libraryName string) string { - ldInfo := strings.Split(line, "=>") - if len(ldInfo) < ldSplitLen { - return "" - } - libNames := strings.Split(ldInfo[ldLibNameIndex], " 
") - for index, libName := range libNames { - if index >= maxPathDepth { - break - } - if len(libName) == 0 { - continue - } - if name := trimSpaceTable(libName); name != libraryName { - continue - } - return trimSpaceTable(ldInfo[ldLibPathIndex]) - } - return "" -} - -func parseLibFromLdCmd(libraryName string) (string, error) { - ldCmd := exec.Command(ldCommand, ldParam) - grepCmd := exec.Command(grepCommand, libraryName) - ldCmdStdout, err := ldCmd.StdoutPipe() - if err != nil { - return "", fmt.Errorf("command exec failed: %v", err) - } - grepCmd.Stdin = ldCmdStdout - stdout, err := grepCmd.StdoutPipe() - if err != nil { - return "", fmt.Errorf("get pipe failed: %v", err) - } - if err = grepCmd.Start(); err != nil { - return "", fmt.Errorf("command exec failed: %v", err) - } - if err = ldCmd.Run(); err != nil { - return "", fmt.Errorf("command exec failed: %v", err) - } - defer func() { - if err = grepCmd.Wait(); err != nil { - log.Printf("command exec failed, %v", err) - } - }() - reader := bufio.NewReader(stdout) - count := 0 - line := "" - for { - if count >= maxPathLength { - err = errors.New("too many items in command stdout") - break - } - count++ - line, err = reader.ReadString('\n') - if err != nil || io.EOF == err { - break - } - if libPath := parserLibPath(line, libraryName); libPath != "" { - return libPath, nil - } - } - return "", fmt.Errorf("can't find valid lib: %v", err) -} - -func getLibFromLdCmd(libraryName string) (string, error) { - libraryAbsName, err := parseLibFromLdCmd(libraryName) - if err != nil { - return "", err - } - var absLibPath string - if absLibPath, err = checkAbsPath(libraryAbsName); err == nil { - return absLibPath, nil - } - return "", fmt.Errorf("driver lib is not exist or it's permission is invalid, %v", err) -} - -// GetDriverLibPath get driver lib path from ld config -func GetDriverLibPath(libraryName string) (string, error) { - var libPath string - var envErr, cmdErr error - if libPath, envErr = 
getLibFromEnv(libraryName); envErr == nil { - return libPath, nil - } - if libPath, cmdErr = getLibFromLdCmd(libraryName); cmdErr == nil { - return libPath, nil - } - return "", fmt.Errorf("cannot found valid driver lib, fromEnv: %v, fromLdCmd: %v", envErr, cmdErr) -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/path_test.go b/mind-cluster/component/ascend-common/common-utils/utils/path_test.go deleted file mode 100644 index 4e2346f..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/path_test.go +++ /dev/null @@ -1,232 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package utils provides the util func -package utils - -import ( - "errors" - "fmt" - "os" - "path/filepath" - "syscall" - "testing" - "time" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" -) - -func TestIsDir(t *testing.T) { - convey.Convey("test logger", t, func() { - convey.Convey("test IsDir func", func() { - res := IsDir("/tmp/") - convey.So(res, convey.ShouldBeTrue) - res = IsDir("/utils/") - convey.So(res, convey.ShouldBeTrue) - res = IsDir("") - convey.So(res, convey.ShouldBeFalse) - }) - }) -} - -func TestIsFile(t *testing.T) { - convey.Convey("test IsFile func", t, func() { - res := IsFile("/tmp/") - convey.So(res, convey.ShouldBeFalse) - res = IsFile("") - convey.So(res, convey.ShouldBeFalse) - }) -} - -func TestIsExist(t *testing.T) { - convey.Convey("test IsExist func", t, func() { - res := IsExist("/xxxx/") - convey.So(res, convey.ShouldBeFalse) - }) -} - -func TestIsLexist(t *testing.T) { - convey.Convey("test IsLexist func", t, func() { - res := IsLexist("/xxxx/") - convey.So(res, convey.ShouldBeFalse) - }) -} - -func TestCheckPath(t *testing.T) { - convey.Convey("test CheckPath func", t, func() { - convey.Convey("should return itself given empty string", func() { - res, err := CheckPath("") - convey.So(res, convey.ShouldBeEmpty) - convey.So(err, convey.ShouldBeNil) - }) - - convey.Convey("should return error given not exist path", func() { - res, err := CheckPath("xxxxxxx") - convey.So(res, convey.ShouldBeEmpty) - convey.So(err.Error(), convey.ShouldEqual, "file does not exist") - }) - - convey.Convey("should return resolve path given normal path", func() { - res, err := CheckPath("../../go.mod") - convey.So(res, convey.ShouldNotBeEmpty) - convey.So(err, convey.ShouldBeNil) - }) - - convey.Convey("should return err when get abs path failed", func() { - absStub := gomonkey.ApplyFunc(filepath.Abs, func(path string) (string, error) { - return "", errors.New("abs failed") - }) - defer absStub.Reset() - 
res, err := CheckPath("../../go.mod") - convey.So(res, convey.ShouldBeEmpty) - convey.So(err.Error(), convey.ShouldEqual, "get the absolute path failed: abs failed") - }) - - convey.Convey("should return err when get eval symbol link failed", func() { - symStub := gomonkey.ApplyFunc(filepath.EvalSymlinks, func(path string) (string, error) { - return "", errors.New("symlinks path failed") - }) - defer symStub.Reset() - res, err := CheckPath("../../go.mod") - convey.So(res, convey.ShouldBeEmpty) - convey.So(err.Error(), convey.ShouldEqual, "get the symlinks path failed: symlinks path failed") - }) - - convey.Convey("should return err given symbol link", func() { - symStub := gomonkey.ApplyFunc(filepath.EvalSymlinks, func(path string) (string, error) { - return "xxx", nil - }) - defer symStub.Reset() - res, err := CheckPath("../../go.mod") - convey.So(res, convey.ShouldBeEmpty) - convey.So(err.Error(), convey.ShouldEqual, "can't support symlinks") - }) - - }) -} - -func TestMakeSureDir(t *testing.T) { - convey.Convey("test MakeSureDir func", t, func() { - convey.Convey("normal situation, no err returned", func() { - err := MakeSureDir("./testdata/tmp/test") - convey.So(err, convey.ShouldEqual, nil) - }) - convey.Convey("abnormal situation,err returned", func() { - mock := gomonkey.ApplyFunc(os.MkdirAll, func(name string, perm os.FileMode) error { - return fmt.Errorf("error") - }) - defer mock.Reset() - err := MakeSureDir("./xxxx/xxx") - convey.So(err.Error(), convey.ShouldEqual, "create directory failed: error") - }) - }) -} - -func TestGetDriverLibPath(t *testing.T) { - convey.Convey("test GetDriverLibPath func", t, func() { - convey.Convey("should return itself given empty string", func() { - err := os.Setenv(LdLibPath, "") - convey.So(err, convey.ShouldBeNil) - res, err := GetDriverLibPath("") - convey.So(res, convey.ShouldBeEmpty) - convey.So(err, convey.ShouldBeError) - }) - - convey.Convey("should return path when getLibFromEnv succeed", func() { - envStub := 
gomonkey.ApplyFunc(getLibFromEnv, func(libraryName string) (string, error) { - return "/test", nil - }) - defer envStub.Reset() - res, err := GetDriverLibPath("") - convey.So(res, convey.ShouldEqual, "/test") - convey.So(err, convey.ShouldBeNil) - }) - - convey.Convey("should return path when getLibFromEnv failed but getLibFromLdCmd succeed", func() { - envStub := gomonkey.ApplyFunc(getLibFromEnv, func(libraryName string) (string, error) { - return "", errors.New("failed") - }) - defer envStub.Reset() - cmdStub := gomonkey.ApplyFunc(getLibFromLdCmd, func(libraryName string) (string, error) { - return "/test", nil - }) - defer cmdStub.Reset() - res, err := GetDriverLibPath("") - convey.So(res, convey.ShouldEqual, "/test") - convey.So(err, convey.ShouldBeNil) - }) - - }) -} - -type mockFileInfo struct { - mode os.FileMode - sys interface{} -} - -func (m *mockFileInfo) Name() string { return "mock" } -func (m *mockFileInfo) Size() int64 { return 0 } -func (m *mockFileInfo) Mode() os.FileMode { return m.mode } -func (m *mockFileInfo) ModTime() time.Time { return time.Now() } -func (m *mockFileInfo) IsDir() bool { return false } -func (m *mockFileInfo) Sys() interface{} { return m.sys } - -func TestDoCheckOwnerAndPermission(t *testing.T) { - var testPath = "/test" - var testMode os.FileMode = 0660 - var excludePermissions os.FileMode = 0002 - patch := gomonkey.NewPatches() - defer patch.Reset() - convey.Convey("should return nil when path is not exist", t, func() { - patch.ApplyFuncReturn(IsExist, false) - defer patch.Reset() - err := DoCheckOwnerAndPermission(testPath, excludePermissions, rootUID) - convey.So(err, convey.ShouldBeNil) - }) - - patch.ApplyFuncReturn(IsExist, true) - convey.Convey("should return err when stat failed", t, func() { - patch.ApplyFuncReturn(os.Stat, nil, os.ErrNotExist) - defer patch.Reset() - err := DoCheckOwnerAndPermission(testPath, excludePermissions, rootUID) - convey.So(err.Error(), convey.ShouldContainSubstring, "stat failed") - }) - - 
convey.Convey("should return err when get uid failed", t, func() { - patch.ApplyFuncReturn(os.Stat, &mockFileInfo{mode: testMode, sys: "invalid-type"}, nil) - defer patch.Reset() - - err := DoCheckOwnerAndPermission(testPath, excludePermissions, rootUID) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldContainSubstring, "check uid or mode failed") - }) - - convey.Convey("should return err when permission check failure", t, func() { - patch.ApplyFuncReturn(os.Stat, &mockFileInfo{mode: testMode, sys: &syscall.Stat_t{Uid: rootUID}}, nil) - patch.ApplyFuncReturn(CheckMode, false) - defer patch.Reset() - err := DoCheckOwnerAndPermission(testPath, excludePermissions, rootUID) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldContainSubstring, "check uid or mode failed") - }) - - convey.Convey("should return nil where all checks pass", t, func() { - patch.ApplyFuncReturn(os.Stat, &mockFileInfo{mode: testMode, sys: &syscall.Stat_t{Uid: rootUID}}, nil) - patch.ApplyFuncReturn(CheckMode, true) - defer patch.Reset() - err := DoCheckOwnerAndPermission(testPath, excludePermissions, rootUID) - convey.So(err, convey.ShouldBeNil) - }) -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/pwd_util.go b/mind-cluster/component/ascend-common/common-utils/utils/pwd_util.go deleted file mode 100644 index 49c2f36..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/pwd_util.go +++ /dev/null @@ -1,75 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package utils this file for password handler -package utils - -import ( - "bytes" - "errors" - "regexp" -) - -const ( - lowercaseCharactersRegex = `[a-z]{1,}` - uppercaseCharactersRegex = `[A-Z]{1,}` - baseNumberRegex = `[0-9]{1,}` - specialCharactersRegex = `[!\"#$%&'()*+,\-. /:;<=>?@[\\\]^_\x60{|}~]{1,}` - passWordRegex = `^[a-zA-Z0-9!\"#$%&'()*+,\-. /:;<=>?@[\\\]^_\x60{|}~]{8,64}$` - minComplexCount = 2 -) - -// CheckPassWordComplexity check password complexity -func CheckPassWordComplexity(s []byte) error { - complexCheckRegexArr := []string{ - lowercaseCharactersRegex, - uppercaseCharactersRegex, - baseNumberRegex, - specialCharactersRegex, - } - complexCount := 0 - for _, pattern := range complexCheckRegexArr { - if matched, err := regexp.Match(pattern, s); matched && err == nil { - complexCount++ - } - } - if complexCount < minComplexCount { - return errors.New("password complex not meet the requirement") - } - return nil -} - -// ValidatePassWord validate password -func ValidatePassWord(userName string, passWord []byte) error { - if err := commonCheckForPassWord(userName, passWord); err != nil { - return err - } - return CheckPassWordComplexity(passWord) -} - -func commonCheckForPassWord(userName string, passWord []byte) error { - if matched, err := regexp.Match(passWordRegex, passWord); err != nil || !matched { - return errors.New("password not meet requirement") - } - var userNameByte []byte = []byte(userName) - if bytes.Equal(userNameByte, passWord) { - return errors.New("password cannot equals username") - } - var reverseUserName = 
ReverseString(userName) - var reverseUserNameByte []byte = []byte(reverseUserName) - if bytes.Equal(reverseUserNameByte, passWord) { - return errors.New("password cannot equal reversed username") - } - return nil -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/pwd_util_test.go b/mind-cluster/component/ascend-common/common-utils/utils/pwd_util_test.go deleted file mode 100644 index 808c231..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/pwd_util_test.go +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package utils this file for password handler -package utils - -import ( - "testing" - - "github.com/smartystreets/goconvey/convey" -) - -var ( - truePasswd = []byte("aA0!\"#$%&'()*+,-. 
/:;<=>?@[\\]^_`{|}~") - falsePasswd1 = []byte("userName") - falsePasswd2 = []byte("12345678") - falsePasswd3 = []byte("1234567") - falsePasswd4 = []byte("emaNresu.") - falsePasswd5 = []byte("不支持特殊字符测试test") -) - -// TestCommonCheckForPassWord test common check for passWord -func TestCommonCheckForPassWord(t *testing.T) { - convey.Convey("correct password", t, func() { - err := ValidatePassWord("userName", truePasswd) - convey.So(err, convey.ShouldBeNil) - }) - convey.Convey("username == password", t, func() { - err := ValidatePassWord("userName", falsePasswd1) - convey.So(err.Error(), convey.ShouldEqual, "password cannot equals username") - }) - convey.Convey("complex not meet the requirement", t, func() { - err := ValidatePassWord("userName", falsePasswd2) - convey.So(err.Error(), convey.ShouldEqual, "password complex not meet the requirement") - }) - convey.Convey("password too short", t, func() { - err := ValidatePassWord("userName", falsePasswd3) - convey.So(err.Error(), convey.ShouldEqual, "password not meet requirement") - }) - convey.Convey("username equal reverse password", t, func() { - err := ValidatePassWord(".userName", falsePasswd4) - convey.So(err.Error(), convey.ShouldEqual, "password cannot equal reversed username") - }) - convey.Convey("test special ", t, func() { - err := ValidatePassWord("userName", falsePasswd5) - convey.So(err.Error(), convey.ShouldEqual, "password not meet requirement") - }) -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/slice.go b/mind-cluster/component/ascend-common/common-utils/utils/slice.go deleted file mode 100644 index f673bc1..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/slice.go +++ /dev/null @@ -1,129 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package utils this file for slice utils -package utils - -import ( - "fmt" - "slices" - "strconv" -) - -// hex hexadecimal -const hex = 16 - -type stringTool struct{} - -// StringTool slice for string tool -var StringTool stringTool - -// HexStringToInt hex string slice to int64 slice -func (s stringTool) HexStringToInt(sources []string) map[int64]struct{} { - intMap := make(map[int64]struct{}, len(sources)) - for _, source := range sources { - num, err := strconv.ParseInt(source, hex, 0) - if err != nil { - fmt.Printf("parse hex to int failed, skip it. error: %v\n", err) - continue - } - intMap[num] = struct{}{} - } - return intMap -} - -// Contains check whether slice contains target -func Contains[T comparable](sources []T, target T) bool { - for _, v := range sources { - if v == target { - return true - } - } - return false -} - -// Remove delete the first matching element in the slice -func Remove[T comparable](slice []T, target T) []T { - for i, v := range slice { - if v == target { - return append(slice[:i], slice[i+1:]...) 
- } - } - return slice -} - -// RemoveDuplicates remove duplicates from slice -func RemoveDuplicates[T comparable](slice []T) []T { - existMap := make(map[T]struct{}) - result := make([]T, 0) - for _, str := range slice { - if _, ok := existMap[str]; !ok { - existMap[str] = struct{}{} - result = append(result, str) - } - } - return result -} - -// SameElementInMap whether map contains target -func SameElementInMap[T comparable](sources map[T]struct{}, targets []T) bool { - for _, target := range targets { - if _, ok := sources[target]; ok { - return true - } - } - return false -} - -// RemoveEleSli remove element in sources which is in target -func RemoveEleSli[T comparable](source, target []T) []T { - sliMap := make(map[T]struct{}) - for _, item := range target { - sliMap[item] = struct{}{} - } - - result := make([]T, 0) - for _, ele := range source { - if _, ok := sliMap[ele]; !ok { - result = append(result, ele) - } - } - return result -} - -// RemoveElementsNotInSecond remove elements not in slice2 -func RemoveElementsNotInSecond[T comparable](slice1, slice2 []T) []T { - sliMap := make(map[T]struct{}) - for _, item := range slice2 { - sliMap[item] = struct{}{} - } - - result := make([]T, 0) - for _, item := range slice1 { - if _, ok := sliMap[item]; ok { - result = append(result, item) - } - } - return result -} - -// CheckSliceSupport check elements is supported in expects -func CheckSliceSupport(elements []int64, expects []int64) error { - for _, e := range elements { - if !slices.Contains(expects, e) { - return fmt.Errorf("element %v does not contain %v", e, expects) - } - } - return nil -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/slice_test.go b/mind-cluster/component/ascend-common/common-utils/utils/slice_test.go deleted file mode 100644 index b3bf161..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/slice_test.go +++ /dev/null @@ -1,536 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. 
All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package utils this file for slice utils -package utils - -import ( - "fmt" - "reflect" - "testing" - - "github.com/smartystreets/goconvey/convey" -) - -const ( - decimal1A = 26 - decimalFF = 255 - decimalNeg10 = 16 - decimalNegFF = -255 -) - -func buildHexStringToIntTestCase() []struct { - name string - input []string - expected map[int64]struct{} -} { - return []struct { - name string - input []string - expected map[int64]struct{} - }{ - { - name: "01 - Valid hex strings", - input: []string{"1A", "FF", "10"}, - expected: map[int64]struct{}{ - decimal1A: {}, - decimalFF: {}, - decimalNeg10: {}, - }, - }, - { - name: "02 - Invalid hex strings", - input: []string{"xyz", "ghijk"}, - expected: map[int64]struct{}{}, - }, - { - name: "03 - Empty input array", - input: []string{}, - expected: map[int64]struct{}{}, - }, - { - name: "04 - Duplicate values should be deduplicated", - input: []string{"0x1A", "1A", "0x1a"}, // All represent 26 in decimal - expected: map[int64]struct{}{ - decimal1A: {}, - }, - }, - { - name: "05 - Mixed valid and invalid inputs", - input: []string{"0x1A", "xyz", "0xFF", "invalid", "0x10"}, - expected: map[int64]struct{}{}, - }, - { - name: "06 - Negative hex numbers", - input: []string{"-0x1A", "-FF"}, - expected: map[int64]struct{}{ - decimalNegFF: {}, - }, - }, - } -} - -func TestHexStringToInt(t *testing.T) { - for _, tt := range buildHexStringToIntTestCase() { - t.Run(tt.name, 
func(t *testing.T) { - result := StringTool.HexStringToInt(tt.input) - for i := range tt.expected { - fmt.Println(i) - } - if len(result) != len(tt.expected) { - t.Errorf("Expected map length %d, but got %d", len(tt.expected), len(result)) - return - } - for key := range tt.expected { - if _, exists := result[key]; !exists { - t.Errorf("Expected key %d not found in result", key) - } - } - for key := range result { - if _, exists := tt.expected[key]; !exists { - t.Errorf("Unexpected key %d found in result", key) - } - } - }) - } -} - -func TestSameElementInMap(t *testing.T) { - for _, tt := range buildSameElementInMapTestCase() { - t.Run(tt.name, func(t *testing.T) { - result := SameElementInMap(tt.sources, tt.targets) - if result != tt.expected { - t.Errorf("SameElementInMap() = %v, expected %v", result, tt.expected) - } - }) - } -} - -func buildSameElementInMapTestCase() []struct { - name string - sources map[int]struct{} - targets []int - expected bool -} { - return []struct { - name string - sources map[int]struct{} - targets []int - expected bool - }{ - { - name: "01 There are identical elements present", - sources: map[int]struct{}{1: {}, 2: {}, 3: {}}, - targets: []int{4, 5, 2}, - expected: true, - }, - { - name: "02 There are no identical elements present\n", - sources: map[int]struct{}{1: {}, 2: {}, 3: {}}, - targets: []int{4, 5, 6}, - expected: false, - }, - { - name: "03 target is nil", - sources: map[int]struct{}{1: {}, 2: {}}, - targets: []int{}, - expected: false, - }, - { - name: "04 source is nil", - sources: map[int]struct{}{}, - targets: []int{1, 2, 3}, - expected: false, - }, - { - name: "05 source and target are both nil", - sources: map[int]struct{}{}, - targets: []int{}, - expected: false, - }, - } -} - -func TestSameElementInMap_StringType(t *testing.T) { - sources := map[string]struct{}{ - "apple": {}, - "banana": {}, - "orange": {}, - } - targets := []string{"grape", "apple", "kiwi"} - result := SameElementInMap(sources, targets) - if 
!result { - t.Errorf("SameElementInMap() with string type should return true, got false") - } - targetsNoMatch := []string{"grape", "kiwi", "mango"} - resultNoMatch := SameElementInMap(sources, targetsNoMatch) - if resultNoMatch { - t.Errorf("SameElementInMap() with string type should return false, got true") - } -} - -func TestContains(t *testing.T) { - for _, tt := range buildContainsTestCase() { - t.Run(tt.name, func(t *testing.T) { - switch s1 := tt.source.(type) { - case []int: - s2 := tt.target.(int) - result := Contains(s1, s2) - if !reflect.DeepEqual(result, tt.expected) { - t.Errorf("Contains() = %v, want %v", result, tt.expected) - } - case []string: - s2 := tt.target.(string) - result := Contains(s1, s2) - if !reflect.DeepEqual(result, tt.expected) { - t.Errorf("Contains() = %v, want %v", result, tt.expected) - } - default: - t.Errorf("unsupported type") - } - }) - } -} - -func buildContainsTestCase() []struct { - name string - source interface{} - target interface{} - expected bool -} { - return []struct { - name string - source interface{} - target interface{} - expected bool - }{ - { - name: "01 contains for int type", - source: []int{1, 2, 3, 4}, - target: 1, - expected: true, - }, - { - name: "02 not contains for int type", - source: []int{1, 2, 3, 4}, - target: 0, - expected: false, - }, - { - name: "03 contains for string type", - source: []string{"1", "2", "3", "4"}, - target: "1", - expected: true, - }, - { - name: "04 not contains for string type", - source: []string{"1", "2", "3", "4"}, - target: "0", - expected: false, - }, - { - name: "05 empty source slice", - source: []int{}, - target: 1, - expected: false, - }, - } -} - -func TestRemove(t *testing.T) { - for _, tt := range buildRemoveTestCase() { - t.Run(tt.name, func(t *testing.T) { - switch s1 := tt.source.(type) { - case []int: - s2 := tt.target.(int) - result := Remove(s1, s2) - expected := tt.expected.([]int) - if !reflect.DeepEqual(result, expected) { - t.Errorf("Contains() = %v, 
want %v", result, expected) - } - case []string: - s2 := tt.target.(string) - result := Remove(s1, s2) - expected := tt.expected.([]string) - if !reflect.DeepEqual(result, expected) { - t.Errorf("RemoveElementsNotInSecond() = %v, want %v", result, expected) - } - default: - t.Errorf("unsupported type") - } - }) - } -} - -func buildRemoveTestCase() []struct { - name string - source interface{} - target interface{} - expected interface{} -} { - return []struct { - name string - source interface{} - target interface{} - expected interface{} - }{ - { - name: "01 contains for int type", - source: []int{1, 2, 3, 4}, - target: 1, - expected: []int{2, 3, 4}, - }, - { - name: "02 not contains for int type", - source: []int{1, 2, 3, 4}, - target: 0, - expected: []int{1, 2, 3, 4}, - }, - { - name: "03 contains for string type", - source: []string{"1", "2", "3", "4"}, - target: "1", - expected: []string{"2", "3", "4"}, - }, - { - name: "04 not contains for string type", - source: []string{"1", "2", "3", "4"}, - target: "0", - expected: []string{"1", "2", "3", "4"}, - }, - { - name: "05 empty source slice", - source: []int{}, - target: 1, - expected: []int{}, - }, - } -} - -func buildRemoveElementsNotInSecondTestCase() []struct { - name string - slice1 interface{} - slice2 interface{} - expected interface{} -} { - return []struct { - name string - slice1 interface{} - slice2 interface{} - expected interface{} - }{ - { - name: "01 Basic functionality - integer slices with partial overlap", - slice1: []int{1, 2, 3, 4}, - slice2: []int{2, 4, 6, 8}, - expected: []int{2, 4}, - }, - { - name: "02 Empty first slice", - slice1: []int{}, - slice2: []int{1, 2, 3}, - expected: []int{}, - }, - { - name: "03 Empty second slice", - slice1: []int{1, 2, 3}, - slice2: []int{}, - expected: []int{}, - }, - { - name: "04 Both slices empty", - slice1: []int{}, - slice2: []int{}, - expected: []int{}, - }, - { - name: "05 No intersection between slices", - slice1: []int{1, 2, 3}, - slice2: []int{4, 
5, 6}, - expected: []int{}, - }, - { - name: "06 String type test", - slice1: []string{"1", "2", "3"}, - slice2: []string{"2", "3", "4"}, - expected: []string{"2", "3"}, - }, - } -} - -func TestRemoveElementsNotInSecond(t *testing.T) { - for _, tt := range buildRemoveElementsNotInSecondTestCase() { - t.Run(tt.name, func(t *testing.T) { - switch s1 := tt.slice1.(type) { - case []int: - s2 := tt.slice2.([]int) - expected := tt.expected.([]int) - result := RemoveElementsNotInSecond(s1, s2) - if !reflect.DeepEqual(result, expected) { - t.Errorf("RemoveElementsNotInSecond() = %v, want %v", result, expected) - } - case []string: - s2 := tt.slice2.([]string) - expected := tt.expected.([]string) - result := RemoveElementsNotInSecond(s1, s2) - if !reflect.DeepEqual(result, expected) { - t.Errorf("RemoveElementsNotInSecond() = %v, want %v", result, expected) - } - default: - t.Errorf("unsupported type") - } - }) - } -} - -func buildRemoveEleSliTestCase() []struct { - name string - source interface{} - target interface{} - expected interface{} -} { - return []struct { - name string - source interface{} - target interface{} - expected interface{} - }{ - { - name: "01 int type", - source: []int{1, 2, 3, 4, 5}, - target: []int{2, 4}, - expected: []int{1, 3, 5}, - }, - { - name: "02 source is empty for int type", - source: []int{}, - target: []int{1, 2}, - expected: []int{}, - }, - { - name: "03 target is empty for int type", - source: []int{1, 2, 3}, - target: []int{}, - expected: []int{1, 2, 3}, - }, - { - name: "04 source and target are both empty for int type", - source: []int{}, - target: []int{}, - expected: []int{}, - }, - { - name: "05 string type", - source: []string{"a", "b", "c", "d"}, - target: []string{"b", "d"}, - expected: []string{"a", "c"}, - }, - } -} - -func TestRemoveEleSli(t *testing.T) { - for _, tt := range buildRemoveEleSliTestCase() { - t.Run(tt.name, func(t *testing.T) { - switch s1 := tt.source.(type) { - case []int: - s2 := tt.target.([]int) - expected 
:= tt.expected.([]int) - result := RemoveEleSli(s1, s2) - if !reflect.DeepEqual(result, expected) { - t.Errorf("RemoveEleSli() = %v, want %v", result, expected) - } - case []string: - s2 := tt.target.([]string) - expected := tt.expected.([]string) - result := RemoveEleSli(s1, s2) - if !reflect.DeepEqual(result, expected) { - t.Errorf("RemoveEleSli() = %v, want %v", result, expected) - } - default: - t.Errorf("unsupported type") - } - }) - } -} - -func buildRemoveDuplicatesCase() []struct { - name string - input interface{} - expected interface{} -} { - return []struct { - name string - input interface{} - expected interface{} - }{ - { - name: "01 empty slice for int type", - input: []int{}, - expected: []int{}, - }, - { - name: "02 no duplicates for int type", - input: []int{1, 2, 3}, - expected: []int{1, 2, 3}, - }, - { - name: "03 with duplicates for int type", - input: []int{1, 2, 2, 3, 1, 4}, - expected: []int{1, 2, 3, 4}, - }, - { - name: "04 with duplicates for string type", - input: []string{"1", "3", "3", "4"}, - expected: []string{"1", "3", "4"}, - }, - } -} - -func TestRemoveDuplicates(t *testing.T) { - for _, tt := range buildRemoveDuplicatesCase() { - t.Run(tt.name, func(t *testing.T) { - switch s1 := tt.input.(type) { - case []int: - expected := tt.expected.([]int) - result := RemoveDuplicates(s1) - if !reflect.DeepEqual(result, expected) { - t.Errorf("RemoveDuplicates() = %v, want %v", result, expected) - } - case []string: - expected := tt.expected.([]string) - result := RemoveDuplicates(s1) - if !reflect.DeepEqual(result, expected) { - t.Errorf("RemoveDuplicates() = %v, want %v", result, expected) - } - default: - t.Errorf("unsupported type") - } - }) - } -} - -func TestCheckSliceSupport(t *testing.T) { - convey.Convey("test TestCheckSliceSupport, check ok", t, func() { - elements := []int64{1, 2} - expects := []int64{1, 2, 3} - err := CheckSliceSupport(elements, expects) - convey.So(err, convey.ShouldBeNil) - }) - convey.Convey("test 
TestCheckSliceSupport, check fail", t, func() { - elements := []int64{1, 2, 4} - expects := []int64{1, 2, 3} - err := CheckSliceSupport(elements, expects) - convey.So(err, convey.ShouldNotBeNil) - }) -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/strings.go b/mind-cluster/component/ascend-common/common-utils/utils/strings.go deleted file mode 100644 index c3d98aa..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/strings.go +++ /dev/null @@ -1,75 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package utils provides the util func -package utils - -import ( - "crypto/sha256" - "fmt" - "unicode" -) - -const ( - maskLen = 2 -) - -// ReplacePrefix replace string with prefix -func ReplacePrefix(source, prefix string) string { - if prefix == "" { - prefix = "****" - } - if len(source) <= maskLen { - return prefix - } - end := string([]rune(source)[maskLen:len(source)]) - return prefix + end -} - -// MaskPrefix mask string prefix with **** -func MaskPrefix(source string) string { - return ReplacePrefix(source, "") -} - -// GetSha256Code return the sha256 hash bytes -func GetSha256Code(data []byte) []byte { - hash256 := sha256.New() - if _, err := hash256.Write(data); err != nil { - fmt.Println(err) - return nil - } - return hash256.Sum(nil) -} - -// ReverseString reverse string -func ReverseString(s string) string { - runes := []rune(s) - for start, end := 0, len(runes)-1; start < end; start, end = start+1, end-1 { - runes[start], runes[end] = runes[end], runes[start] - } - return string(runes) -} - -// IsDigitString return string is all digit -func IsDigitString(s string) bool { - if len(s) == 0 { - return false - } - for _, c := range s { - if !unicode.IsDigit(c) { - return false - } - } - return true -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/strings_test.go b/mind-cluster/component/ascend-common/common-utils/utils/strings_test.go deleted file mode 100644 index 390e424..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/strings_test.go +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package utils provides the util func -package utils - -import ( - "testing" - - "github.com/smartystreets/goconvey/convey" -) - -const byteLength = 32 - -func TestReplacePrefix(t *testing.T) { - convey.Convey("relative path", t, func() { - path := ReplacePrefix("./testdata/cert/ca.crt", "****") - convey.So(path, convey.ShouldEqual, "****testdata/cert/ca.crt") - }) - convey.Convey("abconvey.Solute path", t, func() { - path := ReplacePrefix("/testdata/cert/ca.crt", "****") - convey.So(path, convey.ShouldEqual, "****estdata/cert/ca.crt") - }) - convey.Convey("path length less than 2", t, func() { - path := ReplacePrefix("/", "****") - convey.So(path, convey.ShouldEqual, "****") - }) - convey.Convey("empty string", t, func() { - path := ReplacePrefix("", "****") - convey.So(path, convey.ShouldEqual, "****") - }) - -} - -func TestMaskPrefix(t *testing.T) { - convey.Convey("relative path", t, func() { - path := MaskPrefix("./testdata/cert/ca.crt") - convey.So(path, convey.ShouldEqual, "****testdata/cert/ca.crt") - }) - convey.Convey("abconvey.Solute path", t, func() { - path := MaskPrefix("/testdata/cert/ca.crt") - convey.So(path, convey.ShouldEqual, "****estdata/cert/ca.crt") - }) - convey.Convey("path length less than 2", t, func() { - path := MaskPrefix("/") - convey.So(path, convey.ShouldEqual, "****") - }) - convey.Convey("empty string", t, func() { - path := MaskPrefix("") - convey.So(path, convey.ShouldEqual, "****") - }) - -} - -func TestGetSha256Code(t *testing.T) { - convey.Convey("test sha256", t, func() { - hashs := GetSha256Code([]byte("this is a 
test sentence")) - convey.So(len(hashs), convey.ShouldEqual, byteLength) - }) -} - -func TestIsDigitString(t *testing.T) { - convey.Convey("test IsDigitString", t, func() { - convey.Convey("case IsDigitString is true", func() { - str := "123" - convey.ShouldBeTrue(IsDigitString(str)) - }) - convey.Convey("case IsDigitString is false", func() { - str := "123a" - convey.ShouldBeFalse(IsDigitString(str)) - }) - }) -} diff --git a/mind-cluster/component/ascend-common/devmanager/a310mgr.go b/mind-cluster/component/ascend-common/devmanager/a310mgr.go deleted file mode 100644 index 081f167..0000000 --- a/mind-cluster/component/ascend-common/devmanager/a310mgr.go +++ /dev/null @@ -1,25 +0,0 @@ -/* Copyright(C) 2021. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package devmanager this Ascend310 device manager -package devmanager - -import ( - "ascend-common/devmanager/dcmi" -) - -// A310Manager Ascend310 device manager -type A310Manager struct { - dcmi.DcManager -} diff --git a/mind-cluster/component/ascend-common/devmanager/a310pmgr.go b/mind-cluster/component/ascend-common/devmanager/a310pmgr.go deleted file mode 100644 index b32d1fa..0000000 --- a/mind-cluster/component/ascend-common/devmanager/a310pmgr.go +++ /dev/null @@ -1,35 +0,0 @@ -/* Copyright(C) 2021. Huawei Technologies Co.,Ltd. All rights reserved. 
- Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package devmanager this Ascend310P device manager -package devmanager - -import ( - "ascend-common/devmanager/dcmi" -) - -// A310PManager Ascend310P device manager -type A310PManager struct { - dcmi.DcManager -} - -// DcGetDevicePowerInfo query power by mcu interface for 310P -func (d *A310PManager) DcGetDevicePowerInfo(cardID, deviceID int32) (float32, error) { - return d.DcGetMcuPowerInfo(cardID) -} - -// DcGetMcuPowerInfo this function is only for Ascend310P -func (d *A310PManager) DcGetMcuPowerInfo(cardID int32) (float32, error) { - return dcmi.FuncDcmiMcuGetPowerInfo(cardID) -} diff --git a/mind-cluster/component/ascend-common/devmanager/a910mgr.go b/mind-cluster/component/ascend-common/devmanager/a910mgr.go deleted file mode 100644 index 1bb2beb..0000000 --- a/mind-cluster/component/ascend-common/devmanager/a910mgr.go +++ /dev/null @@ -1,31 +0,0 @@ -/* Copyright(C) 2021. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package devmanager this Ascend910 device manager -package devmanager - -import ( - "ascend-common/devmanager/common" - "ascend-common/devmanager/dcmi" -) - -// A910Manager Ascend910 device manager -type A910Manager struct { - dcmi.DcManager -} - -// DcGetHbmInfo get HBM information, only for Ascend910 -func (d *A910Manager) DcGetHbmInfo(cardID, deviceID int32) (*common.HbmInfo, error) { - return dcmi.FuncDcmiGetDeviceHbmInfo(cardID, deviceID) -} diff --git a/mind-cluster/component/ascend-common/devmanager/common/constants.go b/mind-cluster/component/ascend-common/devmanager/common/constants.go deleted file mode 100644 index e39ddac..0000000 --- a/mind-cluster/component/ascend-common/devmanager/common/constants.go +++ /dev/null @@ -1,272 +0,0 @@ -/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package common define common variable -package common - -import ( - "math" - - "k8s.io/apimachinery/pkg/util/sets" -) - -// DeviceType define device type -type DeviceType struct { - // Code device type code - Code int32 - // Name device type name - Name string -} - -var ( - // ProfilingTime for getting PCIe bandwidth - ProfilingTime int - - // HccsBWProfilingTime for getting hccs bandwidth - HccsBWProfilingTime int - - // a3BoardIds for A3 Board IDs - a3BoardIds = sets.NewInt32(A900A3SuperPodBin1BoardId, A900A3SuperPodBin2BoardId, - A900A3SuperPodBin3BoardId, A800IA3BoardId) - - // a900A3SuperPodMainBoardIds for A900 A3 Super Pod Main Board IDs - a900A3SuperPodMainBoardIds = sets.NewInt32(A900A3SuperPodMainBoardId1, A900A3SuperPodMainBoardId2) - - // a9000A3SuperPodMainBoardIds for A9000 A3 Super Pod Main Board IDs - a9000A3SuperPodMainBoardIds = sets.NewInt32(A9000A3SuperPodMainBoardId1, A9000A3SuperPodMainBoardId2) -) - -// DeviceType for utilization -var ( - // AICore Ascend310 & Ascend910 - AICore = DeviceType{Code: 2, Name: "AICore"} - // HbmUtilization utilization rate of hbm - HbmUtilization = DeviceType{Code: 6, Name: "Hbm"} - // VectorCore Ascend310P - VectorCore = DeviceType{Code: 12, Name: "VectorCore"} - // Overall Overall utilization rate of NPU - Overall = DeviceType{Code: 13, Name: "Overall"} -) - -// DeviceType for frequency -var ( - // AICoreCurrentFreq Ascend310 & Ascend910 & Ascend910B & Ascend310P - AICoreCurrentFreq = DeviceType{Code: 7, Name: "AICore Current"} -) - -const ( - // Success for interface return code - Success = 0 - // DeviceNotReadyErrCodeStr for dcmi interface device not ready err code string - DeviceNotReadyErrCodeStr = "-8012" - // DeviceNotReadyErrCode for dcmi interface device not ready err code - DeviceNotReadyErrCode = -8012 - // CardDropFaultCode card drop fault code - CardDropFaultCode = 0x40F84E00 - // RetError return error when the function failed - RetError = -1 - // Percent constant of 100 - Percent = 100 - 
// MaxErrorCodeCount number of error codes - MaxErrorCodeCount = 128 - // UnRetError return unsigned int error - UnRetError = math.MaxUint32 - // Abnormal status of Abnormal - Abnormal = "Abnormal" - // ChannelStateOk means out band channel is ok for resetting - ChannelStateOk = 1 - - // HiAIMaxCardID max card id for Ascend chip - HiAIMaxCardID = math.MaxInt32 - - // HiAIMaxCardNum max card number - HiAIMaxCardNum = 64 - - // HiAIMaxDeviceNum max device number - HiAIMaxDeviceNum = 4 - - // NpuType present npu chip - NpuType = 0 - - // ReduceOnePercent for calculation reduce one percent - ReduceOnePercent = 0.01 - // ReduceTenth for calculation reduce one tenth - ReduceTenth = 0.1 - // DefaultTemperatureWhenQueryFailed when get temperature failed, use this value - DefaultTemperatureWhenQueryFailed = -275 - - // Ascend310P ascend 310P chip - Ascend310P = "Ascend310P" - // Ascend910 ascend 910 chip - Ascend910 = "Ascend910" - // Ascend910B ascend 910B chip - Ascend910B = "Ascend910B" - // Ascend910A3 ascend Ascend910A3 chip - Ascend910A3 = "Ascend910A3" - // Atlas200ISoc 200 soc env - Atlas200ISoc = "Atlas 200I SoC A1" - - // DcmiApiTimeout dcmi interface timeout seconds - DcmiApiTimeout = 1 - - // SubscribeAllDevice subscribe all device ID - SubscribeAllDevice = -1 - // MinVDevID min value of virtual device id - MinVDevID = 100 - // MaxVDevID max value of virtual device id - MaxVDevID = 1124 - - // InvalidID invalid ID - InvalidID = 0xffffffff - - // FailedMetricValue for failed metric value - FailedMetricValue = -1 - - // FailedValue for failed value - FailedValue = 0xffffffff - - // MaxErrorCodeLen max length of error code for Prometheus - MaxErrorCodeLen = 10 -) - -const ( - // BootStartFinish chip hot reset finish - BootStartFinish = 16 -) - -const ( - // FaultRecover device fault recover - FaultRecover = int8(0) - // FaultOccur device fault occur - FaultOccur = int8(1) - // FaultOnce once device fault - FaultOnce = int8(2) -) - -const ( - // AMPMode for AMP chip 
work mode - AMPMode = "AMP" - // SMPMode for SMP chip work mode - SMPMode = "SMP" - - // NetworkInit init status - NetworkInit = 6 - // NetworkSuccess chip network is healthy - NetworkSuccess = 0 - - // MaxProcNum process number in device side - MaxProcNum = 32 - // UnitMB MB - UnitMB float64 = 1024 * 1024 - - // Chip910 chip name 910 - Chip910 = "910" - - // A300IA2BoardId board id of A300I A2 and 910proB - A300IA2BoardId = 0x28 - - // A300IA2GB64BoardId board id of A300I A2 64GB - A300IA2GB64BoardId = 0x29 - - // A900A3SuperPodBin1BoardId board id of A900/A9000 A3 SuperPod Bin1 - A900A3SuperPodBin1BoardId = 0xb0 - - // A900A3SuperPodBin2BoardId board id of A900/A9000 A3 SuperPod Bin2 - A900A3SuperPodBin2BoardId = 0xb1 - - // A900A3SuperPodBin3BoardId board id of A900/A9000 A3 SuperPod Bin3 - A900A3SuperPodBin3BoardId = 0xb2 - - // A800IA3BoardId board id of A800I A3 - A800IA3BoardId = 0xb3 - - // A900A3SuperPodMainBoardId1 board id of A900 A3 SuperPod MainBoard1 - A900A3SuperPodMainBoardId1 = 0x18 - - // A900A3SuperPodMainBoardId2 board id of A900 A3 SuperPod MainBoard2 - A900A3SuperPodMainBoardId2 = 0x19 - - // A800IA3MainBoardId A800I A3 MainBoardId - A800IA3MainBoardId = 0x14 - - // A9000A3SuperPodMainBoardId1 board id of A9000 A3 SuperPod MainBoard1 - A9000A3SuperPodMainBoardId1 = 0x1C - - // A9000A3SuperPodMainBoardId2 board id of A9000 A3 SuperPod MainBoard2 - A9000A3SuperPodMainBoardId2 = 0x1D -) - -// log limit domains for metrics -const ( - // DomainForLogicIdErr domain for faild to get cardId and deviceId by logicID - DomainForLogicIdErr = "logicID" -) - -// DcmiDeviceType used to represent the dcmi device type -type DcmiDeviceType int32 - -const ( - // DcmiDeviceTypeDDR represents the component type DCMI_DEVICE_TYPE_DDR - DcmiDeviceTypeDDR DcmiDeviceType = 0 - // DcmiDeviceTypeSRAM represents the component type DCMI_DEVICE_TYPE_SRAM - DcmiDeviceTypeSRAM DcmiDeviceType = 1 - // DcmiDeviceTypeHBM represents the component type DCMI_DEVICE_TYPE_HBM - 
DcmiDeviceTypeHBM DcmiDeviceType = 2 - // DcmiDeviceTypeNPU represents the component type DCMI_DEVICE_TYPE_NPU - DcmiDeviceTypeNPU DcmiDeviceType = 3 - // DcmiDeviceTypeNONE represents the component type DCMI_DEVICE_TYPE_NONE - DcmiDeviceTypeNONE DcmiDeviceType = 0xff -) - -const ( - // ErrMsgInitCardListFailed is used where initialization of the card list fails - ErrMsgInitCardListFailed = "get card list failed for init" - // ErrMsgGetBoardInfoFailed is used where there is a failure in getting board info - ErrMsgGetBoardInfoFailed = "get board info failed, no card found" -) - -const ( - // MaxHccspingMeshAddr is the max number of hccsping addresses - MaxHccspingMeshAddr = 1024 - // MinPktSize is the min packet size - MinPktSize = 1792 - // MaxPktSize is the max packet size - MaxPktSize = 3000 - // MinPktSendNum is the min packet send number - MinPktSendNum = 1 - // MaxPktSendNum is the max packet send number - MaxPktSendNum = 1000 - // MinPktInterval is the min packet interval - MinPktInterval = 1 - // MaxPktInterval is the max packet interval - MaxPktInterval = 1000 - // MinTaskInterval is the min task interval - MinTaskInterval = 1 - // MaxTaskInterval is the max task interval - MaxTaskInterval = 60 - // InternalPingMeshTaskID is the inner ping mesh task id - InternalPingMeshTaskID uint = 0 - // ExternalPingMeshTaskID is the outer ping mesh task id - ExternalPingMeshTaskID uint = 1 - // DefaultPingMeshPortID is the default ping mesh port - DefaultPingMeshPortID = 0 - // DefaultPktSize is the default packet size - DefaultPktSize = 1792 - // DefaultPktSendNum is the default packet send number - DefaultPktSendNum = 10 - // DefaultPktInterval is the default packet interval - DefaultPktInterval = 10 - // DefaultTimeout is the default timeout - DefaultTimeout = 1 -) diff --git a/mind-cluster/component/ascend-common/devmanager/common/types.go b/mind-cluster/component/ascend-common/devmanager/common/types.go deleted file mode 100644 index 870c716..0000000 --- 
a/mind-cluster/component/ascend-common/devmanager/common/types.go +++ /dev/null @@ -1,435 +0,0 @@ -/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package common define common types -package common - -// MemoryInfo memory information struct -type MemoryInfo struct { - MemorySize uint64 `json:"memory_size"` - MemoryAvailable uint64 `json:"memory_available"` - Frequency uint32 `json:"memory_frequency"` - Utilization uint32 `json:"memory_utilization"` -} - -// HbmInfo high bandwidth memory info -type HbmInfo struct { - MemorySize uint64 `json:"memory_size"` // total size,MB - Frequency uint32 `json:"hbm_frequency"` // frequency MHz - Usage uint64 `json:"memory_usage"` // memory usage,MB - Temp int32 `json:"hbm_temperature"` // temperature - BandWidthUtilRate uint32 `json:"hbm_bandwidth_util"` // bandwidth utilization -} - -// HbmAggregateInfo more comprehensive high bandwidth memory information with ecc information -type HbmAggregateInfo struct { - *HbmInfo - ECCInfo *ECCInfo `json:"hbm_ecc_info"` // ECC information -} - -// ChipInfo chip info -type ChipInfo struct { - Type string `json:"chip_type"` - Name string `json:"chip_name"` - Version string `json:"chip_version"` - NpuName string `json:"npu_name"` - AICoreCnt int `json:"aicore_cnt"` -} - -// ChipBaseInfo all id of chip -type ChipBaseInfo struct { - PhysicID int32 - LogicID int32 - CardID int32 - DeviceID int32 -} - -// CgoCreateVDevOut create 
virtual device output info -type CgoCreateVDevOut struct { - VDevID uint32 - PcieBus uint32 - PcieDevice uint32 - PcieFunc uint32 - VfgID uint32 - Reserved []uint8 -} - -// CgoCreateVDevRes create virtual device input info -type CgoCreateVDevRes struct { - VDevID uint32 - VfgID uint32 - TemplateName string - Reserved []uint8 -} - -// CgoBaseResource base resource info -type CgoBaseResource struct { - Token uint64 - TokenMax uint64 - TaskTimeout uint64 - VfgID uint32 - VipMode uint8 - Reserved []uint8 -} - -// CgoComputingResource compute resource info -type CgoComputingResource struct { - // accelator resource - Aic float32 - Aiv float32 - Dsa uint16 - Rtsq uint16 - Acsq uint16 - Cdqm uint16 - CCore uint16 - Ffts uint16 - Sdma uint16 - PcieDma uint16 - - // memory resource, MB as unit - MemorySize uint64 - - // id resource - EventID uint32 - NotifyID uint32 - StreamID uint32 - ModelID uint32 - - // cpu resource - TopicScheduleAicpu uint16 - HostCtrlCPU uint16 - HostAicpu uint16 - DeviceAicpu uint16 - TopicCtrlCPUSlot uint16 - - Reserved []uint8 -} - -// CgoMediaResource media resource info -type CgoMediaResource struct { - Jpegd float32 - Jpege float32 - Vpc float32 - Vdec float32 - Pngd float32 - Venc float32 - Reserved []uint8 -} - -// CgoVDevQueryInfo virtual resource special info -type CgoVDevQueryInfo struct { - Name string - Status uint32 - IsContainerUsed uint32 - Vfid uint32 - VfgID uint32 - ContainerID uint64 - Base CgoBaseResource - Computing CgoComputingResource - Media CgoMediaResource -} - -// CgoVDevQueryStru virtual resource info -type CgoVDevQueryStru struct { - VDevID uint32 - QueryInfo CgoVDevQueryInfo -} - -// CgoSocFreeResource soc free resource info -type CgoSocFreeResource struct { - VfgNum uint32 - VfgBitmap uint32 - Base CgoBaseResource - Computing CgoComputingResource - Media CgoMediaResource -} - -// CgoSocTotalResource soc total resource info -type CgoSocTotalResource struct { - VDevNum uint32 - VDevID []uint32 - VfgNum uint32 - VfgBitmap 
uint32 - Base CgoBaseResource - Computing CgoComputingResource - Media CgoMediaResource -} - -// CgoSuperPodInfo super pod info -type CgoSuperPodInfo struct { - SdId uint32 - ScaleType uint32 - SuperPodId uint32 - ServerId uint32 - Reserve []uint32 -} - -// VirtualDevInfo virtual device infos -type VirtualDevInfo struct { - TotalResource CgoSocTotalResource - FreeResource CgoSocFreeResource - VDevInfo []CgoVDevQueryStru - VDevActivityInfo []VDevActivityInfo -} - -// DevFaultInfo device's fault info -type DevFaultInfo struct { - EventID int64 - LogicID int32 - ModuleType int8 // ModuleType prototype is dcmi node_type - ModuleID int8 // ModuleID prototype is dcmi node_id - SubModuleType int8 // SubModuleType prototype is dcmi sub_node_type - SubModuleID int8 // SubModuleID prototype is dcmi sub_node_id - Severity int8 - Assertion int8 - AlarmRaisedTime int64 -} - -// DevProcessInfo device process info -type DevProcessInfo struct { - DevProcArray []DevProcInfo - ProcNum int32 -} - -// DevProcInfo process info in device side -type DevProcInfo struct { - Pid int32 - // the total amount of memory occupied by the device side OS and allocated by the business, unit is MB - MemUsage float64 -} - -// BoardInfo board info of device -type BoardInfo struct { - BoardId uint32 - PcbId uint32 - BomId uint32 - SlotId uint32 -} - -// VDevActivityInfo vNPU activity info for 310P -type VDevActivityInfo struct { - VDevID uint32 - VDevAiCoreRate uint32 - VDevTotalMem uint64 - VDevUsedMem uint64 - VDevAiCore float64 - IsVirtualDev bool -} - -// PCIEBwStat contains pcie bandwidth -type PCIEBwStat struct { - PcieRxPBw PcieStatValue - PcieRxNPBw PcieStatValue - PcieRxCPLBw PcieStatValue - PcieTxPBw PcieStatValue - PcieTxNPBw PcieStatValue - PcieTxCPLBw PcieStatValue -} - -// PcieStatValue pcie stat three value, like [min_bw,max_bw,avg_bw] -type PcieStatValue struct { - PcieMinBw int32 - PcieMaxBw int32 - PcieAvgBw int32 -} - -// DeviceNetworkHealth dcmi_get_device_network_health api return 
value -type DeviceNetworkHealth struct { - HealthCode uint32 - RetCode int32 -} - -// ECCInfo dcmi_get_device_ecc_info api return value -type ECCInfo struct { - EnableFlag int32 - SingleBitErrorCnt int64 - DoubleBitErrorCnt int64 - TotalSingleBitErrorCnt int64 - TotalDoubleBitErrorCnt int64 - SingleBitIsolatedPagesCnt int64 - DoubleBitIsolatedPagesCnt int64 -} - -// NpuNetInfo network info of npu -type NpuNetInfo struct { - // The optical info - OpticalInfo *OpticalInfo - // The transfer rate of network port - LinkSpeedInfo *LinkSpeedInfo - // Historical link statistics of network ports - LinkStatInfo *LinkStatInfo - // Statistics about packets - StatInfo *StatInfo - // Network port real-time bandwidth - BandwidthInfo *BandwidthInfo - // LinkStatusInfo refers to the link state - LinkStatusInfo *LinkStatusInfo -} - -// BandwidthInfo contains network port real-time bandwidth -type BandwidthInfo struct { - // TxValue transform speed - TxValue float64 `json:"tx_value"` - // RxValue receive speed - RxValue float64 `json:"rx_value"` -} - -// HccsStatisticInfo contains hccs statistic info -type HccsStatisticInfo struct { - TxCnt []uint64 - RxCnt []uint64 - CrcErrCnt []uint64 - retryCnt []uint64 - reservedFieldCnt []uint64 -} - -// HccsBandwidthInfo contains hccs bandwidth info -type HccsBandwidthInfo struct { - ProfilingTime uint32 - TotalTxbw float64 - TotalRxbw float64 - TxBandwidth []float64 - RxBandwidth []float64 -} - -// SioCrcErrStatisticInfo contains sio crc error statistic info -type SioCrcErrStatisticInfo struct { - TxErrCnt int64 - RxErrCnt int64 - Reserved []uint32 -} - -// StatInfo the statistics about packets -type StatInfo struct { - // Total number of pause frames received by the MAC - MacRxPauseNum float64 - // Total number of pause frames sent by MAC - MacTxPauseNum float64 - // Total number of PFC frames received by MAC - MacRxPfcPktNum float64 - // Total number of PFC frames sent by MAC - MacTxPfcPktNum float64 - // Total number of bad packets received 
by MAC - MacRxBadPktNum float64 - // Total number of bad packets sent by MAC - MacTxBadPktNum float64 - // The total number of packets received by the RoCE network card - RoceRxAllPktNum float64 - // The total number of packets sent by the RoCE network card - RoceTxAllPktNum float64 - // The number of bad packets received by the RoCE network card - RoceRxErrPktNum float64 - // The number of bad packets sent by the RoCE network card - RoceTxErrPktNum float64 - // The number of CNP type packets received by the RoCE network card - RoceRxCnpPktNum float64 - // The number of CNP type packets sent by the RoCE network card - RoceTxCnpPktNum float64 - // Number of RoCE network card retry messages - RoceNewPktRtyNum float64 - // Total number of bytes of bad packets sent by MAC - MacTxBadOctNum float64 - // Total number of bytes of bad packets received by MAC - MacRxBadOctNum float64 - // The number of unexpected ACK messages received by the RoCE network card - RoceUnexpectedAckNum float64 - // The number of out-of-order packets received by the RoCE network card - RoceOutOfOrderNum float64 - // The number of packets with domain segment verification errors received by the RoCE network card - RoceVerificationErrNum float64 - // The number of messages generated by abnormal QP connection status received by the RoCE network card - RoceQpStatusErrNum float64 - // The number of ecn - RoceEcnDBNum float64 - // The number of err info - MacRXFcsErrPktNum float64 -} - -// LinkStatInfo refers to the historical link statistics, including the times of link-up -type LinkStatInfo struct { - // The times of link-up - LinkUPNum float64 -} - -// LinkStatusInfo refers to the link state -type LinkStatusInfo struct { - // The state of link - LinkState string -} - -// LinkSpeedInfo the transfer rate of network port -type LinkSpeedInfo struct { - // The rate of network port - Speed float64 -} - -// OpticalInfo indicates the optical module information -type OpticalInfo struct { - // Optical module 
status, indicating whether it is in place (present) - OpticalState float64 - // Power sent by No.0 optical module - OpticalTxPower0 float64 - // Power sent by No.1 optical module - OpticalTxPower1 float64 - // Power sent by No.2 optical module - OpticalTxPower2 float64 - // Power sent by No.3 optical module - OpticalTxPower3 float64 - // Reception power of No.0 optical module - OpticalRxPower0 float64 - // Reception power of No.1 optical module - OpticalRxPower1 float64 - // Reception power of No.2 optical module - OpticalRxPower2 float64 - // Reception power of No.3 optical module - OpticalRxPower3 float64 - // Optical module voltage - OpticalVcc float64 - // Optical module temperature - OpticalTemp float64 -} - -// HccspingMeshOperate refers to the operation of hccsping mesh -type HccspingMeshOperate struct { - DstAddr string - PktSize int - PktSendNum int - PktInterval int - Timeout int - TaskInterval int - TaskId int -} - -// HccspingMeshInfo refers to the result of hccsping mesh -type HccspingMeshInfo struct { - DstAddr []string - SucPktNum []uint - FailPktNum []uint - MaxTime []int - MinTime []int - AvgTime []int - TP95Time []int - ReplyStatNum []int - PingTotalNum []int - DestNum int -} - -// ElabelInfo elabel information structure -type ElabelInfo struct { - ProductName string - Model string - Manufacturer string - ManufacturerDate string - SerialNumber string -} diff --git a/mind-cluster/component/ascend-common/devmanager/common/utils.go b/mind-cluster/component/ascend-common/devmanager/common/utils.go deleted file mode 100644 index 87e14df..0000000 --- a/mind-cluster/component/ascend-common/devmanager/common/utils.go +++ /dev/null @@ -1,305 +0,0 @@ -/* Copyright(C) 2021. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package common this for util method -package common - -import ( - "fmt" - "math" - "regexp" - "strings" - - "ascend-common/api" - "ascend-common/common-utils/hwlog" -) - -var ( - reg910A = regexp.MustCompile(api.Ascend910APattern) - reg910B = regexp.MustCompile(api.Ascend910BPattern) - reg310P = regexp.MustCompile(api.Ascend310PPattern) -) - -// IsGreaterThanOrEqualInt32 check num range -func IsGreaterThanOrEqualInt32(num int64) bool { - if num >= int64(math.MaxInt32) { - return true - } - - return false -} - -// IsValidUtilizationRate valid utilization rate is 0-100 -func IsValidUtilizationRate(num uint32) bool { - if num > uint32(Percent) || num < 0 { - return false - } - - return true -} - -// IsValidChipInfo valid chip info is or not empty -func IsValidChipInfo(chip *ChipInfo) bool { - return chip.Name != "" || chip.Type != "" || chip.Version != "" -} - -// IsValidBoardInfo check whether the board info is valid -func IsValidBoardInfo(board *BoardInfo) bool { - return board.BoardId != InvalidID || board.PcbId != InvalidID || - board.BomId != InvalidID || board.SlotId != InvalidID -} - -// IsValidMainBoardInfo check whether the mainBoardId is valid -func IsValidMainBoardInfo(mainBoardId uint32) bool { - return mainBoardId != InvalidID -} - -// IsValidCardID valid card id -func IsValidCardID(cardID int32) bool { - // for cardID, please watch the maximum value of the driver is changed in the future version - return cardID >= 0 && cardID < HiAIMaxCardID -} - -// IsValidDeviceID valid device id -func IsValidDeviceID(deviceID int32) bool { - return deviceID 
>= 0 && deviceID < HiAIMaxDeviceNum -} - -// IsValidLogicIDOrPhyID valid logic id -func IsValidLogicIDOrPhyID(id int32) bool { - return id >= 0 && id < HiAIMaxCardNum*HiAIMaxDeviceNum -} - -// IsValidCardIDAndDeviceID check two params both needs meet the requirement -func IsValidCardIDAndDeviceID(cardID, deviceID int32) bool { - if !IsValidCardID(cardID) { - return false - } - - return IsValidDeviceID(deviceID) -} - -// IsValidDevNumInCard valid devNum in card -func IsValidDevNumInCard(num int32) bool { - return num > 0 && num <= HiAIMaxDeviceNum -} - -// IsValidVDevID valid vir device id -func IsValidVDevID(vDevID uint32) bool { - return vDevID >= MinVDevID && vDevID < MaxVDevID -} - -// IsValidPortID valid port id -func IsValidPortID(portID int) bool { - return portID == DefaultPingMeshPortID -} - -// IsValidTaskID valid task id -func IsValidTaskID(taskID uint) bool { - return taskID == InternalPingMeshTaskID || taskID == ExternalPingMeshTaskID -} - -// IsValidHccspingMeshOperate valid hccsping mesh operate -func IsValidHccspingMeshOperate(operate HccspingMeshOperate) error { - if len(operate.DstAddr) > MaxHccspingMeshAddr { - return fmt.Errorf("dst addr length %d is invalid, should not be greater than %d", len(operate.DstAddr), - MaxHccspingMeshAddr) - } - if operate.PktSize < MinPktSize || operate.PktSize > MaxPktSize { - return fmt.Errorf("pkt size %d is invalid, should be between %d and %d", operate.PktSize, MinPktSize, MaxPktSize) - } - if operate.PktSendNum < MinPktSendNum || operate.PktSendNum > MaxPktSendNum { - return fmt.Errorf("pkt send num %d is invalid, should be between %d and %d", operate.PktSendNum, - MinPktSendNum, MaxPktSendNum) - } - if operate.PktInterval < MinPktInterval || operate.PktInterval > MaxPktInterval { - return fmt.Errorf("pkt interval %d is invalid, should be between %d and %d", operate.PktInterval, - MinPktInterval, MaxPktInterval) - } - if operate.TaskInterval < MinTaskInterval || operate.TaskInterval > MaxTaskInterval { - return 
fmt.Errorf("task interval %d is invalid, should be between %d and %d", operate.TaskInterval, - MinTaskInterval, MaxTaskInterval) - } - if !IsValidTaskID(uint(operate.TaskId)) { - return fmt.Errorf("task id %d is invalid", operate.TaskId) - } - return nil -} - -// GetDeviceTypeByChipName get device type by chipName -func GetDeviceTypeByChipName(chipName string) string { - if reg310P.MatchString(chipName) { - return api.Ascend310P - } - if strings.Contains(chipName, api.Ascend310BNo) { - return api.Ascend310B - } - if strings.Contains(chipName, api.Ascend310No) { - return api.Ascend310 - } - if reg910B.MatchString(chipName) { - return api.Ascend910B - } - if reg910A.MatchString(chipName) { - return api.Ascend910A - } - return "" -} - -func get910TemplateNameList() map[string]struct{} { - return map[string]struct{}{"vir16": {}, "vir08": {}, "vir04": {}, "vir02": {}, "vir01": {}} -} - -func get910BTemplateNameList() map[string]struct{} { - return map[string]struct{}{ - "vir03_1c_8g": {}, "vir05_1c_8g": {}, "vir05_1c_16g": {}, - "vir06_1c_16g": {}, "vir10_3c_16g": {}, "vir10_3c_16g_nm": {}, - "vir10_3c_32g": {}, "vir10_4c_16g_m": {}, "vir12_3c_32g": {}} -} - -func get310PTemplateNameList() map[string]struct{} { - return map[string]struct{}{"vir04": {}, "vir02": {}, "vir01": {}, "vir04_3c": {}, "vir02_1c": {}, - "vir04_4c_dvpp": {}, "vir04_3c_ndvpp": {}} -} - -// IsValidTemplateName check template name meet the requirement -func IsValidTemplateName(devType, templateName string) bool { - isTemplateNameValid := false - switch devType { - case api.Ascend310P: - _, isTemplateNameValid = get310PTemplateNameList()[templateName] - case api.Ascend910A: - _, isTemplateNameValid = get910TemplateNameList()[templateName] - case api.Ascend910B: - _, isTemplateNameValid = get910BTemplateNameList()[templateName] - default: - } - return isTemplateNameValid -} - -// RemoveDuplicate remove duplicate device -func RemoveDuplicate(list *[]string) []string { - listValueMap := 
make(map[string]string, len(*list)) - var rmDupValueList []string - for _, value := range *list { - listValueMap[value] = value - } - for _, value := range listValueMap { - rmDupValueList = append(rmDupValueList, value) - } - return rmDupValueList -} - -// GetNpuName get npu name eg: name-type-version -func GetNpuName(chipInfo *ChipInfo) string { - if chipInfo == nil { - return "" - } - if len(chipInfo.Name) == 0 && len(chipInfo.Type) == 0 && len(chipInfo.Version) == 0 { - return "" - } - return fmt.Sprintf("%s-%s-%s", chipInfo.Name, chipInfo.Type, chipInfo.Version) -} - -// SetExternalParams transmit npu-exporter's startup parameters -func SetExternalParams(profilingTime int) { - ProfilingTime = profilingTime -} - -// SetHccsBWProfilingTime set hccs bw profiling time -func SetHccsBWProfilingTime(hccsbwProfilingTime int) { - HccsBWProfilingTime = hccsbwProfilingTime -} - -// DeepCopyChipInfo copy chip info deeply -func DeepCopyChipInfo(chipInfo *ChipInfo) *ChipInfo { - if chipInfo == nil { - return nil - } - - return &ChipInfo{ - Type: chipInfo.Type, - Name: chipInfo.Name, - Version: chipInfo.Version, - } -} - -// DeepCopyVDevActivityInfo copy VDevActivityInfo deeply -func DeepCopyVDevActivityInfo(vDevActivityInfo *VDevActivityInfo) *VDevActivityInfo { - if vDevActivityInfo == nil { - return nil - } - - return &VDevActivityInfo{ - VDevID: vDevActivityInfo.VDevID, - VDevAiCoreRate: vDevActivityInfo.VDevAiCoreRate, - VDevTotalMem: vDevActivityInfo.VDevTotalMem, - VDevUsedMem: vDevActivityInfo.VDevUsedMem, - VDevAiCore: vDevActivityInfo.VDevAiCore, - IsVirtualDev: vDevActivityInfo.IsVirtualDev, - } -} - -// DeepCopySlice Deep copy slice -func deepCopySlice(slice interface{}) interface{} { - - switch v := slice.(type) { - case []int: - newSlice := make([]int, len(v)) - copy(newSlice, v) - return newSlice - case []uint32: - newSlice := make([]uint32, len(v)) - copy(newSlice, v) - return newSlice - case []float64: - newSlice := make([]float64, len(v)) - copy(newSlice, v) 
- return newSlice - default: - hwlog.RunLog.Warn("Unsupported slice type") - return slice - } -} - -// GetDevType get device type by chip name,boardId -func GetDevType(chipName string, boardId uint32) string { - var devType string - if Is910A3Chip(boardId) { - devType = api.Ascend910A3 - } else { - devType = GetDeviceTypeByChipName(chipName) - } - return devType -} - -// Is910A3Chip current chip is 910A3 or not,include A900A3 and A9000A3 -func Is910A3Chip(boardId uint32) bool { - return a3BoardIds.Has(int32(boardId)) -} - -// IsA900A3SuperPod current product is A900A3 super pod or not -func IsA900A3SuperPod(mainBoardId uint32) bool { - return a900A3SuperPodMainBoardIds.Has(int32(mainBoardId)) -} - -// IsA9000A3SuperPod current product is A9000A3 super pod or not -func IsA9000A3SuperPod(mainBoardId uint32) bool { - return a9000A3SuperPodMainBoardIds.Has(int32(mainBoardId)) -} - -// Is800IA3Chip current chip is 800IA3 or not -func Is800IA3Chip(mainBoardId uint32) bool { - return mainBoardId == A800IA3MainBoardId -} diff --git a/mind-cluster/component/ascend-common/devmanager/common/utils_test.go b/mind-cluster/component/ascend-common/devmanager/common/utils_test.go deleted file mode 100644 index 548a1c0..0000000 --- a/mind-cluster/component/ascend-common/devmanager/common/utils_test.go +++ /dev/null @@ -1,163 +0,0 @@ -/* Copyright(C) 2021. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -package common - -import ( - "fmt" - "strings" - "testing" - - "github.com/smartystreets/goconvey/convey" -) - -// TestDeepCopyHccsBandwidthInfo TestDeepCopySlice -func TestDeepCopyHccsBandwidthInfo(t *testing.T) { - - convey.Convey("should copy a new []int", t, func() { - slice := []int{1, 2} - newSlice := deepCopySlice(slice) - convey.So(&newSlice, convey.ShouldNotEqual, &slice) - }) - - convey.Convey("should copy a new []int32", t, func() { - slice := []uint32{1, 2} - - newSlice := deepCopySlice(slice) - convey.So(&newSlice, convey.ShouldNotEqual, &slice) - }) - - convey.Convey("should copy a new []float64", t, func() { - slice := []float64{1, 2} - newSlice := deepCopySlice(slice) - convey.So(&newSlice, convey.ShouldNotEqual, &slice) - }) -} - -func TestIsValidPortID(t *testing.T) { - convey.Convey("Given a port ID", t, func() { - convey.Convey("01-When the port ID is invalid, should return false", func() { - portID1 := 1 - convey.So(IsValidPortID(portID1), convey.ShouldBeFalse) - }) - - convey.Convey("02-When the port ID is the default, should return true", func() { - portID3 := DefaultPingMeshPortID - convey.So(IsValidPortID(portID3), convey.ShouldBeTrue) - }) - }) -} - -func TestIsValidTaskID(t *testing.T) { - convey.Convey("Given a task ID", t, func() { - convey.Convey("01-When the task ID is valid, should return true", func() { - taskID1 := InternalPingMeshTaskID - convey.So(IsValidTaskID(taskID1), convey.ShouldBeTrue) - - taskID2 := ExternalPingMeshTaskID - convey.So(IsValidTaskID(taskID2), convey.ShouldBeTrue) - }) - - convey.Convey("02-When the task ID is invalid, should return false", func() { - const taskID3 = 3 - convey.So(IsValidTaskID(taskID3), convey.ShouldBeFalse) - }) - }) -} - -func defaultHccspingMeshOperate() HccspingMeshOperate { - return HccspingMeshOperate{ - DstAddr: "1111", - PktSize: MinPktSize, - PktSendNum: MinPktSendNum, - PktInterval: MinPktInterval, - TaskInterval: MinTaskInterval, - TaskId: int(InternalPingMeshTaskID), - } 
-} - -func check(op HccspingMeshOperate, expectedErr error) { - err := IsValidHccspingMeshOperate(op) - convey.So(err, convey.ShouldResemble, expectedErr) -} - -func expectedError(pattern string, current, min, max int) error { - return fmt.Errorf(pattern, current, min, max) -} - -func TestIsValidHccspingMeshOperate01(t *testing.T) { - convey.Convey("Given a pingmesh operate", t, func() { - op := defaultHccspingMeshOperate() - convey.Convey("01-When operation valid, should return nil", func() { - check(op, nil) - }) - var expectedErr error - convey.Convey("01-When the dst addr is invalid, should return error", func() { - op.DstAddr = strings.Repeat("a", MaxHccspingMeshAddr+1) - expectedErr = fmt.Errorf("dst addr length %d is invalid, should not be greater than %d", len(op.DstAddr), - MaxHccspingMeshAddr) - check(op, expectedErr) - }) - op.DstAddr = "1111" - convey.Convey("02-When the pkt size is invalid, should return error", func() { - pattern := "pkt size %d is invalid, should be between %d and %d" - op.PktSize = MinPktSize - 1 - check(op, expectedError(pattern, op.PktSize, MinPktSize, MaxPktSize)) - op.PktSize = MaxPktSize + 1 - check(op, expectedError(pattern, op.PktSize, MinPktSize, MaxPktSize)) - }) - op.PktSize = MinPktSize - convey.Convey("03-When the pkt send num is invalid, should return error", func() { - pattern := "pkt send num %d is invalid, should be between %d and %d" - op.PktSendNum = MinPktSendNum - 1 - check(op, expectedError(pattern, op.PktSendNum, MinPktSendNum, MaxPktSendNum)) - op.PktSendNum = MaxPktSendNum + 1 - check(op, expectedError(pattern, op.PktSendNum, MinPktSendNum, MaxPktSendNum)) - }) - op.TaskInterval = MinTaskInterval - convey.Convey("06-When the task id is invalid, should return error", func() { - op.TaskId = int(ExternalPingMeshTaskID) + 1 - expectedErr = fmt.Errorf("task id %d is invalid", op.TaskId) - check(op, expectedErr) - }) - }) -} - -func TestIsValidHccspingMeshOperate02(t *testing.T) { - convey.Convey("Given a pingmesh 
operate", t, func() { - op := defaultHccspingMeshOperate() - convey.Convey("04-When the pkt interval is invalid, should return error", func() { - pattern := "pkt interval %d is invalid, should be between %d and %d" - op.PktInterval = MinPktInterval - 1 - check(op, expectedError(pattern, op.PktInterval, MinPktInterval, MaxPktInterval)) - op.PktInterval = MaxPktInterval + 1 - check(op, expectedError(pattern, op.PktInterval, MinPktInterval, MaxPktInterval)) - }) - op.PktInterval = MinPktInterval - convey.Convey("05-When the task interval is invalid, should return error", func() { - pattern := "task interval %d is invalid, should be between %d and %d" - op.TaskInterval = MinTaskInterval - 1 - check(op, expectedError(pattern, op.TaskInterval, MinTaskInterval, MaxTaskInterval)) - op.TaskInterval = MaxTaskInterval + 1 - check(op, expectedError(pattern, op.TaskInterval, MinTaskInterval, MaxTaskInterval)) - }) - op.TaskInterval = MinTaskInterval - var expectedErr error - convey.Convey("06-When the task id is invalid, should return error", func() { - op.TaskId = int(ExternalPingMeshTaskID) + 1 - expectedErr = fmt.Errorf("task id %d is invalid", op.TaskId) - check(op, expectedErr) - }) - }) -} diff --git a/mind-cluster/component/ascend-common/devmanager/dcmi/constants.go b/mind-cluster/component/ascend-common/devmanager/dcmi/constants.go deleted file mode 100644 index bd68af3..0000000 --- a/mind-cluster/component/ascend-common/devmanager/dcmi/constants.go +++ /dev/null @@ -1,78 +0,0 @@ -/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package dcmi this for constants -package dcmi - -// MainCmd main command enum -type MainCmd uint32 - -// VDevMngSubCmd virtual device manager sub command -type VDevMngSubCmd uint32 - -// DieType present chip die type -type DieType int32 - -const ( - // dcmiMaxVdevNum is max number of vdevice, value is from driver specification - dcmiMaxVdevNum = 32 - // dcmiMaxReserveNum is max number of reserve, value is from driver specification - dcmiMaxReserveNum = 8 - // dcmiVDevResNameLen length of vnpu resource name - dcmiVDevResNameLen = 16 - // dcmiHccsMaxPcsNum max pcs number for hccs - dcmiHccsMaxPcsNum = 16 - - maxChipNameLen = 32 - productTypeLen = 64 - dcmiVersionLen = 32 - - // MainCmdChipInf main cmd chip inf - MainCmdChipInf MainCmd = 12 - // MainCmdHccs main cmd of hccs - MainCmdHccs MainCmd = 16 - // MainCmdVDevMng virtual device manager - MainCmdVDevMng MainCmd = 52 - // MainCmdSio SIO status between die - MainCmdSio MainCmd = 56 - - // VmngSubCmdGetVDevResource get virtual device resource info - VmngSubCmdGetVDevResource VDevMngSubCmd = 0 - // VmngSubCmdGetTotalResource get total resource info - VmngSubCmdGetTotalResource VDevMngSubCmd = 1 - // VmngSubCmdGetFreeResource get free resource info - VmngSubCmdGetFreeResource VDevMngSubCmd = 2 - // VmngSubCmdGetVDevActivity get vir device activity info - VmngSubCmdGetVDevActivity VDevMngSubCmd = 5 - // CinfSubCmdGetSPodInfo get super pod info - CinfSubCmdGetSPodInfo VDevMngSubCmd = 1 - // SioSubCmdCrcErrStatistics get SIO err statistics info - SioSubCmdCrcErrStatistics VDevMngSubCmd = 0 - // 
HccsSubCmdGetStatisticInfo get statistic info - HccsSubCmdGetStatisticInfo VDevMngSubCmd = 3 - // HccsSubCmdGetStatisticInfoU64 get statistic info in u64 - HccsSubCmdGetStatisticInfoU64 VDevMngSubCmd = 5 - - // NDIE NDie ID, only Ascend910 has - NDIE DieType = 0 - // VDIE VDie ID, it can be the uuid of chip - VDIE DieType = 1 - // DieIDCount die id array max length - DieIDCount = 5 - - // ipAddrTypeV6 ip address type of IPv6 - ipAddrTypeV6 = 1 - - agentdrvProfDataNum = 3 -) diff --git a/mind-cluster/component/ascend-common/devmanager/dcmi/dcmi.go b/mind-cluster/component/ascend-common/devmanager/dcmi/dcmi.go deleted file mode 100644 index 834397c..0000000 --- a/mind-cluster/component/ascend-common/devmanager/dcmi/dcmi.go +++ /dev/null @@ -1,2213 +0,0 @@ -/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package dcmi this for dcmi manager -package dcmi - -// #cgo LDFLAGS: -ldl -/* - #include - #include - #include - #include - - #include "dcmi_interface_api.h" - - static void *dcmiHandle; - #define SO_NOT_FOUND -99999 - #define FUNCTION_NOT_FOUND -99998 - #define SUCCESS 0 - #define ERROR_UNKNOWN -99997 - #define CALL_FUNC(name,...) 
if(name##_func==NULL){return FUNCTION_NOT_FOUND;}return name##_func(__VA_ARGS__); - - // dcmi - static int (*dcmi_init_func)(); - static int dcmi_init_new(){ - CALL_FUNC(dcmi_init) - } - - static int (*dcmi_get_card_num_list_func)(int *card_num,int *card_list,int list_length); - static int dcmi_get_card_num_list_new(int *card_num,int *card_list,int list_length){ - CALL_FUNC(dcmi_get_card_num_list,card_num,card_list,list_length) - } - - static int (*dcmi_get_device_num_in_card_func)(int card_id,int *device_num); - static int dcmi_get_device_num_in_card_new(int card_id,int *device_num){ - CALL_FUNC(dcmi_get_device_num_in_card,card_id,device_num) - } - - static int (*dcmi_get_device_logic_id_func)(int *device_logic_id,int card_id,int device_id); - static int dcmi_get_device_logic_id_new(int *device_logic_id,int card_id,int device_id){ - CALL_FUNC(dcmi_get_device_logic_id,device_logic_id,card_id,device_id) - } - - static int (*dcmi_create_vdevice_func)(int card_id, int device_id, struct dcmi_create_vdev_res_stru *vdev, - struct dcmi_create_vdev_out *out); - int dcmi_create_vdevice(int card_id, int device_id, struct dcmi_create_vdev_res_stru *vdev, - struct dcmi_create_vdev_out *out){ - CALL_FUNC(dcmi_create_vdevice,card_id,device_id,vdev,out) - } - - static int (*dcmi_get_device_info_func)(int card_id, int device_id, enum dcmi_main_cmd main_cmd, - unsigned int sub_cmd,void *buf, unsigned int *size); - int dcmi_get_device_info(int card_id, int device_id, enum dcmi_main_cmd main_cmd, unsigned int sub_cmd, void *buf, - unsigned int *size){ - CALL_FUNC(dcmi_get_device_info,card_id,device_id,main_cmd,sub_cmd,buf,size) - } - - static int (*dcmi_get_hccs_link_bandwidth_info_func)(int card_id, int device_id, -struct dcmi_hccs_bandwidth_info *hccs_bandwidth_info); - int dcmi_get_hccs_link_bandwidth_info(int card_id, int device_id, -struct dcmi_hccs_bandwidth_info *hccs_bandwidth_info){ - CALL_FUNC(dcmi_get_hccs_link_bandwidth_info,card_id,device_id,hccs_bandwidth_info) - } - - 
static int (*dcmi_set_destroy_vdevice_func)(int card_id,int device_id, unsigned int VDevid); - int dcmi_set_destroy_vdevice(int card_id,int device_id, unsigned int VDevid){ - CALL_FUNC(dcmi_set_destroy_vdevice,card_id,device_id,VDevid) - } - - static int (*dcmi_get_device_type_func)(int card_id,int device_id,enum dcmi_unit_type *device_type); - int dcmi_get_device_type(int card_id,int device_id,enum dcmi_unit_type *device_type){ - CALL_FUNC(dcmi_get_device_type,card_id,device_id,device_type) - } - - static int (*dcmi_get_device_health_func)(int card_id, int device_id, unsigned int *health); - int dcmi_get_device_health(int card_id, int device_id, unsigned int *health){ - CALL_FUNC(dcmi_get_device_health,card_id,device_id,health) - } - - static int (*dcmi_get_device_utilization_rate_func)(int card_id, int device_id, int input_type, - unsigned int *utilization_rate); - int dcmi_get_device_utilization_rate(int card_id, int device_id, int input_type, unsigned int *utilization_rate){ - CALL_FUNC(dcmi_get_device_utilization_rate,card_id,device_id,input_type,utilization_rate) - } - - static int (*dcmi_get_device_temperature_func)(int card_id, int device_id, int *temperature); - int dcmi_get_device_temperature(int card_id, int device_id, int *temperature){ - CALL_FUNC(dcmi_get_device_temperature,card_id,device_id,temperature) - } - - static int (*dcmi_get_device_voltage_func)(int card_id, int device_id, unsigned int *voltage); - int dcmi_get_device_voltage(int card_id, int device_id, unsigned int *voltage){ - CALL_FUNC(dcmi_get_device_voltage,card_id,device_id,voltage) - } - - static int (*dcmi_get_device_power_info_func)(int card_id, int device_id, int *power); - int dcmi_get_device_power_info(int card_id, int device_id, int *power){ - CALL_FUNC(dcmi_get_device_power_info,card_id,device_id,power) - } - - static int (*dcmi_get_device_frequency_func)(int card_id, int device_id, enum dcmi_freq_type input_type, - unsigned int *frequency); - int dcmi_get_device_frequency(int 
card_id, int device_id, enum dcmi_freq_type input_type, unsigned int *frequency){ - CALL_FUNC(dcmi_get_device_frequency,card_id,device_id,input_type,frequency) - } - - static int (*dcmi_get_device_memory_info_v3_func)(int card_id, int device_id, - struct dcmi_get_memory_info_stru *memory_info); - int dcmi_get_device_memory_info_v3(int card_id, int device_id, struct dcmi_get_memory_info_stru *memory_info){ - CALL_FUNC(dcmi_get_device_memory_info_v3,card_id,device_id,memory_info) - } - - static int (*dcmi_get_device_hbm_info_func)(int card_id, int device_id, struct dcmi_hbm_info *hbm_info); - int dcmi_get_device_hbm_info(int card_id, int device_id, struct dcmi_hbm_info *hbm_info){ - CALL_FUNC(dcmi_get_device_hbm_info,card_id,device_id,hbm_info) - } - - static int (*dcmi_get_device_errorcode_v2_func)(int card_id, int device_id, int *error_count, - unsigned int *error_code_list, unsigned int list_len); - int dcmi_get_device_errorcode_v2(int card_id, int device_id, int *error_count, - unsigned int *error_code_list, unsigned int list_len){ - CALL_FUNC(dcmi_get_device_errorcode_v2,card_id,device_id,error_count,error_code_list,list_len) - } - - static int (*dcmi_get_device_chip_info_func)(int card_id, int device_id, struct dcmi_chip_info *chip_info); - int dcmi_get_device_chip_info(int card_id, int device_id, struct dcmi_chip_info *chip_info){ - CALL_FUNC(dcmi_get_device_chip_info,card_id,device_id,chip_info) - } - - static int (*dcmi_get_device_chip_info_v2_func)(int card_id, int device_id, struct dcmi_chip_info_v2 *chip_info); - int dcmi_get_device_chip_info_v2(int card_id, int device_id, struct dcmi_chip_info_v2 *chip_info){ - CALL_FUNC(dcmi_get_device_chip_info_v2,card_id,device_id,chip_info) - } - - static int (*dcmi_get_device_phyid_from_logicid_func)(unsigned int logicid, unsigned int *phyid); - int dcmi_get_device_phyid_from_logicid(unsigned int logicid, unsigned int *phyid){ - CALL_FUNC(dcmi_get_device_phyid_from_logicid,logicid,phyid) - } - - static int 
(*dcmi_get_device_logicid_from_phyid_func)(unsigned int phyid, unsigned int *logicid); - int dcmi_get_device_logicid_from_phyid(unsigned int phyid, unsigned int *logicid){ - CALL_FUNC(dcmi_get_device_logicid_from_phyid,phyid,logicid) - } - - static int (*dcmi_get_device_ip_func)(int card_id, int device_id, enum dcmi_port_type input_type, int port_id, - struct dcmi_ip_addr *ip, struct dcmi_ip_addr *mask); - int dcmi_get_device_ip(int card_id, int device_id, enum dcmi_port_type input_type, int port_id, - struct dcmi_ip_addr *ip, struct dcmi_ip_addr *mask){ - CALL_FUNC(dcmi_get_device_ip,card_id,device_id,input_type,port_id,ip,mask) - } - - static int (*dcmi_get_device_network_health_func)(int card_id, int device_id, - enum dcmi_rdfx_detect_result *result); - int dcmi_get_device_network_health(int card_id, int device_id, enum dcmi_rdfx_detect_result *result){ - CALL_FUNC(dcmi_get_device_network_health,card_id,device_id,result) - } - - static int (*dcmi_get_card_list_func)(int *card_num, int *card_list, int list_len); - int dcmi_get_card_list(int *card_num, int *card_list, int list_len){ - CALL_FUNC(dcmi_get_card_list,card_num,card_list,list_len) - } - - static int (*dcmi_get_device_id_in_card_func)(int card_id, int *device_id_max, int *mcu_id, int *cpu_id); - int dcmi_get_device_id_in_card(int card_id, int *device_id_max, int *mcu_id, int *cpu_id){ - CALL_FUNC(dcmi_get_device_id_in_card,card_id,device_id_max,mcu_id,cpu_id) - } - - static int (*dcmi_get_memory_info_func)(int card_id, int device_id, - struct dcmi_memory_info_stru *device_memory_info); - int dcmi_get_memory_info(int card_id, int device_id, struct dcmi_memory_info_stru *device_memory_info){ - CALL_FUNC(dcmi_get_memory_info,card_id,device_id,device_memory_info) - } - - static int (*dcmi_get_device_errorcode_func)(int card_id, int device_id, int *error_count, unsigned int *error_code, - int *error_width); - int dcmi_get_device_errorcode(int card_id, int device_id, int *error_count, unsigned int *error_code, 
- int *error_width){ - CALL_FUNC(dcmi_get_device_errorcode,card_id,device_id,error_count,error_code,error_width) - } - - static int (*dcmi_get_card_id_device_id_from_logicid_func)(int *card_id, int *device_id, - unsigned int device_logic_id); - int dcmi_get_card_id_device_id_from_logicid(int *card_id, int *device_id, unsigned int device_logic_id){ - CALL_FUNC(dcmi_get_card_id_device_id_from_logicid,card_id,device_id,device_logic_id) - } - - static int (*dcmi_mcu_get_power_info_func)(int card_id, int *power); - static int dcmi_mcu_get_power_info_new(int card_id, int *power){ - CALL_FUNC(dcmi_mcu_get_power_info,card_id,power) - } - - static int (*dcmi_get_product_type_func)(int card_id, int device_id, char *product_type_str, int buf_size); - int dcmi_get_product_type(int card_id, int device_id, char *product_type_str, int buf_size){ - CALL_FUNC(dcmi_get_product_type,card_id,device_id,product_type_str,buf_size) - } - - static int (*dcmi_get_card_elabel_v2_func)(int card_id, struct dcmi_elabel_info *elabel_info); - int dcmi_get_card_elabel_v2(int card_id, struct dcmi_elabel_info *elabel_info){ - CALL_FUNC(dcmi_get_card_elabel_v2,card_id,elabel_info) - } - - static int (*dcmi_set_device_reset_func)(int card_id, int device_id, enum dcmi_reset_channel channel_type); - int dcmi_set_device_reset(int card_id, int device_id, enum dcmi_reset_channel channel_type){ - CALL_FUNC(dcmi_set_device_reset,card_id,device_id,channel_type) - } - - static int (*dcmi_get_device_outband_channel_state_func)(int card_id, int device_id, int* channel_state); - int dcmi_get_device_outband_channel_state(int card_id, int device_id, int* channel_state){ - CALL_FUNC(dcmi_get_device_outband_channel_state,card_id,device_id,channel_state) - } - - static int (*dcmi_pre_reset_soc_func)(int card_id, int device_id); - int dcmi_pre_reset_soc(int card_id, int device_id){ - CALL_FUNC(dcmi_pre_reset_soc,card_id,device_id) - } - - static int (*dcmi_rescan_soc_func)(int card_id, int device_id); - int 
dcmi_rescan_soc(int card_id, int device_id){ - CALL_FUNC(dcmi_rescan_soc,card_id,device_id) - } - - static int (*dcmi_get_netdev_brother_device_func)(int card_id, int device_id, int* brother_card_id); - int dcmi_get_netdev_brother_device(int card_id, int device_id, int* brother_card_id){ - CALL_FUNC(dcmi_get_netdev_brother_device,card_id,device_id,brother_card_id) - } - - static int (*dcmi_get_device_boot_status_func)(int card_id, int device_id, enum dcmi_boot_status *boot_status); - int dcmi_get_device_boot_status(int card_id, int device_id, enum dcmi_boot_status *boot_status){ - CALL_FUNC(dcmi_get_device_boot_status,card_id,device_id,boot_status) - } - - void goEventFaultCallBack(struct dcmi_dms_fault_event); - static void event_handler(struct dcmi_event *fault_event) { - goEventFaultCallBack(fault_event->event_t.dms_event); - } - - static int (*dcmi_subscribe_fault_event_func)(int card_id, int device_id, struct dcmi_event_filter filter, - void (*f_name)(struct dcmi_event *fault_event)); - int dcmi_subscribe_fault_event(int card_id, int device_id, struct dcmi_event_filter filter){ - CALL_FUNC(dcmi_subscribe_fault_event,card_id,device_id,filter,event_handler) - } - - static int (*dcmi_get_npu_work_mode_func)(int card_id, unsigned char *work_mode); - int dcmi_get_npu_work_mode(int card_id, unsigned char *work_mode){ - CALL_FUNC(dcmi_get_npu_work_mode,card_id,work_mode) - } - - static int (*dcmi_get_device_die_v2_func)(int card_id, int device_id, enum dcmi_die_type input_type, - struct dcmi_die_id *die_id); - int dcmi_get_device_die_v2(int card_id, int device_id, enum dcmi_die_type input_type, struct dcmi_die_id *die_id){ - CALL_FUNC(dcmi_get_device_die_v2,card_id,device_id,input_type,die_id) - } - - static int (*dcmi_get_device_resource_info_func)(int card_id, int device_id, struct dcmi_proc_mem_info *proc_info, - int *proc_num); - int dcmi_get_device_resource_info(int card_id, int device_id, struct dcmi_proc_mem_info *proc_info, int *proc_num){ - 
CALL_FUNC(dcmi_get_device_resource_info,card_id,device_id,proc_info,proc_num) - } - - static int (*dcmi_get_device_pcie_info_v2_func)(int card_id, int device_id, struct dcmi_pcie_info_all *pcie_info); - int dcmi_get_device_pcie_info_v2(int card_id, int device_id, struct dcmi_pcie_info_all *pcie_info){ - CALL_FUNC(dcmi_get_device_pcie_info_v2,card_id,device_id,pcie_info) - } - - static int (*dcmi_get_device_board_info_func)(int card_id, int device_id, struct dcmi_board_info *board_info); - int dcmi_get_device_board_info(int card_id, int device_id, struct dcmi_board_info *board_info){ - CALL_FUNC(dcmi_get_device_board_info,card_id,device_id,board_info) - } - - static int (*dcmi_get_pcie_link_bandwidth_info_func)(int card_id, int device_id, - struct dcmi_pcie_link_bandwidth_info *pcie_link_bandwidth_info); - int dcmi_get_pcie_link_bandwidth_info(int card_id, int device_id, - struct dcmi_pcie_link_bandwidth_info *pcie_link_bandwidth_info){ - CALL_FUNC(dcmi_get_pcie_link_bandwidth_info,card_id,device_id,pcie_link_bandwidth_info) - } - - static int (*dcmi_get_dcmi_version_func)(char *dcmi_ver, int buf_size); - int dcmi_get_dcmi_version(char *dcmi_ver, int buf_size){ - CALL_FUNC(dcmi_get_dcmi_version,dcmi_ver,buf_size) - } - - static int (*dcmi_get_device_ecc_info_func)(int card_id, int device_id, enum dcmi_device_type input_type, - struct dcmi_ecc_info *device_ecc_info); - int dcmi_get_device_ecc_info(int card_id, int device_id, enum dcmi_device_type input_type, - struct dcmi_ecc_info *device_ecc_info){ - CALL_FUNC(dcmi_get_device_ecc_info,card_id,device_id,input_type,device_ecc_info) - } - - static int (*dcmi_get_mainboard_id_func)(int card_id, int device_id, unsigned int *mainboard_id); - int dcmi_get_mainboard_id(int card_id, int device_id, unsigned int *mainboard_id){ - CALL_FUNC(dcmi_get_mainboard_id,card_id,device_id,mainboard_id) - } - - static int (*dcmi_start_hccsping_mesh_func)(int card_id, int device_id, int port_id, -struct dcmi_hccsping_mesh_operate 
*hccsping_mesh); - int dcmi_start_hccsping_mesh(int card_id, int device_id, int port_id, -struct dcmi_hccsping_mesh_operate *hccsping_mesh){ - CALL_FUNC(dcmi_start_hccsping_mesh,card_id,device_id,port_id,hccsping_mesh) -} - static int (*dcmi_stop_hccsping_mesh_func)(int card_id, int device_id, int port_id, unsigned int task_id); - int dcmi_stop_hccsping_mesh(int card_id, int device_id, int port_id, unsigned int task_id){ - CALL_FUNC(dcmi_stop_hccsping_mesh,card_id,device_id,port_id,task_id) - } - - static int (*dcmi_get_hccsping_mesh_info_func)(int card_id, int device_id, int port_id, unsigned int task_id, -struct dcmi_hccsping_mesh_info *hccsping_mesh_info); - int dcmi_get_hccsping_mesh_info(int card_id, int device_id, int port_id, unsigned int task_id, -struct dcmi_hccsping_mesh_info *hccsping_mesh_info){ - CALL_FUNC(dcmi_get_hccsping_mesh_info,card_id,device_id,port_id,task_id,hccsping_mesh_info) -} - - static int (*dcmi_get_hccsping_mesh_state_func)(int card_id, int device_id, int port_id, unsigned int task_id, -unsigned int *state); - int dcmi_get_hccsping_mesh_state(int card_id, int device_id, int port_id, unsigned int task_id, -unsigned int *state){ - CALL_FUNC(dcmi_get_hccsping_mesh_state,card_id,device_id,port_id,task_id,state) -} - - static int (*dcmi_get_spod_node_status_func)(int card_id, int device_id, unsigned int sdid, unsigned int *status); - int dcmi_get_spod_node_status(int card_id, int device_id, unsigned int sdid, unsigned int *status){ - CALL_FUNC(dcmi_get_spod_node_status,card_id,device_id,sdid,status) - } - - static int (*dcmi_set_spod_node_status_func)(int card_id, int device_id, unsigned int sdid, unsigned int status); - int dcmi_set_spod_node_status(int card_id, int device_id, unsigned int sdid, unsigned int status){ - CALL_FUNC(dcmi_set_spod_node_status,card_id,device_id,sdid,status) - } - - // load .so files and functions - static int dcmiInit_dl(const char* dcmiLibPath){ - if (dcmiLibPath == NULL) { - fprintf (stderr,"lib path is 
null\n"); - return SO_NOT_FOUND; - } - dcmiHandle = dlopen(dcmiLibPath,RTLD_LAZY | RTLD_GLOBAL); - if (dcmiHandle == NULL){ - fprintf (stderr,"%s\n",dlerror()); - return SO_NOT_FOUND; - } - - dcmi_init_func = dlsym(dcmiHandle,"dcmi_init"); - - dcmi_get_card_num_list_func = dlsym(dcmiHandle,"dcmi_get_card_num_list"); - - dcmi_get_device_num_in_card_func = dlsym(dcmiHandle,"dcmi_get_device_num_in_card"); - - dcmi_get_device_logic_id_func = dlsym(dcmiHandle,"dcmi_get_device_logic_id"); - - dcmi_create_vdevice_func = dlsym(dcmiHandle,"dcmi_create_vdevice"); - - dcmi_get_device_info_func = dlsym(dcmiHandle,"dcmi_get_device_info"); - - dcmi_set_destroy_vdevice_func = dlsym(dcmiHandle,"dcmi_set_destroy_vdevice"); - - dcmi_get_device_type_func = dlsym(dcmiHandle,"dcmi_get_device_type"); - - dcmi_get_device_health_func = dlsym(dcmiHandle,"dcmi_get_device_health"); - - dcmi_get_device_utilization_rate_func = dlsym(dcmiHandle,"dcmi_get_device_utilization_rate"); - - dcmi_get_device_temperature_func = dlsym(dcmiHandle,"dcmi_get_device_temperature"); - - dcmi_get_device_voltage_func = dlsym(dcmiHandle,"dcmi_get_device_voltage"); - - dcmi_get_device_power_info_func = dlsym(dcmiHandle,"dcmi_get_device_power_info"); - - dcmi_get_device_frequency_func = dlsym(dcmiHandle,"dcmi_get_device_frequency"); - - dcmi_get_device_memory_info_v3_func = dlsym(dcmiHandle,"dcmi_get_device_memory_info_v3"); - - dcmi_get_device_hbm_info_func = dlsym(dcmiHandle,"dcmi_get_device_hbm_info"); - - dcmi_get_device_errorcode_v2_func = dlsym(dcmiHandle,"dcmi_get_device_errorcode_v2"); - - dcmi_get_device_chip_info_func = dlsym(dcmiHandle,"dcmi_get_device_chip_info"); - - dcmi_get_device_chip_info_v2_func = dlsym(dcmiHandle,"dcmi_get_device_chip_info_v2"); - - dcmi_get_device_phyid_from_logicid_func = dlsym(dcmiHandle,"dcmi_get_device_phyid_from_logicid"); - - dcmi_get_device_logicid_from_phyid_func = dlsym(dcmiHandle,"dcmi_get_device_logicid_from_phyid"); - - dcmi_get_device_ip_func = 
dlsym(dcmiHandle,"dcmi_get_device_ip"); - - dcmi_get_device_network_health_func = dlsym(dcmiHandle,"dcmi_get_device_network_health"); - - dcmi_get_card_list_func = dlsym(dcmiHandle,"dcmi_get_card_list"); - - dcmi_get_device_id_in_card_func = dlsym(dcmiHandle,"dcmi_get_device_id_in_card"); - - dcmi_get_memory_info_func = dlsym(dcmiHandle,"dcmi_get_memory_info"); - - dcmi_get_device_errorcode_func = dlsym(dcmiHandle,"dcmi_get_device_errorcode"); - - dcmi_get_card_id_device_id_from_logicid_func = dlsym(dcmiHandle,"dcmi_get_card_id_device_id_from_logicid"); - - dcmi_mcu_get_power_info_func = dlsym(dcmiHandle,"dcmi_mcu_get_power_info"); - - dcmi_get_product_type_func = dlsym(dcmiHandle,"dcmi_get_product_type"); - - dcmi_get_card_elabel_v2_func = dlsym(dcmiHandle,"dcmi_get_card_elabel_v2"); - - dcmi_set_device_reset_func = dlsym(dcmiHandle,"dcmi_set_device_reset"); - - dcmi_get_device_outband_channel_state_func = dlsym(dcmiHandle,"dcmi_get_device_outband_channel_state"); - - dcmi_pre_reset_soc_func = dlsym(dcmiHandle,"dcmi_pre_reset_soc"); - - dcmi_rescan_soc_func = dlsym(dcmiHandle,"dcmi_rescan_soc"); - - dcmi_get_netdev_brother_device_func = dlsym(dcmiHandle,"dcmi_get_netdev_brother_device"); - - dcmi_get_device_boot_status_func = dlsym(dcmiHandle,"dcmi_get_device_boot_status"); - - dcmi_subscribe_fault_event_func = dlsym(dcmiHandle,"dcmi_subscribe_fault_event"); - - dcmi_get_npu_work_mode_func = dlsym(dcmiHandle, "dcmi_get_npu_work_mode"); - - dcmi_get_device_die_v2_func = dlsym(dcmiHandle, "dcmi_get_device_die_v2"); - - dcmi_get_device_resource_info_func = dlsym(dcmiHandle, "dcmi_get_device_resource_info"); - - dcmi_get_device_pcie_info_v2_func = dlsym(dcmiHandle, "dcmi_get_device_pcie_info_v2"); - - dcmi_get_device_board_info_func = dlsym(dcmiHandle, "dcmi_get_device_board_info"); - - dcmi_get_pcie_link_bandwidth_info_func = dlsym(dcmiHandle, "dcmi_get_pcie_link_bandwidth_info"); - - dcmi_get_dcmi_version_func = dlsym(dcmiHandle,"dcmi_get_dcmi_version"); - - 
dcmi_get_device_ecc_info_func = dlsym(dcmiHandle,"dcmi_get_device_ecc_info"); - - dcmi_get_mainboard_id_func = dlsym(dcmiHandle, "dcmi_get_mainboard_id"); - - dcmi_get_hccs_link_bandwidth_info_func = dlsym(dcmiHandle,"dcmi_get_hccs_link_bandwidth_info"); - - dcmi_start_hccsping_mesh_func = dlsym(dcmiHandle,"dcmi_start_hccsping_mesh"); - - dcmi_stop_hccsping_mesh_func = dlsym(dcmiHandle,"dcmi_stop_hccsping_mesh"); - - dcmi_get_hccsping_mesh_info_func = dlsym(dcmiHandle,"dcmi_get_hccsping_mesh_info"); - - dcmi_get_hccsping_mesh_state_func = dlsym(dcmiHandle,"dcmi_get_hccsping_mesh_state"); - - dcmi_get_spod_node_status_func = dlsym(dcmiHandle,"dcmi_get_spod_node_status"); - - dcmi_set_spod_node_status_func = dlsym(dcmiHandle,"dcmi_set_spod_node_status"); - - return SUCCESS; - } - - static int dcmiShutDown(void){ - if (dcmiHandle == NULL) { - return SUCCESS; - } - return (dlclose(dcmiHandle) ? ERROR_UNKNOWN : SUCCESS); - } -*/ -import "C" -import ( - "errors" - "fmt" - "math" - "net" - "strconv" - "strings" - "time" - "unsafe" - - "ascend-common/common-utils/hwlog" - "ascend-common/common-utils/utils" - "ascend-common/devmanager/common" -) - -// CDcmiMemoryInfoV3 the c struct of memoryInfo for v3 -type CDcmiMemoryInfoV3 = C.struct_dcmi_get_memory_info_stru - -// CDcmiMemoryInfoV1 the c struct of memoryInfo for v1 -type CDcmiMemoryInfoV1 = C.struct_dcmi_memory_info_stru - -// DcDriverInterface interface for dcmi -type DcDriverInterface interface { - DcInit() error - DcShutDown() error - - DcGetDcmiVersion() (string, error) - DcGetDeviceCount() (int32, error) - DcGetLogicIDList() (int32, []int32, error) - DcGetDeviceHealth(int32, int32) (int32, error) - DcGetDeviceNetWorkHealth(int32, int32) (uint32, error) - DcGetDeviceUtilizationRate(int32, int32, common.DeviceType) (int32, error) - DcGetDeviceTemperature(int32, int32) (int32, error) - DcGetDeviceVoltage(int32, int32) (float32, error) - DcGetDevicePowerInfo(int32, int32) (float32, error) - DcGetDeviceFrequency(int32, 
int32, common.DeviceType) (uint32, error) - DcGetMemoryInfo(int32, int32) (*common.MemoryInfo, error) - DcGetHbmInfo(int32, int32) (*common.HbmInfo, error) - DcGetDeviceErrorCode(int32, int32) (int32, int64, error) - DcGetChipInfo(int32, int32) (*common.ChipInfo, error) - DcGetPhysicIDFromLogicID(int32) (int32, error) - DcGetLogicIDFromPhysicID(int32) (int32, error) - DcGetDeviceLogicID(int32, int32) (int32, error) - DcGetDeviceIPAddress(int32, int32, int32) (string, error) - DcGetMcuPowerInfo(int32) (float32, error) - DcGetDieID(int32, int32, DieType) (string, error) - DcGetPCIeBusInfo(int32, int32) (string, error) - - DcGetCardList() (int32, []int32, error) - DcGetDeviceNumInCard(int32) (int32, error) - DcSetDestroyVirtualDevice(int32, int32, uint32) error - DcCreateVirtualDevice(int32, int32, common.CgoCreateVDevRes) (common.CgoCreateVDevOut, error) - DcGetDeviceVDevResource(int32, int32, uint32) (common.CgoVDevQueryStru, error) - DcGetDeviceTotalResource(int32, int32) (common.CgoSocTotalResource, error) - DcGetDeviceFreeResource(int32, int32) (common.CgoSocFreeResource, error) - DcGetVDevActivityInfo(int32, int32, uint32) (common.VDevActivityInfo, error) - DcVGetDeviceInfo(int32, int32) (common.VirtualDevInfo, error) - DcGetCardIDDeviceID(int32) (int32, int32, error) - DcCreateVDevice(int32, common.CgoCreateVDevRes) (common.CgoCreateVDevOut, error) - DcGetVDeviceInfo(int32) (common.VirtualDevInfo, error) - DcDestroyVDevice(int32, uint32) error - DcGetProductType(int32, int32) (string, error) - DcGetNpuWorkMode(int32) (int, error) - DcSetDeviceReset(int32, int32) error - DcGetBrotherCardID(int32, int32) (int32, error) - DcPreResetSoc(int32, int32) error - DcGetOutBandChannelState(int32, int32) error - DcSetDeviceResetOutBand(int32, int32) error - DcRescanSoc(int32, int32) error - DcGetDeviceBootStatus(int32) (int, error) - DcGetSuperPodInfo(int32, int32) (common.CgoSuperPodInfo, error) - - DcGetDeviceAllErrorCode(int32, int32) (int32, []int64, error) - 
DcSubscribeDeviceFaultEvent(int32, int32) error - DcSetFaultEventCallFunc(func(common.DevFaultInfo)) - DcGetDevProcessInfo(int32, int32) (*common.DevProcessInfo, error) - DcGetDeviceBoardInfo(int32, int32) (common.BoardInfo, error) - DcGetPCIEBandwidth(int32, int32, int) (common.PCIEBwStat, error) - DcGetDeviceEccInfo(int32, int32, common.DcmiDeviceType) (*common.ECCInfo, error) - DcGetSioInfo(int32, int32) (common.SioCrcErrStatisticInfo, error) - DcGetHccsStatisticInfo(int32, int32) (common.HccsStatisticInfo, error) - DcGetHccsStatisticInfoU64(int32, int32) (common.HccsStatisticInfo, error) - DcGetDeviceMainBoardInfo(int32, int32) (uint32, error) - DcGetHccsBandwidthInfo(int32, int32, int) (common.HccsBandwidthInfo, error) - - DcStartHccsPingMesh(int32, int32, int, common.HccspingMeshOperate) error - DcStopHccsPingMesh(int32, int32, int, uint) error - DcGetHccsPingMeshInfo(int32, int32, int, uint) (*common.HccspingMeshInfo, error) - DcGetHccsPingMeshState(int32, int32, int, uint) (int, error) - DcGetSuperPodStatus(int32, int32, uint32) (int, error) - DcSetSuperPodStatus(int32, int32, uint32, uint32) error - DcGetCardElabelV2(int32) (common.ElabelInfo, error) -} - -const ( - dcmiLibraryName = "libdcmi.so" - templateNameLen = 32 - ipAddrListLen = 1024 - hcclpingMeshMaxNum = 48 -) - -var faultEventCallFunc func(common.DevFaultInfo) = nil -var ( - dcmiErrMap = map[int32]string{ - -8001: "The input parameter is incorrect", - -8002: "Permission error", - -8003: "The memory interface operation failed", - -8004: "The security function failed to be executed", - -8005: "Internal errors", - -8006: "Response timed out", - -8007: "Invalid deviceID", - -8008: "The device does not exist", - -8009: "ioctl returns failed", - -8010: "The message failed to be sent", - -8011: "Message reception failed", - -8012: "Not ready yet,please try again", - -8013: "This API is not supported in containers", - -8014: "The file operation failed", - -8015: "Reset failed", - -8016: "Reset cancels", 
- -8017: "Upgrading", - -8020: "Device resources are occupied", - -8022: "Partition consistency check,inconsistent partitions were found", - -8023: "The configuration information does not exist", - -8255: "Device ID/function is not supported", - -99997: "dcmi shutdown failed", - -99998: "The called function is missing,please upgrade the driver", - -99999: "dcmi libdcmi.so failed to load", - } -) - -// DcManager for manager dcmi interface -type DcManager struct{} - -// DcStartHccsPingMesh start hccs ping mesh -func (d *DcManager) DcStartHccsPingMesh(cardID int32, deviceID int32, portID int, - operate common.HccspingMeshOperate) error { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - if !common.IsValidPortID(portID) { - return fmt.Errorf("portID(%d) is invalid", portID) - } - if err := common.IsValidHccspingMeshOperate(operate); err != nil { - return fmt.Errorf("operate(%v) is invalid, err: %v", operate, err) - } - dtsAddrLsit := [ipAddrListLen]C.char{0} - for i := 0; i < len(operate.DstAddr) && i < len(dtsAddrLsit); i++ { - dtsAddrLsit[i] = C.char(operate.DstAddr[i]) - } - - op := C.struct_dcmi_hccsping_mesh_operate{ - dst_addr_list: dtsAddrLsit, - pkt_size: C.int(operate.PktSize), - pkt_send_num: C.int(operate.PktSendNum), - pkt_interval: C.int(operate.PktInterval), - timeout: C.int(operate.Timeout), - task_interval: C.int(operate.TaskInterval), - task_id: C.int(operate.TaskId), - } - if retCode := C.dcmi_start_hccsping_mesh(C.int(cardID), C.int(deviceID), C.int(portID), - &op); retCode != common.Success { - return fmt.Errorf("dcmi start hccs ping mesh failed cardID(%d) deviceID(%d) error code: %d", - cardID, deviceID, int32(retCode)) - } - - return nil -} - -// DcStopHccsPingMesh stop hccs ping mesh -func (d *DcManager) DcStopHccsPingMesh(cardID int32, deviceID int32, portID int, taskID uint) error { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return 
fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - if !common.IsValidPortID(portID) { - return fmt.Errorf("portID(%d) is invalid", portID) - } - if !common.IsValidTaskID(taskID) { - return fmt.Errorf("taskID(%d) is invalid", taskID) - } - if retCode := C.dcmi_stop_hccsping_mesh(C.int(cardID), C.int(deviceID), C.int(portID), - C.uint(taskID)); retCode != common.Success { - return fmt.Errorf("dcmi stop hccs ping mesh failed cardID(%d) deviceID(%d) error code: %d", - cardID, deviceID, int32(retCode)) - } - return nil -} - -// DcGetHccsPingMeshInfo get hccs ping mesh info -func (d *DcManager) DcGetHccsPingMeshInfo(cardID int32, deviceID int32, portID int, - taskID uint) (*common.HccspingMeshInfo, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return nil, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - if !common.IsValidPortID(portID) { - return nil, fmt.Errorf("portID(%d) is invalid", portID) - } - if !common.IsValidTaskID(taskID) { - return nil, fmt.Errorf("taskID(%d) is invalid", taskID) - } - var info C.struct_dcmi_hccsping_mesh_info - if retCode := C.dcmi_get_hccsping_mesh_info(C.int(cardID), C.int(deviceID), C.int(portID), C.uint(taskID), - &info); retCode != common.Success { - return nil, fmt.Errorf("dcmi get hccs ping mesh info failed cardID(%d) deviceID(%d) error code: %d", - cardID, deviceID, int32(retCode)) - } - return convertHccspingMeshInfo(&info) -} - -func convertHccspingMeshInfo(cInfo *C.struct_dcmi_hccsping_mesh_info) (*common.HccspingMeshInfo, error) { - if int(cInfo.dest_num) > hcclpingMeshMaxNum { - return nil, fmt.Errorf("dest_num(%d) is invalid, should not be greater than %d", int(cInfo.dest_num), - hcclpingMeshMaxNum) - } - info := &common.HccspingMeshInfo{} - for i := 0; i < int(cInfo.dest_num); i++ { - info.DstAddr = append(info.DstAddr, convertToString(cInfo.dst_addr[i])) - info.SucPktNum = append(info.SucPktNum, uint(cInfo.suc_pkt_num[i])) - info.FailPktNum = 
append(info.FailPktNum, uint(cInfo.fail_pkt_num[i])) - info.MaxTime = append(info.MaxTime, int(cInfo.max_time[i])) - info.MinTime = append(info.MinTime, int(cInfo.min_time[i])) - info.AvgTime = append(info.AvgTime, int(cInfo.avg_time[i])) - info.TP95Time = append(info.TP95Time, int(cInfo.tp95_time[i])) - info.ReplyStatNum = append(info.ReplyStatNum, int(cInfo.reply_stat_num[i])) - info.PingTotalNum = append(info.PingTotalNum, int(cInfo.ping_total_num[i])) - } - info.DestNum = int(cInfo.dest_num) - return info, nil -} - -// DcGetHccsPingMeshState get hccs ping mesh state -func (d *DcManager) DcGetHccsPingMeshState(cardID int32, deviceID int32, portID int, taskID uint) (int, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.RetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - if !common.IsValidPortID(portID) { - return common.RetError, fmt.Errorf("portID(%d) is invalid", portID) - } - if !common.IsValidTaskID(taskID) { - return common.RetError, fmt.Errorf("taskID(%d) is invalid", taskID) - } - var state C.uint - if retCode := C.dcmi_get_hccsping_mesh_state(C.int(cardID), C.int(deviceID), C.int(portID), C.uint(taskID), - &state); retCode != common.Success { - return common.RetError, fmt.Errorf("dcmi get hccs ping mesh state failed cardID(%d) deviceID(%d) error "+ - "code: %d", cardID, deviceID, int32(retCode)) - } - return int(state), nil -} - -// DcInit load symbol and initialize dcmi -func (d *DcManager) DcInit() error { - dcmiLibPath, err := utils.GetDriverLibPath(dcmiLibraryName) - if err != nil { - return err - } - cDcmiTemplateName := C.CString(dcmiLibPath) - defer C.free(unsafe.Pointer(cDcmiTemplateName)) - if retCode := C.dcmiInit_dl(cDcmiTemplateName); retCode != C.SUCCESS { - return fmt.Errorf("dcmi lib load failed, error code: %d", int32(retCode)) - } - if retCode := C.dcmi_init_new(); retCode != C.SUCCESS { - return fmt.Errorf("dcmi init failed, error code: %d", int32(retCode)) - } - return 
nil -} - -// DcShutDown clean the dynamically loaded resource -func (d *DcManager) DcShutDown() error { - if retCode := C.dcmiShutDown(); retCode != C.SUCCESS { - return fmt.Errorf("dcmi shut down failed, error code: %d", int32(retCode)) - } - - return nil -} - -// DcGetCardList get card list -func (d *DcManager) DcGetCardList() (int32, []int32, error) { - var ids [common.HiAIMaxCardNum]C.int - var cNum C.int - if retCode := C.dcmi_get_card_list(&cNum, &ids[0], common.HiAIMaxCardNum); int32(retCode) != common. - Success { - return common.RetError, nil, fmt.Errorf("get card list failed, error code: %d", int32(retCode)) - } - // checking card's quantity - if cNum <= 0 || cNum > common.HiAIMaxCardNum { - return common.RetError, nil, fmt.Errorf("get error card quantity: %d", int32(cNum)) - } - var cardNum = int32(cNum) - var i int32 - var cardIDList []int32 - for i = 0; i < cardNum; i++ { - cardID := int32(ids[i]) - if cardID < 0 { - hwlog.RunLog.Errorf("get invalid card ID: %d", cardID) - continue - } - cardIDList = append(cardIDList, cardID) - } - return cardNum, cardIDList, nil -} - -// DcGetDeviceNumInCard get device number in the npu card -func (d *DcManager) DcGetDeviceNumInCard(cardID int32) (int32, error) { - if !common.IsValidCardID(cardID) { - return common.RetError, fmt.Errorf("cardID(%d) is invalid", cardID) - } - var deviceNum C.int - if retCode := C.dcmi_get_device_num_in_card_new(C.int(cardID), &deviceNum); int32(retCode) != common.Success { - return common.RetError, fmt.Errorf("get device count on the card failed, error code: %d", int32(retCode)) - } - if !common.IsValidDevNumInCard(int32(deviceNum)) { - return common.RetError, fmt.Errorf("get error device quantity: %d", int32(deviceNum)) - } - return int32(deviceNum), nil -} - -// DcGetDeviceLogicID get device logicID -func (d *DcManager) DcGetDeviceLogicID(cardID, deviceID int32) (int32, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.RetError, fmt.Errorf("cardID(%d) 
or deviceID(%d) is invalid", cardID, deviceID) - } - var logicID C.int - if retCode := C.dcmi_get_device_logic_id_new(&logicID, C.int(cardID), - C.int(deviceID)); int32(retCode) != common.Success { - return common.RetError, fmt.Errorf("failed to get logicID by cardID(%d) and deviceID(%d), error code: %d", - cardID, deviceID, int32(retCode)) - } - - // check whether logicID is invalid - if !common.IsValidLogicIDOrPhyID(int32(logicID)) { - return common.RetError, fmt.Errorf("get invalid logicID: %d", int32(logicID)) - } - return int32(logicID), nil -} - -// DcSetDestroyVirtualDevice destroy virtual device -func (d *DcManager) DcSetDestroyVirtualDevice(cardID, deviceID int32, vDevID uint32) error { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - if retCode := C.dcmi_set_destroy_vdevice(C.int(cardID), C.int(deviceID), - C.uint(vDevID)); int32(retCode) != common.Success { - return fmt.Errorf("destroy virtual device failed, error code: %d", int32(retCode)) - } - return nil -} - -func convertCreateVDevOut(cCreateVDevOut C.struct_dcmi_create_vdev_out) common.CgoCreateVDevOut { - cgoCreateVDevOut := common.CgoCreateVDevOut{ - VDevID: uint32(cCreateVDevOut.vdev_id), - PcieBus: uint32(cCreateVDevOut.pcie_bus), - PcieDevice: uint32(cCreateVDevOut.pcie_device), - PcieFunc: uint32(cCreateVDevOut.pcie_func), - VfgID: uint32(cCreateVDevOut.vfg_id), - } - return cgoCreateVDevOut -} - -// DcCreateVirtualDevice create virtual device -func (d *DcManager) DcCreateVirtualDevice(cardID, deviceID int32, vDevInfo common.CgoCreateVDevRes) (common. 
- CgoCreateVDevOut, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.CgoCreateVDevOut{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - if len(vDevInfo.TemplateName) > templateNameLen { - return common.CgoCreateVDevOut{}, fmt.Errorf("the length of template name exceeds the upper limit") - } - cTemplateName := [templateNameLen]C.char{0} - for i := 0; i < len(vDevInfo.TemplateName); i++ { - cTemplateName[i] = C.char(vDevInfo.TemplateName[i]) - } - deviceCreateStr := C.struct_dcmi_create_vdev_res_stru{ - vdev_id: C.uint(vDevInfo.VDevID), - vfg_id: C.uint(vDevInfo.VfgID), - template_name: cTemplateName, - } - - var createVDevOut C.struct_dcmi_create_vdev_out - if retCode := C.dcmi_create_vdevice(C.int(cardID), C.int(deviceID), &deviceCreateStr, - &createVDevOut); int32(retCode) != common.Success { - return common.CgoCreateVDevOut{}, fmt.Errorf("create vdevice failed, error is: %d", int32(retCode)) - } - - return convertCreateVDevOut(createVDevOut), nil -} - -func convertToString(cgoArr [dcmiVDevResNameLen]C.char) string { - var charArr []rune - for _, v := range cgoArr { - if v == 0 { - break - } - charArr = append(charArr, rune(v)) - } - return string(charArr) -} - -func convertBaseResource(cBaseResource C.struct_dcmi_base_resource) common.CgoBaseResource { - baseResource := common.CgoBaseResource{ - Token: uint64(cBaseResource.token), - TokenMax: uint64(cBaseResource.token_max), - TaskTimeout: uint64(cBaseResource.task_timeout), - VfgID: uint32(cBaseResource.vfg_id), - VipMode: uint8(cBaseResource.vip_mode), - } - return baseResource -} - -func convertComputingResource(cComputingResource C.struct_dcmi_computing_resource) common.CgoComputingResource { - computingResource := common.CgoComputingResource{ - Aic: float32(cComputingResource.aic), - Aiv: float32(cComputingResource.aiv), - Dsa: uint16(cComputingResource.dsa), - Rtsq: uint16(cComputingResource.rtsq), - Acsq: uint16(cComputingResource.acsq), - 
Cdqm: uint16(cComputingResource.cdqm), - CCore: uint16(cComputingResource.c_core), - Ffts: uint16(cComputingResource.ffts), - Sdma: uint16(cComputingResource.sdma), - PcieDma: uint16(cComputingResource.pcie_dma), - MemorySize: uint64(cComputingResource.memory_size), - EventID: uint32(cComputingResource.event_id), - NotifyID: uint32(cComputingResource.notify_id), - StreamID: uint32(cComputingResource.stream_id), - ModelID: uint32(cComputingResource.model_id), - TopicScheduleAicpu: uint16(cComputingResource.topic_schedule_aicpu), - HostCtrlCPU: uint16(cComputingResource.host_ctrl_cpu), - HostAicpu: uint16(cComputingResource.host_aicpu), - DeviceAicpu: uint16(cComputingResource.device_aicpu), - TopicCtrlCPUSlot: uint16(cComputingResource.topic_ctrl_cpu_slot), - } - return computingResource -} - -func convertMediaResource(cMediaResource C.struct_dcmi_media_resource) common.CgoMediaResource { - mediaResource := common.CgoMediaResource{ - Jpegd: float32(cMediaResource.jpegd), - Jpege: float32(cMediaResource.jpege), - Vpc: float32(cMediaResource.vpc), - Vdec: float32(cMediaResource.vdec), - Pngd: float32(cMediaResource.pngd), - Venc: float32(cMediaResource.venc), - } - return mediaResource -} - -func convertVDevQueryInfo(cVDevQueryInfo C.struct_dcmi_vdev_query_info) common.CgoVDevQueryInfo { - name := convertToString(cVDevQueryInfo.name) - vDevQueryInfo := common.CgoVDevQueryInfo{ - Name: string(name), - Status: uint32(cVDevQueryInfo.status), - IsContainerUsed: uint32(cVDevQueryInfo.is_container_used), - Vfid: uint32(cVDevQueryInfo.vfid), - VfgID: uint32(cVDevQueryInfo.vfg_id), - ContainerID: uint64(cVDevQueryInfo.container_id), - Base: convertBaseResource(cVDevQueryInfo.base), - Computing: convertComputingResource(cVDevQueryInfo.computing), - Media: convertMediaResource(cVDevQueryInfo.media), - } - return vDevQueryInfo -} - -func convertVDevQueryStru(cVDevQueryStru C.struct_dcmi_vdev_query_stru) common.CgoVDevQueryStru { - vDevQueryStru := common.CgoVDevQueryStru{ - 
VDevID: uint32(cVDevQueryStru.vdev_id), - QueryInfo: convertVDevQueryInfo(cVDevQueryStru.query_info), - } - return vDevQueryStru -} - -// DcGetDeviceVDevResource get virtual device resource info -func (d *DcManager) DcGetDeviceVDevResource(cardID, deviceID int32, vDevID uint32) (common.CgoVDevQueryStru, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.CgoVDevQueryStru{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var cMainCmd = C.enum_dcmi_main_cmd(MainCmdVDevMng) - subCmd := VmngSubCmdGetVDevResource - var vDevResource C.struct_dcmi_vdev_query_stru - size := C.uint(unsafe.Sizeof(vDevResource)) - vDevResource.vdev_id = C.uint(vDevID) - if retCode := C.dcmi_get_device_info(C.int(cardID), C.int(deviceID), cMainCmd, C.uint(subCmd), - unsafe.Pointer(&vDevResource), &size); int32(retCode) != common.Success { - return common.CgoVDevQueryStru{}, fmt.Errorf("get device info failed, error is: %d", int32(retCode)) - } - return convertVDevQueryStru(vDevResource), nil -} - -func convertSocTotalResource(cSocTotalResource C.struct_dcmi_soc_total_resource) common.CgoSocTotalResource { - socTotalResource := common.CgoSocTotalResource{ - VDevNum: uint32(cSocTotalResource.vdev_num), - VfgNum: uint32(cSocTotalResource.vfg_num), - VfgBitmap: uint32(cSocTotalResource.vfg_bitmap), - Base: convertBaseResource(cSocTotalResource.base), - Computing: convertComputingResource(cSocTotalResource.computing), - Media: convertMediaResource(cSocTotalResource.media), - } - for i := uint32(0); i < uint32(cSocTotalResource.vdev_num) && i < dcmiMaxVdevNum; i++ { - socTotalResource.VDevID = append(socTotalResource.VDevID, uint32(cSocTotalResource.vdev_id[i])) - } - return socTotalResource -} - -// DcGetDeviceTotalResource get device total resource info -func (d *DcManager) DcGetDeviceTotalResource(cardID, deviceID int32) (common.CgoSocTotalResource, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return 
common.CgoSocTotalResource{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var cMainCmd = C.enum_dcmi_main_cmd(MainCmdVDevMng) - subCmd := VmngSubCmdGetTotalResource - var totalResource C.struct_dcmi_soc_total_resource - size := C.uint(unsafe.Sizeof(totalResource)) - if retCode := C.dcmi_get_device_info(C.int(cardID), C.int(deviceID), cMainCmd, C.uint(subCmd), - unsafe.Pointer(&totalResource), &size); int32(retCode) != common.Success { - return common.CgoSocTotalResource{}, fmt.Errorf("get device info failed, error is: %d", int32(retCode)) - } - if uint32(totalResource.vdev_num) > dcmiMaxVdevNum { - return common.CgoSocTotalResource{}, fmt.Errorf("get error virtual quantity: %d", - uint32(totalResource.vdev_num)) - } - - return convertSocTotalResource(totalResource), nil -} - -func convertSuperPodInfo(cSuperPodInfo C.struct_dcmi_spod_info) common.CgoSuperPodInfo { - superPodInfo := common.CgoSuperPodInfo{ - SdId: uint32(cSuperPodInfo.sdid), - ScaleType: uint32(cSuperPodInfo.scale_type), - SuperPodId: uint32(cSuperPodInfo.super_pod_id), - ServerId: uint32(cSuperPodInfo.server_id), - } - - for i := uint32(0); i < dcmiMaxReserveNum; i++ { - superPodInfo.Reserve = append(superPodInfo.Reserve, uint32(cSuperPodInfo.reserve[i])) - } - - return superPodInfo -} - -// DcGetSuperPodInfo get device total resource info -func (d *DcManager) DcGetSuperPodInfo(cardID, deviceID int32) (common.CgoSuperPodInfo, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.CgoSuperPodInfo{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - - var unitType C.enum_dcmi_unit_type - if retCode := C.dcmi_get_device_type(C.int(cardID), C.int(deviceID), &unitType); int32(retCode) != common.Success { - return common.CgoSuperPodInfo{}, fmt.Errorf("get device type failed, error is: %d", int32(retCode)) - } - if int32(unitType) != common.NpuType { - return common.CgoSuperPodInfo{}, fmt.Errorf("not support unit type: 
%d", int32(unitType)) - } - - var cMainCmd = C.enum_dcmi_main_cmd(MainCmdChipInf) - subCmd := CinfSubCmdGetSPodInfo - var sPodInfo C.struct_dcmi_spod_info - size := C.uint(unsafe.Sizeof(sPodInfo)) - if retCode := C.dcmi_get_device_info(C.int(cardID), C.int(deviceID), cMainCmd, C.uint(subCmd), - unsafe.Pointer(&sPodInfo), &size); int32(retCode) != common.Success { - return common.CgoSuperPodInfo{}, fmt.Errorf("get super pod info failed, error is: %d", int32(retCode)) - } - - return convertSuperPodInfo(sPodInfo), nil -} - -func convertSocFreeResource(cSocFreeResource C.struct_dcmi_soc_free_resource) common.CgoSocFreeResource { - socFreeResource := common.CgoSocFreeResource{ - VfgNum: uint32(cSocFreeResource.vfg_num), - VfgBitmap: uint32(cSocFreeResource.vfg_bitmap), - Base: convertBaseResource(cSocFreeResource.base), - Computing: convertComputingResource(cSocFreeResource.computing), - Media: convertMediaResource(cSocFreeResource.media), - } - return socFreeResource -} - -// DcGetDeviceFreeResource get device free resource info -func (d *DcManager) DcGetDeviceFreeResource(cardID, deviceID int32) (common.CgoSocFreeResource, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.CgoSocFreeResource{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var cMainCmd = C.enum_dcmi_main_cmd(MainCmdVDevMng) - subCmd := VmngSubCmdGetFreeResource - var freeResource C.struct_dcmi_soc_free_resource - size := C.uint(unsafe.Sizeof(freeResource)) - if retCode := C.dcmi_get_device_info(C.int(cardID), C.int(deviceID), cMainCmd, C.uint(subCmd), - unsafe.Pointer(&freeResource), &size); int32(retCode) != common.Success { - return common.CgoSocFreeResource{}, fmt.Errorf("get device info failed, error is: %d", int32(retCode)) - } - return convertSocFreeResource(freeResource), nil -} - -// DcGetVDevActivityInfo get vir device activity info by virtual device id -func (d *DcManager) DcGetVDevActivityInfo(cardID, deviceID int32, vDevID 
uint32) (common.VDevActivityInfo, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.VDevActivityInfo{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - if !common.IsValidVDevID(vDevID) { - return common.VDevActivityInfo{}, fmt.Errorf("vDevID(%d) invalid", vDevID) - } - var cMainCmd = C.enum_dcmi_main_cmd(MainCmdVDevMng) - subCmd := VmngSubCmdGetVDevActivity - var vDevActivityInfo C.struct_dcmi_vdev_query_stru - size := C.uint(unsafe.Sizeof(vDevActivityInfo)) - vDevActivityInfo.vdev_id = C.uint(vDevID) - if retCode := C.dcmi_get_device_info(C.int(cardID), C.int(deviceID), cMainCmd, C.uint(subCmd), - unsafe.Pointer(&vDevActivityInfo), &size); int32(retCode) != common.Success { - return common.VDevActivityInfo{}, fmt.Errorf("retCode: %d", int32(retCode)) - } - totalMemSize := uint64(vDevActivityInfo.query_info.computing.vdev_memory_total) - usedMemSize := totalMemSize - uint64(vDevActivityInfo.query_info.computing.vdev_memory_free) - if usedMemSize < 0 { - return common.VDevActivityInfo{}, errors.New("used memory value abnormal") - } - return common.VDevActivityInfo{ - VDevID: vDevID, - VDevAiCoreRate: uint32(vDevActivityInfo.query_info.computing.vdev_aicore_utilization), - VDevTotalMem: totalMemSize, - VDevUsedMem: usedMemSize, - IsVirtualDev: true, - }, nil -} - -// DcVGetDeviceInfo get vdevice resource info -func (d *DcManager) DcVGetDeviceInfo(cardID, deviceID int32) (common.VirtualDevInfo, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.VirtualDevInfo{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var unitType C.enum_dcmi_unit_type - if retCode := C.dcmi_get_device_type(C.int(cardID), C.int(deviceID), &unitType); int32(retCode) != common.Success { - return common.VirtualDevInfo{}, fmt.Errorf("get device type failed, error is: %d", int32(retCode)) - } - if int32(unitType) != common.NpuType { - return common.VirtualDevInfo{}, 
fmt.Errorf("not support unit type: %d", int32(unitType)) - } - - cgoDcmiSocTotalResource, err := d.DcGetDeviceTotalResource(cardID, deviceID) - if err != nil { - return common.VirtualDevInfo{}, fmt.Errorf("get device total resource failed, error is: %v", err) - } - - cgoDcmiSocFreeResource, err := d.DcGetDeviceFreeResource(cardID, deviceID) - if err != nil { - return common.VirtualDevInfo{}, fmt.Errorf("get device free resource failed, error is: %v", err) - } - dcmiVDevInfo := common.VirtualDevInfo{ - TotalResource: cgoDcmiSocTotalResource, - FreeResource: cgoDcmiSocFreeResource, - } - for _, vDevID := range cgoDcmiSocTotalResource.VDevID { - cgoVDevQueryStru, err := d.DcGetDeviceVDevResource(cardID, deviceID, vDevID) - if err != nil { - return common.VirtualDevInfo{}, fmt.Errorf("get device virtual resource failed, error is: %v", err) - } - dcmiVDevInfo.VDevInfo = append(dcmiVDevInfo.VDevInfo, cgoVDevQueryStru) - vDevActivityInfo, err := d.DcGetVDevActivityInfo(cardID, deviceID, vDevID) - if err != nil { - hwlog.RunLog.Warnf("get cur vDev's activity info failed, err: %s", err) - continue - } - vDevActivityInfo.VDevAiCore = float64(cgoVDevQueryStru.QueryInfo.Computing.Aic) - dcmiVDevInfo.VDevActivityInfo = append(dcmiVDevInfo.VDevActivityInfo, vDevActivityInfo) - } - return dcmiVDevInfo, nil -} - -// DcGetCardIDDeviceID get card id and device id from logic id -func (d *DcManager) DcGetCardIDDeviceID(logicID int32) (int32, int32, error) { - if !common.IsValidLogicIDOrPhyID(logicID) { - return common.RetError, common.RetError, fmt.Errorf("input invalid logicID: %d", logicID) - } - var cardID, deviceID C.int - if retCode := C.dcmi_get_card_id_device_id_from_logicid(&cardID, &deviceID, - C.uint(logicID)); int32(retCode) != common.Success { - return common.RetError, common.RetError, - fmt.Errorf("failed to get card id and device id by logicID(%d), errorcode is: %d", logicID, - int32(retCode)) - } - if !common.IsValidCardIDAndDeviceID(int32(cardID), int32(deviceID)) { - 
return common.RetError, common.RetError, fmt.Errorf("failed to get card id and device id, "+ - "cardID(%d) or deviceID(%d) is invalid", int32(cardID), int32(deviceID)) - } - - return int32(cardID), int32(deviceID), nil -} - -// DcCreateVDevice create virtual device by logic id -func (d *DcManager) DcCreateVDevice(logicID int32, vDevInfo common.CgoCreateVDevRes) (common. - CgoCreateVDevOut, error) { - if !common.IsValidLogicIDOrPhyID(logicID) { - return common.CgoCreateVDevOut{}, fmt.Errorf("input invalid logicID: %d", logicID) - } - cardID, deviceID, err := d.DcGetCardIDDeviceID(logicID) - if err != nil { - return common.CgoCreateVDevOut{}, fmt.Errorf("get card id and device id failed, error is: %v", err) - } - - createVDevOut, err := d.DcCreateVirtualDevice(cardID, deviceID, vDevInfo) - if err != nil { - return common.CgoCreateVDevOut{}, fmt.Errorf("create virtual device failed, error is: %v", err) - } - return createVDevOut, nil -} - -// DcGetVDeviceInfo get virtual device info by logic id -func (d *DcManager) DcGetVDeviceInfo(logicID int32) (common.VirtualDevInfo, error) { - if !common.IsValidLogicIDOrPhyID(logicID) { - return common.VirtualDevInfo{}, fmt.Errorf("input invalid logicID: %d", logicID) - } - cardID, deviceID, err := d.DcGetCardIDDeviceID(logicID) - if err != nil { - return common.VirtualDevInfo{}, fmt.Errorf("get card id and device id failed, error is: %v", err) - } - - dcmiVDevInfo, err := d.DcVGetDeviceInfo(cardID, deviceID) - if err != nil { - return common.VirtualDevInfo{}, fmt.Errorf("get virtual device info failed, error is: %v", err) - } - return dcmiVDevInfo, nil -} - -// DcDestroyVDevice destroy spec virtual device by logic id -func (d *DcManager) DcDestroyVDevice(logicID int32, vDevID uint32) error { - if !common.IsValidLogicIDOrPhyID(logicID) { - return fmt.Errorf("input invalid logicID: %d", logicID) - } - cardID, deviceID, err := d.DcGetCardIDDeviceID(logicID) - if err != nil { - return fmt.Errorf("get card id and device id failed, 
error is: %v", err) - } - - if err = d.DcSetDestroyVirtualDevice(cardID, deviceID, vDevID); err != nil { - return fmt.Errorf("destroy virtual device failed, error is: %v", err) - } - return nil -} - -// DcGetDeviceVoltage the accuracy is 0.01v. -func (d *DcManager) DcGetDeviceVoltage(cardID, deviceID int32) (float32, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.RetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var vol C.uint - if retCode := C.dcmi_get_device_voltage(C.int(cardID), C.int(deviceID), &vol); int32(retCode) != common.Success { - return common.RetError, fmt.Errorf("failed to obtain the voltage based on card_id(%d) and "+ - "device_id(%d), error code: %d", cardID, deviceID, int32(retCode)) - } - // the voltage's value is error if it's greater than or equal to MaxInt32 - if common.IsGreaterThanOrEqualInt32(int64(vol)) { - return common.RetError, fmt.Errorf("voltage value out of range(max is int32), "+ - "card_id(%d) and device_id(%d), voltage: %d", cardID, deviceID, int64(vol)) - } - - return float32(vol) * common.ReduceOnePercent, nil -} - -// DcGetDevicePowerInfo the accuracy is 0.1w, the result like: 8.2 -func (d *DcManager) DcGetDevicePowerInfo(cardID, deviceID int32) (float32, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.RetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var cpower C.int - if retCode := C.dcmi_get_device_power_info(C.int(cardID), C.int(deviceID), - &cpower); int32(retCode) != common.Success { - return common.RetError, fmt.Errorf("failed to obtain the power based on card_id(%d) and device_id(%d)"+ - ", error code: %d", cardID, deviceID, int32(retCode)) - } - parsedPower := float32(cpower) - if parsedPower < 0 { - return common.RetError, fmt.Errorf("get wrong device power, card_id(%d) and device_id(%d), power: %f", - cardID, deviceID, parsedPower) - } - - return parsedPower * 
common.ReduceTenth, nil - -} - -// DcGetDeviceFrequency get device frequency, unit MHz -// Ascend910B with frequency type: 2,6,7,9 -// Ascend910 with frequency type: 2,6,7,9 -// Ascend310 with frequency type: 1,2,6,7,9 -// Ascend310P with frequency type: 1,2,7,9,12 -// more information see common.DeviceType -func (d *DcManager) DcGetDeviceFrequency(cardID, deviceID int32, devType common.DeviceType) (uint32, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.UnRetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var cFrequency C.uint - if retCode := C.dcmi_get_device_frequency(C.int(cardID), C.int(deviceID), C.enum_dcmi_freq_type(devType.Code), - &cFrequency); int32(retCode) != common.Success { - return common.UnRetError, - buildDcmiErr(cardID, deviceID, fmt.Sprintf("frequency (name: %v, code:%d)", devType.Name, devType.Code), retCode) - } - // check whether cFrequency is too big - if common.IsGreaterThanOrEqualInt32(int64(cFrequency)) || int64(cFrequency) < 0 { - return common.UnRetError, fmt.Errorf("frequency value out of range [0, int32),card_id(%d) and device_id(%d), "+ - "frequency (name: %v, code:%d): %d", cardID, deviceID, devType.Name, devType.Code, int64(cFrequency)) - } - return uint32(cFrequency), nil -} - -// DcGetMemoryInfo use v3 interface to query memory info -func (d *DcManager) DcGetMemoryInfo(cardID, deviceID int32) (*common.MemoryInfo, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return nil, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var cmInfoV3 CDcmiMemoryInfoV3 - if retCode := C.dcmi_get_device_memory_info_v3(C.int(cardID), C.int(deviceID), - &cmInfoV3); int32(retCode) != common.Success { - return nil, fmt.Errorf("failed to obtain the memory info by v3 interface based on card_id("+ - "%d) and device_id(%d), error code: %d", cardID, deviceID, int32(retCode)) - } - - if uint64(cmInfoV3.memory_size) < 
uint64(cmInfoV3.memory_available) { - return nil, fmt.Errorf("failed to obtain the memory info by v3 interface based on card_id("+ - "%d) and device_id(%d), total memory is less than available memory", cardID, deviceID) - } - - return &common.MemoryInfo{ - MemorySize: uint64(cmInfoV3.memory_size), - MemoryAvailable: uint64(cmInfoV3.memory_available), - Frequency: uint32(cmInfoV3.freq), - Utilization: uint32(cmInfoV3.utiliza), - }, nil - -} - -// FuncDcmiGetDeviceHbmInfo dcmi_get_device_hbm_info function for outer invoke, only for Ascend910 -func FuncDcmiGetDeviceHbmInfo(cardID, deviceID int32) (*common.HbmInfo, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return nil, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var cHbmInfo C.struct_dcmi_hbm_info - if retCode := C.dcmi_get_device_hbm_info(C.int(cardID), C.int(deviceID), - &cHbmInfo); int32(retCode) != common.Success { - return nil, buildDcmiErr(cardID, deviceID, "high bandwidth memory info", retCode) - } - hbmTemp := int32(cHbmInfo.temp) - if hbmTemp < 0 { - return nil, fmt.Errorf("get wrong device HBM temporary, card_id(%d) and device_id(%d), HBM.temp: %d", - cardID, deviceID, hbmTemp) - } - return &common.HbmInfo{ - MemorySize: uint64(cHbmInfo.memory_size), - Frequency: uint32(cHbmInfo.freq), - Usage: uint64(cHbmInfo.memory_usage), - Temp: hbmTemp, - BandWidthUtilRate: uint32(cHbmInfo.bandwith_util_rate)}, nil -} - -// DcGetHbmInfo get HBM information A310/A310P not support -func (d *DcManager) DcGetHbmInfo(cardID, deviceID int32) (*common.HbmInfo, error) { - return &common.HbmInfo{ - MemorySize: 0, - Frequency: 0, - Usage: 0, - Temp: 0, - BandWidthUtilRate: 0}, nil -} - -// DcGetDeviceErrorCode get the error count and errorcode of the device,only return the first errorcode -func (d *DcManager) DcGetDeviceErrorCode(cardID, deviceID int32) (int32, int64, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.RetError, 
common.RetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, - deviceID) - } - var errCount C.int - var errCodeArray [common.MaxErrorCodeCount]C.uint - if retCode := C.dcmi_get_device_errorcode_v2(C.int(cardID), C.int(deviceID), &errCount, &errCodeArray[0], - common.MaxErrorCodeCount); int32(retCode) != common.Success { - return common.RetError, common.RetError, fmt.Errorf("failed to obtain the device errorcode based on "+ - "card_id(%d) and device_id(%d), error code: %d, error count: %d", cardID, deviceID, int32(retCode), - int32(errCount)) - } - - if int32(errCount) < 0 || int32(errCount) > common.MaxErrorCodeCount { - return common.RetError, common.RetError, fmt.Errorf("get wrong errorcode count, "+ - "card_id(%d) and device_id(%d), errorcode count: %d", cardID, deviceID, int32(errCount)) - } - - return int32(errCount), int64(errCodeArray[0]), nil -} - -// DcGetDeviceCount get device count -func (d *DcManager) DcGetDeviceCount() (int32, error) { - devNum, _, err := d.DcGetLogicIDList() - if err != nil { - return common.RetError, fmt.Errorf("get device count failed, error: %v", err) - } - return devNum, nil -} - -// DcGetLogicIDList get device logic id list -func (d *DcManager) DcGetLogicIDList() (int32, []int32, error) { - logicIDs := make([]int32, 0) - var totalNum int32 - _, cardList, err := d.DcGetCardList() - if err != nil { - return common.RetError, logicIDs, fmt.Errorf("get card list failed, error: %v", err) - } - for _, cardID := range cardList { - devNumInCard, err := d.DcGetDeviceNumInCard(cardID) - if err != nil { - return common.RetError, logicIDs, fmt.Errorf("get device num by cardID: %d failed, error: %v", - cardID, err) - } - totalNum += devNumInCard - if totalNum > common.HiAIMaxDeviceNum*common.HiAIMaxCardNum { - return common.RetError, nil, fmt.Errorf("get device num: %d greater than %d", - totalNum, common.HiAIMaxDeviceNum*common.HiAIMaxCardNum) - } - for devID := int32(0); devID < devNumInCard; devID++ { - logicID, err := 
d.DcGetDeviceLogicID(cardID, devID) - if err != nil { - return common.RetError, nil, fmt.Errorf("get device (cardID: %d, deviceID: %d) logic id "+ - "failed, error: %v", cardID, devID, err) - } - logicIDs = append(logicIDs, logicID) - } - } - return totalNum, logicIDs, nil -} - -// DcGetDeviceHealth get device health -func (d *DcManager) DcGetDeviceHealth(cardID, deviceID int32) (int32, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.RetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var health C.uint - if retCode := C.dcmi_get_device_health(C.int(cardID), C.int(deviceID), - &health); int32(retCode) != common.Success { - return common.RetError, fmt.Errorf("get device (cardID: %d, deviceID: %d) health state failed, ret "+ - "code: %d, health code: %d", cardID, deviceID, int32(retCode), int64(health)) - } - if common.IsGreaterThanOrEqualInt32(int64(health)) { - return common.RetError, fmt.Errorf("get wrong health state , device (cardID: %d, deviceID: %d) "+ - "health: %d", cardID, deviceID, int64(health)) - } - return int32(health), nil -} - -// DcGetDeviceUtilizationRate get device utils rate by id -func (d *DcManager) DcGetDeviceUtilizationRate(cardID, deviceID int32, devType common.DeviceType) (int32, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.RetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var rate C.uint - if retCode := C.dcmi_get_device_utilization_rate(C.int(cardID), C.int(deviceID), C.int(devType.Code), - &rate); int32(retCode) != common.Success { - return common.RetError, - buildDcmiErr(cardID, deviceID, fmt.Sprintf("utilization (name: %v, code:%d)", devType.Name, devType.Code), retCode) - } - if !common.IsValidUtilizationRate(uint32(rate)) { - return common.RetError, fmt.Errorf("get wrong device (cardID: %d, deviceID: %d) "+ - "utilization (name: %v, code:%d): %d", cardID, deviceID, devType.Name, devType.Code, 
uint32(rate)) - } - return int32(rate), nil -} - -// DcGetDeviceTemperature get the device temperature -func (d *DcManager) DcGetDeviceTemperature(cardID, deviceID int32) (int32, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.RetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var temp C.int - if retCode := C.dcmi_get_device_temperature(C.int(cardID), C.int(deviceID), - &temp); int32(retCode) != common.Success { - return common.RetError, fmt.Errorf("get device (cardID: %d, deviceID: %d) temperature failed, error "+ - "code is : %d", cardID, deviceID, int32(retCode)) - } - parsedTemp := int32(temp) - if parsedTemp < int32(common.DefaultTemperatureWhenQueryFailed) { - return common.RetError, fmt.Errorf("get wrong device temperature, devcie (cardID: %d, deviceID: %d), "+ - "temperature: %d", cardID, deviceID, parsedTemp) - } - return parsedTemp, nil -} - -func convertUCharToCharArr(cgoArr [maxChipNameLen]C.uchar) []byte { - var charArr []byte - for _, v := range cgoArr { - if v == 0 { - break - } - charArr = append(charArr, byte(v)) - } - return charArr -} - -// DcGetChipInfo get the chip info by cardID and deviceID -func (d *DcManager) DcGetChipInfo(cardID, deviceID int32) (*common.ChipInfo, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return nil, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var chipInfo C.struct_dcmi_chip_info_v2 - chip := &common.ChipInfo{} - if rCode := C.dcmi_get_device_chip_info_v2(C.int(cardID), C.int(deviceID), &chipInfo); int32(rCode) != common.Success { - hwlog.RunLog.Debugf("get device ChipInfo information failed, cardID(%d), deviceID(%d),"+ - " error code: %d", cardID, deviceID, int32(rCode)) - var oldChipInfo C.struct_dcmi_chip_info - if rCode = C.dcmi_get_device_chip_info(C.int(cardID), C.int(deviceID), &oldChipInfo); int32(rCode) != common.Success { - return nil, fmt.Errorf("get device ChipInfo information failed, 
cardID(%d), deviceID(%d),"+ - " error code: %d", cardID, deviceID, int32(rCode)) - } - chip.Name = string(convertUCharToCharArr(oldChipInfo.chip_name)) - chip.Type = string(convertUCharToCharArr(oldChipInfo.chip_type)) - chip.Version = string(convertUCharToCharArr(oldChipInfo.chip_ver)) - chip.AICoreCnt = int(oldChipInfo.aicore_cnt) - } else { - chip.Name = string(convertUCharToCharArr(chipInfo.chip_name)) - chip.Type = string(convertUCharToCharArr(chipInfo.chip_type)) - chip.Version = string(convertUCharToCharArr(chipInfo.chip_ver)) - chip.AICoreCnt = int(chipInfo.aicore_cnt) - chip.NpuName = string(convertUCharToCharArr(chipInfo.npu_name)) - } - if !common.IsValidChipInfo(chip) { - return nil, fmt.Errorf("get device ChipInfo information failed, chip info is empty,"+ - " cardID(%d), deviceID(%d)", cardID, deviceID) - } - - return chip, nil -} - -// DcGetPhysicIDFromLogicID get physicID from logicID -func (d *DcManager) DcGetPhysicIDFromLogicID(logicID int32) (int32, error) { - if !common.IsValidLogicIDOrPhyID(logicID) { - return common.RetError, fmt.Errorf("logicID(%d) is invalid", logicID) - } - var physicID C.uint - if rCode := C.dcmi_get_device_phyid_from_logicid(C.uint(logicID), &physicID); int32(rCode) != common.Success { - return common.RetError, fmt.Errorf("get physic id from logicID(%d) failed, error code: %d", logicID, int32(rCode)) - } - if !common.IsValidLogicIDOrPhyID(int32(physicID)) { - return common.RetError, fmt.Errorf("get wrong physicID(%d) from logicID(%d)", uint32(physicID), logicID) - } - return int32(physicID), nil -} - -// DcGetDeviceIPAddress get device IP address by cardID and deviceID -func (d *DcManager) DcGetDeviceIPAddress(cardID, deviceID, ipType int32) (string, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return "", fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var portType C.enum_dcmi_port_type = 1 - var portID C.int - var ipAddress C.struct_dcmi_ip_addr - var maskAddress 
C.struct_dcmi_ip_addr - if ipType == ipAddrTypeV6 { - ipAddress.ip_type = ipAddrTypeV6 - } - rCode := C.dcmi_get_device_ip(C.int(cardID), C.int(deviceID), portType, portID, &ipAddress, &maskAddress) - if int32(rCode) != common.Success { - return "", fmt.Errorf("get device IP address failed, cardID(%d), deviceID(%d), error code: %d", - cardID, deviceID, int32(rCode)) - } - if ipType == ipAddrTypeV6 { - return d.buildIPv6Addr(ipAddress) - } - return d.buildIPv4Addr(ipAddress) -} - -func (d *DcManager) buildIPv4Addr(ipAddress C.struct_dcmi_ip_addr) (string, error) { - deviceIP := make([]string, 0, net.IPv4len) - for key, val := range ipAddress.u_addr { - if key >= net.IPv4len { - break - } - deviceIP = append(deviceIP, fmt.Sprintf("%v", val)) - } - if netIP := net.ParseIP(strings.Join(deviceIP, ".")); netIP != nil { - return netIP.String(), nil - } - return "", fmt.Errorf("the device IPv4 address is invalid, value: %v", deviceIP) -} - -func (d *DcManager) buildIPv6Addr(ipAddress C.struct_dcmi_ip_addr) (string, error) { - deviceIP := make([]byte, 0, net.IPv6len) - for key, val := range ipAddress.u_addr { - if key >= net.IPv6len { - break - } - deviceIP = append(deviceIP, byte(val)) - } - if netIP := net.IP(deviceIP); netIP != nil { - return netIP.String(), nil - } - return "", fmt.Errorf("the device IPv6 address is invalid, value: %v", deviceIP) -} - -func callDcmiGetDeviceNetworkHealth(cardID, deviceID int32, result chan<- common.DeviceNetworkHealth) { - var healthCode C.enum_dcmi_rdfx_detect_result - rCode := C.dcmi_get_device_network_health(C.int(cardID), C.int(deviceID), &healthCode) - result <- common.DeviceNetworkHealth{HealthCode: uint32(healthCode), RetCode: int32(rCode)} -} - -// DcGetDeviceNetWorkHealth get device network health by cardID and deviceID -func (d *DcManager) DcGetDeviceNetWorkHealth(cardID, deviceID int32) (uint32, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.UnRetError, fmt.Errorf("cardID(%d) or 
deviceID(%d) is invalid", cardID, deviceID) - } - - result := make(chan common.DeviceNetworkHealth, 1) - go callDcmiGetDeviceNetworkHealth(cardID, deviceID, result) - select { - case res := <-result: - if res.RetCode != common.Success { - return common.UnRetError, fmt.Errorf("get device network healthCode failed, cardID(%d),"+ - " deviceID(%d), ret code: %d, health code: %d", cardID, deviceID, res.RetCode, res.HealthCode) - } - - if int32(res.HealthCode) < 0 || int32(res.HealthCode) > int32(math.MaxInt8) { - return common.UnRetError, fmt.Errorf("get wrong device network healthCode, cardID(%d), deviceID(%d),"+ - " error healthCode: %d", cardID, deviceID, int32(res.HealthCode)) - } - - return res.HealthCode, nil - // dcmi_get_device_network_health is occasionally blocked for a long time, because of retrying, - // after the card dropped. This method is used to interrupt the execution of the dcmi interface, - // if invoking time excceeds 1 second. - case <-time.After(common.DcmiApiTimeout * time.Second): - return common.UnRetError, fmt.Errorf("accessing dcmi_get_device_network_health interface timeout, "+ - "cardID(%d), deviceID(%d)", cardID, deviceID) - } -} - -// DcGetLogicIDFromPhysicID get logicID from physicID -func (d *DcManager) DcGetLogicIDFromPhysicID(physicID int32) (int32, error) { - if !common.IsValidLogicIDOrPhyID(physicID) { - return common.RetError, fmt.Errorf("physicID(%d) is invalid", physicID) - } - var logicID C.uint - if rCode := C.dcmi_get_device_logicid_from_phyid(C.uint(physicID), &logicID); int32(rCode) != common.Success { - return common.RetError, fmt.Errorf("get logicID from physicID(%d) failed, error code: %d", - physicID, int32(rCode)) - } - - if !common.IsValidLogicIDOrPhyID(int32(logicID)) { - return common.RetError, fmt.Errorf("get wrong logicID(%d) from physicID(%d)", uint32(logicID), physicID) - } - return int32(logicID), nil -} - -// FuncDcmiMcuGetPowerInfo dcmi_mcu_get_power_info_new function for outer invoke -func 
FuncDcmiMcuGetPowerInfo(cardID int32) (float32, error) { - var power C.int - if retCode := C.dcmi_mcu_get_power_info_new(C.int(cardID), &power); int32(retCode) != common.Success { - return common.RetError, fmt.Errorf("mcu_get_power_info failed, error code is:%d", int32(retCode)) - } - parsedPower := float32(power) - if parsedPower < 0 { - return common.RetError, fmt.Errorf("get wrong mcu_get_power_info, cardID: %d, power: %f", cardID, - parsedPower) - } - return parsedPower * common.ReduceTenth, nil -} - -// DcGetMcuPowerInfo this function is only for Ascend310P, A910/A310 not support -func (d *DcManager) DcGetMcuPowerInfo(cardID int32) (float32, error) { - return 0, nil -} - -// DcGetProductType get product type by dcmi interface -func (d *DcManager) DcGetProductType(cardID, deviceID int32) (string, error) { - cProductType := C.CString(string(make([]byte, productTypeLen))) - defer C.free(unsafe.Pointer(cProductType)) - err := C.dcmi_get_product_type(C.int(cardID), C.int(deviceID), (*C.char)(cProductType), productTypeLen+1) - if err != 0 { - return "", fmt.Errorf("get product type failed, errCode: %d", int32(err)) - } - return C.GoString(cProductType), nil -} - -// DcGetNpuWorkMode get npu work mode, this function is only for Ascend910, A310/310P not support -func (d *DcManager) DcGetNpuWorkMode(cardID int32) (int, error) { - var cWorkMode C.uchar - err := C.dcmi_get_npu_work_mode(C.int(cardID), &cWorkMode) - if err != 0 { - return common.RetError, fmt.Errorf("get npu work mode failed, errCode: %d", int32(err)) - } - return int(cWorkMode), nil -} - -// DcSetDeviceReset reset spec device chip -func (d *DcManager) DcSetDeviceReset(cardID, deviceID int32) error { - var channelType C.enum_dcmi_reset_channel = C.INBAND_CHANNEL - return d.setDeviceReset(cardID, deviceID, channelType) -} - -// DcGetBrotherCardID get brother card id -func (d *DcManager) DcGetBrotherCardID(cardID, deviceID int32) (int32, error) { - var broCardID C.int - errCode := 
C.dcmi_get_netdev_brother_device(C.int(cardID), C.int(deviceID), &broCardID) - if errCode != common.Success { - return common.RetError, fmt.Errorf("unable to get brother card, errCode: %v", errCode) - } - return int32(broCardID), nil -} - -// DcGetOutBandChannelState get out band channel state -func (d *DcManager) DcGetOutBandChannelState(cardID, deviceID int32) error { - var channelState C.int - errCode := C.dcmi_get_device_outband_channel_state(C.int(cardID), C.int(deviceID), &channelState) - if errCode != common.Success { - return fmt.Errorf("get out band channel state error, errCode: %v", errCode) - } - if channelState != common.ChannelStateOk { - return fmt.Errorf("chip reset not support, channel state: %v", channelState) - } - return nil -} - -// DcPreResetSoc pre reset soc, used before reset out band -func (d *DcManager) DcPreResetSoc(cardID, deviceID int32) error { - errCode := C.dcmi_pre_reset_soc(C.int(cardID), C.int(deviceID)) - if errCode != common.Success { - return fmt.Errorf("pre reset failed, cardID: %v, deviceID: %v, errCode: %v", cardID, deviceID, errCode) - } - return nil -} - -// DcSetDeviceResetOutBand reset spec device chip out band -func (d *DcManager) DcSetDeviceResetOutBand(cardID, deviceID int32) error { - var channelType C.enum_dcmi_reset_channel = C.OUTBAND_CHANNEL - return d.setDeviceReset(cardID, deviceID, channelType) -} - -// DcRescanSoc trigger soc rescan, non-blocking -func (d *DcManager) DcRescanSoc(cardID, deviceID int32) error { - errCode := C.dcmi_rescan_soc(C.int(cardID), C.int(deviceID)) - if errCode != common.Success { - return fmt.Errorf("fail to rescan chip cardID %d, deviceID %v, errCode: %v", cardID, deviceID, errCode) - } - return nil -} - -func (d *DcManager) setDeviceReset(cardID, deviceID int32, channelType C.enum_dcmi_reset_channel) error { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - if errCode := 
C.dcmi_set_device_reset(C.int(cardID), C.int(deviceID), channelType); errCode != 0 { - return fmt.Errorf("cardID(%d) and deviceID(%d) hot reset errCode: %v", cardID, deviceID, errCode) - } - return nil -} - -// DcGetDeviceBootStatus get NPU boot status -func (d *DcManager) DcGetDeviceBootStatus(logicID int32) (int, error) { - if !common.IsValidLogicIDOrPhyID(logicID) { - return common.RetError, fmt.Errorf("input invalid logicID: %d", logicID) - } - cardID, deviceID, err := d.DcGetCardIDDeviceID(logicID) - if err != nil { - return common.RetError, fmt.Errorf("failed to get cardID and deviceID by logicID(%d)", logicID) - } - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.RetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var bootStatus C.enum_dcmi_boot_status = C.DCMI_BOOT_STATUS_FINISH - if errCode := C.dcmi_get_device_boot_status(C.int(cardID), C.int(deviceID), &bootStatus); errCode != 0 { - return common.RetError, fmt.Errorf("device boot status errCode: %v", errCode) - } - return int(bootStatus), nil -} - -// DcGetDeviceAllErrorCode get the error count and all error codes of the device -func (d *DcManager) DcGetDeviceAllErrorCode(cardID, deviceID int32) (int32, []int64, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.RetError, nil, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, - deviceID) - } - var errCount C.int - var errCodeArray [common.MaxErrorCodeCount]C.uint - retCode := C.dcmi_get_device_errorcode_v2(C.int(cardID), C.int(deviceID), &errCount, &errCodeArray[0], - common.MaxErrorCodeCount) - - var health C.uint - healthRetCode := C.dcmi_get_device_health(C.int(cardID), C.int(deviceID), &health) - - if int32(retCode) != common.Success && int32(healthRetCode) != common.DeviceNotReadyErrCode { - return common.RetError, nil, fmt.Errorf("failed to obtain the device errorcode based on cardID("+ - "%d) and deviceID(%d), error code: %d, error count: %d", 
cardID, deviceID, int32(retCode), int32(errCount)) - } - - errCodes := make([]int64, 0, len(errCodeArray)) - for _, errCode := range errCodeArray { - if int64(errCode) != 0 { - errCodes = append(errCodes, int64(errCode)) - } - } - - if int32(healthRetCode) == common.DeviceNotReadyErrCode { - hwlog.RunLog.Errorf("device errorcode v2 ret code: %d, device health ret code: %d, device not ready, "+ - "maybe a card drop fault occurred on cardID(%d) and deviceID(%d)", int32(retCode), int32(healthRetCode), - cardID, deviceID) - errCount += 1 - errCodes = append(errCodes, common.CardDropFaultCode) - } - - if int32(errCount) < 0 || int32(errCount) > common.MaxErrorCodeCount { - return common.RetError, nil, fmt.Errorf("get wrong errorcode count, "+ - "cardID(%d) and deviceID(%d), errorcode count: %d", cardID, deviceID, int32(errCount)) - } - - return int32(errCount), errCodes, nil -} - -// DcSubscribeDeviceFaultEvent subscribe device fault, callback with func 'faultEventCallFunc' -func (d *DcManager) DcSubscribeDeviceFaultEvent(cardID, deviceID int32) error { - if faultEventCallFunc == nil { - return errors.New("callFunc is invalid, can't start subscribe") - } - - var filter C.struct_dcmi_event_filter - if rCode := C.dcmi_subscribe_fault_event(C.int(cardID), C.int(deviceID), filter); int32(rCode) != common.Success { - return fmt.Errorf("subscribe fault event failed, cardID(%d) and deviceID(%d), error code: %d", - cardID, deviceID, int32(rCode)) - } - return nil -} - -// DcSetFaultEventCallFunc set fault event call back func -func (d *DcManager) DcSetFaultEventCallFunc(businessFunc func(common.DevFaultInfo)) { - faultEventCallFunc = businessFunc -} - -//export goEventFaultCallBack -func goEventFaultCallBack(event C.struct_dcmi_dms_fault_event) { - if faultEventCallFunc == nil { - hwlog.RunLog.Errorf("no fault event call back func") - return - } - // recovery event recorded fault event occurrence time, the recovery event time cannot be obtained. 
- // Therefore, all event occurrence time is recorded as the current host time when the event is received. - devFaultInfo := common.DevFaultInfo{ - EventID: int64(event.event_id), - LogicID: int32(event.deviceid), - ModuleType: int8(event.node_type), - ModuleID: int8(event.node_id), - SubModuleType: int8(event.sub_node_type), - SubModuleID: int8(event.sub_node_id), - Severity: int8(event.severity), - Assertion: int8(event.assertion), - AlarmRaisedTime: time.Now().UnixMilli(), - } - faultEventCallFunc(devFaultInfo) -} - -// DcGetDieID get chip die ID, like VDieID or NDieID, only Ascend910 has NDieID -func (d *DcManager) DcGetDieID(cardID, deviceID int32, dcmiDieType DieType) (string, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return "", fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - - if dcmiDieType != VDIE && dcmiDieType != NDIE { - return "", fmt.Errorf("dcmi die type can only be one of %d or %d", VDIE, NDIE) - } - - var dieIDObj C.struct_dcmi_die_id - if retCode := C.dcmi_get_device_die_v2(C.int(cardID), C.int(deviceID), - C.enum_dcmi_die_type(dcmiDieType), &dieIDObj); int32(retCode) != common.Success { - return "", buildDcmiErr(cardID, deviceID, "chip die ID", retCode) - } - - const hexBase = 16 - dieIDStr := make([]string, DieIDCount) - - hwlog.RunLog.Debugf("cardID(%d), deviceID(%d) get die type(%d) value %v", cardID, deviceID, dcmiDieType, - dieIDObj.soc_die) - for i := 0; i < DieIDCount; i++ { - s := strconv.FormatUint(uint64(dieIDObj.soc_die[i]), hexBase) - // Each part of the die id consists of 8 characters, and if the length is not enough, - // zero is added at the beginning - dieIDStr[i] = fmt.Sprintf("%08s", s) - } - return strings.ToUpper(strings.Join(dieIDStr, "-")), nil -} - -// DcGetDevProcessInfo chip process info -func (d *DcManager) DcGetDevProcessInfo(cardID, deviceID int32) (*common.DevProcessInfo, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return nil, 
fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - - var procList [common.MaxProcNum]C.struct_dcmi_proc_mem_info - var procNum C.int - - if retCode := C.dcmi_get_device_resource_info(C.int(cardID), C.int(deviceID), &procList[0], - &procNum); int32(retCode) != common.Success { - return nil, buildDcmiErr(cardID, deviceID, "device resource", retCode) - } - - if int32(procNum) < 0 || int32(procNum) > common.MaxProcNum { - return nil, fmt.Errorf("get invalid proccess num (%d), cardID(%d) and deviceID(%d)", int32(procNum), cardID, - deviceID) - } - - return convertToDevResourceInfo(procList, int32(procNum)), nil -} - -func convertToDevResourceInfo(procList [common.MaxProcNum]C.struct_dcmi_proc_mem_info, - procNum int32) *common.DevProcessInfo { - if procNum < 0 || procNum > common.MaxProcNum { - hwlog.RunLog.Errorf("process num %v is not within in the range [0~%v]", procNum, common.MaxProcNum) - return nil - } - - info := new(common.DevProcessInfo) - if procNum == 0 { - return info - } - - info.ProcNum = procNum - for i := int32(0); i < procNum; i++ { - proc := common.DevProcInfo{ - Pid: int32(procList[i].proc_id), - MemUsage: float64(procList[i].proc_mem_usage) / common.UnitMB, // convert byte to MB - } - info.DevProcArray = append(info.DevProcArray, proc) - } - - return info -} - -// DcGetPCIeBusInfo pcie bus info -func (d *DcManager) DcGetPCIeBusInfo(cardID, deviceID int32) (string, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return "", fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - - var pcieInfo C.struct_dcmi_pcie_info_all - - if retCode := C.dcmi_get_device_pcie_info_v2(C.int(cardID), - C.int(deviceID), &pcieInfo); int32(retCode) != common.Success { - return "", buildDcmiErr(cardID, deviceID, "pcie bus", retCode) - } - - info := fmt.Sprintf("%04X:%02X:%02X.%-4X", int32(pcieInfo.domain), uint32(pcieInfo.bdf_busid), - uint32(pcieInfo.bdf_deviceid), uint32(pcieInfo.bdf_funcid)) - 
hwlog.RunLog.Debugf("pcie bus info is: '%s'", info) - - return strings.TrimRight(info, " "), nil -} - -// DcGetDeviceBoardInfo return board info of device -func (d *DcManager) DcGetDeviceBoardInfo(cardID, deviceID int32) (common.BoardInfo, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.BoardInfo{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - - var cBoardInfo C.struct_dcmi_board_info - - if retCode := C.dcmi_get_device_board_info(C.int(cardID), C.int(deviceID), - &cBoardInfo); int32(retCode) != common.Success { - return common.BoardInfo{}, buildDcmiErr(cardID, deviceID, "board info", retCode) - } - - return common.BoardInfo{ - BoardId: uint32(cBoardInfo.board_id), - PcbId: uint32(cBoardInfo.pcb_id), - BomId: uint32(cBoardInfo.bom_id), - SlotId: uint32(cBoardInfo.slot_id), - }, nil -} - -// DcGetPCIEBandwidth get pcie bandwidth value -func (d *DcManager) DcGetPCIEBandwidth(cardID, deviceID int32, profilingTime int) (common.PCIEBwStat, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.PCIEBwStat{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var dcmiPCIEBandwidth C.struct_dcmi_pcie_link_bandwidth_info - var pcieBandwidth common.PCIEBwStat - dcmiPCIEBandwidth.profiling_time = C.int(profilingTime) - retCode := C.dcmi_get_pcie_link_bandwidth_info(C.int(cardID), C.int(deviceID), &dcmiPCIEBandwidth) - if int32(retCode) != common.Success { - return pcieBandwidth, buildDcmiErr(cardID, deviceID, "PCIEBandwidth", retCode) - } - - pcieBandwidth.PcieRxPBw = d.convertPcieBw(dcmiPCIEBandwidth.rx_p_bw) - pcieBandwidth.PcieRxNPBw = d.convertPcieBw(dcmiPCIEBandwidth.rx_np_bw) - pcieBandwidth.PcieRxCPLBw = d.convertPcieBw(dcmiPCIEBandwidth.rx_cpl_bw) - - pcieBandwidth.PcieTxPBw = d.convertPcieBw(dcmiPCIEBandwidth.tx_p_bw) - pcieBandwidth.PcieTxNPBw = d.convertPcieBw(dcmiPCIEBandwidth.tx_np_bw) - pcieBandwidth.PcieTxCPLBw = 
d.convertPcieBw(dcmiPCIEBandwidth.tx_cpl_bw) - - return pcieBandwidth, nil -} - -func (d *DcManager) convertPcieBw(pcieBwArr [agentdrvProfDataNum]C.uint) common.PcieStatValue { - return common.PcieStatValue{ - PcieMinBw: int32(pcieBwArr[0]), - PcieMaxBw: int32(pcieBwArr[1]), - PcieAvgBw: int32(pcieBwArr[agentdrvProfDataNum-1]), - } -} - -// DcGetDcmiVersion return dcmi version -func (d *DcManager) DcGetDcmiVersion() (string, error) { - cDcmiVer := C.CString(string(make([]byte, dcmiVersionLen))) - defer C.free(unsafe.Pointer(cDcmiVer)) - if retCode := C.dcmi_get_dcmi_version((*C.char)(cDcmiVer), dcmiVersionLen+1); int32(retCode) != common.Success { - return "", fmt.Errorf("get dcmi version failed, errCode: %d", int32(retCode)) - } - return C.GoString(cDcmiVer), nil -} - -// DcGetDeviceEccInfo get ECC info -func (d *DcManager) DcGetDeviceEccInfo(cardID, deviceID int32, inputType common.DcmiDeviceType) ( - *common.ECCInfo, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return nil, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - dcmiDeviceType, err := d.getInputType(inputType) - if err != nil { - return nil, err - } - var deviceEccInfo C.struct_dcmi_ecc_info - if retCode := C.dcmi_get_device_ecc_info(C.int(cardID), C.int(deviceID), dcmiDeviceType, - &deviceEccInfo); retCode != 0 { - return nil, buildDcmiErr(cardID, deviceID, "dcmi device ECC", retCode) - } - eccInfo := &common.ECCInfo{ - EnableFlag: int32(deviceEccInfo.enable_flag), - SingleBitErrorCnt: int64(deviceEccInfo.single_bit_error_cnt), - DoubleBitErrorCnt: int64(deviceEccInfo.double_bit_error_cnt), - TotalSingleBitErrorCnt: int64(deviceEccInfo.total_single_bit_error_cnt), - TotalDoubleBitErrorCnt: int64(deviceEccInfo.total_double_bit_error_cnt), - SingleBitIsolatedPagesCnt: int64(deviceEccInfo.single_bit_isolated_pages_cnt), - DoubleBitIsolatedPagesCnt: int64(deviceEccInfo.double_bit_isolated_pages_cnt), - } - return eccInfo, nil -} - -// 
DcGetHccsStatisticInfo get HCCS statistic info -func (d *DcManager) DcGetHccsStatisticInfo(cardID, deviceID int32) (common.HccsStatisticInfo, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.HccsStatisticInfo{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var cMainCmd = C.enum_dcmi_main_cmd(MainCmdHccs) - subCmd := HccsSubCmdGetStatisticInfo - var hccsStatisticInfo C.struct_dcmi_hccs_statistic_info - // Use a secure function to get the address (for cleanCode) - addr, err := getAddrWithOffset(unsafe.Pointer(&hccsStatisticInfo), unsafe.Sizeof(hccsStatisticInfo), 0) - if err != nil { - return common.HccsStatisticInfo{}, fmt.Errorf("get hccsStatisticInfo addr failed, error is: %v", err) - } - size := C.uint(unsafe.Sizeof(hccsStatisticInfo)) - if retCode := C.dcmi_get_device_info(C.int(cardID), C.int(deviceID), cMainCmd, C.uint(subCmd), - addr, &size); int32(retCode) != common.Success { - return common.HccsStatisticInfo{}, buildDcmiErr(cardID, deviceID, "hccs statistic", retCode) - } - return convertHccsStatisticInfoStruct(hccsStatisticInfo), nil -} - -// DcGetHccsStatisticInfoU64 get HCCS statistic info -func (d *DcManager) DcGetHccsStatisticInfoU64(cardID, deviceID int32) (common.HccsStatisticInfo, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.HccsStatisticInfo{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var cMainCmd = C.enum_dcmi_main_cmd(MainCmdHccs) - subCmd := HccsSubCmdGetStatisticInfoU64 - var hccsStatisticInfo C.struct_dcmi_hccs_statistic_info_u64 - // Use a secure function to get the address (for cleanCode) - addr, err := getAddrWithOffset(unsafe.Pointer(&hccsStatisticInfo), unsafe.Sizeof(hccsStatisticInfo), 0) - if err != nil { - return common.HccsStatisticInfo{}, fmt.Errorf("get hccsStatisticInfo addr failed, error is: %v", err) - } - size := C.uint(unsafe.Sizeof(hccsStatisticInfo)) - if retCode := 
C.dcmi_get_device_info(C.int(cardID), C.int(deviceID), cMainCmd, C.uint(subCmd), - addr, &size); int32(retCode) != common.Success { - return common.HccsStatisticInfo{}, buildDcmiErr(cardID, deviceID, "hccs statistic", retCode) - } - return convertHccsStatisticInfoStructU64(hccsStatisticInfo), nil -} - -func convertHccsStatisticInfoStruct(hccsStatisticInfo C.struct_dcmi_hccs_statistic_info) common.HccsStatisticInfo { - cgoHccsStatisticInfo := common.HccsStatisticInfo{} - for i := uint32(0); i < dcmiHccsMaxPcsNum; i++ { - cgoHccsStatisticInfo.TxCnt = append(cgoHccsStatisticInfo.TxCnt, uint64(hccsStatisticInfo.tx_cnt[i])) - cgoHccsStatisticInfo.CrcErrCnt = append(cgoHccsStatisticInfo.CrcErrCnt, uint64(hccsStatisticInfo.crc_err_cnt[i])) - cgoHccsStatisticInfo.RxCnt = append(cgoHccsStatisticInfo.RxCnt, uint64(hccsStatisticInfo.rx_cnt[i])) - } - return cgoHccsStatisticInfo -} - -func convertHccsStatisticInfoStructU64(hccsStatisticInfo C.struct_dcmi_hccs_statistic_info_u64) common.HccsStatisticInfo { - cgoHccsStatisticInfo := common.HccsStatisticInfo{} - for i := uint32(0); i < dcmiHccsMaxPcsNum; i++ { - cgoHccsStatisticInfo.TxCnt = append(cgoHccsStatisticInfo.TxCnt, uint64(hccsStatisticInfo.tx_cnt[i])) - cgoHccsStatisticInfo.CrcErrCnt = append(cgoHccsStatisticInfo.CrcErrCnt, uint64(hccsStatisticInfo.crc_err_cnt[i])) - cgoHccsStatisticInfo.RxCnt = append(cgoHccsStatisticInfo.RxCnt, uint64(hccsStatisticInfo.rx_cnt[i])) - } - return cgoHccsStatisticInfo -} - -// DcGetHccsBandwidthInfo get HCCS bandwidth info -func (d *DcManager) DcGetHccsBandwidthInfo(cardID int32, deviceID int32, - profilingTime int) (common.HccsBandwidthInfo, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.HccsBandwidthInfo{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var hccsBandwidthInfo C.struct_dcmi_hccs_bandwidth_info - hccsBandwidthInfo.profiling_time = C.int(profilingTime) - if retCode := 
C.dcmi_get_hccs_link_bandwidth_info(C.int(cardID), C.int(deviceID), - &hccsBandwidthInfo); int32(retCode) != common.Success { - return common.HccsBandwidthInfo{}, buildDcmiErr(cardID, deviceID, "hccs bandwidth", retCode) - } - return convertHccsBandwidthInfoStruct(hccsBandwidthInfo), nil -} - -func convertHccsBandwidthInfoStruct(hccsBandwidthInfo C.struct_dcmi_hccs_bandwidth_info) common.HccsBandwidthInfo { - cgoHccsBWInfo := common.HccsBandwidthInfo{} - cgoHccsBWInfo.ProfilingTime = uint32(hccsBandwidthInfo.profiling_time) - cgoHccsBWInfo.TotalTxbw = float64(hccsBandwidthInfo.total_txbw) - cgoHccsBWInfo.TotalRxbw = float64(hccsBandwidthInfo.total_rxbw) - for i := uint32(0); i < dcmiHccsMaxPcsNum; i++ { - cgoHccsBWInfo.TxBandwidth = append(cgoHccsBWInfo.TxBandwidth, float64(hccsBandwidthInfo.tx_bandwidth[i])) - cgoHccsBWInfo.RxBandwidth = append(cgoHccsBWInfo.RxBandwidth, float64(hccsBandwidthInfo.rx_bandwidth[i])) - } - return cgoHccsBWInfo -} - -// DcGetSioInfo get SIO info -func (d *DcManager) DcGetSioInfo(cardID, deviceID int32) (common.SioCrcErrStatisticInfo, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.SioCrcErrStatisticInfo{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var cMainCmd = C.enum_dcmi_main_cmd(MainCmdSio) - subCmd := SioSubCmdCrcErrStatistics - var sioInfo C.struct_dcmi_sio_crc_err_statistic_info - // Use a secure function to get the address (for cleanCode) - addr, err := getAddrWithOffset(unsafe.Pointer(&sioInfo), unsafe.Sizeof(sioInfo), 0) - if err != nil { - return common.SioCrcErrStatisticInfo{}, fmt.Errorf("get sioInfo addr failed, error is: %v", err) - } - size := C.uint(unsafe.Sizeof(sioInfo)) - if retCode := C.dcmi_get_device_info(C.int(cardID), C.int(deviceID), cMainCmd, C.uint(subCmd), - addr, &size); int32(retCode) != common.Success { - return common.SioCrcErrStatisticInfo{}, buildDcmiErr(cardID, deviceID, "super pod sio", retCode) - } - return 
convertSioInfoStruct(sioInfo), nil -} - -func convertSioInfoStruct(sPodSioInfo C.struct_dcmi_sio_crc_err_statistic_info) common.SioCrcErrStatisticInfo { - cgoSPodSioInfo := common.SioCrcErrStatisticInfo{ - TxErrCnt: int64(sPodSioInfo.tx_error_count), - RxErrCnt: int64(sPodSioInfo.rx_error_count), - } - for i := uint32(0); i < dcmiMaxReserveNum; i++ { - cgoSPodSioInfo.Reserved = append(cgoSPodSioInfo.Reserved, uint32(sPodSioInfo.reserved[i])) - } - return cgoSPodSioInfo -} - -func (d *DcManager) getInputType(inputType common.DcmiDeviceType) (C.enum_dcmi_device_type, error) { - switch inputType { - case common.DcmiDeviceTypeDDR: - return C.DCMI_DEVICE_TYPE_DDR, nil - case common.DcmiDeviceTypeSRAM: - return C.DCMI_DEVICE_TYPE_SRAM, nil - case common.DcmiDeviceTypeHBM: - return C.DCMI_DEVICE_TYPE_HBM, nil - case common.DcmiDeviceTypeNPU: - return C.DCMI_DEVICE_TYPE_NPU, nil - case common.DcmiDeviceTypeNONE: - return C.DCMI_DEVICE_TYPE_NONE, nil - default: - return C.DCMI_DEVICE_TYPE_NONE, fmt.Errorf("invalid input type for getting device ecc info") - } -} - -// Define a safe function to get address offsets (for cleanCode) -func getAddrWithOffset(addr unsafe.Pointer, length uintptr, offset uintptr) (unsafe.Pointer, error) { - if offset > length { - return nil, fmt.Errorf("offset(%d) is invalid, length(%d)", offset, length) - } - return (unsafe.Pointer)(uintptr(addr) + offset), nil -} - -// DcGetDeviceMainBoardInfo return mainboardId of device -func (d *DcManager) DcGetDeviceMainBoardInfo(cardID, deviceID int32) (uint32, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return 0, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var cMainBoardId C.uint - if retCode := C.dcmi_get_mainboard_id(C.int(cardID), C.int(deviceID), - &cMainBoardId); int32(retCode) != common.Success { - return 0, buildDcmiErr(cardID, deviceID, "mainBoardId", retCode) - } - - return uint32(cMainBoardId), nil -} -func buildDcmiErr(cardID, deviceID int32, 
msg string, errCode C.int) error { - errDesc, ok := dcmiErrMap[int32(errCode)] - if !ok { - errDesc = "unknown error code" - } - return fmt.Errorf("cardID(%d),deviceID(%d):get %s info failed,error code: %v,error desc: %v", - cardID, deviceID, msg, errCode, errDesc) -} - -// DcGetSuperPodStatus get super pod status -func (d *DcManager) DcGetSuperPodStatus(cardID, deviceID int32, sdid uint32) (int, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return 0, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var status C.uint - if retCode := C.dcmi_get_spod_node_status(C.int(cardID), C.int(deviceID), - C.unsigned(sdid), &status); int32(retCode) != common.Success { - return 0, buildDcmiErr(cardID, deviceID, "GetSuperPodStatus", retCode) - } - return int(status), nil -} - -// DcSetSuperPodStatus set super pod status -func (d *DcManager) DcSetSuperPodStatus(cardID, deviceID int32, sdid, status uint32) error { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - if retCode := C.dcmi_set_spod_node_status(C.int(cardID), C.int(deviceID), - C.uint(sdid), C.uint(status)); int32(retCode) != common.Success { - return buildDcmiErr(cardID, deviceID, "DcSetSuperPodStatus", retCode) - } - return nil -} - -// DcGetCardElabelV2 get card elabel information -func (d *DcManager) DcGetCardElabelV2(cardID int32) (common.ElabelInfo, error) { - if !common.IsValidCardID(cardID) { - return common.ElabelInfo{}, fmt.Errorf("cardID(%d) is invalid", cardID) - } - var elabelInfo C.struct_dcmi_elabel_info - if retCode := C.dcmi_get_card_elabel_v2(C.int(cardID), &elabelInfo); int32(retCode) != common.Success { - return common.ElabelInfo{}, fmt.Errorf("cardID(%d): get elabel info failed, error code: %v", cardID, retCode) - } - return common.ElabelInfo{ - ProductName: C.GoString(&elabelInfo.product_name[0]), - Model: C.GoString(&elabelInfo.model[0]), - Manufacturer: 
C.GoString(&elabelInfo.manufacturer[0]), - ManufacturerDate: C.GoString(&elabelInfo.manufacturer_date[0]), - SerialNumber: C.GoString(&elabelInfo.serial_number[0]), - }, nil -} diff --git a/mind-cluster/component/ascend-common/devmanager/dcmi/dcmi_interface_api.h b/mind-cluster/component/ascend-common/devmanager/dcmi/dcmi_interface_api.h deleted file mode 100644 index 7ffe468..0000000 --- a/mind-cluster/component/ascend-common/devmanager/dcmi/dcmi_interface_api.h +++ /dev/null @@ -1,596 +0,0 @@ -/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -#ifndef __DCMI_INTERFACE_API_H__ -#define __DCMI_INTERFACE_API_H__ - -#ifdef __cplusplus -#if __cplusplus -extern "C" { -#endif -#endif /* __cplusplus */ - -#define DCMIDLLEXPORT static - -#define MAX_CHIP_NAME_LEN 32 // Maximum length of chip name -#define TEMPLATE_NAME_LEN 32 -#define DIE_ID_COUNT 5 // Number of die ID characters -#define AGENTDRV_PROF_DATA_NUM 3 -#define MAX_LENGTH 256 // Maximum length for elabel info fields - -/*----------------------------------------------* - * Structure description * - *----------------------------------------------*/ -struct dcmi_chip_info { - unsigned char chip_type[MAX_CHIP_NAME_LEN]; - unsigned char chip_name[MAX_CHIP_NAME_LEN]; - unsigned char chip_ver[MAX_CHIP_NAME_LEN]; - unsigned int aicore_cnt; -}; - -struct dcmi_chip_info_v2 { - unsigned char chip_type[MAX_CHIP_NAME_LEN]; - unsigned char chip_name[MAX_CHIP_NAME_LEN]; - unsigned char chip_ver[MAX_CHIP_NAME_LEN]; - unsigned int aicore_cnt; - unsigned char npu_name[MAX_CHIP_NAME_LEN]; -}; - -struct dcmi_pcie_info_all { - unsigned int venderid; /* 厂商id */ - unsigned int subvenderid; /* 厂商子id */ - unsigned int deviceid; /* 设备id */ - unsigned int subdeviceid; /* 设备子id */ - int domain; - unsigned int bdf_busid; - unsigned int bdf_deviceid; - unsigned int bdf_funcid; - unsigned char reserve[32]; /* the size of dcmi_pcie_info_all is 64 */ -}; - -struct dcmi_die_id { - unsigned int soc_die[DIE_ID_COUNT]; -}; - -struct dcmi_ecc_info { - int enable_flag; - unsigned int single_bit_error_cnt; - unsigned int double_bit_error_cnt; - unsigned int total_single_bit_error_cnt; - unsigned int total_double_bit_error_cnt; - unsigned int single_bit_isolated_pages_cnt; - unsigned int double_bit_isolated_pages_cnt; - unsigned int single_bit_next_isolated_pages_cnt; - unsigned int double_bit_next_isolated_pages_cnt; -}; - -struct dcmi_hbm_info { - unsigned long long memory_size; - unsigned int freq; - unsigned long long memory_usage; - int temp; - unsigned int bandwith_util_rate; -}; 
- -struct dcmi_get_memory_info_stru { - unsigned long long memory_size; /* unit:MB */ - unsigned long long memory_available; /* free + hugepages_free * hugepagesize */ - unsigned int freq; - unsigned long hugepagesize; /* unit:KB */ - unsigned long hugepages_total; - unsigned long hugepages_free; - unsigned int utiliza; /* ddr memory info usages */ - unsigned char reserve[60]; /* the size of dcmi_memory_info is 96 */ -}; - -enum dcmi_ip_addr_type { - DCMI_IPADDR_TYPE_V4 = 0, /** IPv4 */ - DCMI_IPADDR_TYPE_V6 = 1, /** IPv6 */ - DCMI_IPADDR_TYPE_ANY = 2 /** IPv4+IPv6 ("dual-stack") */ -}; - -struct dcmi_ip_addr { - union { - unsigned char ip6[16]; - unsigned char ip4[4]; - } u_addr; - enum dcmi_ip_addr_type ip_type; -}; - -enum dcmi_unit_type { - NPU_TYPE = 0, - MCU_TYPE = 1, - CPU_TYPE = 2, - INVALID_TYPE = 0xFF -}; - -enum dcmi_rdfx_detect_result { - DCMI_RDFX_DETECT_OK = 0, - DCMI_RDFX_DETECT_SOCK_FAIL = 1, - DCMI_RDFX_DETECT_RECV_TIMEOUT = 2, - DCMI_RDFX_DETECT_UNREACH = 3, - DCMI_RDFX_DETECT_TIME_EXCEEDED = 4, - DCMI_RDFX_DETECT_FAULT = 5, - DCMI_RDFX_DETECT_INIT = 6, - DCMI_RDFX_DETECT_THREAD_ERR = 7, - DCMI_RDFX_DETECT_IP_SET = 8, - DCMI_RDFX_DETECT_MAX = 0xFF -}; - -enum dcmi_port_type { - DCMI_VNIC_PORT = 0, - DCMI_ROCE_PORT = 1, - DCMI_INVALID_PORT -}; - -enum dcmi_main_cmd { - DCMI_MAIN_CMD_DVPP = 0, - DCMI_MAIN_CMD_ISP, - DCMI_MAIN_CMD_TS_GROUP_NUM, - DCMI_MAIN_CMD_CAN, - DCMI_MAIN_CMD_UART, - DCMI_MAIN_CMD_UPGRADE = 5, - DCMI_MAIN_CMD_HCCS = 16, - DCMI_MAIN_CMD_TEMP = 50, - DCMI_MAIN_CMD_SVM = 51, - DCMI_MAIN_CMD_VDEV_MNG, - DCMI_MAIN_CMD_SIO = 56, - DCMI_MAIN_CMD_DEVICE_SHARE = 0x8001, - DCMI_MAIN_CMD_MAX -}; - -enum dcmi_freq_type { - DCMI_FREQ_DDR = 1, - DCMI_FREQ_CTRLCPU = 2, - DCMI_FREQ_HBM = 6, - DCMI_FREQ_AICORE_CURRENT_ = 7, - DCMI_FREQ_AICORE_MAX = 9, - DCMI_FREQ_VECTORCORE_CURRENT = 12 -}; - -enum dcmi_reset_channel { - OUTBAND_CHANNEL = 0, // out-of-band reset - INBAND_CHANNEL // in-band reset -}; - -enum dcmi_boot_status { - 
DCMI_BOOT_STATUS_UNINIT = 0, // not init - DCMI_BOOT_STATUS_BIOS, // BIOS starting - DCMI_BOOT_STATUS_OS, // OS starting - DCMI_BOOT_STATUS_FINISH // started -}; - -enum dcmi_device_type { - DCMI_DEVICE_TYPE_DDR, - DCMI_DEVICE_TYPE_SRAM, - DCMI_DEVICE_TYPE_HBM, - DCMI_DEVICE_TYPE_NPU, - DCMI_DEVICE_TYPE_NONE = 0xff -}; - -enum dcmi_event_type { - DCMI_DMS_FAULT_EVENT = 0, -}; - -enum dcmi_die_type { - NDIE, - VDIE -}; - -#define DCMI_VDEV_RES_NAME_LEN 16 -#define DCMI_VDEV_SIZE 20 -#define DCMI_VDEV_FOR_RESERVE 32 -#define DCMI_SOC_SPLIT_MAX 32 -#define DCMI_MAX_EVENT_NAME_LENGTH 256 -#define DCMI_MAX_EVENT_DATA_LENGTH 32 -#define DCMI_EVENT_FILTER_FLAG_EVENT_ID (1UL << 0) -#define DCMI_EVENT_FILTER_FLAG_SERVERITY (1UL << 1) -#define DCMI_EVENT_FILTER_FLAG_NODE_TYPE (1UL << 2) -#define DCMI_MAX_EVENT_RESV_LENGTH 32 -#define HCCS_MAX_PCS_NUM 16 -#define HCCS_RES_PCS_NUM 64 -#define IP_ADDR_LIST_LEN 1024 -#define HCCS_PING_MESH_MAX_NUM 48 -#define ADDR_MAX_LEN 16 - -struct dcmi_base_resource { - unsigned long long token; - unsigned long long token_max; - unsigned long long task_timeout; - unsigned int vfg_id; - unsigned char vip_mode; - unsigned char reserved[DCMI_VDEV_FOR_RESERVE - 1]; /* bytes aligned */ -}; - -/* total types of computing resource */ -struct dcmi_computing_resource { - /* accelator resource */ - float aic; - float aiv; - unsigned short dsa; - unsigned short rtsq; - unsigned short acsq; - unsigned short cdqm; - unsigned short c_core; - unsigned short ffts; - unsigned short sdma; - unsigned short pcie_dma; - - /* memory resource, MB as unit */ - unsigned long long memory_size; - - /* id resource */ - unsigned int event_id; - unsigned int notify_id; - unsigned int stream_id; - unsigned int model_id; - - /* cpu resource */ - unsigned short topic_schedule_aicpu; - unsigned short host_ctrl_cpu; - unsigned short host_aicpu; - unsigned short device_aicpu; - unsigned short topic_ctrl_cpu_slot; - - /* vnpu resource */ - unsigned int vdev_aicore_utilization; 
- unsigned long long vdev_memory_total; - unsigned long long vdev_memory_free; - - unsigned char reserved[DCMI_VDEV_FOR_RESERVE-DCMI_VDEV_SIZE]; -}; - -struct dcmi_media_resource { - /* dvpp resource */ - float jpegd; - float jpege; - float vpc; - float vdec; - float pngd; - float venc; - unsigned char reserved[DCMI_VDEV_FOR_RESERVE]; -}; - -struct dcmi_create_vdev_out { - unsigned int vdev_id; - unsigned int pcie_bus; - unsigned int pcie_device; - unsigned int pcie_func; - unsigned int vfg_id; - unsigned char reserved[DCMI_VDEV_FOR_RESERVE]; -}; - -struct dcmi_create_vdev_res_stru { - unsigned int vdev_id; - unsigned int vfg_id; - char template_name[TEMPLATE_NAME_LEN]; - unsigned char reserved[64]; -}; - -struct dcmi_vdev_query_info { - char name[DCMI_VDEV_RES_NAME_LEN]; - unsigned int status; - unsigned int is_container_used; - unsigned int vfid; - unsigned int vfg_id; - unsigned long long container_id; - struct dcmi_base_resource base; - struct dcmi_computing_resource computing; - struct dcmi_media_resource media; -}; - -/* for single search */ -struct dcmi_vdev_query_stru { - unsigned int vdev_id; - struct dcmi_vdev_query_info query_info; -}; - -struct dcmi_soc_free_resource { - unsigned int vfg_num; - unsigned int vfg_bitmap; - struct dcmi_base_resource base; - struct dcmi_computing_resource computing; - struct dcmi_media_resource media; -}; - -struct dcmi_soc_total_resource { - unsigned int vdev_num; - unsigned int vdev_id[DCMI_SOC_SPLIT_MAX]; - unsigned int vfg_num; - unsigned int vfg_bitmap; - struct dcmi_base_resource base; - struct dcmi_computing_resource computing; - struct dcmi_media_resource media; -}; - -struct dcmi_spod_info { - unsigned int sdid; - unsigned int scale_type; - unsigned int super_pod_id; - unsigned int server_id; - unsigned int reserve[8]; -}; - -struct dcmi_dms_fault_event { - unsigned int event_id; /* Event ID */ - unsigned short deviceid; /* Device ID */ - unsigned char node_type; /* Node type */ - unsigned char node_id; /* Node ID 
*/ - unsigned char sub_node_type; /* Subnode type */ - unsigned char sub_node_id; /* Subnode ID */ - unsigned char severity; /* Event severity. 0: warning; 1: minor; 2: major; 3: critical */ - unsigned char assertion; /* Event type. 0: fault recovery; 1: fault generation; 2: one-off event */ - int event_serial_num; /* Alarm serial number */ - int notify_serial_num; /* Notification serial number*/ - /* Time when the event occurs, presenting as the number of seconds that have elapsed since the Unix epoch. */ - unsigned long long alarm_raised_time; - char event_name[DCMI_MAX_EVENT_NAME_LENGTH]; /* Event description */ - char additional_info[DCMI_MAX_EVENT_DATA_LENGTH]; /* Additional event information */ - unsigned char resv[DCMI_MAX_EVENT_RESV_LENGTH]; /**< Reserves 32 bytes */ -}; - -struct dcmi_event { - enum dcmi_event_type type; /* Event type */ - union { - struct dcmi_dms_fault_event dms_event; /* Event content */ - } event_t; -}; - -struct dcmi_event_filter { - /* It can be used to enable one or all filter criteria. The filter criteria are as follows: - 0: disables the filter criteria. - DCMI_EVENT_FILTER_FLAG_EVENT_ID: receives only specified events. - DCMI_EVENT_FILTER_FLAG_SERVERITY: receives only the events of a specified level and higher levels. - DCMI_EVENT_FILTER_FLAG_NODE_TYPE: receives only events of a specified node type. */ - unsigned long long filter_flag; - /* Receives a specified event. For details, see the Health Management Error Definition. */ - unsigned int event_id; - /* Receives events of a specified level and higher levels. For details, - see the severity definition in the struct dcmi_dms_fault_event structure. */ - unsigned char severity; - /* Receives only events of a specified node type. For details, see the Health Management Error Definition. */ - unsigned char node_type; - unsigned char resv[DCMI_MAX_EVENT_RESV_LENGTH]; /* < Reserves 32 bytes. 
*/ -}; - -struct dcmi_proc_mem_info { - int proc_id; - // unit is byte - unsigned long proc_mem_usage; -}; - -struct dcmi_board_info { - unsigned int board_id; - unsigned int pcb_id; - unsigned int bom_id; - unsigned int slot_id; // slot_id indicates pcie slot ID of the chip -}; - -struct dcmi_pcie_link_bandwidth_info { - int profiling_time; - unsigned int tx_p_bw[AGENTDRV_PROF_DATA_NUM]; - unsigned int tx_np_bw[AGENTDRV_PROF_DATA_NUM]; - unsigned int tx_cpl_bw[AGENTDRV_PROF_DATA_NUM]; - unsigned int tx_np_lantency[AGENTDRV_PROF_DATA_NUM]; - unsigned int rx_p_bw[AGENTDRV_PROF_DATA_NUM]; - unsigned int rx_np_bw[AGENTDRV_PROF_DATA_NUM]; - unsigned int rx_cpl_bw[AGENTDRV_PROF_DATA_NUM]; -}; - -struct dcmi_hccs_statistic_info { - unsigned int tx_cnt[HCCS_MAX_PCS_NUM]; - unsigned int rx_cnt[HCCS_MAX_PCS_NUM]; - unsigned int crc_err_cnt[HCCS_MAX_PCS_NUM]; - unsigned int retry_cnt[HCCS_MAX_PCS_NUM]; - unsigned int reserved_field_cnt[HCCS_RES_PCS_NUM]; -}; - -struct dcmi_hccs_statistic_info_u64 { - unsigned long long tx_cnt[HCCS_MAX_PCS_NUM]; - unsigned long long rx_cnt[HCCS_MAX_PCS_NUM]; - unsigned long long crc_err_cnt[HCCS_MAX_PCS_NUM]; - unsigned long long retry_cnt[HCCS_MAX_PCS_NUM]; - unsigned long long reserved[HCCS_RES_PCS_NUM]; -}; - -struct dcmi_hccs_bandwidth_info { - int profiling_time; - double total_txbw; - double total_rxbw; - double tx_bandwidth[HCCS_MAX_PCS_NUM]; - double rx_bandwidth[HCCS_MAX_PCS_NUM]; -}; - -struct dcmi_sio_crc_err_statistic_info { - unsigned short tx_error_count; - unsigned short rx_error_count; - unsigned char reserved[8]; -}; - -struct dcmi_elabel_info { - char product_name[MAX_LENGTH]; - char model[MAX_LENGTH]; - char manufacturer[MAX_LENGTH]; - char manufacturer_date[MAX_LENGTH]; - char serial_number[MAX_LENGTH]; -}; - -struct dcmi_hccsping_mesh_operate { - char dst_addr_list[IP_ADDR_LIST_LEN]; - int pkt_size; - int pkt_send_num; - int pkt_interval; - int timeout; - int task_interval; - int task_id; -}; - -struct 
dcmi_hccsping_mesh_info { - char dst_addr[HCCS_PING_MESH_MAX_NUM][ADDR_MAX_LEN]; - unsigned int suc_pkt_num[HCCS_PING_MESH_MAX_NUM]; - unsigned int fail_pkt_num[HCCS_PING_MESH_MAX_NUM]; - long max_time[HCCS_PING_MESH_MAX_NUM]; - long min_time[HCCS_PING_MESH_MAX_NUM]; - long avg_time[HCCS_PING_MESH_MAX_NUM]; - long tp95_time[HCCS_PING_MESH_MAX_NUM]; - int reply_stat_num[HCCS_PING_MESH_MAX_NUM]; - unsigned long long ping_total_num[HCCS_PING_MESH_MAX_NUM]; - int dest_num; -}; - -#define DCMI_VERSION_1 -#define DCMI_VERSION_2 - -#if defined DCMI_VERSION_2 - -DCMIDLLEXPORT int dcmi_init(void); - -DCMIDLLEXPORT int dcmi_get_card_list(int *card_num, int *card_list, int list_len); - -DCMIDLLEXPORT int dcmi_get_device_num_in_card(int card_id, int *device_num); - -DCMIDLLEXPORT int dcmi_get_device_id_in_card(int card_id, int *device_id_max, int *mcu_id, int *cpu_id); - -DCMIDLLEXPORT int dcmi_get_device_type(int card_id, int device_id, enum dcmi_unit_type *device_type); - -DCMIDLLEXPORT int dcmi_get_device_pcie_info_v2(int card_id, int device_id, struct dcmi_pcie_info_all *pcie_info); - -DCMIDLLEXPORT int dcmi_get_device_chip_info(int card_id, int device_id, struct dcmi_chip_info *chip_info); - -DCMIDLLEXPORT int dcmi_get_device_chip_info_v2(int card_id, int device_id, struct dcmi_chip_info_v2 *chip_info); - -DCMIDLLEXPORT int dcmi_get_device_power_info(int card_id, int device_id, int *power); - -DCMIDLLEXPORT int dcmi_get_device_health(int card_id, int device_id, unsigned int *health); - -DCMIDLLEXPORT int dcmi_get_device_errorcode_v2( - int card_id, int device_id, int *error_count, unsigned int *error_code_list, unsigned int list_len); - -DCMIDLLEXPORT int dcmi_get_device_temperature(int card_id, int device_id, int *temperature); - -DCMIDLLEXPORT int dcmi_get_device_voltage(int card_id, int device_id, unsigned int *voltage); - -DCMIDLLEXPORT int dcmi_get_device_ecc_info(int card_id, int device_id, enum dcmi_device_type input_type, - struct dcmi_ecc_info *device_ecc_info); 
- -DCMIDLLEXPORT int dcmi_get_device_frequency( - int card_id, int device_id, enum dcmi_freq_type input_type, unsigned int *frequency); - -DCMIDLLEXPORT int dcmi_get_device_hbm_info(int card_id, int device_id, struct dcmi_hbm_info *hbm_info); - -DCMIDLLEXPORT int dcmi_get_device_memory_info_v3(int card_id, int device_id, - struct dcmi_get_memory_info_stru *memory_info); - -DCMIDLLEXPORT int dcmi_get_device_utilization_rate( - int card_id, int device_id, int input_type, unsigned int *utilization_rate); - -DCMIDLLEXPORT int dcmi_get_device_info( - int card_id, int device_id, enum dcmi_main_cmd main_cmd, unsigned int sub_cmd, void *buf, unsigned int *size); - -DCMIDLLEXPORT int dcmi_get_device_ip(int card_id, int device_id, enum dcmi_port_type input_type, int port_id, - struct dcmi_ip_addr *ip, struct dcmi_ip_addr *mask); - -DCMIDLLEXPORT int dcmi_get_device_network_health(int card_id, int device_id, enum dcmi_rdfx_detect_result *result); - -DCMIDLLEXPORT int dcmi_get_device_logic_id(int *device_logic_id, int card_id, int device_id); - -DCMIDLLEXPORT int dcmi_create_vdevice(int card_id, int device_id, struct dcmi_create_vdev_res_stru *vdev, - struct dcmi_create_vdev_out *out); - -DCMIDLLEXPORT int dcmi_set_destroy_vdevice(int card_id, int device_id, unsigned int vdevid); - -DCMIDLLEXPORT int dcmi_get_device_phyid_from_logicid(unsigned int logicid, unsigned int *phyid); - -DCMIDLLEXPORT int dcmi_get_device_logicid_from_phyid(unsigned int phyid, unsigned int *logicid); - -DCMIDLLEXPORT int dcmi_get_card_id_device_id_from_logicid(int *card_id, int *device_id, unsigned int device_logic_id); - -DCMIDLLEXPORT int dcmi_get_card_id_device_id_from_phyid(int *card_id, int *device_id, unsigned int device_phy_id); - -DCMIDLLEXPORT int dcmi_get_product_type(int card_id, int device_id, char *product_type_str, int buf_size); - -DCMIDLLEXPORT int dcmi_set_device_reset(int card_id, int device_id, enum dcmi_reset_channel channel_type); - -DCMIDLLEXPORT int 
dcmi_get_device_outband_channel_state(int card_id, int device_id, int* channel_state); - -DCMIDLLEXPORT int dcmi_pre_reset_soc(int card_id, int device_id); - -DCMIDLLEXPORT int dcmi_rescan_soc(int card_id, int device_id); - -DCMIDLLEXPORT int dcmi_get_netdev_brother_device(int card_id, int device_id, int* brother_card_id); - -DCMIDLLEXPORT int dcmi_get_device_boot_status(int card_id, int device_id, enum dcmi_boot_status *boot_status); - -DCMIDLLEXPORT int dcmi_subscribe_fault_event(int card_id, int device_id, struct dcmi_event_filter filter); - -DCMIDLLEXPORT int dcmi_get_npu_work_mode(int card_id, unsigned char *work_mode); - -DCMIDLLEXPORT int dcmi_get_device_die_v2( - int card_id, int device_id, enum dcmi_die_type input_type, struct dcmi_die_id *die_id); - -DCMIDLLEXPORT int dcmi_get_device_resource_info (int card_id, int device_id, struct dcmi_proc_mem_info *proc_info, - int *proc_num); - -DCMIDLLEXPORT int dcmi_get_device_board_info (int card_id, int device_id, struct dcmi_board_info *board_info); - -DCMIDLLEXPORT int dcmi_get_pcie_link_bandwidth_info(int card_id, int device_id, - struct dcmi_pcie_link_bandwidth_info *pcie_link_bandwidth_info); - -DCMIDLLEXPORT int dcmi_get_dcmi_version (char *dcmi_ver, int buf_size); - -DCMIDLLEXPORT int dcmi_get_mainboard_id (int card_id, int device_id, unsigned int *mainboard_id); - -DCMIDLLEXPORT int dcmi_get_hccs_link_bandwidth_info (int card_id, int device_id, struct dcmi_hccs_bandwidth_info *hccs_bandwidth_info); - -DCMIDLLEXPORT int dcmi_start_hccsping_mesh(int card_id, int device_id, int port_id, struct dcmi_hccsping_mesh_operate *hccsping_mesh); - -DCMIDLLEXPORT int dcmi_stop_hccsping_mesh(int card_id, int device_id, int port_id, unsigned int task_id); - -DCMIDLLEXPORT int dcmi_get_hccsping_mesh_info(int card_id, int device_id, int port_id, unsigned int task_id, struct dcmi_hccsping_mesh_info *hccsping_mesh_reply); - -DCMIDLLEXPORT int dcmi_get_hccsping_mesh_state(int card_id, int device_id, int port_id, unsigned int 
task_id, unsigned int *state); - -DCMIDLLEXPORT int dcmi_get_spod_node_status(int card_id, int device_id, unsigned int sdid, unsigned int *status); - -DCMIDLLEXPORT int dcmi_set_spod_node_status(int card_id, int device_id, unsigned int sdid, unsigned int status); - -#endif - -#if defined DCMI_VERSION_1 -/* The following interfaces are V1 version interfaces. In order to ensure the compatibility is temporarily reserved, - * the later version will be deleted. Please switch to the V2 version interface as soon as possible */ - -struct dcmi_memory_info_stru { - unsigned long long memory_size; - unsigned int freq; - unsigned int utiliza; -}; - -DCMIDLLEXPORT int dcmi_get_memory_info(int card_id, int device_id, struct dcmi_memory_info_stru *device_memory_info); - -DCMIDLLEXPORT int dcmi_get_device_errorcode( - int card_id, int device_id, int *error_count, unsigned int *error_code, int *error_width); - -DCMIDLLEXPORT int dcmi_mcu_get_power_info(int card_id, int *power); - -DCMIDLLEXPORT int dcmi_get_card_elabel_v2(int card_id, struct dcmi_elabel_info *elabel_info); -#endif - -#ifdef __cplusplus -#if __cplusplus -} -#endif -#endif /* __cplusplus */ - -#endif /* __DCMI_INTERFACE_API_H__ */ diff --git a/mind-cluster/component/ascend-common/devmanager/devmanager.go b/mind-cluster/component/ascend-common/devmanager/devmanager.go deleted file mode 100644 index fe21931..0000000 --- a/mind-cluster/component/ascend-common/devmanager/devmanager.go +++ /dev/null @@ -1,1197 +0,0 @@ -/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package devmanager this for device driver manager -package devmanager - -import ( - "errors" - "fmt" - "math" - "strings" - "sync" - "time" - - "ascend-common/api" - "ascend-common/common-utils/hwlog" - "ascend-common/devmanager/common" - "ascend-common/devmanager/dcmi" -) - -// DeviceInterface for common device interface -type DeviceInterface interface { - Init() error - ShutDown() error - GetDcmiVersion() string - GetDeviceCount() (int32, error) - GetCardList() (int32, []int32, error) - GetDeviceNumInCard(cardID int32) (int32, error) - GetDeviceList() (int32, []int32, error) - GetChipBaseInfos() ([]*common.ChipBaseInfo, error) - GetDeviceHealth(logicID int32) (uint32, error) - GetDeviceNetWorkHealth(logicID int32) (uint32, error) - GetDeviceUtilizationRate(logicID int32, deviceType common.DeviceType) (uint32, error) - GetDeviceTemperature(logicID int32) (int32, error) - GetDeviceVoltage(logicID int32) (float32, error) - GetDevicePowerInfo(logicID int32) (float32, error) - GetMcuPowerInfo(cardID int32) (float32, error) - GetDeviceFrequency(logicID int32, deviceType common.DeviceType) (uint32, error) - GetDeviceMemoryInfo(logicID int32) (*common.MemoryInfo, error) - GetDeviceHbmInfo(logicID int32) (*common.HbmInfo, error) - GetDeviceErrorCode(logicID int32) (int32, int64, error) - GetChipInfo(logicID int32) (*common.ChipInfo, error) - GetPhysicIDFromLogicID(logicID int32) (int32, error) - GetLogicIDFromPhysicID(physicID int32) (int32, error) - GetDeviceLogicID(cardID, deviceID int32) (int32, error) - GetCardIDDeviceID(logicID int32) (int32, int32, error) - 
GetDeviceIPAddress(logicID, ipType int32) (string, error) - CreateVirtualDevice(logicID int32, vDevInfo common.CgoCreateVDevRes) (common.CgoCreateVDevOut, error) - GetVirtualDeviceInfo(logicID int32) (common.VirtualDevInfo, error) - DestroyVirtualDevice(logicID int32, vDevID uint32) error - GetDevType() string - GetProductTypeArray() []string - GetProductType(cardID, deviceID int32) (string, error) - GetAllProductType() ([]string, error) - GetNpuWorkMode() string - SetDeviceReset(cardID, deviceID int32) error - GetBrotherCardID(int32, int32) (int32, error) - PreResetSoc(int32, int32) error - GetOutBandChannelState(int32, int32) error - SetDeviceResetOutBand(int32, int32) error - RescanSoc(int32, int32) error - GetDeviceBootStatus(logicID int32) (int, error) - GetDeviceAllErrorCode(logicID int32) (int32, []int64, error) - SubscribeDeviceFaultEvent(logicID int32) error - SetFaultEventCallFunc(func(common.DevFaultInfo)) error - GetDieID(logicID int32, dcmiDieType dcmi.DieType) (string, error) - GetDevProcessInfo(logicID int32) (*common.DevProcessInfo, error) - GetPCIeBusInfo(logicID int32) (string, error) - GetBoardInfo(logicID int32) (common.BoardInfo, error) - GetCardElabelV2(cardID int32) (common.ElabelInfo, error) - GetPCIEBandwidth(logicID int32, profilingTime int) (common.PCIEBwStat, error) - SetIsTrainingCard() error - IsTrainingCard() bool - GetValidChipInfo() (common.ChipInfo, error) - GetDeviceEccInfo(logicID int32, dcmiDeviceType common.DcmiDeviceType) (*common.ECCInfo, error) - GetSuperPodInfo(int32) (common.CgoSuperPodInfo, error) - GetSioInfo(logicID int32) (*common.SioCrcErrStatisticInfo, error) - GetHccsStatisticInfo(logicID int32) (*common.HccsStatisticInfo, error) - GetHccsStatisticInfoInU64(logicID int32) (*common.HccsStatisticInfo, error) - GetMainBoardId() uint32 - GetHccsBandwidthInfo(logicID int32) (*common.HccsBandwidthInfo, error) - - DcStartHccsPingMesh(int32, int32, int, common.HccspingMeshOperate) error - DcStopHccsPingMesh(int32, int32, 
int, uint) error - DcGetHccsPingMeshInfo(int32, int32, int, uint) (*common.HccspingMeshInfo, error) - DcGetHccsPingMeshState(int32, int32, int, uint) (int, error) - DcGetSuperPodStatus(int32, int32, uint32) (int, error) - DcSetSuperPodStatus(int32, int32, uint32, uint32) error -} - -const ( - // init dcmi interface max retry times - maxRetries = 6 - // init dcmi interface retry delay - defaultRetryDelay = 10 -) - -var ( - devManager *DeviceManager = nil - devManagerOnce sync.Once - idCache sync.Map -) - -// npuIdMapping the mapping between the three IDs -type npuIdMapping struct { - logicId int32 - cardId int32 - deviceId int32 -} - -// GetDeviceManager singleton to init global device manager and init dcmi interface -func GetDeviceManager(resetTimeout int) (*DeviceManager, error) { - devManagerOnce.Do(func() { - // a common dcmi Manager is initiated for init dcmi interface, you can specify an specific manager in later - dcMgr := dcmi.DcManager{} - var retryDelay time.Duration = defaultRetryDelay - hwlog.RunLog.Infof("get card list from dcmi reset timeout is %d", resetTimeout) - for currentTime, retryCount := 0, 0; currentTime <= resetTimeout; currentTime += int(retryDelay) { - if err := dcMgr.DcInit(); err != nil { - hwlog.RunLog.Errorf("deviceManager init failed, prepare dcmi failed, err: %v", err) - return - } - cardNum, cardList, err := dcMgr.DcGetCardList() - if err == nil && int(cardNum) == len(cardList) { - hwlog.RunLog.Infof("deviceManager get cardList is %v, cardList length equal to cardNum: %v", - cardList, cardNum) - break - } - if diffTime := float64(resetTimeout - currentTime); diffTime > 0 { - retryDelay = time.Duration(math.Min(float64(defaultRetryDelay), diffTime)) - } - retryCount++ - hwlog.RunLog.Warnf("deviceManager get card list failed (attempt %d), cardNum=%d, cardList=%v, "+ - "err: %v", retryCount, cardNum, cardList, err) - if currentTime+int(retryDelay) <= resetTimeout { - if err = dcMgr.DcShutDown(); err != nil { - 
hwlog.RunLog.Errorf("deviceManager shut down failed, err: %v", err) - return - } - time.Sleep(retryDelay * time.Second) - continue - } - if int(cardNum) != len(cardList) { - hwlog.RunLog.Warnf("deviceManager get cardList is %v, but cardNum is %v, "+ - "please check whether the real number of npu matches the cardList", cardList, cardNum) - } - } - devManager = &DeviceManager{} - devManager.DcMgr = &dcMgr - dcmiVer, err := dcMgr.DcGetDcmiVersion() - if err != nil { - hwlog.RunLog.Warnf("deviceManager get dcmi version failed, err: %v", err) - } - hwlog.RunLog.Infof("the dcmi version is %s", dcmiVer) - devManager.dcmiVersion = dcmiVer - }) - if devManager == nil { - return nil, errors.New("device Manager is nil, may encounter an exception during initialization. " + - "You can check the system log to confirm") - } - return devManager, nil -} - -// DeviceManager common device manager for Ascend910/310P/310 -type DeviceManager struct { - // DcMgr for common dev manager - DcMgr dcmi.DcDriverInterface - // DevType the value is the same as the device type corresponding to the DcMgr variable. 
- // Options: api.Ascend310,api.Ascend310P,api.Ascend910 - DevType string - // ProductTypes product type in server, multi type will be in 310P mix scene - ProductTypes []string - // isTrainingCard whether the device is used for training - isTrainingCard bool - dcmiVersion string - // mainBoardId used to distinguish between A900A3SuperPod and A9000A3SuperPod - mainBoardId uint32 -} - -// GetProductTypeArray return product types -func (d *DeviceManager) GetProductTypeArray() []string { - return d.ProductTypes -} - -// GetDevType return dev type -func (d *DeviceManager) GetDevType() string { - return d.DevType -} - -// AutoInit auto detect npu chip type and return the corresponding processing object -func AutoInit(dType string, resetTimeout int) (*DeviceManager, error) { - chipInfo, boardInfo, err := getDeviceInfoForInit(resetTimeout) - if err != nil { - return nil, fmt.Errorf("auto init failed, err: %s", err) - } - var devMgr *DeviceManager - if devMgr, err = GetDeviceManager(resetTimeout); err != nil || devMgr == nil { - return nil, err - } - mainBoardId, err := getValidMainBoardInfo(devMgr.DcMgr) - if err != nil { - // Non-blocking when the main board ID is not found - hwlog.RunLog.Warn(err) - } - devMgr.mainBoardId = mainBoardId - var devType = common.GetDevType(chipInfo.Name, boardInfo.BoardId) - - switch devType { - case api.Ascend910A, api.Ascend910B, api.Ascend910A3: - devMgr.DcMgr = &A910Manager{} - case api.Ascend310P: - devMgr.DcMgr = &A310PManager{} - case api.Ascend310, api.Ascend310B: - devMgr.DcMgr = &A310Manager{} - default: - return nil, fmt.Errorf("unsupport device type (%s)", devType) - } - hwlog.RunLog.Infof("chipName: %v, devType: %v", chipInfo.Name, devType) - if dType != "" && devType != dType { - return nil, fmt.Errorf("the value of dType(%s) is inconsistent with the actual chip type(%s)", - dType, devType) - } - devMgr.DevType = devType - if err := devMgr.SetIsTrainingCard(); err != nil { - hwlog.RunLog.Errorf("auto recognize training card 
failed, err: %s", err) - } - - pTypes, err := devMgr.GetAllProductType() - if err != nil { - hwlog.RunLog.Debugf("auto init product types failed, err: %s", err) - } - devMgr.ProductTypes = pTypes - return devMgr, nil -} - -func getDeviceInfoForInit(resetTimeout int) (common.ChipInfo, common.BoardInfo, error) { - var mgr *DeviceManager - var err error - if mgr, err = GetDeviceManager(resetTimeout); err != nil || mgr == nil { - return common.ChipInfo{}, common.BoardInfo{}, fmt.Errorf("get chip info failed, err: %v", err) - } - dcMgr := mgr.DcMgr - chipInfo, err := getValidChipInfo(dcMgr) - if err != nil { - hwlog.RunLog.Error(err) - return common.ChipInfo{}, common.BoardInfo{}, err - } - boardInfo, err := getValidBoardInfo(dcMgr) - if err != nil { - hwlog.RunLog.Error(err) - return chipInfo, common.BoardInfo{}, err - } - - return chipInfo, boardInfo, nil -} - -func getValidChipInfo(dcMgr dcmi.DcDriverInterface) (common.ChipInfo, error) { - // get card list - cardNum, cardList, err := dcMgr.DcGetCardList() - if err != nil { - hwlog.RunLog.Error(err) - return common.ChipInfo{}, fmt.Errorf(common.ErrMsgInitCardListFailed) - } - if cardNum == 0 { - return common.ChipInfo{}, fmt.Errorf("get chip info failed, no card found") - } - // get device in card, then get chip info by cardID and deviceID - for _, cardID := range cardList { - devNum, err := dcMgr.DcGetDeviceNumInCard(cardID) - if err != nil || devNum == 0 { - hwlog.RunLog.Debugf("get device num by cardID(%d) failed, error: %v", cardID, err) - continue - } - for devID := int32(0); devID < devNum; devID++ { - chipInfo, err := dcMgr.DcGetChipInfo(cardID, devID) - if err != nil { - hwlog.RunLog.Debugf("get chip info failed by cardID(%d), deviceID(%d), error: %v", cardID, devID, - err) - continue - } - if !common.IsValidChipInfo(chipInfo) { - hwlog.RunLog.Debugf("invalid chip info by cardID(%d), deviceID(%d), error: %v", cardID, devID, - err) - continue - } - return *chipInfo, nil - } - } - return common.ChipInfo{}, 
errors.New("cannot get valid chip info") -} - -func getValidBoardInfo(dcMgr dcmi.DcDriverInterface) (common.BoardInfo, error) { - // get card list - cardNum, cardList, err := dcMgr.DcGetCardList() - if err != nil { - hwlog.RunLog.Error(err) - return common.BoardInfo{}, fmt.Errorf(common.ErrMsgInitCardListFailed) - } - if cardNum == 0 { - return common.BoardInfo{}, fmt.Errorf(common.ErrMsgGetBoardInfoFailed) - } - // get device in card, then get board info by cardID and deviceID - for _, cardID := range cardList { - devNum, err := dcMgr.DcGetDeviceNumInCard(cardID) - if err != nil || devNum == 0 { - hwlog.RunLog.Debugf("get device num by cardID %d failed, error is: %v", cardID, err) - continue - } - for devID := int32(0); devID < devNum; devID++ { - boardInfo, err := dcMgr.DcGetDeviceBoardInfo(cardID, devID) - if err != nil { - hwlog.RunLog.Debugf("get board info failed by cardID(%d), deviceID(%d), error: %v", cardID, devID, - err) - continue - } - if !common.IsValidBoardInfo(&boardInfo) { - hwlog.RunLog.Debugf("invalid board info by cardID(%d), deviceID(%d), error: %v", cardID, devID, - err) - continue - } - return boardInfo, nil - } - } - return common.BoardInfo{}, errors.New("cannot get valid board info") -} -func getValidMainBoardInfo(dcMgr dcmi.DcDriverInterface) (uint32, error) { - // get card list - cardNum, cardList, err := dcMgr.DcGetCardList() - if err != nil { - hwlog.RunLog.Error(err) - return 0, fmt.Errorf(common.ErrMsgInitCardListFailed) - } - if cardNum == 0 { - return 0, fmt.Errorf(common.ErrMsgGetBoardInfoFailed) - } - // get device in card, then get board info by cardID and deviceID - for _, cardID := range cardList { - devNum, err := dcMgr.DcGetDeviceNumInCard(cardID) - if err != nil || devNum == 0 { - hwlog.RunLog.Debugf("get device num by cardID %d failed, error is: %v", cardID, err) - continue - } - for devID := int32(0); devID < devNum; devID++ { - mainBoardId, err := dcMgr.DcGetDeviceMainBoardInfo(cardID, devID) - if err != nil { - 
hwlog.RunLog.Debug(err) - continue - } - if !common.IsValidMainBoardInfo(mainBoardId) { - hwlog.RunLog.Warnf("invalid mainBoardId info by cardID(%d), deviceID(%d), error: %v", cardID, devID, err) - continue - } - return mainBoardId, nil - } - } - return 0, errors.New("cannot get main board id") -} - -// Init load symbol and initialize dcmi -func (d *DeviceManager) Init() error { - return d.DcMgr.DcInit() -} - -// ShutDown clean the dynamically loaded resource -func (d *DeviceManager) ShutDown() error { - return d.DcMgr.DcShutDown() -} - -// GetDeviceCount get npu device count -func (d *DeviceManager) GetDeviceCount() (int32, error) { - return d.DcMgr.DcGetDeviceCount() -} - -// GetCardList get all card list -func (d *DeviceManager) GetCardList() (int32, []int32, error) { - return d.DcMgr.DcGetCardList() -} - -// GetDeviceNumInCard get all device list in one card -func (d *DeviceManager) GetDeviceNumInCard(cardID int32) (int32, error) { - return d.DcMgr.DcGetDeviceNumInCard(cardID) -} - -// GetDeviceList get all device logicID list -func (d *DeviceManager) GetDeviceList() (int32, []int32, error) { - return d.DcMgr.DcGetLogicIDList() -} - -// GetDeviceHealth query npu device health status -func (d *DeviceManager) GetDeviceHealth(logicID int32) (uint32, error) { - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return common.UnRetError, fmt.Errorf("failed to get health code by logicID(%d)", logicID) - } - healthCode, err := d.DcMgr.DcGetDeviceHealth(cardID, deviceID) - if err != nil { - hwlog.RunLog.Error(err) - return common.UnRetError, err - } - - return uint32(healthCode), nil -} - -// GetDeviceNetWorkHealth query npu device network health status -func (d *DeviceManager) GetDeviceNetWorkHealth(logicID int32) (uint32, error) { - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return common.UnRetError, fmt.Errorf("failed to get network health code by 
logicID(%d)", logicID) - } - healthCode, err := d.DcMgr.DcGetDeviceNetWorkHealth(cardID, deviceID) - if err != nil { - hwlog.RunLog.Error(err) - return common.UnRetError, err - } - - return healthCode, nil -} - -// GetDeviceUtilizationRate get npu device utilization -func (d *DeviceManager) GetDeviceUtilizationRate(logicID int32, deviceType common.DeviceType) (uint32, error) { - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return common.UnRetError, fmt.Errorf("failed to get utilization by logicID(%d)", logicID) - } - rate, err := d.DcMgr.DcGetDeviceUtilizationRate(cardID, deviceID, deviceType) - if err != nil { - return common.UnRetError, err - } - - return uint32(rate), nil -} - -// GetDeviceTemperature get npu device temperature -func (d *DeviceManager) GetDeviceTemperature(logicID int32) (int32, error) { - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return common.RetError, fmt.Errorf("failed to get temperature by logicID(%d)", logicID) - } - temp, err := d.DcMgr.DcGetDeviceTemperature(cardID, deviceID) - if err != nil { - hwlog.RunLog.Error(err) - return common.RetError, fmt.Errorf("failed to get temperature by logicID(%d)", logicID) - } - - return temp, nil -} - -// GetDeviceVoltage get npu device voltage -func (d *DeviceManager) GetDeviceVoltage(logicID int32) (float32, error) { - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return common.UnRetError, fmt.Errorf("failed to get voltage by logicID(%d)", logicID) - } - voltage, err := d.DcMgr.DcGetDeviceVoltage(cardID, deviceID) - if err != nil { - hwlog.RunLog.Error(err) - return common.UnRetError, fmt.Errorf("failed to get voltage by logicID(%d)", logicID) - } - - return voltage, nil -} - -// GetDevicePowerInfo get npu device power info -func (d *DeviceManager) GetDevicePowerInfo(logicID int32) (float32, error) { - cardID, deviceID, err := 
d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return common.UnRetError, fmt.Errorf("failed to get power by logicID(%d)", logicID) - } - power, err := d.DcMgr.DcGetDevicePowerInfo(cardID, deviceID) - if err != nil { - hwlog.RunLog.Error(err) - return common.UnRetError, fmt.Errorf("failed to get power by logicID(%d)", logicID) - } - - return power, nil -} - -// GetDeviceFrequency get npu device work frequency -func (d *DeviceManager) GetDeviceFrequency(logicID int32, deviceType common.DeviceType) (uint32, error) { - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return common.UnRetError, fmt.Errorf("failed to get frequency by logicID(%d)", logicID) - } - frequency, err := d.DcMgr.DcGetDeviceFrequency(cardID, deviceID, deviceType) - if err != nil { - hwlog.RunLog.Error(err) - return common.UnRetError, fmt.Errorf("failed to get frequency by logicID(%d)", logicID) - } - - return frequency, nil -} - -// GetDeviceMemoryInfo get npu memory information -func (d *DeviceManager) GetDeviceMemoryInfo(logicID int32) (*common.MemoryInfo, error) { - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return nil, fmt.Errorf("failed to get memory info by logicID(%d)", logicID) - } - - // 910B and 910A3 don't have DDR module. Therefore, DDR information cannot be queried. - if d.DevType == api.Ascend910B || d.DevType == api.Ascend910A3 { - hwlog.RunLog.Debugf("%v doesn't have DDR module. 
Therefore, DDR information cannot be queried", d.DevType) - return nil, nil - } - - memInfo, err := d.DcMgr.DcGetMemoryInfo(cardID, deviceID) - if err != nil { - hwlog.RunLog.Error(err) - return nil, fmt.Errorf("failed to get memory info by logicID(%d)", logicID) - } - - return memInfo, nil -} - -// GetDeviceHbmInfo get npu HBM module memory and frequency information -func (d *DeviceManager) GetDeviceHbmInfo(logicID int32) (*common.HbmInfo, error) { - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return nil, fmt.Errorf("failed to get hbm info by logicID(%d)", logicID) - } - hbmInfo, err := d.DcMgr.DcGetHbmInfo(cardID, deviceID) - if err != nil { - return nil, err - } - - return hbmInfo, nil -} - -// GetDeviceErrorCode get npu device error code -func (d *DeviceManager) GetDeviceErrorCode(logicID int32) (int32, int64, error) { - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return common.RetError, common.RetError, fmt.Errorf("failed to get device error code by logicID(%d)", - logicID) - } - errCount, errCode, err := d.DcMgr.DcGetDeviceErrorCode(cardID, deviceID) - if err != nil { - hwlog.RunLog.Error(err) - return common.RetError, common.RetError, fmt.Errorf("failed to get device error code by logicID(%d)", - logicID) - } - - return errCount, errCode, nil -} - -// GetChipInfo get npu device error code -func (d *DeviceManager) GetChipInfo(logicID int32) (*common.ChipInfo, error) { - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return nil, fmt.Errorf("failed to get cardID and deviceID by logicID(%d), error: %v", logicID, err) - } - chipInfo, err := d.DcMgr.DcGetChipInfo(cardID, deviceID) - if err != nil { - hwlog.RunLog.Error(err) - return nil, fmt.Errorf("failed to get chip info code by logicID(%d)", logicID) - } - - return chipInfo, nil -} - -// GetPhysicIDFromLogicID get device physic id from 
logic id -func (d *DeviceManager) GetPhysicIDFromLogicID(logicID int32) (int32, error) { - physicID, err := d.DcMgr.DcGetPhysicIDFromLogicID(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return common.RetError, fmt.Errorf("failed to get physicID by logicID(%d)", logicID) - } - - return physicID, nil -} - -// GetLogicIDFromPhysicID get device logic id from physic id -func (d *DeviceManager) GetLogicIDFromPhysicID(physicID int32) (int32, error) { - logicID, err := d.DcMgr.DcGetLogicIDFromPhysicID(physicID) - if err != nil { - hwlog.RunLog.Error(err) - return common.RetError, fmt.Errorf("failed to get logicID by physicID(%d)", physicID) - } - - return logicID, nil -} - -// GetDeviceLogicID get device logic id from card id and device id -func (d *DeviceManager) GetDeviceLogicID(cardID, deviceID int32) (int32, error) { - return d.DcMgr.DcGetDeviceLogicID(cardID, deviceID) -} - -// GetDeviceIPAddress get device ip address -func (d *DeviceManager) GetDeviceIPAddress(logicID, ipType int32) (string, error) { - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - return "", fmt.Errorf("failed to get cardID and deviceID by logicID(%d), %w", logicID, err) - } - return d.DcMgr.DcGetDeviceIPAddress(cardID, deviceID, ipType) -} - -// CreateVirtualDevice create virtual device -func (d *DeviceManager) CreateVirtualDevice( - logicID int32, vDevInfo common.CgoCreateVDevRes) (common.CgoCreateVDevOut, error) { - if !common.IsValidTemplateName(d.DevType, vDevInfo.TemplateName) { - return common.CgoCreateVDevOut{}, fmt.Errorf("input invalid template name: %s", vDevInfo.TemplateName) - } - return d.DcMgr.DcCreateVDevice(logicID, vDevInfo) -} - -// GetVirtualDeviceInfo get virtual device info -func (d *DeviceManager) GetVirtualDeviceInfo(logicID int32) (common.VirtualDevInfo, error) { - cgoVDevInfo, err := d.DcMgr.DcGetVDeviceInfo(logicID) - if err != nil { - hwlog.RunLog.Debug(err) - return common.VirtualDevInfo{}, fmt.Errorf("get virtual device info 
failed, error is: %v "+ - "and vdev num is: %d", err, int32(cgoVDevInfo.TotalResource.VDevNum)) - } - for _, vDevInfo := range cgoVDevInfo.VDevInfo { - if !common.IsValidTemplateName(d.DevType, vDevInfo.QueryInfo.Name) { - return common.VirtualDevInfo{}, fmt.Errorf("vdevice id %d, it's template name is invalid: %s", - vDevInfo.VDevID, vDevInfo.QueryInfo.Name) - } - } - return cgoVDevInfo, nil -} - -// DestroyVirtualDevice destroy virtual device -func (d *DeviceManager) DestroyVirtualDevice(logicID int32, vDevID uint32) error { - return d.DcMgr.DcDestroyVDevice(logicID, vDevID) -} - -// GetMcuPowerInfo get mcu power info for cardID -func (d *DeviceManager) GetMcuPowerInfo(cardID int32) (float32, error) { - return d.DcMgr.DcGetMcuPowerInfo(cardID) -} - -// GetCardIDDeviceID get cardID and deviceID by logicID -func (d *DeviceManager) GetCardIDDeviceID(logicID int32) (int32, int32, error) { - return d.getCardIdAndDeviceId(logicID) -} - -// GetProductType get product type by cardID and deviceID -func (d *DeviceManager) GetProductType(cardID, deviceID int32) (string, error) { - return d.DcMgr.DcGetProductType(cardID, deviceID) -} - -// GetAllProductType get all product type -func (d *DeviceManager) GetAllProductType() ([]string, error) { - productTypes := make([]string, 0) - cardNum, cardList, err := d.GetCardList() - if err != nil || cardNum == 0 { - hwlog.RunLog.Errorf("failed to get card list, err: %v", err) - return productTypes, err - } - for _, cardID := range cardList { - devNum, err := d.GetDeviceNumInCard(cardID) - if err != nil { - hwlog.RunLog.Debugf("get device num by cardID(%d) failed, error: %v", cardID, err) - continue - } - if devNum == 0 { - hwlog.RunLog.Debugf("not found device on card %d", cardID) - continue - } - for devID := int32(0); devID < devNum; devID++ { - productType, err := d.GetProductType(cardID, devID) - if err != nil { - hwlog.RunLog.Debugf("get product type by card %d deviceID %d failed, err: %v", cardID, devID, err) - continue - } - 
productTypes = append(productTypes, productType) - break - } - } - if len(productTypes) != 0 { - productTypes = common.RemoveDuplicate(&productTypes) - } - return productTypes, nil -} - -// GetNpuWorkMode get work mode of NPU -func (d *DeviceManager) GetNpuWorkMode() string { - if d.DevType == api.Ascend910B || d.DevType == api.Ascend910A3 { - hwlog.RunLog.Warnf("only AMP mode is available on %s", d.DevType) - return common.AMPMode - } - - _, cardList, err := d.DcMgr.DcGetCardList() - if err != nil { - hwlog.RunLog.Error(err) - return "" - } - if len(cardList) > 0 { - mode, err := d.DcMgr.DcGetNpuWorkMode(cardList[0]) - if err != nil { - hwlog.RunLog.Error(err) - return "" - } - if mode == 0 { - return common.AMPMode - } - return common.SMPMode - } - return "" -} - -// SetDeviceReset reset spec device -func (d *DeviceManager) SetDeviceReset(cardID, deviceID int32) error { - return d.DcMgr.DcSetDeviceReset(cardID, deviceID) -} - -// GetBrotherCardID get brother card id -func (d *DeviceManager) GetBrotherCardID(cardID, deviceID int32) (int32, error) { - return d.DcMgr.DcGetBrotherCardID(cardID, deviceID) -} - -// GetOutBandChannelState get out band channel state -func (d *DeviceManager) GetOutBandChannelState(cardID, deviceID int32) error { - return d.DcMgr.DcGetOutBandChannelState(cardID, deviceID) -} - -// PreResetSoc pre reset soc, used before reset out band -func (d *DeviceManager) PreResetSoc(cardID, deviceID int32) error { - return d.DcMgr.DcPreResetSoc(cardID, deviceID) -} - -// SetDeviceResetOutBand reset spec device out band -func (d *DeviceManager) SetDeviceResetOutBand(cardID, deviceID int32) error { - return d.DcMgr.DcSetDeviceResetOutBand(cardID, deviceID) -} - -// RescanSoc trigger soc rescan, non-blocking -func (d *DeviceManager) RescanSoc(cardID, deviceID int32) error { - return d.DcMgr.DcRescanSoc(cardID, deviceID) -} - -// GetDeviceBootStatus get device boot status -func (d *DeviceManager) GetDeviceBootStatus(logicID int32) (int, error) { - return 
d.DcMgr.DcGetDeviceBootStatus(logicID) -} - -// GetDeviceAllErrorCode get npu device all error code -func (d *DeviceManager) GetDeviceAllErrorCode(logicID int32) (int32, []int64, error) { - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return common.RetError, nil, fmt.Errorf("failed to get cardID in get device error code by logicID(%d)", - logicID) - } - errCount, errCodes, err := d.DcMgr.DcGetDeviceAllErrorCode(cardID, deviceID) - if err != nil { - hwlog.RunLog.Error(err) - return common.RetError, nil, fmt.Errorf("failed to get device error code by logicID(%d)", logicID) - } - return errCount, errCodes, nil -} - -// SubscribeDeviceFaultEvent get npu device error code by subscribe -func (d *DeviceManager) SubscribeDeviceFaultEvent(logicID int32) error { - var cardID, deviceID int32 - if logicID == common.SubscribeAllDevice { - cardID = common.SubscribeAllDevice - deviceID = common.SubscribeAllDevice - } else { - var err error - cardID, deviceID, err = d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return fmt.Errorf("failed to get cardID in subscribe device error code by logicID(%d)", logicID) - } - } - if err := d.DcMgr.DcSubscribeDeviceFaultEvent(cardID, deviceID); err != nil { - hwlog.RunLog.Error(err) - return fmt.Errorf("failed to subscribe device error code by logicID(%d)", logicID) - } - return nil -} - -// SetFaultEventCallFunc set fault event call func -func (d *DeviceManager) SetFaultEventCallFunc(businessFunc func(common.DevFaultInfo)) error { - if businessFunc == nil { - return errors.New("business func can't be nil") - } - d.DcMgr.DcSetFaultEventCallFunc(businessFunc) - return nil -} - -// GetDieID return die id by dcmi die type, vdie id or ndie id -func (d *DeviceManager) GetDieID(logicID int32, dcmiDieType dcmi.DieType) (string, error) { - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return "", 
fmt.Errorf("failed to get cardID in get device error code by logicID(%d)", logicID) - } - - return d.DcMgr.DcGetDieID(cardID, deviceID, dcmiDieType) -} - -// GetDevProcessInfo get process and process memory in device side -func (d *DeviceManager) GetDevProcessInfo(logicID int32) (*common.DevProcessInfo, error) { - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return nil, fmt.Errorf("failed to get cardID in get device error code by logicID(%d)", logicID) - } - - return d.DcMgr.DcGetDevProcessInfo(cardID, deviceID) -} - -// GetPCIeBusInfo pcie bus info -func (d *DeviceManager) GetPCIeBusInfo(logicID int32) (string, error) { - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return "", fmt.Errorf("failed to get cardID in get device error code by logicID(%d)", logicID) - } - - return d.DcMgr.DcGetPCIeBusInfo(cardID, deviceID) -} - -// GetBoardInfo return board info of device -func (d *DeviceManager) GetBoardInfo(logicID int32) (common.BoardInfo, error) { - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return common.BoardInfo{}, fmt.Errorf("failed to get cardID in "+ - "get device error code by logicID(%d)", logicID) - } - - return d.DcMgr.DcGetDeviceBoardInfo(cardID, deviceID) -} - -// GetCardElabelV2 get card elabel information -func (d *DeviceManager) GetCardElabelV2(cardID int32) (common.ElabelInfo, error) { - return d.DcMgr.DcGetCardElabelV2(cardID) -} - -// GetPCIEBandwidth get pcie bandwidth -func (d *DeviceManager) GetPCIEBandwidth(logicID int32, profilingTime int) (common.PCIEBwStat, error) { - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return common.PCIEBwStat{}, fmt.Errorf("get cardID(deviceID) failed, error by logicID(%d)", logicID) - } - pciePCIEBw, err := d.DcMgr.DcGetPCIEBandwidth(cardID, deviceID, profilingTime) - if err != nil { 
- return common.PCIEBwStat{}, err - } - return pciePCIEBw, nil -} - -// SetIsTrainingCard identifies whether it is a training card according to the usage of card -func (d *DeviceManager) SetIsTrainingCard() error { - devType := d.GetDevType() - if strings.HasPrefix(devType, api.Ascend310) { - d.isTrainingCard = false - return nil - } - - boardInfo := common.BoardInfo{} - cardNum, cardList, err := d.GetCardList() - if err != nil || cardNum == 0 { - hwlog.RunLog.Errorf("failed to get card list when set 'IsTrainingCard' err: %v", err) - return err - } - for _, cardID := range cardList { - devNum, err := d.GetDeviceNumInCard(cardID) - if err != nil { - hwlog.RunLog.Warnf("get device num by cardID(%d) failed when set 'IsTrainingCard', error: %v", cardID, err) - continue - } - if devNum == 0 { - hwlog.RunLog.Warnf("not found device on card %d when set 'IsTrainingCard'", cardID) - continue - } - - for devID := int32(0); devID < devNum; devID++ { - boardInfo, err = d.DcMgr.DcGetDeviceBoardInfo(cardID, devID) - if err != nil { - hwlog.RunLog.Warnf("get board info by card %d deviceID %d failed, err: %v", cardID, devID, err) - continue - } - break - } - if err == nil { - break - } - } - - if devType == api.Ascend910B && - (boardInfo.BoardId == common.A300IA2BoardId || boardInfo.BoardId == common.A300IA2GB64BoardId) { - d.isTrainingCard = false - return nil - } - - d.isTrainingCard = true - return nil -} - -// IsTrainingCard return true if it is a training card -func (d *DeviceManager) IsTrainingCard() bool { - return d.isTrainingCard -} - -// GetDcmiVersion get dcmi version -func (d *DeviceManager) GetDcmiVersion() string { - return d.dcmiVersion -} - -// GetMainBoardId get mainBoardId -func (d *DeviceManager) GetMainBoardId() uint32 { - return d.mainBoardId -} - -// GetValidChipInfo find a valid chip info from all cards -func (d *DeviceManager) GetValidChipInfo() (common.ChipInfo, error) { - chipInfo, err := getValidChipInfo(d.DcMgr) - if err != nil { - 
hwlog.RunLog.Error("failed to get valid chip info") - return common.ChipInfo{}, err - } - return chipInfo, nil -} - -// GetDeviceEccInfo query device ECC info -func (d *DeviceManager) GetDeviceEccInfo(logicID int32, dcmiDeviceType common.DcmiDeviceType) (*common.ECCInfo, error) { - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Errorf("get cardID and deviceID by logicID(%d) failed, error: %v", logicID, err) - return nil, err - } - return d.DcMgr.DcGetDeviceEccInfo(cardID, deviceID, dcmiDeviceType) -} - -// GetSuperPodInfo get 910A3 super pod info -func (d *DeviceManager) GetSuperPodInfo(logicID int32) (common.CgoSuperPodInfo, error) { - if !common.IsValidLogicIDOrPhyID(logicID) { - return common.CgoSuperPodInfo{}, fmt.Errorf("input invalid logicID: %d", logicID) - } - - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - return common.CgoSuperPodInfo{}, fmt.Errorf("failed to get cardID and deviceID by logicID(%d) "+ - "when get super pod info, error: %v", logicID, err) - } - cgoSuperPodInfo, err := d.DcMgr.DcGetSuperPodInfo(cardID, deviceID) - if err != nil { - return common.CgoSuperPodInfo{}, fmt.Errorf("failed to get super pod info by logicID(%d), error: %v", - logicID, err) - } - - return cgoSuperPodInfo, nil -} - -// GetSioInfo get SIO info -func (d *DeviceManager) GetSioInfo(logicID int32) (*common.SioCrcErrStatisticInfo, error) { - if !common.IsValidLogicIDOrPhyID(logicID) { - return nil, fmt.Errorf("input invalid logicID when get sio info: %d", logicID) - } - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - return nil, fmt.Errorf("failed to get cardID and deviceID by logicID(%d) when get sio info , error: %v", logicID, err) - } - cgoSPodSioInfo, err := d.DcMgr.DcGetSioInfo(cardID, deviceID) - if err != nil { - return nil, err - } - - return &cgoSPodSioInfo, nil -} - -// GetHccsStatisticInfo get HCCS statistic info -func (d *DeviceManager) 
GetHccsStatisticInfo(logicID int32) (*common.HccsStatisticInfo, error) { - if !common.IsValidLogicIDOrPhyID(logicID) { - return buildFailedHccsInfo(), fmt.Errorf("input invalid logicID when get hccs statistic info: %d", logicID) - } - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - return buildFailedHccsInfo(), fmt.Errorf("failed to get cardID and deviceID by logicID(%d) "+ - "when get hccs statistic info, error: %v", logicID, err) - } - cgoHccsStatusInfo, err := d.DcMgr.DcGetHccsStatisticInfo(cardID, deviceID) - if err != nil { - return buildFailedHccsInfo(), err - - } - - return &cgoHccsStatusInfo, nil -} - -// GetHccsStatisticInfoInU64 get hccs statistic info in u64 -func (d *DeviceManager) GetHccsStatisticInfoInU64(logicID int32) (*common.HccsStatisticInfo, error) { - if !common.IsValidLogicIDOrPhyID(logicID) { - return buildFailedHccsInfo(), fmt.Errorf("input invalid logicID when get hccs statistic info: %d", logicID) - } - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - return buildFailedHccsInfo(), fmt.Errorf("failed to get cardID and deviceID by logicID(%d) "+ - "when get hccs statistic info, error: %v", logicID, err) - } - cgoHccsStatusInfo, err := d.DcMgr.DcGetHccsStatisticInfoU64(cardID, deviceID) - if err != nil { - return buildFailedHccsInfo(), err - } - return &cgoHccsStatusInfo, nil -} - -// GetHccsBandwidthInfo get hccs bandwidth info -func (d *DeviceManager) GetHccsBandwidthInfo(logicID int32) (*common.HccsBandwidthInfo, error) { - - if !common.IsValidLogicIDOrPhyID(logicID) { - return buildFailedHccsBWInfo(), fmt.Errorf("input invalid logicID when get hccs bandwidth info: %d", logicID) - } - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - return buildFailedHccsBWInfo(), fmt.Errorf("failed to get cardID and deviceID by logicID(%d) "+ - "when get hccs bandwidth info, error: %v", logicID, err) - } - cgoHccsBandwidthInfo, err := 
d.DcMgr.DcGetHccsBandwidthInfo(cardID, deviceID, common.HccsBWProfilingTime) - if err != nil { - return buildFailedHccsBWInfo(), fmt.Errorf("failed to get hccs bandwidth info by cardId(%d) deviceID(%d), error: %v", - cardID, deviceID, err) - } - - return &cgoHccsBandwidthInfo, nil -} - -// buildFailedHccsInfo build failed hccs info -func buildFailedHccsInfo() *common.HccsStatisticInfo { - errorResult := &common.HccsStatisticInfo{ - TxCnt: make([]uint64, 8), - RxCnt: make([]uint64, 8), - CrcErrCnt: make([]uint64, 8), - } - for i := 0; i < 8; i++ { - errorResult.TxCnt[i] = common.FailedValue - errorResult.RxCnt[i] = common.FailedValue - errorResult.CrcErrCnt[i] = common.FailedValue - } - return errorResult -} - -// buildFailedHccsBWInfo build failed hccs bandwidth info -func buildFailedHccsBWInfo() *common.HccsBandwidthInfo { - errorResult := &common.HccsBandwidthInfo{ - ProfilingTime: uint32(common.HccsBWProfilingTime), - TotalTxbw: common.FailedValue, - TotalRxbw: common.FailedValue, - TxBandwidth: make([]float64, 8), - RxBandwidth: make([]float64, 8), - } - for i := 0; i < 8; i++ { - errorResult.TxBandwidth[i] = common.FailedValue - errorResult.RxBandwidth[i] = common.FailedValue - } - return errorResult -} - -func (d *DeviceManager) getCardIdAndDeviceId(logicID int32) (int32, int32, error) { - - if !common.IsValidLogicIDOrPhyID(logicID) { - return common.RetError, common.RetError, fmt.Errorf("input invalid logicID: %d", logicID) - } - - result, ok := idCache.Load(logicID) - if !ok { - return d.doGetCardIDAndDeviceID(logicID) - } - idMapping, ok := result.(npuIdMapping) - if !ok { - idCache.Delete(logicID) - return d.doGetCardIDAndDeviceID(logicID) - } - hwlog.RunLog.Debugf("get cardId and deviceId by logicID(%d) from cache, cardId:%v, deviceId:%v", - logicID, idMapping.cardId, idMapping.deviceId) - return idMapping.cardId, idMapping.deviceId, nil -} - -func (d *DeviceManager) doGetCardIDAndDeviceID(logicID int32) (int32, int32, error) { - cardId, deviceId, err := 
d.DcMgr.DcGetCardIDDeviceID(logicID) - if err != nil { - hwlog.RunLog.ErrorfWithLimit(common.DomainForLogicIdErr, logicID, - "failed to get cardId and deviceId by logicID(%d), error: %v", logicID, err) - return common.RetError, common.RetError, err - } - hwlog.ResetErrCnt(common.DomainForLogicIdErr, logicID) - hwlog.RunLog.Debugf("get cardId and deviceId by logicID(%d) from dcmi, cardId:%v, deviceId:%v", - logicID, cardId, deviceId) - idCache.Store(logicID, npuIdMapping{logicId: logicID, cardId: cardId, deviceId: deviceId}) - return cardId, deviceId, nil -} - -// GetChipBaseInfos get chip base info -func (d *DeviceManager) GetChipBaseInfos() ([]*common.ChipBaseInfo, error) { - _, cardList, err := d.DcMgr.DcGetCardList() - if err != nil { - return nil, fmt.Errorf("get card list failed, error: %v", err) - } - var chips = []*common.ChipBaseInfo{} - for _, cardID := range cardList { - devNumInCard, err := d.DcMgr.DcGetDeviceNumInCard(cardID) - if err != nil { - return nil, fmt.Errorf("get device num by cardID: %d failed, error: %v", - cardID, err) - } - for devID := int32(0); devID < devNumInCard; devID++ { - logicID, err := d.DcMgr.DcGetDeviceLogicID(cardID, devID) - if err != nil { - return nil, fmt.Errorf("get device (cardID: %d, deviceID: %d) logic id "+ - "failed, error: %v", cardID, devID, err) - } - physicID, err := d.DcMgr.DcGetPhysicIDFromLogicID(logicID) - if err != nil { - return nil, fmt.Errorf("get device (cardID: %d, deviceID: %d) physic id "+"failed, error: %v", - cardID, devID, err) - } - hwlog.RunLog.Infof("get chip base info, cardID: %d, deviceID: %d, logicID: %d, physicID: %d", cardID, - devID, logicID, physicID) - chips = append(chips, &common.ChipBaseInfo{ - PhysicID: physicID, - LogicID: logicID, - CardID: cardID, - DeviceID: devID, - }) - } - } - return chips, nil -} - -// DcStartHccsPingMesh start hccs ping mesh -func (d *DeviceManager) DcStartHccsPingMesh(cardID int32, deviceID int32, portID int, - operate common.HccspingMeshOperate) error { - 
return d.DcMgr.DcStartHccsPingMesh(cardID, deviceID, portID, operate) -} - -// DcStopHccsPingMesh stop hccs ping mesh -func (d *DeviceManager) DcStopHccsPingMesh(cardID int32, deviceID int32, portID int, taskID uint) error { - return d.DcMgr.DcStopHccsPingMesh(cardID, deviceID, portID, taskID) -} - -// DcGetHccsPingMeshInfo get hccs ping mesh info -func (d *DeviceManager) DcGetHccsPingMeshInfo(cardID int32, deviceID int32, portID int, - taskID uint) (*common.HccspingMeshInfo, error) { - return d.DcMgr.DcGetHccsPingMeshInfo(cardID, deviceID, portID, taskID) -} - -// DcGetHccsPingMeshState get hccs ping mesh state -func (d *DeviceManager) DcGetHccsPingMeshState(cardID int32, deviceID int32, portID int, taskID uint) (int, error) { - return d.DcMgr.DcGetHccsPingMeshState(cardID, deviceID, portID, taskID) -} - -// DcGetSuperPodStatus get super pod status -func (d *DeviceManager) DcGetSuperPodStatus(cardID int32, deviceID int32, sdid uint32) (int, error) { - var err error - var status int - for i := 0; i < maxRetries; i++ { - if status, err = d.DcMgr.DcGetSuperPodStatus(cardID, deviceID, sdid); err != nil { - hwlog.RunLog.Errorf("get super pod status failed, retry %d, cardID: %d, deviceID: %d, "+ - "sdid: %d, error: %v", i, cardID, deviceID, sdid, err) - continue - } - break - } - return status, err -} - -// DcSetSuperPodStatus set super pod status -func (d *DeviceManager) DcSetSuperPodStatus(cardID int32, deviceID int32, sdid, status uint32) error { - var err error - for i := 0; i < maxRetries; i++ { - if err = d.DcMgr.DcSetSuperPodStatus(cardID, deviceID, sdid, status); err != nil { - hwlog.RunLog.Errorf("set super pod status failed, retry %d, cardID: %d, deviceID: %d, "+ - "sdid: %d, status: %d, error: %v", i, cardID, deviceID, sdid, status, err) - continue - } - break - } - return err -} diff --git a/mind-cluster/component/ascend-common/devmanager/devmanager_910a3_mock.go b/mind-cluster/component/ascend-common/devmanager/devmanager_910a3_mock.go deleted file mode 
100644 index ca7121b..0000000 --- a/mind-cluster/component/ascend-common/devmanager/devmanager_910a3_mock.go +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright(C) 2024. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package devmanager this for device driver manager mock -package devmanager - -import ( - "ascend-common/api" -) - -// DeviceManager910A3Mock common device manager mock for Ascend910A3 -type DeviceManager910A3Mock struct { - DeviceManagerMock -} - -// GetDevType return mock type -func (d *DeviceManager910A3Mock) GetDevType() string { - return api.Ascend910A3 -} diff --git a/mind-cluster/component/ascend-common/devmanager/devmanager_910a3_mock_err.go b/mind-cluster/component/ascend-common/devmanager/devmanager_910a3_mock_err.go deleted file mode 100644 index 817f06e..0000000 --- a/mind-cluster/component/ascend-common/devmanager/devmanager_910a3_mock_err.go +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright(C) 2024. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package devmanager this for device driver manager error mock -package devmanager - -import ( - "errors" - - "ascend-common/api" - "ascend-common/devmanager/common" -) - -// DeviceManager910A3MockErr common device manager mock error for Ascend910A3 -type DeviceManager910A3MockErr struct { - DeviceManagerMockErr -} - -// GetDevType return mock type -func (d *DeviceManager910A3MockErr) GetDevType() string { - return api.Ascend910A3 -} - -// GetHccsStatisticInfo get hccs statistic info -func (d *DeviceManager910A3MockErr) GetHccsStatisticInfo(logicID int32) (*common.HccsStatisticInfo, error) { - return &common.HccsStatisticInfo{}, errors.New(errorMsg) -} - -// GetHccsBandwidthInfo get hccs statistic info -func (d *DeviceManager910A3MockErr) GetHccsBandwidthInfo(logicID int32) (*common.HccsBandwidthInfo, error) { - return &common.HccsBandwidthInfo{}, errors.New(errorMsg) -} diff --git a/mind-cluster/component/ascend-common/devmanager/devmanager_hccs_test.go b/mind-cluster/component/ascend-common/devmanager/devmanager_hccs_test.go deleted file mode 100644 index 3d7fff4..0000000 --- a/mind-cluster/component/ascend-common/devmanager/devmanager_hccs_test.go +++ /dev/null @@ -1,166 +0,0 @@ -/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package devmanager for device driver manager -package devmanager - -import ( - "errors" - "testing" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" - - "ascend-common/devmanager/common" - "ascend-common/devmanager/dcmi" -) - -const ( - mockLogicID int32 = 0 - mockCardID int32 = 0 - mockDeviceID int32 = 0 - invalidLogicID int32 = -1 - mockErrorMsg string = "mock error" - hccsArrayLen int = 8 -) - -type getHccsStatisticInfoInU64TestCase struct { - name string - logicID int32 - isValidID bool - getCardIDErr error - dcmiCallErr error - expectedErr bool -} - -func TestGetHccsStatisticInfoInU64(t *testing.T) { - testCases := buildGetHccsStatisticInfoInU64TestCases() - - for _, tc := range testCases { - convey.Convey(tc.name, t, func() { - patches := gomonkey.NewPatches() - defer patches.Reset() - - clearIdCache(tc.logicID) - manager := createMockDeviceManager() - setupGetHccsStatisticInfoInU64Patches(patches, manager, tc) - result, err := manager.GetHccsStatisticInfoInU64(tc.logicID) - verifyGetHccsStatisticInfoInU64Result(result, err, tc) - }) - } -} - -func clearIdCache(logicID int32) { - idCache.Delete(logicID) -} - -func buildGetHccsStatisticInfoInU64TestCases() []getHccsStatisticInfoInU64TestCase { - return []getHccsStatisticInfoInU64TestCase{ - {name: "should return failed info when logicID is invalid", - logicID: invalidLogicID, - isValidID: false, - expectedErr: true}, - {name: "should return failed info when getCardIdAndDeviceId fails", - logicID: mockLogicID, - isValidID: true, - getCardIDErr: errors.New(mockErrorMsg), - expectedErr: true}, - {name: "should return failed info when DcGetHccsStatisticInfoU64 fails", - logicID: mockLogicID, - isValidID: true, - dcmiCallErr: errors.New(mockErrorMsg), - expectedErr: true}, - {name: "should return success info when all operations succeed", - logicID: mockLogicID, - isValidID: true, - expectedErr: false}, - } -} - -func createMockDeviceManager() *DeviceManager { - return 
&DeviceManager{ - DcMgr: &dcmi.DcManager{}, - } -} - -func setupGetHccsStatisticInfoInU64Patches(patches *gomonkey.Patches, - manager *DeviceManager, tc getHccsStatisticInfoInU64TestCase) { - patches.ApplyFuncReturn(common.IsValidLogicIDOrPhyID, tc.isValidID) - if !tc.isValidID { - return - } - if tc.getCardIDErr != nil { - patches.ApplyMethodReturn(manager.DcMgr, "DcGetCardIDDeviceID", - mockCardID, mockDeviceID, tc.getCardIDErr) - } else { - patches.ApplyMethodReturn(manager.DcMgr, "DcGetCardIDDeviceID", - mockCardID, mockDeviceID, nil) - if tc.dcmiCallErr != nil { - patches.ApplyMethodReturn(manager.DcMgr, "DcGetHccsStatisticInfoU64", - common.HccsStatisticInfo{}, tc.dcmiCallErr) - } else { - mockHccsInfo := createMockHccsStatisticInfo() - patches.ApplyMethodReturn(manager.DcMgr, "DcGetHccsStatisticInfoU64", - mockHccsInfo, nil) - } - } -} - -func createMockHccsStatisticInfo() common.HccsStatisticInfo { - txCnt := make([]uint64, hccsArrayLen) - rxCnt := make([]uint64, hccsArrayLen) - crcErrCnt := make([]uint64, hccsArrayLen) - for i := 0; i < hccsArrayLen; i++ { - txCnt[i] = uint64(i + 1) - rxCnt[i] = uint64(i + 1) - crcErrCnt[i] = 0 - } - return common.HccsStatisticInfo{ - TxCnt: txCnt, - RxCnt: rxCnt, - CrcErrCnt: crcErrCnt, - } -} - -func verifyGetHccsStatisticInfoInU64Result(result *common.HccsStatisticInfo, - err error, tc getHccsStatisticInfoInU64TestCase) { - if tc.expectedErr { - convey.So(err, convey.ShouldNotBeNil) - convey.So(result, convey.ShouldNotBeNil) - verifyFailedHccsInfo(result) - } else { - convey.So(err, convey.ShouldBeNil) - convey.So(result, convey.ShouldNotBeNil) - verifySuccessHccsInfo(result) - } -} - -func verifyFailedHccsInfo(result *common.HccsStatisticInfo) { - convey.So(len(result.TxCnt), convey.ShouldEqual, hccsArrayLen) - convey.So(len(result.RxCnt), convey.ShouldEqual, hccsArrayLen) - convey.So(len(result.CrcErrCnt), convey.ShouldEqual, hccsArrayLen) - for i := 0; i < hccsArrayLen; i++ { - convey.So(result.TxCnt[i], 
convey.ShouldEqual, common.FailedValue) - convey.So(result.RxCnt[i], convey.ShouldEqual, common.FailedValue) - convey.So(result.CrcErrCnt[i], convey.ShouldEqual, common.FailedValue) - } -} - -func verifySuccessHccsInfo(result *common.HccsStatisticInfo) { - convey.So(len(result.TxCnt), convey.ShouldEqual, hccsArrayLen) - convey.So(len(result.RxCnt), convey.ShouldEqual, hccsArrayLen) - convey.So(len(result.CrcErrCnt), convey.ShouldEqual, hccsArrayLen) - convey.So(result.TxCnt[0], convey.ShouldEqual, uint64(1)) - convey.So(result.RxCnt[0], convey.ShouldEqual, uint64(1)) -} diff --git a/mind-cluster/component/ascend-common/devmanager/devmanager_mock.go b/mind-cluster/component/ascend-common/devmanager/devmanager_mock.go deleted file mode 100644 index c3bde2b..0000000 --- a/mind-cluster/component/ascend-common/devmanager/devmanager_mock.go +++ /dev/null @@ -1,370 +0,0 @@ -/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package devmanager this for device driver manager mock -package devmanager - -import ( - "ascend-common/api" - "ascend-common/devmanager/common" - "ascend-common/devmanager/dcmi" -) - -// DeviceManagerMock common device manager mock for Ascend910/310P/310 -type DeviceManagerMock struct { -} - -// DcStartHccsPingMesh start hccs ping mesh -func (d *DeviceManagerMock) DcStartHccsPingMesh(i int32, i2 int32, i3 int, operate common.HccspingMeshOperate) error { - return nil -} - -// DcStopHccsPingMesh stop hccs ping mesh -func (d *DeviceManagerMock) DcStopHccsPingMesh(i int32, i2 int32, i3 int, u uint) error { - return nil -} - -// DcGetHccsPingMeshInfo get hccs ping mesh info -func (d *DeviceManagerMock) DcGetHccsPingMeshInfo(i int32, i2 int32, i3 int, u uint) (*common.HccspingMeshInfo, error) { - return &common.HccspingMeshInfo{}, nil -} - -// DcGetHccsPingMeshState get hccs ping mesh state -func (d *DeviceManagerMock) DcGetHccsPingMeshState(i int32, i2 int32, i3 int, u uint) (int, error) { - return 0, nil -} - -// Init load symbol and initialize dcmi -func (d *DeviceManagerMock) Init() error { - return nil -} - -// ShutDown clean the dynamically loaded resource -func (d *DeviceManagerMock) ShutDown() error { - return nil -} - -// GetDevType return mock type -func (d *DeviceManagerMock) GetDevType() string { - return api.Ascend910A -} - -// GetDeviceCount get npu device count -func (d *DeviceManagerMock) GetDeviceCount() (int32, error) { - return 1, nil -} - -// GetCardList get all card list -func (d *DeviceManagerMock) GetCardList() (int32, []int32, error) { - return 1, []int32{0}, nil -} - -// GetDeviceNumInCard get all device list in one card -func (d *DeviceManagerMock) GetDeviceNumInCard(cardID int32) (int32, error) { - return 1, nil -} - -// GetDeviceList get all device logicID list -func (d *DeviceManagerMock) GetDeviceList() (int32, []int32, error) { - return 1, []int32{0}, nil -} - -// GetDeviceHealth query npu device health status -func (d 
*DeviceManagerMock) GetDeviceHealth(logicID int32) (uint32, error) { - return 0, nil -} - -// GetDeviceNetWorkHealth query npu device network health status -func (d *DeviceManagerMock) GetDeviceNetWorkHealth(logicID int32) (uint32, error) { - return 0, nil -} - -// GetDeviceUtilizationRate get npu device utilization -func (d *DeviceManagerMock) GetDeviceUtilizationRate(logicID int32, deviceType common.DeviceType) (uint32, error) { - return 1, nil -} - -// GetDeviceTemperature get npu device temperature -func (d *DeviceManagerMock) GetDeviceTemperature(logicID int32) (int32, error) { - return 1, nil -} - -// GetDeviceVoltage get npu device voltage -func (d *DeviceManagerMock) GetDeviceVoltage(logicID int32) (float32, error) { - return 1, nil -} - -// GetDevicePowerInfo get npu device power info -func (d *DeviceManagerMock) GetDevicePowerInfo(logicID int32) (float32, error) { - return 1, nil -} - -// GetDeviceFrequency get npu device work frequency -func (d *DeviceManagerMock) GetDeviceFrequency(logicID int32, deviceType common.DeviceType) (uint32, error) { - return 1, nil -} - -// GetDeviceMemoryInfo get npu memory information -func (d *DeviceManagerMock) GetDeviceMemoryInfo(logicID int32) (*common.MemoryInfo, error) { - return &common.MemoryInfo{ - MemorySize: 1, - MemoryAvailable: 1, - Frequency: 1, - Utilization: 1, - }, nil -} - -// GetDeviceHbmInfo get npu HBM module memory and frequency information -func (d *DeviceManagerMock) GetDeviceHbmInfo(logicID int32) (*common.HbmInfo, error) { - return &common.HbmInfo{ - MemorySize: 1, - Frequency: 1, - Usage: 1, - Temp: 1, - BandWidthUtilRate: 1, - }, nil -} - -// GetDeviceErrorCode get npu device error code -func (d *DeviceManagerMock) GetDeviceErrorCode(logicID int32) (int32, int64, error) { - return int32(0), int64(0), nil -} - -// GetChipInfo get npu device error code -func (d *DeviceManagerMock) GetChipInfo(logicID int32) (*common.ChipInfo, error) { - chip := &common.ChipInfo{ - Type: "ascend", - Name: 
common.Chip910, - Version: "v1", - } - return chip, nil -} - -// GetPhysicIDFromLogicID get device physic id from logic id -func (d *DeviceManagerMock) GetPhysicIDFromLogicID(logicID int32) (int32, error) { - return 1, nil -} - -// GetLogicIDFromPhysicID get device logic id from physic id -func (d *DeviceManagerMock) GetLogicIDFromPhysicID(physicID int32) (int32, error) { - return 1, nil -} - -// GetDeviceLogicID get device logic id from card id and device id -func (d *DeviceManagerMock) GetDeviceLogicID(cardID, deviceID int32) (int32, error) { - return 1, nil -} - -// GetDeviceIPAddress get device ip address -func (d *DeviceManagerMock) GetDeviceIPAddress(logicID, ipType int32) (string, error) { - if ipType == 0 { - return "127.0.0.1", nil - } - return "::1", nil -} - -// CreateVirtualDevice create virtual device -func (d *DeviceManagerMock) CreateVirtualDevice(logicID int32, vDevInfo common.CgoCreateVDevRes) (common. - CgoCreateVDevOut, error) { - return common.CgoCreateVDevOut{}, nil -} - -// GetVirtualDeviceInfo get virtual device info -func (d *DeviceManagerMock) GetVirtualDeviceInfo(logicID int32) (common.VirtualDevInfo, error) { - return common.VirtualDevInfo{}, nil -} - -// DestroyVirtualDevice destroy virtual device -func (d *DeviceManagerMock) DestroyVirtualDevice(logicID int32, vDevID uint32) error { - return nil -} - -// GetMcuPowerInfo get mcu power info for cardID -func (d *DeviceManagerMock) GetMcuPowerInfo(cardID int32) (float32, error) { - return 1, nil -} - -// GetCardIDDeviceID get cardID and deviceID by logicID -func (d *DeviceManagerMock) GetCardIDDeviceID(logicID int32) (int32, int32, error) { - return 0, 0, nil -} - -// GetProductType get product type success -func (d *DeviceManagerMock) GetProductType(cardID, deviceID int32) (string, error) { - return "", nil -} - -// GetAllProductType get all product type success -func (d *DeviceManagerMock) GetAllProductType() ([]string, error) { - return []string{}, nil -} - -// GetNpuWorkMode get npu 
chip work mode SMP success -func (d *DeviceManagerMock) GetNpuWorkMode() string { - return common.SMPMode -} - -// SetDeviceReset set device reset success -func (d *DeviceManagerMock) SetDeviceReset(cardID, deviceID int32) error { - return nil -} - -// GetDeviceBootStatus get device boot status success -func (d *DeviceManagerMock) GetDeviceBootStatus(logicID int32) (int, error) { - return common.BootStartFinish, nil -} - -// GetDeviceAllErrorCode get device all error code success -func (d *DeviceManagerMock) GetDeviceAllErrorCode(logicID int32) (int32, []int64, error) { - return 0, []int64{}, nil -} - -// SubscribeDeviceFaultEvent subscribe device fault event success -func (d *DeviceManagerMock) SubscribeDeviceFaultEvent(logicID int32) error { - return nil -} - -// SetFaultEventCallFunc set fault event call func success -func (d *DeviceManagerMock) SetFaultEventCallFunc(businessFunc func(common.DevFaultInfo)) error { - return nil -} - -// GetDieID get die id success -func (d *DeviceManagerMock) GetDieID(logicID int32, dcmiDieType dcmi.DieType) (string, error) { - return "ABCDEFGHIGKLMNOPQRSTUVWXYZ01234567890123", nil -} - -// GetDevProcessInfo get process info -func (d *DeviceManagerMock) GetDevProcessInfo(logicID int32) (*common.DevProcessInfo, error) { - return &common.DevProcessInfo{}, nil -} - -// GetPCIeBusInfo get pcie bus info -func (d *DeviceManagerMock) GetPCIeBusInfo(logicID int32) (string, error) { - return "0000:61:00.0", nil -} - -// GetBoardInfo Get board info -func (d *DeviceManagerMock) GetBoardInfo(logicID int32) (common.BoardInfo, error) { - return common.BoardInfo{}, nil -} - -// GetCardElabelV2 get card elabel information -func (d *DeviceManagerMock) GetCardElabelV2(cardID int32) (common.ElabelInfo, error) { - return common.ElabelInfo{}, nil -} - -// GetProductTypeArray test for get product type array -func (d *DeviceManagerMock) GetProductTypeArray() []string { - return []string{common.Atlas200ISoc} -} - -// GetPCIEBandwidth get pcie bandwidth 
-func (d *DeviceManagerMock) GetPCIEBandwidth(logicID int32, _ int) (common.PCIEBwStat, error) { - return common.PCIEBwStat{}, nil -} - -// SetIsTrainingCard set IsTrainingCard -func (d *DeviceManagerMock) SetIsTrainingCard() error { - return nil -} - -// IsTrainingCard get IsTrainingCard -func (d *DeviceManagerMock) IsTrainingCard() bool { - return true -} - -// GetDcmiVersion get dcmi version -func (d *DeviceManagerMock) GetDcmiVersion() string { - return "v1" -} - -// GetValidChipInfo get valid chip info from all npu -func (d *DeviceManagerMock) GetValidChipInfo() (common.ChipInfo, error) { - return common.ChipInfo{}, nil -} - -// GetDeviceEccInfo get device ECC info -func (d *DeviceManagerMock) GetDeviceEccInfo(logicID int32, - dcmiDeviceType common.DcmiDeviceType) (*common.ECCInfo, error) { - return &common.ECCInfo{EnableFlag: 1}, nil -} - -// GetSuperPodInfo get super pod info -func (d *DeviceManagerMock) GetSuperPodInfo(logicID int32) (common.CgoSuperPodInfo, error) { - return common.CgoSuperPodInfo{}, nil -} - -// GetSioInfo get sio info -func (d *DeviceManagerMock) GetSioInfo(logicID int32) (*common.SioCrcErrStatisticInfo, error) { - return &common.SioCrcErrStatisticInfo{ - TxErrCnt: 0, - RxErrCnt: 0, - }, nil -} - -// GetHccsStatisticInfo get hccs statistic info -func (d *DeviceManagerMock) GetHccsStatisticInfo(logicID int32) (*common.HccsStatisticInfo, error) { - return &common.HccsStatisticInfo{}, nil -} - -// GetHccsStatisticInfoInU64 get hccs statistic info in u64 -func (d *DeviceManagerMock) GetHccsStatisticInfoInU64(logicID int32) (*common.HccsStatisticInfo, error) { - return &common.HccsStatisticInfo{}, nil -} - -// GetMainBoardId get main board id -func (d *DeviceManagerMock) GetMainBoardId() uint32 { - return 0 -} - -// GetHccsBandwidthInfo get hccs statistic info -func (d *DeviceManagerMock) GetHccsBandwidthInfo(logicID int32) (*common.HccsBandwidthInfo, error) { - return &common.HccsBandwidthInfo{}, nil -} - -// GetBrotherCardID get brother 
card id -func (d *DeviceManagerMock) GetBrotherCardID(cardID, deviceID int32) (int32, error) { - const noneBroCard = -1 - return noneBroCard, nil -} - -// GetOutBandChannelState get out band channel state -func (d *DeviceManagerMock) GetOutBandChannelState(cardID, deviceID int32) error { - return nil -} - -// PreResetSoc pre reset soc, used before reset out band -func (d *DeviceManagerMock) PreResetSoc(cardID, deviceID int32) error { - return nil -} - -// SetDeviceResetOutBand reset spec device out band -func (d *DeviceManagerMock) SetDeviceResetOutBand(cardID, deviceID int32) error { - return nil -} - -// RescanSoc trigger soc rescan, non-blocking -func (d *DeviceManagerMock) RescanSoc(cardID, deviceID int32) error { - return nil -} - -// GetChipBaseInfos get chip base info -func (d *DeviceManagerMock) GetChipBaseInfos() ([]*common.ChipBaseInfo, error) { - return nil, nil -} - -func (d *DeviceManagerMock) DcGetSuperPodStatus(int32, int32, uint32) (int, error) { return 0, nil } - -func (d *DeviceManagerMock) DcSetSuperPodStatus(int32, int32, uint32, uint32) error { return nil } diff --git a/mind-cluster/component/ascend-common/devmanager/devmanager_mock_err.go b/mind-cluster/component/ascend-common/devmanager/devmanager_mock_err.go deleted file mode 100644 index 8ad8d7c..0000000 --- a/mind-cluster/component/ascend-common/devmanager/devmanager_mock_err.go +++ /dev/null @@ -1,369 +0,0 @@ -/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package devmanager this for device driver manager error mock -package devmanager - -import ( - "errors" - - "ascend-common/api" - "ascend-common/devmanager/common" - "ascend-common/devmanager/dcmi" -) - -var errorMsg = "mock error" - -// DeviceManagerMockErr common device manager mock error for Ascend910/310P/310 -type DeviceManagerMockErr struct { -} - -// DcStartHccsPingMesh start hccs ping mesh -func (d *DeviceManagerMockErr) DcStartHccsPingMesh(i int32, i2 int32, i3 int, - operate common.HccspingMeshOperate) error { - return errors.New(errorMsg) -} - -// DcStopHccsPingMesh stop hccs ping mesh -func (d *DeviceManagerMockErr) DcStopHccsPingMesh(i int32, i2 int32, i3 int, u uint) error { - return errors.New(errorMsg) -} - -// DcGetHccsPingMeshInfo get hccs ping mesh info -func (d *DeviceManagerMockErr) DcGetHccsPingMeshInfo(i int32, i2 int32, i3 int, u uint) (*common.HccspingMeshInfo, - error) { - return nil, errors.New(errorMsg) -} - -// DcGetHccsPingMeshState get hccs ping mesh state -func (d *DeviceManagerMockErr) DcGetHccsPingMeshState(i int32, i2 int32, i3 int, u uint) (int, error) { - return 1, errors.New(errorMsg) -} - -// Init load symbol and initialize dcmi -func (d *DeviceManagerMockErr) Init() error { - return errors.New(errorMsg) -} - -// ShutDown clean the dynamically loaded resource -func (d *DeviceManagerMockErr) ShutDown() error { - return errors.New(errorMsg) -} - -// GetDevType return mock type -func (d *DeviceManagerMockErr) GetDevType() string { - return api.Ascend910A -} - -// GetDeviceCount get npu device count -func (d *DeviceManagerMockErr) GetDeviceCount() (int32, error) { - return 1, errors.New(errorMsg) -} - -// GetCardList get all card list -func (d *DeviceManagerMockErr) GetCardList() (int32, []int32, error) { - return 1, []int32{0}, errors.New(errorMsg) -} - -// GetDeviceNumInCard get all device list in one card -func (d 
*DeviceManagerMockErr) GetDeviceNumInCard(cardID int32) (int32, error) { - return 1, errors.New(errorMsg) -} - -// GetDeviceList get all device logicID list -func (d *DeviceManagerMockErr) GetDeviceList() (int32, []int32, error) { - return 1, []int32{0}, errors.New(errorMsg) -} - -// GetDeviceHealth query npu device health status -func (d *DeviceManagerMockErr) GetDeviceHealth(logicID int32) (uint32, error) { - return 0, errors.New(errorMsg) -} - -// GetDeviceNetWorkHealth query npu device network health status -func (d *DeviceManagerMockErr) GetDeviceNetWorkHealth(logicID int32) (uint32, error) { - return 0, errors.New(errorMsg) -} - -// GetDeviceUtilizationRate get npu device utilization -func (d *DeviceManagerMockErr) GetDeviceUtilizationRate(logicID int32, deviceType common.DeviceType) (uint32, error) { - return 1, errors.New(errorMsg) -} - -// GetDeviceTemperature get npu device temperature -func (d *DeviceManagerMockErr) GetDeviceTemperature(logicID int32) (int32, error) { - return 1, errors.New(errorMsg) -} - -// GetDeviceVoltage get npu device voltage -func (d *DeviceManagerMockErr) GetDeviceVoltage(logicID int32) (float32, error) { - return 1, errors.New(errorMsg) -} - -// GetDevicePowerInfo get npu device power info -func (d *DeviceManagerMockErr) GetDevicePowerInfo(logicID int32) (float32, error) { - return 1, errors.New(errorMsg) -} - -// GetDeviceFrequency get npu device work frequency -func (d *DeviceManagerMockErr) GetDeviceFrequency(logicID int32, deviceType common.DeviceType) (uint32, error) { - return 1, errors.New(errorMsg) -} - -// GetDeviceMemoryInfo get npu memory information -func (d *DeviceManagerMockErr) GetDeviceMemoryInfo(logicID int32) (*common.MemoryInfo, error) { - return &common.MemoryInfo{ - MemorySize: 1, - MemoryAvailable: 1, - Frequency: 1, - Utilization: 1, - }, errors.New(errorMsg) -} - -// GetDeviceHbmInfo get npu HBM module memory and frequency information -func (d *DeviceManagerMockErr) GetDeviceHbmInfo(logicID int32) 
(*common.HbmInfo, error) { - return &common.HbmInfo{ - MemorySize: 1, - Frequency: 1, - Usage: 1, - Temp: 1, - BandWidthUtilRate: 1, - }, errors.New(errorMsg) -} - -// GetDeviceErrorCode get npu device error code -func (d *DeviceManagerMockErr) GetDeviceErrorCode(logicID int32) (int32, int64, error) { - return int32(0), int64(0), errors.New(errorMsg) -} - -// GetChipInfo get npu device error code -func (d *DeviceManagerMockErr) GetChipInfo(logicID int32) (*common.ChipInfo, error) { - chip := &common.ChipInfo{ - Type: "ascend", - Name: common.Chip910, - Version: "v1", - } - return chip, errors.New(errorMsg) -} - -// GetPhysicIDFromLogicID get device physic id from logic id -func (d *DeviceManagerMockErr) GetPhysicIDFromLogicID(logicID int32) (int32, error) { - return 1, errors.New(errorMsg) -} - -// GetLogicIDFromPhysicID get device logic id from physic id -func (d *DeviceManagerMockErr) GetLogicIDFromPhysicID(physicID int32) (int32, error) { - return 1, errors.New(errorMsg) -} - -// GetDeviceLogicID get device logic id from card id and device id -func (d *DeviceManagerMockErr) GetDeviceLogicID(cardID, deviceID int32) (int32, error) { - return 1, errors.New(errorMsg) -} - -// GetDeviceIPAddress get device ip address -func (d *DeviceManagerMockErr) GetDeviceIPAddress(logicID, ipType int32) (string, error) { - return "127.0.0.1", errors.New(errorMsg) -} - -// CreateVirtualDevice create virtual device -func (d *DeviceManagerMockErr) CreateVirtualDevice(logicID int32, - vDevInfo common.CgoCreateVDevRes) (common.CgoCreateVDevOut, error) { - return common.CgoCreateVDevOut{}, errors.New(errorMsg) -} - -// GetVirtualDeviceInfo get virtual device info -func (d *DeviceManagerMockErr) GetVirtualDeviceInfo(logicID int32) (common.VirtualDevInfo, error) { - return common.VirtualDevInfo{}, errors.New(errorMsg) -} - -// DestroyVirtualDevice destroy virtual device -func (d *DeviceManagerMockErr) DestroyVirtualDevice(logicID int32, vDevID uint32) error { - return errors.New(errorMsg) 
-} - -// GetMcuPowerInfo get mcu power info for cardID -func (d *DeviceManagerMockErr) GetMcuPowerInfo(cardID int32) (float32, error) { - return 1, errors.New(errorMsg) -} - -// GetCardIDDeviceID get cardID and deviceID by logicID -func (d *DeviceManagerMockErr) GetCardIDDeviceID(logicID int32) (int32, int32, error) { - return 0, 0, errors.New(errorMsg) -} - -// GetProductType get product type failed -func (d *DeviceManagerMockErr) GetProductType(cardID, deviceID int32) (string, error) { - return "", errors.New("not found product type name") -} - -// GetAllProductType get all product type failed -func (d *DeviceManagerMockErr) GetAllProductType() ([]string, error) { - return []string{}, errors.New("not found product type name") -} - -// GetNpuWorkMode get npu work mode failed -func (d *DeviceManagerMockErr) GetNpuWorkMode() string { - return "" -} - -// SetDeviceReset set device reset failed -func (d *DeviceManagerMockErr) SetDeviceReset(cardID, deviceID int32) error { - return errors.New(errorMsg) -} - -// GetDeviceBootStatus get device boot status failed -func (d *DeviceManagerMockErr) GetDeviceBootStatus(logicID int32) (int, error) { - return common.RetError, errors.New(errorMsg) -} - -// GetDeviceAllErrorCode get device all error code failed -func (d *DeviceManagerMockErr) GetDeviceAllErrorCode(logicID int32) (int32, []int64, error) { - return common.RetError, nil, errors.New(errorMsg) -} - -// SubscribeDeviceFaultEvent subscribe device fault event failed -func (d *DeviceManagerMockErr) SubscribeDeviceFaultEvent(logicID int32) error { - return errors.New(errorMsg) -} - -// SetFaultEventCallFunc set fault event call func failed -func (d *DeviceManagerMockErr) SetFaultEventCallFunc(businessFunc func(common.DevFaultInfo)) error { - return errors.New(errorMsg) -} - -// GetDieID get die id failed -func (d *DeviceManagerMockErr) GetDieID(logicID int32, dcmiDieType dcmi.DieType) (string, error) { - return "", errors.New(errorMsg) -} - -// GetDevProcessInfo get process 
info -func (d *DeviceManagerMockErr) GetDevProcessInfo(logicID int32) (*common.DevProcessInfo, error) { - return nil, errors.New(errorMsg) -} - -// GetPCIeBusInfo get PCIe bus info -func (d *DeviceManagerMockErr) GetPCIeBusInfo(logicID int32) (string, error) { - return "", errors.New(errorMsg) -} - -// GetBoardInfo get board info -func (d *DeviceManagerMockErr) GetBoardInfo(logicID int32) (common.BoardInfo, error) { - return common.BoardInfo{}, errors.New(errorMsg) -} - -// GetProductTypeArray test for get empty product type array -func (d *DeviceManagerMockErr) GetProductTypeArray() []string { - return nil -} - -// GetPCIEBandwidth get pcie bandwidth -func (d *DeviceManagerMockErr) GetPCIEBandwidth(logicID int32, _ int) (common.PCIEBwStat, error) { - return common.PCIEBwStat{}, errors.New(errorMsg) -} - -// SetIsTrainingCard set IsTrainingCard -func (d *DeviceManagerMockErr) SetIsTrainingCard() error { - return errors.New(errorMsg) -} - -// IsTrainingCard get IsTrainingCard -func (d *DeviceManagerMockErr) IsTrainingCard() bool { - return false -} - -// GetDcmiVersion get dcmi version failed -func (d *DeviceManagerMockErr) GetDcmiVersion() string { - return "" -} - -// GetValidChipInfo get valid chip info from all npu -func (d *DeviceManagerMockErr) GetValidChipInfo() (common.ChipInfo, error) { - return common.ChipInfo{}, errors.New("failed to find chip info") -} - -// GetDeviceEccInfo get device ECC info -func (d *DeviceManagerMockErr) GetDeviceEccInfo(logicID int32, - dcmiDeviceType common.DcmiDeviceType) (*common.ECCInfo, error) { - return nil, errors.New("failed to get device ECC info") -} - -// GetSuperPodInfo get super pod info -func (d *DeviceManagerMockErr) GetSuperPodInfo(logicID int32) (common.CgoSuperPodInfo, error) { - return common.CgoSuperPodInfo{}, nil -} - -// GetSioInfo get sio info -func (d *DeviceManagerMockErr) GetSioInfo(logicID int32) (*common.SioCrcErrStatisticInfo, error) { - return nil, errors.New(errorMsg) -} - -// GetHccsStatisticInfo get 
hccs statistic info -func (d *DeviceManagerMockErr) GetHccsStatisticInfo(logicID int32) (*common.HccsStatisticInfo, error) { - return nil, errors.New(errorMsg) -} - -// GetHccsStatisticInfoInU64 get hccs statistic info in u64 -func (d *DeviceManagerMockErr) GetHccsStatisticInfoInU64(logicID int32) (*common.HccsStatisticInfo, error) { - return nil, errors.New(errorMsg) -} - -// GetMainBoardId get main board id -func (d *DeviceManagerMockErr) GetMainBoardId() uint32 { - return 0 -} - -// GetHccsBandwidthInfo get hccs statistic info -func (d *DeviceManagerMockErr) GetHccsBandwidthInfo(logicID int32) (*common.HccsBandwidthInfo, error) { - return nil, errors.New(errorMsg) -} - -// GetBrotherCardID get brother card id -func (d *DeviceManagerMockErr) GetBrotherCardID(cardID, deviceID int32) (int32, error) { - return -1, nil -} - -// GetOutBandChannelState get out band channel state -func (d *DeviceManagerMockErr) GetOutBandChannelState(cardID, deviceID int32) error { - return nil -} - -// PreResetSoc pre reset soc, used before reset out band -func (d *DeviceManagerMockErr) PreResetSoc(cardID, deviceID int32) error { - return nil -} - -// SetDeviceResetOutBand reset spec device out band -func (d *DeviceManagerMockErr) SetDeviceResetOutBand(cardID, deviceID int32) error { - return nil -} - -// RescanSoc trigger soc rescan, non-blocking -func (d *DeviceManagerMockErr) RescanSoc(cardID, deviceID int32) error { - return nil -} - -// GetChipBaseInfos get chip base info -func (d *DeviceManagerMockErr) GetChipBaseInfos() ([]*common.ChipBaseInfo, error) { - return nil, errors.New(errorMsg) -} - -func (d *DeviceManagerMockErr) DcGetSuperPodStatus(int32, int32, uint32) (int, error) { return 0, nil } - -func (d *DeviceManagerMockErr) DcSetSuperPodStatus(int32, int32, uint32, uint32) error { return nil } - -// GetCardElabelV2 get card elabel information -func (d *DeviceManagerMockErr) GetCardElabelV2(cardID int32) (common.ElabelInfo, error) { - return common.ElabelInfo{}, nil -} diff 
--git a/mind-cluster/component/ascend-common/devmanager/devmanager_test.go b/mind-cluster/component/ascend-common/devmanager/devmanager_test.go deleted file mode 100644 index 221a812..0000000 --- a/mind-cluster/component/ascend-common/devmanager/devmanager_test.go +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Package devmanager for device driver manager -package devmanager - -import ( - "errors" - "testing" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" - - "ascend-common/common-utils/hwlog" - "ascend-common/devmanager/common" - "ascend-common/devmanager/dcmi" -) - -// TestGetCardIdAndDeviceId test the getCardIdAndDeviceId function -func TestGetCardIdAndDeviceId(t *testing.T) { - - var ( - cardId, deviceId = int32(0), int32(0) - err error - returnValue = int32(0) - errReturnValue = int32(-1) - ) - manager := &DeviceManager{DcMgr: &dcmi.DcManager{}} - convey.Convey("failed to get info by dcmi", t, func() { - mk2 := gomonkey.ApplyMethodReturn(manager.DcMgr, "DcGetCardIDDeviceID", - errReturnValue, errReturnValue, errors.New("mock err")) - defer mk2.Reset() - cardId, deviceId, err = manager.getCardIdAndDeviceId(0) - - convey.So(cardId, convey.ShouldEqual, common.RetError) - convey.So(deviceId, convey.ShouldEqual, common.RetError) - convey.So(err, convey.ShouldNotBeNil) - - }) - - mk := 
gomonkey.ApplyMethodReturn(manager.DcMgr, "DcGetCardIDDeviceID", returnValue, returnValue, nil) - defer mk.Reset() - - convey.Convey("get info from dcmi", t, func() { - testGetCardIdAndDeviceId(t, cardId, deviceId, err, manager) - }) - convey.Convey("get info from cache", t, func() { - testGetCardIdAndDeviceId(t, cardId, deviceId, err, manager) - }) - -} - -func testGetCardIdAndDeviceId(t *testing.T, cardId int32, deviceId int32, err error, manager *DeviceManager) { - cardId, deviceId, err = manager.getCardIdAndDeviceId(0) - - convey.So(cardId, convey.ShouldEqual, 0) - convey.So(deviceId, convey.ShouldEqual, 0) - convey.So(err, convey.ShouldBeNil) - -} -func init() { - config := hwlog.LogConfig{ - OnlyToStdout: true, - } - hwlog.InitRunLogger(&config, nil) -} diff --git a/mind-cluster/component/ascend-common/devmanager/hccn/hccn_tool.go b/mind-cluster/component/ascend-common/devmanager/hccn/hccn_tool.go deleted file mode 100644 index b6388f4..0000000 --- a/mind-cluster/component/ascend-common/devmanager/hccn/hccn_tool.go +++ /dev/null @@ -1,335 +0,0 @@ -/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package hccn this for npu hccn info -package hccn - -import ( - "fmt" - "os" - "os/exec" - "strconv" - "strings" - - "ascend-common/common-utils/hwlog" - "ascend-common/common-utils/limiter" - "ascend-common/common-utils/utils" - "ascend-common/devmanager/common" -) - -const ( - space = " " - newLine = "\n" - colon = ":" - - // LinkUp npu interface up - LinkUp string = "UP" - // LinkDown npu interface down - LinkDown string = "DOWN" - - opticalPartLen = 2 - secondIndex = 2 - linkStatusPart = 3 - base64 = 64 - - cardHealthy = 0 - - normalCode = 1 - abnormalCode = 0 - - naValue = "NA" - notSupport = "not supported" - unknownStr = "Unknown!" - - limitSize = 1024 * 1024 -) - -func getInfoFromHccnTool(args ...string) (string, error) { - const hccnTool = "/usr/local/Ascend/driver/tools/hccn_tool" - if _, err := utils.CheckPath(hccnTool); err != nil { - return "", err - } - cmd := exec.Command(hccnTool, args...) - cmd.Env = []string{ - "PATH=" + os.Getenv("PATH"), - utils.LdLibPath + "=" + os.Getenv(utils.LdLibPath), - } - limitStdout := limiter.NewLimitedWriter(limitSize) - cmd.Stdout = limitStdout - cmd.Stderr = limiter.NewLimitedWriter(limitSize) - err := cmd.Run() - if err != nil { - return "", err - } - - return string(limitStdout.GetBufferBytes()), nil -} - -// GetNPULinkStatus exec "hccn_tool -i * -link -g" to get link status -func GetNPULinkStatus(phyID int32) (string, error) { - args := []string{"-i", strconv.Itoa(int(phyID)), "-link", "-g"} - // command example: hccn_tool -i 0 -link -g - // success result example is: link status: DOWN - outStr, err := getInfoFromHccnTool(args...) 
- hwlog.RunLog.Debugf("hccn_tool command exec result: %v", outStr) - if err != nil { - return common.Abnormal, buildHccnErr(phyID, "link status", err) - } - replacedStr := strings.ReplaceAll(outStr, newLine, "") - outArr := strings.Split(replacedStr, space) - if len(outArr) != linkStatusPart { - return common.Abnormal, buildHccnErr(phyID, "link status", - fmt.Errorf("length of output %v is not equal to %v", outArr, linkStatusPart)) - } - - status := outArr[secondIndex] - hwlog.RunLog.Debugf("hccn_tool get npu link status: %s", status) - return status, nil -} - -// GetNPULinkSpeed exec "hccn_tool -i * -speed -g" to get link speed -func GetNPULinkSpeed(phyID int32) (int, error) { - args := []string{"-i", strconv.Itoa(int(phyID)), "-speed", "-g"} - // command example: hccn_tool -i 0 -speed -g - // success result example is: Speed: 100000 Mb/s - outStr, err := getInfoFromHccnTool(args...) - if err != nil { - return common.RetError, buildHccnErr(phyID, "link speed", err) - } - return getSpeedFromOutStr(outStr, phyID) -} - -func getSpeedFromOutStr(outStr string, phyID int32) (int, error) { - if strings.Contains(outStr, unknownStr) { - return common.RetError, buildHccnErr(phyID, "link speed", fmt.Errorf("npu link speed is unknown")) - } - replacedStr := strings.ReplaceAll(outStr, newLine, "") - outArr := strings.Split(replacedStr, space) - if len(outArr) != linkStatusPart { - return common.RetError, buildHccnErr(phyID, "link speed", fmt.Errorf("length of output %v is not equal to %v", - outArr, linkStatusPart)) - } - const midIndex = 1 - speed, err := strconv.Atoi(outArr[midIndex]) - if err != nil { - return common.RetError, buildHccnErr(phyID, "link speed", fmt.Errorf("covert speed from string failed: %s", err)) - } - - return speed, nil -} - -// GetNPULinkUpNum exec "hccn_tool -i * -link_stat -g" to get link up count -func GetNPULinkUpNum(phyID int32) (int, error) { - args := []string{"-i", strconv.Itoa(int(phyID)), "-link_stat", "-g"} - // command example: hccn_tool -i 
0 -link_stat -g - // success result include: [device x]link up count : y - outStr, err := getInfoFromHccnTool(args...) - if err != nil { - return common.RetError, buildHccnErr(phyID, "link stat", err) - } - - const ( - linkUpArrLen = 6 - linkUpStr = "link up count" - ) - linkUPCount := 0 - lines := strings.Split(outStr, newLine) - for _, line := range lines { - if line == "" || !strings.Contains(line, linkUpStr) { - continue - } - - linkUpArr := strings.Fields(line) - if len(linkUpArr) != linkUpArrLen { - return common.RetError, buildHccnErr(phyID, "link up num", fmt.Errorf("length of output %v is not "+ - "equal to %v", linkUpArr, linkUpArrLen)) - } - if linkUPCount, err = strconv.Atoi(linkUpArr[linkUpArrLen-1]); err != nil { - return common.RetError, buildHccnErr(phyID, "link up num", - fmt.Errorf("covert link up num from string failed: %s", err)) - } - return linkUPCount, nil - } - - return common.RetError, buildHccnErr(phyID, "link up num", fmt.Errorf("did not find link up count")) -} - -// GetNPUStatInfo exec "hccn_tool -i * -stat -g" to get stat info -func GetNPUStatInfo(phyID int32) (map[string]int, error) { - args := []string{"-i", strconv.Itoa(int(phyID)), "-stat", "-g"} - // command example: hccn_tool -i 0 -stat -g - // success result include: [device x]link up count : y - outStr, err := getInfoFromHccnTool(args...) 
- if err != nil { - return nil, buildHccnErr(phyID, "stat", err) - } - lines := strings.Split(outStr, newLine) - statInfoMap := make(map[string]int) - const statPartLen = 2 - for _, line := range lines { - statParts := strings.Split(line, colon) - if len(statParts) != statPartLen || statParts[1] == "" { - continue - } - statNum, err := strconv.Atoi(statParts[1]) - if err != nil { - hwlog.RunLog.Errorf("covert stat num of [%s] from string failed: %s", statParts[1], err) - continue - } - statInfoMap[statParts[0]] = statNum - } - - return statInfoMap, nil -} - -// GetNPUOpticalInfo exec "hccn_tool -i * -optical -g" to get optical info -func GetNPUOpticalInfo(phyID int32) (map[string]string, error) { - args := []string{"-i", strconv.Itoa(int(phyID)), "-optical", "-g"} - // command example: hccn_tool -i 0 -optical -g - // success result include: [device x]link up count : y - outStr, err := getInfoFromHccnTool(args...) - if err != nil { - return nil, buildHccnErr(phyID, "optical", err) - } - lines := strings.Split(outStr, newLine) - opticalInfoMap := make(map[string]string) - for _, line := range lines { - opticalParts := strings.Split(line, colon) - if len(opticalParts) != opticalPartLen { - continue - } - opticalKey := strings.ReplaceAll(strings.TrimSpace(opticalParts[0]), space, "_") - opticalValue := strings.TrimSpace(opticalParts[1]) - opticalInfoMap[opticalKey] = opticalValue - } - - return opticalInfoMap, nil -} - -// GetNPUInterfaceTraffic exec "hccn_tool -i * -bandwidth -g" to get bandwidth info -func GetNPUInterfaceTraffic(phyID int32) (float64, float64, error) { - const ( - noTraffic = common.RetError - trafficPartLen = 4 - txStr = "TX:" - rxStr = "RX:" - ) - - args := []string{"-i", strconv.Itoa(int(phyID)), "-bandwidth", "-g"} - // command example: hccn_tool -i 0 -bandwidth -g - // success result has two lines: - // Bandwidth TX: 0.00 MB/sec - // Bandwidth RX: 0.00 MB/sec - outStr, err := getInfoFromHccnTool(args...) 
- hwlog.RunLog.Debugf("hccn_tool command exec result: %v", outStr) - if err != nil { - return noTraffic, noTraffic, buildHccnErr(phyID, "interface traffic", err) - } - - var ( - tx = float64(noTraffic) - rx = float64(noTraffic) - ) - - lines := strings.Split(outStr, newLine) - for _, line := range lines { - if line == "" { - continue - } - - trafficArr := strings.Fields(line) - hwlog.RunLog.Debugf("npu bandwidth split as: %v", trafficArr) - if len(trafficArr) != trafficPartLen { - continue - } - if strings.Contains(line, txStr) { - tmpTx, err := strconv.ParseFloat(trafficArr[secondIndex], base64) - if err != nil { - hwlog.RunLog.Errorf("get float data from Bandwidth TX err: %s", err) - continue - } - tx = tmpTx - } - if strings.Contains(line, rxStr) { - tmpRx, err := strconv.ParseFloat(trafficArr[secondIndex], base64) - if err != nil { - hwlog.RunLog.Errorf("get float data from Bandwidth RX err: %s", err) - continue - } - rx = tmpRx - } - } - return tx, rx, nil -} - -// GetFloatDataFromStr get float data from string with space -func GetFloatDataFromStr(str, dataType string) float64 { - if str == "" || strings.Contains(str, naValue) || strings.Contains(str, notSupport) { - return common.RetError - } - dataParts := strings.Split(str, space) - if len(dataParts) != opticalPartLen { - errMsg := fmt.Sprintf("convert %v optical data type failed, "+ - "the length of optical data %v is %v not equal to %d. 
", dataType, dataParts, len(dataParts), opticalPartLen) - hwlog.RunLog.Error(errMsg) - return common.RetError - } - floatData, err := strconv.ParseFloat(dataParts[0], base64) - if err != nil { - hwlog.RunLog.Errorf("convert %v optical data type to a floating-point number failed, "+ - "get float data from string %v failed, err: %v", dataType, dataParts[0], err) - return common.RetError - } - return floatData -} - -// GetHealthCode return union healthy code -func GetHealthCode(healthCode uint32) int { - if healthCode == common.UnRetError { - return common.RetError - } - - if healthCode == cardHealthy { - return normalCode - } - return abnormalCode -} - -// GetLinkStatusCode return union link status code -func GetLinkStatusCode(status string) int { - if status == common.Abnormal { - return common.RetError - } - - if status == LinkUp { - return normalCode - } - return abnormalCode -} - -// GetNetworkHealthy return union network healthy code -func GetNetworkHealthy(netCode uint32) int { - if netCode == common.UnRetError { - return common.RetError - } - - if netCode == common.NetworkInit || netCode == common.NetworkSuccess { - return normalCode - } - return abnormalCode -} - -func buildHccnErr(phyID int32, msg string, err error) error { - return fmt.Errorf("phyID(%d),get npu %s info failed,error is :%v", phyID, msg, err) -} diff --git a/mind-cluster/component/ascend-common/devmanager/hccn/hccn_tool_test.go b/mind-cluster/component/ascend-common/devmanager/hccn/hccn_tool_test.go deleted file mode 100644 index 7d4fe17..0000000 --- a/mind-cluster/component/ascend-common/devmanager/hccn/hccn_tool_test.go +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package hccn this for npu hccn info -package hccn - -import ( - "fmt" - "strings" - "testing" -) - -func TestBuildHccnErr(t *testing.T) { - t.Run("normal error", func(t *testing.T) { - phyID := int32(1) - msg := "status" - originalErr := fmt.Errorf("permission denied") - - err := buildHccnErr(phyID, msg, originalErr) - - if !strings.Contains(err.Error(), "phyID(1)") { - t.Error("should contain phyID") - } - if !strings.Contains(err.Error(), "npu status") { - t.Error("should contain npu message") - } - if !strings.Contains(err.Error(), "permission denied") { - t.Error("should contain original error") - } - }) - - t.Run("nil error", func(t *testing.T) { - err := buildHccnErr(0, "", nil) - if !strings.Contains(err.Error(), "error is :nil") { - t.Error("should handle nil error") - } - }) -} diff --git a/mind-cluster/component/ascend-common/go.mod b/mind-cluster/component/ascend-common/go.mod deleted file mode 100644 index e1e3bbb..0000000 --- a/mind-cluster/component/ascend-common/go.mod +++ /dev/null @@ -1,55 +0,0 @@ -module ascend-common - -go 1.18 - -require ( - github.com/agiledragon/gomonkey/v2 v2.8.0 - github.com/fsnotify/fsnotify v1.6.0 - github.com/kubeflow/common v0.4.3 - github.com/smartystreets/goconvey v1.6.4 - k8s.io/api v0.25.3 - k8s.io/apimachinery v0.25.3 - k8s.io/client-go v0.25.3 -) - -require ( - github.com/PuerkitoBio/purell v1.1.1 // indirect - github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 // indirect - github.com/davecgh/go-spew v1.1.1 // indirect - github.com/emicklei/go-restful/v3 v3.8.0 // indirect - 
github.com/go-logr/logr v1.2.3 // indirect - github.com/go-openapi/jsonpointer v0.19.5 // indirect - github.com/go-openapi/jsonreference v0.19.5 // indirect - github.com/go-openapi/swag v0.19.14 // indirect - github.com/gogo/protobuf v1.3.2 // indirect - github.com/golang/protobuf v1.5.2 // indirect - github.com/google/gnostic v0.5.7-v3refs // indirect - github.com/google/go-cmp v0.5.8 // indirect - github.com/google/gofuzz v1.1.0 // indirect - github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 // indirect - github.com/josharian/intern v1.0.0 // indirect - github.com/json-iterator/go v1.1.12 // indirect - github.com/jtolds/gls v4.20.0+incompatible // indirect - github.com/mailru/easyjson v0.7.6 // indirect - github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect - github.com/modern-go/reflect2 v1.0.2 // indirect - github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect - github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d // indirect - golang.org/x/net v0.0.0-20220722155237-a158d28d115b // indirect - golang.org/x/oauth2 v0.0.0-20211104180415-d3ed0bb246c8 // indirect - golang.org/x/sys v0.8.0 // indirect - golang.org/x/term v0.0.0-20210927222741-03fcf44c2211 // indirect - golang.org/x/text v0.3.7 // indirect - golang.org/x/time v0.0.0-20220210224613-90d013bbcef8 // indirect - google.golang.org/appengine v1.6.7 // indirect - google.golang.org/protobuf v1.28.0 // indirect - gopkg.in/inf.v0 v0.9.1 // indirect - gopkg.in/yaml.v2 v2.4.0 // indirect - gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/klog/v2 v2.70.1 // indirect - k8s.io/kube-openapi v0.0.0-20220803162953-67bda5d908f1 // indirect - k8s.io/utils v0.0.0-20220728103510-ee6ede2d64ed // indirect - sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 // indirect - sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect - sigs.k8s.io/yaml v1.3.0 // indirect -) diff --git a/mind-cluster/component/ascend-common/go.sum 
b/mind-cluster/component/ascend-common/go.sum deleted file mode 100644 index 000ced7..0000000 --- a/mind-cluster/component/ascend-common/go.sum +++ /dev/null @@ -1,492 +0,0 @@ -cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= -cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= -cloud.google.com/go v0.38.0/go.mod h1:990N+gfupTy94rShfmMCWGDn0LpTmnzTp2qbd1dvSRU= -cloud.google.com/go v0.44.1/go.mod h1:iSa0KzasP4Uvy3f1mN/7PiObzGgflwredwwASm/v6AU= -cloud.google.com/go v0.44.2/go.mod h1:60680Gw3Yr4ikxnPRS/oxxkBccT6SA1yMk63TGekxKY= -cloud.google.com/go v0.45.1/go.mod h1:RpBamKRgapWJb87xiFSdk4g1CME7QZg3uwTez+TSTjc= -cloud.google.com/go v0.46.3/go.mod h1:a6bKKbmY7er1mI7TEI4lsAkts/mkhTSZK8w33B4RAg0= -cloud.google.com/go v0.50.0/go.mod h1:r9sluTvynVuxRIOHXQEHMFffphuXHOMZMycpNR5e6To= -cloud.google.com/go v0.52.0/go.mod h1:pXajvRH/6o3+F9jDHZWQ5PbGhn+o8w9qiu/CffaVdO4= -cloud.google.com/go v0.53.0/go.mod h1:fp/UouUEsRkN6ryDKNW/Upv/JBKnv6WDthjR6+vze6M= -cloud.google.com/go v0.54.0/go.mod h1:1rq2OEkV3YMf6n/9ZvGWI3GWw0VoqH/1x2nd8Is/bPc= -cloud.google.com/go v0.56.0/go.mod h1:jr7tqZxxKOVYizybht9+26Z/gUq7tiRzu+ACVAMbKVk= -cloud.google.com/go v0.57.0/go.mod h1:oXiQ6Rzq3RAkkY7N6t3TcE6jE+CIBBbA36lwQ1JyzZs= -cloud.google.com/go v0.62.0/go.mod h1:jmCYTdRCQuc1PHIIJ/maLInMho30T/Y0M4hTdTShOYc= -cloud.google.com/go v0.65.0/go.mod h1:O5N8zS7uWy9vkA9vayVHs65eM1ubvY4h553ofrNHObY= -cloud.google.com/go/bigquery v1.0.1/go.mod h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o= -cloud.google.com/go/bigquery v1.3.0/go.mod h1:PjpwJnslEMmckchkHFfq+HTD2DmtT67aNFKH1/VBDHE= -cloud.google.com/go/bigquery v1.4.0/go.mod h1:S8dzgnTigyfTmLBfrtrhyYhwRxG72rYxvftPBK2Dvzc= -cloud.google.com/go/bigquery v1.5.0/go.mod h1:snEHRnqQbz117VIFhE8bmtwIDY80NLUZUMb4Nv6dBIg= -cloud.google.com/go/bigquery v1.7.0/go.mod h1://okPTzCYNXSlb24MZs83e2Do+h+VXtc4gLoIoXIAPc= -cloud.google.com/go/bigquery v1.8.0/go.mod h1:J5hqkt3O0uAFnINi6JXValWIb1v0goeZM77hZzJN/fQ= 
-cloud.google.com/go/datastore v1.0.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7TKLgZqpHYE= -cloud.google.com/go/datastore v1.1.0/go.mod h1:umbIZjpQpHh4hmRpGhH4tLFup+FVzqBi1b3c64qFpCk= -cloud.google.com/go/pubsub v1.0.1/go.mod h1:R0Gpsv3s54REJCy4fxDixWD93lHJMoZTyQ2kNxGRt3I= -cloud.google.com/go/pubsub v1.1.0/go.mod h1:EwwdRX2sKPjnvnqCa270oGRyludottCI76h+R3AArQw= -cloud.google.com/go/pubsub v1.2.0/go.mod h1:jhfEVHT8odbXTkndysNHCcx0awwzvfOlguIAii9o8iA= -cloud.google.com/go/pubsub v1.3.1/go.mod h1:i+ucay31+CNRpDW4Lu78I4xXG+O1r/MAHgjpRVR+TSU= -cloud.google.com/go/storage v1.0.0/go.mod h1:IhtSnM/ZTZV8YYJWCY8RULGVqBDmpoyjwiyrjsg+URw= -cloud.google.com/go/storage v1.5.0/go.mod h1:tpKbwo567HUNpVclU5sGELwQWBDZ8gh0ZeosJ0Rtdos= -cloud.google.com/go/storage v1.6.0/go.mod h1:N7U0C8pVQ/+NIKOBQyamJIeKQKkZ+mxpohlUTyfDhBk= -cloud.google.com/go/storage v1.8.0/go.mod h1:Wv1Oy7z6Yz3DshWRJFhqM/UCfaWIRTdp0RXyy7KQOVs= -cloud.google.com/go/storage v1.10.0/go.mod h1:FLPqc6j+Ki4BU591ie1oL6qBQGu2Bl/tZ9ullr3+Kg0= -dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= -github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= -github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= -github.com/PuerkitoBio/purell v1.1.1 h1:WEQqlqaGbrPkxLJWfBwQmfEAE1Z7ONdDLqrN38tNFfI= -github.com/PuerkitoBio/purell v1.1.1/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0= -github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 h1:d+Bc7a5rLufV/sSk/8dngufqelfh6jnri85riMAaF/M= -github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE= -github.com/agiledragon/gomonkey/v2 v2.8.0 h1:u2K2nNGyk0ippzklz1CWalllEB9ptD+DtSXeCX5O000= -github.com/agiledragon/gomonkey/v2 v2.8.0/go.mod h1:ap1AmDzcVOAz1YpeJ3TCzIgstoaWLA6jbbgxfB4w2iY= -github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod 
h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= -github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI= -github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= -github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= -github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= -github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= -github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= -github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= -github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/docopt/docopt-go v0.0.0-20180111231733-ee0de3bc6815/go.mod h1:WwZ+bS3ebgob9U8Nd0kOddGdZWjyMGR8Wziv+TBNwSE= -github.com/emicklei/go-restful/v3 v3.8.0 h1:eCZ8ulSerjdAiaNpF7GxXIE7ZCMo1moN1qX+S609eVw= -github.com/emicklei/go-restful/v3 v3.8.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= -github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= -github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= -github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= -github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= -github.com/fsnotify/fsnotify v1.6.0 h1:n+5WquG0fcWoWp6xPWfHdbskMCQaFnG6PfBrh1Ky4HY= -github.com/fsnotify/fsnotify v1.6.0/go.mod h1:sl3t1tCWJFWoRz9R8WJCbQihKKwmorjAbSClcnxKAGw= -github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU= -github.com/go-gl/glfw/v3.3/glfw 
v0.0.0-20191125211704-12ad95a8df72/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= -github.com/go-gl/glfw/v3.3/glfw v0.0.0-20200222043503-6f7a984d4dc4/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= -github.com/go-logr/logr v0.1.0/go.mod h1:ixOQHD9gLJUVQQ2ZOR7zLEifBX6tGkNJF4QyIY7sIas= -github.com/go-logr/logr v1.2.0/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= -github.com/go-logr/logr v1.2.3 h1:2DntVwHkVopvECVRSlL5PSo9eG+cAkDCuckLubN+rq0= -github.com/go-logr/logr v1.2.3/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= -github.com/go-openapi/jsonpointer v0.19.3/go.mod h1:Pl9vOtqEWErmShwVjC8pYs9cog34VGT37dQOVbmoatg= -github.com/go-openapi/jsonpointer v0.19.5 h1:gZr+CIYByUqjcgeLXnQu2gHYQC9o73G2XUeOFYEICuY= -github.com/go-openapi/jsonpointer v0.19.5/go.mod h1:Pl9vOtqEWErmShwVjC8pYs9cog34VGT37dQOVbmoatg= -github.com/go-openapi/jsonreference v0.19.5 h1:1WJP/wi4OjB4iV8KVbH73rQaoialJrqv8gitZLxGLtM= -github.com/go-openapi/jsonreference v0.19.5/go.mod h1:RdybgQwPxbL4UEjuAruzK1x3nE69AqPYEJeo/TWfEeg= -github.com/go-openapi/swag v0.19.5/go.mod h1:POnQmlKehdgb5mhVOsnJFsivZCEZ/vjK9gh66Z9tfKk= -github.com/go-openapi/swag v0.19.14 h1:gm3vOOXfiuw5i9p5N9xJvfjvuofpyvLA9Wr6QfK5Fng= -github.com/go-openapi/swag v0.19.14/go.mod h1:QYRuS/SOXUCsnplDa677K7+DxSOj6IPNl/eQntq43wQ= -github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= -github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= -github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= -github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= -github.com/golang/groupcache v0.0.0-20191227052852-215e87163ea7/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= -github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= -github.com/golang/mock v1.1.1/go.mod 
h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= -github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= -github.com/golang/mock v1.3.1/go.mod h1:sBzyDLLjw3U8JLTeZvSv8jJB+tU5PVekmnlKIyFUx0Y= -github.com/golang/mock v1.4.0/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= -github.com/golang/mock v1.4.1/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= -github.com/golang/mock v1.4.3/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= -github.com/golang/mock v1.4.4/go.mod h1:l3mdAwkq5BuhzHwde/uurv3sEJeZMXNpwsxVWU71h+4= -github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= -github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= -github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= -github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= -github.com/golang/protobuf v1.3.4/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= -github.com/golang/protobuf v1.3.5/go.mod h1:6O5/vntMXwX2lRkT1hjjk0nAC1IDOTvTlVgjlRvqsdk= -github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8= -github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA= -github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= -github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= -github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= -github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8= -github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= -github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= -github.com/golang/protobuf v1.5.2 h1:ROPKBNFfQgOUMifHyP+KYbvpjbdoFNs+aK7DXlji0Tw= 
-github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= -github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= -github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= -github.com/google/gnostic v0.5.7-v3refs h1:FhTMOKj2VhjpouxvWJAV1TL304uMlb9zcDqkl6cEI54= -github.com/google/gnostic v0.5.7-v3refs/go.mod h1:73MKFl6jIHelAJNaBGFzt3SPtZULs9dYrGFt8OiIsHQ= -github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= -github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= -github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= -github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.4.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.5.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.5.8 h1:e6P7q2lk1O+qJJb4BtCQXlK8vWEO8V1ZeuEdJNOqZyg= -github.com/google/go-cmp v0.5.8/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= -github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/gofuzz v1.1.0 h1:Hsa8mG0dQ46ij8Sl2AYJDUv1oA9/d6Vk+3LG99Oe02g= -github.com/google/gofuzz v1.1.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs= -github.com/google/martian/v3 v3.0.0/go.mod h1:y5Zk1BBys9G+gd6Jrk0W3cC1+ELVxBWuIGO+w/tUAp0= -github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= -github.com/google/pprof v0.0.0-20190515194954-54271f7e092f/go.mod 
h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= -github.com/google/pprof v0.0.0-20191218002539-d4f498aebedc/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= -github.com/google/pprof v0.0.0-20200212024743-f11f1df84d12/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= -github.com/google/pprof v0.0.0-20200229191704-1ebb73c60ed3/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= -github.com/google/pprof v0.0.0-20200430221834-fc25d7d30c6d/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= -github.com/google/pprof v0.0.0-20200708004538-1a94d8640e99/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= -github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= -github.com/google/uuid v1.1.2 h1:EVhdT+1Kseyi1/pUmXKaFxYsDNy9RQYkMWRH68J/W7Y= -github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg= -github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk= -github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 h1:EGx4pi6eqNxGaHF6qqu48+N2wcFQ5qg5FXgOdqsJ5d8= -github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= -github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= -github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= -github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= -github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= -github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= -github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= -github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= -github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod 
h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU= -github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk= -github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo= -github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU= -github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= -github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= -github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= -github.com/kr/pretty v0.2.0/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= -github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= -github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= -github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= -github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= -github.com/kubeflow/common v0.4.3 h1:vVoOMNPOZK4wzZvQ4rsRLvC3SDi+J1fVKNHSXC/QRvU= -github.com/kubeflow/common v0.4.3/go.mod h1:Qb/5aON7/OWVkN8OnjRqqT0i8X/XzMekRIZ8lkLosj4= -github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= -github.com/mailru/easyjson v0.0.0-20190626092158-b2ccc519800e/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= -github.com/mailru/easyjson v0.7.6 h1:8yTIVnZgCoiM1TgqoeTl+LfU5Jg6/xL3QhGQnimLYnA= -github.com/mailru/easyjson v0.7.6/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= -github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= -github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= -github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= -github.com/modern-go/reflect2 v1.0.2 
h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= -github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= -github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= -github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= -github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e h1:fD57ERR4JtEqsWbfPhv4DMiApHyliiK5xCTNVSPiaAs= -github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= -github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= -github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= -github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d h1:zE9ykElWQ6/NYmHa3jpm/yHnI4xSofP+UP6SpjHcSeM= -github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc= -github.com/smartystreets/goconvey v1.6.4 h1:fv0U8FUIMPNf1L9lnHLvLhgicrIVChEkdzIKYqbNC9s= -github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA= -github.com/spf13/afero v1.2.2/go.mod h1:9ZxEEn6pIJ8Rxe320qSDBk6AsU0r9pR7Q4OcevTdifk= -github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= -github.com/stoewer/go-strcase v1.2.0/go.mod h1:IBiWB2sKIp3wVVQ3Y035++gc+knqhUQag1KpM8ahLw8= -github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= -github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= -github.com/stretchr/testify 
v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= -github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PKk= -github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= -go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= -go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= -go.opencensus.io v0.22.3/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= -go.opencensus.io v0.22.4/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= -golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= -golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= -golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= -golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= -golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= -golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= -golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= -golang.org/x/exp v0.0.0-20190829153037-c13cbed26979/go.mod h1:86+5VVa7VpoJ4kLfm080zCjGlMRFzhUhsZKEZO7MGek= 
-golang.org/x/exp v0.0.0-20191030013958-a1ab85dbe136/go.mod h1:JXzH8nQsPlswgeRAPE3MuO9GYsAcnJvJ4vnMwN/5qkY= -golang.org/x/exp v0.0.0-20191129062945-2f5052295587/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= -golang.org/x/exp v0.0.0-20191227195350-da58074b4299/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= -golang.org/x/exp v0.0.0-20200119233911-0405dc783f0a/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= -golang.org/x/exp v0.0.0-20200207192155-f17229e696bd/go.mod h1:J/WKrq2StrnmMY6+EHIKF9dgMWnmCNThgcyBT1FY9mM= -golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6/go.mod h1:3jZMyOhIsHpP37uCMkUooju7aAi5cS1Q23tOzKc+0MU= -golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= -golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= -golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= -golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= -golang.org/x/lint v0.0.0-20190301231843-5614ed5bae6f/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= -golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= -golang.org/x/lint v0.0.0-20190409202823-959b441ac422/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= -golang.org/x/lint v0.0.0-20190909230951-414d861bb4ac/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= -golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= -golang.org/x/lint v0.0.0-20191125180803-fdd1cda4f05f/go.mod h1:5qLYkcX4OjUUV8bRuDixDT3tpyyb+LUpUlRWLxfhWrs= -golang.org/x/lint v0.0.0-20200130185559-910be7a94367/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= -golang.org/x/lint v0.0.0-20200302205851-738671d3881b/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= -golang.org/x/mobile 
v0.0.0-20190312151609-d3739f865fa6/go.mod h1:z+o9i4GpDbdi3rU15maQ/Ox0txvL9dWGYEHz965HBQE= -golang.org/x/mobile v0.0.0-20190719004257-d2bd2a29d028/go.mod h1:E/iHnbuqvinMTCcRqshq8CkpyQDoeVncDDYHnLhea+o= -golang.org/x/mod v0.0.0-20190513183733-4bf6d317e70e/go.mod h1:mXi4GBBbnImb6dmsKGUJ2LatrhH/nqhxcFungHvyanc= -golang.org/x/mod v0.1.0/go.mod h1:0QHyrYULN0/3qlju5TqG8bIK38QM8yzMo5ekMj3DlcY= -golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= -golang.org/x/mod v0.1.1-0.20191107180719-034126e5016b/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= -golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= -golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= -golang.org/x/net v0.0.0-20190501004415-9ce7a6920f09/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= -golang.org/x/net v0.0.0-20190503192946-f4e77d36d62c/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= -golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= -golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20190628185345-da137c7871d7/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net 
v0.0.0-20190724013045-ca1201d0de80/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20190827160401-ba9fcec4b297/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20191209160850-c0dbc17a3553/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20200114155413-6afb5195e5aa/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20200222125558-5a598a2470a0/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20200301022130-244492dfa37a/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= -golang.org/x/net v0.0.0-20200501053045-e0ff5e5a1de5/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= -golang.org/x/net v0.0.0-20200506145744-7e3656a0809f/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= -golang.org/x/net v0.0.0-20200513185701-a91f0712d120/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= -golang.org/x/net v0.0.0-20200520182314-0ba52f642ac2/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= -golang.org/x/net v0.0.0-20200625001655-4c5254603344/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= -golang.org/x/net v0.0.0-20200707034311-ab3426394381/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= -golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= -golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.0.0-20220722155237-a158d28d115b h1:PxfKdU9lEEDYjdIzOtC4qFWgkU2rGHdKlKowJSMN9h0= -golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod 
h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= -golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= -golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= -golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= -golang.org/x/oauth2 v0.0.0-20191202225959-858c2ad4c8b6/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= -golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= -golang.org/x/oauth2 v0.0.0-20211104180415-d3ed0bb246c8 h1:RerP+noqYHUQ8CMRcPlC2nvTa4dcBIjegkuWdcUDuqg= -golang.org/x/oauth2 v0.0.0-20211104180415-d3ed0bb246c8/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= -golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20200317015054-43a5402ce75a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20200625203802-6e8e738ad208/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod 
h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190502145724-3ef323f4f1fd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190507160741-ecd444e8653b/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190606165138-5da285871e9c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190624142023-c5567b49c5d0/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190726091711-fc99dfbffb4e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20191001151750-bb3f8db39f24/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20191204072324-ce4227a45e2e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20191228213918-04cbcbbfeed8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200113162924-86b910548bc1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200122134326-e047566fdf82/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200212091648-12a6c2dcc1e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200302150141-5c8b2ff67527/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200331124033-c3d80250170d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 
-golang.org/x/sys v0.0.0-20200501052902-10377860bb8e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200511232937-7e40ca221e25/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200515095857-1151b9dac4a9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200523222454-059865788121/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200803210538-64077c9b5642/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.8.0 h1:EBmGv8NaZBZTWvrbjNoL6HVt+IVy3QDQpJs7VRIw3tU= -golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/term v0.0.0-20210927222741-03fcf44c2211 h1:JGgROgKl9N8DuW20oFS5gxc+lE67/N3FcwmBPMe7ArY= -golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= -golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= -golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk= -golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= -golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= -golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= -golang.org/x/time 
v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= -golang.org/x/time v0.0.0-20220210224613-90d013bbcef8 h1:vVKdlvoWBphwdxWKrFZEuM0kGgGLxUOYcY4U/2Vjg44= -golang.org/x/time v0.0.0-20220210224613-90d013bbcef8/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= -golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= -golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= -golang.org/x/tools v0.0.0-20190312151545-0bb0c0a6e846/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= -golang.org/x/tools v0.0.0-20190312170243-e65039ee4138/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= -golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= -golang.org/x/tools v0.0.0-20190425150028-36563e24a262/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= -golang.org/x/tools v0.0.0-20190506145303-2d16b83fe98c/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= -golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= -golang.org/x/tools v0.0.0-20190606124116-d0a3d012864b/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= -golang.org/x/tools v0.0.0-20190621195816-6e04913cbbac/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= -golang.org/x/tools v0.0.0-20190628153133-6cdbf07be9d0/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= -golang.org/x/tools v0.0.0-20190816200558-6889da9d5479/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20190911174233-4f2ddba30aff/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools 
v0.0.0-20191012152004-8de300cfc20a/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20191113191852-77e3bb0ad9e7/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20191115202509-3a792d9c32b2/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20191125144606-a911d9008d1f/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20191130070609-6e064ea0cf2d/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20191216173652-a0e659d51361/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= -golang.org/x/tools v0.0.0-20191227053925-7b8e75db28f4/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= -golang.org/x/tools v0.0.0-20200117161641-43d50277825c/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= -golang.org/x/tools v0.0.0-20200122220014-bf1340f18c4a/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= -golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= -golang.org/x/tools v0.0.0-20200204074204-1cc6d1ef6c74/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= -golang.org/x/tools v0.0.0-20200207183749-b753a1ba74fa/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= -golang.org/x/tools v0.0.0-20200212150539-ea181f53ac56/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= -golang.org/x/tools v0.0.0-20200224181240-023911ca70b2/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= -golang.org/x/tools v0.0.0-20200227222343-706bc42d1f0d/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= -golang.org/x/tools v0.0.0-20200304193943-95d2e580d8eb/go.mod h1:o4KQGtdN14AW+yjsvvwRTJJuXz8XRtIHtEnmAXLyFUw= -golang.org/x/tools v0.0.0-20200312045724-11d5b4c81c7d/go.mod h1:o4KQGtdN14AW+yjsvvwRTJJuXz8XRtIHtEnmAXLyFUw= -golang.org/x/tools 
v0.0.0-20200331025713-a30bf2db82d4/go.mod h1:Sl4aGygMT6LrqrWclx+PTx3U+LnKx/seiNR+3G19Ar8= -golang.org/x/tools v0.0.0-20200501065659-ab2804fb9c9d/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= -golang.org/x/tools v0.0.0-20200512131952-2bc93b1c0c88/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= -golang.org/x/tools v0.0.0-20200515010526-7d3b6ebf133d/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= -golang.org/x/tools v0.0.0-20200618134242-20370b0cb4b2/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= -golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= -golang.org/x/tools v0.0.0-20200729194436-6467de6f59a7/go.mod h1:njjCfa9FT2d7l9Bc6FUM5FLjQPp3cFF28FI3qnDFljA= -golang.org/x/tools v0.0.0-20200804011535-6c149bb5ef0d/go.mod h1:njjCfa9FT2d7l9Bc6FUM5FLjQPp3cFF28FI3qnDFljA= -golang.org/x/tools v0.0.0-20200825202427-b303f430e36d/go.mod h1:njjCfa9FT2d7l9Bc6FUM5FLjQPp3cFF28FI3qnDFljA= -golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE= -google.golang.org/api v0.7.0/go.mod h1:WtwebWUNSVBH/HAw79HIFXZNqEvBhG+Ra+ax0hx3E3M= -google.golang.org/api v0.8.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg= -google.golang.org/api v0.9.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg= -google.golang.org/api v0.13.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= -google.golang.org/api v0.14.0/go.mod 
h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= -google.golang.org/api v0.15.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= -google.golang.org/api v0.17.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= -google.golang.org/api v0.18.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= -google.golang.org/api v0.19.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= -google.golang.org/api v0.20.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= -google.golang.org/api v0.22.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= -google.golang.org/api v0.24.0/go.mod h1:lIXQywCXRcnZPGlsd8NbLnOjtAoL6em04bJ9+z0MncE= -google.golang.org/api v0.28.0/go.mod h1:lIXQywCXRcnZPGlsd8NbLnOjtAoL6em04bJ9+z0MncE= -google.golang.org/api v0.29.0/go.mod h1:Lcubydp8VUV7KeIHD9z2Bys/sm/vGKnG1UHuDBSrHWM= -google.golang.org/api v0.30.0/go.mod h1:QGmEvQ87FHZNiUVJkT14jQNYJ4ZJjdRF23ZXz5138Fc= -google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= -google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= -google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= -google.golang.org/appengine v1.6.1/go.mod h1:i06prIuMbXzDqacNJfV5OdTW448YApPu5ww/cMBSeb0= -google.golang.org/appengine v1.6.5/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= -google.golang.org/appengine v1.6.6/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= -google.golang.org/appengine v1.6.7 h1:FZR1q0exgwxzPzp/aF+VccGrSfxfPpkBqjIIEq3ru6c= -google.golang.org/appengine v1.6.7/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= -google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= -google.golang.org/genproto v0.0.0-20190307195333-5fe7a883aa19/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= -google.golang.org/genproto v0.0.0-20190418145605-e7d98fc518a7/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= 
-google.golang.org/genproto v0.0.0-20190425155659-357c62f0e4bb/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= -google.golang.org/genproto v0.0.0-20190502173448-54afdca5d873/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= -google.golang.org/genproto v0.0.0-20190801165951-fa694d86fc64/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= -google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= -google.golang.org/genproto v0.0.0-20190911173649-1774047e7e51/go.mod h1:IbNlFCBrqXvoKpeg0TB2l7cyZUmoaFKYIwrEpbDKLA8= -google.golang.org/genproto v0.0.0-20191108220845-16a3f7862a1a/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= -google.golang.org/genproto v0.0.0-20191115194625-c23dd37a84c9/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= -google.golang.org/genproto v0.0.0-20191216164720-4f79533eabd1/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= -google.golang.org/genproto v0.0.0-20191230161307-f3c370f40bfb/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= -google.golang.org/genproto v0.0.0-20200115191322-ca5a22157cba/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= -google.golang.org/genproto v0.0.0-20200122232147-0452cf42e150/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= -google.golang.org/genproto v0.0.0-20200204135345-fa8e72b47b90/go.mod h1:GmwEX6Z4W5gMy59cAlVYjN9JhxgbQH6Gn+gFDQe2lzA= -google.golang.org/genproto v0.0.0-20200212174721-66ed5ce911ce/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= -google.golang.org/genproto v0.0.0-20200224152610-e50cd9704f63/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= -google.golang.org/genproto v0.0.0-20200228133532-8c2c7df3a383/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= -google.golang.org/genproto v0.0.0-20200305110556-506484158171/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= -google.golang.org/genproto v0.0.0-20200312145019-da6875a35672/go.mod 
h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= -google.golang.org/genproto v0.0.0-20200331122359-1ee6d9798940/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= -google.golang.org/genproto v0.0.0-20200430143042-b979b6f78d84/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= -google.golang.org/genproto v0.0.0-20200511104702-f5ebc3bea380/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= -google.golang.org/genproto v0.0.0-20200515170657-fc4c6c6a6587/go.mod h1:YsZOwe1myG/8QRHRsmBRE1LrgQY60beZKjly0O1fX9U= -google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= -google.golang.org/genproto v0.0.0-20200618031413-b414f8b61790/go.mod h1:jDfRM7FcilCzHH/e9qn6dsT145K34l5v+OpcnNgKAAA= -google.golang.org/genproto v0.0.0-20200729003335-053ba62fc06f/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= -google.golang.org/genproto v0.0.0-20200804131852-c06518451d9c/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= -google.golang.org/genproto v0.0.0-20200825200019-8632dd797987/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= -google.golang.org/genproto v0.0.0-20201019141844-1ed22bb0c154/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= -google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= -google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= -google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= -google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= -google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= -google.golang.org/grpc v1.26.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= -google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= -google.golang.org/grpc v1.27.1/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= -google.golang.org/grpc v1.28.0/go.mod 
h1:rpkK4SK4GF4Ach/+MFLZUBavHOvF2JJB5uozKKal+60= -google.golang.org/grpc v1.29.1/go.mod h1:itym6AZVZYACWQqET3MqgPpjcuV5QH3BxFS3IjizoKk= -google.golang.org/grpc v1.30.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak= -google.golang.org/grpc v1.31.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak= -google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= -google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= -google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= -google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= -google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= -google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= -google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= -google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= -google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGjtUeSXeh4= -google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c= -google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= -google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= -google.golang.org/protobuf v1.28.0 h1:w43yiav+6bVFTBQFZX0r7ipe9JQ1QsbMgHwbBziscLw= -google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= -gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 
v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f h1:BLraFXnmrev5lT+xlilqcH8XK9/i0At2xKjWk4p6zsU= -gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= -gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= -gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= -gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= -gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= -gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= -honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= -honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= -honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= -honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt0JzvZhAg= -honnef.co/go/tools v0.0.1-2020.1.3/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k= -honnef.co/go/tools v0.0.1-2020.1.4/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k= -k8s.io/api v0.25.3 h1:Q1v5UFfYe87vi5H7NU0p4RXC26PPMT8KOpr1TLQbCMQ= 
-k8s.io/api v0.25.3/go.mod h1:o42gKscFrEVjHdQnyRenACrMtbuJsVdP+WVjqejfzmI= -k8s.io/apimachinery v0.25.3 h1:7o9ium4uyUOM76t6aunP0nZuex7gDf8VGwkR5RcJnQc= -k8s.io/apimachinery v0.25.3/go.mod h1:jaF9C/iPNM1FuLl7Zuy5b9v+n35HGSh6AQ4HYRkCqwo= -k8s.io/client-go v0.25.3 h1:oB4Dyl8d6UbfDHD8Bv8evKylzs3BXzzufLiO27xuPs0= -k8s.io/client-go v0.25.3/go.mod h1:t39LPczAIMwycjcXkVc+CB+PZV69jQuNx4um5ORDjQA= -k8s.io/klog/v2 v2.0.0/go.mod h1:PBfzABfn139FHAV07az/IF9Wp1bkk3vpT2XSJ76fSDE= -k8s.io/klog/v2 v2.70.1 h1:7aaoSdahviPmR+XkS7FyxlkkXs6tHISSG03RxleQAVQ= -k8s.io/klog/v2 v2.70.1/go.mod h1:y1WjHnz7Dj687irZUWR/WLkLc5N1YHtjLdmgWjndZn0= -k8s.io/kube-openapi v0.0.0-20220803162953-67bda5d908f1 h1:MQ8BAZPZlWk3S9K4a9NCkIFQtZShWqoha7snGixVgEA= -k8s.io/kube-openapi v0.0.0-20220803162953-67bda5d908f1/go.mod h1:C/N6wCaBHeBHkHUesQOQy2/MZqGgMAFPqGsGQLdbZBU= -k8s.io/utils v0.0.0-20220728103510-ee6ede2d64ed h1:jAne/RjBTyawwAy0utX5eqigAwz/lQhTmy+Hr/Cpue4= -k8s.io/utils v0.0.0-20220728103510-ee6ede2d64ed/go.mod h1:jPW/WVKK9YHAvNhRxK0md/EJ228hCsBRufyofKtW8HA= -rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= -rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= -rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA= -sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 h1:iXTIw73aPyC+oRdyqqvVJuloN1p0AC/kzH07hu3NE+k= -sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= -sigs.k8s.io/structured-merge-diff/v4 v4.2.3 h1:PRbqxJClWWYMNV1dhaG4NsibJbArud9kFxnAMREiWFE= -sigs.k8s.io/structured-merge-diff/v4 v4.2.3/go.mod h1:qjx8mGObPmV2aSZepjQjbmb2ihdVs8cGKBraizNC69E= -sigs.k8s.io/yaml v1.3.0 h1:a2VclLzOGrwOHDiV8EfBGhvjHvP46CtW5j6POvhYGGo= -sigs.k8s.io/yaml v1.3.0/go.mod h1:GeOyir5tyXNByN85N/dRIT9es5UQNerPYEKK56eTBm8= diff --git a/mind-cluster/component/npu-exporter/.gitignore b/mind-cluster/component/npu-exporter/.gitignore deleted file mode 100644 index 723ef36..0000000 --- 
a/mind-cluster/component/npu-exporter/.gitignore +++ /dev/null @@ -1 +0,0 @@ -.idea \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/LICENSE b/mind-cluster/component/npu-exporter/LICENSE deleted file mode 100644 index f49a4e1..0000000 --- a/mind-cluster/component/npu-exporter/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. 
- - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. 
- - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. 
- - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/README.md b/mind-cluster/component/npu-exporter/README.md deleted file mode 100644 index 4bde4a9..0000000 --- a/mind-cluster/component/npu-exporter/README.md +++ /dev/null @@ -1,42 +0,0 @@ -# NPU-Exporter - -# 组件介绍 - - -Prometheus(普罗米修斯)是一个开源的系统监测和警报工具包,Exporter就是专门为Prometheus提供数据源的组件。由于Prometheus社区的活跃和大量的使用,已经有很多厂商或者服务提供了Exporter,如Prometheus官方的Node Exporter,MySQL官方出的MySQL Server Exporter和NVIDA的NVIDIA GPU Exporter。这些Exporter负责将特定监测对象的指标,转成Prometheus能够识别的数据格式,供Prometheus集成。NPU-Expoter是华为自研的专门收集华为NPU各种监测信息和指标,并封装成Prometheus专用数据格式的一个服务组件。 - - -# 编译NPU-Exporter - -1. 通过git拉取源码,获得npu-exporter。 - - 示例:Npu-Exporter源码放在/home/mind-cluster/component/npu-exporter目录下 - -2. 执行以下命令,进入Npu-Exporter构建目录,执行构建脚本,在“output“目录下生成二进制npu-exporter、yaml文件和Dockerfile等文件。 - - **cd** _/home/mind-cluster/component/_**npu-exporter/build/** - - **chmod +x build.sh** - - **./build.sh** - -3. 执行以下命令,查看**output**生成的软件列表。 - - **ll** _/home/mind-cluster/component/_**npu-exporter/output** - - ``` - drwxr-xr-x 2 root root 4096 Feb 23 07:10 . - drwxr-xr-x 10 root root 4096 Feb 23 07:10 .. 
- -r-------- 1 root root 623 Feb 23 07:10 Dockerfile - -r-------- 1 root root 623 Feb 23 07:10 Dockerfile-310P-1usoc - -r-------- 1 root root 623 Feb 23 07:10 metricConfiguration.json - -r-x------ 1 root root 25481072 Feb 23 07:10 npu-exporter - -r-------- 1 root root 3438 Feb 23 07:10 npu-exporter-310P-1usoc-v6.0.0.yaml - -r-------- 1 root root 3438 Feb 23 07:10 npu-exporter-v6.0.0.yaml - -r-------- 1 root root 623 Feb 23 07:10 pluginConfiguration.json - -r-x------ 1 root root 2579 Feb 23 07:10 run_for_310P_1usoc.sh - ``` - -# 说明 - -1. 当前Npu-Exporter仅支持http启动,如果需要使用https启动,请自行完成代码修改并适配Prometheus \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/build/Dockerfile b/mind-cluster/component/npu-exporter/build/Dockerfile deleted file mode 100644 index 24f9943..0000000 --- a/mind-cluster/component/npu-exporter/build/Dockerfile +++ /dev/null @@ -1,21 +0,0 @@ -FROM ubuntu:22.04 - -RUN useradd -d /home/HwHiAiUser -u 1000 -m -s /usr/sbin/nologin HwHiAiUser &&\ - usermod root -s /usr/sbin/nologin - -COPY ./npu-exporter /usr/local/bin/ -COPY ./metricConfiguration.json /usr/local/metricConfiguration.json -COPY ./pluginConfiguration.json /usr/local/pluginConfiguration.json - -RUN chown root:root /usr/local/bin/npu-exporter &&\ - chmod 750 -R /home/HwHiAiUser &&\ - chmod 550 /usr/local/bin/ &&\ - chmod 500 /usr/local/bin/npu-exporter &&\ - chmod 440 /usr/local/metricConfiguration.json &&\ - chmod 440 /usr/local/pluginConfiguration.json &&\ - echo 'umask 027' >> /etc/profile && \ - echo 'source /etc/profile' >> ~/.bashrc -ENV LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/add-ons:/usr/local/Ascend/driver/lib64:/usr/local/dcmi - -CMD /usr/local/bin/npu-exporter - diff --git a/mind-cluster/component/npu-exporter/build/Dockerfile-310P-1usoc b/mind-cluster/component/npu-exporter/build/Dockerfile-310P-1usoc deleted file mode 100644 index 5927f7d..0000000 --- 
a/mind-cluster/component/npu-exporter/build/Dockerfile-310P-1usoc +++ /dev/null @@ -1,31 +0,0 @@ -FROM ubuntu:22.04 - -RUN groupadd -g 1000 HwHiAiUser && useradd -u 1000 -g HwHiAiUser -d /home/HwHiAiUser -m HwHiAiUser &&\ - groupadd -g 1101 HwDmUser && useradd -u 1101 -g HwDmUser -d /home/HwDmUser -m HwDmUser &&\ - groupadd -g 1102 HwBaseUser && useradd -u 1102 -g HwBaseUser -d /home/HwBaseUser -m HwBaseUser &&\ - usermod -a -G HwBaseUser HwHiAiUser &&\ - usermod -a -G HwDmUser HwHiAiUser &&\ - usermod -a -G HwBaseUser HwDmUser &&\ - usermod -a -G HwHiAiUser HwDmUser &&\ - usermod root -s /usr/sbin/nologin - -COPY ./npu-exporter /usr/local/bin/ -COPY ./run_for_310P_1usoc.sh / -COPY ./metricConfiguration.json /usr/local/metricConfiguration.json -COPY ./pluginConfiguration.json /usr/local/pluginConfiguration.json - -RUN chown root:root /usr/local/bin/npu-exporter &&\ - chmod 500 /run_for_310P_1usoc.sh &&\ - chmod 550 /usr/local/bin/ &&\ - chmod 500 /usr/local/bin/npu-exporter &&\ - chmod 440 /usr/local/metricConfiguration.json &&\ - chmod 440 /usr/local/pluginConfiguration.json &&\ - echo 'umask 027' >> /etc/profile && \ - echo 'source /etc/profile' >> ~/.bashrc - -RUN ln -s /lib /lib64 2>&1 >> /dev/null &&\ - mkdir -m 750 /var/driver -m 750 /var/dmp -m 750 /usr/slog -p -m 750 /home/drv/hdc_ppc &&\ - chown HwDmUser:HwDmUser /var/dmp &&\ - chown HwHiAiUser:HwHiAiUser /var/driver &&\ - chown HwHiAiUser:HwHiAiUser /home/drv/hdc_ppc &&\ - chown HwHiAiUser:HwHiAiUser /usr/slog \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/build/build.sh b/mind-cluster/component/npu-exporter/build/build.sh deleted file mode 100644 index 16c101d..0000000 --- a/mind-cluster/component/npu-exporter/build/build.sh +++ /dev/null @@ -1,80 +0,0 @@ -#!/bin/bash -# Perform build npu-exporter -# Copyright @ Huawei Technologies CO., Ltd. 2020-2023. 
All rights reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ - -set -e -CUR_DIR=$(dirname $(readlink -f $0)) -TOP_DIR=$(realpath "${CUR_DIR}"/..) -export GO111MODULE="on" -VER_FILE="${TOP_DIR}"/service_config.ini -build_version="v6.0.0" -if [ -f "$VER_FILE" ]; then - line=$(sed -n '1p' "$VER_FILE" 2>&1) - #cut the chars after ':' and add char 'v', the final example is v3.0.0 - build_version="v"${line#*=} -fi - -arch=$(arch 2>&1) -echo "Build Architecture is" "${arch}" - -OUTPUT_NAME="npu-exporter" -DOCKER_FILE_NAME="Dockerfile" -A200ISOC_DOCKER_FILE_NAME="Dockerfile-310P-1usoc" -A200ISOC_RUN_SHELL="run_for_310P_1usoc.sh" - -function clean() { - rm -rf "${TOP_DIR}"/output - mkdir -p "${TOP_DIR}"/output -} - -function build() { - cd "${TOP_DIR}/cmd/npu-exporter" - CGO_CFLAGS="-fstack-protector-strong -D_FORTIFY_SOURCE=2 -O2 -fPIC -ftrapv" - CGO_CPPFLAGS="-fstack-protector-strong -D_FORTIFY_SOURCE=2 -O2 -fPIC -ftrapv" - go build -mod=mod -buildmode=pie -ldflags "-s -extldflags=-Wl,-z,now -X huawei.com/npu-exporter/v6/versions.BuildName=${OUTPUT_NAME} \ - -X huawei.com/npu-exporter/v6/versions.BuildVersion=${build_version}_linux-${arch}" \ - -o ${OUTPUT_NAME} - ls ${OUTPUT_NAME} - if [ $? 
-ne 0 ]; then - echo "fail to find npu-exporter" - exit 1 - fi -} - -function mv_file() { - mv "${TOP_DIR}"/cmd/npu-exporter/${OUTPUT_NAME} "${TOP_DIR}"/output - cp "${TOP_DIR}"/build/npu-exporter.yaml "${TOP_DIR}"/output/npu-exporter-"${build_version}".yaml - cp "${TOP_DIR}"/build/npu-exporter-310P-1usoc.yaml "${TOP_DIR}"/output/npu-exporter-310P-1usoc-"${build_version}".yaml - cp "${TOP_DIR}"/build/metricConfiguration.json "${TOP_DIR}"/output/ - cp "${TOP_DIR}"/build/pluginConfiguration.json "${TOP_DIR}"/output/ - sed -i "s/npu-exporter:.*/npu-exporter:${build_version}/" "${TOP_DIR}"/output/npu-exporter-"${build_version}".yaml - sed -i "s/npu-exporter:.*/npu-exporter:${build_version}/" "${TOP_DIR}"/output/npu-exporter-310P-1usoc-"${build_version}".yaml - cp "${TOP_DIR}"/build/${DOCKER_FILE_NAME} "${TOP_DIR}"/output - cp "${TOP_DIR}"/build/${A200ISOC_DOCKER_FILE_NAME} "${TOP_DIR}"/output - cp "${TOP_DIR}"/build/${A200ISOC_RUN_SHELL} "${TOP_DIR}"/output - chmod 400 "${TOP_DIR}"/output/* - chmod 500 "${TOP_DIR}"/output/${OUTPUT_NAME} - chmod 500 "${TOP_DIR}"/output/${A200ISOC_RUN_SHELL} - -} - -function main() { - clean - build - mv_file -} - -main diff --git a/mind-cluster/component/npu-exporter/build/build_ch.sh b/mind-cluster/component/npu-exporter/build/build_ch.sh deleted file mode 100644 index 878fcbd..0000000 --- a/mind-cluster/component/npu-exporter/build/build_ch.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/bin/bash -# Perform build npu-exporter -# Copyright @ Huawei Technologies CO., Ltd. 2025-2025. All rights reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ - -set -e -CUR_DIR=$(dirname $(readlink -f $0)) -TOP_DIR=$(realpath "${CUR_DIR}"/..) -export GO111MODULE="on" -VER_FILE="${TOP_DIR}"/service_config.ini -build_version="v6.0.0" -if [ -f "$VER_FILE" ]; then - line=$(sed -n '1p' "$VER_FILE" 2>&1) - #cut the chars after ':' and add char 'v', the final example is v3.0.0 - build_version="v"${line#*=} -fi - -arch=$(arch 2>&1) -echo "Build Architecture is" "${arch}" - -OUTPUT_NAME="npu-exporter" -DOCKER_FILE_NAME="Dockerfile" - - -function clean() { - rm -rf "${TOP_DIR}"/output - mkdir -p "${TOP_DIR}"/output -} - -function build() { - cd "${TOP_DIR}/cmd/npu-exporter" - CGO_CFLAGS="-fstack-protector-strong -D_FORTIFY_SOURCE=2 -O2 -fPIC -ftrapv" - CGO_CPPFLAGS="-fstack-protector-strong -D_FORTIFY_SOURCE=2 -O2 -fPIC -ftrapv" - go build -mod=mod -buildmode=pie -ldflags "-s -extldflags=-Wl,-z,now -X huawei.com/npu-exporter/v6/versions.BuildName=${OUTPUT_NAME} \ - -X huawei.com/npu-exporter/v6/versions.BuildVersion=${build_version}_linux-${arch}" \ - -o ${OUTPUT_NAME} - ls ${OUTPUT_NAME} - if [ $? 
-ne 0 ]; then - echo "fail to find npu-exporter" - exit 1 - fi -} - -function mv_file() { - mv "${TOP_DIR}"/cmd/npu-exporter/${OUTPUT_NAME} "${TOP_DIR}"/output - cp "${TOP_DIR}"/build/npu-exporter.yaml "${TOP_DIR}"/output/npu-exporter-"${build_version}".yaml - sed -i "s/npu-exporter:.*/npu-exporter:${build_version}/" "${TOP_DIR}"/output/npu-exporter-"${build_version}".yaml - sed -i "s/ascend*/alan/" "${TOP_DIR}"/output/npu-exporter-"${build_version}".yaml - - cp "${TOP_DIR}"/build/${DOCKER_FILE_NAME} "${TOP_DIR}"/output - chmod 400 "${TOP_DIR}"/output/* - chmod 500 "${TOP_DIR}"/output/${OUTPUT_NAME} - -} - -function main() { - clean - build - mv_file -} - -main diff --git a/mind-cluster/component/npu-exporter/build/metricConfiguration.json b/mind-cluster/component/npu-exporter/build/metricConfiguration.json deleted file mode 100644 index 3dbd82b..0000000 --- a/mind-cluster/component/npu-exporter/build/metricConfiguration.json +++ /dev/null @@ -1,13 +0,0 @@ -[ - {"metricsGroup": "ddr", "state": "ON"}, - {"metricsGroup": "hccs", "state": "ON"}, - {"metricsGroup": "npu", "state": "ON"}, - {"metricsGroup": "network", "state": "ON"}, - {"metricsGroup": "pcie", "state": "ON"}, - {"metricsGroup": "roce", "state": "ON"}, - {"metricsGroup": "sio", "state": "ON"}, - {"metricsGroup": "vnpu", "state": "ON"}, - {"metricsGroup": "version", "state": "ON"}, - {"metricsGroup": "optical", "state": "ON"}, - {"metricsGroup": "hbm", "state": "ON"} -] \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/build/npu-exporter-310P-1usoc.yaml b/mind-cluster/component/npu-exporter/build/npu-exporter-310P-1usoc.yaml deleted file mode 100644 index 3b6e22f..0000000 --- a/mind-cluster/component/npu-exporter/build/npu-exporter-310P-1usoc.yaml +++ /dev/null @@ -1,167 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - name: npu-exporter ---- -apiVersion: networking.k8s.io/v1 -kind: NetworkPolicy -metadata: - name: exporter-network-policy - namespace: npu-exporter -spec: - 
podSelector: - matchLabels: - app: npu-exporter - policyTypes: - - Ingress - - Egress - ingress: - - from: - - namespaceSelector: {} - podSelector: - matchLabels: - app: prometheus - egress: - - to: - - namespaceSelector: {} - podSelector: - matchLabels: - app: prometheus ---- -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: npu-exporter-310p-1usoc - namespace: npu-exporter -spec: - selector: - matchLabels: - app: npu-exporter - template: - metadata: - ##### For Kubernetes versions lower than 1.19, seccomp is used with annotations. - annotations: - seccomp.security.alpha.kubernetes.io/pod: runtime/default - labels: - app: npu-exporter - spec: - ##### For Kubernetes version 1.19 and above, seccomp is used with securityContext:seccompProfile -# securityContext: -# seccompProfile: -# type: RuntimeDefault - automountServiceAccountToken: false - nodeSelector: - workerselector: dls-worker-node - servertype: soc - containers: - - name: npu-exporter - image: npu-exporter:v5.0.RC1 - resources: - requests: - memory: 1000Mi - cpu: 1000m - limits: - memory: 1000Mi - cpu: 1000m - imagePullPolicy: Never - command: [ "/bin/bash", "-c", "/run_for_310P_1usoc.sh"] - # pair firstly - securityContext: - privileged: true - readOnlyRootFilesystem: true - runAsUser: 0 - runAsGroup: 0 - ports: - - name: http - containerPort: 8082 - protocol: TCP - volumeMounts: - - name: log-npu-exporter - mountPath: /var/log/mindx-dl/npu-exporter - - name: localtime - mountPath: /etc/localtime - readOnly: true - - name: ascend-driver - mountPath: /usr/local/Ascend/driver - readOnly: true - - name: ascend-dcmi - mountPath: /usr/local/dcmi - readOnly: true - - name: libyaml - mountPath: /usr/lib64/libyaml-0.so.2 - readOnly: true - - name: docker-shim # delete when only use containerd - mountPath: /run/dockershim.sock - readOnly: true - - name: docker # delete when only use containerd - mountPath: /run/docker/containerd/containerd.sock - readOnly: true - - name: cri-dockerd # reserve when k8s 
version is 1.24+ and the container runtime is docker - mountPath: /var/run/cri-dockerd.sock - readOnly: true - - name: containerd - mountPath: /run/containerd - readOnly: true - - name: tmp - mountPath: /tmp - - name: dmp - mountPath: /var/dmp_daemon - readOnly: true - - name: slogd - mountPath: /var/slogd - readOnly: true - - name: hbasic - mountPath: /etc/hdcBasic.cfg - readOnly: true - - name: slogconf - mountPath: /etc/slog.conf - readOnly: true - volumes: - - name: log-npu-exporter - hostPath: - path: /var/log/mindx-dl/npu-exporter - type: Directory - - name: localtime - hostPath: - path: /etc/localtime - - name: libyaml - hostPath: - path: /usr/lib64/libyaml-0.so.2 - type: File - - name: ascend-driver - hostPath: - path: /usr/local/Ascend/driver - - name: ascend-dcmi - hostPath: - path: /usr/local/dcmi - - name: docker-shim # delete when only use containerd - hostPath: - path: /run/dockershim.sock - - name: docker # delete when only use containerd - hostPath: - path: /run/docker/containerd/containerd.sock - - name: cri-dockerd # reserve when k8s version is 1.24+ and the container runtime is docker - hostPath: - path: /var/run/cri-dockerd.sock - - name: containerd - hostPath: - path: /run/containerd - - name: tmp - hostPath: - path: /tmp - - name: dmp - hostPath: - path: /var/dmp_daemon - type: File - - name: slogd - hostPath: - path: /var/slogd - type: File - - name: hbasic - hostPath: - path: /etc/hdcBasic.cfg - type: File - - name: slogconf - hostPath: - path: /etc/slog.conf - type: File \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/build/npu-exporter.yaml b/mind-cluster/component/npu-exporter/build/npu-exporter.yaml deleted file mode 100644 index 970e3cf..0000000 --- a/mind-cluster/component/npu-exporter/build/npu-exporter.yaml +++ /dev/null @@ -1,140 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - name: npu-exporter ---- -apiVersion: networking.k8s.io/v1 -kind: NetworkPolicy -metadata: - name: exporter-network-policy - 
namespace: npu-exporter -spec: - podSelector: - matchLabels: - app: npu-exporter - policyTypes: - - Ingress - - Egress - ingress: - - from: - - namespaceSelector: {} - podSelector: - matchLabels: - app: prometheus - egress: - - to: - - namespaceSelector: {} - podSelector: - matchLabels: - app: prometheus ---- -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: npu-exporter - namespace: npu-exporter -spec: - selector: - matchLabels: - app: npu-exporter - template: - metadata: - ##### For Kubernetes versions lower than 1.19, seccomp is used with annotations. - annotations: - seccomp.security.alpha.kubernetes.io/pod: runtime/default - labels: - app: npu-exporter - spec: - ##### For Kubernetes version 1.19 and above, seccomp is used with securityContext:seccompProfile -# securityContext: -# seccompProfile: -# type: RuntimeDefault - automountServiceAccountToken: false - nodeSelector: - workerselector: dls-worker-node - containers: - - name: npu-exporter - image: npu-exporter:v5.0.RC1 - resources: - requests: - memory: 1000Mi - cpu: 1000m - limits: - memory: 1000Mi - cpu: 1000m - imagePullPolicy: Never - command: [ "/bin/bash", "-c", "--"] - # pair firstly - args: [ "umask 027;npu-exporter -port=8082 -ip=0.0.0.0 -updateTime=5 - -logFile=/var/log/mindx-dl/npu-exporter/npu-exporter.log -logLevel=0 -containerMode=docker" ] - securityContext: - privileged: true - readOnlyRootFilesystem: true - runAsUser: 0 - runAsGroup: 0 - ports: - - name: http - containerPort: 8082 - protocol: TCP - volumeMounts: - - name: log-npu-exporter - mountPath: /var/log/mindx-dl/npu-exporter - - name: localtime - mountPath: /etc/localtime - readOnly: true - - name: ascend-driver - mountPath: /usr/local/Ascend/driver - readOnly: true - - name: ascend-dcmi - mountPath: /usr/local/dcmi - readOnly: true - - name: docker-shim # delete when only use containerd or isula - mountPath: /var/run/dockershim.sock - readOnly: true - - name: docker # delete when only use containerd or isula - mountPath: 
/var/run/docker - readOnly: true - - name: cri-dockerd # reserve when k8s version is 1.24+ and the container runtime is docker - mountPath: /var/run/cri-dockerd.sock - readOnly: true - - name: containerd # delete when only use isula - mountPath: /run/containerd - readOnly: true - - name: isulad # delete when use containerd or docker - mountPath: /run/isulad.sock - readOnly: true - - name: tmp - mountPath: /tmp - volumes: - - name: log-npu-exporter - hostPath: - path: /var/log/mindx-dl/npu-exporter - type: Directory - - name: localtime - hostPath: - path: /etc/localtime - - name: ascend-driver - hostPath: - path: /usr/local/Ascend/driver - - name: ascend-dcmi - hostPath: - path: /usr/local/dcmi - - name: docker-shim # delete when only use containerd or isula - hostPath: - path: /var/run/dockershim.sock - - name: docker # delete when only use containerd or isula - hostPath: - path: /var/run/docker - - name: cri-dockerd # reserve when k8s version is 1.24+ and the container runtime is docker - hostPath: - path: /var/run/cri-dockerd.sock - - name: containerd # delete when only use isula - hostPath: - path: /run/containerd - - name: isulad # delete when use containerd or docker - hostPath: - path: /run/isulad.sock - - name: tmp - hostPath: - path: /tmp - diff --git a/mind-cluster/component/npu-exporter/build/pluginConfiguration.json b/mind-cluster/component/npu-exporter/build/pluginConfiguration.json deleted file mode 100644 index 68823e0..0000000 --- a/mind-cluster/component/npu-exporter/build/pluginConfiguration.json +++ /dev/null @@ -1,4 +0,0 @@ -[ - {"metricsGroup": "MyPlugin", "state": "OFF"}, - {"metricsGroup": "text", "state": "ON"} -] \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/build/run_for_310P_1usoc.sh b/mind-cluster/component/npu-exporter/build/run_for_310P_1usoc.sh deleted file mode 100644 index 055ed41..0000000 --- a/mind-cluster/component/npu-exporter/build/run_for_310P_1usoc.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash 
-# Perform build npu-exporter -# Copyright @ Huawei Technologies CO., Ltd. 2022-2022. All rights reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ - -set -e - -# log process run in background -echo -e "[INFO]\t $(date +"%F %T:%N")\t start slogd server in background" -su - HwHiAiUser -c "export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/:/usr/lib64 && /var/slogd -d &" -echo -e "[INFO]\t $(date +"%F %T:%N")\t start dmp_daemon server in background" -# dcmi interface process run in background -su - HwDmUser -c "export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/:/usr/lib64 && /var/dmp_daemon -I -M -U 8087 &" - -export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/add-ons:/usr/local/Ascend/driver/lib64:/usr/local/dcmi -# the host is openEuler, so the parameters "endpoint" and "containerd" are set to adapt to "-containerMode=docker" in default -# in openEuler os, the path of parameters "endpoint" and "containerd" are not in the default place -echo -e "[INFO]\t $(date +"%F %T:%N")\t start npu-exporter server" -/usr/local/bin/npu-exporter -port=8082 -ip=0.0.0.0 -updateTime=5 -logFile=/var/log/mindx-dl/npu-exporter/npu-exporter.log -logLevel=0 -containerMode=docker -endpoint=/run/dockershim.sock -containerd=/run/docker/containerd/containerd.sock - diff --git 
a/mind-cluster/component/npu-exporter/build/test.sh b/mind-cluster/component/npu-exporter/build/test.sh deleted file mode 100644 index 097eb3a..0000000 --- a/mind-cluster/component/npu-exporter/build/test.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/bin/bash -# Perform test for npu-exporter -# Copyright @ Huawei Technologies CO., Ltd. 2020-2020. All rights reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -set -e - - -# execute go test and echo result to report files -function execute_test() { - if ! (go test -v -race -coverprofile cov.out "${TOP_DIR}"/... >./"$file_input") - then - echo '****** go test cases error! ******' - cat $file_input - exit 1 - else - gocov convert cov.out | gocov-html >"$file_detail_output" - gotestsum --junitfile unit-tests.xml "${TOP_DIR}"/... - - total_coverage=$(go tool cover -func=cov.out | grep "total:" | awk '{print $3}'| sed 's/%//') - # round up - coverage=$(echo "$total_coverage" | awk '{if ($1 >= 0) print ($1 == int($1)) ? int($1) : int($1) + 1;\ - else print ($1 == int($1)) ? int($1) : int($1)}') - if [[ $coverage -ge 80 ]]; then - echo "coverage passed: $coverage%" - exit 0 - else - echo "coverage failed: $coverage%, it needs to be greater than 80%." 
- exit 1 - fi - fi -} - - -export GO111MODULE="on" -export PATH=$GOPATH/bin:$PATH -export GOFLAGS="-gcflags=all=-l" -unset GOPATH -# if didn't install the following tools, please install firstly -#go get -insecure github.com/axw/gocov/gocov -#go get github.com/matm/gocov-html -CUR_DIR=$(dirname "$(readlink -f "$0")") -TOP_DIR=$(realpath "${CUR_DIR}"/..) - -file_input='testExporter.txt' -file_detail_output='api.html' - -if [ -f "${TOP_DIR}"/test ]; then - rm -rf "${TOP_DIR}"/test -fi -mkdir -p "${TOP_DIR}"/test -cd "${TOP_DIR}"/test -echo "clean old version test results" - -if [ -f "$file_input" ]; then - rm -rf "$file_input" -fi -if [ -f "$file_detail_output" ]; then - rm -rf "$file_detail_output" -fi - -echo "************************************* Start LLT Test *************************************" -execute_test -echo "************************************* End LLT Test *************************************" diff --git a/mind-cluster/component/npu-exporter/cmd/npu-exporter/main.go b/mind-cluster/component/npu-exporter/cmd/npu-exporter/main.go deleted file mode 100644 index 700b248..0000000 --- a/mind-cluster/component/npu-exporter/cmd/npu-exporter/main.go +++ /dev/null @@ -1,545 +0,0 @@ -/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package main -package main - -import ( - "context" - "errors" - "flag" - "fmt" - "log" - "net" - "net/http" - "os" - "regexp" - "strconv" - "strings" - "sync" - "time" - - "github.com/influxdata/telegraf/plugins/common/shim" - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promhttp" - - "ascend-common/api" - "ascend-common/common-utils/hwlog" - "ascend-common/common-utils/limiter" - "ascend-common/devmanager" - "ascend-common/devmanager/common" - colcommon "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/config" - "huawei.com/npu-exporter/v6/collector/container" - _ "huawei.com/npu-exporter/v6/platforms/inputs/npu" - "huawei.com/npu-exporter/v6/platforms/prom" - "huawei.com/npu-exporter/v6/plugins" - "huawei.com/npu-exporter/v6/utils/logger" - "huawei.com/npu-exporter/v6/versions" -) - -var ( - port int - updateTime int - ip = "" - version bool - concurrency int - containerMode = "" - containerd = "" - endpoint = "" - limitIPReq = "" - platform = "" - textMetricsFilePath = "" - limitIPConn int - limitTotalConn int - cacheSize int - profilingTime int - hccsBWProfilingTime int - pollInterval time.Duration - deviceResetTimeout int -) - -const ( - portConst = 8082 - updateTimeConst = 5 - cacheTime = 100 * time.Second - portLeft = 1025 - portRight = 40000 - oneMinute = 60 - defaultConcurrency = 5 - defaultLogFile = "/var/log/mindx-dl/npu-exporter/npu-exporter.log" - containerModeDocker = "docker" - containerModeContainerd = "containerd" - containerModeIsula = "isula" - unixPre = "unix://" - timeout = 10 - maxHeaderBytes = 1024 - // tenDays ten days - tenDays = 10 - maxIPConnLimit = 128 - maxConcurrency = 512 - defaultConnection = 20 - maxProfilingTime = 2000 - minHccsBWProfilingTime = 1 - maxHccsBWProfilingTime = 1000 - defaultShutDownTimeout = 30 * time.Second -) - -const ( - prometheusPlatform = "Prometheus" - telegrafPlatform = "Telegraf" - pollIntervalStr = 
"poll_interval" - platformStr = "platform" - defaultProfilingTime = 200 - defaultHccsBwProfilingTime = 200 -) - -func main() { - flag.Parse() - if version { - fmt.Printf("NPU-exporter version: %s \n", versions.BuildVersion) - return - } - err := logger.InitLogger(platform) - if err != nil { - fmt.Fprintf(os.Stderr, "%v", err) - return - } - initPaprams() - err = paramValid(platform) - if err != nil { - return - } - dmgr, err := devmanager.AutoInit("", deviceResetTimeout) - if err != nil { - logger.Errorf("new npu collector failed, error is %v", err) - return - } - logger.Infof("npu exporter starting and the version is %s", versions.BuildVersion) - deviceParser := container.MakeDevicesParser(readCntMonitoringFlags()) - defer deviceParser.Close() - - if err := deviceParser.Init(); err != nil { - logger.Errorf("failed to init devices parser: %v", err) - } - deviceParser.Timeout = time.Duration(updateTime) * time.Second - - colcommon.Collector = colcommon.NewNpuCollector(cacheTime, time.Duration(updateTime)*time.Second, deviceParser, dmgr) - plugins.InitTextMetricsDesc(textMetricsFilePath) - plugins.RegisterPlugin() - config.Register(colcommon.Collector) - - ctx, cancel := context.WithCancel(context.Background()) - wg := &sync.WaitGroup{} - colcommon.InitCardInfo(wg, ctx, colcommon.Collector) - colcommon.StartContainerInfoCollect(ctx, cancel, wg, colcommon.Collector) - - colcommon.StartCollect(wg, ctx, colcommon.Collector) - switch platform { - case prometheusPlatform: - prometheusProcss(wg, ctx, cancel) - case telegrafPlatform: - telegrafProcess() - default: - err = fmt.Errorf("err platform input") - } - wg.Wait() -} - -func prometheusProcss(wg *sync.WaitGroup, ctx context.Context, cancel context.CancelFunc) { - c := prom.NewPrometheusCollector(colcommon.Collector) - reg := prometheus.NewRegistry() - reg.MustRegister(c) - - wg.Add(1) - go func() { - startServe(ctx, cancel, reg) - wg.Done() - }() -} - -func initPaprams() { - 
common.SetHccsBWProfilingTime(hccsBWProfilingTime) - common.SetExternalParams(profilingTime) -} - -func paramValid(platform string) error { - var err error - switch platform { - case prometheusPlatform: - err = paramValidInPrometheus() - case telegrafPlatform: - err = paramValidInTelegraf() - default: - err = fmt.Errorf("err platform input") - } - if err != nil { - logger.Error(err) - return err - } - return nil -} - -func initConfig() *limiter.HandlerConfig { - conf := &limiter.HandlerConfig{ - PrintLog: true, - Method: http.MethodGet, - LimitBytes: limiter.DefaultDataLimit, - TotalConCurrency: concurrency, - IPConCurrency: limitIPReq, - CacheSize: limiter.DefaultCacheSize, - } - return conf -} - -func newServerAndListener(conf *limiter.HandlerConfig) (*http.Server, net.Listener) { - handler, err := limiter.NewLimitHandlerV2(http.DefaultServeMux, conf) - if err != nil { - hwlog.RunLog.Error(err) - return nil, nil - } - s := &http.Server{ - Addr: ip + ":" + strconv.Itoa(port), - Handler: handler, - ReadTimeout: timeout * time.Second, - WriteTimeout: timeout * time.Second, - MaxHeaderBytes: maxHeaderBytes, - ErrorLog: log.New(&hwlog.SelfLogWriter{}, "", log.Lshortfile), - } - ln, err := net.Listen("tcp", s.Addr) - if err != nil { - logger.Errorf("listen ip and port error: %v", err) - return nil, nil - } - limitLs, err := limiter.LimitListener(ln, limitTotalConn, limitIPConn, limiter.DefaultCacheSize) - if err != nil { - hwlog.RunLog.Error(err) - return nil, nil - } - return s, limitLs -} - -func readCntMonitoringFlags() container.CntNpuMonitorOpts { - opts := container.CntNpuMonitorOpts{UseOciBackup: true, UseCriBackup: true} - switch containerMode { - case containerModeDocker: - opts.EndpointType = container.EndpointTypeDockerd - opts.OciEndpoint = container.DefaultDockerAddr - opts.CriEndpoint = container.DefaultDockerShim - case containerModeContainerd: - opts.EndpointType = container.EndpointTypeContainerd - opts.OciEndpoint = container.DefaultContainerdAddr - 
opts.CriEndpoint = container.DefaultContainerdAddr - case containerModeIsula: - opts.EndpointType = container.EndpointTypeIsula - opts.OciEndpoint = container.DefaultIsuladAddr - opts.CriEndpoint = container.DefaultIsuladAddr - default: - hwlog.RunLog.Error("invalid container mode setting,reset to docker") - opts.EndpointType = container.EndpointTypeDockerd - opts.OciEndpoint = container.DefaultDockerAddr - opts.CriEndpoint = container.DefaultDockerShim - } - if containerd != "" { - opts.OciEndpoint = containerd - opts.UseOciBackup = false - } - if endpoint != "" { - opts.CriEndpoint = endpoint - opts.UseCriBackup = false - } - return opts -} - -func checkIPAndPortInPrometheus() error { - if port < portLeft || port > portRight { - return errors.New("the port is invalid") - } - parsedIP := net.ParseIP(ip) - if parsedIP == nil { - return errors.New("the listen ip is invalid") - } - ip = parsedIP.String() - logger.Infof("listen on: %s", ip) - return nil -} - -func paramValidInPrometheus() error { - checks := []func() error{ - checkIPAndPortInPrometheus, - checkUpdateTime, - containerSockCheck, - checkLimitIPReqFormat, - checkLimitIPConn, - checkLimitTotalConn, - checkCacheSize, - checkConcurrency, - checkProfilingTime, - checkHccsBWProfilingTime, - checkDeviceResetTimeout, - checkPollIntervalInCmdLine, - } - - for _, check := range checks { - if err := check(); err != nil { - return err - } - } - return nil -} - -func checkUpdateTime() error { - if updateTime > oneMinute || updateTime < 1 { - return errors.New("the updateTime is invalid") - } - return nil -} - -func checkLimitIPReqFormat() error { - reg := regexp.MustCompile(limiter.IPReqLimitReg) - if !reg.Match([]byte(limitIPReq)) { - return errors.New("limitIPReq format error") - } - return nil -} - -func checkLimitIPConn() error { - if limitIPConn < 1 || limitIPConn > maxIPConnLimit { - return errors.New("limitIPConn is invalid") - } - return nil -} - -func checkLimitTotalConn() error { - if limitTotalConn < 1 || 
limitTotalConn > maxConcurrency { - return errors.New("limitTotalConn is invalid") - } - return nil -} - -func checkCacheSize() error { - if cacheSize < 1 || cacheSize > limiter.DefaultCacheSize*tenDays { - return errors.New("cacheSize is invalid") - } - return nil -} - -func checkConcurrency() error { - if concurrency < 1 || concurrency > maxConcurrency { - return errors.New("concurrency is invalid") - } - return nil -} - -func checkProfilingTime() error { - if profilingTime < 1 || profilingTime > maxProfilingTime { - return errors.New("profilingTime range error") - } - return nil -} - -func checkHccsBWProfilingTime() error { - if hccsBWProfilingTime < minHccsBWProfilingTime || hccsBWProfilingTime > maxHccsBWProfilingTime { - return errors.New("hccsBWProfilingTime range error") - } - return nil -} - -func checkDeviceResetTimeout() error { - if deviceResetTimeout < api.MinDeviceResetTimeout || deviceResetTimeout > api.MaxDeviceResetTimeout { - return errors.New("deviceResetTimeout range error") - } - return nil -} - -func checkPollIntervalInCmdLine() error { - cmdLine := strings.Join(os.Args[1:], "") - if strings.Contains(cmdLine, pollIntervalStr) { - return fmt.Errorf("%s is not support this scene", pollIntervalStr) - } - return nil -} - -func containerSockCheck() error { - if endpoint != "" && !strings.Contains(endpoint, ".sock") { - return errors.New("endpoint file is not sock address") - } - if containerd != "" && !strings.Contains(containerd, ".sock") { - return errors.New("containerd file is not sock address") - } - if endpoint != "" && !strings.Contains(endpoint, unixPre) { - endpoint = unixPre + endpoint - } - if containerd != "" && !strings.Contains(containerd, unixPre) { - containerd = unixPre + containerd - } - return nil -} - -func init() { - flag.IntVar(&port, "port", portConst, - "The server port of the http service,range[1025-40000]") - flag.StringVar(&ip, "ip", "", - "The listen ip of the service,0.0.0.0 is not recommended when install on Multi-NIC 
host") - flag.IntVar(&updateTime, "updateTime", updateTimeConst, - "Interval (seconds) to update the npu metrics cache,range[1-60]") - flag.BoolVar(&version, "version", false, - "If true,query the version of the program (default false)") - flag.StringVar(&containerMode, "containerMode", containerModeDocker, - "Set 'docker' for monitoring docker containers or 'containerd' for CRI & containerd") - flag.StringVar(&containerd, "containerd", "", - "The endpoint of containerd used for listening containers' events") - flag.StringVar(&endpoint, "endpoint", "", - "The endpoint of the CRI server to which will be connected") - flag.IntVar(&concurrency, "concurrency", defaultConcurrency, - "The max concurrency of the http server, range is [1-512]") - // hwlog configuration - flag.IntVar(&logger.HwLogConfig.LogLevel, "logLevel", 0, - "Log level, -1-debug, 0-info, 1-warning, 2-error, 3-critical(default 0)") - flag.IntVar(&logger.HwLogConfig.MaxAge, "maxAge", hwlog.DefaultMinSaveAge, - "Maximum number of days for backup log files, range [7, 700] days") - flag.StringVar(&logger.HwLogConfig.LogFileName, "logFile", defaultLogFile, - "Log file path. 
If the file size exceeds 20MB, will be rotated") - flag.IntVar(&logger.HwLogConfig.MaxBackups, "maxBackups", hwlog.DefaultMaxBackups, - "Maximum number of backup log files, range is (0, 30]") - flag.IntVar(&cacheSize, "cacheSize", limiter.DefaultCacheSize, "the cacheSize for ip limit,"+ - "range is [1,1024000],keep default normally") - flag.IntVar(&limitIPConn, "limitIPConn", defaultConcurrency, "the tcp connection limit for each Ip,"+ - "range is [1,128]") - flag.IntVar(&limitTotalConn, "limitTotalConn", defaultConnection, "the tcp connection limit for all"+ - " request,range is [1,512]") - flag.StringVar(&limitIPReq, "limitIPReq", "20/1", - "the http request limit counts for each Ip,20/1 means allow 20 request in 1 seconds") - flag.StringVar(&platform, "platform", "Prometheus", "the data reporting platform, "+ - "just support Prometheus and Telegraf") - flag.StringVar(&textMetricsFilePath, "textMetricsFilePath", "", - "text indicator collection path, only support specified one file path") - flag.DurationVar(&pollInterval, pollIntervalStr, 1*time.Second, - "how often to send metrics when use Telegraf plugin, "+ - "needs to be used with -platform=Telegraf, otherwise, it does not take effect") - flag.IntVar(&profilingTime, "profilingTime", defaultProfilingTime, - "config pcie bandwidth profiling time, range is [1, 2000]") - flag.IntVar(&hccsBWProfilingTime, api.HccsBWProfilingTimeStr, defaultHccsBwProfilingTime, - "config "+api.Hccs+" bandwidth profiling time, range is [1, 1000]") - flag.IntVar(&deviceResetTimeout, api.DeviceResetTimeout, api.DefaultDeviceResetTimeout, - "when npu-exporter starts, if the number of chips is insufficient, the maximum duration to wait for "+ - "the driver to report all chips, unit second, range [10, 600]") -} - -func indexHandler(w http.ResponseWriter, _ *http.Request) { - var proposal = "http" - _, err := w.Write([]byte( - ` - NPU-Exporter - -

NPU-Exporter

-

Welcome to use NPU-Exporter,the Prometheus metrics url is ` + proposal + `://ip:` + - strconv.Itoa(port) + `/metrics: Metrics

- - `)) - if err != nil { - logger.Errorf("Write to response error: %v", err) - } -} - -func prometheusProcess() { - -} - -func startServe(ctx context.Context, cancel context.CancelFunc, reg *prometheus.Registry) { - http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{ErrorHandling: promhttp.ContinueOnError})) - http.Handle("/", http.HandlerFunc(indexHandler)) - conf := initConfig() - s, limitLs := newServerAndListener(conf) - if s == nil || limitLs == nil { - cancel() - return - } - - go func() { - logger.Warn("enable unsafe http server") - if err := s.Serve(limitLs); err != nil { - logger.Errorf("Http server error: %v and stopped", err) - cancel() - } - }() - - <-ctx.Done() - shutErr := func() error { - logger.Info("received stop signal, STOP http server") - ctxShutDown, timeOut := context.WithTimeout(context.Background(), defaultShutDownTimeout) - defer timeOut() - return s.Shutdown(ctxShutDown) - }() - if shutErr != nil { - logger.Errorf("shutdown http server error: %v", shutErr) - } -} - -func paramValidInTelegraf() error { - // cmdLine here must contain "-platform=Telegraf", otherwise, it will enter the Prometheus process - cmdLine := os.Args[1:] - - // store the preset parameter names in the map - presetParamsMap := map[string]bool{ - platformStr: true, - pollIntervalStr: true, - api.HccsBWProfilingTimeStr: true, - } - - if len(cmdLine) > len(presetParamsMap) { - return errors.New("too many parameters") - } - - var paramLen = 2 - // check every input params - for _, param := range cmdLine { - param = strings.TrimPrefix(param, "-") - split := strings.Split(param, "=") - if len(split) != paramLen { - return fmt.Errorf("the param [%s] is a wrong format", param) - } - paramName := split[0] - if !presetParamsMap[paramName] { - return fmt.Errorf("not support [%s] in Telegraf", paramName) - } - } - - if hccsBWProfilingTime < minHccsBWProfilingTime || hccsBWProfilingTime > maxHccsBWProfilingTime { - return errors.New(api.Hccs + "BWProfilingTime 
range error") - } - return nil -} - -func telegrafProcess() { - // create the shim. This is what will run your plugins. - shim := shim.New() - - // If no config is specified, all imported plugins are loaded. - // otherwise follow what the config asks for. - // Check for settings from a config toml file, - // (or just use whatever plugins were imported above) - configFile := "" - err := shim.LoadConfig(&configFile) - if err != nil { - fmt.Fprintf(os.Stderr, "Err loading input: %s\n", err) - return - } - - // run the input plugin(s) until stdin closes, or we receive a termination signal - if err := shim.Run(pollInterval); err != nil { - fmt.Fprintf(os.Stderr, "Err: %s\n", err) - return - } -} diff --git a/mind-cluster/component/npu-exporter/collector/common/collector_for_container.go b/mind-cluster/component/npu-exporter/collector/common/collector_for_container.go deleted file mode 100644 index af46251..0000000 --- a/mind-cluster/component/npu-exporter/collector/common/collector_for_container.go +++ /dev/null @@ -1,109 +0,0 @@ -/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package common for general collector -package common - -import ( - "context" - "strings" - "sync" - "time" - - "ascend-common/common-utils/hwlog" - "huawei.com/npu-exporter/v6/collector/container" - "huawei.com/npu-exporter/v6/utils/logger" -) - -// StartContainerInfoCollect start collect container info -func StartContainerInfoCollect(ctx context.Context, cancelFunc context.CancelFunc, group *sync.WaitGroup, - n *NpuCollector) { - group.Add(1) - - go func() { - defer group.Done() - retryCount := 0 - collectContainerInfo := func() { - logger.Info("start to collect container info") - n.devicesParser.FetchAndParse(nil) - select { - case result := <-n.devicesParser.RecvResult(): - if err := n.cache.Set(containersDevicesCacheKey, result, n.cacheTime); err != nil { - logger.Error(err) - } - logger.Infof(UpdateCachePattern, containersDevicesCacheKey) - retryCount = 0 - case err := <-n.devicesParser.RecvErr(): - logger.Errorf("received error from device parser: %v", err) - if strings.Contains(err.Error(), "connection refused") { - retryCount++ - if retryCount == connectRefusedMaxRetry { - logger.Error("connection refused, task shutdown") - cancelFunc() - } - } - } - } - ticker := time.NewTicker(n.updateTime) - defer ticker.Stop() - - for { - select { - case <-ctx.Done(): - logger.Info("received the stop signal,stop container info collect") - return - default: - collectContainerInfo() - if _, ok := <-ticker.C; !ok { - logger.Errorf(tickerFailedPattern, containersDevicesCacheKey) - return - } - } - } - }() -} - -// GetContainerNPUInfo get container npu info -func GetContainerNPUInfo(n *NpuCollector) map[int32]container.DevicesInfo { - obj, err := n.cache.Get(containersDevicesCacheKey) - // only run once to prevent wait when container info get failed - npuContainerInfoInit.Do(func() { - if err != nil { - logger.Warn("containers' devices info not found in cache, rebuilding") - resultChan := make(chan container.DevicesInfos, 1) - 
n.devicesParser.FetchAndParse(resultChan) - select { - case obj = <-resultChan: - case <-time.After(time.Second): - logger.Warn("rebuild container info cache timeout") - return - } - logger.Info("rebuild cache successfully") - } - }) - cntNpuInfos, ok := obj.(container.DevicesInfos) - if !ok { - logger.LogfWithOptions(logger.ErrorLevel, logger.LogOptions{Domain: DomainForContainerInfo, ID: 0}, - "error container npu info cache and convert failed") - return nil - } - hwlog.ResetErrCnt(DomainForContainerInfo, 0) - res := make(map[int32]container.DevicesInfo, initSize) - for _, v := range cntNpuInfos { - for _, deviceID := range v.Devices { - res[int32(deviceID)] = v - } - } - return res -} diff --git a/mind-cluster/component/npu-exporter/collector/common/collector_for_container_test.go b/mind-cluster/component/npu-exporter/collector/common/collector_for_container_test.go deleted file mode 100644 index 6412e12..0000000 --- a/mind-cluster/component/npu-exporter/collector/common/collector_for_container_test.go +++ /dev/null @@ -1,137 +0,0 @@ -/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package common for general collector -package common - -import ( - "sync" - "testing" - "time" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" - - "ascend-common/common-utils/cache" - "huawei.com/npu-exporter/v6/collector/container" -) - -const ( - testCacheTime = 60 * time.Second - testUpdateTime = 10 * time.Millisecond - testDeviceID0 = 0 - testDeviceID1 = 1 - testDeviceID2 = 2 - testContainerID1 = "container1" - testContainerID2 = "container2" - testContainerName1 = "test-container-1" - testContainerName2 = "test-container-2" -) - -var ( - testDevicesInfos = container.DevicesInfos{ - testContainerID1: { - ID: testContainerID1, - Name: testContainerName1, - Devices: []int{testDeviceID0, testDeviceID1}, - }, - testContainerID2: { - ID: testContainerID2, - Name: testContainerName2, - Devices: []int{testDeviceID2}, - }, - } -) - -func createTestNpuCollector() *NpuCollector { - parser := &container.DevicesParser{} - return &NpuCollector{ - cache: cache.New(cacheSize), - devicesParser: parser, - updateTime: testUpdateTime, - cacheTime: testCacheTime, - } -} - -func resetNpuContainerInfoInit() { - npuContainerInfoInit = sync.Once{} -} - -type getContainerNPUInfoTestCase struct { - name string - setupCache func(*NpuCollector) - mockParser func(*gomonkey.Patches, *container.DevicesParser) - expectedResult map[int32]container.DevicesInfo -} - -func createGetContainerNPUInfoTestCases() []getContainerNPUInfoTestCase { - return []getContainerNPUInfoTestCase{ - { - name: "should return container npu info when cache exists", - setupCache: func(n *NpuCollector) { - n.cache.Set(containersDevicesCacheKey, testDevicesInfos, testCacheTime) - }, - mockParser: func(patches *gomonkey.Patches, parser *container.DevicesParser) {}, - expectedResult: map[int32]container.DevicesInfo{ - int32(testDeviceID0): testDevicesInfos[testContainerID1], - int32(testDeviceID1): testDevicesInfos[testContainerID1], - int32(testDeviceID2): 
testDevicesInfos[testContainerID2], - }, - }, - { - name: "should rebuild cache when cache not exists", - setupCache: func(n *NpuCollector) {}, - mockParser: func(patches *gomonkey.Patches, parser *container.DevicesParser) { - patches.ApplyMethod(parser, "FetchAndParse", - func(p *container.DevicesParser, resultOut chan<- container.DevicesInfos) { - if resultOut != nil { - resultOut <- testDevicesInfos - } - }) - }, - expectedResult: map[int32]container.DevicesInfo{ - int32(testDeviceID0): testDevicesInfos[testContainerID1], - int32(testDeviceID1): testDevicesInfos[testContainerID1], - int32(testDeviceID2): testDevicesInfos[testContainerID2], - }, - }, - { - name: "should return nil when cache type conversion failed", - setupCache: func(n *NpuCollector) { - n.cache.Set(containersDevicesCacheKey, "invalid type", testCacheTime) - }, - mockParser: func(patches *gomonkey.Patches, parser *container.DevicesParser) {}, - expectedResult: nil, - }, - } -} - -func TestGetContainerNPUInfo(t *testing.T) { - testCases := createGetContainerNPUInfoTestCases() - - for _, tc := range testCases { - convey.Convey(tc.name, t, func() { - resetNpuContainerInfoInit() - n := createTestNpuCollector() - tc.setupCache(n) - - patches := gomonkey.NewPatches() - defer patches.Reset() - tc.mockParser(patches, n.devicesParser) - - result := GetContainerNPUInfo(n) - convey.So(result, convey.ShouldResemble, tc.expectedResult) - }) - } -} diff --git a/mind-cluster/component/npu-exporter/collector/common/constants.go b/mind-cluster/component/npu-exporter/collector/common/constants.go deleted file mode 100644 index d7e1409..0000000 --- a/mind-cluster/component/npu-exporter/collector/common/constants.go +++ /dev/null @@ -1,140 +0,0 @@ -/* Copyright(C) 2021-2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package common for general constants -package common - -// metric label name -const ( - npuID = "id" - modelName = "model_name" - npuUUID = "vdie_id" - npuPCIEInfo = "pcie_bus_info" - namespace = "namespace" - podName = "pod_name" - cntrName = "container_name" -) - -const ( - // Healthy status of Health - Healthy = "Healthy" - // UnHealthy status of unhealth - UnHealthy = "UnHealthy" - // Abnormal status of Abnormal - Abnormal = "Abnormal" - - // LinkUp npu interface up - LinkUp = "UP" - // LinkDown npu interface down - LinkDown = "DOWN" - - // Base convert base - Base = 10 - // ContainerNameLen container name length - ContainerNameLen = 3 - // npuListCacheKey Cache key - npuListCacheKey = "npu-exporter-npu-list" - // Cache key for parsing-device result - containersDevicesCacheKey = "npu-exporter-containers-devices" - initSize = 8 - tickerFailedPattern = "%s ticker failed, task shutdown" - // UpdateCachePattern Update cache pattern - UpdateCachePattern = "update Cache,key is %s" - connectRefusedMaxRetry = 3 -) - -const ( - cacheSize = 128 - // NameSpaceIdx is the index of namespace in container name - NameSpaceIdx = 0 - // PodNameIdx is the index of pod name in container name - PodNameIdx = 1 - // ConNameIdx is the index of container name in container name - ConNameIdx = 2 - - // DecimalPlaces is the decimal places of float64 - DecimalPlaces = 2 - // BitSize is the bit size of float64 - BitSize = 64 - // GeneralDevTagKey is the default value of devTagKey in telegraf, it means the metric is not related to any device - GeneralDevTagKey = "GeneralDevTagKey" 
-) - -// log limit domains for metrics -const ( - // DomainForLogicIdErr domain for faild to get cardId and deviceId by logicID - DomainForLogicIdErr = "logicID" - - // DomainForHccs domain for hccs - DomainForHccs = "hccs" - - // DomainForDDR domain for DDR - DomainForDDR = "DDR" - - // DomainForSio domain for sio - DomainForSio = "sio" - - // DomainForHBM domain for HBM - DomainForHBM = "hbm" - - // DomainForHBMECC domain for hbmEcc - DomainForHBMECC = "hbmEcc" - - // DomainForHccsBW domain for hccs bandwidth - DomainForHccsBW = "hccsBw" - - // DomainForOptical domain for Optical - DomainForOptical = "optical" - - // DomainForLinkState domain for linkState - DomainForLinkState = "linkState" - - // DomainForBandwidth domain for bandwidth - DomainForBandwidth = "bandwidth" - - // DomainForLinkStat domain for linkStat - DomainForLinkStat = "linkStat" - - // DomainForLinkSpeed domain for linkSpeed - DomainForLinkSpeed = "linkSpeed" - - // DomainForRoce domain for roce - DomainForRoce = "roce" - - // DomainForMcuPower domain for mcu power - DomainForMcuPower = "mcuPower" - - // DomainForChipPower domain for chip power - DomainForChipPower = "chipPower" - - // DomainForAICoreUtilization domain for ai core utilization - DomainForAICoreUtilization = "AICoreUtilization" - - // DomainForVectorCoreUtilization domain for vector core utilization - DomainForVectorCoreUtilization = "vectorCoreUtilization" - - // DomainForProcess domain for process info - DomainForProcess = "processInfo" - - // DomainForHbmUtilization domain for High Bandwidth Memory Utilization - DomainForHbmUtilization = "hbmUtilization" - - // DomainForOverallUtilization domain for overall utilization - DomainForOverallUtilization = "overallUtilization" - - // DomainForPcieBandwidth domain for pcie bandwidth - DomainForPcieBandwidth = "pcieBandwidth" - // DomainForContainerInfo domain for pcie container info - DomainForContainerInfo = "containerInfo" -) diff --git 
a/mind-cluster/component/npu-exporter/collector/common/metrics_collector.go b/mind-cluster/component/npu-exporter/collector/common/metrics_collector.go deleted file mode 100644 index d891649..0000000 --- a/mind-cluster/component/npu-exporter/collector/common/metrics_collector.go +++ /dev/null @@ -1,192 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package common for general collector -package common - -import ( - "reflect" - "strings" - "sync" - - "github.com/prometheus/client_golang/prometheus" - - "ascend-common/api" - "huawei.com/npu-exporter/v6/collector/container" - "huawei.com/npu-exporter/v6/utils/logger" -) - -var ( - // CardLabel general card label - CardLabel = []string{npuID, modelName, npuUUID, npuPCIEInfo, namespace, podName, cntrName} - - noNeedToPrintUpdateLog = map[string]bool{ - "NetworkCollector": true, - "RoceCollector": true, - "OpticalCollector": true, - } -) - -// BuildDescSlice build desc slice -func BuildDescSlice(slice *[]*prometheus.Desc, name string, help string) { - *slice = append(*slice, BuildDesc(name, help)) -} - -// BuildDesc build desc -func BuildDesc(name string, help string) *prometheus.Desc { - return prometheus.NewDesc(name, help, CardLabel, nil) -} - -// BuildDescWithLabel build desc with label -func BuildDescWithLabel(name string, help string, label []string) *prometheus.Desc { - return prometheus.NewDesc(name, help, label, nil) -} - -// 
MetricsCollector metrics collector -type MetricsCollector interface { - // Describe report metrics to prometheus - Describe(ch chan<- *prometheus.Desc) - - // CollectToCache collect data to cache - CollectToCache(n *NpuCollector, chipList []HuaWeiAIChip) - - // UpdatePrometheus update prometheus - UpdatePrometheus(ch chan<- prometheus.Metric, n *NpuCollector, containerMap map[int32]container.DevicesInfo, - chips []HuaWeiAIChip) - - // UpdateTelegraf update telegraf - UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []HuaWeiAIChip) map[string]map[string]interface{} - - // PreCollect pre handle before collect - PreCollect(*NpuCollector, []HuaWeiAIChip) - - // PostCollect post handle after collect - PostCollect(*NpuCollector) - - // IsSupported Check whether the current hardware supports this metric - IsSupported(*NpuCollector) bool -} - -// MetricsCollectorAdapter base collector for metrics collector -type MetricsCollectorAdapter struct { - LocalCache sync.Map - Is910Series bool - ContainerMap map[int32]container.DevicesInfo - Chips []HuaWeiAIChip -} - -// Describe report metrics to prometheus -func (c *MetricsCollectorAdapter) Describe(ch chan<- *prometheus.Desc) { -} - -// CollectToCache collect data to cache -func (c *MetricsCollectorAdapter) CollectToCache(n *NpuCollector, chipList []HuaWeiAIChip) { -} - -// UpdatePrometheus update prometheus -func (c *MetricsCollectorAdapter) UpdatePrometheus(ch chan<- prometheus.Metric, n *NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []HuaWeiAIChip) { -} - -// UpdateTelegraf update telegraf -func (c *MetricsCollectorAdapter) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []HuaWeiAIChip) map[string]map[string]interface{} { - return fieldsMap -} - -// PreCollect pre handle before collect -func (c *MetricsCollectorAdapter) PreCollect(n 
*NpuCollector, chipList []HuaWeiAIChip) { - if strings.Contains(n.Dmgr.GetDevType(), api.Ascend910A) { - c.Is910Series = true - } -} - -// PostCollect post handle after collect -func (c *MetricsCollectorAdapter) PostCollect(*NpuCollector) { -} - -// IsSupported Check whether the current hardware supports this metric -func (c *MetricsCollectorAdapter) IsSupported(*NpuCollector) bool { - return true -} - -// UpdateCache update cache -func UpdateCache[T any](n *NpuCollector, cacheKey string, localCache *sync.Map) { - var cacheInfo = make(map[int32]T) - obj, err := n.cache.Get(cacheKey) - if err != nil { - logger.Debugf("get info of %s failed: %v, use initial data", cacheKey, err) - } else { - if oldCacheInfo, ok := obj.(map[int32]T); ok { - cacheInfo = copyMap(oldCacheInfo) - } else { - logger.Debug("cache format invalid, reset") - } - } - - localCache.Range(func(key, value interface{}) bool { - finalKey, okKey := key.(int32) - finalValue, okValue := value.(T) - if okKey && okValue { - cacheInfo[finalKey] = finalValue - } - return true - }) - - err = n.cache.Set(cacheKey, cacheInfo, n.cacheTime) - if noNeedToPrintUpdateLog[cacheKey] { - return - } - if err != nil { - logger.Error(err) - } -} - -func copyMap[T any](oldCacheInfo map[int32]T) map[int32]T { - var cacheInfo = make(map[int32]T) - for key, value := range oldCacheInfo { - cacheInfo[key] = value - } - return cacheInfo -} - -// GetInfoFromCache get info from cache -func GetInfoFromCache[T any](n *NpuCollector, cacheKey string) map[int32]T { - res := make(map[int32]T) - obj, err := n.cache.Get(cacheKey) - if err != nil { - logger.Warn("cache not found, please wait for rebuild") - return res - } - - if data, ok := obj.(map[int32]T); ok { - return data - } - logger.Error("cache type mismatch") - return res -} - -// GetCacheKey Obtain the name of the struct pointer as the key of the cache -func GetCacheKey(ptr interface{}) string { - v := reflect.ValueOf(ptr) - if v.Kind() != reflect.Ptr { - return "" - } - v = 
v.Elem() - if v.Kind() != reflect.Struct { - return "" - } - return v.Type().Name() -} diff --git a/mind-cluster/component/npu-exporter/collector/common/metrics_collector_test.go b/mind-cluster/component/npu-exporter/collector/common/metrics_collector_test.go deleted file mode 100644 index f66ceb5..0000000 --- a/mind-cluster/component/npu-exporter/collector/common/metrics_collector_test.go +++ /dev/null @@ -1,231 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package common for general collector -package common - -import ( - "reflect" - "sync" - "testing" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" - - "ascend-common/api" -) - -// TestCopyMap test copyMap -func TestCopyMap(t *testing.T) { - type testStruct struct { - name string - age int - } - mockString := "mock" - tests := []struct { - name string - input map[int32]testStruct - validate func(*testing.T, interface{}) - }{ - {name: "NilInput", input: (map[int32]testStruct)(nil), - validate: func(t *testing.T, got interface{}) { - g, ok := got.(map[int32]testStruct) - if !ok || g == nil || len(g) != 0 { - t.Errorf("should return empty map for nil input") - } - }}, - {name: "EmptyMap", input: map[int32]testStruct{}, - validate: func(t *testing.T, got interface{}) { - if len(got.(map[int32]testStruct)) != 0 { - t.Errorf("expected empty map") - } - }}, - {name: "SingleElement", input: map[int32]testStruct{1: {name: mockString, age: 1}}, - validate: func(t *testing.T, got interface{}) { - g, ok := got.(map[int32]testStruct) - if !ok || g[1].name != mockString || g[1].age != 1 || len(g) != 1 { - t.Errorf("element mismatch") - } - }}, - {name: "MultipleElements", input: map[int32]testStruct{1: {name: mockString, age: 1}, 2: {name: mockString, age: 1}}, - validate: func(t *testing.T, got interface{}) { - expected := map[int32]testStruct{1: {name: mockString, age: 1}, 2: {name: mockString, age: 1}} - if !reflect.DeepEqual(got, expected) { - t.Errorf("deepEqual failed") - } - }}, - } - - for _, tt := range tests { - convey.Convey(tt.name, t, func() { - got := copyMap[testStruct](tt.input) - tt.validate(t, got) - }) - } -} - -func TestPreCollect(t *testing.T) { - tests := []struct { - name string - deviceType string - expected bool - }{ - {name: "TestPreCollect_" + api.Ascend910, - deviceType: api.Ascend910, - expected: true, - }, - {name: "TestPreCollect_" + api.Ascend310, - deviceType: api.Ascend310, - expected: false, - }, - 
} - convey.Convey("TestPreCollect", t, func() { - n := mockNewNpuCollector() - adapter := MetricsCollectorAdapter{ - Is910Series: false, - ContainerMap: nil, - Chips: nil, - } - for _, tt := range tests { - convey.Convey(tt.name, func() { - patches := gomonkey.NewPatches() - defer patches.Reset() - patches.ApplyMethodReturn(n.Dmgr, "GetDevType", tt.deviceType) - adapter.PreCollect(n, nil) - convey.So(adapter.Is910Series, convey.ShouldEqual, tt.expected) - }) - } - }) -} - -type cacheCase struct { - name string - cacheKey string - preHandle func() - expected int -} - -func buildTestsForUpdateCache(expected int) []cacheCase { - tests := []cacheCase{ - {name: "TestUpdateCache_save info to cache", - cacheKey: "mockKey1", - preHandle: func() {}, - expected: expected, - }, - {name: "TestUpdateCache_update old cache", - cacheKey: "mockKey2", - preHandle: func() { - noNeedToPrintUpdateLog["mockKey2"] = true - }, - expected: expected, - }, - {name: "TestUpdateCache_old cache is in incorrect type", - cacheKey: "mockKey3", - preHandle: func() {}, - expected: expected, - }, - } - return tests -} - -func TestUpdateCache(t *testing.T) { - const key = int32(0) - const expected = 1 - tests := buildTestsForUpdateCache(expected) - - n := mockNewNpuCollector() - // data init - n.cache.Set("mockKey2", map[int32]string{key: "0"}, n.cacheTime) - n.cache.Set("mockKey3", map[int32]int{key: 0}, n.cacheTime) - - convey.Convey("TestUpdateCache", t, func() { - - for _, tt := range tests { - convey.Convey(tt.name, func() { - localCache := sync.Map{} - localCache.Store(key, "mockValue") - tt.preHandle() - UpdateCache[string](n, tt.cacheKey, &localCache) - - data, err := n.cache.Get(tt.cacheKey) - convey.So(err, convey.ShouldBeNil) - map2, ok := data.(map[int32]string) - convey.So(ok, convey.ShouldBeTrue) - convey.So(len(map2), convey.ShouldEqual, tt.expected) - }) - } - - }) -} - -func TestGetInfoFromCache(t *testing.T) { - const key = int32(0) - tests := []struct { - name string - cacheKey 
string - expected int - }{ - {name: "TestGetInfoFromCache_no info in cache", - cacheKey: "mockKey1", - expected: 0, - }, - {name: "TestGetInfoFromCache_correct", - cacheKey: "mockKey2", - expected: 1, - }, - {name: "TestGetInfoFromCache_info in cache is in incorrect type", - cacheKey: "mockKey3", - expected: 0, - }, - } - n := mockNewNpuCollector() - // data init - n.cache.Set("mockKey2", map[int32]string{key: "mockValue"}, n.cacheTime) - n.cache.Set("mockKey3", map[int32]int{key: 0}, n.cacheTime) - for _, tt := range tests { - convey.Convey(tt.name, t, func() { - cache := GetInfoFromCache[string](n, tt.cacheKey) - convey.So(len(cache), convey.ShouldEqual, tt.expected) - }) - } -} - -func TestGetCacheKey(t *testing.T) { - tests := []struct { - name string - args interface{} - expected string - }{ - {name: "TestGetCacheKey_ptr", - args: &MetricsCollectorAdapter{}, - expected: "MetricsCollectorAdapter", - }, - {name: "TestGetCacheKey_int", - args: 0, - expected: "", - }, - {name: "TestGetCacheKey_struct", - args: MetricsCollectorAdapter{}, - expected: "", - }, - } - - convey.Convey("TestGetCacheKey", t, func() { - for _, tt := range tests { - convey.Convey(tt.name, func() { - convey.So(GetCacheKey(tt.args), convey.ShouldEqual, tt.expected) - }) - } - }) -} diff --git a/mind-cluster/component/npu-exporter/collector/common/npu_collector.go b/mind-cluster/component/npu-exporter/collector/common/npu_collector.go deleted file mode 100644 index fee5312..0000000 --- a/mind-cluster/component/npu-exporter/collector/common/npu_collector.go +++ /dev/null @@ -1,423 +0,0 @@ -/* Copyright(C) 2021-2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package common for general collector -package common - -import ( - "context" - "sync" - "time" - - "ascend-common/api" - "ascend-common/common-utils/cache" - "ascend-common/devmanager" - "ascend-common/devmanager/common" - "ascend-common/devmanager/dcmi" - "huawei.com/npu-exporter/v6/collector/container" - "huawei.com/npu-exporter/v6/utils/logger" -) - -var ( - npuContainerInfoInit sync.Once - npuChipInfoInit sync.Once - // Collector base collector for prometheus and telegraf - Collector *NpuCollector - - // ChainForSingleGoroutine a list of collectors for single goroutine - ChainForSingleGoroutine []MetricsCollector - - // ChainForMultiGoroutine a list of collectors for multi goroutine - ChainForMultiGoroutine []MetricsCollector - - // ChainForCustomPlugin a list of collectors for plugin - ChainForCustomPlugin []MetricsCollector - - updateTimeForCardIds = time.Minute -) - -const ( - maxCollectTimeout = 10 * time.Second -) - -// NpuCollector for collect metrics -type NpuCollector struct { - cache *cache.ConcurrencyLRUCache - devicesParser *container.DevicesParser - updateTime time.Duration - cacheTime time.Duration - Dmgr *devmanager.DeviceManager -} - -// NewNpuCollector create a new collector -func NewNpuCollector(cacheTime time.Duration, updateTime time.Duration, - deviceParser *container.DevicesParser, dmgr *devmanager.DeviceManager) *NpuCollector { - CommonCollector := &NpuCollector{ - cache: cache.New(cacheSize), - cacheTime: cacheTime, - updateTime: updateTime, - devicesParser: deviceParser, - Dmgr: dmgr, - } - return CommonCollector -} - -// 
StartCollect start collect -func StartCollect(group *sync.WaitGroup, ctx context.Context, n *NpuCollector) { - npuChipInfoInitAtFirstTime(n) - startCollectSingleGoroutine(group, ctx, n) - startCollectForMultiGoroutine(group, ctx, n) - startCollectForPluginGoroutine(group, ctx, n) -} - -func startCollectForPluginGoroutine(group *sync.WaitGroup, ctx context.Context, n *NpuCollector) { - group.Add(1) - go func() { - defer group.Done() - ticker := time.NewTicker(n.updateTime) - defer ticker.Stop() - goroutinePreCollect(ChainForCustomPlugin, n) - defer goroutinePostCollect(ChainForCustomPlugin, n) - runPluginCollect(ctx, n, ticker) - }() -} - -func runPluginCollect(ctx context.Context, n *NpuCollector, ticker *time.Ticker) { - for { - select { - case <-ctx.Done(): - logger.Info("received the stop signal,stop plugin collect") - return - default: - collectPluginMetrics(n) - if _, ok := <-ticker.C; !ok { - logger.Errorf(tickerFailedPattern, "handling plugin collectors") - return - } - } - } -} - -func collectPluginMetrics(n *NpuCollector) { - chipList := getChipListCache(n) - for _, c := range ChainForCustomPlugin { - resultChan := make(chan struct{}, 1) - go func(cur MetricsCollector) { - cur.CollectToCache(n, chipList) - resultChan <- struct{}{} - }(c) - select { - case <-resultChan: - continue - case <-time.After(maxCollectTimeout): - logger.Errorf("collect timeout for %v", GetCacheKey(c)) - continue - } - - } -} - -func startCollectForMultiGoroutine(group *sync.WaitGroup, ctx context.Context, n *NpuCollector) { - chips := getChipListCache(n) - - group.Add(len(chips)) - for _, chip := range chips { - go func(chip HuaWeiAIChip) { - defer group.Done() - runChipCollector(ctx, n, chip) - }(chip) - } -} - -func runChipCollector(ctx context.Context, n *NpuCollector, chip HuaWeiAIChip) { - ticker := time.NewTicker(n.updateTime) - defer ticker.Stop() - goroutinePreCollect(ChainForMultiGoroutine, n) - defer goroutinePostCollect(ChainForMultiGoroutine, n) - for { - select { - 
case <-ctx.Done(): - logger.Infof("received the stop signal,stop collect network info of npu(%d)", chip.LogicID) - return - default: - singleChipSlice := []HuaWeiAIChip{chip} - for _, c := range ChainForMultiGoroutine { - c.CollectToCache(n, singleChipSlice) - } - if _, ok := <-ticker.C; !ok { - logger.Errorf(tickerFailedPattern, "collect for multigroutine ") - return - } - } - } -} - -func goroutinePreCollect(collectors []MetricsCollector, n *NpuCollector) { - chipList := getChipListCache(n) - for _, c := range collectors { - c.PreCollect(n, chipList) - } -} - -func goroutinePostCollect(collectors []MetricsCollector, n *NpuCollector) { - for _, c := range collectors { - c.PostCollect(n) - } -} - -func startCollectSingleGoroutine(group *sync.WaitGroup, ctx context.Context, n *NpuCollector) { - group.Add(1) - go func() { - defer group.Done() - ticker := time.NewTicker(n.updateTime) - defer ticker.Stop() - goroutinePreCollect(ChainForSingleGoroutine, n) - defer goroutinePostCollect(ChainForSingleGoroutine, n) - for { - select { - case <-ctx.Done(): - logger.Info("received the stop signal,stop npu base info collect") - return - default: - chipList := getChipListCache(n) - for _, c := range ChainForSingleGoroutine { - c.CollectToCache(n, chipList) - } - if _, ok := <-ticker.C; !ok { - logger.Errorf(tickerFailedPattern, "handling all collectors") - return - } - } - } - }() -} - -// npuChipInfoInitAtFirstTime When first enter, the cache data is empty, -// need to get the data from the device, and build the cache -func npuChipInfoInitAtFirstTime(n *NpuCollector) { - npuChipInfoInit.Do(func() { - _, err := n.cache.Get(npuListCacheKey) - if err != nil { - logger.Debug("no cache in first time, start to collect chip list and rebuild cache") - - npuInfo := getNPUChipList(n.Dmgr) - if err := n.cache.Set(npuListCacheKey, npuInfo, n.cacheTime); err != nil { - logger.Error(err) - } else { - logger.Infof(UpdateCachePattern, npuListCacheKey) - } - logger.Debug("rebuild cache 
successfully") - } - }) -} - -// InitCardInfo init card info -func InitCardInfo(group *sync.WaitGroup, ctx context.Context, n *NpuCollector) { - - group.Add(1) - go func() { - defer group.Done() - ticker := time.NewTicker(updateTimeForCardIds) - defer ticker.Stop() - for { - logger.Info("start to collect npu chip list info") - select { - case <-ctx.Done(): - logger.Info("received the stop signal,stop card info collect") - return - default: - npuInfo := getNPUChipList(n.Dmgr) - if err := n.cache.Set(npuListCacheKey, npuInfo, n.cacheTime); err != nil { - logger.Error(err) - } else { - logger.Infof(UpdateCachePattern, npuListCacheKey) - } - if _, ok := <-ticker.C; !ok { - logger.Errorf(tickerFailedPattern, npuListCacheKey) - return - } - } - } - }() -} - -func getNPUChipList(dmgr devmanager.DeviceInterface) (npuInfo []HuaWeiAIChip) { - chipList := make([]HuaWeiAIChip, 0) - - cardNum, cards, err := dmgr.GetCardList() - if err != nil || cardNum == 0 { - logger.Errorf("failed to get npu info, error is: %v", err) - return chipList - } - - chipListIDs := make([]int32, 0) - - for _, cardID := range cards { - deviceNum, _ := dmgr.GetDeviceNumInCard(cardID) - for deviceID := int32(0); deviceID < deviceNum; deviceID++ { - var chip HuaWeiAIChip - // get logicID - logicID, err := dmgr.GetDeviceLogicID(cardID, deviceID) - if err != nil { - logger.Errorf("get logic ID of card: %v device:%v failed: %v", cardID, deviceID, err) - continue - } - - chip.LogicID = logicID - chip.CardId = cardID - chip.MainBoardId = dmgr.GetMainBoardId() - - setPhyId(&chip, dmgr, cardID, deviceID) - setChipInfo(&chip, dmgr, cardID, deviceID) - setBoardInfo(&chip, dmgr, cardID, deviceID) - setVdieID(&chip, dmgr, cardID, deviceID) - assemblevNPUInfo(dmgr, logicID, &chip) - setPCIeBusInfo(logicID, dmgr, &chip) - setElabelInfo(&chip, dmgr, cardID) - - chipList = append(chipList, chip) - chipListIDs = append(chipListIDs, logicID) - } - } - - logger.Debugf("flush chip info list successed,chip num is : %v, 
chipLogicIDs: %v", - len(chipList), chipListIDs) - return chipList -} - -func setBoardInfo(chip *HuaWeiAIChip, dmgr devmanager.DeviceInterface, cardID int32, deviceID int32) { - boardInfo, err := dmgr.GetBoardInfo(chip.LogicID) - if err != nil { - logger.Errorf("get board info of card: %v device:%v failed: %v", cardID, deviceID, err) - boardInfo = common.BoardInfo{} - } - chip.BoardInfo = &boardInfo -} -func setVdieID(chip *HuaWeiAIChip, dmgr devmanager.DeviceInterface, cardID int32, deviceID int32) { - vdieID, err := dmgr.GetDieID(chip.LogicID, dcmi.VDIE) - if err != nil { - logger.Debug(err) - } - chip.VDieID = vdieID -} - -func setPhyId(chip *HuaWeiAIChip, dmgr devmanager.DeviceInterface, cardID int32, deviceID int32) { - phyID, err := dmgr.GetPhysicIDFromLogicID(chip.LogicID) - if err != nil { - logger.Errorf("get phy ID of card: %v device:%v failed: %v", cardID, deviceID, err) - } - chip.PhyId = phyID - chip.DeviceID = phyID -} -func setChipInfo(chip *HuaWeiAIChip, dmgr devmanager.DeviceInterface, cardID int32, deviceID int32) { - // get chip info - chipInfo, err := dmgr.GetChipInfo(chip.LogicID) - if err != nil { - logger.Errorf("get chip info of card: %v device:%v failed: %v", cardID, deviceID, err) - chipInfo = &common.ChipInfo{} - } - chip.ChipInfo = chipInfo -} - -func setPCIeBusInfo(logicID int32, dmgr devmanager.DeviceInterface, hwChip *HuaWeiAIChip) { - productTypes := dmgr.GetProductTypeArray() - pcieInfo, err := dmgr.GetPCIeBusInfo(logicID) - if err != nil { - if len(productTypes) == 1 && productTypes[0] == common.Atlas200ISoc { - logger.Debugf("pcie bus info is not supported on %s", common.Atlas200ISoc) - hwChip.PCIeBusInfo = "" - return - } - logger.Error(err) - pcieInfo = "" - } - hwChip.PCIeBusInfo = pcieInfo -} - -func setElabelInfo(chip *HuaWeiAIChip, dmgr devmanager.DeviceInterface, cardID int32) { - elabelInfo, err := dmgr.GetCardElabelV2(cardID) - if err != nil { - logger.Errorf("get elabel info of card: %v failed: %v", cardID, err) - 
chip.ElabelInfo = &common.ElabelInfo{SerialNumber: "NA"} - return - } - chip.ElabelInfo = &common.ElabelInfo{ - SerialNumber: elabelInfo.SerialNumber, - } -} - -func assemblevNPUInfo(dmgr devmanager.DeviceInterface, logicID int32, baseChipInfo *HuaWeiAIChip) { - if dmgr.GetDevType() != api.Ascend310P { - return - } - vDevInfos, err := dmgr.GetVirtualDeviceInfo(logicID) - if err != nil { - logger.Warnf("failed to get virtual device info,logicID(%d),err: %v", logicID, err) - baseChipInfo.VDevInfos = nil - } - if vDevInfos.TotalResource.VDevNum == 0 { - baseChipInfo.VDevInfos = &common.VirtualDevInfo{} - } - baseChipInfo.VDevInfos = &vDevInfos -} - -// GetChipListWithVNPU get chip list with vnpu -func GetChipListWithVNPU(n *NpuCollector) []HuaWeiAIChip { - result := make([]HuaWeiAIChip, 0) - chips := getChipListCache(n) - - for _, chipInfo := range chips { - isNeedHandleVnpu := n.Dmgr.GetDevType() == api.Ascend310P && chipInfo.VDevInfos != nil && - len(chipInfo.VDevInfos.VDevActivityInfo) > 0 - - if !isNeedHandleVnpu { - result = append(result, chipInfo) - continue - } - - for _, activityVDev := range chipInfo.VDevInfos.VDevActivityInfo { - vDevInfo := chipInfo - activityVDevCopy := activityVDev - vDevInfo.VDevActivityInfo = &activityVDevCopy - result = append(result, vDevInfo) - } - } - - return result - -} -func getChipListCache(n *NpuCollector) []HuaWeiAIChip { - obj, err := n.cache.Get(npuListCacheKey) - if err != nil { - logger.Errorf("get npu chip list from cache failed,err is : %v", err) - return make([]HuaWeiAIChip, 0) - } - if obj == nil { - logger.LogfWithOptions(logger.ErrorLevel, logger.LogOptions{Domain: "getChipListCache"}, - "there is no chip list info in cache,please check collect logs") - return make([]HuaWeiAIChip, 0) - } - - chipList, ok := obj.([]HuaWeiAIChip) - if !ok { - logger.Errorf("error npu chip info cache and convert failed,real type is (%T)", obj) - n.cache.Delete(npuListCacheKey) - return make([]HuaWeiAIChip, 0) - } - // if cache is empty 
or nil, return empty list - if len(chipList) == 0 { - return make([]HuaWeiAIChip, 0) - } - return chipList -} diff --git a/mind-cluster/component/npu-exporter/collector/common/npu_collector_test.go b/mind-cluster/component/npu-exporter/collector/common/npu_collector_test.go deleted file mode 100644 index 722079b..0000000 --- a/mind-cluster/component/npu-exporter/collector/common/npu_collector_test.go +++ /dev/null @@ -1,547 +0,0 @@ -/* - * Copyright (c) Huawei Technologies Co., Ltd. 2021-2024. All rights reserved. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// Package common for general collector -package common - -import ( - "context" - "errors" - "strconv" - "sync" - "testing" - "time" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" - "github.com/stretchr/testify/assert" - - "ascend-common/api" - "ascend-common/common-utils/hwlog" - "ascend-common/devmanager" - "ascend-common/devmanager/common" - "huawei.com/npu-exporter/v6/collector/container" - "huawei.com/npu-exporter/v6/collector/container/isula" - "huawei.com/npu-exporter/v6/collector/container/v1" - "huawei.com/npu-exporter/v6/utils/logger" -) - -var ( - mockErr = errors.New("mockErr") - testError = errors.New(testErrorMsg) -) - -const ( - cacheTime = 60 * time.Second - npuCount = 8 - defaultUpdateTime = 10 * time.Millisecond - num2 = 2 - num100 = 100 - mockKey = "mockKey" - mockValue = "mockValue" - - // Test constants for setElabelInfo - testCardID = int32(1) - testProductName = "Atlas 900" - testModel = "Atlas-900-9000" - testManufacturer = "Huawei" - testManufacturerDate = "2023-01-01" - testSerialNumber = "SN123456789" - testDefaultSerial = "NA" - testErrorMsg = "get elabel info failed" -) - -type mockContainerRuntimeOperator struct{} - -// Init implements ContainerRuntimeOperator -func (operator *mockContainerRuntimeOperator) Init() error { - return nil -} - -// Close implements ContainerRuntimeOperator -func (operator *mockContainerRuntimeOperator) Close() error { - return nil -} - -// ContainerIDs implements ContainerRuntimeOperator -func (operator *mockContainerRuntimeOperator) GetContainers(ctx context.Context) ([]*container.CommonContainer, error) { - return []*container.CommonContainer{}, nil -} - -// GetContainerInfoByID implements ContainerRuntimeOperator -func (operator *mockContainerRuntimeOperator) GetContainerInfoByID(ctx context.Context, id string) (v1.Spec, error) { - return v1.Spec{}, nil -} - -// GetIsulaContainerInfoByID implements ContainerRuntimeOperator -func (operator 
*mockContainerRuntimeOperator) GetIsulaContainerInfoByID(ctx context.Context, - id string) (isula.ContainerJson, error) { - return isula.ContainerJson{}, nil -} - -// GetContainerType implements ContainerRuntimeOperator -func (operator *mockContainerRuntimeOperator) GetContainerType() string { - return container.DefaultContainer -} - -func mockScan4AscendDevices(_ string) ([]int, bool, error) { - return []int{1}, true, nil -} - -func mockGetCgroupPath(controller, specCgroupsPath string) (string, error) { - return "", nil -} - -func makeMockDevicesParser() *container.DevicesParser { - return &container.DevicesParser{ - RuntimeOperator: new(mockContainerRuntimeOperator), - } -} - -type newNpuCollectorTestCase struct { - cacheTime time.Duration - updateTime time.Duration - deviceParser *container.DevicesParser - dmgr *devmanager.DeviceManager -} - -// TestNewNpuCollector test method of NewNpuCollector -func TestNewNpuCollector(t *testing.T) { - tc := newNpuCollectorTestCase{ - cacheTime: cacheTime, - updateTime: defaultUpdateTime, - deviceParser: &container.DevicesParser{}, - dmgr: &devmanager.DeviceManager{}, - } - - c := NewNpuCollector(tc.cacheTime, tc.updateTime, tc.deviceParser, tc.dmgr) - - assert.NotNil(t, c) -} - -type testCase struct { - name string - wantErr bool - mockPart interface{} - expectValue interface{} - expectCount interface{} -} - -func newTestCase(name string, wantErr bool, mockPart interface{}) testCase { - return testCase{ - name: name, - wantErr: wantErr, - mockPart: mockPart, - } -} - -// TestGetChipInfo test method getChipInfo -func TestGetChipInfo(t *testing.T) { - tests := []testCase{ - newTestCase("should return chip info successfully when dsmi works normally", false, - &devmanager.DeviceManagerMock{}), - newTestCase("should return nil when dsmi works abnormally", true, &devmanager.DeviceManagerMockErr{}), - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - chipInfo := 
getNPUChipList(tt.mockPart.(devmanager.DeviceInterface)) - t.Logf("%#v", chipInfo) - assert.NotNil(t, chipInfo) - if tt.wantErr { - assert.Len(t, chipInfo, 0) - } else { - assert.NotNil(t, chipInfo) - } - }) - } -} - -func init() { - logger.HwLogConfig = &hwlog.LogConfig{ - OnlyToStdout: true, - } - logger.InitLogger("Prometheus") -} - -func mockGetNPUChipList() []HuaWeiAIChip { - chips := make([]HuaWeiAIChip, 0) - for id := int32(0); id < npuCount; id++ { - chip := HuaWeiAIChip{ - CardId: id, - PhyId: id, - DeviceID: id, - LogicID: id, - } - - chips = append(chips, chip) - } - return chips -} - -// TestInitCardInfo test method getChipInfo -func TestInitCardInfo(t *testing.T) { - patches := gomonkey.ApplyFuncReturn(getNPUChipList, mockGetNPUChipList()) - defer patches.Reset() - convey.Convey("test InitCardInfo", t, func() { - - ctx, cancelFunc := context.WithCancel(context.Background()) - defer cancelFunc() - npuCollector := mockNewNpuCollector() - - InitCardInfo(&sync.WaitGroup{}, ctx, npuCollector) - time.Sleep(time.Millisecond * num100) - cancelFunc() - chips := getChipListCache(npuCollector) - convey.So(len(chips), convey.ShouldEqual, npuCount) - }) -} - -// TestGetChipListCache test method getChipListCache -func TestGetChipListCache(t *testing.T) { - npuCollector := mockNewNpuCollector() - tests := []testCase{ - {name: "should return 0 chips when cache is nil", wantErr: false, mockPart: func() {}, expectCount: 0}, - {name: "should return chips : " + strconv.Itoa(npuCount), expectCount: npuCount, wantErr: false, - mockPart: func() { npuCollector.cache.Set(npuListCacheKey, mockGetNPUChipList(), cacheTime) }}, - {name: "should return 0 chips when cache value is nil", wantErr: false, expectCount: 0, - mockPart: func() { npuCollector.cache.Set(npuListCacheKey, nil, cacheTime) }}, - {name: "should return 0 chips when value is a incorrect type", expectCount: 0, wantErr: false, - mockPart: func() { npuCollector.cache.Set(npuListCacheKey, &HuaWeiAIChip{}, cacheTime) 
}}, - {name: "should return 0 chips when cache is empty", expectCount: 0, wantErr: false, - mockPart: func() { npuCollector.cache.Set(npuListCacheKey, []HuaWeiAIChip{}, cacheTime) }, - }, - } - - convey.Convey("getChipListCache", t, func() { - for _, tt := range tests { - convey.Convey(tt.name, func() { - tt.mockPart.(func())() - chips := getChipListCache(npuCollector) - assert.Len(t, chips, tt.expectCount.(int)) - convey.So(len(chips), convey.ShouldEqual, tt.expectCount) - }) - } - }) -} - -func mockNewNpuCollector() *NpuCollector { - tc := newNpuCollectorTestCase{ - cacheTime: cacheTime, - updateTime: defaultUpdateTime, - deviceParser: &container.DevicesParser{}, - dmgr: &devmanager.DeviceManager{}, - } - c := NewNpuCollector(tc.cacheTime, tc.updateTime, tc.deviceParser, tc.dmgr) - return c -} - -func TestNpuChipInfoInitAtFirstTime(t *testing.T) { - n := mockNewNpuCollector() - convey.Convey("TestNpuChipInfoInitAtFirstTime", t, func() { - patches := gomonkey.NewPatches() - defer patches.Reset() - patches.ApplyFuncReturn(getNPUChipList, []HuaWeiAIChip{{CardId: 0}}) - // do test - npuChipInfoInitAtFirstTime(n) - // valid cache - data, err := n.cache.Get(npuListCacheKey) - convey.So(err, convey.ShouldBeNil) - chips, ok := data.([]HuaWeiAIChip) - convey.So(ok, convey.ShouldBeTrue) - convey.So(len(chips), convey.ShouldEqual, 1) - }) -} - -func patchCollectToCache() *gomonkey.Patches { - return gomonkey.ApplyMethod(&MetricsCollectorAdapter{}, "CollectToCache", - func(_ *MetricsCollectorAdapter, n *NpuCollector, chipList []HuaWeiAIChip) { - n.cache.Set(mockKey, mockValue, n.cacheTime) - }) -} - -func TestStartCollectForMultiGoroutine(t *testing.T) { - n := mockNewNpuCollector() - wg := sync.WaitGroup{} - ChainForMultiGoroutine = []MetricsCollector{ - &MetricsCollectorAdapter{}, - &MetricsCollectorAdapter{}, - } - patches := patchCollectToCache() - defer patches.Reset() - patches.ApplyFuncReturn(getChipListCache, []HuaWeiAIChip{createChip()}) - 
convey.Convey("TestStartCollectForMultiGoroutine", t, func() { - ctx, cancel := context.WithCancel(context.Background()) - startCollectForMultiGoroutine(&wg, ctx, n) - time.Sleep(n.updateTime) - cancel() - data, err := n.cache.Get(mockKey) - convey.So(err, convey.ShouldBeNil) - value, ok := data.(string) - convey.So(ok, convey.ShouldBeTrue) - convey.So(value, convey.ShouldEqual, mockValue) - }) -} - -func TestRunChipCollector(t *testing.T) { - n := mockNewNpuCollector() - patches := patchCollectToCache() - defer patches.Reset() - convey.Convey("TestRunChipCollector", t, func() { - ctx, cancel := context.WithCancel(context.Background()) - tickCh := make(chan time.Time) - patches.ApplyFuncReturn(time.NewTicker, &time.Ticker{C: tickCh}) - close(tickCh) - go runChipCollector(ctx, n, createChip()) - time.Sleep(n.updateTime) - cancel() - data, err := n.cache.Get(mockKey) - convey.So(err, convey.ShouldBeNil) - value, ok := data.(string) - convey.So(ok, convey.ShouldBeTrue) - convey.So(value, convey.ShouldEqual, mockValue) - }) -} - -func TestStartCollectSingleGoroutine(t *testing.T) { - n := mockNewNpuCollector() - wg := sync.WaitGroup{} - ChainForSingleGoroutine = []MetricsCollector{ - &MetricsCollectorAdapter{}, - } - patches := patchCollectToCache() - defer patches.Reset() - convey.Convey("TestStartCollectSingleGoroutine", t, func() { - ctx, cancel := context.WithCancel(context.Background()) - startCollectSingleGoroutine(&wg, ctx, n) - time.Sleep(n.updateTime) - cancel() - data, err := n.cache.Get(mockKey) - convey.So(err, convey.ShouldBeNil) - value, ok := data.(string) - convey.So(ok, convey.ShouldBeTrue) - convey.So(value, convey.ShouldEqual, mockValue) - }) -} - -type chipsCase struct { - name string - devType string - buildChips func() - expectValue int -} - -func TestGetChipListWithVNPU(t *testing.T) { - n := mockNewNpuCollector() - chip := HuaWeiAIChip{} - tests := []chipsCase{ - {name: "TestGetChipListWithVNPU_310p_no_vnpu", - devType: api.Ascend310P, - 
buildChips: func() { - chip = createChip() - }, - expectValue: 1, - }, - {name: "TestGetChipListWithVNPU_310p_2_vnpus", - devType: api.Ascend310P, - buildChips: func() { - chip = createValidVnpuChip() - }, - expectValue: num2, - }, - {name: "TestGetChipListWithVNPU_910", - devType: api.Ascend910, - buildChips: func() { - chip = createChip() - }, - expectValue: 1, - }, - } - - convey.Convey("TestGetChipListWithVNPU", t, func() { - for _, tt := range tests { - convey.Convey(tt.name, func() { - tt.buildChips() - patches := gomonkey.NewPatches() - defer patches.Reset() - patches.ApplyMethodReturn(n.Dmgr, "GetDevType", tt.devType) - patches.ApplyFuncReturn(getChipListCache, []HuaWeiAIChip{chip}) - - chips := GetChipListWithVNPU(n) - convey.So(len(chips), convey.ShouldEqual, tt.expectValue) - }) - } - }) -} - -func createValidVnpuChip() HuaWeiAIChip { - chip := createChip() - chip.VDevInfos = &common.VirtualDevInfo{ - VDevActivityInfo: []common.VDevActivityInfo{ - { - VDevID: 0, - VDevAiCore: 0, - VDevTotalMem: 0, - VDevUsedMem: 0, - IsVirtualDev: true, - }, - { - VDevID: 1, - VDevAiCore: 1, - VDevTotalMem: 1, - VDevUsedMem: 1, - IsVirtualDev: true, - }, - }, - } - return chip -} - -func createChip() HuaWeiAIChip { - return HuaWeiAIChip{ - CardId: 0, - PhyId: 0, - DeviceID: 0, - LogicID: 0, - ChipInfo: &common.ChipInfo{ - Name: api.Ascend910, - Type: "Ascend", - Version: "V1", - }, - } -} - -func TestSetPCIeBusInfo(t *testing.T) { - const mockPcieBus = "0000:01:00.0" - tests := []struct { - name string - productTypes []string - err error - expectValue string - }{{ - name: "TestSetPCIeBusInfo_910", - productTypes: []string{api.Ascend910}, - err: nil, - expectValue: mockPcieBus, - }, { - name: "TestSetPCIeBusInfo_910_err", - productTypes: []string{api.Ascend910}, - err: mockErr, - expectValue: "", - }, { - name: "TestSetPCIeBusInfo_Atlas200ISoc", - productTypes: []string{common.Atlas200ISoc}, - err: nil, - expectValue: mockPcieBus, - }, { - name: 
"TestSetPCIeBusInfo_Atlas200ISoc_err", - productTypes: []string{common.Atlas200ISoc}, - err: mockErr, - expectValue: "", - }} - chip := createChip() - convey.Convey("TestSetPCIeBusInfo", t, func() { - for _, tt := range tests { - convey.Convey(tt.name, func() { - dmgr := &devmanager.DeviceManager{ProductTypes: tt.productTypes} - patches := gomonkey.NewPatches() - defer patches.Reset() - patches.ApplyMethodReturn(dmgr, "GetPCIeBusInfo", mockPcieBus, tt.err) - - setPCIeBusInfo(0, dmgr, &chip) - convey.So(chip.PCIeBusInfo, convey.ShouldEqual, tt.expectValue) - }) - } - }) -} - -type setElabelInfoTestCase struct { - name string - cardID int32 - mockElabelInfo common.ElabelInfo - mockError error - expectSerial string - expectProduct string - expectModel string - expectManufacturer string - expectManufacturerDate string -} - -func createSetElabelInfoTestCases() []setElabelInfoTestCase { - return []setElabelInfoTestCase{ - { - name: "should set elabel info successfully when GetCardElabelV2 returns valid data", - cardID: testCardID, - mockElabelInfo: common.ElabelInfo{ - ProductName: testProductName, - Model: testModel, - Manufacturer: testManufacturer, - ManufacturerDate: testManufacturerDate, - SerialNumber: testSerialNumber, - }, - mockError: nil, - expectSerial: testSerialNumber, - expectProduct: testProductName, - expectModel: testModel, - expectManufacturer: testManufacturer, - expectManufacturerDate: testManufacturerDate, - }, - { - name: "should set default elabel info when GetCardElabelV2 returns error", - cardID: testCardID, - mockElabelInfo: common.ElabelInfo{}, - mockError: testError, - expectSerial: testDefaultSerial, - expectProduct: "", - expectModel: "", - expectManufacturer: "", - expectManufacturerDate: "", - }, - } -} - -func executeSetElabelInfoTest(tc setElabelInfoTestCase) { - // Create mock device manager - mockDmgr := &devmanager.DeviceManager{} - - // Create test chip - chip := &HuaWeiAIChip{} - - // Apply gomonkey patches - patches := 
gomonkey.NewPatches() - defer patches.Reset() - - patches.ApplyMethodReturn(mockDmgr, "GetCardElabelV2", - tc.mockElabelInfo, tc.mockError) - - // Execute the function under test - setElabelInfo(chip, mockDmgr, tc.cardID) - - // Verify results - convey.So(chip.ElabelInfo, convey.ShouldNotBeNil) - convey.So(chip.ElabelInfo.SerialNumber, convey.ShouldEqual, tc.expectSerial) -} - -// TestSetElabelInfo test setElabelInfo method -func TestSetElabelInfo(t *testing.T) { - testCases := createSetElabelInfoTestCases() - - convey.Convey("TestSetElabelInfo", t, func() { - for _, tc := range testCases { - convey.Convey(tc.name, func() { - executeSetElabelInfoTest(tc) - }) - } - }) -} diff --git a/mind-cluster/component/npu-exporter/collector/common/types.go b/mind-cluster/component/npu-exporter/collector/common/types.go deleted file mode 100644 index 4576c85..0000000 --- a/mind-cluster/component/npu-exporter/collector/common/types.go +++ /dev/null @@ -1,50 +0,0 @@ -/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package common for collector -package common - -import ( - "ascend-common/devmanager/common" -) - -// HuaWeiAIChip chip info -type HuaWeiAIChip struct { - - // CardId npu card id - CardId int32 `json:"card_id"` - // PhyId npu chip phy id - PhyId int32 `json:"phy_id"` - // DeviceID the chip physic ID - DeviceID int32 `json:"device_id"` - // the chip logic ID - LogicID int32 `json:"logic_id"` - // VDieID the vdie id - VDieID string `json:"vdie_id"` - // MainBoardId main board id , used to distinguish between A900A3SuperPod and A9000A3SuperPod - MainBoardId uint32 - // ChipInfo the chip info - ChipInfo *common.ChipInfo `json:"chip_info"` - // BoardInfo board info of device, but not display - BoardInfo *common.BoardInfo - - // VDevActivityInfo the activity virtual device info - VDevActivityInfo *common.VDevActivityInfo `json:"v_dev_activity_info"` - // VDevInfos the virtual device info - VDevInfos *common.VirtualDevInfo `json:"v_dev_infos"` - // PCIeBusInfo bus info - PCIeBusInfo string - // ElabelInfo elabel info including SN - ElabelInfo *common.ElabelInfo `json:"elabel_info"` -} diff --git a/mind-cluster/component/npu-exporter/collector/config/metrics_config.go b/mind-cluster/component/npu-exporter/collector/config/metrics_config.go deleted file mode 100644 index be32832..0000000 --- a/mind-cluster/component/npu-exporter/collector/config/metrics_config.go +++ /dev/null @@ -1,208 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package config for general collector -package config - -import ( - "encoding/json" - "fmt" - "reflect" - - "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/metrics" - "huawei.com/npu-exporter/v6/utils/logger" - - "ascend-common/common-utils/utils" -) - -var ( - // singleGoroutineMap metrics in this map will be collected in single goroutine - singleGoroutineMap = map[string]common.MetricsCollector{ - groupHccs: &metrics.HccsCollector{}, - groupNpu: &metrics.BaseInfoCollector{}, - groupSio: &metrics.SioCollector{}, - groupVersion: &metrics.VersionCollector{}, - groupHbm: &metrics.HbmCollector{}, - groupDDR: &metrics.DdrCollector{}, - groupVnpu: &metrics.VnpuCollector{}, - groupPcie: &metrics.PcieCollector{}, - } - // multiGoroutineMap metrics in this map will be collected in multi goroutine - multiGoroutineMap = map[string]common.MetricsCollector{ - groupNetwork: &metrics.NetworkCollector{}, - groupRoce: &metrics.RoceCollector{}, - groupOptical: &metrics.OpticalCollector{}, - } - // pluginCollectorMap metrics in this map will be collected in plugin goroutine - pluginCollectorMap = map[string]common.MetricsCollector{} - presetConfigs = make([]map[string]string, 0) - pluginConfigs = make([]map[string]string, 0) - - defaultPresetConfigs = []map[string]string{ - {metricsGroup: groupDDR, state: stateOn}, - {metricsGroup: groupHccs, state: stateOn}, - {metricsGroup: groupNpu, state: stateOn}, - {metricsGroup: groupNetwork, state: stateOn}, - {metricsGroup: groupPcie, state: stateOn}, - {metricsGroup: groupRoce, state: stateOn}, - {metricsGroup: groupSio, state: stateOn}, - {metricsGroup: groupVnpu, state: stateOn}, - {metricsGroup: groupVersion, state: stateOn}, - {metricsGroup: groupOptical, state: stateOn}, - {metricsGroup: groupHbm, state: stateOn}, - } - defaultPluginConfigs = []map[string]string{ - {metricsGroup: 
groupText, state: stateOn}, - } -) - -const ( - metricsGroup = "metricsGroup" - state = "state" - - groupDDR = "ddr" - groupHccs = "hccs" - groupNpu = "npu" - groupNetwork = "network" - groupPcie = "pcie" - groupRoce = "roce" - groupSio = "sio" - groupVnpu = "vnpu" - groupVersion = "version" - groupOptical = "optical" - groupHbm = "hbm" - groupText = "text" - - stateOn = "ON" - stateOFF = "OFF" -) - -const ( - PresetConfigPath = "/usr/local/metricConfiguration.json" - PluginConfigPath = "/usr/local/pluginConfiguration.json" -) - -func loadConfiguration() { - if fileBytes := loadFromFile(PresetConfigPath); fileBytes == nil { - logger.Warnf("load config from file %s failed, use default config", PresetConfigPath) - presetConfigs = defaultPresetConfigs - } else { - initConfiguration(fileBytes, &presetConfigs) - } - if fileBytes := loadFromFile(PluginConfigPath); fileBytes == nil { - logger.Warnf("load config from file %s failed, use default config", PluginConfigPath) - pluginConfigs = defaultPluginConfigs - } else { - initConfiguration(fileBytes, &pluginConfigs) - } -} - -func loadFromFile(filePath string) []byte { - fileBytes, err := utils.LoadFile(filePath) - if err != nil { - return nil - } - return fileBytes -} - -func initConfiguration(fileBytes []byte, configs *[]map[string]string) { - if err := json.Unmarshal(fileBytes, configs); err != nil { - logger.Errorf("unmarshal config byte failed: %v", err) - return - } -} - -// AddPluginCollector add plugin collector to cache -func AddPluginCollector(name string, collector common.MetricsCollector) error { - if _, exist := pluginCollectorMap[name]; exist { - logger.Errorf("plugin collector %v already exist", name) - return fmt.Errorf("plugin collector %v already exist", name) - } - logger.Infof("add plugin collector %v ok", name) - pluginCollectorMap[name] = collector - return nil -} - -// DeletePluginCollector delete plugin collector from cache -func DeletePluginCollector(name string) { - if _, exist := 
pluginCollectorMap[name]; !exist { - logger.Warnf("plugin collector %v does not exist", name) - return - } - logger.Infof("delete plugin collector %v ok", name) - delete(pluginCollectorMap, name) -} - -// Register register collector to cache -func Register(n *common.NpuCollector) { - loadConfiguration() - - for _, config := range presetConfigs { - metricsGroupName := config[metricsGroup] - - if config[state] != stateOn { - logger.Infof("metricsGroup [%v] is off", metricsGroupName) - continue - } - logger.Infof("metricsGroup [%v] is on", metricsGroupName) - collector, exist := singleGoroutineMap[metricsGroupName] - if exist && collector.IsSupported(n) { - common.ChainForSingleGoroutine = append(common.ChainForSingleGoroutine, collector) - } - - collector, exist = multiGoroutineMap[metricsGroupName] - if exist && collector.IsSupported(n) { - common.ChainForMultiGoroutine = append(common.ChainForMultiGoroutine, collector) - } - } - - for _, config := range pluginConfigs { - metricsGroupName := config[metricsGroup] - - if config[state] != stateOn { - logger.Infof("plugin collector [%v] is off", metricsGroupName) - continue - } - logger.Infof("plugin collector [%v] is on", metricsGroupName) - collector, exist := pluginCollectorMap[metricsGroupName] - if exist && collector.IsSupported(n) { - logger.Infof("add plugin collector:%v", metricsGroupName) - common.ChainForCustomPlugin = append(common.ChainForCustomPlugin, collector) - } - - } - - logger.Infof("ChainForSingleGoroutine:%#v", common.ChainForSingleGoroutine) - logger.Infof("ChainForMultiGoroutine:%#v", common.ChainForMultiGoroutine) - logger.Infof("ChainForCustomPlugin:%#v", common.ChainForCustomPlugin) -} - -// UnRegister delete collector from chain -func UnRegister(worker reflect.Type) { - logger.Debugf("unRegister collector:%v", worker) - unRegisterChain(worker, &common.ChainForSingleGoroutine) - unRegisterChain(worker, &common.ChainForMultiGoroutine) - unRegisterChain(worker, &common.ChainForCustomPlugin) -} - 
-func unRegisterChain(worker reflect.Type, chain *[]common.MetricsCollector) { - newChain := make([]common.MetricsCollector, 0) - for _, collector := range *chain { - if reflect.TypeOf(collector) != worker { - newChain = append(newChain, collector) - } - } - *chain = newChain -} diff --git a/mind-cluster/component/npu-exporter/collector/config/metrics_config_test.go b/mind-cluster/component/npu-exporter/collector/config/metrics_config_test.go deleted file mode 100644 index 974ed3e..0000000 --- a/mind-cluster/component/npu-exporter/collector/config/metrics_config_test.go +++ /dev/null @@ -1,216 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package config for general collector -package config - -import ( - "ascend-common/common-utils/utils" - "reflect" - "testing" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" - - "ascend-common/common-utils/hwlog" - "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/metrics" - "huawei.com/npu-exporter/v6/utils/logger" -) - -func init() { - logger.HwLogConfig = &hwlog.LogConfig{ - OnlyToStdout: true, - } - logger.InitLogger("Prometheus") - initChain() -} - -func initChain() { - common.ChainForSingleGoroutine = []common.MetricsCollector{} - common.ChainForMultiGoroutine = []common.MetricsCollector{} -} - -func TestInitConfiguration(t *testing.T) { - convey.Convey("TestInitConfiguration", t, func() { - initConfiguration([]byte("test"), &presetConfigs) - convey.So(len(presetConfigs), convey.ShouldEqual, 0) - }) -} - -func TestLoadConfiguration(t *testing.T) { - convey.Convey("TestLoadConfiguration", t, func() { - patches := gomonkey.NewPatches() - defer patches.Reset() - convey.Convey("load config ok", func() { - patches.ApplyFunc(loadFromFile, func(filePath string) []byte { - if filePath == PresetConfigPath { - filePath = "../../build/metricConfiguration.json" - } else if filePath == PluginConfigPath { - filePath = "../../build/pluginConfiguration.json" - } - fileBytes, _ := utils.LoadFile(filePath) - return fileBytes - }) - defer func() { - presetConfigs = make([]map[string]string, 0) - pluginConfigs = make([]map[string]string, 0) - }() - loadConfiguration() - convey.So(len(presetConfigs), convey.ShouldBeGreaterThan, 0) - convey.So(len(pluginConfigs), convey.ShouldBeGreaterThan, 0) - }) - convey.Convey("load config fail", func() { - presetConfigs = make([]map[string]string, 0) - pluginConfigs = make([]map[string]string, 0) - patches.ApplyFunc(loadFromFile, func(filePath string) []byte { - return nil - }) - loadConfiguration() - convey.So(len(presetConfigs), convey.ShouldEqual, 
len(defaultPresetConfigs)) - convey.So(len(pluginConfigs), convey.ShouldEqual, len(defaultPluginConfigs)) - }) - }) -} - -func TestAddPluginCollector(t *testing.T) { - convey.Convey("TestAddPluginCollector", t, func() { - convey.Convey("add plugin ok", func() { - pluginCollectorMap = make(map[string]common.MetricsCollector) - defer func() { - pluginCollectorMap = make(map[string]common.MetricsCollector) - }() - err := AddPluginCollector("test", &metrics.HccsCollector{}) - convey.So(err, convey.ShouldBeNil) - }) - convey.Convey("add plugin fail", func() { - pluginCollectorMap["test"] = &metrics.HccsCollector{} - defer func() { - pluginCollectorMap = make(map[string]common.MetricsCollector) - }() - err := AddPluginCollector("test", &metrics.HccsCollector{}) - convey.So(err, convey.ShouldNotBeNil) - }) - }) -} - -func TestDeletePluginCollector(t *testing.T) { - convey.Convey("TestDeletePluginCollector", t, func() { - convey.Convey("delete plugin ok", func() { - pluginCollectorMap["test"] = &metrics.HccsCollector{} - DeletePluginCollector("test") - convey.So(pluginCollectorMap["test"], convey.ShouldBeNil) - }) - convey.Convey("delete plugin fail", func() { - pluginCollectorMap = make(map[string]common.MetricsCollector) - DeletePluginCollector("test") - convey.So(len(pluginCollectorMap), convey.ShouldEqual, 0) - }) - }) -} - -func TestRegister(t *testing.T) { - convey.Convey("TestRegister", t, func() { - n := &common.NpuCollector{} - patches := gomonkey.NewPatches() - defer patches.Reset() - // Mock IsSupported method to always return true - patches.ApplyMethodReturn(&metrics.HccsCollector{}, "IsSupported", true) - patches.ApplyMethodReturn(&metrics.BaseInfoCollector{}, "IsSupported", true) - patches.ApplyMethodReturn(&metrics.SioCollector{}, "IsSupported", true) - patches.ApplyMethodReturn(&metrics.VersionCollector{}, "IsSupported", true) - patches.ApplyMethodReturn(&metrics.HbmCollector{}, "IsSupported", true) - patches.ApplyMethodReturn(&metrics.DdrCollector{}, 
"IsSupported", true) - patches.ApplyMethodReturn(&metrics.VnpuCollector{}, "IsSupported", true) - patches.ApplyMethodReturn(&metrics.PcieCollector{}, "IsSupported", true) - patches.ApplyMethodReturn(&metrics.NetworkCollector{}, "IsSupported", true) - patches.ApplyMethodReturn(&metrics.RoceCollector{}, "IsSupported", true) - patches.ApplyMethodReturn(&metrics.OpticalCollector{}, "IsSupported", true) - patches.ApplyFunc(loadConfiguration, func() { - initConfiguration(loadFromFile("../../build/metricConfiguration.json"), &presetConfigs) - initConfiguration(loadFromFile("../../build/pluginConfiguration.json"), &pluginConfigs) - }) - Register(n) - convey.Convey("Should add collectors to ChainForSingleGoroutine", func() { - convey.So(len(common.ChainForSingleGoroutine), convey.ShouldBeGreaterThan, 0) - }) - convey.Convey("Should add collectors to ChainForMultiGoroutine", func() { - convey.So(len(common.ChainForMultiGoroutine), convey.ShouldBeGreaterThan, 0) - }) - }) -} - -func TestUnRegister(t *testing.T) { - convey.Convey("TestUnRegister", t, func() { - // Initialize chains with some collectors - common.ChainForSingleGoroutine = []common.MetricsCollector{ - &metrics.HccsCollector{}, - &metrics.BaseInfoCollector{}, - } - common.ChainForMultiGoroutine = []common.MetricsCollector{ - &metrics.NetworkCollector{}, - &metrics.RoceCollector{}, - } - - convey.Convey("When UnRegister is called with HccsCollector type", func() { - UnRegister(reflect.TypeOf(&metrics.HccsCollector{})) - - convey.Convey("Should remove HccsCollector from ChainForSingleGoroutine", func() { - expected := []common.MetricsCollector{ - &metrics.BaseInfoCollector{}, - } - convey.So(len(common.ChainForSingleGoroutine), convey.ShouldEqual, len(expected)) - for i, collector := range common.ChainForSingleGoroutine { - convey.So(reflect.TypeOf(collector), convey.ShouldEqual, reflect.TypeOf(expected[i])) - } - }) - - convey.Convey("Should not affect ChainForMultiGoroutine", func() { - expected := 
[]common.MetricsCollector{ - &metrics.NetworkCollector{}, - &metrics.RoceCollector{}, - } - convey.So(len(common.ChainForMultiGoroutine), convey.ShouldEqual, len(expected)) - for i, collector := range common.ChainForMultiGoroutine { - convey.So(reflect.TypeOf(collector), convey.ShouldEqual, reflect.TypeOf(expected[i])) - } - }) - }) - }) -} - -func TestUnRegisterChain(t *testing.T) { - convey.Convey("TestUnRegisterChain", t, func() { - // Initialize a chain with some collectors - chain := []common.MetricsCollector{ - &metrics.HccsCollector{}, - &metrics.BaseInfoCollector{}, - &metrics.NetworkCollector{}, - } - - convey.Convey("When unRegisterChain is called with BaseInfoCollector type", func() { - unRegisterChain(reflect.TypeOf(&metrics.BaseInfoCollector{}), &chain) - convey.Convey("Should remove BaseInfoCollector from the chain", func() { - expected := []common.MetricsCollector{ - &metrics.HccsCollector{}, - &metrics.NetworkCollector{}, - } - convey.So(len(chain), convey.ShouldEqual, len(expected)) - for i, collector := range chain { - convey.So(reflect.TypeOf(collector), convey.ShouldEqual, reflect.TypeOf(expected[i])) - } - }) - }) - }) -} diff --git a/mind-cluster/component/npu-exporter/collector/container/isula/isula_api.pb.go b/mind-cluster/component/npu-exporter/collector/container/isula/isula_api.pb.go deleted file mode 100644 index 5ee3c7f..0000000 --- a/mind-cluster/component/npu-exporter/collector/container/isula/isula_api.pb.go +++ /dev/null @@ -1,870 +0,0 @@ -// -//Copyright 2018 The Kubernetes Authors. -//Copyright (c) Huawei Technologies Co., Ltd. 2019. All rights reserved. -//modify descripe: remove unused options for example: -//remove import "github.com/gogo/protobuf/gogoproto/gogo.proto" -// -//Licensed under the Apache License, Version 2.0 (the "License"); -//you may not use this file except in compliance with the License. 
-//You may obtain a copy of the License at -// -//http://www.apache.org/licenses/LICENSE-2.0 -// -//Unless required by applicable law or agreed to in writing, software -//distributed under the License is distributed on an "AS IS" BASIS, -//WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -//See the License for the specific language governing permissions and -//limitations under the License. - -// To regenerate api.pb.go run hack/update-generated-runtime.sh - -// Code generated by protoc-gen-go. DO NOT EDIT. -// versions: -// protoc-gen-go v1.28.1 -// protoc v3.13.0 -// source: isula_api.proto - -package isula - -import ( - protoreflect "google.golang.org/protobuf/reflect/protoreflect" - protoimpl "google.golang.org/protobuf/runtime/protoimpl" - reflect "reflect" - sync "sync" -) - -const ( - // Verify that this generated code is sufficiently up-to-date. - _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) - // Verify that runtime/protoimpl is sufficiently up-to-date. - _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) -) - -type ContainerState int32 - -const ( - ContainerState_CONTAINER_CREATED ContainerState = 0 - ContainerState_CONTAINER_RUNNING ContainerState = 1 - ContainerState_CONTAINER_EXITED ContainerState = 2 - ContainerState_CONTAINER_UNKNOWN ContainerState = 3 -) - -// Enum value maps for ContainerState. 
-var ( - ContainerState_name = map[int32]string{ - 0: "CONTAINER_CREATED", - 1: "CONTAINER_RUNNING", - 2: "CONTAINER_EXITED", - 3: "CONTAINER_UNKNOWN", - } - ContainerState_value = map[string]int32{ - "CONTAINER_CREATED": 0, - "CONTAINER_RUNNING": 1, - "CONTAINER_EXITED": 2, - "CONTAINER_UNKNOWN": 3, - } -) - -func (x ContainerState) Enum() *ContainerState { - p := new(ContainerState) - *p = x - return p -} - -func (x ContainerState) String() string { - return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) -} - -func (ContainerState) Descriptor() protoreflect.EnumDescriptor { - return file_isula_api_proto_enumTypes[0].Descriptor() -} - -func (ContainerState) Type() protoreflect.EnumType { - return &file_isula_api_proto_enumTypes[0] -} - -func (x ContainerState) Number() protoreflect.EnumNumber { - return protoreflect.EnumNumber(x) -} - -// Deprecated: Use ContainerState.Descriptor instead. -func (ContainerState) EnumDescriptor() ([]byte, []int) { - return file_isula_api_proto_rawDescGZIP(), []int{0} -} - -// ImageSpec is an internal representation of an image. Currently, it wraps the -// value of a Container's Image field (e.g. imageID or imageDigest), but in the -// future it will include more detailed information about the different image types. -type ImageSpec struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - Image string `protobuf:"bytes,1,opt,name=image,proto3" json:"image,omitempty"` - // Unstructured key-value map holding arbitrary metadata. - // ImageSpec Annotations can be used to help the runtime target specific - // images in multi-arch images. 
- Annotations map[string]string `protobuf:"bytes,2,rep,name=annotations,proto3" json:"annotations,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` -} - -func (x *ImageSpec) Reset() { - *x = ImageSpec{} - if protoimpl.UnsafeEnabled { - mi := &file_isula_api_proto_msgTypes[0] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *ImageSpec) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*ImageSpec) ProtoMessage() {} - -func (x *ImageSpec) ProtoReflect() protoreflect.Message { - mi := &file_isula_api_proto_msgTypes[0] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use ImageSpec.ProtoReflect.Descriptor instead. -func (*ImageSpec) Descriptor() ([]byte, []int) { - return file_isula_api_proto_rawDescGZIP(), []int{0} -} - -func (x *ImageSpec) GetImage() string { - if x != nil { - return x.Image - } - return "" -} - -func (x *ImageSpec) GetAnnotations() map[string]string { - if x != nil { - return x.Annotations - } - return nil -} - -// ContainerMetadata holds all necessary information for building the container -// name. The container runtime is encouraged to expose the metadata in its user -// interface for better user experience. E.g., runtime can construct a unique -// container name based on the metadata. Note that (name, attempt) is unique -// within a sandbox for the entire lifetime of the sandbox. -type ContainerMetadata struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - // Name of the container. Same as the container name in the PodSpec. - Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"` - // Attempt number of creating the container. Default: 0. 
- Attempt uint32 `protobuf:"varint,2,opt,name=attempt,proto3" json:"attempt,omitempty"` -} - -func (x *ContainerMetadata) Reset() { - *x = ContainerMetadata{} - if protoimpl.UnsafeEnabled { - mi := &file_isula_api_proto_msgTypes[1] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *ContainerMetadata) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*ContainerMetadata) ProtoMessage() {} - -func (x *ContainerMetadata) ProtoReflect() protoreflect.Message { - mi := &file_isula_api_proto_msgTypes[1] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use ContainerMetadata.ProtoReflect.Descriptor instead. -func (*ContainerMetadata) Descriptor() ([]byte, []int) { - return file_isula_api_proto_rawDescGZIP(), []int{1} -} - -func (x *ContainerMetadata) GetName() string { - if x != nil { - return x.Name - } - return "" -} - -func (x *ContainerMetadata) GetAttempt() uint32 { - if x != nil { - return x.Attempt - } - return 0 -} - -// ContainerStateValue is the wrapper of ContainerState. -type ContainerStateValue struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - // State of the container. 
- State ContainerState `protobuf:"varint,1,opt,name=state,proto3,enum=runtime.v1alpha2.ContainerState" json:"state,omitempty"` -} - -func (x *ContainerStateValue) Reset() { - *x = ContainerStateValue{} - if protoimpl.UnsafeEnabled { - mi := &file_isula_api_proto_msgTypes[2] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *ContainerStateValue) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*ContainerStateValue) ProtoMessage() {} - -func (x *ContainerStateValue) ProtoReflect() protoreflect.Message { - mi := &file_isula_api_proto_msgTypes[2] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use ContainerStateValue.ProtoReflect.Descriptor instead. -func (*ContainerStateValue) Descriptor() ([]byte, []int) { - return file_isula_api_proto_rawDescGZIP(), []int{2} -} - -func (x *ContainerStateValue) GetState() ContainerState { - if x != nil { - return x.State - } - return ContainerState_CONTAINER_CREATED -} - -// ContainerFilter is used to filter containers. -// All those fields are combined with 'AND' -type ContainerFilter struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - // ID of the container. - Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"` - // State of the container. - State *ContainerStateValue `protobuf:"bytes,2,opt,name=state,proto3" json:"state,omitempty"` - // ID of the PodSandbox. - PodSandboxId string `protobuf:"bytes,3,opt,name=pod_sandbox_id,json=podSandboxId,proto3" json:"pod_sandbox_id,omitempty"` - // LabelSelector to select matches. - // Only api.MatchLabels is supported for now and the requirements - // are ANDed. MatchExpressions is not supported yet. 
- LabelSelector map[string]string `protobuf:"bytes,4,rep,name=label_selector,json=labelSelector,proto3" json:"label_selector,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` -} - -func (x *ContainerFilter) Reset() { - *x = ContainerFilter{} - if protoimpl.UnsafeEnabled { - mi := &file_isula_api_proto_msgTypes[3] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *ContainerFilter) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*ContainerFilter) ProtoMessage() {} - -func (x *ContainerFilter) ProtoReflect() protoreflect.Message { - mi := &file_isula_api_proto_msgTypes[3] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use ContainerFilter.ProtoReflect.Descriptor instead. -func (*ContainerFilter) Descriptor() ([]byte, []int) { - return file_isula_api_proto_rawDescGZIP(), []int{3} -} - -func (x *ContainerFilter) GetId() string { - if x != nil { - return x.Id - } - return "" -} - -func (x *ContainerFilter) GetState() *ContainerStateValue { - if x != nil { - return x.State - } - return nil -} - -func (x *ContainerFilter) GetPodSandboxId() string { - if x != nil { - return x.PodSandboxId - } - return "" -} - -func (x *ContainerFilter) GetLabelSelector() map[string]string { - if x != nil { - return x.LabelSelector - } - return nil -} - -type ListContainersRequest struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - Filter *ContainerFilter `protobuf:"bytes,1,opt,name=filter,proto3" json:"filter,omitempty"` -} - -func (x *ListContainersRequest) Reset() { - *x = ListContainersRequest{} - if protoimpl.UnsafeEnabled { - mi := &file_isula_api_proto_msgTypes[4] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - 
ms.StoreMessageInfo(mi) - } -} - -func (x *ListContainersRequest) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*ListContainersRequest) ProtoMessage() {} - -func (x *ListContainersRequest) ProtoReflect() protoreflect.Message { - mi := &file_isula_api_proto_msgTypes[4] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use ListContainersRequest.ProtoReflect.Descriptor instead. -func (*ListContainersRequest) Descriptor() ([]byte, []int) { - return file_isula_api_proto_rawDescGZIP(), []int{4} -} - -func (x *ListContainersRequest) GetFilter() *ContainerFilter { - if x != nil { - return x.Filter - } - return nil -} - -// Container provides the runtime information for a container, such as ID, hash, -// state of the container. -type Container struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - // ID of the container, used by the container runtime to identify - // a container. - Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"` - // ID of the sandbox to which this container belongs. - PodSandboxId string `protobuf:"bytes,2,opt,name=pod_sandbox_id,json=podSandboxId,proto3" json:"pod_sandbox_id,omitempty"` - // Metadata of the container. - Metadata *ContainerMetadata `protobuf:"bytes,3,opt,name=metadata,proto3" json:"metadata,omitempty"` - // Spec of the image. - Image *ImageSpec `protobuf:"bytes,4,opt,name=image,proto3" json:"image,omitempty"` - // Reference to the image in use. For most runtimes, this should be an - // image ID. - ImageRef string `protobuf:"bytes,5,opt,name=image_ref,json=imageRef,proto3" json:"image_ref,omitempty"` - // State of the container. 
- State ContainerState `protobuf:"varint,6,opt,name=state,proto3,enum=runtime.v1alpha2.ContainerState" json:"state,omitempty"` - // Creation time of the container in nanoseconds. - CreatedAt int64 `protobuf:"varint,7,opt,name=created_at,json=createdAt,proto3" json:"created_at,omitempty"` - // Key-value pairs that may be used to scope and select individual resources. - Labels map[string]string `protobuf:"bytes,8,rep,name=labels,proto3" json:"labels,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` - // Unstructured key-value map holding arbitrary metadata. - // Annotations MUST NOT be altered by the runtime; the value of this field - // MUST be identical to that of the corresponding ContainerConfig used to - // instantiate this Container. - Annotations map[string]string `protobuf:"bytes,9,rep,name=annotations,proto3" json:"annotations,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` -} - -func (x *Container) Reset() { - *x = Container{} - if protoimpl.UnsafeEnabled { - mi := &file_isula_api_proto_msgTypes[5] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *Container) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*Container) ProtoMessage() {} - -func (x *Container) ProtoReflect() protoreflect.Message { - mi := &file_isula_api_proto_msgTypes[5] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use Container.ProtoReflect.Descriptor instead. 
-func (*Container) Descriptor() ([]byte, []int) { - return file_isula_api_proto_rawDescGZIP(), []int{5} -} - -func (x *Container) GetId() string { - if x != nil { - return x.Id - } - return "" -} - -func (x *Container) GetPodSandboxId() string { - if x != nil { - return x.PodSandboxId - } - return "" -} - -func (x *Container) GetMetadata() *ContainerMetadata { - if x != nil { - return x.Metadata - } - return nil -} - -func (x *Container) GetImage() *ImageSpec { - if x != nil { - return x.Image - } - return nil -} - -func (x *Container) GetImageRef() string { - if x != nil { - return x.ImageRef - } - return "" -} - -func (x *Container) GetState() ContainerState { - if x != nil { - return x.State - } - return ContainerState_CONTAINER_CREATED -} - -func (x *Container) GetCreatedAt() int64 { - if x != nil { - return x.CreatedAt - } - return 0 -} - -func (x *Container) GetLabels() map[string]string { - if x != nil { - return x.Labels - } - return nil -} - -func (x *Container) GetAnnotations() map[string]string { - if x != nil { - return x.Annotations - } - return nil -} - -type ListContainersResponse struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - // List of containers. 
- Containers []*Container `protobuf:"bytes,1,rep,name=containers,proto3" json:"containers,omitempty"` -} - -func (x *ListContainersResponse) Reset() { - *x = ListContainersResponse{} - if protoimpl.UnsafeEnabled { - mi := &file_isula_api_proto_msgTypes[6] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *ListContainersResponse) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*ListContainersResponse) ProtoMessage() {} - -func (x *ListContainersResponse) ProtoReflect() protoreflect.Message { - mi := &file_isula_api_proto_msgTypes[6] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use ListContainersResponse.ProtoReflect.Descriptor instead. -func (*ListContainersResponse) Descriptor() ([]byte, []int) { - return file_isula_api_proto_rawDescGZIP(), []int{6} -} - -func (x *ListContainersResponse) GetContainers() []*Container { - if x != nil { - return x.Containers - } - return nil -} - -var File_isula_api_proto protoreflect.FileDescriptor - -var file_isula_api_proto_rawDesc = []byte{ - 0x0a, 0x0f, 0x69, 0x73, 0x75, 0x6c, 0x61, 0x5f, 0x61, 0x70, 0x69, 0x2e, 0x70, 0x72, 0x6f, 0x74, - 0x6f, 0x12, 0x10, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, - 0x68, 0x61, 0x32, 0x22, 0xb1, 0x01, 0x0a, 0x09, 0x49, 0x6d, 0x61, 0x67, 0x65, 0x53, 0x70, 0x65, - 0x63, 0x12, 0x14, 0x0a, 0x05, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, - 0x52, 0x05, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x12, 0x4e, 0x0a, 0x0b, 0x61, 0x6e, 0x6e, 0x6f, 0x74, - 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x2c, 0x2e, 0x72, - 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, - 0x49, 0x6d, 0x61, 0x67, 0x65, 0x53, 0x70, 0x65, 0x63, 0x2e, 0x41, 
0x6e, 0x6e, 0x6f, 0x74, 0x61, - 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x52, 0x0b, 0x61, 0x6e, 0x6e, 0x6f, - 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x1a, 0x3e, 0x0a, 0x10, 0x41, 0x6e, 0x6e, 0x6f, 0x74, - 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x12, 0x10, 0x0a, 0x03, 0x6b, - 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x6b, 0x65, 0x79, 0x12, 0x14, 0x0a, - 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x22, 0x41, 0x0a, 0x11, 0x43, 0x6f, 0x6e, 0x74, 0x61, - 0x69, 0x6e, 0x65, 0x72, 0x4d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x12, 0x12, 0x0a, 0x04, - 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x6e, 0x61, 0x6d, 0x65, - 0x12, 0x18, 0x0a, 0x07, 0x61, 0x74, 0x74, 0x65, 0x6d, 0x70, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, - 0x0d, 0x52, 0x07, 0x61, 0x74, 0x74, 0x65, 0x6d, 0x70, 0x74, 0x22, 0x4d, 0x0a, 0x13, 0x43, 0x6f, - 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x53, 0x74, 0x61, 0x74, 0x65, 0x56, 0x61, 0x6c, 0x75, - 0x65, 0x12, 0x36, 0x0a, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0e, - 0x32, 0x20, 0x2e, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, - 0x68, 0x61, 0x32, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x53, 0x74, 0x61, - 0x74, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x22, 0xa3, 0x02, 0x0a, 0x0f, 0x43, 0x6f, - 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x46, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x12, 0x0e, 0x0a, - 0x02, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x3b, 0x0a, - 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x25, 0x2e, 0x72, - 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, - 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x53, 0x74, 0x61, 0x74, 0x65, 0x56, 0x61, - 0x6c, 
0x75, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x12, 0x24, 0x0a, 0x0e, 0x70, 0x6f, - 0x64, 0x5f, 0x73, 0x61, 0x6e, 0x64, 0x62, 0x6f, 0x78, 0x5f, 0x69, 0x64, 0x18, 0x03, 0x20, 0x01, - 0x28, 0x09, 0x52, 0x0c, 0x70, 0x6f, 0x64, 0x53, 0x61, 0x6e, 0x64, 0x62, 0x6f, 0x78, 0x49, 0x64, - 0x12, 0x5b, 0x0a, 0x0e, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x5f, 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, - 0x6f, 0x72, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x34, 0x2e, 0x72, 0x75, 0x6e, 0x74, 0x69, - 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, 0x43, 0x6f, 0x6e, 0x74, - 0x61, 0x69, 0x6e, 0x65, 0x72, 0x46, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x2e, 0x4c, 0x61, 0x62, 0x65, - 0x6c, 0x53, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x52, 0x0d, - 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x53, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x1a, 0x40, 0x0a, - 0x12, 0x4c, 0x61, 0x62, 0x65, 0x6c, 0x53, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x45, 0x6e, - 0x74, 0x72, 0x79, 0x12, 0x10, 0x0a, 0x03, 0x6b, 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, - 0x52, 0x03, 0x6b, 0x65, 0x79, 0x12, 0x14, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, - 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x22, - 0x52, 0x0a, 0x15, 0x4c, 0x69, 0x73, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, - 0x73, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x39, 0x0a, 0x06, 0x66, 0x69, 0x6c, 0x74, - 0x65, 0x72, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x21, 0x2e, 0x72, 0x75, 0x6e, 0x74, 0x69, - 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, 0x43, 0x6f, 0x6e, 0x74, - 0x61, 0x69, 0x6e, 0x65, 0x72, 0x46, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x52, 0x06, 0x66, 0x69, 0x6c, - 0x74, 0x65, 0x72, 0x22, 0xb5, 0x04, 0x0a, 0x09, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, - 0x72, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, - 0x64, 0x12, 0x24, 0x0a, 0x0e, 0x70, 0x6f, 
0x64, 0x5f, 0x73, 0x61, 0x6e, 0x64, 0x62, 0x6f, 0x78, - 0x5f, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0c, 0x70, 0x6f, 0x64, 0x53, 0x61, - 0x6e, 0x64, 0x62, 0x6f, 0x78, 0x49, 0x64, 0x12, 0x3f, 0x0a, 0x08, 0x6d, 0x65, 0x74, 0x61, 0x64, - 0x61, 0x74, 0x61, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x23, 0x2e, 0x72, 0x75, 0x6e, 0x74, - 0x69, 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, 0x43, 0x6f, 0x6e, - 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x4d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x52, 0x08, - 0x6d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x12, 0x31, 0x0a, 0x05, 0x69, 0x6d, 0x61, 0x67, - 0x65, 0x18, 0x04, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1b, 0x2e, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, - 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, 0x49, 0x6d, 0x61, 0x67, 0x65, - 0x53, 0x70, 0x65, 0x63, 0x52, 0x05, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x12, 0x1b, 0x0a, 0x09, 0x69, - 0x6d, 0x61, 0x67, 0x65, 0x5f, 0x72, 0x65, 0x66, 0x18, 0x05, 0x20, 0x01, 0x28, 0x09, 0x52, 0x08, - 0x69, 0x6d, 0x61, 0x67, 0x65, 0x52, 0x65, 0x66, 0x12, 0x36, 0x0a, 0x05, 0x73, 0x74, 0x61, 0x74, - 0x65, 0x18, 0x06, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x20, 0x2e, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, - 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x61, - 0x69, 0x6e, 0x65, 0x72, 0x53, 0x74, 0x61, 0x74, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, - 0x12, 0x1d, 0x0a, 0x0a, 0x63, 0x72, 0x65, 0x61, 0x74, 0x65, 0x64, 0x5f, 0x61, 0x74, 0x18, 0x07, - 0x20, 0x01, 0x28, 0x03, 0x52, 0x09, 0x63, 0x72, 0x65, 0x61, 0x74, 0x65, 0x64, 0x41, 0x74, 0x12, - 0x3f, 0x0a, 0x06, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x73, 0x18, 0x08, 0x20, 0x03, 0x28, 0x0b, 0x32, - 0x27, 0x2e, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, - 0x61, 0x32, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x2e, 0x4c, 0x61, 0x62, - 0x65, 0x6c, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x52, 0x06, 0x6c, 0x61, 0x62, 
0x65, 0x6c, 0x73, - 0x12, 0x4e, 0x0a, 0x0b, 0x61, 0x6e, 0x6e, 0x6f, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x18, - 0x09, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x2c, 0x2e, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x2e, - 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, - 0x65, 0x72, 0x2e, 0x41, 0x6e, 0x6e, 0x6f, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x45, 0x6e, - 0x74, 0x72, 0x79, 0x52, 0x0b, 0x61, 0x6e, 0x6e, 0x6f, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, - 0x1a, 0x39, 0x0a, 0x0b, 0x4c, 0x61, 0x62, 0x65, 0x6c, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x12, - 0x10, 0x0a, 0x03, 0x6b, 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x6b, 0x65, - 0x79, 0x12, 0x14, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, - 0x52, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x1a, 0x3e, 0x0a, 0x10, 0x41, - 0x6e, 0x6e, 0x6f, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x12, - 0x10, 0x0a, 0x03, 0x6b, 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x6b, 0x65, - 0x79, 0x12, 0x14, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, - 0x52, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x22, 0x55, 0x0a, 0x16, 0x4c, - 0x69, 0x73, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x73, 0x52, 0x65, 0x73, - 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x3b, 0x0a, 0x0a, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, - 0x65, 0x72, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x1b, 0x2e, 0x72, 0x75, 0x6e, 0x74, - 0x69, 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, 0x43, 0x6f, 0x6e, - 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x52, 0x0a, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, - 0x72, 0x73, 0x2a, 0x6b, 0x0a, 0x0e, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x53, - 0x74, 0x61, 0x74, 0x65, 0x12, 0x15, 0x0a, 0x11, 0x43, 0x4f, 0x4e, 0x54, 0x41, 0x49, 0x4e, 0x45, - 0x52, 0x5f, 0x43, 
0x52, 0x45, 0x41, 0x54, 0x45, 0x44, 0x10, 0x00, 0x12, 0x15, 0x0a, 0x11, 0x43, - 0x4f, 0x4e, 0x54, 0x41, 0x49, 0x4e, 0x45, 0x52, 0x5f, 0x52, 0x55, 0x4e, 0x4e, 0x49, 0x4e, 0x47, - 0x10, 0x01, 0x12, 0x14, 0x0a, 0x10, 0x43, 0x4f, 0x4e, 0x54, 0x41, 0x49, 0x4e, 0x45, 0x52, 0x5f, - 0x45, 0x58, 0x49, 0x54, 0x45, 0x44, 0x10, 0x02, 0x12, 0x15, 0x0a, 0x11, 0x43, 0x4f, 0x4e, 0x54, - 0x41, 0x49, 0x4e, 0x45, 0x52, 0x5f, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x10, 0x03, 0x32, - 0x77, 0x0a, 0x0e, 0x52, 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x53, 0x65, 0x72, 0x76, 0x69, 0x63, - 0x65, 0x12, 0x65, 0x0a, 0x0e, 0x4c, 0x69, 0x73, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, - 0x65, 0x72, 0x73, 0x12, 0x27, 0x2e, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x2e, 0x76, 0x31, - 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, 0x4c, 0x69, 0x73, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x61, - 0x69, 0x6e, 0x65, 0x72, 0x73, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x28, 0x2e, 0x72, - 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, - 0x4c, 0x69, 0x73, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x73, 0x52, 0x65, - 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x42, 0x0a, 0x5a, 0x08, 0x2e, 0x2f, 0x3b, 0x69, - 0x73, 0x75, 0x6c, 0x61, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, -} - -var ( - file_isula_api_proto_rawDescOnce sync.Once - file_isula_api_proto_rawDescData = file_isula_api_proto_rawDesc -) - -func file_isula_api_proto_rawDescGZIP() []byte { - file_isula_api_proto_rawDescOnce.Do(func() { - file_isula_api_proto_rawDescData = protoimpl.X.CompressGZIP(file_isula_api_proto_rawDescData) - }) - return file_isula_api_proto_rawDescData -} - -var file_isula_api_proto_enumTypes = make([]protoimpl.EnumInfo, 1) -var file_isula_api_proto_msgTypes = make([]protoimpl.MessageInfo, 11) -var file_isula_api_proto_goTypes = []interface{}{ - (ContainerState)(0), // 0: runtime.v1alpha2.ContainerState - (*ImageSpec)(nil), // 1: 
runtime.v1alpha2.ImageSpec - (*ContainerMetadata)(nil), // 2: runtime.v1alpha2.ContainerMetadata - (*ContainerStateValue)(nil), // 3: runtime.v1alpha2.ContainerStateValue - (*ContainerFilter)(nil), // 4: runtime.v1alpha2.ContainerFilter - (*ListContainersRequest)(nil), // 5: runtime.v1alpha2.ListContainersRequest - (*Container)(nil), // 6: runtime.v1alpha2.Container - (*ListContainersResponse)(nil), // 7: runtime.v1alpha2.ListContainersResponse - nil, // 8: runtime.v1alpha2.ImageSpec.AnnotationsEntry - nil, // 9: runtime.v1alpha2.ContainerFilter.LabelSelectorEntry - nil, // 10: runtime.v1alpha2.Container.LabelsEntry - nil, // 11: runtime.v1alpha2.Container.AnnotationsEntry -} -var file_isula_api_proto_depIdxs = []int32{ - 8, // 0: runtime.v1alpha2.ImageSpec.annotations:type_name -> runtime.v1alpha2.ImageSpec.AnnotationsEntry - 0, // 1: runtime.v1alpha2.ContainerStateValue.state:type_name -> runtime.v1alpha2.ContainerState - 3, // 2: runtime.v1alpha2.ContainerFilter.state:type_name -> runtime.v1alpha2.ContainerStateValue - 9, // 3: runtime.v1alpha2.ContainerFilter.label_selector:type_name -> runtime.v1alpha2.ContainerFilter.LabelSelectorEntry - 4, // 4: runtime.v1alpha2.ListContainersRequest.filter:type_name -> runtime.v1alpha2.ContainerFilter - 2, // 5: runtime.v1alpha2.Container.metadata:type_name -> runtime.v1alpha2.ContainerMetadata - 1, // 6: runtime.v1alpha2.Container.image:type_name -> runtime.v1alpha2.ImageSpec - 0, // 7: runtime.v1alpha2.Container.state:type_name -> runtime.v1alpha2.ContainerState - 10, // 8: runtime.v1alpha2.Container.labels:type_name -> runtime.v1alpha2.Container.LabelsEntry - 11, // 9: runtime.v1alpha2.Container.annotations:type_name -> runtime.v1alpha2.Container.AnnotationsEntry - 6, // 10: runtime.v1alpha2.ListContainersResponse.containers:type_name -> runtime.v1alpha2.Container - 5, // 11: runtime.v1alpha2.RuntimeService.ListContainers:input_type -> runtime.v1alpha2.ListContainersRequest - 7, // 12: 
runtime.v1alpha2.RuntimeService.ListContainers:output_type -> runtime.v1alpha2.ListContainersResponse - 12, // [12:13] is the sub-list for method output_type - 11, // [11:12] is the sub-list for method input_type - 11, // [11:11] is the sub-list for extension type_name - 11, // [11:11] is the sub-list for extension extendee - 0, // [0:11] is the sub-list for field type_name -} - -func init() { file_isula_api_proto_init() } -func file_isula_api_proto_init() { - if File_isula_api_proto != nil { - return - } - if !protoimpl.UnsafeEnabled { - file_isula_api_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { - value, ok := v.(*ImageSpec) - if !ok { - return nil - } - - switch v := value; i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_isula_api_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} { - value, ok := v.(*ContainerMetadata) - if !ok { - return nil - } - - switch v := value; i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_isula_api_proto_msgTypes[2].Exporter = func(v interface{}, i int) interface{} { - value, ok := v.(*ContainerStateValue) - if !ok { - return nil - } - - switch v := value; i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_isula_api_proto_msgTypes[3].Exporter = func(v interface{}, i int) interface{} { - value, ok := v.(*ContainerFilter) - if !ok { - return nil - } - - switch v := value; i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_isula_api_proto_msgTypes[4].Exporter = func(v interface{}, i int) interface{} { - value, ok := v.(*ListContainersRequest) - if !ok { - return nil - } - - switch v := value; i { - case 0: - return &v.state - case 1: - return 
&v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_isula_api_proto_msgTypes[5].Exporter = func(v interface{}, i int) interface{} { - value, ok := v.(*Container) - if !ok { - return nil - } - - switch v := value; i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_isula_api_proto_msgTypes[6].Exporter = func(v interface{}, i int) interface{} { - value, ok := v.(*ListContainersResponse) - if !ok { - return nil - } - - switch v := value; i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - } - type x struct{} - out := protoimpl.TypeBuilder{ - File: protoimpl.DescBuilder{ - GoPackagePath: reflect.TypeOf(x{}).PkgPath(), - RawDescriptor: file_isula_api_proto_rawDesc, - NumEnums: 1, - NumMessages: 11, - NumExtensions: 0, - NumServices: 1, - }, - GoTypes: file_isula_api_proto_goTypes, - DependencyIndexes: file_isula_api_proto_depIdxs, - EnumInfos: file_isula_api_proto_enumTypes, - MessageInfos: file_isula_api_proto_msgTypes, - }.Build() - File_isula_api_proto = out.File - file_isula_api_proto_rawDesc = nil - file_isula_api_proto_goTypes = nil - file_isula_api_proto_depIdxs = nil -} diff --git a/mind-cluster/component/npu-exporter/collector/container/isula/isula_api.proto b/mind-cluster/component/npu-exporter/collector/container/isula/isula_api.proto deleted file mode 100644 index 3f1f9f9..0000000 --- a/mind-cluster/component/npu-exporter/collector/container/isula/isula_api.proto +++ /dev/null @@ -1,118 +0,0 @@ -/* -Copyright 2018 The Kubernetes Authors. -Copyright (c) Huawei Technologies Co., Ltd. 2019. All rights reserved. - modify descripe: remove unused options for example: - remove import "github.com/gogo/protobuf/gogoproto/gogo.proto" - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// To regenerate api.pb.go run hack/update-generated-runtime.sh -syntax = 'proto3'; - -package runtime.v1alpha2; -option go_package = "./;isula"; - -// Runtime service defines the public APIs for remote container runtimes -service RuntimeService { - // ListContainers lists all containers by filters. - rpc ListContainers(ListContainersRequest) returns (ListContainersResponse) {} -} - -// ImageSpec is an internal representation of an image. Currently, it wraps the -// value of a Container's Image field (e.g. imageID or imageDigest), but in the -// future it will include more detailed information about the different image types. -message ImageSpec { - string image = 1; - // Unstructured key-value map holding arbitrary metadata. - // ImageSpec Annotations can be used to help the runtime target specific - // images in multi-arch images. - map annotations = 2; -} - -// ContainerMetadata holds all necessary information for building the container -// name. The container runtime is encouraged to expose the metadata in its user -// interface for better user experience. E.g., runtime can construct a unique -// container name based on the metadata. Note that (name, attempt) is unique -// within a sandbox for the entire lifetime of the sandbox. -message ContainerMetadata { - // Name of the container. Same as the container name in the PodSpec. - string name = 1; - // Attempt number of creating the container. Default: 0. 
- uint32 attempt = 2; -} - -enum ContainerState { - CONTAINER_CREATED = 0; - CONTAINER_RUNNING = 1; - CONTAINER_EXITED = 2; - CONTAINER_UNKNOWN = 3; -} - -// ContainerStateValue is the wrapper of ContainerState. -message ContainerStateValue { - // State of the container. - ContainerState state = 1; -} - -// ContainerFilter is used to filter containers. -// All those fields are combined with 'AND' -message ContainerFilter { - // ID of the container. - string id = 1; - // State of the container. - ContainerStateValue state = 2; - // ID of the PodSandbox. - string pod_sandbox_id = 3; - // LabelSelector to select matches. - // Only api.MatchLabels is supported for now and the requirements - // are ANDed. MatchExpressions is not supported yet. - map label_selector = 4; -} - -message ListContainersRequest { - ContainerFilter filter = 1; -} - -// Container provides the runtime information for a container, such as ID, hash, -// state of the container. -message Container { - // ID of the container, used by the container runtime to identify - // a container. - string id = 1; - // ID of the sandbox to which this container belongs. - string pod_sandbox_id = 2; - // Metadata of the container. - ContainerMetadata metadata = 3; - // Spec of the image. - ImageSpec image = 4; - // Reference to the image in use. For most runtimes, this should be an - // image ID. - string image_ref = 5; - // State of the container. - ContainerState state = 6; - // Creation time of the container in nanoseconds. - int64 created_at = 7; - // Key-value pairs that may be used to scope and select individual resources. - map labels = 8; - // Unstructured key-value map holding arbitrary metadata. - // Annotations MUST NOT be altered by the runtime; the value of this field - // MUST be identical to that of the corresponding ContainerConfig used to - // instantiate this Container. - map annotations = 9; -} - -message ListContainersResponse { - // List of containers. 
- repeated Container containers = 1; -} diff --git a/mind-cluster/component/npu-exporter/collector/container/isula/isula_api_grpc.pb.go b/mind-cluster/component/npu-exporter/collector/container/isula/isula_api_grpc.pb.go deleted file mode 100644 index a503e15..0000000 --- a/mind-cluster/component/npu-exporter/collector/container/isula/isula_api_grpc.pb.go +++ /dev/null @@ -1,107 +0,0 @@ -// Code generated by protoc-gen-go-grpc. DO NOT EDIT. -// versions: -// - protoc-gen-go-grpc v1.2.0 -// - protoc v3.13.0 -// source: isula_api.proto - -package isula - -import ( - context "context" - grpc "google.golang.org/grpc" - codes "google.golang.org/grpc/codes" - status "google.golang.org/grpc/status" -) - -// This is a compile-time assertion to ensure that this generated file -// is compatible with the grpc package it is being compiled against. -// Requires gRPC-Go v1.32.0 or later. -const _ = grpc.SupportPackageIsVersion7 - -// RuntimeServiceClient is the client API for RuntimeService service. -// -// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream. -type RuntimeServiceClient interface { - // ListContainers lists all containers by filters. - ListContainers(ctx context.Context, in *ListContainersRequest, opts ...grpc.CallOption) (*ListContainersResponse, error) -} - -type runtimeServiceClient struct { - cc grpc.ClientConnInterface -} - -func NewRuntimeServiceClient(cc grpc.ClientConnInterface) RuntimeServiceClient { - return &runtimeServiceClient{cc} -} - -func (c *runtimeServiceClient) ListContainers(ctx context.Context, in *ListContainersRequest, opts ...grpc.CallOption) (*ListContainersResponse, error) { - out := new(ListContainersResponse) - err := c.cc.Invoke(ctx, "/runtime.v1alpha2.RuntimeService/ListContainers", in, out, opts...) - if err != nil { - return nil, err - } - return out, nil -} - -// RuntimeServiceServer is the server API for RuntimeService service. 
-// All implementations must embed UnimplementedRuntimeServiceServer -// for forward compatibility -type RuntimeServiceServer interface { - // ListContainers lists all containers by filters. - ListContainers(context.Context, *ListContainersRequest) (*ListContainersResponse, error) - mustEmbedUnimplementedRuntimeServiceServer() -} - -// UnimplementedRuntimeServiceServer must be embedded to have forward compatible implementations. -type UnimplementedRuntimeServiceServer struct { -} - -func (UnimplementedRuntimeServiceServer) ListContainers(context.Context, *ListContainersRequest) (*ListContainersResponse, error) { - return nil, status.Errorf(codes.Unimplemented, "method ListContainers not implemented") -} -func (UnimplementedRuntimeServiceServer) mustEmbedUnimplementedRuntimeServiceServer() {} - -// UnsafeRuntimeServiceServer may be embedded to opt out of forward compatibility for this service. -// Use of this interface is not recommended, as added methods to RuntimeServiceServer will -// result in compilation errors. 
-type UnsafeRuntimeServiceServer interface { - mustEmbedUnimplementedRuntimeServiceServer() -} - -func RegisterRuntimeServiceServer(s grpc.ServiceRegistrar, srv RuntimeServiceServer) { - s.RegisterService(&RuntimeService_ServiceDesc, srv) -} - -func _RuntimeService_ListContainers_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { - in := new(ListContainersRequest) - if err := dec(in); err != nil { - return nil, err - } - if interceptor == nil { - return srv.(RuntimeServiceServer).ListContainers(ctx, in) - } - info := &grpc.UnaryServerInfo{ - Server: srv, - FullMethod: "/runtime.v1alpha2.RuntimeService/ListContainers", - } - handler := func(ctx context.Context, req interface{}) (interface{}, error) { - return srv.(RuntimeServiceServer).ListContainers(ctx, req.(*ListContainersRequest)) - } - return interceptor(ctx, in, info, handler) -} - -// RuntimeService_ServiceDesc is the grpc.ServiceDesc for RuntimeService service. -// It's only intended for direct use with grpc.RegisterService, -// and not to be introspected or modified (even as a copy) -var RuntimeService_ServiceDesc = grpc.ServiceDesc{ - ServiceName: "runtime.v1alpha2.RuntimeService", - HandlerType: (*RuntimeServiceServer)(nil), - Methods: []grpc.MethodDesc{ - { - MethodName: "ListContainers", - Handler: _RuntimeService_ListContainers_Handler, - }, - }, - Streams: []grpc.StreamDesc{}, - Metadata: "isula_api.proto", -} diff --git a/mind-cluster/component/npu-exporter/collector/container/isula/isula_container.go b/mind-cluster/component/npu-exporter/collector/container/isula/isula_container.go deleted file mode 100644 index e31fea9..0000000 --- a/mind-cluster/component/npu-exporter/collector/container/isula/isula_container.go +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) Huawei Technologies Co., Ltd. 2021-2024. All rights reserved. 
- * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Package isula for monitoring isula' npu allocation -package isula - -// Config represents env -type Config struct { - Env []string `json:"Env,omitempty" platform:"linux"` -} - -// DeviceInfo represents device info -type DeviceInfo struct { - PathInContainer string `json:"PathInContainer,omitempty" platform:"linux"` -} - -// HostConfig represents host config content -type HostConfig struct { - Devices []DeviceInfo `json:"Devices,omitempty" platform:"linux"` - Privileged bool `json:"Privileged,omitempty" platform:"linux"` -} - -// ContainerJson represents container json content -type ContainerJson struct { - Config *Config `json:"Config,omitempty" platform:"linux"` - HostConfig *HostConfig `json:"HostConfig,omitempty" platform:"linux"` -} diff --git a/mind-cluster/component/npu-exporter/collector/container/isula/isulad.pb.go b/mind-cluster/component/npu-exporter/collector/container/isula/isulad.pb.go deleted file mode 100644 index 5e4f83f..0000000 --- a/mind-cluster/component/npu-exporter/collector/container/isula/isulad.pb.go +++ /dev/null @@ -1,278 +0,0 @@ -// ####################################################################### -// ##- Copyright (c) Huawei Technologies Co., Ltd. 2019. All rights reserved. -// # - iSulad licensed under the Mulan PSL v2. -// # - You can use this software according to the terms and conditions of the Mulan PSL v2. 
-// # - You may obtain a copy of Mulan PSL v2 at: -// # - http://license.coscl.org.cn/MulanPSL2 -// # - THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR -// # - IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR -// # - PURPOSE. -// # - See the Mulan PSL v2 for more details. -// ##- @Description: generate grpc -// ##- @Author: wujing -// ##- @Create: 2019-04-25 -// ####################################################################### - -// Code generated by protoc-gen-go. DO NOT EDIT. -// versions: -// protoc-gen-go v1.28.1 -// protoc v3.13.0 -// source: isulad.proto - -package isula - -import ( - protoreflect "google.golang.org/protobuf/reflect/protoreflect" - protoimpl "google.golang.org/protobuf/runtime/protoimpl" - reflect "reflect" - sync "sync" -) - -const ( - // Verify that this generated code is sufficiently up-to-date. - _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) - // Verify that runtime/protoimpl is sufficiently up-to-date. 
- _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) -) - -type InspectContainerRequest struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"` - Bformat bool `protobuf:"varint,2,opt,name=bformat,proto3" json:"bformat,omitempty"` - Timeout int32 `protobuf:"varint,3,opt,name=timeout,proto3" json:"timeout,omitempty"` -} - -func (x *InspectContainerRequest) Reset() { - *x = InspectContainerRequest{} - if protoimpl.UnsafeEnabled { - mi := &file_isulad_proto_msgTypes[0] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *InspectContainerRequest) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*InspectContainerRequest) ProtoMessage() {} - -func (x *InspectContainerRequest) ProtoReflect() protoreflect.Message { - mi := &file_isulad_proto_msgTypes[0] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use InspectContainerRequest.ProtoReflect.Descriptor instead. 
-func (*InspectContainerRequest) Descriptor() ([]byte, []int) { - return file_isulad_proto_rawDescGZIP(), []int{0} -} - -func (x *InspectContainerRequest) GetId() string { - if x != nil { - return x.Id - } - return "" -} - -func (x *InspectContainerRequest) GetBformat() bool { - if x != nil { - return x.Bformat - } - return false -} - -func (x *InspectContainerRequest) GetTimeout() int32 { - if x != nil { - return x.Timeout - } - return 0 -} - -type InspectContainerResponse struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - ContainerJSON string `protobuf:"bytes,1,opt,name=ContainerJSON,proto3" json:"ContainerJSON,omitempty"` - Cc uint32 `protobuf:"varint,2,opt,name=cc,proto3" json:"cc,omitempty"` - Errmsg string `protobuf:"bytes,3,opt,name=errmsg,proto3" json:"errmsg,omitempty"` -} - -func (x *InspectContainerResponse) Reset() { - *x = InspectContainerResponse{} - if protoimpl.UnsafeEnabled { - mi := &file_isulad_proto_msgTypes[1] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *InspectContainerResponse) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*InspectContainerResponse) ProtoMessage() {} - -func (x *InspectContainerResponse) ProtoReflect() protoreflect.Message { - mi := &file_isulad_proto_msgTypes[1] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use InspectContainerResponse.ProtoReflect.Descriptor instead. 
-func (*InspectContainerResponse) Descriptor() ([]byte, []int) { - return file_isulad_proto_rawDescGZIP(), []int{1} -} - -func (x *InspectContainerResponse) GetContainerJSON() string { - if x != nil { - return x.ContainerJSON - } - return "" -} - -func (x *InspectContainerResponse) GetCc() uint32 { - if x != nil { - return x.Cc - } - return 0 -} - -func (x *InspectContainerResponse) GetErrmsg() string { - if x != nil { - return x.Errmsg - } - return "" -} - -var File_isulad_proto protoreflect.FileDescriptor - -var file_isulad_proto_rawDesc = []byte{ - 0x0a, 0x0c, 0x69, 0x73, 0x75, 0x6c, 0x61, 0x64, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x0a, - 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x73, 0x22, 0x5d, 0x0a, 0x17, 0x49, 0x6e, - 0x73, 0x70, 0x65, 0x63, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x52, 0x65, - 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, - 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x18, 0x0a, 0x07, 0x62, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, - 0x18, 0x02, 0x20, 0x01, 0x28, 0x08, 0x52, 0x07, 0x62, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x12, - 0x18, 0x0a, 0x07, 0x74, 0x69, 0x6d, 0x65, 0x6f, 0x75, 0x74, 0x18, 0x03, 0x20, 0x01, 0x28, 0x05, - 0x52, 0x07, 0x74, 0x69, 0x6d, 0x65, 0x6f, 0x75, 0x74, 0x22, 0x68, 0x0a, 0x18, 0x49, 0x6e, 0x73, - 0x70, 0x65, 0x63, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x52, 0x65, 0x73, - 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x24, 0x0a, 0x0d, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, - 0x65, 0x72, 0x4a, 0x53, 0x4f, 0x4e, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0d, 0x43, 0x6f, - 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x4a, 0x53, 0x4f, 0x4e, 0x12, 0x0e, 0x0a, 0x02, 0x63, - 0x63, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x02, 0x63, 0x63, 0x12, 0x16, 0x0a, 0x06, 0x65, - 0x72, 0x72, 0x6d, 0x73, 0x67, 0x18, 0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x65, 0x72, 0x72, - 0x6d, 0x73, 0x67, 0x32, 0x68, 0x0a, 0x10, 0x43, 0x6f, 0x6e, 0x74, 
0x61, 0x69, 0x6e, 0x65, 0x72, - 0x53, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x12, 0x54, 0x0a, 0x07, 0x49, 0x6e, 0x73, 0x70, 0x65, - 0x63, 0x74, 0x12, 0x23, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x73, 0x2e, - 0x49, 0x6e, 0x73, 0x70, 0x65, 0x63, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, - 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x24, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, - 0x6e, 0x65, 0x72, 0x73, 0x2e, 0x49, 0x6e, 0x73, 0x70, 0x65, 0x63, 0x74, 0x43, 0x6f, 0x6e, 0x74, - 0x61, 0x69, 0x6e, 0x65, 0x72, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x42, 0x0c, 0x48, - 0x02, 0x5a, 0x08, 0x2e, 0x2f, 0x3b, 0x69, 0x73, 0x75, 0x6c, 0x61, 0x62, 0x06, 0x70, 0x72, 0x6f, - 0x74, 0x6f, 0x33, -} - -var ( - file_isulad_proto_rawDescOnce sync.Once - file_isulad_proto_rawDescData = file_isulad_proto_rawDesc -) - -func file_isulad_proto_rawDescGZIP() []byte { - file_isulad_proto_rawDescOnce.Do(func() { - file_isulad_proto_rawDescData = protoimpl.X.CompressGZIP(file_isulad_proto_rawDescData) - }) - return file_isulad_proto_rawDescData -} - -var file_isulad_proto_msgTypes = make([]protoimpl.MessageInfo, 2) -var file_isulad_proto_goTypes = []interface{}{ - (*InspectContainerRequest)(nil), // 0: containers.InspectContainerRequest - (*InspectContainerResponse)(nil), // 1: containers.InspectContainerResponse -} -var file_isulad_proto_depIdxs = []int32{ - 0, // 0: containers.ContainerService.Inspect:input_type -> containers.InspectContainerRequest - 1, // 1: containers.ContainerService.Inspect:output_type -> containers.InspectContainerResponse - 1, // [1:2] is the sub-list for method output_type - 0, // [0:1] is the sub-list for method input_type - 0, // [0:0] is the sub-list for extension type_name - 0, // [0:0] is the sub-list for extension extendee - 0, // [0:0] is the sub-list for field type_name -} - -func init() { file_isulad_proto_init() } -func file_isulad_proto_init() { - if File_isulad_proto != nil { - return - } - if 
!protoimpl.UnsafeEnabled { - file_isulad_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { - value, ok := v.(*InspectContainerRequest) - if !ok { - return nil - } - - switch v := value; i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_isulad_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} { - value, ok := v.(*InspectContainerResponse) - if !ok { - return nil - } - - switch v := value; i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - } - type x struct{} - out := protoimpl.TypeBuilder{ - File: protoimpl.DescBuilder{ - GoPackagePath: reflect.TypeOf(x{}).PkgPath(), - RawDescriptor: file_isulad_proto_rawDesc, - NumEnums: 0, - NumMessages: 2, - NumExtensions: 0, - NumServices: 1, - }, - GoTypes: file_isulad_proto_goTypes, - DependencyIndexes: file_isulad_proto_depIdxs, - MessageInfos: file_isulad_proto_msgTypes, - }.Build() - File_isulad_proto = out.File - file_isulad_proto_rawDesc = nil - file_isulad_proto_goTypes = nil - file_isulad_proto_depIdxs = nil -} diff --git a/mind-cluster/component/npu-exporter/collector/container/isula/isulad.proto b/mind-cluster/component/npu-exporter/collector/container/isula/isulad.proto deleted file mode 100644 index af5f85c..0000000 --- a/mind-cluster/component/npu-exporter/collector/container/isula/isulad.proto +++ /dev/null @@ -1,35 +0,0 @@ -// ####################################################################### -// ##- Copyright (c) Huawei Technologies Co., Ltd. 2019. All rights reserved. -// # - iSulad licensed under the Mulan PSL v2. -// # - You can use this software according to the terms and conditions of the Mulan PSL v2. 
-// # - You may obtain a copy of Mulan PSL v2 at: -// # - http://license.coscl.org.cn/MulanPSL2 -// # - THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR -// # - IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR -// # - PURPOSE. -// # - See the Mulan PSL v2 for more details. -// ##- @Description: generate grpc -// ##- @Author: wujing -// ##- @Create: 2019-04-25 -// ####################################################################### -syntax = "proto3"; -option optimize_for = CODE_SIZE; - -package containers; -option go_package = "./;isula"; - -service ContainerService { - rpc Inspect(InspectContainerRequest) returns (InspectContainerResponse); -} - -message InspectContainerRequest { - string id = 1; - bool bformat = 2; - int32 timeout = 3; -} - -message InspectContainerResponse { - string ContainerJSON = 1; - uint32 cc = 2; - string errmsg = 3; -} \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/collector/container/isula/isulad_grpc.pb.go b/mind-cluster/component/npu-exporter/collector/container/isula/isulad_grpc.pb.go deleted file mode 100644 index c563e0a..0000000 --- a/mind-cluster/component/npu-exporter/collector/container/isula/isulad_grpc.pb.go +++ /dev/null @@ -1,105 +0,0 @@ -// Code generated by protoc-gen-go-grpc. DO NOT EDIT. -// versions: -// - protoc-gen-go-grpc v1.2.0 -// - protoc v3.13.0 -// source: isulad.proto - -package isula - -import ( - context "context" - grpc "google.golang.org/grpc" - codes "google.golang.org/grpc/codes" - status "google.golang.org/grpc/status" -) - -// This is a compile-time assertion to ensure that this generated file -// is compatible with the grpc package it is being compiled against. -// Requires gRPC-Go v1.32.0 or later. -const _ = grpc.SupportPackageIsVersion7 - -// ContainerServiceClient is the client API for ContainerService service. 
-// -// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream. -type ContainerServiceClient interface { - Inspect(ctx context.Context, in *InspectContainerRequest, opts ...grpc.CallOption) (*InspectContainerResponse, error) -} - -type containerServiceClient struct { - cc grpc.ClientConnInterface -} - -func NewContainerServiceClient(cc grpc.ClientConnInterface) ContainerServiceClient { - return &containerServiceClient{cc} -} - -func (c *containerServiceClient) Inspect(ctx context.Context, in *InspectContainerRequest, opts ...grpc.CallOption) (*InspectContainerResponse, error) { - out := new(InspectContainerResponse) - err := c.cc.Invoke(ctx, "/containers.ContainerService/Inspect", in, out, opts...) - if err != nil { - return nil, err - } - return out, nil -} - -// ContainerServiceServer is the server API for ContainerService service. -// All implementations must embed UnimplementedContainerServiceServer -// for forward compatibility -type ContainerServiceServer interface { - Inspect(context.Context, *InspectContainerRequest) (*InspectContainerResponse, error) - mustEmbedUnimplementedContainerServiceServer() -} - -// UnimplementedContainerServiceServer must be embedded to have forward compatible implementations. -type UnimplementedContainerServiceServer struct { -} - -func (UnimplementedContainerServiceServer) Inspect(context.Context, *InspectContainerRequest) (*InspectContainerResponse, error) { - return nil, status.Errorf(codes.Unimplemented, "method Inspect not implemented") -} -func (UnimplementedContainerServiceServer) mustEmbedUnimplementedContainerServiceServer() {} - -// UnsafeContainerServiceServer may be embedded to opt out of forward compatibility for this service. -// Use of this interface is not recommended, as added methods to ContainerServiceServer will -// result in compilation errors. 
-type UnsafeContainerServiceServer interface { - mustEmbedUnimplementedContainerServiceServer() -} - -func RegisterContainerServiceServer(s grpc.ServiceRegistrar, srv ContainerServiceServer) { - s.RegisterService(&ContainerService_ServiceDesc, srv) -} - -func _ContainerService_Inspect_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { - in := new(InspectContainerRequest) - if err := dec(in); err != nil { - return nil, err - } - if interceptor == nil { - return srv.(ContainerServiceServer).Inspect(ctx, in) - } - info := &grpc.UnaryServerInfo{ - Server: srv, - FullMethod: "/containers.ContainerService/Inspect", - } - handler := func(ctx context.Context, req interface{}) (interface{}, error) { - return srv.(ContainerServiceServer).Inspect(ctx, req.(*InspectContainerRequest)) - } - return interceptor(ctx, in, info, handler) -} - -// ContainerService_ServiceDesc is the grpc.ServiceDesc for ContainerService service. -// It's only intended for direct use with grpc.RegisterService, -// and not to be introspected or modified (even as a copy) -var ContainerService_ServiceDesc = grpc.ServiceDesc{ - ServiceName: "containers.ContainerService", - HandlerType: (*ContainerServiceServer)(nil), - Methods: []grpc.MethodDesc{ - { - MethodName: "Inspect", - Handler: _ContainerService_Inspect_Handler, - }, - }, - Streams: []grpc.StreamDesc{}, - Metadata: "isulad.proto", -} diff --git a/mind-cluster/component/npu-exporter/collector/container/parser.go b/mind-cluster/component/npu-exporter/collector/container/parser.go deleted file mode 100644 index 4531374..0000000 --- a/mind-cluster/component/npu-exporter/collector/container/parser.go +++ /dev/null @@ -1,630 +0,0 @@ -/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package container for monitoring containers' npu allocation -package container - -import ( - "bufio" - "context" - "errors" - "fmt" - "math" - "os" - "regexp" - "strconv" - "strings" - "sync" - "time" - - "ascend-common/api" - "ascend-common/common-utils/hwlog" - "ascend-common/common-utils/utils" - "huawei.com/npu-exporter/v6/collector/container/isula" - "huawei.com/npu-exporter/v6/collector/container/v1" - "huawei.com/npu-exporter/v6/utils/logger" -) - -const ( - namespaceMoby = "moby" // Docker - namespaceK8s = "k8s.io" // CRI + Containerd - sliceLen8 = 8 - ascendEnvPart = 2 - charDevice = "c" - - minus = "-" - comma = "," - ascend = "Ascend" - maxEnvLength = 1024 - parsingNpuDefaultTimeoutDuration = 3 -) - -const ( - // EndpointTypeContainerd K8S + Containerd - EndpointTypeContainerd = iota - // EndpointTypeDockerd Docker with or without K8S - EndpointTypeDockerd - // EndpointTypeIsula K8S + isula - EndpointTypeIsula = 2 -) - -var ( - // ErrFromContext error is from the context - ErrFromContext = errors.New("error from context") - - npuMajorID []string - npuMajorFetchCtrl sync.Once - parsingNpuDefaultTimeout = parsingNpuDefaultTimeoutDuration * time.Second -) - -var ( - envErrDescribe = func(ctrID, devID, env string, err error) string { - return fmt.Sprintf("container (%s) has an invalid device ID (%s) in %s, err is %v", ctrID, devID, env, err) - } - minusStyle = func(s string) bool { - return strings.Contains(s, minus) - } - commaMinusStyle = func(s string) bool { - return strings.Contains(s, minus) && strings.Contains(s, comma) - } - ascendStyle = 
func(s string) bool { - return strings.Contains(s, ascend) - } -) - -// CntNpuMonitorOpts contains setting options for monitoring containers -type CntNpuMonitorOpts struct { - EndpointType int // containerd or docker - CriEndpoint string // CRI server address - UseCriBackup bool // whether try to use cri backup address - OciEndpoint string // OCI server, now is containerd address - UseOciBackup bool // whether try to use oci backup address -} - -// MakeDevicesParser evaluates option settings and make an instance according to it -func MakeDevicesParser(opts CntNpuMonitorOpts) *DevicesParser { - runtimeOperator := &RuntimeOperatorTool{ - UseCriBackup: opts.UseCriBackup, - UseOciBackup: opts.UseOciBackup, - CriEndpoint: opts.CriEndpoint, - OciEndpoint: opts.OciEndpoint, - } - parser := &DevicesParser{ - RuntimeOperator: runtimeOperator, - } - - switch opts.EndpointType { - case EndpointTypeContainerd: - runtimeOperator.Namespace = namespaceK8s - case EndpointTypeDockerd: - runtimeOperator.Namespace = namespaceMoby - case EndpointTypeIsula: - runtimeOperator.Namespace = namespaceK8s - default: - logger.Errorf("invalid type value %d", opts.EndpointType) - } - - return parser -} - -// DevicesInfo the container device information struct -type DevicesInfo struct { - // container id - ID string - // container name, the format is: PodNameSpace_PodName_ContainerName - Name string - Devices []int -} - -// DevicesInfos the device information storage map -type DevicesInfos = map[string]DevicesInfo - -// DevicesParser the parser which parse device info -type DevicesParser struct { - // instances - result chan DevicesInfos - err chan error - // configuration - RuntimeOperator RuntimeOperator - Timeout time.Duration -} - -// Init initializes connection to containerd daemon and to CRI server or dockerd daemon based on name fetcher setting -func (dp *DevicesParser) Init() error { - if err := dp.RuntimeOperator.Init(); err != nil { - return contactError(err, "connecting to container 
runtime failed") - } - dp.result = make(chan DevicesInfos, 1) - dp.err = make(chan error, 1) - return nil -} - -// RecvResult exposes the channel used for receiving devices info analyzing result -func (dp *DevicesParser) RecvResult() <-chan DevicesInfos { - return dp.result -} - -// RecvErr exposes the channel used for receiving errors occurred during analyzing -func (dp *DevicesParser) RecvErr() <-chan error { - return dp.err -} - -// Close closes all connections and channels established during initializing -func (dp *DevicesParser) Close() { - _ = dp.RuntimeOperator.Close() -} - -func (dp *DevicesParser) parseDevices(ctx context.Context, c *CommonContainer, rs chan<- DevicesInfo) error { - if dp.RuntimeOperator.GetContainerType() == IsulaContainer { - return dp.parseDeviceInIsula(ctx, c, rs) - } - - return dp.parseDevicesInContainerd(ctx, c, rs) -} - -func (dp *DevicesParser) parseDevicesInContainerd(ctx context.Context, c *CommonContainer, - rs chan<- DevicesInfo) error { - if rs == nil { - return errors.New("empty result channel") - } - deviceInfo := DevicesInfo{} - defer func(di *DevicesInfo) { - rs <- *di - }(&deviceInfo) - - spec, err := dp.RuntimeOperator.GetContainerInfoByID(ctx, c.Id) - if err != nil { - return contactError(err, fmt.Sprintf("cannot get container devices by container id (%s)", c.Id)) - } - if spec.Linux == nil || spec.Linux.Resources == nil || len(spec.Linux.Resources.Devices) > maxDevicesNum { - return contactError(errors.New("device error"), - fmt.Sprintf("devices in container is too much (%v) or empty", maxDevicesNum)) - } - if spec.Process == nil || len(spec.Process.Env) > maxEnvNum { - return contactError(errors.New("env error"), fmt.Sprintf("env in container is too much (%v) or empty", - maxEnvNum)) - } - - envs := spec.Process.Env - for i := len(envs) - 1; i >= 0; i-- { - e := envs[i] - if strings.Contains(e, api.AscendDeviceInfo) { - deviceInfo, err = dp.getDevicesWithAscendRuntime(e, c) - return err - } - } - - deviceInfo, err = 
dp.getDevicesWithoutAscendRuntime(spec, c) - return err -} - -func (dp *DevicesParser) getDevicesWithoutAscendRuntime(spec v1.Spec, c *CommonContainer) (DevicesInfo, error) { - deviceInfo := DevicesInfo{} - devicesIDs, err := filterNPUDevices(spec) - if err != nil { - logger.Debugf("filter npu devices failed by container id (%s), err is %v", c.Id, err) - return DevicesInfo{}, nil - } - logger.Debugf("filter npu devices %v in container (%s)", devicesIDs, c.Id) - - if len(devicesIDs) != 0 { - if deviceInfo, err = makeUpDeviceInfo(c); err == nil { - deviceInfo.Devices = devicesIDs - return deviceInfo, nil - } else { - logger.Errorf("makeUpDeviceInfo failed: %s", err) - } - return DevicesInfo{}, err - } - - return DevicesInfo{}, nil -} - -func (dp *DevicesParser) getDevicesWithAscendRuntime(ascendDevEnv string, c *CommonContainer) (DevicesInfo, error) { - logger.Debugf("get device info by env (%s) in %s", ascendDevEnv, c.Id) - devInfo := strings.Split(ascendDevEnv, "=") - if len(devInfo) != ascendEnvPart { - return DevicesInfo{}, fmt.Errorf("an invalid %s env(%s)", api.AscendDeviceInfo, ascendDevEnv) - } - devicesIDs := dp.parseDiffEnvFmt(devInfo[1], c.Id) - if len(devicesIDs) == 0 { - return DevicesInfo{}, nil - } - - deviceInfo, err := makeUpDeviceInfo(c) - if err != nil { - hwlog.RunLog.Error(err) - return DevicesInfo{}, err - } - deviceInfo.Devices = devicesIDs - return deviceInfo, nil -} - -func (dp *DevicesParser) parseDiffEnvFmt(devices, containerID string) []int { - if len(devices) > maxEnvLength { - return []int{} - } - if ascendStyle(devices) { - return dp.getDeviceIDsByAscendStyle(devices, containerID) - } - if commaMinusStyle(devices) { - return dp.getDeviceIDsByCommaMinusStyle(devices, containerID) - } - if minusStyle(devices) { - return dp.getDeviceIDsByMinusStyle(devices, containerID) - } - return dp.getDeviceIDsByCommaStyle(devices, containerID) -} - -func (dp *DevicesParser) getDeviceIDsByCommaStyle(devices, containerID string) []int { - devList := 
strings.Split(devices, comma) - devicesIDs := make([]int, 0, len(devList)) - for _, devID := range devList { - id, err := strconv.Atoi(devID) - if err != nil { - logger.Errorf("container (%s) has an invalid device ID (%v) in %s, error is %s", containerID, - devID, api.AscendDeviceInfo, err) - continue - } - devicesIDs = append(devicesIDs, id) - } - return devicesIDs -} - -func (dp *DevicesParser) getDeviceIDsByAscendStyle(devices, containerID string) []int { - devList := strings.Split(devices, comma) - deviceIDs := make([]int, 0, len(devList)) - for _, subDevice := range devList { - deviceName := strings.Split(subDevice, minus) - if len(deviceName) != ascendEnvPart { - logger.Errorf(envErrDescribe(containerID, "", api.AscendDeviceInfo, nil)) - continue - } - id, err := strconv.Atoi(deviceName[1]) - if err != nil { - logger.Errorf(envErrDescribe(containerID, deviceName[1], api.AscendDeviceInfo, err)) - continue - } - deviceIDs = append(deviceIDs, id) - } - return deviceIDs -} - -func (dp *DevicesParser) getDeviceIDsByMinusStyle(devices, containerID string) []int { - deviceIDs := make([]int, 0) - devIDRange := strings.Split(devices, minus) - if len(devIDRange) != ascendEnvPart { - logger.Errorf(envErrDescribe(containerID, "range", api.AscendDeviceInfo, nil)) - return deviceIDs - } - minDevID, err := strconv.Atoi(devIDRange[0]) - if err != nil { - logger.Errorf(envErrDescribe(containerID, devIDRange[0], api.AscendDeviceInfo, err)) - return deviceIDs - } - maxDevID, err := strconv.Atoi(devIDRange[1]) - if err != nil { - logger.Errorf(envErrDescribe(containerID, devIDRange[1], api.AscendDeviceInfo, err)) - return deviceIDs - } - if minDevID > maxDevID { - logger.Errorf(envErrDescribe(containerID, "", - api.AscendDeviceInfo, errors.New("min id bigger than max id"))) - return deviceIDs - } - if maxDevID > math.MaxInt16 { - logger.Errorf(envErrDescribe(containerID, "", api.AscendDeviceInfo, errors.New("max id invalid"))) - return deviceIDs - } - for deviceID := minDevID; 
deviceID <= maxDevID; deviceID++ { - deviceIDs = append(deviceIDs, deviceID) - } - return deviceIDs -} - -func (dp *DevicesParser) getDeviceIDsByCommaMinusStyle(devices, containerID string) []int { - var deviceIDs []int - devList := strings.Split(devices, comma) - for _, subDevices := range devList { - if minusStyle(subDevices) { - deviceIDs = append(deviceIDs, dp.getDeviceIDsByMinusStyle(subDevices, containerID)...) - continue - } - deviceIDs = append(deviceIDs, dp.getDeviceIDsByCommaStyle(subDevices, containerID)...) - } - return deviceIDs -} - -func (dp *DevicesParser) getDevWithoutAscendRuntimeInIsula(containerInfo isula.ContainerJson, - c *CommonContainer) (DevicesInfo, error) { - deviceInfo := DevicesInfo{} - devicesIDs, err := filterNPUDevicesInIsula(containerInfo) - if err != nil { - logger.Debugf("filter npu devices failed by container id (%s), err is %v", c.Id, err) - return DevicesInfo{}, nil - } - logger.Debugf("filter npu devices %v in container (%s)", devicesIDs, c.Id) - - if len(devicesIDs) == 0 { - return DevicesInfo{}, nil - } - - deviceInfo, err = makeUpDeviceInfo(c) - if err != nil { - hwlog.RunLog.Error(err) - return DevicesInfo{}, err - } - deviceInfo.Devices = devicesIDs - return deviceInfo, nil -} - -func (dp *DevicesParser) parseDeviceInIsula(ctx context.Context, c *CommonContainer, rs chan<- DevicesInfo) error { - if rs == nil { - return errors.New("empty result channel") - } - - deviceInfo := DevicesInfo{} - defer func(di *DevicesInfo) { - rs <- *di - }(&deviceInfo) - - if len(c.Id) > maxCgroupPath { - return fmt.Errorf("the containerId (%s) is too long", c.Id) - } - containerInfo, err := dp.RuntimeOperator.GetIsulaContainerInfoByID(ctx, c.Id) - if err != nil { - return contactError(err, fmt.Sprintf("getting config of container(%s) fail", c.Id)) - } - if containerInfo.HostConfig == nil || containerInfo.Config == nil { - return errors.New("empty container info") - } - - envs := containerInfo.Config.Env - for i := len(envs) - 1; i >= 0; i-- 
{ - e := envs[i] - if strings.Contains(e, api.AscendDeviceInfo) { - deviceInfo, err = dp.getDevicesWithAscendRuntime(e, c) - return err - } - } - - deviceInfo, err = dp.getDevWithoutAscendRuntimeInIsula(containerInfo, c) - return err -} - -func (dp *DevicesParser) collect(ctx context.Context, r <-chan DevicesInfo, ct int32) (DevicesInfos, error) { - if r == nil { - return nil, errors.New("receiving channel is empty") - } - if ct < 0 { - return nil, nil - } - - results := make(map[string]DevicesInfo, ct) - for { - select { - case info, ok := <-r: - if !ok { - return nil, nil - } - if info.ID != "" { - results[info.ID] = info - } - if ct -= 1; ct <= 0 { - return results, nil - } - case <-ctx.Done(): - hwlog.RunLog.Error("ctx is timeout") - dp.err <- ErrFromContext - return nil, nil - } - } -} - -func (dp *DevicesParser) doParse(resultOut chan<- DevicesInfos) { - var result DevicesInfos = nil - defer func(rslt DevicesInfos) { - if resultOut != nil { - resultOut <- rslt - close(resultOut) - } - }(result) - - ctx := context.Background() - containers, err := dp.RuntimeOperator.GetContainers(ctx) - if err != nil { - dp.err <- err - return - } - - l := len(containers) - if l == 0 || l > maxContainers { - logger.Debugf("get %d containers from cri interface, return empty data", l) - dp.result <- make(DevicesInfos) - return - } - - r := make(chan DevicesInfo) - defer close(r) - wg := sync.WaitGroup{} - wg.Add(l) - - for _, container := range containers { - go func(container *CommonContainer, c context.Context) { - if err := dp.parseDevices(c, container, r); err != nil { - dp.err <- err - } - wg.Done() - }(container, ctx) - } - ctx, cancelFn := context.WithTimeout(ctx, withDefault(dp.Timeout, parsingNpuDefaultTimeout)) - defer cancelFn() - result, err = dp.collect(ctx, r, int32(l)) - if err != nil { - logger.Errorf("collect info error: %v", err) - } - - if result != nil { - dp.result <- result - } - wg.Wait() -} - -// FetchAndParse triggers the asynchronous process of querying 
and analyzing all containers -// resultOut channel is for fetching the current result -func (dp *DevicesParser) FetchAndParse(resultOut chan<- DevicesInfos) { - if dp.err == nil { - logger.Debug("device paster is not initialized") - return - } - go dp.doParse(resultOut) -} - -func withDefault(v time.Duration, d time.Duration) time.Duration { - if v == 0 { - return d - } - - return v -} - -// query the MajorID of NPU devices -func getNPUMajorID() ([]string, error) { - const ( - deviceCount = 2 - maxSearchLine = 512 - ) - - path, err := utils.CheckPath("/proc/devices") - if err != nil { - return nil, err - } - majorID := make([]string, 0, deviceCount) - f, err := os.Open(path) - if err != nil { - return majorID, err - } - defer func() { - err = f.Close() - if err != nil { - hwlog.RunLog.Error(err) - } - }() - s := bufio.NewScanner(f) - count := 0 - for s.Scan() { - // prevent from searching too many lines - if count > maxSearchLine { - break - } - count++ - text := s.Text() - matched, err := regexp.MatchString("^[0-9]{1,3}\\s[v]?devdrv-cdev$", text) - if err != nil { - return majorID, err - } - if !matched { - continue - } - fields := strings.Fields(text) - majorID = append(majorID, fields[0]) - } - return majorID, nil -} - -func npuMajor() []string { - npuMajorFetchCtrl.Do(func() { - var err error - npuMajorID, err = getNPUMajorID() - if err != nil { - return - } - }) - return npuMajorID -} - -func contains(slice []string, target string) bool { - for _, v := range slice { - if v == target { - return true - } - } - return false -} - -func contactError(err error, msg string) error { - return fmt.Errorf("%s->%s", err.Error(), msg) -} - -func filterNPUDevices(spec v1.Spec) ([]int, error) { - if spec.Linux == nil || spec.Linux.Resources == nil { - return nil, errors.New("empty spec info") - } - - const base = 10 - devIDs := make([]int, 0, sliceLen8) - majorIDs := npuMajor() - for _, dev := range spec.Linux.Resources.Devices { - if dev.Minor == nil || dev.Major == nil { - 
// do not monitor privileged container - continue - } - if *dev.Minor > math.MaxInt32 { - return nil, fmt.Errorf("get wrong device ID (%v)", dev.Minor) - } - major := strconv.FormatInt(*dev.Major, base) - if dev.Type == charDevice && contains(majorIDs, major) { - devIDs = append(devIDs, int(*dev.Minor)) - } - } - - return devIDs, nil -} - -// filterNPUDevicesInIsula get id of device from containerJson(containerInfo) -func filterNPUDevicesInIsula(containerInfo isula.ContainerJson) ([]int, error) { - privileged := containerInfo.HostConfig.Privileged - if privileged { - return nil, errors.New("it's a privileged container and skip it") - } - - devIDs := make([]int, 0, sliceLen8) - devices := containerInfo.HostConfig.Devices - for _, dev := range devices { - Id, err := getDevIdFromPath(api.DevicePathPattern, dev.PathInContainer) - if err != nil { - logger.Warn(err) - continue - } - devIDs = append(devIDs, Id) - } - - return devIDs, nil -} - -func getDevIdFromPath(pattern, path string) (int, error) { - if match, err := regexp.MatchString(pattern, path); err != nil || !match { - return -1, fmt.Errorf("unexpected path of device: %s or match error: %v", path, err) - } - number := regexp.MustCompile(`\d+`) - IdStr := number.FindString(path) - Id, err := strconv.Atoi(IdStr) - if err != nil { - return -1, fmt.Errorf("unexpected device ID (%v)", IdStr) - } - if Id > math.MaxInt32 { - return -1, fmt.Errorf("get wrong device ID (%v)", Id) - } - return Id, nil -} diff --git a/mind-cluster/component/npu-exporter/collector/container/parser_test.go b/mind-cluster/component/npu-exporter/collector/container/parser_test.go deleted file mode 100644 index f2975b9..0000000 --- a/mind-cluster/component/npu-exporter/collector/container/parser_test.go +++ /dev/null @@ -1,1027 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package container provides utilities for container monitoring and testing. -package container - -import ( - "context" - "errors" - "os" - "testing" - "time" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" - - "ascend-common/common-utils/utils" - "huawei.com/npu-exporter/v6/collector/container/isula" - "huawei.com/npu-exporter/v6/collector/container/v1" - "huawei.com/npu-exporter/v6/utils/logger" -) - -const ( - // Test endpoint constants - testContainerdEndpoint = "unix:///run/containerd.sock" - testDockerEndpoint = "unix:///run/docker.sock" - - device0 = 0 - device1 = 1 - device2 = 2 - device3 = 3 - testDeviceRange = "0-2" - testDeviceComma = "0,1,2" - testDeviceCommaRange = "0-1,2-3" - testAscendDevice0 = "Ascend-0" - testAscendDevices = "Ascend-0,Ascend-1" - testMixedDevices = "0-1,3" - - // Test error constants - testOriginalError = "original error" - testErrorMessage = "test message" - testContactedError = "original error->test message" - - // Test path constants - testDevicePattern = "/dev/npu([0-9]+)" - - // Test duration constants - testZeroDuration = 0 -) - -func TestMakeDevicesParser(t *testing.T) { - testCases := []struct { - name string - opts CntNpuMonitorOpts - expected *DevicesParser - }{ - {name: "should create parser when options are valid for containerd", - opts: CntNpuMonitorOpts{CriEndpoint: testContainerdEndpoint, EndpointType: EndpointTypeContainerd, - OciEndpoint: testContainerdEndpoint, UseOciBackup: false, UseCriBackup: false}, - expected: &DevicesParser{RuntimeOperator: 
&RuntimeOperatorTool{UseOciBackup: false, UseCriBackup: false, - CriEndpoint: testContainerdEndpoint, OciEndpoint: testContainerdEndpoint}, Timeout: testZeroDuration}}, - {name: "should create parser when options are valid for docker", - opts: CntNpuMonitorOpts{CriEndpoint: testDockerEndpoint, EndpointType: EndpointTypeDockerd, - OciEndpoint: testDockerEndpoint, UseOciBackup: true, UseCriBackup: false}, - expected: &DevicesParser{RuntimeOperator: &RuntimeOperatorTool{UseOciBackup: true, UseCriBackup: true, - CriEndpoint: testDockerEndpoint, OciEndpoint: testDockerEndpoint}, Timeout: testZeroDuration}}, - {name: "should create parser when options are valid for isula", - opts: CntNpuMonitorOpts{CriEndpoint: testContainerdEndpoint, EndpointType: EndpointTypeIsula, - OciEndpoint: testContainerdEndpoint, UseOciBackup: true, UseCriBackup: true}, - expected: &DevicesParser{RuntimeOperator: &RuntimeOperatorTool{UseOciBackup: true, UseCriBackup: true, - CriEndpoint: testContainerdEndpoint, OciEndpoint: testContainerdEndpoint}, Timeout: testZeroDuration}}, - } - - for _, tc := range testCases { - convey.Convey(tc.name, t, func() { - result := MakeDevicesParser(tc.opts) - convey.So(result, convey.ShouldNotBeNil) - convey.So(result.RuntimeOperator, convey.ShouldNotBeNil) - convey.So(result.Timeout, convey.ShouldEqual, tc.expected.Timeout) - }) - } -} - -func TestDevicesParserInit(t *testing.T) { - convey.Convey("TestDevicesParserInit", t, func() { - convey.Convey("should initialize successfully when runtime operator init succeeds", func() { - dp := &DevicesParser{ - RuntimeOperator: &RuntimeOperatorTool{}, - } - - patches := gomonkey.ApplyMethodReturn(dp.RuntimeOperator, "Init", nil) - defer patches.Reset() - - err := dp.Init() - convey.So(err, convey.ShouldBeNil) - }) - - convey.Convey("should return error when initialization fails", func() { - dp := &DevicesParser{ - RuntimeOperator: &RuntimeOperatorTool{}, - } - patches := gomonkey.ApplyMethodReturn(dp.RuntimeOperator, 
"Init", errors.New("init failed")) - defer patches.Reset() - err := dp.Init() - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldContainSubstring, "init failed") - }) - }) -} - -func TestDevicesParserRecvResult(t *testing.T) { - convey.Convey("TestDevicesParserRecvResult", t, func() { - convey.Convey("should return result channel when initialized", func() { - dp := &DevicesParser{ - result: make(chan DevicesInfos, 1), - } - resultChan := dp.RecvResult() - convey.So(resultChan, convey.ShouldNotBeNil) - }) - }) -} - -func TestDevicesParserRecvErr(t *testing.T) { - convey.Convey("TestDevicesParserRecvErr", t, func() { - convey.Convey("should return error channel when initialized", func() { - dp := &DevicesParser{ - err: make(chan error, 1), - } - errChan := dp.RecvErr() - convey.So(errChan, convey.ShouldNotBeNil) - }) - }) -} - -func TestDevicesParserClose(t *testing.T) { - convey.Convey("TestDevicesParserClose", t, func() { - convey.Convey("should close runtime operator when called", func() { - mockOperator := &RuntimeOperatorTool{} - dp := &DevicesParser{ - RuntimeOperator: mockOperator, - } - - visited := false - patches := gomonkey.ApplyMethod(mockOperator, "Close", func(*RuntimeOperatorTool) error { - visited = true - return nil - }) - defer patches.Reset() - - dp.Close() - convey.So(visited, convey.ShouldBeTrue) - }) - }) -} - -func TestDevicesParserParseDevices(t *testing.T) { - convey.Convey("TestDevicesParserParseDevices", t, func() { - convey.Convey("should parse isula devices when container type is isula", func() { - dp := &DevicesParser{} - mockOperator := &RuntimeOperatorTool{} - dp.RuntimeOperator = mockOperator - - patches := gomonkey.ApplyMethodReturn(mockOperator, "GetContainerType", IsulaContainer). 
- ApplyFuncReturn((*DevicesParser).parseDeviceInIsula, nil) - defer patches.Reset() - - ctx := context.Background() - container := &CommonContainer{Id: "test-container"} - resultChan := make(chan DevicesInfo, 1) - err := dp.parseDevices(ctx, container, resultChan) - convey.So(err, convey.ShouldBeNil) - }) - - convey.Convey("should parse containerd devices when container type is not isula", func() { - dp := &DevicesParser{} - mockOperator := &RuntimeOperatorTool{} - dp.RuntimeOperator = mockOperator - - patches := gomonkey.ApplyMethodReturn(mockOperator, "GetContainerType", DefaultContainer). - ApplyFuncReturn((*DevicesParser).parseDevicesInContainerd, nil) - defer patches.Reset() - - ctx := context.Background() - container := &CommonContainer{Id: "test-container"} - resultChan := make(chan DevicesInfo, 1) - err := dp.parseDevices(ctx, container, resultChan) - convey.So(err, convey.ShouldBeNil) - }) - }) -} - -func TestDevicesParserParseDevicesInContainerd(t *testing.T) { - convey.Convey("TestDevicesParserParseDevicesInContainerd", t, func() { - convey.Convey("should return error when result channel is nil", func() { - dp := &DevicesParser{} - ctx := context.Background() - container := &CommonContainer{Id: "test-container"} - - err := dp.parseDevicesInContainerd(ctx, container, nil) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldContainSubstring, "empty result channel") - }) - - convey.Convey("should return error when get container info fails", func() { - dp := &DevicesParser{} - mockOperator := &RuntimeOperatorTool{} - dp.RuntimeOperator = mockOperator - - patches := gomonkey.ApplyMethod(mockOperator, "GetContainerInfoByID", - func(*RuntimeOperatorTool, context.Context, string) (v1.Spec, error) { - return v1.Spec{}, errors.New("get container info failed") - }) - defer patches.Reset() - - ctx := context.Background() - container := &CommonContainer{Id: "test-container"} - resultChan := make(chan DevicesInfo, 1) - - err := 
dp.parseDevicesInContainerd(ctx, container, resultChan) - convey.So(err, convey.ShouldNotBeNil) - }) - }) -} - -func TestDevicesParserGetDevicesWithoutAscendRuntime(t *testing.T) { - convey.Convey("TestDevicesParserGetDevicesWithoutAscendRuntime", t, func() { - convey.Convey("should return devices when filter succeeds", func() { - dp := &DevicesParser{} - - patches := gomonkey.ApplyFuncReturn(filterNPUDevices, []int{device0, device1, device2}, nil) - defer patches.Reset() - - patches.ApplyFuncReturn(makeUpDeviceInfo, DevicesInfo{ID: "test", Name: "test-name"}, nil) - - spec := v1.Spec{} - container := &CommonContainer{Id: "test-container"} - - result, err := dp.getDevicesWithoutAscendRuntime(spec, container) - convey.So(err, convey.ShouldBeNil) - convey.So(result.Devices, convey.ShouldResemble, []int{device0, device1, device2}) - }) - - convey.Convey("should return empty when filter fails", func() { - dp := &DevicesParser{} - - patches := gomonkey.ApplyFuncReturn(filterNPUDevices, nil, errors.New("filter failed")) - defer patches.Reset() - - spec := v1.Spec{} - container := &CommonContainer{Id: "test-container"} - - result, err := dp.getDevicesWithoutAscendRuntime(spec, container) - convey.So(err, convey.ShouldBeNil) - convey.So(result, convey.ShouldResemble, DevicesInfo{}) - }) - }) -} - -func TestDevicesParserGetDevicesWithAscendRuntime(t *testing.T) { - convey.Convey("TestDevicesParserGetDevicesWithAscendRuntime", t, func() { - convey.Convey("should return error when env format is invalid", func() { - dp := &DevicesParser{} - ascendDevEnv := "invalid-env" - container := &CommonContainer{Id: "test-container"} - - result, err := dp.getDevicesWithAscendRuntime(ascendDevEnv, container) - convey.So(err, convey.ShouldNotBeNil) - convey.So(result, convey.ShouldResemble, DevicesInfo{}) - }) - - convey.Convey("should return devices when env format is valid", func() { - dp := &DevicesParser{} - ascendDevEnv := "ASCEND_VISIBLE_DEVICES=0,1,2" - container := 
&CommonContainer{Id: "test-container"} - - patches := gomonkey.ApplyFunc(makeUpDeviceInfo, func(*CommonContainer) (DevicesInfo, error) { - return DevicesInfo{ID: "test", Name: "test-name"}, nil - }) - defer patches.Reset() - - result, err := dp.getDevicesWithAscendRuntime(ascendDevEnv, container) - convey.So(err, convey.ShouldBeNil) - convey.So(result.Devices, convey.ShouldResemble, []int{device0, device1, device2}) - }) - }) -} - -func TestDevicesParserGetDevWithoutAscendRuntimeInIsula(t *testing.T) { - convey.Convey("TestDevicesParserGetDevWithoutAscendRuntimeInIsula", t, func() { - convey.Convey("should return devices when filter succeeds", func() { - dp := &DevicesParser{} - containerInfo := isula.ContainerJson{} - container := &CommonContainer{Id: "test-container"} - - patches := gomonkey.ApplyFuncReturn(filterNPUDevicesInIsula, []int{device0, device1, device2}, nil) - defer patches.Reset() - - patches.ApplyFuncReturn(makeUpDeviceInfo, DevicesInfo{ID: "test", Name: "test-name"}, nil) - - result, err := dp.getDevWithoutAscendRuntimeInIsula(containerInfo, container) - convey.So(err, convey.ShouldBeNil) - convey.So(result.Devices, convey.ShouldResemble, []int{device0, device1, device2}) - }) - - convey.Convey("should return empty when filter fails", func() { - dp := &DevicesParser{} - containerInfo := isula.ContainerJson{} - container := &CommonContainer{Id: "test-container"} - - patches := gomonkey.ApplyFuncReturn(filterNPUDevicesInIsula, nil, errors.New("filter failed")) - defer patches.Reset() - - result, err := dp.getDevWithoutAscendRuntimeInIsula(containerInfo, container) - convey.So(err, convey.ShouldBeNil) - convey.So(result, convey.ShouldResemble, DevicesInfo{}) - }) - }) -} - -func TestDevicesParserParseDeviceInIsula(t *testing.T) { - convey.Convey("TestDevicesParserParseDeviceInIsula", t, func() { - convey.Convey("should return error when result channel is nil", func() { - dp := &DevicesParser{} - ctx := context.Background() - container := 
&CommonContainer{Id: "test-container"} - - err := dp.parseDeviceInIsula(ctx, container, nil) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldContainSubstring, "empty result channel") - }) - - convey.Convey("should return error when container id is too long", func() { - dp := &DevicesParser{} - ctx := context.Background() - longId := string(make([]byte, maxCgroupPath+1)) - container := &CommonContainer{Id: longId} - resultChan := make(chan DevicesInfo, 1) - - err := dp.parseDeviceInIsula(ctx, container, resultChan) - convey.So(err, convey.ShouldNotBeNil) - }) - }) -} - -func TestDevicesParserCollect(t *testing.T) { - convey.Convey("TestDevicesParserCollect", t, func() { - convey.Convey("should return error when receiving channel is nil", func() { - dp := &DevicesParser{} - ctx := context.Background() - - result, err := dp.collect(ctx, nil, 1) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldContainSubstring, "receiving channel is empty") - convey.So(result, convey.ShouldBeNil) - }) - - convey.Convey("should return nil when count is negative", func() { - dp := &DevicesParser{} - ctx := context.Background() - resultChan := make(chan DevicesInfo) - - result, err := dp.collect(ctx, resultChan, -1) - convey.So(err, convey.ShouldBeNil) - convey.So(result, convey.ShouldBeNil) - }) - }) -} - -func TestDevicesParserDoParse(t *testing.T) { - convey.Convey("TestDevicesParserDoParse", t, func() { - const time100ms = 100 * time.Millisecond - convey.Convey("should handle error when get containers fails", func() { - dp := &DevicesParser{ - err: make(chan error, 1), - } - mockOperator := &RuntimeOperatorTool{} - dp.RuntimeOperator = mockOperator - - patches := gomonkey.ApplyMethod(mockOperator, "GetContainers", - func(*RuntimeOperatorTool, context.Context) ([]*CommonContainer, error) { - return nil, errors.New("get containers failed") - }) - defer patches.Reset() - - resultChan := make(chan DevicesInfos, 1) - 
dp.doParse(resultChan) - - select { - case err := <-dp.err: - convey.So(err, convey.ShouldNotBeNil) - case <-time.After(time100ms): - convey.So("timeout", convey.ShouldEqual, "should receive error") - } - }) - }) -} - -func TestDevicesParserFetchAndParse(t *testing.T) { - const time10ms = 10 * time.Millisecond - convey.Convey("TestDevicesParserFetchAndParse", t, func() { - convey.Convey("should return early when err channel is nil", func() { - dp := &DevicesParser{ - err: nil, - } - visited := make(chan bool, 1) - patches := gomonkey.ApplyPrivateMethod(dp, "doParse", - func(*DevicesParser, chan<- DevicesInfos) error { - visited <- true - return nil - }) - defer patches.Reset() - - dp.FetchAndParse(nil) - time.Sleep(time10ms) - convey.So(len(visited), convey.ShouldEqual, 0) - }) - - convey.Convey("should start parsing when initialized", func() { - dp := &DevicesParser{ - err: make(chan error, 1), - RuntimeOperator: &RuntimeOperatorTool{}, - } - visited := make(chan bool, 1) - patches := gomonkey.ApplyPrivateMethod(dp, "doParse", - func(*DevicesParser, chan<- DevicesInfos) error { - visited <- true - return nil - }) - defer patches.Reset() - - dp.FetchAndParse(nil) - time.Sleep(time10ms) - convey.So(len(visited), convey.ShouldEqual, 1) - }) - }) -} - -func TestDevicesParserGetDeviceIDsByMinusStyle(t *testing.T) { - convey.Convey("TestDevicesParserGetDeviceIDsByMinusStyle", t, func() { - testCases := []struct { - name string - devices string - expected []int - }{ - {name: "should return empty slice when devices string is invalid", devices: "invalid-devices", expected: []int{}}, - {name: "should return empty slice when min device ID is invalid", devices: "invalid-5", expected: []int{}}, - {name: "should return empty slice when max device ID is invalid", devices: "0-invalid", expected: []int{}}, - {name: "should return empty slice when min ID is bigger than max ID", devices: "5-3", expected: []int{}}, - {name: "should return empty slice when max ID is too large", 
devices: "0-99999", expected: []int{}}, - {name: "should return device IDs when range is valid", devices: "0-2", expected: []int{0, 1, 2}}, - {name: "should return single device ID when min equals max", devices: "1-1", expected: []int{1}}, - } - for _, tc := range testCases { - convey.Convey(tc.name, func() { - dp := &DevicesParser{} - result := dp.getDeviceIDsByMinusStyle(tc.devices, "test-container") - convey.So(result, convey.ShouldResemble, tc.expected) - }) - } - }) -} - -func TestGetNPUMajorID(t *testing.T) { - testCases := builderTestGetNPUMajorIDCases() - for _, tc := range testCases { - convey.Convey(tc.name, t, func() { - _, cleanup := tc.setup(t) - defer cleanup() - result, err := getNPUMajorID() - if tc.hasError { - convey.So(err, convey.ShouldNotBeNil) - } else { - convey.So(err, convey.ShouldBeNil) - } - convey.So(result, convey.ShouldResemble, tc.expected) - }) - } -} - -type TestGetNPUMajorIDCase struct { - name string - setup func(*testing.T) (*gomonkey.Patches, func()) - expected []string - hasError bool -} - -func builderTestGetNPUMajorIDCases() []TestGetNPUMajorIDCase { - testCases := []TestGetNPUMajorIDCase{{name: "should return error when path check fails", - setup: func(*testing.T) (*gomonkey.Patches, func()) { - patches := gomonkey.ApplyFuncReturn(utils.CheckPath, "", errors.New("path check failed")) - return patches, func() { patches.Reset() } - }, expected: nil, hasError: true}, - {name: "should return error when file open fails", - setup: func(*testing.T) (*gomonkey.Patches, func()) { - p1 := gomonkey.ApplyFuncReturn(utils.CheckPath, "/proc/devices", nil) - p1.ApplyFuncReturn(os.Open, nil, errors.New("file open failed")) - return p1, func() { p1.Reset() } - }, expected: []string{}, hasError: true}, - {name: "should return empty slice when no NPU devices found", - setup: func(t *testing.T) (*gomonkey.Patches, func()) { - tmpFile, clean, err := mkTemp("1 mem\n2 pty\n") - if err != nil { - t.Fatalf("failed to create temp file: %v", err) - } 
- p1 := gomonkey.ApplyFuncReturn(utils.CheckPath, tmpFile, nil) - return p1, func() { clean(); p1.Reset() } - }, expected: []string{}, hasError: false}, - {name: "should return major IDs when NPU devices found", - setup: func(t *testing.T) (*gomonkey.Patches, func()) { - tmpFile, clean, err := mkTemp("195 devdrv-cdev\n196 devdrv-cdev\n") - if err != nil { - t.Fatalf("failed to create temp file: %v", err) - } - p1 := gomonkey.ApplyFuncReturn(utils.CheckPath, tmpFile, nil) - return p1, func() { clean(); p1.Reset() } - }, expected: []string{"195", "196"}, hasError: false}, - {name: "should return major IDs when mixed devices found", - setup: func(t *testing.T) (*gomonkey.Patches, func()) { - tmpFile, clean, err := mkTemp("1 mem\n195 devdrv-cdev\n2 pty\n196 devdrv-cdev\n") - if err != nil { - t.Fatalf("failed to create temp file: %v", err) - } - p1 := gomonkey.ApplyFuncReturn(utils.CheckPath, tmpFile, nil) - return p1, func() { clean(); p1.Reset() } - }, expected: []string{"195", "196"}, hasError: false}, - } - return testCases -} - -func TestNpuMajor(t *testing.T) { - convey.Convey("TestNpuMajor", t, func() { - convey.Convey("should return cached major IDs", func() { - patches := gomonkey.ApplyFuncReturn(getNPUMajorID, []string{"123", "456"}, nil) - defer patches.Reset() - - result := npuMajor() - convey.So(result, convey.ShouldNotBeNil) - }) - }) -} - -func TestFilterNPUDevices(t *testing.T) { - convey.Convey("TestFilterNPUDevices", t, func() { - const mockMajorID = 236 - convey.Convey("should return error when spec is empty", func() { - spec := v1.Spec{} - result, err := filterNPUDevices(spec) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldContainSubstring, "empty spec info") - convey.So(result, convey.ShouldBeNil) - }) - - convey.Convey("should return devices when spec is valid", func() { - spec := v1.Spec{ - Linux: &v1.Linux{ - Resources: &v1.LinuxResources{ - Devices: []v1.LinuxDeviceCgroup{{Type: "c", Major: int64Ptr(mockMajorID), 
Minor: int64Ptr(0)}}, - }, - }, - } - patches := gomonkey.ApplyFuncReturn(npuMajor, []string{"236"}) - defer patches.Reset() - - result, err := filterNPUDevices(spec) - convey.So(err, convey.ShouldBeNil) - convey.So(result, convey.ShouldNotBeNil) - }) - }) -} - -// mkTemp creates a temporary file with the given content and returns the file name, -// a cleanup function, and an error. The file is closed before returning. -func mkTemp(content string) (string, func(), error) { - f, err := os.CreateTemp("", "test_*") - if err != nil { - return "", func() {}, err - } - if _, err = f.WriteString(content); err != nil { - clean(f) - return "", func() {}, err - } - if _, err = f.Seek(0, 0); err != nil { - clean(f) - return "", func() {}, err - } - name := f.Name() - return name, func() { clean(f) }, nil -} - -func clean(f *os.File) { - if f == nil { - return - } - if err := f.Close(); err != nil { - logger.Errorf("an error occurred where close file [%v],err :%v", f.Name(), err) - } - if err := os.Remove(f.Name()); err != nil { - logger.Errorf("an error occurred where remove file [%v],err :%v", f.Name(), err) - } -} - -func TestFilterNPUDevicesInIsula(t *testing.T) { - convey.Convey("TestFilterNPUDevicesInIsula", t, func() { - convey.Convey("should return error when container is privileged", func() { - containerInfo := isula.ContainerJson{ - HostConfig: &isula.HostConfig{ - Privileged: true, - }, - } - - result, err := filterNPUDevicesInIsula(containerInfo) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldContainSubstring, "privileged container") - convey.So(result, convey.ShouldBeNil) - }) - - convey.Convey("should return devices when container is not privileged", func() { - containerInfo := isula.ContainerJson{ - HostConfig: &isula.HostConfig{ - Privileged: false, - Devices: []isula.DeviceInfo{ - { - PathInContainer: "/dev/npu0", - }, - }, - }, - } - - patches := gomonkey.ApplyFuncReturn(getDevIdFromPath, 0, nil) - defer patches.Reset() - - 
result, err := filterNPUDevicesInIsula(containerInfo) - convey.So(err, convey.ShouldBeNil) - convey.So(result, convey.ShouldNotBeNil) - }) - }) -} - -// Helper function for creating int64 pointers -func int64Ptr(v int64) *int64 { - return &v -} - -func TestParseDiffEnvFmt(t *testing.T) { - convey.Convey("TestParseDiffEnvFmt", t, func() { - dp := &DevicesParser{} - testCases := []struct { - name string - devices string - containerID string - expected []int - }{ - {name: "should parse comma style devices when valid", - devices: testDeviceComma, - containerID: "test-container", - expected: []int{device0, device1, device2}, - }, - {name: "should parse minus style devices when valid", - devices: testDeviceRange, - containerID: "test-container", - expected: []int{device0, device1, device2}, - }, - {name: "should parse ascend style devices when valid", - devices: testAscendDevices, - containerID: "test-container", - expected: []int{device0, device1}, - }, - {name: "should parse comma minus style devices when valid", - devices: testDeviceCommaRange, - containerID: "test-container", - expected: []int{device0, device1, device2, device3}, - }, - {name: "should return empty slice when devices are empty", - devices: "", - containerID: "test-container", - expected: []int{}, - }, - } - - for _, tc := range testCases { - convey.Convey(tc.name, func() { - result := dp.parseDiffEnvFmt(tc.devices, tc.containerID) - convey.So(result, convey.ShouldResemble, tc.expected) - }) - } - }) -} - -func TestGetDeviceIDsByCommaStyle(t *testing.T) { - convey.Convey("TestGetDeviceIDsByCommaStyle", t, func() { - dp := &DevicesParser{} - testCases := []struct { - name string - devices string - containerID string - expected []int - }{ - {name: "should parse comma separated devices when valid", - devices: "0,1,2,3", - containerID: "test-container", - expected: []int{device0, device1, device2, device3}, - }, - {name: "should parse single device when valid", - devices: "0", - containerID: 
"test-container", - expected: []int{device0}, - }, - {name: "should return empty slice when devices are empty", - devices: "", - containerID: "test-container", - expected: []int{}, - }, - {name: "should parse devices with spaces when valid", - devices: testDeviceComma, - containerID: "test-container", - expected: []int{device0, device1, device2}, - }, - } - - for _, tc := range testCases { - convey.Convey(tc.name, func() { - result := dp.getDeviceIDsByCommaStyle(tc.devices, tc.containerID) - convey.So(result, convey.ShouldResemble, tc.expected) - }) - } - }) -} - -func TestGetDeviceIDsByAscendStyle(t *testing.T) { - convey.Convey("TestGetDeviceIDsByAscendStyle", t, func() { - dp := &DevicesParser{} - - testCases := []struct { - name string - devices string - containerID string - expected []int - }{ - { - name: "should parse ascend devices when valid", - devices: "Ascend-0,Ascend-1,Ascend-2", - containerID: "test-container", - expected: []int{device0, device1, device2}, - }, - { - name: "should parse single ascend device when valid", - devices: testAscendDevice0, - containerID: "test-container", - expected: []int{0}, - }, - { - name: "should return empty slice when devices are empty", - devices: "", - containerID: "test-container", - expected: []int{}, - }, - { - name: "should parse mixed case ascend devices when valid", - devices: "ascend-0,ASCEND-1", - containerID: "test-container", - expected: []int{device0, device1}, - }, - } - - for _, tc := range testCases { - convey.Convey(tc.name, func() { - result := dp.getDeviceIDsByAscendStyle(tc.devices, tc.containerID) - convey.So(result, convey.ShouldResemble, tc.expected) - }) - } - }) -} - -func TestGetDeviceIDsByMinusStyle(t *testing.T) { - convey.Convey("TestGetDeviceIDsByMinusStyle", t, func() { - dp := &DevicesParser{} - - testCases := []struct { - name string - devices string - containerID string - expected []int - }{ - { - name: "should parse range devices when valid", - devices: "0-3", - containerID: 
"test-container", - expected: []int{device0, device1, device2, device3}, - }, - { - name: "should parse single device range when valid", - devices: "0-0", - containerID: "test-container", - expected: []int{device0}, - }, - { - name: "should return empty slice when devices are empty", - devices: "", - containerID: "test-container", - expected: []int{}, - }, - } - - for _, tc := range testCases { - convey.Convey(tc.name, func() { - result := dp.getDeviceIDsByMinusStyle(tc.devices, tc.containerID) - convey.So(result, convey.ShouldResemble, tc.expected) - }) - } - }) -} - -func TestGetDeviceIDsByCommaMinusStyle(t *testing.T) { - convey.Convey("TestGetDeviceIDsByCommaMinusStyle", t, func() { - dp := &DevicesParser{} - - testCases := []struct { - name string - devices string - containerID string - expected []int - }{ - { - name: "should parse comma minus devices when valid", - devices: testDeviceCommaRange, - containerID: "test-container", - expected: []int{device0, device1, device2, device3}, - }, - { - name: "should parse single range when valid", - devices: testDeviceRange, - containerID: "test-container", - expected: []int{device0, device1, device2}, - }, - { - name: "should return nil when devices are empty", - devices: "", - containerID: "test-container", - expected: nil, - }, - { - name: "should parse mixed ranges when valid", - devices: testMixedDevices, - containerID: "test-container", - expected: []int{device0, device1, device3}, - }, - } - - for _, tc := range testCases { - convey.Convey(tc.name, func() { - result := dp.getDeviceIDsByCommaMinusStyle(tc.devices, tc.containerID) - convey.So(result, convey.ShouldResemble, tc.expected) - }) - } - }) -} - -func TestContains(t *testing.T) { - convey.Convey("TestContains", t, func() { - testCases := []struct { - name string - slice []string - target string - expected bool - }{ - { - name: "should return true when target exists in slice", - slice: []string{"a", "b", "c"}, - target: "b", - expected: true, - }, - { - 
name: "should return false when target does not exist in slice", - slice: []string{"a", "b", "c"}, - target: "d", - expected: false, - }, - { - name: "should return false when slice is empty", - slice: []string{}, - target: "a", - expected: false, - }, - { - name: "should return false when slice is nil", - slice: nil, - target: "a", - expected: false, - }, - { - name: "should return false when target is empty string", - slice: []string{"a", "b", "c"}, - target: "", - expected: false, - }, - } - - for _, tc := range testCases { - convey.Convey(tc.name, func() { - result := contains(tc.slice, tc.target) - convey.So(result, convey.ShouldEqual, tc.expected) - }) - } - }) -} - -func TestContactError(t *testing.T) { - convey.Convey("TestContactError", t, func() { - testCases := []struct { - name string - err error - msg string - expected string - }{ - { - name: "should concatenate error with message when both provided", - err: errors.New(testOriginalError), - msg: testErrorMessage, - expected: testContactedError, - }, - } - - for _, tc := range testCases { - convey.Convey(tc.name, func() { - result := contactError(tc.err, tc.msg) - convey.So(result.Error(), convey.ShouldEqual, tc.expected) - }) - } - }) -} - -func TestGetDevIdFromPath(t *testing.T) { - convey.Convey("TestGetDevIdFromPath", t, func() { - testCases := []struct { - name string - pattern string - path string - expected int - hasError bool - }{ - {name: "should extract device id when path is valid", - pattern: testDevicePattern, - path: "/dev/npu0", - expected: 0, - hasError: false, - }, - {name: "should extract device id when path has multiple digits", - pattern: testDevicePattern, - path: "/dev/npu123", - expected: 123, - hasError: false, - }, - {name: "should return error when device path is invalid", - pattern: testDevicePattern, - path: "/dev/cpu0", - expected: 0, - hasError: true, - }, - {name: "should return error when path is empty", - pattern: testDevicePattern, - path: "", - expected: 0, - hasError: 
true, - }, - } - - for _, tc := range testCases { - convey.Convey(tc.name, func() { - result, err := getDevIdFromPath(tc.pattern, tc.path) - if tc.hasError { - convey.So(err, convey.ShouldNotBeNil) - } else { - convey.So(err, convey.ShouldBeNil) - convey.So(result, convey.ShouldEqual, tc.expected) - } - }) - } - }) -} - -func TestWithDefault(t *testing.T) { - convey.Convey("TestWithDefault", t, func() { - const time0s = 0 - const time3s = 3 * time.Second - const time5s = 5 * time.Second - testCases := []struct { - name string - v time.Duration - d time.Duration - expected time.Duration - }{ - {name: "should return default when duration is zero", - v: time0s, - d: time5s, - expected: time5s, - }, - {name: "should return value when duration is non-zero", - v: time3s, - d: time5s, - expected: time3s, - }, - {name: "should return value when duration is negative", - v: -1 * time.Second, - d: time5s, - expected: -1 * time.Second, - }, - } - - for _, tc := range testCases { - convey.Convey(tc.name, func() { - result := withDefault(tc.v, tc.d) - convey.So(result, convey.ShouldEqual, tc.expected) - }) - } - }) -} diff --git a/mind-cluster/component/npu-exporter/collector/container/runtime_ops.go b/mind-cluster/component/npu-exporter/collector/container/runtime_ops.go deleted file mode 100644 index daab834..0000000 --- a/mind-cluster/component/npu-exporter/collector/container/runtime_ops.go +++ /dev/null @@ -1,413 +0,0 @@ -/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package container for monitoring containers' npu allocation -package container - -import ( - "context" - "encoding/json" - "errors" - "fmt" - "strings" - "syscall" - - "ascend-common/common-utils/hwlog" - "ascend-common/common-utils/utils" - "google.golang.org/grpc" - "google.golang.org/grpc/codes" - "google.golang.org/grpc/metadata" - "google.golang.org/grpc/status" - criv1 "k8s.io/cri-api/pkg/apis/runtime/v1" - "k8s.io/cri-api/pkg/apis/runtime/v1alpha2" - - "huawei.com/npu-exporter/v6/collector/container/isula" - "huawei.com/npu-exporter/v6/collector/container/v1" - "huawei.com/npu-exporter/v6/utils/logger" -) - -const ( - labelK8sPodNamespace = "io.kubernetes.pod.namespace" - labelK8sPodName = "io.kubernetes.pod.name" - labelContainerName = "io.kubernetes.container.name" - - // DefaultIsuladAddr default isulad sock adress - DefaultIsuladAddr = "unix:///run/isulad.sock" - // DefaultDockerShim default docker shim sock address - DefaultDockerShim = "unix:///run/dockershim.sock" - // DefaultCRIDockerd default cri-dockerd sock address - DefaultCRIDockerd = "unix:///run/cri-dockerd.sock" - // DefaultContainerdAddr default containerd sock address - DefaultContainerdAddr = "unix:///run/containerd/containerd.sock" - // DefaultDockerAddr default docker containerd sock address - DefaultDockerAddr = "unix:///run/docker/containerd/docker-containerd.sock" - defaultDockerOnEuler = "unix:///run/docker/containerd/containerd.sock" - grpcHeader = "containerd-namespace" - unixPre = "unix://" - - // IsulaContainer represents isula container type - IsulaContainer = "isula" - // DefaultContainer represents default container type - DefaultContainer = "docker-containerd" - excludePermissions = 0002 - - criV1alpha2 = "runtime.v1alpha2.RuntimeService" -) - -// CommonContainer wraps some common container attribute of isulad and containerd -type CommonContainer struct { - Id 
string - Labels map[string]string -} - -// RuntimeOperator wraps operations against container runtime -type RuntimeOperator interface { - Init() error - Close() error - GetContainers(ctx context.Context) ([]*CommonContainer, error) - GetContainerInfoByID(ctx context.Context, id string) (v1.Spec, error) - GetIsulaContainerInfoByID(ctx context.Context, id string) (isula.ContainerJson, error) - GetContainerType() string -} - -// RuntimeOperatorTool implements RuntimeOperator interface -type RuntimeOperatorTool struct { - criConn *grpc.ClientConn - conn *grpc.ClientConn - criClient interface{} - client interface{} - // CriEndpoint CRI server endpoint - CriEndpoint string - // OciEndpoint containerd Server endpoint - OciEndpoint string - // Namespace the namespace of containerd - Namespace string - // UseCriBackup use cri back up address or not - UseCriBackup bool - // UseOciBackup use oci back up address or not - UseOciBackup bool -} - -// Init initializes container runtime operator -func (operator *RuntimeOperatorTool) Init() error { - start := syscall.Getuid() - logger.Debugf("the init uid is:%d", start) - if start != 0 { - err := syscall.Setuid(0) - if err != nil { - return fmt.Errorf("raise uid failed: %v", err) - } - logger.Debugf("raise uid to:%d", 0) - defer func() { - err = syscall.Setuid(start) - if err != nil { - logger.Errorf("recover uid failed: %v", err) - } - logger.Debugf("recover uid to:%d", start) - }() - } - if err := sockCheck(operator); err != nil { - hwlog.RunLog.Error("check socket path failed") - return err - } - - if err := operator.initCriClient(); err != nil { - return fmt.Errorf("init CRI client failed, %s", err) - } - - if err := operator.initOciClient(); err != nil { - return fmt.Errorf("init OCI client failed, %s", err) - } - return nil -} - -func (operator *RuntimeOperatorTool) initCriClient() error { - criConn, err := GetConnection(operator.CriEndpoint) - if err != nil || criConn == nil { - msg := fmt.Sprintf("connecting to CRI server 
failed: %v", err) - if operator.UseCriBackup { - logger.Warnf("%v, will use cri-dockerd address to try again", msg) - if utils.IsExist(strings.TrimPrefix(DefaultCRIDockerd, unixPre)) { - criConn, err = GetConnection(DefaultCRIDockerd) - } - } else { - logger.Warn(msg) - } - } - if err != nil { - return fmt.Errorf("connecting to CRI server failed: %v", err) - } - if operator.CriEndpoint == DefaultIsuladAddr { - operator.criClient = isula.NewRuntimeServiceClient(criConn) - } else { - operator.criClient = v1alpha2.NewRuntimeServiceClient(criConn) - } - operator.criConn = criConn - return nil -} - -func (operator *RuntimeOperatorTool) initOciClient() error { - conn, err := GetConnection(operator.OciEndpoint) - if err != nil || conn == nil { - msg := fmt.Sprintf("failed to get OCI connection: %v", err) - if operator.UseOciBackup { - logger.Warnf("%v, will use backup address to try again", msg) - if utils.IsExist(strings.TrimPrefix(DefaultContainerdAddr, unixPre)) { - conn, err = GetConnection(DefaultContainerdAddr) - - } else if utils.IsExist(strings.TrimPrefix(defaultDockerOnEuler, unixPre)) { - conn, err = GetConnection(defaultDockerOnEuler) - } - } else { - logger.Warn(msg) - } - } - if err != nil { - return fmt.Errorf("connecting to OCI server failed: %v", err) - } - if operator.OciEndpoint == DefaultIsuladAddr { - operator.client = isula.NewContainerServiceClient(conn) - } else { - operator.client = v1.NewContainersClient(conn) - } - operator.conn = conn - return nil -} - -func sockCheck(operator *RuntimeOperatorTool) error { - absPath, err := utils.CheckPath(strings.TrimPrefix(operator.CriEndpoint, unixPre)) - if err != nil { - return err - } - if err := utils.DoCheckOwnerAndPermission(absPath, excludePermissions, 0); err != nil { - return err - } - - absPath, err = utils.CheckPath(strings.TrimPrefix(operator.OciEndpoint, unixPre)) - if err != nil { - return err - } - if err := utils.DoCheckOwnerAndPermission(absPath, excludePermissions, 0); err != nil { - return 
err - } - return nil -} - -// Close closes container runtime operator -func (operator *RuntimeOperatorTool) Close() error { - err := operator.conn.Close() - if err != nil { - return err - } - err = operator.criConn.Close() - if err != nil { - return err - } - return nil -} - -// GetContainers returns all containers' IDs -func (operator *RuntimeOperatorTool) GetContainers(ctx context.Context) ([]*CommonContainer, error) { - if utils.IsNil(operator.criClient) || operator.criConn == nil { - return nil, errors.New("criClient is empty") - } - if client, ok := operator.criClient.(v1alpha2.RuntimeServiceClient); ok { - containers, err := getContainersByContainerdV1alpha2(ctx, client) - if isUnimplementedError(err, criV1alpha2) { - v1Client := criv1.NewRuntimeServiceClient(operator.criConn) - return getContainersByContainerdV1(ctx, v1Client) - } - return containers, err - } - if client, ok := operator.criClient.(isula.RuntimeServiceClient); ok { - return getContainersByIsulad(ctx, client) - } - - logger.Errorf("client %v is unexpected", operator.criClient) - return nil, errors.New("unexpected client type") -} - -func isUnimplementedError(err error, serviceName string) bool { - if err == nil { - return false - } - st, ok := status.FromError(err) - if ok { - return st.Code() == codes.Unimplemented && strings.Contains(st.Message(), serviceName) - } - errStr := err.Error() - if strings.Contains(errStr, "code = Unimplemented") && - strings.Contains(errStr, "desc = ") && strings.Contains(errStr, serviceName) { - return true - } - return false -} - -// GetContainerInfoByID use oci interface to get container -func (operator *RuntimeOperatorTool) GetContainerInfoByID(ctx context.Context, id string) (v1.Spec, error) { - if utils.IsNil(operator.client) || operator.conn == nil { - return v1.Spec{}, errors.New("oci client is empty") - } - - s := v1.Spec{} - if client, ok := operator.client.(v1.ContainersClient); ok { - resp, err := client.Get(setGrpcNamespaceHeader(ctx, 
operator.Namespace), &v1.GetContainerRequest{ - Id: id, - }) - if err != nil { - hwlog.RunLog.Error("get call OCI get method failed") - return v1.Spec{}, err - } - if err = json.Unmarshal(resp.Container.Spec.Value, &s); err != nil { - hwlog.RunLog.Error("unmarshal OCI response failed") - return v1.Spec{}, err - } - return s, nil - } - - return s, errors.New("unexpected containerd client") -} - -// GetIsulaContainerInfoByID return isula container info -func (operator *RuntimeOperatorTool) GetIsulaContainerInfoByID(ctx context.Context, - id string) (isula.ContainerJson, error) { - containerJsonInfo := isula.ContainerJson{} - if utils.IsNil(operator.client) || operator.conn == nil { - return containerJsonInfo, errors.New("oci client is empty") - } - - if client, ok := operator.client.(isula.ContainerServiceClient); ok { - resp, err := client.Inspect(setGrpcNamespaceHeader(ctx, operator.Namespace), &isula.InspectContainerRequest{ - Id: id, - }) - if err != nil { - hwlog.RunLog.Error("call isula OCI Inspect method failed") - return containerJsonInfo, err - } - if err = json.Unmarshal([]byte(resp.ContainerJSON), &containerJsonInfo); err != nil { - logger.Errorf("unmarshal err: %v", err) - return containerJsonInfo, err - } - return containerJsonInfo, nil - } - - return containerJsonInfo, errors.New("unexpected isula client") -} - -// GetContainerType return container type -func (operator *RuntimeOperatorTool) GetContainerType() string { - if operator.OciEndpoint == DefaultIsuladAddr { - return IsulaContainer - } - return DefaultContainer -} - -type nsKey struct{} - -func setGrpcNamespaceHeader(ctx context.Context, namespace string) context.Context { - context.WithValue(ctx, nsKey{}, namespace) - ns := metadata.Pairs(grpcHeader, namespace) - md, ok := metadata.FromOutgoingContext(ctx) - if !ok { - md = ns - } else { - md = metadata.Join(ns, md) - } - return metadata.NewOutgoingContext(ctx, md) -} - -func getContainersByContainerdV1alpha2(ctx context.Context, - client 
v1alpha2.RuntimeServiceClient) ([]*CommonContainer, error) { - var allContainers []*CommonContainer - request := genContainerRequestV1alpha2() - r, err := client.ListContainers(ctx, request) - if err != nil { - hwlog.RunLog.Warn(err) - return nil, err - } - for _, container := range r.Containers { - allContainers = append(allContainers, &CommonContainer{ - Id: container.Id, - Labels: container.Labels, - }) - } - return allContainers, nil -} - -func getContainersByContainerdV1(ctx context.Context, client criv1.RuntimeServiceClient) ([]*CommonContainer, error) { - var allContainers []*CommonContainer - request := genContainerRequestV1() - r, err := client.ListContainers(ctx, request) - if err != nil { - hwlog.RunLog.Error(err) - return nil, err - } - for _, container := range r.Containers { - allContainers = append(allContainers, &CommonContainer{ - Id: container.Id, - Labels: container.Labels, - }) - } - return allContainers, nil -} - -func getContainersByIsulad(ctx context.Context, client isula.RuntimeServiceClient) ([]*CommonContainer, error) { - var allContainers []*CommonContainer - request := genIsulaRequest() - r, err := client.ListContainers(ctx, request) - if err != nil { - hwlog.RunLog.Error(err) - return nil, err - } - for _, container := range r.Containers { - allContainers = append(allContainers, &CommonContainer{ - Id: container.Id, - Labels: container.Labels, - }) - } - return allContainers, nil -} - -func genContainerRequestV1alpha2() *v1alpha2.ListContainersRequest { - filter := &v1alpha2.ContainerFilter{} - st := &v1alpha2.ContainerStateValue{} - st.State = v1alpha2.ContainerState_CONTAINER_RUNNING - filter.State = st - request := &v1alpha2.ListContainersRequest{ - Filter: filter, - } - return request -} - -func genContainerRequestV1() *criv1.ListContainersRequest { - filter := &criv1.ContainerFilter{} - st := &criv1.ContainerStateValue{} - st.State = criv1.ContainerState_CONTAINER_RUNNING - filter.State = st - request := 
&criv1.ListContainersRequest{ - Filter: filter, - } - return request -} - -func genIsulaRequest() *isula.ListContainersRequest { - filter := &isula.ContainerFilter{} - st := &isula.ContainerStateValue{} - st.State = isula.ContainerState_CONTAINER_RUNNING - filter.State = st - request := &isula.ListContainersRequest{ - Filter: filter, - } - return request -} diff --git a/mind-cluster/component/npu-exporter/collector/container/runtime_ops_test.go b/mind-cluster/component/npu-exporter/collector/container/runtime_ops_test.go deleted file mode 100644 index 2bc135c..0000000 --- a/mind-cluster/component/npu-exporter/collector/container/runtime_ops_test.go +++ /dev/null @@ -1,568 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package container provides utilities for container monitoring and testing. 
-package container - -import ( - "context" - "errors" - "fmt" - "testing" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" - "google.golang.org/grpc" - "google.golang.org/grpc/codes" - "google.golang.org/grpc/status" - criv1 "k8s.io/cri-api/pkg/apis/runtime/v1" - "k8s.io/cri-api/pkg/apis/runtime/v1alpha2" - - "ascend-common/common-utils/utils" - "huawei.com/npu-exporter/v6/collector/container/isula" - "huawei.com/npu-exporter/v6/collector/container/v1" -) - -const ( - // Test constants for runtime operations - testNamespace = "test-namespace" - - // Test error messages - testInitCriError = "init CRI client failed" - testInitOciError = "init OCI client failed" - testSockCheckError = "socket check failed" - testCriClientEmptyError = "criClient is empty" - testOciClientEmptyError = "oci client is empty" - testUnexpectedClientError = "unexpected client type" - testUnexpectedContainerdClientError = "unexpected containerd client" - testUnexpectedIsulaClientError = "unexpected isula client" - testCriV1alpha2 = "runtime.v1alpha2.RuntimeService" - testCriV1 = "runtime.v1.RuntimeService" -) - -func TestRuntimeOperatorToolInit(t *testing.T) { - r := &RuntimeOperatorTool{ - CriEndpoint: testContainerdEndpoint, - OciEndpoint: testContainerdEndpoint, - } - convey.Convey("should initialize successfully when all components succeed", t, func() { - operator := r - patches := gomonkey.ApplyFuncReturn(sockCheck, nil) - defer patches.Reset() - patches.ApplyFuncReturn((*RuntimeOperatorTool).initCriClient, nil) - patches.ApplyFuncReturn((*RuntimeOperatorTool).initOciClient, nil) - err := operator.Init() - convey.So(err, convey.ShouldBeNil) - }) - convey.Convey("should return error when socket check fails", t, func() { - operator := r - patches := gomonkey.ApplyFuncReturn(sockCheck, errors.New(testSockCheckError)) - defer patches.Reset() - err := operator.Init() - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), 
convey.ShouldContainSubstring, testSockCheckError) - }) - convey.Convey("should return error when CRI client init fails", t, func() { - operator := r - patches := gomonkey.ApplyFuncReturn(sockCheck, nil) - defer patches.Reset() - patches.ApplyFuncReturn((*RuntimeOperatorTool).initCriClient, errors.New(testInitCriError)) - patches.ApplyFuncReturn((*RuntimeOperatorTool).initOciClient, nil) - err := operator.Init() - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldContainSubstring, testInitCriError) - }) - convey.Convey("should return error when OCI client init fails", t, func() { - operator := r - patches := gomonkey.ApplyFuncReturn(sockCheck, nil) - defer patches.Reset() - patches.ApplyFuncReturn((*RuntimeOperatorTool).initCriClient, nil) - patches.ApplyFuncReturn((*RuntimeOperatorTool).initOciClient, errors.New(testInitOciError)) - err := operator.Init() - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldContainSubstring, testInitOciError) - }) -} - -func TestRuntimeOperatorToolInitCriClient(t *testing.T) { - convey.Convey("TestRuntimeOperatorToolInitCriClient", t, func() { - convey.Convey("should initialize CRI client successfully for containerd", func() { - operator := &RuntimeOperatorTool{ - CriEndpoint: testContainerdEndpoint, - UseOciBackup: false, - UseCriBackup: false, - } - - patches := gomonkey.ApplyFuncReturn(GetConnection, &grpc.ClientConn{}, nil) - defer patches.Reset() - - err := operator.initCriClient() - convey.So(err, convey.ShouldBeNil) - }) - - convey.Convey("should initialize CRI client successfully for isulad", func() { - operator := &RuntimeOperatorTool{ - CriEndpoint: DefaultIsuladAddr, - UseOciBackup: false, - UseCriBackup: false, - } - - patches := gomonkey.ApplyFuncReturn(GetConnection, &grpc.ClientConn{}, nil) - defer patches.Reset() - - err := operator.initCriClient() - convey.So(err, convey.ShouldBeNil) - }) - - convey.Convey("should return error when connection fails and no backup", 
func() { - operator := &RuntimeOperatorTool{ - CriEndpoint: testContainerdEndpoint, - UseOciBackup: false, - UseCriBackup: false, - } - - patches := gomonkey.ApplyFuncReturn(GetConnection, nil, errors.New("connection failed")) - defer patches.Reset() - - err := operator.initCriClient() - convey.So(err, convey.ShouldNotBeNil) - }) - }) -} - -func TestRuntimeOperatorToolInitOciClient(t *testing.T) { - testCases := buildInitOciClientTestCases() - for _, tc := range testCases { - convey.Convey(tc.name, t, func() { - operator, patches := tc.setup() - if patches != nil { - defer patches.Reset() - } - err := operator.initOciClient() - if tc.hasError { - convey.So(err, convey.ShouldNotBeNil) - } else { - convey.So(err, convey.ShouldBeNil) - } - }) - } -} - -type initOciClientTestCase struct { - name string - setup func() (*RuntimeOperatorTool, *gomonkey.Patches) - hasError bool -} - -func buildInitOciClientTestCases() []initOciClientTestCase { - return []initOciClientTestCase{ - {name: "should initialize OCI client successfully for containerd", - setup: func() (*RuntimeOperatorTool, *gomonkey.Patches) { - op := &RuntimeOperatorTool{OciEndpoint: testContainerdEndpoint, UseOciBackup: false} - p := gomonkey.ApplyFuncReturn(GetConnection, &grpc.ClientConn{}, nil) - return op, p - }, - hasError: false}, - {name: "should initialize OCI client successfully for isulad", - setup: func() (*RuntimeOperatorTool, *gomonkey.Patches) { - op := &RuntimeOperatorTool{OciEndpoint: DefaultIsuladAddr, UseOciBackup: false} - p := gomonkey.ApplyFuncReturn(GetConnection, &grpc.ClientConn{}, nil) - return op, p - }, - hasError: false}, - {name: "should return error when connection fails and no backup", - setup: func() (*RuntimeOperatorTool, *gomonkey.Patches) { - op := &RuntimeOperatorTool{OciEndpoint: testContainerdEndpoint, UseOciBackup: false} - p := gomonkey.ApplyFuncReturn(GetConnection, nil, errors.New("connection failed")) - return op, p - }, - hasError: true}, - {name: "should return error 
when OCI endpoint is empty", - setup: func() (*RuntimeOperatorTool, *gomonkey.Patches) { - op := &RuntimeOperatorTool{OciEndpoint: "", UseOciBackup: false} - return op, nil - }, - hasError: true}, - {name: "should try backup when primary connection fails", - setup: func() (*RuntimeOperatorTool, *gomonkey.Patches) { - op := &RuntimeOperatorTool{OciEndpoint: testContainerdEndpoint, UseOciBackup: true} - p := gomonkey.ApplyFunc(GetConnection, func(endpoint string) (*grpc.ClientConn, error) { - if endpoint == testContainerdEndpoint { - return nil, errors.New("primary failed") - } - return nil, errors.New("backup failed") - }) - return op, p - }, - hasError: true}, - {name: "should return error when all connections fail", - setup: func() (*RuntimeOperatorTool, *gomonkey.Patches) { - op := &RuntimeOperatorTool{OciEndpoint: testContainerdEndpoint, UseOciBackup: true} - p := gomonkey.ApplyFuncReturn(GetConnection, nil, errors.New("all failed")) - return op, p - }, - hasError: true}, - } -} - -func TestSockCheck(t *testing.T) { - convey.Convey("TestSockCheck", t, func() { - convey.Convey("should pass when socket paths are valid", func() { - operator := &RuntimeOperatorTool{ - CriEndpoint: testContainerdEndpoint, - OciEndpoint: testContainerdEndpoint, - } - - patches := gomonkey.ApplyFuncReturn(utils.CheckPath, "/run/containerd.sock", nil) - defer patches.Reset() - patches.ApplyFuncReturn(utils.DoCheckOwnerAndPermission, nil) - - err := sockCheck(operator) - convey.So(err, convey.ShouldBeNil) - }) - - convey.Convey("should return error when CRI endpoint check fails", func() { - operator := &RuntimeOperatorTool{ - CriEndpoint: testContainerdEndpoint, - OciEndpoint: testContainerdEndpoint, - } - - patches := gomonkey.ApplyFuncReturn(utils.CheckPath, "", errors.New("path check failed")) - defer patches.Reset() - - err := sockCheck(operator) - convey.So(err, convey.ShouldNotBeNil) - }) - - convey.Convey("should return error when CRI endpoint permission check fails", func() { - 
operator := &RuntimeOperatorTool{ - CriEndpoint: testContainerdEndpoint, - OciEndpoint: testContainerdEndpoint, - } - - patches := gomonkey.ApplyFuncReturn(utils.CheckPath, "/run/containerd.sock", nil) - defer patches.Reset() - patches.ApplyFuncReturn(utils.DoCheckOwnerAndPermission, errors.New("permission check failed")) - - err := sockCheck(operator) - convey.So(err, convey.ShouldNotBeNil) - }) - }) -} - -func TestRuntimeOperatorToolClose(t *testing.T) { - convey.Convey("TestRuntimeOperatorToolClose", t, func() { - convey.Convey("should close connections successfully", func() { - operator := &RuntimeOperatorTool{ - conn: &grpc.ClientConn{}, - criConn: &grpc.ClientConn{}, - } - - patches := gomonkey.ApplyFunc((*grpc.ClientConn).Close, func(*grpc.ClientConn) error { - return nil - }) - defer patches.Reset() - - err := operator.Close() - convey.So(err, convey.ShouldBeNil) - }) - - convey.Convey("should return error when OCI connection close fails", func() { - operator := &RuntimeOperatorTool{ - conn: &grpc.ClientConn{}, - criConn: &grpc.ClientConn{}, - } - - patches := gomonkey.ApplyFunc((*grpc.ClientConn).Close, func(*grpc.ClientConn) error { - return errors.New("close failed") - }) - defer patches.Reset() - - err := operator.Close() - convey.So(err, convey.ShouldNotBeNil) - }) - }) -} - -func TestRuntimeOperatorToolGetContainers(t *testing.T) { - convey.Convey("TestRuntimeOperatorToolGetContainers", t, func() { - convey.Convey("should return error when CRI client is empty", func() { - operator := &RuntimeOperatorTool{} - - patches := gomonkey.ApplyFuncReturn(utils.IsNil, true) - defer patches.Reset() - - containers, err := operator.GetContainers(context.Background()) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldEqual, testCriClientEmptyError) - convey.So(containers, convey.ShouldBeNil) - }) - - convey.Convey("should return error when CRI connection is nil", func() { - operator := &RuntimeOperatorTool{ - criClient: "mock-client", - 
} - - patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) - defer patches.Reset() - - containers, err := operator.GetContainers(context.Background()) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldEqual, testCriClientEmptyError) - convey.So(containers, convey.ShouldBeNil) - }) - - convey.Convey("should return error when client type is unexpected", func() { - operator := &RuntimeOperatorTool{ - criClient: "unexpected", - criConn: &grpc.ClientConn{}, - } - - patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) - defer patches.Reset() - - containers, err := operator.GetContainers(context.Background()) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldEqual, testUnexpectedClientError) - convey.So(containers, convey.ShouldBeNil) - }) - }) -} - -func TestIsUnimplementedError(t *testing.T) { - tests := []struct { - name string - err error - serviceName string - want bool - }{ - { - name: "nil error returns false", - err: nil, - serviceName: testCriV1alpha2, - want: false, - }, - { - name: "non-grpc error returns false", - err: errors.New("unknown service " + testCriV1alpha2), - serviceName: testCriV1alpha2, - want: false, - }, - { - name: "mismatched code returns false", - err: status.Error(codes.NotFound, "unknown service "+testCriV1alpha2), - serviceName: testCriV1alpha2, - want: false, - }, - { - name: "mismatched message returns false", - err: status.Error(codes.Unimplemented, "unknown service "+testCriV1), - serviceName: testCriV1alpha2, - want: false, - }, - { - name: "matched unimplemented error returns true", - err: status.Error(codes.Unimplemented, "unknown service "+testCriV1alpha2), - serviceName: testCriV1alpha2, - want: true, - }, - { - name: "real grpc error format returns true", - err: fmt.Errorf("rpc error: code = Unimplemented desc = unknown service " + testCriV1alpha2), - serviceName: testCriV1alpha2, - want: true, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) 
{ - if got := isUnimplementedError(tt.err, tt.serviceName); got != tt.want { - t.Errorf("isUnimplementedError() = %v, want %v (err: %v)", got, tt.want, tt.err) - } - }) - } -} - -func TestRuntimeOperatorToolGetContainerInfoByID(t *testing.T) { - convey.Convey("TestRuntimeOperatorToolGetContainerInfoByID", t, func() { - convey.Convey("should return error when OCI client is empty", func() { - operator := &RuntimeOperatorTool{} - patches := gomonkey.ApplyFuncReturn(utils.IsNil, true) - defer patches.Reset() - spec, err := operator.GetContainerInfoByID(context.Background(), testContainerID) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldEqual, testOciClientEmptyError) - convey.So(spec, convey.ShouldResemble, v1.Spec{}) - }) - convey.Convey("should return error when OCI connection is nil", func() { - operator := &RuntimeOperatorTool{client: "mock-client"} - patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) - defer patches.Reset() - spec, err := operator.GetContainerInfoByID(context.Background(), testContainerID) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldEqual, testOciClientEmptyError) - convey.So(spec, convey.ShouldResemble, v1.Spec{}) - }) - convey.Convey("should return error when client type is unexpected", func() { - operator := &RuntimeOperatorTool{client: "unexpected", conn: &grpc.ClientConn{}} - patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) - defer patches.Reset() - spec, err := operator.GetContainerInfoByID(context.Background(), testContainerID) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldEqual, testUnexpectedContainerdClientError) - convey.So(spec, convey.ShouldResemble, v1.Spec{}) - }) - convey.Convey("should return error when GetContainer call fails", func() { - operator := &RuntimeOperatorTool{client: "mock-containers-client", conn: &grpc.ClientConn{}} - patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) - defer patches.Reset() - spec, err 
:= operator.GetContainerInfoByID(context.Background(), testContainerID) - convey.So(err, convey.ShouldNotBeNil) - convey.So(spec, convey.ShouldResemble, v1.Spec{}) - }) - convey.Convey("should return error when JSON unmarshal fails", func() { - operator := &RuntimeOperatorTool{client: "mock-containers-client", conn: &grpc.ClientConn{}} - patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) - defer patches.Reset() - spec, err := operator.GetContainerInfoByID(context.Background(), testContainerID) - convey.So(err, convey.ShouldNotBeNil) - convey.So(spec, convey.ShouldResemble, v1.Spec{}) - }) - - }) -} - -func TestRuntimeOperatorToolGetIsulaContainerInfoByID(t *testing.T) { - convey.Convey("TestRuntimeOperatorToolGetIsulaContainerInfoByID", t, func() { - convey.Convey("should return error when OCI client is empty", func() { - operator := &RuntimeOperatorTool{} - patches := gomonkey.ApplyFuncReturn(utils.IsNil, true) - defer patches.Reset() - containerInfo, err := operator.GetIsulaContainerInfoByID(context.Background(), testContainerID) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldEqual, testOciClientEmptyError) - convey.So(containerInfo, convey.ShouldResemble, isula.ContainerJson{}) - }) - convey.Convey("should return error when OCI connection is nil", func() { - operator := &RuntimeOperatorTool{client: "mock-client"} - patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) - defer patches.Reset() - containerInfo, err := operator.GetIsulaContainerInfoByID(context.Background(), testContainerID) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldEqual, testOciClientEmptyError) - convey.So(containerInfo, convey.ShouldResemble, isula.ContainerJson{}) - }) - convey.Convey("should return error when client type is unexpected", func() { - operator := &RuntimeOperatorTool{client: "unexpected", conn: &grpc.ClientConn{}} - patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) - defer patches.Reset() - 
containerInfo, err := operator.GetIsulaContainerInfoByID(context.Background(), testContainerID) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldEqual, testUnexpectedIsulaClientError) - convey.So(containerInfo, convey.ShouldResemble, isula.ContainerJson{}) - }) - convey.Convey("should return error when Inspect call fails", func() { - operator := &RuntimeOperatorTool{client: "mock-isula-client", conn: &grpc.ClientConn{}} - patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) - defer patches.Reset() - containerInfo, err := operator.GetIsulaContainerInfoByID(context.Background(), testContainerID) - convey.So(err, convey.ShouldNotBeNil) - convey.So(containerInfo, convey.ShouldResemble, isula.ContainerJson{}) - }) - convey.Convey("should return error when JSON unmarshal fails", func() { - operator := &RuntimeOperatorTool{client: "mock-isula-client", conn: &grpc.ClientConn{}} - patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) - defer patches.Reset() - containerInfo, err := operator.GetIsulaContainerInfoByID(context.Background(), testContainerID) - convey.So(err, convey.ShouldNotBeNil) - convey.So(containerInfo, convey.ShouldResemble, isula.ContainerJson{}) - }) - - }) -} - -func TestRuntimeOperatorToolGetContainerType(t *testing.T) { - convey.Convey("TestRuntimeOperatorToolGetContainerType", t, func() { - convey.Convey("should return isula when endpoint is isulad", func() { - operator := &RuntimeOperatorTool{ - OciEndpoint: DefaultIsuladAddr, - } - - containerType := operator.GetContainerType() - convey.So(containerType, convey.ShouldEqual, IsulaContainer) - }) - - convey.Convey("should return default when endpoint is not isulad", func() { - operator := &RuntimeOperatorTool{ - OciEndpoint: testContainerdEndpoint, - } - - containerType := operator.GetContainerType() - convey.So(containerType, convey.ShouldEqual, DefaultContainer) - }) - }) -} - -func TestSetGrpcNamespaceHeader(t *testing.T) { - 
convey.Convey("TestSetGrpcNamespaceHeader", t, func() { - convey.Convey("should set namespace header when context has no metadata", func() { - ctx := context.Background() - result := setGrpcNamespaceHeader(ctx, testNamespace) - convey.So(result, convey.ShouldNotBeNil) - }) - - convey.Convey("should set namespace header when context has existing metadata", func() { - ctx := context.Background() - ctx = context.WithValue(ctx, "test", "value") - result := setGrpcNamespaceHeader(ctx, testNamespace) - convey.So(result, convey.ShouldNotBeNil) - }) - }) -} - -func TestGenContainerRequestV1alpha2(t *testing.T) { - convey.Convey("TestGenContainerRequestV1alpha2", t, func() { - convey.Convey("should generate valid container request", func() { - request := genContainerRequestV1alpha2() - convey.So(request, convey.ShouldNotBeNil) - convey.So(request.Filter, convey.ShouldNotBeNil) - convey.So(request.Filter.State, convey.ShouldNotBeNil) - convey.So(request.Filter.State.State, convey.ShouldEqual, v1alpha2.ContainerState_CONTAINER_RUNNING) - }) - }) -} - -func TestGenContainerRequestV1(t *testing.T) { - convey.Convey("TestGenContainerRequestV1", t, func() { - convey.Convey("should generate valid container request", func() { - request := genContainerRequestV1() - convey.So(request, convey.ShouldNotBeNil) - convey.So(request.Filter, convey.ShouldNotBeNil) - convey.So(request.Filter.State, convey.ShouldNotBeNil) - convey.So(request.Filter.State.State, convey.ShouldEqual, criv1.ContainerState_CONTAINER_RUNNING) - }) - }) -} - -func TestGenIsulaRequest(t *testing.T) { - convey.Convey("TestGenIsulaRequest", t, func() { - convey.Convey("should generate valid isula request", func() { - request := genIsulaRequest() - convey.So(request, convey.ShouldNotBeNil) - convey.So(request.Filter, convey.ShouldNotBeNil) - convey.So(request.Filter.State, convey.ShouldNotBeNil) - convey.So(request.Filter.State.State, convey.ShouldEqual, isula.ContainerState_CONTAINER_RUNNING) - }) - }) -} diff --git 
a/mind-cluster/component/npu-exporter/collector/container/utils.go b/mind-cluster/component/npu-exporter/collector/container/utils.go deleted file mode 100644 index b5ff57e..0000000 --- a/mind-cluster/component/npu-exporter/collector/container/utils.go +++ /dev/null @@ -1,133 +0,0 @@ -/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package container for monitoring containers' npu allocation -package container - -import ( - "context" - "errors" - "fmt" - "net" - "net/url" - "strings" - "time" - - "google.golang.org/grpc" - - "ascend-common/common-utils/hwlog" - "ascend-common/common-utils/utils" - "huawei.com/npu-exporter/v6/utils/logger" -) - -const ( - defaultTimeout = 5 * time.Second - unixPrefix = "unix" - // MaxLenDNS configName max len - MaxLenDNS = 512 - // MinLenDNS configName min len - MinLenDNS = 1 - maxContainers = 1024 - maxCgroupPath = 2048 - - maxDevicesNum = 100000 - maxEnvNum = 10000 -) - -// CgroupVersion is the cgroups mode of the host system -type CgroupVersion int - -// GetConnection return the grpc connection -func GetConnection(endPoint string) (*grpc.ClientConn, error) { - if endPoint == "" { - return nil, fmt.Errorf("endpoint is not set") - } - logger.Debugf("connect using endpoint '%s' with '%s' timeout", - utils.MaskPrefix(strings.TrimPrefix(endPoint, unixPrefix+"://")), defaultTimeout) - addr, dialer, err := getAddressAndDialer(endPoint) - if err != nil { - 
hwlog.RunLog.Error(err) - return nil, err - } - ctx, cancelFn := context.WithTimeout(context.Background(), defaultTimeout) - defer cancelFn() - conn, err := grpc.DialContext(ctx, addr, grpc.WithInsecure(), grpc.WithBlock(), grpc.WithContextDialer(dialer)) - if err != nil { - return nil, err - } - logger.Debugf("connected successfully using endpoint: %s", - utils.MaskPrefix(strings.TrimPrefix(endPoint, unixPrefix+"://"))) - return conn, nil -} - -func parseSocketEndpoint(endpoint string) (string, string, error) { - u, err := url.Parse(endpoint) - if err != nil { - return "", "", err - } - - switch u.Scheme { - case "unix": - return "unix", u.Path, nil - case "tcp": - return "tcp", u.Host, nil - default: - return u.Scheme, "", fmt.Errorf("protocol %q not supported", u.Scheme) - } -} - -// getAddressAndDialer returns the address parsed from the given socket endpoint and dialer -func getAddressAndDialer(endpoint string) (string, func(ctx context.Context, addr string) (net.Conn, error), error) { - prefix, addr, err := parseSocketEndpoint(endpoint) - if err != nil { - return "", nil, err - } - if prefix != unixPrefix { - return "", nil, fmt.Errorf("only support unix socket") - } - return addr, dial, nil -} - -// dial return the context dialer -func dial(ctx context.Context, addr string) (net.Conn, error) { - return (&net.Dialer{}).DialContext(ctx, unixPrefix, addr) -} - -func validDNSRe(dnsContent string) error { - if len(dnsContent) < MinLenDNS || len(dnsContent) > MaxLenDNS { - return errors.New("param len invalid") - } - return nil -} - -func makeUpDeviceInfo(c *CommonContainer) (DevicesInfo, error) { - deviceInfo := DevicesInfo{} - var names []string - - ns := c.Labels[labelK8sPodNamespace] - names = append(names, ns) - podName := c.Labels[labelK8sPodName] - names = append(names, podName) - containerName := c.Labels[labelContainerName] - names = append(names, containerName) - for _, v := range names { - if err := validDNSRe(v); err != nil { - return DevicesInfo{}, 
err - } - } - - deviceInfo.ID = c.Id - deviceInfo.Name = ns + "_" + podName + "_" + containerName - return deviceInfo, nil -} diff --git a/mind-cluster/component/npu-exporter/collector/container/utils_test.go b/mind-cluster/component/npu-exporter/collector/container/utils_test.go deleted file mode 100644 index 32e6716..0000000 --- a/mind-cluster/component/npu-exporter/collector/container/utils_test.go +++ /dev/null @@ -1,329 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// package container test methods in utils -package container - -import ( - "context" - "errors" - "net" - "testing" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" - "google.golang.org/grpc" - - "ascend-common/common-utils/hwlog" - "huawei.com/npu-exporter/v6/utils/logger" -) - -const ( - testContainerID = "container123" - testPodNamespace = "default" - testPodName = "test-pod" - testContainerName = "test-container" - testUnixSocket = "unix:///test.sock" - testInvalidEndpoint = "invalid://endpoint" - testDialError = "dial error" - testGrpcDialError = "grpc dial error" - testInvalidEndpointError = "invalid endpoint" - testEndpointNotSetError = "endpoint is not set" - testDNSContent = "test-dns" - testMinDNSContent = "a" - testEmptyDNSContent = "" - testTarget = "test" - testUnixScheme = "unix" - testTcpScheme = "tcp" - testUnixAddr = "/tmp/test.sock" - testTcpAddr = "localhost:8080" - testInvalidURL = "://invalid" - testEmptyNamespace = "" - testEmptyPodName = "" - testEmptyContainerName = "" -) - -func init() { - logger.HwLogConfig = &hwlog.LogConfig{ - OnlyToStdout: true, - } - logger.InitLogger("Prometheus") -} - -func TestGetConnection(t *testing.T) { - convey.Convey("TestGetConnection", t, func() { - convey.Convey("should return error when endpoint is empty", func() { - testEmptyEndpoint() - }) - convey.Convey("should return error when endpoint is invalid", func() { - testInvalidEndpointFunc() - }) - convey.Convey("should return error when grpc dial context fails", func() { - testGrpcDialErrorFunc() - }) - convey.Convey("should return connection when successful", func() { - testSuccessfulConnection() - }) - }) -} - -func testEmptyEndpoint() { - conn, err := GetConnection("") - convey.So(conn, convey.ShouldBeNil) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldContainSubstring, testEndpointNotSetError) -} - -func testInvalidEndpointFunc() { - patches := 
gomonkey.ApplyFuncReturn(getAddressAndDialer, "", nil, errors.New(testInvalidEndpointError)) - defer patches.Reset() - conn, err := GetConnection(testInvalidEndpoint) - convey.So(conn, convey.ShouldBeNil) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldContainSubstring, testInvalidEndpointError) -} - -func testGrpcDialErrorFunc() { - patches := gomonkey.ApplyFunc(getAddressAndDialer, - func(endpoint string) (string, func(ctx context.Context, addr string) (net.Conn, error), error) { - return testTarget, func(ctx context.Context, addr string) (net.Conn, error) { - return nil, errors.New(testDialError) - }, nil - }) - defer patches.Reset() - patches.ApplyFuncReturn(grpc.DialContext, nil, errors.New(testGrpcDialError)) - conn, err := GetConnection(testUnixSocket) - convey.So(conn, convey.ShouldBeNil) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldContainSubstring, testGrpcDialError) -} - -func testSuccessfulConnection() { - mockConn := &grpc.ClientConn{} - patches := gomonkey.ApplyFunc(getAddressAndDialer, - func(endpoint string) (string, func(ctx context.Context, addr string) (net.Conn, error), error) { - return testTarget, func(ctx context.Context, addr string) (net.Conn, error) { - return nil, nil - }, nil - }) - defer patches.Reset() - patches.ApplyFuncReturn(grpc.DialContext, mockConn, nil) - conn, err := GetConnection(testUnixSocket) - convey.So(conn, convey.ShouldEqual, mockConn) - convey.So(err, convey.ShouldBeNil) -} - -func TestParseSocketEndpoint(t *testing.T) { - testCases := []struct { - name string - endpoint string - expectedScheme string - expectedAddr string - expectedError bool - }{ - {name: "should parse unix endpoint when valid", endpoint: "unix:///tmp/test.sock", - expectedScheme: testUnixScheme, expectedAddr: testUnixAddr, expectedError: false}, - {name: "should parse tcp endpoint when valid", endpoint: "tcp://localhost:8080", - expectedScheme: testTcpScheme, expectedAddr: testTcpAddr, 
expectedError: false}, - {name: "should return error when scheme is invalid", endpoint: "http://localhost:8080", - expectedScheme: "http", expectedAddr: "", expectedError: true}, - {name: "should return error when url is invalid", endpoint: testInvalidURL, - expectedScheme: "", expectedAddr: "", expectedError: true}, - } - - for _, tc := range testCases { - convey.Convey(tc.name, t, func() { - scheme, addr, err := parseSocketEndpoint(tc.endpoint) - convey.So(scheme, convey.ShouldEqual, tc.expectedScheme) - convey.So(addr, convey.ShouldEqual, tc.expectedAddr) - if tc.expectedError { - convey.So(err, convey.ShouldNotBeNil) - } else { - convey.So(err, convey.ShouldBeNil) - } - }) - } -} - -func TestGetAddressAndDialer(t *testing.T) { - convey.Convey("TestGetAddressAndDialer", t, func() { - testCases := []struct { - name string - endpoint string - expectedAddr string - expectedError bool - }{ - { - name: "should return address when unix endpoint is valid", - endpoint: "unix:///tmp/test.sock", - expectedAddr: "/tmp/test.sock", - expectedError: false, - }, - { - name: "should return error when scheme is invalid", - endpoint: "tcp://localhost:8080", - expectedAddr: "", - expectedError: true, - }, - { - name: "should return error when parse fails", - endpoint: "://invalid", - expectedAddr: "", - expectedError: true, - }, - } - - for _, tc := range testCases { - convey.Convey(tc.name, func() { - addr, dialer, err := getAddressAndDialer(tc.endpoint) - convey.So(addr, convey.ShouldEqual, tc.expectedAddr) - if tc.expectedError { - convey.So(dialer, convey.ShouldBeNil) - convey.So(err, convey.ShouldNotBeNil) - } else { - convey.So(dialer, convey.ShouldNotBeNil) - convey.So(err, convey.ShouldBeNil) - } - }) - } - }) -} - -func TestDial(t *testing.T) { - convey.Convey("should call net.Dialer.DialContext when dialing", t, func() { - var dialerCalled bool - patches := gomonkey.ApplyMethod(&net.Dialer{}, "DialContext", - func(d *net.Dialer, ctx context.Context, network, address 
string) (net.Conn, error) { - dialerCalled = true - return nil, errors.New("mock dial error") - }) - defer patches.Reset() - ctx := context.Background() - conn, err := dial(ctx, "/tmp/test.sock") - convey.So(conn, convey.ShouldBeNil) - convey.So(err, convey.ShouldNotBeNil) - convey.So(dialerCalled, convey.ShouldBeTrue) - }) -} - -func TestValidDNSRe(t *testing.T) { - convey.Convey("TestValidDNSRe", t, func() { - testCases := []struct { - name string - dnsContent string - expectedError bool - }{ - {name: "should pass validation when dns content has valid length", - dnsContent: testDNSContent, expectedError: false}, - {name: "should return error when dns content is empty", - dnsContent: testEmptyDNSContent, expectedError: true}, - {name: "should return error when dns content is too long", - dnsContent: string(make([]byte, MaxLenDNS+1)), expectedError: true}, - {name: "should pass validation when dns content has minimum valid length", - dnsContent: testMinDNSContent, expectedError: false}, - {name: "should pass validation when dns content has maximum valid length", - dnsContent: string(make([]byte, MaxLenDNS)), expectedError: false}, - } - - for _, tc := range testCases { - convey.Convey(tc.name, func() { - err := validDNSRe(tc.dnsContent) - if tc.expectedError { - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldContainSubstring, "param len invalid") - } else { - convey.So(err, convey.ShouldBeNil) - } - }) - } - }) -} - -func TestMakeUpDeviceInfo(t *testing.T) { - testCases := getMakeUpDeviceInfoTestCases() - for _, tc := range testCases { - convey.Convey(tc.name, t, func() { - deviceInfo, err := makeUpDeviceInfo(tc.container) - validateMakeUpDeviceInfoResult(deviceInfo, err, tc) - }) - } -} - -func getMakeUpDeviceInfoTestCases() []struct { - name string - container *CommonContainer - expectedError bool - expectedName string -} { - return []struct { - name string - container *CommonContainer - expectedError bool - expectedName string - }{ - 
{name: "should return valid device info when container has all labels", - container: createValidContainer(), expectedError: false, expectedName: "default_test-pod_test-container"}, - {name: "should return error when container has invalid namespace length", - container: createContainerWithEmptyNamespace(), expectedError: true, expectedName: ""}, - {name: "should return error when container has invalid pod name length", - container: createContainerWithEmptyPodName(), expectedError: true, expectedName: ""}, - {name: "should return error when container has invalid container name length", - container: createContainerWithEmptyContainerName(), expectedError: true, expectedName: ""}, - {name: "should return error when container has too long namespace", - container: createContainerWithLongNamespace(), expectedError: true, expectedName: ""}, - } -} - -func createValidContainer() *CommonContainer { - return &CommonContainer{Id: testContainerID, Labels: map[string]string{ - labelK8sPodNamespace: testPodNamespace, labelK8sPodName: testPodName, - labelContainerName: testContainerName}} -} -func createContainerWithEmptyNamespace() *CommonContainer { - return &CommonContainer{Id: testContainerID, Labels: map[string]string{ - labelK8sPodNamespace: testEmptyNamespace, labelK8sPodName: testPodName, - labelContainerName: testContainerName}} -} -func createContainerWithEmptyPodName() *CommonContainer { - return &CommonContainer{Id: testContainerID, Labels: map[string]string{ - labelK8sPodNamespace: testPodNamespace, labelK8sPodName: testEmptyPodName, - labelContainerName: testContainerName}} -} -func createContainerWithEmptyContainerName() *CommonContainer { - return &CommonContainer{Id: testContainerID, Labels: map[string]string{ - labelK8sPodNamespace: testPodNamespace, labelK8sPodName: testPodName, - labelContainerName: testEmptyContainerName}} -} - -func createContainerWithLongNamespace() *CommonContainer { - return &CommonContainer{Id: testContainerID, Labels: map[string]string{ - 
labelK8sPodNamespace: string(make([]byte, MaxLenDNS+1)), - labelK8sPodName: testPodName, labelContainerName: testContainerName}} -} - -func validateMakeUpDeviceInfoResult(deviceInfo DevicesInfo, err error, tc struct { - name string - container *CommonContainer - expectedError bool - expectedName string -}) { - if tc.expectedError { - convey.So(err, convey.ShouldNotBeNil) - convey.So(deviceInfo, convey.ShouldResemble, DevicesInfo{}) - } else { - convey.So(err, convey.ShouldBeNil) - convey.So(deviceInfo.ID, convey.ShouldEqual, tc.container.Id) - convey.So(deviceInfo.Name, convey.ShouldEqual, tc.expectedName) - } -} diff --git a/mind-cluster/component/npu-exporter/collector/container/v1/containerd.pb.go b/mind-cluster/component/npu-exporter/collector/container/v1/containerd.pb.go deleted file mode 100644 index 46762f3..0000000 --- a/mind-cluster/component/npu-exporter/collector/container/v1/containerd.pb.go +++ /dev/null @@ -1,310 +0,0 @@ -// Code generated by protoc-gen-go. DO NOT EDIT. -// source: containerd.proto -// protoc:3.13.0 -// protoc-gen-go 1.3.5 - -package v1 - -import ( - "context" - "fmt" - "math" - - "github.com/golang/protobuf/proto" - "github.com/golang/protobuf/ptypes/any" - "google.golang.org/grpc" - "google.golang.org/grpc/codes" - "google.golang.org/grpc/status" -) - -// Reference imports to suppress errors if they are not otherwise used. -var _ = fmt.Errorf -var _ = math.Inf -var _ = proto.Marshal - -// This is a compile-time assertion to ensure that this generated file -// is compatible with the proto package it is being compiled against. -// A compilation error at this line likely means your copy of the -// proto package needs to be updated. 
-const _ = proto.ProtoPackageIsVersion3 // please upgrade the proto package - -type Container struct { - // ID the container id - Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"` - // Labels the container labels - Labels map[string]string `protobuf:"bytes,2,rep,name=labels,proto3" json:"labels,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` - // Image the container image - Image string `protobuf:"bytes,3,opt,name=image,proto3" json:"image,omitempty"` - // Spec runtime specific. - Spec *any.Any `protobuf:"bytes,5,opt,name=spec,proto3" json:"spec,omitempty"` - XXX_NoUnkeyedLiteral struct{} `json:"-"` - XXX_unrecognized []byte `json:"-"` - XXX_sizecache int32 `json:"-"` -} - -// Reset reset the object -func (m *Container) Reset() { *m = Container{} } - -// String -func (m *Container) String() string { return proto.CompactTextString(m) } - -// ProtoMessage -func (*Container) ProtoMessage() {} - -// Descriptor -func (*Container) Descriptor() ([]byte, []int) { - return fileDescriptor_29bcc067d8d1b7d0, []int{0} -} - -// XXX_Unmarshal -func (m *Container) XXX_Unmarshal(b []byte) error { - return xxx_messageInfo_Container.Unmarshal(m, b) -} - -// XXX_Marshal -func (m *Container) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { - return xxx_messageInfo_Container.Marshal(b, m, deterministic) -} - -// XXX_Merge -func (m *Container) XXX_Merge(src proto.Message) { - xxx_messageInfo_Container.Merge(m, src) -} - -// XXX_Size -func (m *Container) XXX_Size() int { - return xxx_messageInfo_Container.Size(m) -} - -// XXX_DiscardUnknown -func (m *Container) XXX_DiscardUnknown() { - xxx_messageInfo_Container.DiscardUnknown(m) -} - -var xxx_messageInfo_Container proto.InternalMessageInfo - -// GetId -func (m *Container) GetId() string { - if m != nil { - return m.Id - } - return "" -} - -// GetLabels -func (m *Container) GetLabels() map[string]string { - if m != nil { - return m.Labels - } - return nil -} 
- -// GetImage -func (m *Container) GetImage() string { - if m != nil { - return m.Image - } - return "" -} - -// GetSpec -func (m *Container) GetSpec() *any.Any { - if m != nil { - return m.Spec - } - return nil -} - -type GetContainerRequest struct { - Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"` - XXX_NoUnkeyedLiteral struct{} `json:"-"` - XXX_unrecognized []byte `json:"-"` - XXX_sizecache int32 `json:"-"` -} - -func (m *GetContainerRequest) Reset() { *m = GetContainerRequest{} } -func (m *GetContainerRequest) String() string { return proto.CompactTextString(m) } -func (*GetContainerRequest) ProtoMessage() {} -func (*GetContainerRequest) Descriptor() ([]byte, []int) { - return fileDescriptor_29bcc067d8d1b7d0, []int{1} -} - -func (m *GetContainerRequest) XXX_Unmarshal(b []byte) error { - return xxx_messageInfo_GetContainerRequest.Unmarshal(m, b) -} -func (m *GetContainerRequest) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { - return xxx_messageInfo_GetContainerRequest.Marshal(b, m, deterministic) -} -func (m *GetContainerRequest) XXX_Merge(src proto.Message) { - xxx_messageInfo_GetContainerRequest.Merge(m, src) -} -func (m *GetContainerRequest) XXX_Size() int { - return xxx_messageInfo_GetContainerRequest.Size(m) -} -func (m *GetContainerRequest) XXX_DiscardUnknown() { - xxx_messageInfo_GetContainerRequest.DiscardUnknown(m) -} - -var xxx_messageInfo_GetContainerRequest proto.InternalMessageInfo - -func (m *GetContainerRequest) GetId() string { - if m != nil { - return m.Id - } - return "" -} - -type GetContainerResponse struct { - Container *Container `protobuf:"bytes,1,opt,name=container,proto3" json:"container,omitempty"` - XXX_NoUnkeyedLiteral struct{} `json:"-"` - XXX_unrecognized []byte `json:"-"` - XXX_sizecache int32 `json:"-"` -} - -func (m *GetContainerResponse) Reset() { *m = GetContainerResponse{} } -func (m *GetContainerResponse) String() string { return proto.CompactTextString(m) } -func (*GetContainerResponse) 
ProtoMessage() {} -func (*GetContainerResponse) Descriptor() ([]byte, []int) { - return fileDescriptor_29bcc067d8d1b7d0, []int{2} -} - -func (m *GetContainerResponse) XXX_Unmarshal(b []byte) error { - return xxx_messageInfo_GetContainerResponse.Unmarshal(m, b) -} -func (m *GetContainerResponse) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { - return xxx_messageInfo_GetContainerResponse.Marshal(b, m, deterministic) -} -func (m *GetContainerResponse) XXX_Merge(src proto.Message) { - xxx_messageInfo_GetContainerResponse.Merge(m, src) -} -func (m *GetContainerResponse) XXX_Size() int { - return xxx_messageInfo_GetContainerResponse.Size(m) -} -func (m *GetContainerResponse) XXX_DiscardUnknown() { - xxx_messageInfo_GetContainerResponse.DiscardUnknown(m) -} - -var xxx_messageInfo_GetContainerResponse proto.InternalMessageInfo - -func (m *GetContainerResponse) GetContainer() *Container { - if m != nil { - return m.Container - } - return nil -} - -func init() { - proto.RegisterType((*Container)(nil), "containerd.services.containers.v1.Container") - proto.RegisterMapType((map[string]string)(nil), "containerd.services.containers.v1.Container.LabelsEntry") - proto.RegisterType((*GetContainerRequest)(nil), "containerd.services.containers.v1.GetContainerRequest") - proto.RegisterType((*GetContainerResponse)(nil), "containerd.services.containers.v1.GetContainerResponse") -} - -func init() { - proto.RegisterFile("containerd.proto", fileDescriptor_29bcc067d8d1b7d0) -} - -var fileDescriptor_29bcc067d8d1b7d0 = []byte{ - // 327 bytes of a gzipped FileDescriptorProto - 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x94, 0x51, 0x4f, 0x4b, 0xc3, 0x30, - 0x14, 0xa7, 0xad, 0x1b, 0xec, 0x15, 0x64, 0xc4, 0x1d, 0xea, 0x4e, 0x73, 0x20, 0xf4, 0xa0, 0xa9, - 0xab, 0xa0, 0x53, 0x4f, 0x2a, 0x32, 0x10, 0x0f, 0xd2, 0xa3, 0xb7, 0xb6, 0x7b, 0xce, 0x62, 0x96, - 0xd4, 0x24, 0xad, 0xf6, 0xee, 0x87, 0xf5, 0x63, 0xc8, 0xd2, 0xad, 0x4e, 0x11, 0x74, 0xb7, 0xf7, - 0x5e, 0x7f, 0x7f, 
0x1b, 0xe8, 0xa6, 0x82, 0xeb, 0x38, 0xe3, 0x28, 0xa7, 0x34, 0x97, 0x42, 0x0b, - 0xb2, 0xb7, 0x76, 0x51, 0x28, 0xcb, 0x2c, 0x45, 0x45, 0x9b, 0x9b, 0xa2, 0xe5, 0xa8, 0xbf, 0x3b, - 0x13, 0x62, 0xc6, 0x30, 0x30, 0x84, 0xa4, 0x78, 0x0c, 0x62, 0x5e, 0xd5, 0xec, 0xe1, 0x87, 0x05, - 0x9d, 0xeb, 0x15, 0x98, 0x6c, 0x83, 0x9d, 0x4d, 0x3d, 0x6b, 0x60, 0xf9, 0x9d, 0xc8, 0xce, 0xa6, - 0xe4, 0x1e, 0xda, 0x2c, 0x4e, 0x90, 0x29, 0xcf, 0x1e, 0x38, 0xbe, 0x1b, 0x8e, 0xe9, 0x9f, 0x66, - 0xb4, 0x51, 0xa3, 0x77, 0x86, 0x7a, 0xc3, 0xb5, 0xac, 0xa2, 0xa5, 0x0e, 0xe9, 0x41, 0x2b, 0x9b, - 0xc7, 0x33, 0xf4, 0x1c, 0x63, 0x52, 0x2f, 0xc4, 0x87, 0x2d, 0x95, 0x63, 0xea, 0xb5, 0x06, 0x96, - 0xef, 0x86, 0x3d, 0x5a, 0xe7, 0xa5, 0xab, 0xbc, 0xf4, 0x92, 0x57, 0x91, 0x41, 0xf4, 0xcf, 0xc0, - 0x5d, 0x93, 0x25, 0x5d, 0x70, 0x9e, 0xb1, 0x5a, 0x26, 0x5e, 0x8c, 0x0b, 0x83, 0x32, 0x66, 0x05, - 0x7a, 0x76, 0x6d, 0x60, 0x96, 0x73, 0x7b, 0x6c, 0x0d, 0xf7, 0x61, 0x67, 0x82, 0xba, 0x89, 0x17, - 0xe1, 0x4b, 0x81, 0x4a, 0xff, 0xec, 0x3c, 0x4c, 0xa0, 0xf7, 0x1d, 0xa6, 0x72, 0xc1, 0x15, 0x92, - 0x5b, 0xe8, 0x34, 0x45, 0x0d, 0xdc, 0x0d, 0x0f, 0x36, 0xf9, 0x1d, 0xd1, 0x17, 0x3d, 0x7c, 0xb7, - 0x00, 0x9a, 0x0f, 0x8a, 0x94, 0xe0, 0x4c, 0x50, 0x93, 0x93, 0x7f, 0xc8, 0xfd, 0xd2, 0xa0, 0x7f, - 0xba, 0x31, 0xaf, 0xae, 0x74, 0x75, 0xf4, 0x40, 0x9f, 0x8a, 0xf8, 0x15, 0x33, 0x9a, 0x8a, 0x79, - 0xc0, 0xf3, 0xe2, 0x10, 0xdf, 0x72, 0x21, 0x35, 0xca, 0x20, 0x15, 0x8c, 0x61, 0xaa, 0xc5, 0x62, - 0x5a, 0xd2, 0x2e, 0xca, 0x51, 0xd2, 0x36, 0x4f, 0x72, 0xfc, 0x19, 0x00, 0x00, 0xff, 0xff, 0x30, - 0xcc, 0x1c, 0x74, 0x87, 0x02, 0x00, 0x00, -} - -// Reference imports to suppress errors if they are not otherwise used. -var _ context.Context -var _ grpc.ClientConnInterface - -// This is a compile-time assertion to ensure that this generated file -// is compatible with the grpc package it is being compiled against. -const _ = grpc.SupportPackageIsVersion6 - -// ContainersClient is the client API for Containers service. 
-// -// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://godoc.org/google.golang.org/grpc#ClientConn.NewStream. -type ContainersClient interface { - Get(ctx context.Context, in *GetContainerRequest, opts ...grpc.CallOption) (*GetContainerResponse, error) -} - -type containersClient struct { - cc grpc.ClientConnInterface -} - -func NewContainersClient(cc grpc.ClientConnInterface) ContainersClient { - return &containersClient{cc} -} - -func (c *containersClient) Get(ctx context.Context, in *GetContainerRequest, opts ...grpc.CallOption) (*GetContainerResponse, error) { - out := new(GetContainerResponse) - err := c.cc.Invoke(ctx, "/containerd.services.containers.v1.Containers/Get", in, out, opts...) - if err != nil { - return nil, err - } - return out, nil -} - -// ContainersServer is the server API for Containers service. -type ContainersServer interface { - Get(context.Context, *GetContainerRequest) (*GetContainerResponse, error) -} - -// UnimplementedContainersServer can be embedded to have forward compatible implementations. 
-type UnimplementedContainersServer struct { -} - -func (*UnimplementedContainersServer) Get(context.Context, *GetContainerRequest) (*GetContainerResponse, error) { - return nil, status.Errorf(codes.Unimplemented, "method Get not implemented") -} - -func RegisterContainersServer(s *grpc.Server, srv ContainersServer) { - s.RegisterService(&_Containers_desc, srv) -} - -func _Containers_Get_Method(srv interface{}, ctx context.Context, desc func(interface{}) error, itcpt grpc.UnaryServerInterceptor) (interface{}, error) { - in := new(GetContainerRequest) - if err := desc(in); err != nil { - return nil, err - } - if itcpt == nil { - return srv.(ContainersServer).Get(ctx, in) - } - info := &grpc.UnaryServerInfo{ - Server: srv, - FullMethod: "/containerd.services.containers.v1.Containers/Get", - } - handler := func(ctx context.Context, request interface{}) (interface{}, error) { - return srv.(ContainersServer).Get(ctx, request.(*GetContainerRequest)) - } - return itcpt(ctx, in, info, handler) -} - -var _Containers_desc = grpc.ServiceDesc{ - ServiceName: "containerd.services.containers.v1.Containers", - HandlerType: (*ContainersServer)(nil), - Methods: []grpc.MethodDesc{ - { - MethodName: "Get", - Handler: _Containers_Get_Method, - }, - }, - Streams: []grpc.StreamDesc{}, - Metadata: "containerd.proto", -} diff --git a/mind-cluster/component/npu-exporter/collector/container/v1/containerd.proto b/mind-cluster/component/npu-exporter/collector/container/v1/containerd.proto deleted file mode 100644 index 48a4a4b..0000000 --- a/mind-cluster/component/npu-exporter/collector/container/v1/containerd.proto +++ /dev/null @@ -1,62 +0,0 @@ -syntax = "proto3"; - -package containerd.services.containers.v1; - - -import "google/protobuf/any.proto"; -import "google/protobuf/timestamp.proto"; - -option go_package = "huawei.com/npu-exporter/v6/collector/container;v1"; - -// Containers provides metadata storage for containers used in the execution -// service. 
-service Containers { - rpc Get(GetContainerRequest) returns (GetContainerResponse); -} - -message Container { - // ID is the user-specified identifier. - string id = 1; - - // Labels provides an area to include arbitrary data on containers. - map labels = 2; - - // Image contains the reference of the image used to build the - string image = 3; - - message Runtime { - // Name is the name of the runtime. - string name = 1; - // Options runtime initialization options. - google.protobuf.Any options = 2; - } - // Runtime specifies runtime. - Runtime runtime = 4; - - // Spec opencotainer spec. - google.protobuf.Any spec = 5; - - // Snapshotter is the snapshotter name used for rootfs - string snapshotter = 6; - - // SnapshotKey the snapshot key to use for the container's root - string snapshot_key = 7; - - // CreatedAt is the create time of container. - google.protobuf.Timestamp created_at = 8 ; - - // UpdatedAt is the last update of container. - google.protobuf.Timestamp updated_at = 9 ; - - // Extensions allow clients to provide zero or more blobs that are directly - map extensions = 10 ; -} - -message GetContainerRequest { - string id = 1; -} - -message GetContainerResponse { - Container container = 1 ; -} - diff --git a/mind-cluster/component/npu-exporter/collector/container/v1/spec.go b/mind-cluster/component/npu-exporter/collector/container/v1/spec.go deleted file mode 100644 index 2efa216..0000000 --- a/mind-cluster/component/npu-exporter/collector/container/v1/spec.go +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package v1 implement the containerd client -package v1 - -// Spec is the base configuration for the container. -type Spec struct { - // Linux is platform-specific configuration for Linux based containers. - Linux *Linux `json:"linux,omitempty" platform:"linux"` - // Process for get capabilities - Process *Process `json:"process,omitempty" platform:"linux"` -} - -// Process is the base configuration for the container. -type Process struct { - // Env for container env - Env []string `json:"env,omitempty" platform:"linux"` -} - -// Linux contains platform-specific configuration for Linux based containers. -type Linux struct { - // Resources contain cgroup information for handling resource constraints - // for the container - Resources *LinuxResources `json:"resources,omitempty"` - // Devices are a list of device nodes that are created for the container -} - -// LinuxResources has container runtime resource constraints -type LinuxResources struct { - // Devices configures the device allowlist. - Devices []LinuxDeviceCgroup `json:"devices,omitempty"` -} - -// LinuxDeviceCgroup represents a device rule for the devices specified to -// the device controller -type LinuxDeviceCgroup struct { - // Allow or deny - Allow bool `json:"allow"` - // Device type, block, char, etc. - Type string `json:"type,omitempty"` - // Major is the device's major number. - Major *int64 `json:"major,omitempty"` - // Minor is the device's minor number. - Minor *int64 `json:"minor,omitempty"` - // Cgroup access permissions format, rwm. 
- Access string `json:"access,omitempty"` -} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_ddr.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_ddr.go deleted file mode 100644 index 53a7645..0000000 --- a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_ddr.go +++ /dev/null @@ -1,142 +0,0 @@ -/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package metrics for general collector -package metrics - -import ( - "time" - - "github.com/prometheus/client_golang/prometheus" - - "ascend-common/api" - "ascend-common/common-utils/hwlog" - "ascend-common/devmanager/common" - colcommon "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" - "huawei.com/npu-exporter/v6/utils/logger" -) - -var ( - descTotalMemory = colcommon.BuildDesc("npu_chip_info_total_memory", "the npu total memory") - descUsedMemory = colcommon.BuildDesc("npu_chip_info_used_memory", "the npu used memory") - - notSupportedDdrDevices = map[string]bool{ - api.Ascend910B: true, - api.Ascend910A3: true, - } -) - -type ddrCache struct { - chip colcommon.HuaWeiAIChip - timestamp time.Time - // extInfo the memoryInfo of the chip - extInfo *common.MemoryInfo -} - -// DdrCollector collect ddr info -type DdrCollector struct { - colcommon.MetricsCollectorAdapter -} - -// IsSupported check whether the metric is supported -func (c 
*DdrCollector) IsSupported(n *colcommon.NpuCollector) bool { - isSupport := !notSupportedDdrDevices[n.Dmgr.GetDevType()] - logForUnSupportDevice(isSupport, n.Dmgr.GetDevType(), colcommon.GetCacheKey(c), - "there is no DDR module. DDR information cannot be queried.") - return isSupport -} - -// Describe description of the metric -func (c *DdrCollector) Describe(ch chan<- *prometheus.Desc) { - ch <- descTotalMemory - ch <- descUsedMemory -} - -// CollectToCache collect the metric to cache -func (c *DdrCollector) CollectToCache(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) { - - for _, chip := range chipList { - logicID := chip.LogicID - mem, err := n.Dmgr.GetDeviceMemoryInfo(logicID) - if err != nil { - logErrMetricsWithLimit(colcommon.DomainForDDR, logicID, err) - continue - } - hwlog.ResetErrCnt(colcommon.DomainForDDR, logicID) - - c.LocalCache.Store(chip.PhyId, ddrCache{chip: chip, timestamp: time.Now(), extInfo: mem}) - } - colcommon.UpdateCache[ddrCache](n, colcommon.GetCacheKey(c), &c.LocalCache) - -} - -// UpdatePrometheus update prometheus metrics -func (c *DdrCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) { - - updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache ddrCache, cardLabel []string) { - extInfo := cache.extInfo - if extInfo == nil { - return - } - memorySize := extInfo.MemorySize - memoryAvailable := extInfo.MemoryAvailable - - doUpdateMetric(ch, cache.timestamp, memorySize, cardLabel, descTotalMemory) - doUpdateMetric(ch, cache.timestamp, memorySize-memoryAvailable, cardLabel, descUsedMemory) - - // vnpu not support this metrics - vDevActivityInfo := chipWithVnpu.VDevActivityInfo - if vDevActivityInfo != nil && common.IsValidVDevID(vDevActivityInfo.VDevID) { - return - } - - containerNameArray := getContainerNameArray(geenContainerInfo(&chipWithVnpu, containerMap)) - if !c.Is910Series && 
len(containerNameArray) == colcommon.ContainerNameLen { - doUpdateMetric(ch, cache.timestamp, memorySize, cardLabel, npuCtrTotalMemory) - doUpdateMetric(ch, cache.timestamp, memorySize-memoryAvailable, cardLabel, npuCtrUsedMemory) - } - } - - updateFrame[ddrCache](colcommon.GetCacheKey(c), n, containerMap, chips, updateSingleChip) -} - -// UpdateTelegraf update telegraf metrics -func (c *DdrCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} { - - caches := colcommon.GetInfoFromCache[ddrCache](n, colcommon.GetCacheKey(c)) - for _, chip := range chips { - cache, ok := caches[chip.PhyId] - if !ok { - logger.Debugf("cacheKey(%v) not found", chip.PhyId) - continue - } - fieldMap := getFieldMap(fieldsMap, cache.chip.LogicID) - - memoryInfo := cache.extInfo - if memoryInfo == nil { - logger.Debugf("info in cache is nil,cacheKey(%v)", chip.PhyId) - continue - } - memorySize := memoryInfo.MemorySize - memoryAvailable := memoryInfo.MemoryAvailable - - doUpdateTelegraf(fieldMap, descTotalMemory, memorySize, "") - doUpdateTelegraf(fieldMap, descUsedMemory, memorySize-memoryAvailable, "") - - } - return fieldsMap -} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hbm.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hbm.go deleted file mode 100644 index d9f5601..0000000 --- a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hbm.go +++ /dev/null @@ -1,228 +0,0 @@ -/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package metrics for general collector -package metrics - -import ( - "time" - - "github.com/prometheus/client_golang/prometheus" - - "ascend-common/api" - "ascend-common/devmanager" - "ascend-common/devmanager/common" - colcommon "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" -) - -var ( - descHbmUsedMemory = colcommon.BuildDesc("npu_chip_info_hbm_used_memory", "the npu hbm used memory") - descHbmTotalMemory = colcommon.BuildDesc("npu_chip_info_hbm_total_memory", "the npu hbm total memory") - descHbmUtilization = colcommon.BuildDesc("npu_chip_info_hbm_utilization", "the npu hbm utilization") - descHbmTemperature = colcommon.BuildDesc("npu_chip_info_hbm_temperature", "the npu hbm temperature") - descHbmBWUtil = colcommon.BuildDesc("npu_chip_info_hbm_bandwidth_utilization", "the npu hbm bandwidth util rate") - - descEccEnableFlag = colcommon.BuildDesc("npu_chip_info_hbm_ecc_enable_flag", - "whether HBM ecc detection is enabled") - descEccSingleBitErrorCnt = colcommon.BuildDesc("npu_chip_info_hbm_ecc_single_bit_error_cnt", - "HBM Single Bit Error Count") - descEccDoubleBitErrorCnt = colcommon.BuildDesc("npu_chip_info_hbm_ecc_double_bit_error_cnt", - "HBM Double Bit Error Count") - - descEccTotalSingleBitErrorCnt = colcommon.BuildDesc("npu_chip_info_hbm_ecc_total_single_bit_error_cnt", - "HBM Single Bit Aggregate Total Err Cnt") - descEccTotalDoubleBitErrorCnt = colcommon.BuildDesc("npu_chip_info_hbm_ecc_total_double_bit_error_cnt", - "HBM Double Bit Aggregate Total Err Cnt") - descEccSingleBitIoslatedPagesCnt 
= colcommon.BuildDesc("npu_chip_info_hbm_ecc_single_bit_isolated_pages_cnt", - "HBM Single Bit Isolated Pages Count") - descEccDoubleBitIoslatedPagesCnt = colcommon.BuildDesc("npu_chip_info_hbm_ecc_double_bit_isolated_pages_cnt", - "HBM Double Bit Isolated Pages Count") -) - -var ( - supportedHbmDevices = map[string]bool{ - api.Ascend910A: true, - api.Ascend910B: true, - api.Ascend910A3: true, - } -) - -type hbmCache struct { - chip colcommon.HuaWeiAIChip - timestamp time.Time - // extInfo the hbm info - extInfo *common.HbmAggregateInfo - // hbmUtilization the hbm utilization - hbmUtilization uint32 -} - -// HbmCollector collects hbm info -type HbmCollector struct { - colcommon.MetricsCollectorAdapter -} - -// IsSupported check whether the collector is supported -func (c *HbmCollector) IsSupported(n *colcommon.NpuCollector) bool { - isSupport := supportedHbmDevices[n.Dmgr.GetDevType()] - logForUnSupportDevice(isSupport, n.Dmgr.GetDevType(), colcommon.GetCacheKey(c), "") - return isSupport -} - -// Describe describes all the metrics that will be exposed. -func (c *HbmCollector) Describe(ch chan<- *prometheus.Desc) { - ch <- descHbmUsedMemory - ch <- descHbmTotalMemory - ch <- descHbmUtilization - ch <- descHbmTemperature - ch <- descHbmBWUtil - - ch <- descEccEnableFlag - ch <- descEccSingleBitErrorCnt - ch <- descEccDoubleBitErrorCnt - ch <- descEccTotalSingleBitErrorCnt - ch <- descEccTotalDoubleBitErrorCnt - ch <- descEccSingleBitIoslatedPagesCnt - ch <- descEccDoubleBitIoslatedPagesCnt -} - -// CollectToCache collects hbm info -func (c *HbmCollector) CollectToCache(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) { - for _, chip := range chipList { - getAllHBMEccInfo(c, chip.LogicID, n.Dmgr, &chip) - } - colcommon.UpdateCache[hbmCache](n, colcommon.GetCacheKey(c), &c.LocalCache) -} - -// UpdatePrometheus updates the prometheus metrics. 
-func (c *HbmCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) { - - updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache hbmCache, cardLabel []string) { - extInfo := cache.extInfo - if extInfo == nil { - return - } - timestamp := cache.timestamp - doUpdateMetricWithValidateNum(ch, timestamp, float64(cache.hbmUtilization), cardLabel, descHbmUtilization) - - c.updateHbmInfo(ch, cache, cardLabel, containerMap, chipWithVnpu) - - eccInfo := extInfo.ECCInfo - updateHbmEccInfo(ch, eccInfo, timestamp, cardLabel) - } - - updateFrame[hbmCache](colcommon.GetCacheKey(c), n, containerMap, chips, updateSingleChip) -} - -// UpdateTelegraf updates the telegraf metrics. -func (c *HbmCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} { - caches := colcommon.GetInfoFromCache[hbmCache](n, colcommon.GetCacheKey(c)) - for _, chip := range chips { - cache, ok := caches[chip.PhyId] - if !ok { - continue - } - fieldMap := getFieldMap(fieldsMap, cache.chip.LogicID) - - extInfo := cache.extInfo - if extInfo == nil { - continue - } - - doUpdateTelegrafWithValidateNum(fieldMap, descHbmUtilization, float64(cache.hbmUtilization), "") - - hbmInfo := extInfo.HbmInfo - if hbmInfo != nil { - doUpdateTelegraf(fieldMap, descHbmUsedMemory, hbmInfo.Usage, "") - doUpdateTelegraf(fieldMap, descHbmTotalMemory, hbmInfo.MemorySize, "") - doUpdateTelegraf(fieldMap, descHbmTemperature, hbmInfo.Temp, "") - doUpdateTelegraf(fieldMap, descHbmBWUtil, hbmInfo.BandWidthUtilRate, "") - } - - eccInfo := extInfo.ECCInfo - if eccInfo != nil { - doUpdateTelegraf(fieldMap, descEccEnableFlag, eccInfo.EnableFlag, "") - doUpdateTelegraf(fieldMap, descEccSingleBitErrorCnt, eccInfo.SingleBitErrorCnt, "") - doUpdateTelegraf(fieldMap, 
descEccDoubleBitErrorCnt, eccInfo.DoubleBitErrorCnt, "") - doUpdateTelegraf(fieldMap, descEccTotalSingleBitErrorCnt, eccInfo.TotalSingleBitErrorCnt, "") - doUpdateTelegraf(fieldMap, descEccTotalDoubleBitErrorCnt, eccInfo.TotalDoubleBitErrorCnt, "") - doUpdateTelegraf(fieldMap, descEccSingleBitIoslatedPagesCnt, eccInfo.SingleBitIsolatedPagesCnt, "") - doUpdateTelegraf(fieldMap, descEccDoubleBitIoslatedPagesCnt, eccInfo.DoubleBitIsolatedPagesCnt, "") - - } - } - return fieldsMap - -} - -func getAllHBMEccInfo(c *HbmCollector, logicID int32, dmgr devmanager.DeviceInterface, chip *colcommon.HuaWeiAIChip) { - - hbmInfo := &common.HbmAggregateInfo{} - var utilizationRate uint32 - var err error - hbmInfo.HbmInfo, err = dmgr.GetDeviceHbmInfo(logicID) - handleErr(err, colcommon.DomainForHBM, logicID) - - utilizationRate, err = dmgr.GetDeviceUtilizationRate(logicID, common.HbmUtilization) - handleErr(err, colcommon.DomainForHbmUtilization, logicID) - - hbmInfo.ECCInfo, err = dmgr.GetDeviceEccInfo(logicID, common.DcmiDeviceTypeHBM) - handleErr(err, colcommon.DomainForHBMECC, logicID) - c.LocalCache.Store(chip.PhyId, hbmCache{ - chip: *chip, - timestamp: time.Now(), - extInfo: hbmInfo, - hbmUtilization: utilizationRate}, - ) -} - -func updateHbmEccInfo(ch chan<- prometheus.Metric, eccInfo *common.ECCInfo, timestamp time.Time, cardLabel []string) { - if eccInfo == nil { - return - } - doUpdateMetric(ch, timestamp, eccInfo.EnableFlag, cardLabel, descEccEnableFlag) - doUpdateMetric(ch, timestamp, eccInfo.SingleBitErrorCnt, cardLabel, descEccSingleBitErrorCnt) - doUpdateMetric(ch, timestamp, eccInfo.DoubleBitErrorCnt, cardLabel, descEccDoubleBitErrorCnt) - doUpdateMetric(ch, timestamp, eccInfo.TotalSingleBitErrorCnt, cardLabel, descEccTotalSingleBitErrorCnt) - doUpdateMetric(ch, timestamp, eccInfo.TotalDoubleBitErrorCnt, cardLabel, descEccTotalDoubleBitErrorCnt) - doUpdateMetric(ch, timestamp, eccInfo.SingleBitIsolatedPagesCnt, cardLabel, descEccSingleBitIoslatedPagesCnt) - 
doUpdateMetric(ch, timestamp, eccInfo.DoubleBitIsolatedPagesCnt, cardLabel, descEccDoubleBitIoslatedPagesCnt) -} - -func (c *HbmCollector) updateHbmInfo(ch chan<- prometheus.Metric, cache hbmCache, cardLabel []string, - containerMap map[int32]container.DevicesInfo, chipWithVnpu colcommon.HuaWeiAIChip) { - hbmInfo := cache.extInfo - if hbmInfo == nil || hbmInfo.HbmInfo == nil { - return - } - timestamp := cache.timestamp - doUpdateMetric(ch, timestamp, hbmInfo.Usage, cardLabel, descHbmUsedMemory) - doUpdateMetric(ch, timestamp, hbmInfo.MemorySize, cardLabel, descHbmTotalMemory) - doUpdateMetric(ch, timestamp, hbmInfo.Temp, cardLabel, descHbmTemperature) - doUpdateMetric(ch, timestamp, hbmInfo.BandWidthUtilRate, cardLabel, descHbmBWUtil) - - // vnpu not support this metrics - vDevActivityInfo := chipWithVnpu.VDevActivityInfo - if vDevActivityInfo != nil && common.IsValidVDevID(vDevActivityInfo.VDevID) { - return - } - - containerNameArray := getContainerNameArray(geenContainerInfo(&chipWithVnpu, containerMap)) - if c.Is910Series && len(containerNameArray) == colcommon.ContainerNameLen { - doUpdateMetric(ch, timestamp, hbmInfo.MemorySize, cardLabel, npuCtrTotalMemory) - doUpdateMetric(ch, timestamp, hbmInfo.Usage, cardLabel, npuCtrUsedMemory) - } -} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hbm_test.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hbm_test.go deleted file mode 100644 index 4bf59cd..0000000 --- a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hbm_test.go +++ /dev/null @@ -1,115 +0,0 @@ -/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package metrics for general collector -package metrics - -import ( - "testing" - "time" - - "github.com/agiledragon/gomonkey/v2" - "github.com/prometheus/client_golang/prometheus" - "github.com/smartystreets/goconvey/convey" - - "ascend-common/devmanager/common" - colcommon "huawei.com/npu-exporter/v6/collector/common" -) - -type TestCase struct { - name string - initFunc func() - expectMetricLen int -} - -const ( - expectMetricLen4 = 4 - expectMetricLen6 = 6 - vdevId = 132 - maxMetrics = 10 - mockNs = "mockNs" - mockPodName = "mockPodName" -) - -func TestUpdateHbmInfo(t *testing.T) { - collector := HbmCollector{} - ch := make(chan int, maxMetrics) - defer close(ch) - cache := buildHbmCache() - chipWithVnpu := &colcommon.HuaWeiAIChip{} - cases := buildTestCases(&collector, chipWithVnpu, &cache) - patch := gomonkey.NewPatches() - patch.ApplyFunc(doUpdateMetric, func(_ chan<- prometheus.Metric, _ time.Time, _ interface{}, _ []string, - desc *prometheus.Desc) { - ch <- 0 - }) - patch.ApplyFuncReturn(geenContainerInfo, nil) - patch.ApplyFuncReturn(getContainerNameArray, []string{mockNs, mockPodName, mockContainerName}) - defer patch.Reset() - - for _, c := range cases { - convey.Convey(c.name, t, func() { - ch = make(chan int, maxMetrics) - c.initFunc() - collector.updateHbmInfo(nil, cache, nil, nil, *chipWithVnpu) - convey.So(len(ch), convey.ShouldEqual, c.expectMetricLen) - }) - } -} - -func buildTestCases(collector *HbmCollector, chipWithVnpu *colcommon.HuaWeiAIChip, cache *hbmCache) []TestCase { - cases := []TestCase{ - {name: "when npu is not 910 series 
", initFunc: func() {}, expectMetricLen: expectMetricLen4}, - {name: "when vnpu is nil and with container info", initFunc: func() { - collector.Is910Series = true - }, expectMetricLen: expectMetricLen6}, - {name: "when chip is vnpu", initFunc: func() { - chipWithVnpu.VDevActivityInfo = &common.VDevActivityInfo{ - VDevID: vdevId, - } - }, expectMetricLen: expectMetricLen4}, - {name: "when extInfo.HbmInfo is nil", initFunc: func() { cache.extInfo.HbmInfo = nil }, expectMetricLen: 0}, - {name: "when extInfo is nil", initFunc: func() { cache.extInfo = nil }, expectMetricLen: 0}, - } - return cases -} - -func buildHbmCache() hbmCache { - cache := hbmCache{ - chip: colcommon.HuaWeiAIChip{ - PhyId: 0, - }, - hbmUtilization: 0, - timestamp: time.Now(), - extInfo: &common.HbmAggregateInfo{ - HbmInfo: &common.HbmInfo{ - BandWidthUtilRate: 0, - Frequency: 0, - MemorySize: 0, - Temp: 0, - Usage: 0, - }, - ECCInfo: &common.ECCInfo{ - EnableFlag: 0, - SingleBitErrorCnt: 0, - DoubleBitErrorCnt: 0, - TotalSingleBitErrorCnt: 0, - TotalDoubleBitErrorCnt: 0, - SingleBitIsolatedPagesCnt: 0, - DoubleBitIsolatedPagesCnt: 0, - }, - }, - } - return cache -} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hccs.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hccs.go deleted file mode 100644 index 1ecc3a9..0000000 --- a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hccs.go +++ /dev/null @@ -1,312 +0,0 @@ -/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package metrics for general collector -package metrics - -import ( - "fmt" - "strconv" - "time" - - "github.com/prometheus/client_golang/prometheus" - - "ascend-common/api" - "ascend-common/devmanager/common" - colcommon "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" - "huawei.com/npu-exporter/v6/utils/logger" -) - -var ( - hccsTxDescs []*prometheus.Desc - hccsRxDescs []*prometheus.Desc - hccsErrDescs []*prometheus.Desc - hccsBWTxDescs []*prometheus.Desc - hccsBWRxDescs []*prometheus.Desc - hccsBWProfilingTime *prometheus.Desc = nil - hccsBWTotalTx *prometheus.Desc = nil - hccsBWTotalRx *prometheus.Desc = nil - - supportedHccsDevices = map[string]bool{ - api.Ascend910B: true, - api.Ascend910A3: true, - } -) - -const ( - // MaxHccsNum max hccs num - MaxHccsNum int = 8 - // hccs info begin index, 1 or 2 - num1 = 1 - num2 = 2 -) - -// init add descs in init method -func init() { - for i := 0; i < MaxHccsNum; i++ { - index := strconv.Itoa(i) - colcommon.BuildDescSlice(&hccsTxDescs, api.Prefix+"tx_cnt_"+index, - "transmitted message count for "+api.Hccs+" "+index) - colcommon.BuildDescSlice(&hccsRxDescs, api.Prefix+"rx_cnt_"+index, - "received message count for "+api.Hccs+" "+index) - colcommon.BuildDescSlice(&hccsErrDescs, api.Prefix+"crc_err_cnt_"+index, - "crc error count for "+api.Hccs+" "+index) - colcommon.BuildDescSlice(&hccsBWTxDescs, api.BwPrefix+"tx_"+index, - "single-link transmission data bandwidth for "+api.Hccs+" "+index) - colcommon.BuildDescSlice(&hccsBWRxDescs, api.BwPrefix+"rx_"+index, - "single-link receive data bandwidth for "+api.Hccs+" "+index) - } - hccsBWProfilingTime = colcommon.BuildDesc(api.BwPrefix+"profiling_time", - "sampling interval for "+api.Hccs+" bandwidth") - hccsBWTotalTx = colcommon.BuildDesc(api.BwPrefix+"total_tx", "total sent data bandwidth") - hccsBWTotalRx = 
colcommon.BuildDesc(api.BwPrefix+"total_rx", "total received data bandwidth") -} - -type hccsCache struct { - chip colcommon.HuaWeiAIChip - timestamp time.Time - // hccsStat hccs info of npu chip - hccsStat *common.HccsStatisticInfo - - // hccsBW hccs bandwidth info of npu chip - hccsBW *common.HccsBandwidthInfo -} - -// HccsCollector collect hccs info -type HccsCollector struct { - colcommon.MetricsCollectorAdapter - hccsBeginIndex int - - // Automatically adapt according to the interface call - realGetStatisticInfoFunc func(logicID int32) (*common.HccsStatisticInfo, error) -} - -// IsSupported judge whether the collector is supported -func (c *HccsCollector) IsSupported(n *colcommon.NpuCollector) bool { - isSupport := supportedHccsDevices[n.Dmgr.GetDevType()] - logForUnSupportDevice(isSupport, n.Dmgr.GetDevType(), colcommon.GetCacheKey(c), "") - return isSupport -} - -// Describe description of the metric -func (c *HccsCollector) Describe(ch chan<- *prometheus.Desc) { - for _, desc := range hccsTxDescs { - ch <- desc - } - for _, desc := range hccsRxDescs { - ch <- desc - } - for _, desc := range hccsErrDescs { - ch <- desc - } - for _, desc := range hccsBWTxDescs { - ch <- desc - } - for _, desc := range hccsBWRxDescs { - ch <- desc - } - ch <- hccsBWProfilingTime - ch <- hccsBWTotalTx - ch <- hccsBWTotalRx -} - -// CollectToCache collect the metric to cache -func (c *HccsCollector) CollectToCache(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) { - for _, chip := range chipList { - logicID := chip.LogicID - var hccsStatisticInfo *common.HccsStatisticInfo - var err error - if c.realGetStatisticInfoFunc != nil { - hccsStatisticInfo, err = c.realGetStatisticInfoFunc(logicID) - } else { - hccsStatisticInfo = buildFailedHccsInfo() - err = fmt.Errorf("realGetStatisticInfoFunc is nil when get hccs info, " + - "maybe both GetHccsStatisticInfoInU64 and GetHccsStatisticInfo can't be unreached") - } - handleErr(err, colcommon.DomainForHccs, logicID) - - 
hccsBandwidthInfo, err := n.Dmgr.GetHccsBandwidthInfo(logicID) - handleErr(err, colcommon.DomainForHccsBW, logicID) - c.LocalCache.Store(chip.PhyId, hccsCache{ - chip: chip, - timestamp: time.Now(), - hccsStat: hccsStatisticInfo, - hccsBW: hccsBandwidthInfo}, - ) - } - - colcommon.UpdateCache[hccsCache](n, colcommon.GetCacheKey(c), &c.LocalCache) -} - -// PreCollect pre collect hccs info -func (c *HccsCollector) PreCollect(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) { - if len(chipList) == 0 { - return - } - chipOne := chipList[0] - devType := n.Dmgr.GetDevType() - if devType == api.Ascend910B || common.IsA900A3SuperPod(chipOne.MainBoardId) || - common.Is800IA3Chip(chipOne.MainBoardId) { - // A2 or A900A3 SuperPod or 800IA3 begin at 1st bit - c.hccsBeginIndex = num1 - } else if common.IsA9000A3SuperPod(chipOne.MainBoardId) { - // A9000A3SuperPod begin at 2nd bit - c.hccsBeginIndex = num2 - } else { - logger.LogfWithOptions(logger.ErrorLevel, logger.LogOptions{Domain: api.Hccs, ID: "0"}, - "not support main board id:%d", chipOne.MainBoardId) - } - - // Both failed, retry 3 times with 2s interval - const retryTimes = 3 - const retryInterval = 2 * time.Second - var success bool - var err1, err2 error - for i := 0; i < retryTimes; i++ { - _, err1 = n.Dmgr.GetHccsStatisticInfoInU64(chipOne.LogicID) - if err1 == nil { - logger.Infof("get hccs statistic info by subCmd(5) succeeded, will use subCmd(5) to get hccs info") - c.realGetStatisticInfoFunc = n.Dmgr.GetHccsStatisticInfoInU64 - success = true - break - } - _, err2 = n.Dmgr.GetHccsStatisticInfo(chipOne.LogicID) - if err2 == nil { - logger.Infof("get hccs statistic info by subCmd(3) succeeded, will use subCmd(3) to get hccs info") - c.realGetStatisticInfoFunc = n.Dmgr.GetHccsStatisticInfo - success = true - break - } - time.Sleep(retryInterval) - } - // If still failed after retries, set to nil and log error - if !success { - logger.Errorf("get hccs statistic info failed after trying both subCmd(5) 
and subCmd(3) with 3 retries, "+ - "err1: %v, err2: %v", err1, err2) - c.realGetStatisticInfoFunc = nil - } - -} - -// UpdatePrometheus update prometheus -func (c *HccsCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) { - - updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache hccsCache, cardLabel []string) { - timestamp := cache.timestamp - promUpdateHccsStatisticInfo(ch, cache, c, timestamp, cardLabel) - promUpdateHccsBwInfo(ch, cache, c, timestamp, cardLabel) - } - updateFrame[hccsCache](colcommon.GetCacheKey(c), n, containerMap, chips, updateSingleChip) -} - -func promUpdateHccsBwInfo(ch chan<- prometheus.Metric, cache hccsCache, c *HccsCollector, - timestamp time.Time, cardLabel []string) { - bandwidthInfo := cache.hccsBW - if bandwidthInfo == nil { - return - } - if c.hccsBeginIndex < 0 { - logger.Errorf("invalid %sBeginIndex %v", api.Hccs, c.hccsBeginIndex) - return - } - for i := c.hccsBeginIndex; i < MaxHccsNum; i++ { - doUpdateMetric(ch, timestamp, bandwidthInfo.TxBandwidth[i], cardLabel, hccsBWTxDescs[i]) - doUpdateMetric(ch, timestamp, bandwidthInfo.RxBandwidth[i], cardLabel, hccsBWRxDescs[i]) - } - doUpdateMetric(ch, timestamp, bandwidthInfo.ProfilingTime, cardLabel, hccsBWProfilingTime) - doUpdateMetric(ch, timestamp, bandwidthInfo.TotalTxbw, cardLabel, hccsBWTotalTx) - doUpdateMetric(ch, timestamp, bandwidthInfo.TotalRxbw, cardLabel, hccsBWTotalRx) -} - -func promUpdateHccsStatisticInfo(ch chan<- prometheus.Metric, cache hccsCache, c *HccsCollector, - timestamp time.Time, cardLabel []string) { - statisticInfo := cache.hccsStat - - if statisticInfo == nil { - return - } - if c.hccsBeginIndex < 0 { - logger.Errorf("invalid %sBeginIndex %v", api.Hccs, c.hccsBeginIndex) - return - } - for i := c.hccsBeginIndex; i < MaxHccsNum; i++ { - doUpdateMetric(ch, timestamp, statisticInfo.TxCnt[i], cardLabel, hccsTxDescs[i]) - 
doUpdateMetric(ch, timestamp, statisticInfo.RxCnt[i], cardLabel, hccsRxDescs[i]) - doUpdateMetric(ch, timestamp, statisticInfo.CrcErrCnt[i], cardLabel, hccsErrDescs[i]) - } -} - -// UpdateTelegraf update telegraf -func (c *HccsCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} { - - caches := colcommon.GetInfoFromCache[hccsCache](n, colcommon.GetCacheKey(c)) - for _, chip := range chips { - cache, ok := caches[chip.PhyId] - if !ok { - continue - } - fieldMap := getFieldMap(fieldsMap, cache.chip.LogicID) - - telegrafUpdateHccsStatisticInfo(cache, c, fieldMap) - telegrafUpdateHccsBwInfo(cache, c, fieldMap) - } - - return fieldsMap - -} - -func telegrafUpdateHccsBwInfo(cache hccsCache, c *HccsCollector, fieldMap map[string]interface{}) { - bandwidthInfo := cache.hccsBW - if bandwidthInfo == nil || c.hccsBeginIndex < 0 { - return - } - for i := c.hccsBeginIndex; i < MaxHccsNum; i++ { - doUpdateTelegraf(fieldMap, hccsBWTxDescs[i], bandwidthInfo.TxBandwidth[i], "") - doUpdateTelegraf(fieldMap, hccsBWRxDescs[i], bandwidthInfo.RxBandwidth[i], "") - } - doUpdateTelegraf(fieldMap, hccsBWProfilingTime, bandwidthInfo.ProfilingTime, "") - doUpdateTelegraf(fieldMap, hccsBWTotalTx, bandwidthInfo.TotalTxbw, "") - doUpdateTelegraf(fieldMap, hccsBWTotalRx, bandwidthInfo.TotalRxbw, "") -} - -func telegrafUpdateHccsStatisticInfo(cache hccsCache, c *HccsCollector, fieldMap map[string]interface{}) { - statisticInfo := cache.hccsStat - - if statisticInfo == nil || c.hccsBeginIndex < 0 { - return - } - for i := c.hccsBeginIndex; i < MaxHccsNum; i++ { - doUpdateTelegraf(fieldMap, hccsTxDescs[i], statisticInfo.TxCnt[i], "") - doUpdateTelegraf(fieldMap, hccsRxDescs[i], statisticInfo.RxCnt[i], "") - doUpdateTelegraf(fieldMap, hccsErrDescs[i], statisticInfo.CrcErrCnt[i], "") - } -} - -// buildFailedHccsInfo build failed hccs info -func 
buildFailedHccsInfo() *common.HccsStatisticInfo { - errorResult := &common.HccsStatisticInfo{ - TxCnt: make([]uint64, 8), - RxCnt: make([]uint64, 8), - CrcErrCnt: make([]uint64, 8), - } - for i := 0; i < 8; i++ { - errorResult.TxCnt[i] = common.FailedValue - errorResult.RxCnt[i] = common.FailedValue - errorResult.CrcErrCnt[i] = common.FailedValue - } - return errorResult -} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hccs_test.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hccs_test.go deleted file mode 100644 index 4b596df..0000000 --- a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hccs_test.go +++ /dev/null @@ -1,150 +0,0 @@ -/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package metrics for general collector -package metrics - -import ( - "testing" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" - - "ascend-common/api" - "ascend-common/devmanager/common" - colcommon "huawei.com/npu-exporter/v6/collector/common" -) - -const ( - mockLogicID int32 = 0 - mockMainBoardId uint32 = 100 - errorMsgWith8001 string = "error code 8001 occurred" - errorMsgWithout8001 string = "error code 8002 occurred" - singleChipList int = 1 - unsupportedBoardId uint32 = 999 -) - -type preCollectTestCase struct { - name string - chipList []colcommon.HuaWeiAIChip - devType string - mainBoardId uint32 - isA900A3SuperPod bool - isA9000A3SuperPod bool - is800IA3Chip bool - getStatInfoErr error - expectedBeginIndex int - expectedFuncSet bool -} - -func TestPreCollect(t *testing.T) { - n := mockNewNpuCollector() - testCases := buildPreCollectTestCases() - - for _, tc := range testCases { - convey.Convey(tc.name, t, func() { - patches := gomonkey.NewPatches() - defer patches.Reset() - - setupPatches(patches, n, tc) - collector := &HccsCollector{} - collector.PreCollect(n, tc.chipList) - verifyPreCollectResult(collector, tc) - }) - } -} - -func buildPreCollectTestCases() []preCollectTestCase { - cases := []preCollectTestCase{ - {name: "should return early when chipList is empty", - chipList: []colcommon.HuaWeiAIChip{}, - expectedBeginIndex: 0, - expectedFuncSet: false}, - {name: "should not set beginIndex when mainBoardId is not supported", - chipList: createMockChipList(singleChipList, unsupportedBoardId), - devType: api.Ascend910A3, - mainBoardId: unsupportedBoardId, - getStatInfoErr: nil, - expectedBeginIndex: 0, - expectedFuncSet: true}, - } - cases = append(cases, buildBeginIndexCases()...) 
- return cases -} - -func buildBeginIndexCases() []preCollectTestCase { - return []preCollectTestCase{ - {name: "should set beginIndex to num1 when devType is Ascend910B", - chipList: createMockChipList(singleChipList, mockMainBoardId), - devType: api.Ascend910B, - mainBoardId: mockMainBoardId, - getStatInfoErr: nil, - expectedBeginIndex: num1, - expectedFuncSet: true}, - {name: "should set beginIndex to num1 when IsA900A3SuperPod returns true", - chipList: createMockChipList(singleChipList, mockMainBoardId), - devType: api.Ascend910A3, - mainBoardId: mockMainBoardId, - isA900A3SuperPod: true, - getStatInfoErr: nil, - expectedBeginIndex: num1, - expectedFuncSet: true}, - {name: "should set beginIndex to num1 when Is800IA3Chip returns true", - chipList: createMockChipList(singleChipList, mockMainBoardId), - devType: api.Ascend910A3, - mainBoardId: mockMainBoardId, - is800IA3Chip: true, - getStatInfoErr: nil, - expectedBeginIndex: num1, - expectedFuncSet: true}, - {name: "should set beginIndex to num2 when IsA9000A3SuperPod returns true", - chipList: createMockChipList(singleChipList, mockMainBoardId), - devType: api.Ascend910A3, - mainBoardId: mockMainBoardId, - isA9000A3SuperPod: true, - getStatInfoErr: nil, - expectedBeginIndex: num2, - expectedFuncSet: true}, - } -} - -func createMockChipList(count int, mainBoardId uint32) []colcommon.HuaWeiAIChip { - if count == 0 { - return []colcommon.HuaWeiAIChip{} - } - return []colcommon.HuaWeiAIChip{ - { - LogicID: mockLogicID, - MainBoardId: mainBoardId, - }, - } -} - -func setupPatches(patches *gomonkey.Patches, n *colcommon.NpuCollector, tc preCollectTestCase) { - patches.ApplyMethodReturn(n.Dmgr, "GetDevType", tc.devType) - patches.ApplyFuncReturn(common.IsA900A3SuperPod, tc.isA900A3SuperPod) - patches.ApplyFuncReturn(common.IsA9000A3SuperPod, tc.isA9000A3SuperPod) - patches.ApplyFuncReturn(common.Is800IA3Chip, tc.is800IA3Chip) - patches.ApplyMethodReturn(n.Dmgr, "GetHccsStatisticInfoInU64", - 
&common.HccsStatisticInfo{}, tc.getStatInfoErr) -} - -func verifyPreCollectResult(collector *HccsCollector, tc preCollectTestCase) { - convey.So(collector.hccsBeginIndex, convey.ShouldEqual, tc.expectedBeginIndex) - if tc.expectedFuncSet { - convey.So(collector.realGetStatisticInfoFunc, convey.ShouldNotBeNil) - } else { - convey.So(collector.realGetStatisticInfoFunc, convey.ShouldBeNil) - } -} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_network.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_network.go deleted file mode 100644 index 018a370..0000000 --- a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_network.go +++ /dev/null @@ -1,190 +0,0 @@ -/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package metrics for general collector -package metrics - -import ( - "time" - - "github.com/prometheus/client_golang/prometheus" - - "ascend-common/common-utils/hwlog" - "ascend-common/devmanager/common" - "ascend-common/devmanager/hccn" - colcommon "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" -) - -var ( - // bandwidth - descBandwidthTx = colcommon.BuildDesc("npu_chip_info_bandwidth_tx", - "the npu interface transport speed, unit is 'MB/s'") - descBandwidthRx = colcommon.BuildDesc("npu_chip_info_bandwidth_rx", - "the npu interface receive speed, unit is 'MB/s'") - - // linkspeed - npuChipLinkSpeed = colcommon.BuildDesc("npu_chip_link_speed", - "the npu interface receive link speed, unit is 'Mb/s'") - - // linkupNum - npuChipLinkUpNum = colcommon.BuildDesc("npu_chip_link_up_num", "the npu interface receive link-up num") - - // linkstatus - descLinkStatus = colcommon.BuildDesc("npu_chip_info_link_status", "the npu link status") -) - -type netInfoCache struct { - chip colcommon.HuaWeiAIChip - timestamp time.Time - extInfo *common.NpuNetInfo -} - -// NetworkCollector collects the network info -type NetworkCollector struct { - colcommon.MetricsCollectorAdapter -} - -// IsSupported check if the collector is supported -func (c *NetworkCollector) IsSupported(n *colcommon.NpuCollector) bool { - isSupport := n.Dmgr.IsTrainingCard() - logForUnSupportDevice(isSupport, n.Dmgr.GetDevType(), colcommon.GetCacheKey(c), - "only training card supports network related info") - return isSupport -} - -// Describe description of the metric -func (c *NetworkCollector) Describe(ch chan<- *prometheus.Desc) { - // bandwidth - ch <- descBandwidthTx - ch <- descBandwidthRx - // linkspeed - ch <- npuChipLinkSpeed - // linkupNum - ch <- npuChipLinkUpNum - // linkstatus - ch <- descLinkStatus -} - -// CollectToCache collect the metric to cache -func (c *NetworkCollector) CollectToCache(n *colcommon.NpuCollector, chipList 
[]colcommon.HuaWeiAIChip) { - for _, chip := range chipList { - netInfo := collectNetworkInfo(chip.PhyId) - c.LocalCache.Store(chip.PhyId, netInfoCache{chip: chip, timestamp: time.Now(), extInfo: &netInfo}) - } - colcommon.UpdateCache[netInfoCache](n, colcommon.GetCacheKey(c), &c.LocalCache) -} - -// UpdatePrometheus update prometheus metrics -func (c *NetworkCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) { - - updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache netInfoCache, cardLabel []string) { - netInfo := cache.extInfo - if netInfo == nil { - return - } - time := cache.timestamp - if validateNotNilForEveryElement(netInfo.BandwidthInfo) { - doUpdateMetricWithValidateNum(ch, time, netInfo.BandwidthInfo.TxValue, cardLabel, descBandwidthTx) - doUpdateMetricWithValidateNum(ch, time, netInfo.BandwidthInfo.RxValue, cardLabel, descBandwidthRx) - } - if validateNotNilForEveryElement(netInfo.LinkSpeedInfo) { - doUpdateMetricWithValidateNum(ch, time, netInfo.LinkSpeedInfo.Speed, cardLabel, npuChipLinkSpeed) - } - if validateNotNilForEveryElement(netInfo.LinkStatInfo) { - doUpdateMetricWithValidateNum(ch, time, netInfo.LinkStatInfo.LinkUPNum, cardLabel, npuChipLinkUpNum) - } - if validateNotNilForEveryElement(netInfo.LinkStatusInfo) { - doUpdateMetricWithValidateNum(ch, time, float64(hccn.GetLinkStatusCode(netInfo.LinkStatusInfo.LinkState)), - cardLabel, descLinkStatus) - } - } - updateFrame[netInfoCache](colcommon.GetCacheKey(c), n, containerMap, chips, updateSingleChip) -} - -// UpdateTelegraf update telegraf metrics -func (c *NetworkCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} { - - caches := colcommon.GetInfoFromCache[netInfoCache](n, colcommon.GetCacheKey(c)) - for _, chip 
:= range chips { - cache, ok := caches[chip.PhyId] - if !ok { - continue - } - fieldMap := getFieldMap(fieldsMap, cache.chip.LogicID) - extInfo := cache.extInfo - if extInfo == nil { - continue - } - if validateNotNilForEveryElement(extInfo.BandwidthInfo) { - doUpdateTelegrafWithValidateNum(fieldMap, descBandwidthTx, extInfo.BandwidthInfo.TxValue, "") - doUpdateTelegrafWithValidateNum(fieldMap, descBandwidthRx, extInfo.BandwidthInfo.RxValue, "") - } - if validateNotNilForEveryElement(extInfo.LinkSpeedInfo) { - doUpdateTelegrafWithValidateNum(fieldMap, npuChipLinkSpeed, extInfo.LinkSpeedInfo.Speed, "") - } - if validateNotNilForEveryElement(extInfo.LinkStatInfo) { - doUpdateTelegrafWithValidateNum(fieldMap, npuChipLinkUpNum, extInfo.LinkStatInfo.LinkUPNum, "") - } - if validateNotNilForEveryElement(extInfo.LinkStatusInfo) { - doUpdateTelegrafWithValidateNum(fieldMap, descLinkStatus, - float64(hccn.GetLinkStatusCode(extInfo.LinkStatusInfo.LinkState)), "") - } - } - return fieldsMap -} - -func collectNetworkInfo(phyID int32) common.NpuNetInfo { - newNetInfo := common.NpuNetInfo{} - - newNetInfo.LinkStatusInfo = &common.LinkStatusInfo{} - if linkState, err := hccn.GetNPULinkStatus(phyID); err == nil { - newNetInfo.LinkStatusInfo.LinkState = linkState - hwlog.ResetErrCnt(colcommon.DomainForLinkState, phyID) - } else { - logErrMetricsWithLimit(colcommon.DomainForLinkState, phyID, err) - newNetInfo.LinkStatusInfo.LinkState = colcommon.Abnormal - } - - if tx, rx, err := hccn.GetNPUInterfaceTraffic(phyID); err == nil { - newNetInfo.BandwidthInfo = &common.BandwidthInfo{} - newNetInfo.BandwidthInfo.RxValue = rx - newNetInfo.BandwidthInfo.TxValue = tx - hwlog.ResetErrCnt(colcommon.DomainForBandwidth, phyID) - } else { - newNetInfo.BandwidthInfo = nil - logErrMetricsWithLimit(colcommon.DomainForBandwidth, phyID, err) - } - if linkUpNum, err := hccn.GetNPULinkUpNum(phyID); err == nil { - newNetInfo.LinkStatInfo = &common.LinkStatInfo{} - newNetInfo.LinkStatInfo.LinkUPNum = 
float64(linkUpNum) - hwlog.ResetErrCnt(colcommon.DomainForLinkStat, phyID) - } else { - newNetInfo.LinkStatInfo = nil - logErrMetricsWithLimit(colcommon.DomainForLinkStat, phyID, err) - } - - if speed, err := hccn.GetNPULinkSpeed(phyID); err == nil { - newNetInfo.LinkSpeedInfo = &common.LinkSpeedInfo{} - newNetInfo.LinkSpeedInfo.Speed = float64(speed) - hwlog.ResetErrCnt(colcommon.DomainForLinkSpeed, phyID) - } else { - newNetInfo.LinkSpeedInfo = nil - logErrMetricsWithLimit(colcommon.DomainForLinkSpeed, phyID, err) - } - - return newNetInfo -} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_npu.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_npu.go deleted file mode 100644 index 975ffcf..0000000 --- a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_npu.go +++ /dev/null @@ -1,453 +0,0 @@ -/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package metrics for general collector -package metrics - -import ( - "math" - "strconv" - "strings" - "time" - - "github.com/prometheus/client_golang/prometheus" - - "ascend-common/api" - "ascend-common/devmanager" - "ascend-common/devmanager/common" - colcommon "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" - "huawei.com/npu-exporter/v6/utils/logger" -) - -var ( - errorCodeDescs []*prometheus.Desc - cardLabelForProcess = append(colcommon.CardLabel, "process_id", "container_id") - cardLabelForContainer []string - cardLabelForSN []string - cardLabelForNpuName = make([]string, len(colcommon.CardLabel)) -) - -var ( - machineInfoNPUDesc = colcommon.BuildDescWithLabel("machine_npu_nums", "Amount of npu installed on the machine.", nil) - - descUtil = colcommon.BuildDesc("npu_chip_info_utilization", "the ai core utilization") - descOverUtil = colcommon.BuildDesc("npu_chip_info_overall_utilization", "the overall utilization of npu") - descVectorUtil = colcommon.BuildDesc("npu_chip_info_vector_utilization", "the vector ai core utilization") - descTemp = colcommon.BuildDesc("npu_chip_info_temperature", "the npu temperature") - descPower = colcommon.BuildDesc("npu_chip_info_power", "the npu power") - descVoltage = colcommon.BuildDesc("npu_chip_info_voltage", "the npu voltage") - - descAICoreFreq = colcommon.BuildDesc("npu_chip_info_aicore_current_freq", - "the npu ai core current frequency, unit is 'MHz'") - descHealthStatus = colcommon.BuildDesc("npu_chip_info_health_status", "the npu health status") - descDevProcessNum = colcommon.BuildDesc("npu_chip_info_process_info_num", - "the npu process num") - - descDevProcessInfo = colcommon.BuildDescWithLabel("npu_chip_info_process_info", - "the npu process info, unit is 'MB'. 
if process run on host, container_id and container_name will be empty", - cardLabelForProcess) - - // net status - descNetworkStatus = colcommon.BuildDesc("npu_chip_info_network_status", "the npu network health status") - - // container (vnpu not support this metrics), only report to prometheus - npuCtrUtilization = colcommon.BuildDesc("container_npu_utilization", - "npu ai core utilization in container, unit is '%'") - npuCtrTotalMemory = colcommon.BuildDesc("container_npu_total_memory", - "npu total memory in container, unit is 'MB'") - npuCtrUsedMemory = colcommon.BuildDesc("container_npu_used_memory", - "the npu used memory in container, unit is 'MB'") - - npuCtrInfo *prometheus.Desc = nil - descNpuName *prometheus.Desc = nil - descNPUSerialNumber *prometheus.Desc = nil -) - -func init() { - - colcommon.BuildDescSlice(&errorCodeDescs, "npu_chip_info_error_code", "the npu error code") - for i := 1; i < common.MaxErrorCodeLen; i++ { - colcommon.BuildDescSlice(&errorCodeDescs, "npu_chip_info_error_code_"+strconv.Itoa(i), "the npu error code") - } - - cardLabelForContainer = append(colcommon.CardLabel, "containerID", "containerName") - cardLabelForContainer[0] = "npuID" - npuCtrInfo = colcommon.BuildDescWithLabel("npu_container_info", "the container name and deviceID relationship", - cardLabelForContainer) - - cardLabelForSN = append(colcommon.CardLabel, "serial_number") - // NPU SN related metrics - descNPUSerialNumber = colcommon.BuildDescWithLabel("npu_chip_info_serial_number", - "the npu serial number information", cardLabelForSN) - - copy(cardLabelForNpuName, colcommon.CardLabel) - cardLabelForNpuName[1] = "name" - descNpuName = colcommon.BuildDescWithLabel("npu_chip_info_name", "the Ascend npu name with value '1'", - cardLabelForNpuName) -} - -type chipCache struct { - chip colcommon.HuaWeiAIChip - timestamp time.Time - - // the healthy status of the AI chip - HealthStatus string `json:"health_status"` - // the all error codes of the chip - ErrorCodes []int64 
`json:"error_codes"` - // the utilization of the chip - Utilization int `json:"utilization"` - // the overall utilization of the chip - OverallUtilization int `json:"overall_utilization"` - // the vector utilization of the chip - VectorUtilization int `json:"vector_utilization"` - // the temperature of the chip - Temperature int `json:"temperature"` - // the work power of the chip - Power float32 `json:"power"` - // the work voltage of the chip - Voltage float32 `json:"voltage"` - // the AI core current frequency of the chip - AICoreCurrentFreq uint32 `json:"aicore_current_freq"` - // NetHealthStatus chip network health status - NetHealthStatus string `json:"net_health_status"` - // DevProcessInfo chip process info - DevProcessInfo *common.DevProcessInfo -} - -// BaseInfoCollector collects the base info of the chip -type BaseInfoCollector struct { - colcommon.MetricsCollectorAdapter -} - -// Describe collects the base info of the chip -func (c *BaseInfoCollector) Describe(ch chan<- *prometheus.Desc) { - // base info - ch <- machineInfoNPUDesc - ch <- descUtil - ch <- descVectorUtil - ch <- descOverUtil - ch <- descTemp - ch <- descPower - ch <- descVoltage - ch <- descHealthStatus - ch <- descNpuName - ch <- descAICoreFreq - ch <- descNPUSerialNumber - ch <- descDevProcessInfo - // status - ch <- descNetworkStatus - // container - ch <- npuCtrInfo - ch <- npuCtrUtilization - ch <- npuCtrTotalMemory - ch <- npuCtrUsedMemory - - // error code - for _, desc := range errorCodeDescs { - ch <- desc - } -} - -// CollectToCache collects the base info of the chip -func (c *BaseInfoCollector) CollectToCache(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) { - for _, chip := range chipList { - logicID := chip.LogicID - - dmgr := n.Dmgr - - freq, err := dmgr.GetDeviceFrequency(logicID, common.AICoreCurrentFreq) - if err != nil { - freq = common.UnRetError - } - temp, err := dmgr.GetDeviceTemperature(logicID) - if err != nil { - temp = common.RetError - } - vol, 
err := dmgr.GetDeviceVoltage(logicID) - if err != nil { - vol = common.UnRetError - } - - _, errCodes, err := dmgr.GetDeviceAllErrorCode(logicID) - if err != nil { - errCodes = make([]int64, 0) - } - - cache := &chipCache{ - chip: chip, - AICoreCurrentFreq: freq, - Temperature: int(temp), - Voltage: vol, - HealthStatus: getHealth(logicID, dmgr), - ErrorCodes: errCodes, - } - collectPower(logicID, dmgr, cache) - collectUtil(logicID, dmgr, cache) - setNetHealthStatus(logicID, dmgr, cache) - setProcessInfo(logicID, dmgr, cache) - - cache.timestamp = time.Now() - c.LocalCache.Store(chip.PhyId, *cache) - } - colcommon.UpdateCache[chipCache](n, colcommon.GetCacheKey(c), &c.LocalCache) -} - -func collectPower(logicID int32, dmgr devmanager.DeviceInterface, chip *chipCache) { - if dmgr.GetDevType() == api.Ascend310P { - cardPower, err := dmgr.GetMcuPowerInfo(chip.chip.CardId) - handleErr(err, colcommon.DomainForMcuPower, chip.chip.CardId) - // Ascend310P use cardPower to replace chipPower - chip.Power = cardPower - } else { - power, err := dmgr.GetDevicePowerInfo(logicID) - handleErr(err, colcommon.DomainForChipPower, logicID) - chip.Power = power - } -} - -// UpdatePrometheus updates the base info of the chip -func (c *BaseInfoCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) { - - updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache chipCache, cardLabel []string) { - containerInfo := geenContainerInfo(&chipWithVnpu, containerMap) - timestamp := cache.timestamp - doUpdateMetricWithValidateNum(ch, timestamp, float64(cache.Power), cardLabel, descPower) - doUpdateMetricWithValidateNum(ch, timestamp, float64(cache.Voltage), cardLabel, descVoltage) - doUpdateMetricWithValidateNum(ch, timestamp, float64(cache.AICoreCurrentFreq), cardLabel, descAICoreFreq) - doUpdateMetricWithValidateNum(ch, timestamp, float64(cache.Temperature), cardLabel, descTemp) - 
doUpdateMetricWithValidateNum(ch, timestamp, float64(cache.Utilization), cardLabel, descUtil) - doUpdateMetricWithValidateNum(ch, timestamp, float64(cache.OverallUtilization), cardLabel, descOverUtil) - doUpdateMetricWithValidateNum(ch, timestamp, float64(cache.VectorUtilization), cardLabel, descVectorUtil) - doUpdateMetricWithValidateNum(ch, timestamp, 1, cardLabel, descNpuName) - doUpdateMetricWithValidateNum(ch, timestamp, float64(getHealthCode(cache.HealthStatus)), cardLabel, descHealthStatus) - doUpdateMetricWithValidateNum(ch, timestamp, float64(getHealthCode(cache.NetHealthStatus)), - cardLabel, descNetworkStatus) - - updateContainerInfo(ch, containerInfo, cardLabel, &cache, chipWithVnpu) - - updateProcessInfoForPrometheus(ch, &cache, containerInfo, timestamp, cardLabel) - updateErrorCodesInfo(ch, &cache, timestamp, cardLabel) - // Update NPU serial number info - if cache.chip.ElabelInfo != nil { - snLabel := append(cardLabel, cache.chip.ElabelInfo.SerialNumber) - doUpdateMetricWithValidateNum(ch, timestamp, 1, snLabel, descNPUSerialNumber) - } - } - updateFrame[chipCache](colcommon.GetCacheKey(c), n, containerMap, chips, updateSingleChip) - - ch <- prometheus.MustNewConstMetric(machineInfoNPUDesc, prometheus.GaugeValue, float64(len(chips))) -} - -func updateContainerInfo(ch chan<- prometheus.Metric, containerInfo container.DevicesInfo, - cardLabel []string, chip *chipCache, chipWithVnpu colcommon.HuaWeiAIChip) { - containerName := getContainerNameArray(containerInfo) - if len(containerName) != colcommon.ContainerNameLen { - return - } - // based on chipType , container_npu_total_memory、container_npu_used_memory reported in hbm or ddr group - doUpdateMetric(ch, chip.timestamp, 1, append(cardLabel, containerInfo.ID, strings.Join(containerName, "_")), - npuCtrInfo) - - // vnpu not support this metrics - vDevActivityInfo := chipWithVnpu.VDevActivityInfo - if vDevActivityInfo != nil && common.IsValidVDevID(vDevActivityInfo.VDevID) { - return - } - - 
doUpdateMetricWithValidateNum(ch, chip.timestamp, float64(chip.Utilization), cardLabel, npuCtrUtilization) -} - -func updateErrorCodesInfo(ch chan<- prometheus.Metric, chip *chipCache, timestamp time.Time, cardLabel []string) { - if len(chip.ErrorCodes) > common.MaxErrorCodeLen { - logger.Warnf("Error code number is larger than %v, only the first %v will be reported, "+ - "all errorCode is: %v", common.MaxErrorCodeLen, common.MaxErrorCodeLen, chip.ErrorCodes) - } - for i := 0; i < len(chip.ErrorCodes) && i < len(errorCodeDescs); i++ { - doUpdateMetricWithValidateNum(ch, timestamp, float64(chip.ErrorCodes[i]), cardLabel, errorCodeDescs[i]) - } -} - -func updateProcessInfoForPrometheus(ch chan<- prometheus.Metric, chip *chipCache, - containerInfo container.DevicesInfo, timestamp time.Time, cardLabel []string) { - devProcessInfo := chip.DevProcessInfo - if devProcessInfo == nil { - return - } - doUpdateMetric(ch, timestamp, devProcessInfo.ProcNum, cardLabel, descDevProcessNum) - - containerID := "" - containerName := "" - cNameArray := getContainerNameArray(containerInfo) - if len(cNameArray) == colcommon.ContainerNameLen { - containerID = containerInfo.ID - containerName = strings.Join(cNameArray, "_") - } - - newCardLabel := make([]string, len(cardLabel)) - copy(newCardLabel, cardLabel) - // containerName in process info is namespace_podName_containerName - newCardLabel[len(newCardLabel)-1] = containerName - - if devProcessInfo.ProcNum == 0 { - doUpdateMetric(ch, timestamp, 0, append(newCardLabel, "", containerID), descDevProcessInfo) - return - } - - for i := int32(0); i < devProcessInfo.ProcNum; i++ { - procInfo := devProcessInfo.DevProcArray[i] - doUpdateMetric(ch, timestamp, procInfo.MemUsage, - append(newCardLabel, strconv.FormatInt(int64(procInfo.Pid), colcommon.Base), containerID), descDevProcessInfo) - } -} - -// UpdateTelegraf updates the base info of the chip -func (c *BaseInfoCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n 
*colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} { - caches := colcommon.GetInfoFromCache[chipCache](n, colcommon.GetCacheKey(c)) - for _, chip := range chips { - cache, ok := caches[chip.PhyId] - if !ok { - continue - } - fieldMap := getFieldMap(fieldsMap, cache.chip.LogicID) - - doUpdateTelegrafWithValidateNum(fieldMap, descTemp, float64(cache.Temperature), "") - doUpdateTelegrafWithValidateNum(fieldMap, descPower, float64(cache.Power), "") - doUpdateTelegrafWithValidateNum(fieldMap, descVoltage, float64(cache.Voltage), "") - doUpdateTelegrafWithValidateNum(fieldMap, descAICoreFreq, float64(cache.AICoreCurrentFreq), "") - doUpdateTelegrafWithValidateNum(fieldMap, descUtil, float64(cache.Utilization), "") - doUpdateTelegrafWithValidateNum(fieldMap, descVectorUtil, float64(cache.VectorUtilization), "") - doUpdateTelegrafWithValidateNum(fieldMap, descOverUtil, float64(cache.OverallUtilization), "") - doUpdateTelegrafWithValidateNum(fieldMap, descHealthStatus, float64(getHealthCode(cache.HealthStatus)), "") - doUpdateTelegrafWithValidateNum(fieldMap, descNetworkStatus, float64(getHealthCode(cache.NetHealthStatus)), "") - doUpdateTelegraf(fieldMap, descNpuName, chip.ChipInfo.Name, "") - - updateProcessInfoForTelegraf(&cache, fieldMap) - updateErrorCode(&cache, fieldMap) - // Update NPU serial number info - if cache.chip.ElabelInfo != nil { - doUpdateTelegraf(fieldMap, descNPUSerialNumber, cache.chip.ElabelInfo.SerialNumber, "") - } - - } - - if fieldsMap[colcommon.GeneralDevTagKey] == nil { - fieldsMap[colcommon.GeneralDevTagKey] = make(map[string]interface{}) - } - doUpdateTelegraf(fieldsMap[colcommon.GeneralDevTagKey], machineInfoNPUDesc, len(chips), "") - return fieldsMap -} - -func updateErrorCode(chip *chipCache, fieldMap map[string]interface{}) { - if len(errorCodeDescs) == 0 { - return - } - descErrorCode := errorCodeDescs[0] - for i := 0; i < len(chip.ErrorCodes); i++ { - 
extInfo := "" - if i != 0 { - extInfo = "_" + strconv.Itoa(i) - } - doUpdateTelegrafWithValidateNum(fieldMap, descErrorCode, float64(chip.ErrorCodes[i]), extInfo) - } -} - -func updateProcessInfoForTelegraf(chip *chipCache, fieldMap map[string]interface{}) { - devProcessInfo := chip.DevProcessInfo - doUpdateTelegraf(fieldMap, descDevProcessNum, devProcessInfo.ProcNum, "") - if devProcessInfo.ProcNum == 0 { - doUpdateTelegraf(fieldMap, descDevProcessInfo, 0, "") - return - } - for i := int32(0); i < devProcessInfo.ProcNum; i++ { - procInfo := devProcessInfo.DevProcArray[i] - doUpdateTelegraf(fieldMap, descDevProcessInfo, procInfo.MemUsage, "_"+strconv.Itoa(int(procInfo.Pid))) - } -} - -func collectUtil(logicID int32, dmgr devmanager.DeviceInterface, chip *chipCache) { - util, err := dmgr.GetDeviceUtilizationRate(logicID, common.AICore) - handleErr(err, colcommon.DomainForAICoreUtilization, logicID) - chip.Utilization = int(util) - - overAllUtil, err := dmgr.GetDeviceUtilizationRate(logicID, common.Overall) - handleErr(err, colcommon.DomainForOverallUtilization, logicID) - chip.OverallUtilization = int(overAllUtil) - - vecUtil, err := dmgr.GetDeviceUtilizationRate(logicID, common.VectorCore) - handleErr(err, colcommon.DomainForVectorCoreUtilization, logicID) - chip.VectorUtilization = int(vecUtil) -} - -func setNetHealthStatus(logicID int32, dmgr devmanager.DeviceInterface, chip *chipCache) { - chip.NetHealthStatus = colcommon.Abnormal - if !dmgr.IsTrainingCard() { - return - } - - netCode, err := dmgr.GetDeviceNetWorkHealth(logicID) - logger.Debugf("chip %d network healthy code is %d", logicID, netCode) - if err != nil { - netCode = math.MaxUint32 - } - chip.NetHealthStatus = getNetworkHealthy(netCode) -} - -func getNetworkHealthy(netCode uint32) string { - if netCode == math.MaxUint32 { - return colcommon.Abnormal - } - - if netCode == common.NetworkInit || netCode == common.NetworkSuccess { - return colcommon.Healthy - } - - return colcommon.UnHealthy -} - -func 
getHealth(logicID int32, dmgr devmanager.DeviceInterface) string { - health, err := dmgr.GetDeviceHealth(logicID) - if err != nil || health != 0 { - return colcommon.UnHealthy - } - return colcommon.Healthy -} - -func getHealthCode(health string) int { - if health == colcommon.Abnormal { - return common.RetError - } - - if colcommon.Healthy == health { - return 1 - } - return 0 -} - -func setProcessInfo(logicID int32, dmgr devmanager.DeviceInterface, hwChip *chipCache) { - productTypes := dmgr.GetProductTypeArray() - info, err := dmgr.GetDevProcessInfo(logicID) - if err != nil { - if len(productTypes) == 1 && productTypes[0] == common.Atlas200ISoc { - logger.Debugf("process info is not supported on %s", common.Atlas200ISoc) - hwChip.DevProcessInfo = &common.DevProcessInfo{} - return - } - handleErr(err, colcommon.DomainForProcess, logicID) - info = &common.DevProcessInfo{} - } - hwChip.DevProcessInfo = info -} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_optical.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_optical.go deleted file mode 100644 index ca49804..0000000 --- a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_optical.go +++ /dev/null @@ -1,200 +0,0 @@ -/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package metrics for general collector -package metrics - -import ( - "time" - - "github.com/prometheus/client_golang/prometheus" - - "ascend-common/common-utils/hwlog" - "ascend-common/devmanager/common" - "ascend-common/devmanager/hccn" - colcommon "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" -) - -const ( - txPower0 = "Tx_Power0" - txPower1 = "Tx_Power1" - txPower2 = "Tx_Power2" - txPower3 = "Tx_Power3" - - rxPower0 = "Rx_Power0" - rxPower1 = "Rx_Power1" - rxPower2 = "Rx_Power2" - rxPower3 = "Rx_Power3" - - notPresent = "not present" - present = "present" - temperature = "temperature" - voltage = "Vcc" -) - -var ( - - // optical - descOpticalState = colcommon.BuildDesc("npu_chip_optical_state", "the npu interface receive optical-state") - descOpticalVcc = colcommon.BuildDesc("npu_chip_optical_vcc", "the npu interface receive optical-vcc") - descOpticalTemp = colcommon.BuildDesc("npu_chip_optical_temp", "the npu interface receive optical-temperature") - descOpticalTxPower0 = colcommon.BuildDesc("npu_chip_optical_tx_power_0", "npu interface receive optical-tx-power-0") - descOpticalTxPower1 = colcommon.BuildDesc("npu_chip_optical_tx_power_1", "npu interface receive optical-tx-power-1") - descOpticalTxPower2 = colcommon.BuildDesc("npu_chip_optical_tx_power_2", "npu interface receive optical-tx-power-2") - descOpticalTxPower3 = colcommon.BuildDesc("npu_chip_optical_tx_power_3", "npu interface receive optical-tx-power-3") - - descOpticalRxPower0 = colcommon.BuildDesc("npu_chip_optical_rx_power_0", "npu interface receive optical-rx-power-0") - descOpticalRxPower1 = colcommon.BuildDesc("npu_chip_optical_rx_power_1", "npu interface receive optical-rx-power-1") - descOpticalRxPower2 = colcommon.BuildDesc("npu_chip_optical_rx_power_2", "npu interface receive optical-rx-power-2") - descOpticalRxPower3 = colcommon.BuildDesc("npu_chip_optical_rx_power_3", "npu interface receive optical-rx-power-3") -) - -type 
opticalCache struct { - chip colcommon.HuaWeiAIChip - timestamp time.Time - // extInfo indicates the optical module information - extInfo *common.OpticalInfo -} - -// OpticalCollector collect the optical metrics -type OpticalCollector struct { - colcommon.MetricsCollectorAdapter -} - -// IsSupported judge whether the collector is supported -func (c *OpticalCollector) IsSupported(n *colcommon.NpuCollector) bool { - isSupport := n.Dmgr.IsTrainingCard() - logForUnSupportDevice(isSupport, n.Dmgr.GetDevType(), colcommon.GetCacheKey(c), - "only training card supports network related info") - return isSupport -} - -// Describe description of the metric -func (c *OpticalCollector) Describe(ch chan<- *prometheus.Desc) { - // optical - ch <- descOpticalState - ch <- descOpticalTxPower0 - ch <- descOpticalTxPower1 - ch <- descOpticalTxPower2 - ch <- descOpticalTxPower3 - ch <- descOpticalRxPower0 - ch <- descOpticalRxPower1 - ch <- descOpticalRxPower2 - ch <- descOpticalRxPower3 - ch <- descOpticalVcc - ch <- descOpticalTemp -} - -// CollectToCache collect the metric to cache -func (c *OpticalCollector) CollectToCache(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) { - for _, chip := range chipList { - opticalInfo, err := hccn.GetNPUOpticalInfo(chip.PhyId) - if err != nil { - logErrMetricsWithLimit(colcommon.DomainForOptical, chip.PhyId, err) - continue - } - hwlog.ResetErrCnt(colcommon.DomainForOptical, chip.PhyId) - info := getMainOptInfo(opticalInfo) - c.LocalCache.Store(chip.PhyId, opticalCache{chip: chip, timestamp: time.Now(), extInfo: info}) - } - colcommon.UpdateCache[opticalCache](n, colcommon.GetCacheKey(c), &c.LocalCache) -} - -// UpdatePrometheus update prometheus metrics -func (c *OpticalCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) { - - updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache opticalCache, cardLabel 
[]string) { - opticalInfo := cache.extInfo - if opticalInfo == nil { - return - } - timestamp := cache.timestamp - doUpdateMetricWithValidateNum(ch, timestamp, opticalInfo.OpticalState, cardLabel, descOpticalState) - doUpdateMetricWithValidateNum(ch, timestamp, opticalInfo.OpticalVcc, cardLabel, descOpticalVcc) - doUpdateMetricWithValidateNum(ch, timestamp, opticalInfo.OpticalTemp, cardLabel, descOpticalTemp) - - doUpdateMetricWithValidateNum(ch, timestamp, opticalInfo.OpticalTxPower0, cardLabel, descOpticalTxPower0) - doUpdateMetricWithValidateNum(ch, timestamp, opticalInfo.OpticalTxPower1, cardLabel, descOpticalTxPower1) - doUpdateMetricWithValidateNum(ch, timestamp, opticalInfo.OpticalTxPower2, cardLabel, descOpticalTxPower2) - doUpdateMetricWithValidateNum(ch, timestamp, opticalInfo.OpticalTxPower3, cardLabel, descOpticalTxPower3) - - doUpdateMetricWithValidateNum(ch, timestamp, opticalInfo.OpticalRxPower0, cardLabel, descOpticalRxPower0) - doUpdateMetricWithValidateNum(ch, timestamp, opticalInfo.OpticalRxPower1, cardLabel, descOpticalRxPower1) - doUpdateMetricWithValidateNum(ch, timestamp, opticalInfo.OpticalRxPower2, cardLabel, descOpticalRxPower2) - doUpdateMetricWithValidateNum(ch, timestamp, opticalInfo.OpticalRxPower3, cardLabel, descOpticalRxPower3) - } - - updateFrame[opticalCache](colcommon.GetCacheKey(c), n, containerMap, chips, updateSingleChip) - -} - -// UpdateTelegraf update telegraf metrics -func (c *OpticalCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} { - - caches := colcommon.GetInfoFromCache[opticalCache](n, colcommon.GetCacheKey(c)) - for _, chip := range chips { - cache, ok := caches[chip.PhyId] - if !ok { - continue - } - fieldMap := getFieldMap(fieldsMap, cache.chip.LogicID) - - extInfo := cache.extInfo - if extInfo == nil { - continue - } - 
doUpdateTelegrafWithValidateNum(fieldMap, descOpticalState, extInfo.OpticalState, "") - doUpdateTelegrafWithValidateNum(fieldMap, descOpticalVcc, extInfo.OpticalVcc, "") - doUpdateTelegrafWithValidateNum(fieldMap, descOpticalTemp, extInfo.OpticalTemp, "") - - doUpdateTelegrafWithValidateNum(fieldMap, descOpticalTxPower0, extInfo.OpticalTxPower0, "") - doUpdateTelegrafWithValidateNum(fieldMap, descOpticalTxPower1, extInfo.OpticalTxPower1, "") - doUpdateTelegrafWithValidateNum(fieldMap, descOpticalTxPower2, extInfo.OpticalTxPower2, "") - doUpdateTelegrafWithValidateNum(fieldMap, descOpticalTxPower3, extInfo.OpticalTxPower3, "") - - doUpdateTelegrafWithValidateNum(fieldMap, descOpticalRxPower0, extInfo.OpticalRxPower0, "") - doUpdateTelegrafWithValidateNum(fieldMap, descOpticalRxPower1, extInfo.OpticalRxPower1, "") - doUpdateTelegrafWithValidateNum(fieldMap, descOpticalRxPower2, extInfo.OpticalRxPower2, "") - doUpdateTelegrafWithValidateNum(fieldMap, descOpticalRxPower3, extInfo.OpticalRxPower3, "") - } - return fieldsMap -} - -func getMainOptInfo(opticalInfo map[string]string) *common.OpticalInfo { - mainOpticalInfo := common.OpticalInfo{} - mainOpticalInfo.OpticalTxPower0 = hccn.GetFloatDataFromStr(opticalInfo[txPower0], txPower0) - mainOpticalInfo.OpticalTxPower1 = hccn.GetFloatDataFromStr(opticalInfo[txPower1], txPower1) - mainOpticalInfo.OpticalTxPower2 = hccn.GetFloatDataFromStr(opticalInfo[txPower2], txPower2) - mainOpticalInfo.OpticalTxPower3 = hccn.GetFloatDataFromStr(opticalInfo[txPower3], txPower3) - mainOpticalInfo.OpticalRxPower0 = hccn.GetFloatDataFromStr(opticalInfo[rxPower0], rxPower0) - mainOpticalInfo.OpticalRxPower1 = hccn.GetFloatDataFromStr(opticalInfo[rxPower1], rxPower1) - mainOpticalInfo.OpticalRxPower2 = hccn.GetFloatDataFromStr(opticalInfo[rxPower2], rxPower2) - mainOpticalInfo.OpticalRxPower3 = hccn.GetFloatDataFromStr(opticalInfo[rxPower3], rxPower3) - mainOpticalInfo.OpticalVcc = hccn.GetFloatDataFromStr(opticalInfo[voltage], voltage) - 
mainOpticalInfo.OpticalTemp = hccn.GetFloatDataFromStr(opticalInfo[temperature], temperature) - var optState float64 - if opticalInfo[present] == present { - optState = 1.0 - } else if opticalInfo[present] == notPresent { - optState = 0.0 - } else { - optState = common.RetError - } - mainOpticalInfo.OpticalState = optState - - return &mainOpticalInfo -} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_pcie.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_pcie.go deleted file mode 100644 index f68f95b..0000000 --- a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_pcie.go +++ /dev/null @@ -1,234 +0,0 @@ -/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package metrics for general collector -package metrics - -import ( - "time" - - "github.com/prometheus/client_golang/prometheus" - - "ascend-common/api" - "ascend-common/common-utils/hwlog" - "ascend-common/devmanager/common" - colcommon "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" - "huawei.com/npu-exporter/v6/utils/logger" -) - -const ( - pcieBwType = "pcie_bw_type" - avgPcieBw = "avgPcieBw" - minPcieBw = "minPcieBw" - maxPcieBw = "maxPcieBw" - - avgPostfix = "_avgPcieBw" - minPostfix = "_minPcieBw" - maxPostfix = "_maxPcieBw" -) - -var ( - pcieBwLabel = append(colcommon.CardLabel, pcieBwType) - - descRxPBW = colcommon.BuildDescWithLabel("npu_chip_info_pcie_rx_p_bw", - "the npu write bw to remote‘s speed, unit is 'MB/ms'", pcieBwLabel) - - descRxNpBW = colcommon.BuildDescWithLabel("npu_chip_info_pcie_rx_np_bw", - "the npu read bw's speed from remote, unit is 'MB/ms'", pcieBwLabel) - - descRxCplBW = colcommon.BuildDescWithLabel("npu_chip_info_pcie_rx_cpl_bw", - "the npu reply remote read operate cpl's speed, unit is 'MB/ms'", pcieBwLabel) - - descTxPBW = colcommon.BuildDescWithLabel("npu_chip_info_pcie_tx_p_bw", - "the npu receive remote write operate's speed, unit is 'MB/ms'", pcieBwLabel) - - descTxNpBW = colcommon.BuildDescWithLabel("npu_chip_info_pcie_tx_np_bw", - "the npu receive remote read operate's speed, unit is 'MB/ms'", pcieBwLabel) - - descTxCplBW = colcommon.BuildDescWithLabel("npu_chip_info_pcie_tx_cpl_bw", - "the npu read cpl's responese bw speed from remote, unit is 'MB/ms'", pcieBwLabel) -) -var ( - supportedPcieDevices = map[string]bool{ - api.Ascend910B: true, - } -) - -type pcieCache struct { - chip colcommon.HuaWeiAIChip - timestamp time.Time - // extInfo pcie transport and receive bandwidth, have six metrics - extInfo *common.PCIEBwStat -} - -// PcieCollector collect pcie info -type PcieCollector struct { - colcommon.MetricsCollectorAdapter -} - -// IsSupported check whether the 
collector is supported -func (c *PcieCollector) IsSupported(n *colcommon.NpuCollector) bool { - // only 910A2 supports pcie info - isSupport := supportedPcieDevices[n.Dmgr.GetDevType()] - logForUnSupportDevice(isSupport, n.Dmgr.GetDevType(), colcommon.GetCacheKey(c), "") - return isSupport -} - -// Describe description of the metric -func (c *PcieCollector) Describe(ch chan<- *prometheus.Desc) { - ch <- descRxPBW - ch <- descTxPBW - ch <- descRxNpBW - ch <- descTxNpBW - ch <- descRxCplBW - ch <- descTxCplBW -} - -// CollectToCache collect the metric to cache -func (c *PcieCollector) CollectToCache(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) { - for _, chip := range chipList { - pcieBwInfo, err := n.Dmgr.GetPCIEBandwidth(chip.LogicID, common.ProfilingTime) - if err != nil { - logErrMetricsWithLimit(colcommon.DomainForPcieBandwidth, chip.LogicID, err) - continue - } - hwlog.ResetErrCnt(colcommon.DomainForPcieBandwidth, chip.LogicID) - c.LocalCache.Store(chip.PhyId, pcieCache{chip: chip, timestamp: time.Now(), extInfo: &pcieBwInfo}) - } - colcommon.UpdateCache[pcieCache](n, colcommon.GetCacheKey(c), &c.LocalCache) -} - -// UpdatePrometheus update prometheus metrics -func (c *PcieCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) { - - updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache pcieCache, cardLabel []string) { - pcieBwInfo := cache.extInfo - if pcieBwInfo == nil { - return - } - - if cache.chip.VDevActivityInfo != nil && common.IsValidVDevID(cache.chip.VDevActivityInfo.VDevID) { - logger.Debug("vnpu does not supports pcie info query") - return - } - - timestamp := cache.timestamp - - updateAvgPcieBwInfo(ch, timestamp, pcieBwInfo, cardLabel) - updateMinPcieBwInfo(ch, timestamp, pcieBwInfo, cardLabel) - updateMaxPcieBwInfo(ch, timestamp, pcieBwInfo, cardLabel) - } - - updateFrame[pcieCache](colcommon.GetCacheKey(c), 
n, containerMap, chips, updateSingleChip) - -} - -// UpdateTelegraf update telegraf metrics -func (c *PcieCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} { - - caches := colcommon.GetInfoFromCache[pcieCache](n, colcommon.GetCacheKey(c)) - for _, chip := range chips { - cache, ok := caches[chip.PhyId] - if !ok { - continue - } - fieldMap := getFieldMap(fieldsMap, cache.chip.LogicID) - - extInfo := cache.extInfo - if extInfo == nil { - continue - } - doUpdateTelegraf(fieldMap, descTxPBW, extInfo.PcieTxPBw.PcieAvgBw, avgPostfix) - doUpdateTelegraf(fieldMap, descTxNpBW, extInfo.PcieTxNPBw.PcieAvgBw, avgPostfix) - doUpdateTelegraf(fieldMap, descTxCplBW, extInfo.PcieTxCPLBw.PcieAvgBw, avgPostfix) - doUpdateTelegraf(fieldMap, descRxPBW, extInfo.PcieRxPBw.PcieAvgBw, avgPostfix) - doUpdateTelegraf(fieldMap, descRxNpBW, extInfo.PcieRxNPBw.PcieAvgBw, avgPostfix) - doUpdateTelegraf(fieldMap, descRxCplBW, extInfo.PcieRxCPLBw.PcieAvgBw, avgPostfix) - - doUpdateTelegraf(fieldMap, descTxPBW, extInfo.PcieTxPBw.PcieMinBw, minPostfix) - doUpdateTelegraf(fieldMap, descTxNpBW, extInfo.PcieTxNPBw.PcieMinBw, minPostfix) - doUpdateTelegraf(fieldMap, descTxCplBW, extInfo.PcieTxCPLBw.PcieMinBw, minPostfix) - doUpdateTelegraf(fieldMap, descRxPBW, extInfo.PcieRxPBw.PcieMinBw, minPostfix) - doUpdateTelegraf(fieldMap, descRxNpBW, extInfo.PcieRxNPBw.PcieMinBw, minPostfix) - doUpdateTelegraf(fieldMap, descRxCplBW, extInfo.PcieRxCPLBw.PcieMinBw, minPostfix) - - doUpdateTelegraf(fieldMap, descTxPBW, extInfo.PcieTxPBw.PcieMaxBw, maxPostfix) - doUpdateTelegraf(fieldMap, descTxNpBW, extInfo.PcieTxNPBw.PcieMaxBw, maxPostfix) - doUpdateTelegraf(fieldMap, descTxCplBW, extInfo.PcieTxCPLBw.PcieMaxBw, maxPostfix) - doUpdateTelegraf(fieldMap, descRxPBW, extInfo.PcieRxPBw.PcieMaxBw, maxPostfix) - doUpdateTelegraf(fieldMap, descRxNpBW, 
extInfo.PcieRxNPBw.PcieMaxBw, maxPostfix) - doUpdateTelegraf(fieldMap, descRxCplBW, extInfo.PcieRxCPLBw.PcieMaxBw, maxPostfix) - - } - return fieldsMap -} - -func pcieBwLabelVal(cardLabels []string, pcieBwType string) []string { - return append(cardLabels, pcieBwType) -} - -func metricWithPcieBw(labelsVal []string, metrics *prometheus.Desc, val float64, valType string) prometheus.Metric { - return prometheus.MustNewConstMetric(metrics, prometheus.GaugeValue, val, pcieBwLabelVal(labelsVal, valType)...) -} - -func updateAvgPcieBwInfo(ch chan<- prometheus.Metric, timestamp time.Time, pcieBwInfo *common.PCIEBwStat, - cardLabel []string) { - ch <- prometheus.NewMetricWithTimestamp(timestamp, - metricWithPcieBw(cardLabel, descTxPBW, float64(pcieBwInfo.PcieTxPBw.PcieAvgBw), avgPcieBw)) - ch <- prometheus.NewMetricWithTimestamp(timestamp, - metricWithPcieBw(cardLabel, descTxNpBW, float64(pcieBwInfo.PcieTxNPBw.PcieAvgBw), avgPcieBw)) - ch <- prometheus.NewMetricWithTimestamp(timestamp, - metricWithPcieBw(cardLabel, descTxCplBW, float64(pcieBwInfo.PcieTxCPLBw.PcieAvgBw), avgPcieBw)) - ch <- prometheus.NewMetricWithTimestamp(timestamp, - metricWithPcieBw(cardLabel, descRxPBW, float64(pcieBwInfo.PcieRxPBw.PcieAvgBw), avgPcieBw)) - ch <- prometheus.NewMetricWithTimestamp(timestamp, - metricWithPcieBw(cardLabel, descRxNpBW, float64(pcieBwInfo.PcieRxNPBw.PcieAvgBw), avgPcieBw)) - ch <- prometheus.NewMetricWithTimestamp(timestamp, - metricWithPcieBw(cardLabel, descRxCplBW, float64(pcieBwInfo.PcieRxCPLBw.PcieAvgBw), avgPcieBw)) -} - -func updateMinPcieBwInfo(ch chan<- prometheus.Metric, timestamp time.Time, pcieBwInfo *common.PCIEBwStat, - cardLabel []string) { - ch <- prometheus.NewMetricWithTimestamp(timestamp, - metricWithPcieBw(cardLabel, descTxPBW, float64(pcieBwInfo.PcieTxPBw.PcieMinBw), minPcieBw)) - ch <- prometheus.NewMetricWithTimestamp(timestamp, - metricWithPcieBw(cardLabel, descTxNpBW, float64(pcieBwInfo.PcieTxNPBw.PcieMinBw), minPcieBw)) - ch <- 
prometheus.NewMetricWithTimestamp(timestamp, - metricWithPcieBw(cardLabel, descTxCplBW, float64(pcieBwInfo.PcieTxCPLBw.PcieMinBw), minPcieBw)) - ch <- prometheus.NewMetricWithTimestamp(timestamp, - metricWithPcieBw(cardLabel, descRxPBW, float64(pcieBwInfo.PcieRxPBw.PcieMinBw), minPcieBw)) - ch <- prometheus.NewMetricWithTimestamp(timestamp, - metricWithPcieBw(cardLabel, descRxNpBW, float64(pcieBwInfo.PcieRxNPBw.PcieMinBw), minPcieBw)) - ch <- prometheus.NewMetricWithTimestamp(timestamp, - metricWithPcieBw(cardLabel, descRxCplBW, float64(pcieBwInfo.PcieRxCPLBw.PcieMinBw), minPcieBw)) -} - -func updateMaxPcieBwInfo(ch chan<- prometheus.Metric, timestamp time.Time, pcieBwInfo *common.PCIEBwStat, - cardLabel []string) { - ch <- prometheus.NewMetricWithTimestamp(timestamp, - metricWithPcieBw(cardLabel, descTxPBW, float64(pcieBwInfo.PcieTxPBw.PcieMaxBw), maxPcieBw)) - ch <- prometheus.NewMetricWithTimestamp(timestamp, - metricWithPcieBw(cardLabel, descTxNpBW, float64(pcieBwInfo.PcieTxNPBw.PcieMaxBw), maxPcieBw)) - ch <- prometheus.NewMetricWithTimestamp(timestamp, - metricWithPcieBw(cardLabel, descTxCplBW, float64(pcieBwInfo.PcieTxCPLBw.PcieMaxBw), maxPcieBw)) - ch <- prometheus.NewMetricWithTimestamp(timestamp, - metricWithPcieBw(cardLabel, descRxPBW, float64(pcieBwInfo.PcieRxPBw.PcieMaxBw), maxPcieBw)) - ch <- prometheus.NewMetricWithTimestamp(timestamp, - metricWithPcieBw(cardLabel, descRxNpBW, float64(pcieBwInfo.PcieRxNPBw.PcieMaxBw), maxPcieBw)) - ch <- prometheus.NewMetricWithTimestamp(timestamp, - metricWithPcieBw(cardLabel, descRxCplBW, float64(pcieBwInfo.PcieRxCPLBw.PcieMaxBw), maxPcieBw)) -} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_roce.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_roce.go deleted file mode 100644 index b1d307c..0000000 --- a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_roce.go +++ /dev/null @@ -1,263 +0,0 @@ -/* Copyright(C) 2025-2025. 
Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package metrics for general collector -package metrics - -import ( - "time" - - "github.com/prometheus/client_golang/prometheus" - - "ascend-common/common-utils/hwlog" - "ascend-common/devmanager/common" - "ascend-common/devmanager/hccn" - colcommon "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" -) - -const ( - macRxMacPauseNum = "mac_rx_mac_pause_num" - macTxMacPauseNum = "mac_tx_mac_pause_num" - macRxPfcPktNum = "mac_rx_pfc_pkt_num" - macTxPfcPktNum = "mac_tx_pfc_pkt_num" - macRxBadPktNum = "mac_rx_bad_pkt_num" - macTxBadPktNum = "mac_tx_bad_pkt_num" - roCERxAllPktNum = "roce_rx_all_pkt_num" - roCETxAllPktNum = "roce_tx_all_pkt_num" - roCERxErrPktNum = "roce_rx_err_pkt_num" - roCETxErrPktNum = "roce_tx_err_pkt_num" - roCERxCnpPktNum = "roce_rx_cnp_pkt_num" - roCETxCnpPktNum = "roce_tx_cnp_pkt_num" - macRxBadOctNum = "mac_rx_bad_oct_num" - macTxBadOctNum = "mac_tx_bad_oct_num" - roCEUnexpectedAckNum = "roce_unexpected_ack_num" - roCEOutOfOrderNum = "roce_out_of_order_num" - roCEVerificationErrNum = "roce_verification_err_num" - roCEQpStatusErrNum = "roce_qp_status_err_num" - roCENewPktRtyNum = "roce_new_pkt_rty_num" - roCEEcnDBNum = "roce_ecn_db_num" - macRXFcsErrPktNum = "mac_rx_fcs_err_pkt_num" -) - -var ( - // mac - descMacRxPauseNum = colcommon.BuildDesc("npu_chip_mac_rx_pause_num", "npu interface receive 
mac-rx-pause-num") - descMacTxPauseNum = colcommon.BuildDesc("npu_chip_mac_tx_pause_num", "npu interface receive mac-tx-pause-num") - descMacRxPfcPktNum = colcommon.BuildDesc("npu_chip_mac_rx_pfc_pkt_num", "npu interface receive mac-rx-pfc-pkt-num") - descMacTxPfcPktNum = colcommon.BuildDesc("npu_chip_mac_tx_pfc_pkt_num", "npu interface receive mac-tx-pfc-pkt-num") - descMacRxBadPktNum = colcommon.BuildDesc("npu_chip_mac_rx_bad_pkt_num", "npu interface receive mac-rx-bad-pkt-num") - descMacTxBadPktNum = colcommon.BuildDesc("npu_chip_mac_tx_bad_pkt_num", "npu interface receive mac-tx-bad-pkt-num") - descMacTxBadOctNum = colcommon.BuildDesc("npu_chip_mac_tx_bad_oct_num", "npu interface receive mac-tx-bad-oct-num") - descMacRxBadOctNum = colcommon.BuildDesc("npu_chip_mac_rx_bad_oct_num", "npu interface receive mac-rx-bad-oct-num") - - descRxFCSNum = colcommon.BuildDesc("npu_chip_info_rx_fcs_num", "the npu network fcs receive number") - descRxECNNum = colcommon.BuildDesc("npu_chip_info_rx_ecn_num", "the npu network ecn receive number") - - // roce - descRoceRxAllPktNum = colcommon.BuildDesc("npu_chip_roce_rx_all_pkt_num", "npu interface receive roce-rx-all-pkt-num") - descRoceTxAllPktNum = colcommon.BuildDesc("npu_chip_roce_tx_all_pkt_num", "npu interface receive roce-tx-all-pkt-num") - descRoceRxErrPktNum = colcommon.BuildDesc("npu_chip_roce_rx_err_pkt_num", "npu interface receive roce-rx-err-pkt-num") - descRoceTxErrPktNum = colcommon.BuildDesc("npu_chip_roce_tx_err_pkt_num", "npu interface receive roce-tx-err-pkt-num") - descRoceRxCnpPktNum = colcommon.BuildDesc("npu_chip_roce_rx_cnp_pkt_num", "npu interface receive roce-rx-cnp-pkt-num") - descRoceTxCnpPktNum = colcommon.BuildDesc("npu_chip_roce_tx_cnp_pkt_num", "npu interface receive roce-tx-cnp-pkt-num") - - descRoceNewPktRtyNum = colcommon.BuildDesc("npu_chip_roce_new_pkt_rty_num", - "npu interface receive roce-new-pkt-rty-num") - descRoceOutOfOrderNum = colcommon.BuildDesc("npu_chip_roce_out_of_order_num", - 
"the npu interface receive roce-out-of-order-num") - descRoceQpStatusErrNum = colcommon.BuildDesc("npu_chip_roce_qp_status_err_num", - "the npu interface receive roce-qp-status-err-num") - descRoceUnexpectedAcktNum = colcommon.BuildDesc("npu_chip_roce_unexpected_ack_num", - "the npu interface receive roce-unexpected-ack-num") - descRoceVerificationErrNum = colcommon.BuildDesc("npu_chip_roce_verification_err_num", - "the npu interface receive roce-verification-err-num") -) - -type roceCache struct { - chip colcommon.HuaWeiAIChip - timestamp time.Time - // extInfo the statistics about packets - extInfo *common.StatInfo -} - -// RoceCollector collect roce info -type RoceCollector struct { - colcommon.MetricsCollectorAdapter -} - -// IsSupported check whether the collector is supported -func (c *RoceCollector) IsSupported(n *colcommon.NpuCollector) bool { - isSupport := n.Dmgr.IsTrainingCard() - logForUnSupportDevice(isSupport, n.Dmgr.GetDevType(), colcommon.GetCacheKey(c), - "only training card supports network related info") - return isSupport -} - -// Describe description of the metric -func (c *RoceCollector) Describe(ch chan<- *prometheus.Desc) { - - // mac - ch <- descMacRxPauseNum - ch <- descMacTxPauseNum - ch <- descMacRxPfcPktNum - ch <- descMacTxPfcPktNum - ch <- descMacRxBadPktNum - ch <- descMacTxBadPktNum - ch <- descMacTxBadOctNum - ch <- descMacRxBadOctNum - ch <- descRxFCSNum - - // roce - ch <- descRoceRxAllPktNum - ch <- descRoceTxAllPktNum - ch <- descRoceRxErrPktNum - ch <- descRoceTxErrPktNum - ch <- descRoceRxCnpPktNum - ch <- descRoceTxCnpPktNum - ch <- descRoceNewPktRtyNum - ch <- descRoceUnexpectedAcktNum - ch <- descRoceOutOfOrderNum - ch <- descRoceVerificationErrNum - ch <- descRoceQpStatusErrNum - ch <- descRxECNNum - -} - -// CollectToCache collect the metric to cache -func (c *RoceCollector) CollectToCache(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) { - for _, chip := range chipList { - statInfo, err := 
hccn.GetNPUStatInfo(chip.DeviceID) - if err != nil { - logErrMetricsWithLimit(colcommon.DomainForRoce, chip.LogicID, err) - return - } - hwlog.ResetErrCnt(colcommon.DomainForRoce, chip.LogicID) - c.LocalCache.Store(chip.PhyId, roceCache{chip: chip, timestamp: time.Now(), extInfo: getMainStatInfo(statInfo)}) - } - colcommon.UpdateCache[roceCache](n, colcommon.GetCacheKey(c), &c.LocalCache) - -} - -// UpdatePrometheus update prometheus metrics -func (c *RoceCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) { - - updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache roceCache, cardLabel []string) { - statInfo := cache.extInfo - if statInfo == nil { - return - } - updateStatInfoOfMac(ch, cache.timestamp, statInfo, cardLabel) - updateStatInfoOfRoCE(ch, cache.timestamp, statInfo, cardLabel) - } - updateFrame[roceCache](colcommon.GetCacheKey(c), n, containerMap, chips, updateSingleChip) - -} - -// UpdateTelegraf update telegraf metrics -func (c *RoceCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} { - - caches := colcommon.GetInfoFromCache[roceCache](n, colcommon.GetCacheKey(c)) - for _, chip := range chips { - cache, ok := caches[chip.PhyId] - if !ok { - continue - } - fieldMap := getFieldMap(fieldsMap, cache.chip.LogicID) - - extInfo := cache.extInfo - if extInfo == nil { - continue - } - doUpdateTelegraf(fieldMap, descMacRxPauseNum, extInfo.MacRxPauseNum, "") - doUpdateTelegraf(fieldMap, descMacTxPauseNum, extInfo.MacTxPauseNum, "") - doUpdateTelegraf(fieldMap, descMacRxPfcPktNum, extInfo.MacRxPfcPktNum, "") - doUpdateTelegraf(fieldMap, descMacTxPfcPktNum, extInfo.MacTxPfcPktNum, "") - doUpdateTelegraf(fieldMap, descMacRxBadPktNum, extInfo.MacRxBadPktNum, "") - 
doUpdateTelegraf(fieldMap, descMacTxBadPktNum, extInfo.MacTxBadPktNum, "") - doUpdateTelegraf(fieldMap, descMacTxBadOctNum, extInfo.MacTxBadOctNum, "") - doUpdateTelegraf(fieldMap, descMacRxBadOctNum, extInfo.MacRxBadOctNum, "") - doUpdateTelegraf(fieldMap, descRxFCSNum, extInfo.MacRXFcsErrPktNum, "") - - doUpdateTelegraf(fieldMap, descRoceRxAllPktNum, extInfo.RoceRxAllPktNum, "") - doUpdateTelegraf(fieldMap, descRoceTxAllPktNum, extInfo.RoceTxAllPktNum, "") - doUpdateTelegraf(fieldMap, descRoceRxErrPktNum, extInfo.RoceRxErrPktNum, "") - doUpdateTelegraf(fieldMap, descRoceTxErrPktNum, extInfo.RoceTxErrPktNum, "") - doUpdateTelegraf(fieldMap, descRoceRxCnpPktNum, extInfo.RoceRxCnpPktNum, "") - doUpdateTelegraf(fieldMap, descRoceTxCnpPktNum, extInfo.RoceTxCnpPktNum, "") - doUpdateTelegraf(fieldMap, descRoceNewPktRtyNum, extInfo.RoceNewPktRtyNum, "") - doUpdateTelegraf(fieldMap, descRoceUnexpectedAcktNum, extInfo.RoceUnexpectedAckNum, "") - doUpdateTelegraf(fieldMap, descRoceOutOfOrderNum, extInfo.RoceOutOfOrderNum, "") - doUpdateTelegraf(fieldMap, descRoceVerificationErrNum, extInfo.RoceVerificationErrNum, "") - doUpdateTelegraf(fieldMap, descRoceQpStatusErrNum, extInfo.RoceQpStatusErrNum, "") - doUpdateTelegraf(fieldMap, descRxECNNum, extInfo.RoceEcnDBNum, "") - } - return fieldsMap -} -func getMainStatInfo(statInfo map[string]int) *common.StatInfo { - mainStatInfo := common.StatInfo{} - mainStatInfo.MacRxPauseNum = float64(statInfo[macRxMacPauseNum]) - mainStatInfo.MacTxPauseNum = float64(statInfo[macTxMacPauseNum]) - mainStatInfo.MacRxPfcPktNum = float64(statInfo[macRxPfcPktNum]) - mainStatInfo.MacTxPfcPktNum = float64(statInfo[macTxPfcPktNum]) - mainStatInfo.MacRxBadPktNum = float64(statInfo[macRxBadPktNum]) - mainStatInfo.MacTxBadPktNum = float64(statInfo[macTxBadPktNum]) - mainStatInfo.RoceRxAllPktNum = float64(statInfo[roCERxAllPktNum]) - mainStatInfo.RoceTxAllPktNum = float64(statInfo[roCETxAllPktNum]) - mainStatInfo.RoceRxErrPktNum = 
float64(statInfo[roCERxErrPktNum]) - mainStatInfo.RoceTxErrPktNum = float64(statInfo[roCETxErrPktNum]) - mainStatInfo.RoceRxCnpPktNum = float64(statInfo[roCERxCnpPktNum]) - mainStatInfo.RoceTxCnpPktNum = float64(statInfo[roCETxCnpPktNum]) - mainStatInfo.MacRxBadOctNum = float64(statInfo[macRxBadOctNum]) - mainStatInfo.MacTxBadOctNum = float64(statInfo[macTxBadOctNum]) - mainStatInfo.RoceUnexpectedAckNum = float64(statInfo[roCEUnexpectedAckNum]) - mainStatInfo.RoceOutOfOrderNum = float64(statInfo[roCEOutOfOrderNum]) - mainStatInfo.RoceVerificationErrNum = float64(statInfo[roCEVerificationErrNum]) - mainStatInfo.RoceQpStatusErrNum = float64(statInfo[roCEQpStatusErrNum]) - mainStatInfo.RoceNewPktRtyNum = float64(statInfo[roCENewPktRtyNum]) - mainStatInfo.RoceEcnDBNum = float64(statInfo[roCEEcnDBNum]) - mainStatInfo.MacRXFcsErrPktNum = float64(statInfo[macRXFcsErrPktNum]) - - return &mainStatInfo -} - -func updateStatInfoOfMac(ch chan<- prometheus.Metric, ts time.Time, statInfo *common.StatInfo, cardLabel []string) { - doUpdateMetric(ch, ts, statInfo.MacRxPauseNum, cardLabel, descMacRxPauseNum) - doUpdateMetric(ch, ts, statInfo.MacTxPauseNum, cardLabel, descMacTxPauseNum) - doUpdateMetric(ch, ts, statInfo.MacRxPfcPktNum, cardLabel, descMacRxPfcPktNum) - doUpdateMetric(ch, ts, statInfo.MacTxPfcPktNum, cardLabel, descMacTxPfcPktNum) - doUpdateMetric(ch, ts, statInfo.MacRxBadPktNum, cardLabel, descMacRxBadPktNum) - doUpdateMetric(ch, ts, statInfo.MacTxBadPktNum, cardLabel, descMacTxBadPktNum) - doUpdateMetric(ch, ts, statInfo.MacTxBadOctNum, cardLabel, descMacTxBadOctNum) - doUpdateMetric(ch, ts, statInfo.MacRxBadOctNum, cardLabel, descMacRxBadOctNum) - doUpdateMetric(ch, ts, statInfo.MacRXFcsErrPktNum, cardLabel, descRxFCSNum) -} - -func updateStatInfoOfRoCE(ch chan<- prometheus.Metric, ts time.Time, statInfo *common.StatInfo, cardLabel []string) { - doUpdateMetric(ch, ts, statInfo.RoceRxAllPktNum, cardLabel, descRoceRxAllPktNum) - doUpdateMetric(ch, ts, 
statInfo.RoceTxAllPktNum, cardLabel, descRoceTxAllPktNum) - doUpdateMetric(ch, ts, statInfo.RoceRxErrPktNum, cardLabel, descRoceRxErrPktNum) - doUpdateMetric(ch, ts, statInfo.RoceTxErrPktNum, cardLabel, descRoceTxErrPktNum) - doUpdateMetric(ch, ts, statInfo.RoceRxCnpPktNum, cardLabel, descRoceRxCnpPktNum) - doUpdateMetric(ch, ts, statInfo.RoceTxCnpPktNum, cardLabel, descRoceTxCnpPktNum) - doUpdateMetric(ch, ts, statInfo.RoceNewPktRtyNum, cardLabel, descRoceNewPktRtyNum) - doUpdateMetric(ch, ts, statInfo.RoceUnexpectedAckNum, cardLabel, descRoceUnexpectedAcktNum) - doUpdateMetric(ch, ts, statInfo.RoceOutOfOrderNum, cardLabel, descRoceOutOfOrderNum) - doUpdateMetric(ch, ts, statInfo.RoceVerificationErrNum, cardLabel, descRoceVerificationErrNum) - doUpdateMetric(ch, ts, statInfo.RoceQpStatusErrNum, cardLabel, descRoceQpStatusErrNum) - doUpdateMetric(ch, ts, statInfo.RoceEcnDBNum, cardLabel, descRxECNNum) - doUpdateMetric(ch, ts, statInfo.RoceEcnDBNum, cardLabel, descRxECNNum) -} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_sio.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_sio.go deleted file mode 100644 index 918469c..0000000 --- a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_sio.go +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package metrics for general collector -package metrics - -import ( - "time" - - "github.com/prometheus/client_golang/prometheus" - - "ascend-common/api" - "ascend-common/common-utils/hwlog" - "ascend-common/devmanager/common" - colcommon "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" -) - -var ( - descSioCrcTxErrCnt = colcommon.BuildDesc("npu_chip_info_sio_crc_tx_err_cnt", - "sio transmitted error count between die") - descSioCrcRxErrCnt = colcommon.BuildDesc("npu_chip_info_sio_crc_rx_err_cnt", - "sio received error count between die") -) -var ( - supportedSioDevices = map[string]bool{ - api.Ascend910A3: true, - } -) - -type sioCache struct { - chip colcommon.HuaWeiAIChip - timestamp time.Time - // extInfo sio status between dies, only support super pod - extInfo *common.SioCrcErrStatisticInfo -} - -// SioCollector collect sio info -type SioCollector struct { - colcommon.MetricsCollectorAdapter -} - -// IsSupported check whether the collector is supported -func (c *SioCollector) IsSupported(n *colcommon.NpuCollector) bool { - isSupport := supportedSioDevices[n.Dmgr.GetDevType()] - logForUnSupportDevice(isSupport, n.Dmgr.GetDevType(), colcommon.GetCacheKey(c), - "sio information cannot be queried.") - return isSupport -} - -// Describe description of the metric -func (c *SioCollector) Describe(ch chan<- *prometheus.Desc) { - ch <- descSioCrcTxErrCnt - ch <- descSioCrcRxErrCnt -} - -// CollectToCache collect the metric to cache -func (c *SioCollector) CollectToCache(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) { - for _, chip := range chipList { - logicID := chip.LogicID - sioInfo, err := n.Dmgr.GetSioInfo(logicID) - if err != nil { - logErrMetricsWithLimit(colcommon.DomainForSio, logicID, err) - continue - } - hwlog.ResetErrCnt(colcommon.DomainForSio, logicID) - - c.LocalCache.Store(chip.PhyId, sioCache{chip: chip, timestamp: time.Now(), extInfo: sioInfo}) - } - 
colcommon.UpdateCache[sioCache](n, colcommon.GetCacheKey(c), &c.LocalCache) -} - -// UpdatePrometheus update prometheus metrics -func (c *SioCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) { - - updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache sioCache, cardLabel []string) { - extInfo := cache.extInfo - if extInfo == nil { - return - } - doUpdateMetric(ch, cache.timestamp, extInfo.TxErrCnt, cardLabel, descSioCrcTxErrCnt) - doUpdateMetric(ch, cache.timestamp, extInfo.RxErrCnt, cardLabel, descSioCrcRxErrCnt) - } - updateFrame[sioCache](colcommon.GetCacheKey(c), n, containerMap, chips, updateSingleChip) -} - -// UpdateTelegraf update telegraf metrics -func (c *SioCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} { - - caches := colcommon.GetInfoFromCache[sioCache](n, colcommon.GetCacheKey(c)) - for _, chip := range chips { - cache, ok := caches[chip.PhyId] - if !ok { - continue - } - fieldMap := getFieldMap(fieldsMap, cache.chip.LogicID) - - extInfo := cache.extInfo - if extInfo == nil { - continue - } - - doUpdateTelegraf(fieldMap, descSioCrcTxErrCnt, extInfo.TxErrCnt, "") - doUpdateTelegraf(fieldMap, descSioCrcRxErrCnt, extInfo.RxErrCnt, "") - } - return fieldsMap -} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_version.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_version.go deleted file mode 100644 index 8cb32bd..0000000 --- a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_version.go +++ /dev/null @@ -1,56 +0,0 @@ -/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. 
- Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package metrics for general collector -package metrics - -import ( - "github.com/prometheus/client_golang/prometheus" - - "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" - "huawei.com/npu-exporter/v6/versions" -) - -var ( - versionInfoDesc = common.BuildDescWithLabel("npu_exporter_version_info", "exporter version with value '1'", - []string{"exporterVersion"}) -) - -// VersionCollector collect sio info -type VersionCollector struct { - common.MetricsCollectorAdapter -} - -// Describe description of the metric -func (c *VersionCollector) Describe(ch chan<- *prometheus.Desc) { - ch <- versionInfoDesc -} - -// UpdatePrometheus update prometheus metric -func (c *VersionCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *common.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []common.HuaWeiAIChip) { - ch <- prometheus.MustNewConstMetric(versionInfoDesc, prometheus.GaugeValue, 1, []string{versions.BuildVersion}...) 
-} - -// UpdateTelegraf update telegraf metric -func (c *VersionCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *common.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []common.HuaWeiAIChip) map[string]map[string]interface{} { - - if fieldsMap[common.GeneralDevTagKey] == nil { - fieldsMap[common.GeneralDevTagKey] = make(map[string]interface{}) - } - doUpdateTelegraf(fieldsMap[common.GeneralDevTagKey], versionInfoDesc, versions.BuildVersion, "") - return fieldsMap -} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_vnpu.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_vnpu.go deleted file mode 100644 index 5117ec9..0000000 --- a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_vnpu.go +++ /dev/null @@ -1,169 +0,0 @@ -/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package metrics for general collector -package metrics - -import ( - "strconv" - "time" - - "github.com/prometheus/client_golang/prometheus" - - "ascend-common/api" - "ascend-common/devmanager/common" - colcommon "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" - "huawei.com/npu-exporter/v6/utils/logger" -) - -var ( - cardLabelForVNpuName = make([]string, len(colcommon.CardLabel)) - podAiCoreUtilizationRate *prometheus.Desc = nil - podTotalMemory *prometheus.Desc = nil - podUsedMemory *prometheus.Desc = nil -) - -var ( - supportedVnpuDevices = map[string]bool{ - api.Ascend310P: true, - } -) - -const ( - vNpuUUID = "v_dev_id" - aiCoreCnt = "aicore_count" - isVirtual = "is_virtual" -) - -func init() { - cardLabelForVNpuName = append(colcommon.CardLabel, isVirtual) - cardLabelForVNpuName[2] = vNpuUUID - cardLabelForVNpuName[3] = aiCoreCnt - - podAiCoreUtilizationRate = colcommon.BuildDescWithLabel("vnpu_pod_aicore_utilization", - "the vnpu aicore utilization rate, unit is '%'", cardLabelForVNpuName) - podTotalMemory = colcommon.BuildDescWithLabel("vnpu_pod_total_memory", - "the vnpu total memory on pod, unit is 'KB'", cardLabelForVNpuName) - podUsedMemory = colcommon.BuildDescWithLabel("vnpu_pod_used_memory", - "the vnpu used memory on pod, unit is 'KB'", cardLabelForVNpuName) - -} - -// VnpuCollector collect vnpu info -type VnpuCollector struct { - colcommon.MetricsCollectorAdapter -} - -// IsSupported check whether the collector is supported -func (c *VnpuCollector) IsSupported(n *colcommon.NpuCollector) bool { - isSupport := supportedVnpuDevices[n.Dmgr.GetDevType()] - logForUnSupportDevice(isSupport, n.Dmgr.GetDevType(), colcommon.GetCacheKey(c), "") - return isSupport -} - -// Describe description of the metric -func (c *VnpuCollector) Describe(ch chan<- *prometheus.Desc) { - ch <- podAiCoreUtilizationRate - ch <- podTotalMemory - ch <- podUsedMemory -} - -// CollectToCache collect the metric to cache -func 
(c *VnpuCollector) CollectToCache(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) { - for _, chip := range chipList { - cache := &chipCache{ - chip: chip, - } - cache.timestamp = time.Now() - c.LocalCache.Store(chip.PhyId, *cache) - } - colcommon.UpdateCache[chipCache](n, colcommon.GetCacheKey(c), &c.LocalCache) -} - -// UpdatePrometheus update prometheus metrics -func (c *VnpuCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) { - - updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache chipCache, cardLabel []string) { - if chipWithVnpu.VDevActivityInfo == nil { - return - } - vDevActivityInfo := chipWithVnpu.VDevActivityInfo - if !common.IsValidVDevID(vDevActivityInfo.VDevID) { - return - } - containerName := getContainerNameArray(containerMap[int32(vDevActivityInfo.VDevID)]) - if len(containerName) != colcommon.ContainerNameLen { - return - } - cardLabel = getPodDisplayInfo(&chipWithVnpu, containerName) - doUpdateMetric(ch, cache.timestamp, vDevActivityInfo.VDevAiCoreRate, cardLabel, podAiCoreUtilizationRate) - doUpdateMetric(ch, cache.timestamp, vDevActivityInfo.VDevTotalMem, cardLabel, podTotalMemory) - doUpdateMetric(ch, cache.timestamp, vDevActivityInfo.VDevUsedMem, cardLabel, podUsedMemory) - } - - updateFrame[chipCache](colcommon.GetCacheKey(c), n, containerMap, chips, updateSingleChip) - -} - -// UpdateTelegraf update telegraf metrics -func (c *VnpuCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} { - - caches := colcommon.GetInfoFromCache[chipCache](n, colcommon.GetCacheKey(c)) - for _, chip := range chips { - cache, ok := caches[chip.PhyId] - if !ok { - continue - } - - vDevActivityInfo := chip.VDevActivityInfo - if vDevActivityInfo == nil || 
!common.IsValidVDevID(vDevActivityInfo.VDevID) { - continue - } - - devTagKey := strconv.Itoa(int(cache.chip.LogicID)) + "_" + strconv.Itoa(int(vDevActivityInfo.VDevID)) - - if fieldsMap[devTagKey] == nil { - fieldsMap[devTagKey] = make(map[string]interface{}) - } - - doUpdateTelegraf(fieldsMap[devTagKey], podAiCoreUtilizationRate, vDevActivityInfo.VDevAiCoreRate, "") - doUpdateTelegraf(fieldsMap[devTagKey], podTotalMemory, vDevActivityInfo.VDevTotalMem, "") - doUpdateTelegraf(fieldsMap[devTagKey], podUsedMemory, vDevActivityInfo.VDevUsedMem, "") - } - return fieldsMap -} - -func getPodDisplayInfo(chip *colcommon.HuaWeiAIChip, containerName []string) []string { - if len(containerName) != colcommon.ContainerNameLen { - logger.Errorf("container name length %v is not %v", len(containerName), colcommon.ContainerNameLen) - return nil - } - - chipInfo := common.DeepCopyChipInfo(chip.ChipInfo) - vDevActivityInfo := common.DeepCopyVDevActivityInfo(chip.VDevActivityInfo) - - return []string{ - strconv.Itoa(int(chip.DeviceID)), - common.GetNpuName(chipInfo), - strconv.Itoa(int(vDevActivityInfo.VDevID)), - strconv.FormatFloat(vDevActivityInfo.VDevAiCore, 'f', colcommon.DecimalPlaces, colcommon.BitSize), - containerName[colcommon.NameSpaceIdx], - containerName[colcommon.PodNameIdx], - containerName[colcommon.ConNameIdx], - strconv.FormatBool(vDevActivityInfo.IsVirtualDev), - } -} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_vnpu_test.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_vnpu_test.go deleted file mode 100644 index d57ade0..0000000 --- a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_vnpu_test.go +++ /dev/null @@ -1,202 +0,0 @@ -/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package metrics for general collector -package metrics - -import ( - "strconv" - "testing" - - "github.com/agiledragon/gomonkey/v2" - "github.com/prometheus/client_golang/prometheus" - "github.com/smartystreets/goconvey/convey" - - "ascend-common/api" - "ascend-common/devmanager/common" - colcommon "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" -) - -const ( - vnpuMetricNum = 3 - validVnpuID = 100 - invalidVnpuID = 1 -) - -// TestVnpuCollectorIsSupported test VnpuCollector IsSupported -func TestVnpuCollectorIsSupported(t *testing.T) { - n := mockNewNpuCollector() - cases := []testCase{ - buildTestCase("VnpuCollector: testIsSupported on Ascend310P", &VnpuCollector{}, api.Ascend310P, true), - buildTestCase("VnpuCollector: testIsSupported on other type", &VnpuCollector{}, "OTHER", false), - } - - for _, c := range cases { - patches := gomonkey.NewPatches() - convey.Convey(c.name, t, func() { - defer patches.Reset() - patches.ApplyMethodReturn(n.Dmgr, "GetDevType", c.deviceType) - isSupported := c.collectorType.IsSupported(n) - convey.So(isSupported, convey.ShouldEqual, c.expectValue) - }) - } -} - -func TestVnpuCollectorDescribe(t *testing.T) { - collector := &VnpuCollector{} - convey.Convey("TestVnpuCollectorDescribe", t, func() { - ch := make(chan *prometheus.Desc, vnpuMetricNum) - collector.Describe(ch) - convey.So(len(ch), convey.ShouldEqual, vnpuMetricNum) - close(ch) - }) -} - -func TestVnpuCollectorCollectToCache(t *testing.T) { - collector := &VnpuCollector{} - n := mockNewNpuCollector() - testChips := 
[]colcommon.HuaWeiAIChip{{PhyId: 0}} - - convey.Convey("TestVnpuCollectorCollectToCache", t, func() { - collector.CollectToCache(n, testChips) - cacheInfo := colcommon.GetInfoFromCache[chipCache](n, colcommon.GetCacheKey(collector)) - convey.So(cacheInfo, convey.ShouldNotBeNil) - }) -} - -func TestVnpuCollectorUpdatePrometheus(t *testing.T) { - collector := &VnpuCollector{} - n := mockNewNpuCollector() - containerMap := mockContainerInfo() - - testChips := []colcommon.HuaWeiAIChip{{PhyId: 0}} - collector.CollectToCache(n, testChips) - chip := createValidVnpuChip() - testCases := []struct { - name string - preHandleFunc func() - expectValue int - }{ - {name: "TestVnpuCollectorUpdatePrometheus_effective virtual device scenarios", - preHandleFunc: func() {}, - expectValue: vnpuMetricNum, - }, - {name: "TestVnpuCollectorUpdatePrometheus_there is no container info", - preHandleFunc: func() { - containerMap = map[int32]container.DevicesInfo{} - }, - expectValue: 0, - }, - {name: "TestVnpuCollectorUpdatePrometheus_the vdevid is invalid", - preHandleFunc: func() { - chip.VDevActivityInfo.VDevID = invalidVnpuID - }, - expectValue: 0, - }, - {name: "TestVnpuCollectorUpdatePrometheus_there is no vdev info", - preHandleFunc: func() { - chip.VDevActivityInfo = nil - }, - expectValue: 0, - }, - } - ch := make(chan prometheus.Metric, vnpuMetricNum) - defer close(ch) - for _, tt := range testCases { - convey.Convey(tt.name, t, func() { - tt.preHandleFunc() - collector.UpdatePrometheus(ch, n, containerMap, []colcommon.HuaWeiAIChip{chip}) - convey.So(len(ch), convey.ShouldEqual, tt.expectValue) - //clean ch - for { - if len(ch) == 0 { - break - } - <-ch - } - }) - } -} - -func mockContainerInfo() map[int32]container.DevicesInfo { - containerMap := map[int32]container.DevicesInfo{ - validVnpuID: { - Devices: []int{0}, - ID: strconv.Itoa(validVnpuID), - Name: "nsName_podName_ctrName", - }, - } - return containerMap -} - -func TestVnpuCollectorUpdateTelegraf(t *testing.T) { - collector 
:= &VnpuCollector{} - n := mockNewNpuCollector() - containerMap := mockContainerInfo() - testChips := []colcommon.HuaWeiAIChip{{PhyId: 0}} - collector.CollectToCache(n, testChips) - chip := createValidVnpuChip() - convey.Convey("TestVnpuCollectorUpdateTelegraf", t, func() { - convey.Convey("effective virtual device scenarios", func() { - chipsWithVnpu := []colcommon.HuaWeiAIChip{chip} - newFieldMaps := collector.UpdateTelegraf(make(map[string]map[string]interface{}), n, containerMap, chipsWithVnpu) - convey.So(len(newFieldMaps), convey.ShouldEqual, 1) - convey.So(len(newFieldMaps["0_100"]), convey.ShouldEqual, vnpuMetricNum) - }) - convey.Convey("there is no container info", func() { - chip.VDevActivityInfo = nil - chipsWithVnpu := []colcommon.HuaWeiAIChip{chip} - containerMap = map[int32]container.DevicesInfo{} - newFieldMaps := collector.UpdateTelegraf(make(map[string]map[string]interface{}), n, containerMap, chipsWithVnpu) - convey.So(len(newFieldMaps), convey.ShouldEqual, 0) - }) - - }) -} - -func TestGetPodDisplayInfo(t *testing.T) { - const num8 = 8 - convey.Convey("TestGetPodDisplayInfo", t, func() { - chip := createValidVnpuChip() - convey.Convey("valid container information", func() { - containerNames := []string{"namespace", "pod-name", "container-name"} - labels := getPodDisplayInfo(&chip, containerNames) - convey.Convey("should return 8 metrics", func() { - convey.So(len(labels), convey.ShouldEqual, num8) - convey.So(labels[len(labels)-1], convey.ShouldEqual, "true") - }) - }) - - convey.Convey("invalid container information", func() { - containerNames := []string{"short"} - labels := getPodDisplayInfo(&chip, containerNames) - convey.Convey("should return nil", func() { - convey.So(labels, convey.ShouldBeNil) - }) - }) - }) -} - -func createValidVnpuChip() colcommon.HuaWeiAIChip { - chip := createChip() - chip.VDevActivityInfo = &common.VDevActivityInfo{ - VDevID: validVnpuID, - VDevAiCore: 1, - VDevTotalMem: 1, - VDevUsedMem: 1, - IsVirtualDev: true, - 
} - return chip -} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_test.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_test.go deleted file mode 100644 index 7524c68..0000000 --- a/mind-cluster/component/npu-exporter/collector/metrics/collector_test.go +++ /dev/null @@ -1,548 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package metrics for general collector -package metrics - -import ( - "strconv" - "sync" - "testing" - "time" - - "github.com/agiledragon/gomonkey/v2" - "github.com/prometheus/client_golang/prometheus" - "github.com/smartystreets/goconvey/convey" - - "ascend-common/api" - "ascend-common/common-utils/hwlog" - "ascend-common/devmanager" - "ascend-common/devmanager/common" - "ascend-common/devmanager/hccn" - colcommon "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" - "huawei.com/npu-exporter/v6/utils/logger" -) - -const ( - maxMetricsCount = 2000 - num5 = 5 - mockContainerName = "mockContainerName" - maxChipNum int32 = 8 -) - -var ( - collectorChain []colcommon.MetricsCollector -) - -// TestDescribe test Describe -func TestDescribe(t *testing.T) { - - convey.Convey("test prometheus desc ", t, func() { - ch := make(chan *prometheus.Desc, maxMetricsCount) - for _, c := range collectorChain { - c.Describe(ch) - } - t.Logf("Describe len(ch):%v", len(ch)) - convey.So(ch, 
convey.ShouldNotBeEmpty) - }) -} - -type testCase struct { - name string - collectorType colcommon.MetricsCollector - deviceType string - expectValue bool -} - -func buildTestCase(name string, collectorType colcommon.MetricsCollector, deviceType string, - expectValue bool) testCase { - return testCase{ - name: name, - collectorType: collectorType, - deviceType: deviceType, - expectValue: expectValue, - } -} - -// testIsSupported test IsSupported -func TestIsSupported(t *testing.T) { - n := mockNewNpuCollector() - cases := []testCase{ - buildTestCase("DdrCollector: testIsSupported on Ascend310", &DdrCollector{}, api.Ascend310, true), - buildTestCase("DdrCollector: testIsSupported on Ascend310P", &DdrCollector{}, api.Ascend310P, true), - buildTestCase("DdrCollector: testIsSupported on Ascend910", &DdrCollector{}, api.Ascend910, true), - buildTestCase("DdrCollector: testIsSupported on Ascend910B", &DdrCollector{}, api.Ascend910B, false), - buildTestCase("DdrCollector: testIsSupported on Ascend910A3", &DdrCollector{}, api.Ascend910A3, false), - - buildTestCase("HccsCollector: testIsSupported on Ascend310", &HccsCollector{}, api.Ascend310, false), - buildTestCase("HccsCollector: testIsSupported on Ascend310P", &HccsCollector{}, api.Ascend310P, false), - buildTestCase("HccsCollector: testIsSupported on Ascend910", &HccsCollector{}, api.Ascend910, false), - buildTestCase("HccsCollector: testIsSupported on Ascend910B", &HccsCollector{}, api.Ascend910B, true), - buildTestCase("HccsCollector: testIsSupported on Ascend910A3", &HccsCollector{}, api.Ascend910A3, true), - - buildTestCase("SioCollector: testIsSupported on Ascend310", &SioCollector{}, api.Ascend310, false), - buildTestCase("SioCollector: testIsSupported on Ascend310P", &SioCollector{}, api.Ascend310P, false), - buildTestCase("SioCollector: testIsSupported on Ascend910", &SioCollector{}, api.Ascend910, false), - buildTestCase("SioCollector: testIsSupported on Ascend910B", &SioCollector{}, api.Ascend910B, false), - 
buildTestCase("SioCollector: testIsSupported on Ascend910A3", &SioCollector{}, api.Ascend910A3, true), - - buildTestCase("VnpuCollector: testIsSupported on Ascend310", &VnpuCollector{}, api.Ascend310, false), - buildTestCase("VnpuCollector: testIsSupported on Ascend310P", &VnpuCollector{}, api.Ascend310P, true), - buildTestCase("VnpuCollector: testIsSupported on Ascend910", &VnpuCollector{}, api.Ascend910, false), - buildTestCase("VnpuCollector: testIsSupported on Ascend910B", &VnpuCollector{}, api.Ascend910B, false), - buildTestCase("VnpuCollector: testIsSupported on Ascend910A3", &VnpuCollector{}, api.Ascend910A3, false), - } - - for _, c := range cases { - patches := gomonkey.NewPatches() - convey.Convey(c.name, t, func() { - defer patches.Reset() - patches.ApplyMethodReturn(n.Dmgr, "GetDevType", c.deviceType) - isSupported := c.collectorType.IsSupported(n) - convey.So(isSupported, convey.ShouldEqual, c.expectValue) - }) - } -} - -// TestIsSupported2 test IsSupported -func TestIsSupported2(t *testing.T) { - n := mockNewNpuCollector() - convey.Convey("TestIsSupported ", t, func() { - for _, c := range collectorChain { - c.IsSupported(n) - } - }) - -} - -// TestCollectToCache test CollectToCache -func TestCollectToCache(t *testing.T) { - n := mockNewNpuCollector() - - convey.Convey("TestCollectToCache", t, func() { - - patches := gomonkey.NewPatches() - defer patches.Reset() - patches.ApplyMethodReturn(n.Dmgr, "GetDeviceMemoryInfo", mockMemoryInfo(), nil) - patches.ApplyMethodReturn(n.Dmgr, "GetDeviceHbmInfo", mockHbmAggregateInfo().HbmInfo, nil) - patches.ApplyMethodReturn(n.Dmgr, "GetDeviceEccInfo", mockHbmAggregateInfo().ECCInfo, nil) - patches.ApplyMethodReturn(n.Dmgr, "GetHccsStatisticInfo", mockHccsStaticsInfo(), nil) - patches.ApplyMethodReturn(n.Dmgr, "GetHccsStatisticInfoInU64", mockHccsStaticsInfo(), nil) - patches.ApplyMethodReturn(n.Dmgr, "GetHccsBandwidthInfo", mockHccsBWInfo(), nil) - patches.ApplyMethodReturn(n.Dmgr, "GetPCIEBandwidth", 
mockPcieInfo(), nil) - patches.ApplyMethodReturn(n.Dmgr, "GetSioInfo", mockSioInfo(), nil) - patches.ApplyFuncReturn(hccn.GetNPULinkStatus, "UP", nil) - patches.ApplyFuncReturn(hccn.GetNPUInterfaceTraffic, float64(0), float64(0), nil) - patches.ApplyFuncReturn(hccn.GetNPULinkUpNum, 0, nil) - patches.ApplyFuncReturn(hccn.GetNPULinkSpeed, 0, nil) - patches.ApplyFuncReturn(hccn.GetNPUOpticalInfo, mockOpticalInfo(), nil) - patches.ApplyFuncReturn(hccn.GetNPUStatInfo, mockRoceInfoMap(), nil) - patches.ApplyMethodReturn(n.Dmgr, "GetDeviceFrequency", uint32(0), nil) - patches.ApplyMethodReturn(n.Dmgr, "GetDeviceTemperature", int32(0), nil) - patches.ApplyMethodReturn(n.Dmgr, "GetDeviceVoltage", float32(0), nil) - patches.ApplyMethodReturn(n.Dmgr, "GetDeviceAllErrorCode", int32(1), []int64{0}, nil) - patches.ApplyMethodReturn(n.Dmgr, "GetDeviceHealth", uint32(0), nil) - patches.ApplyMethodReturn(n.Dmgr, "GetDevicePowerInfo", float32(0), nil) - patches.ApplyMethodReturn(n.Dmgr, "GetDeviceUtilizationRate", uint32(0), nil) - patches.ApplyMethodReturn(n.Dmgr, "GetDevProcessInfo", mockProcessInfo(), nil) - - chips := mockGetNPUChipList() - for _, c := range collectorChain { - c.PreCollect(n, chips) - c.CollectToCache(n, chips) - } - - convey.So(colcommon.GetInfoFromCache[ddrCache](n, colcommon.GetCacheKey(&DdrCollector{})), - convey.ShouldNotBeEmpty) - convey.So(colcommon.GetInfoFromCache[hbmCache](n, colcommon.GetCacheKey(&HbmCollector{})), - convey.ShouldNotBeEmpty) - convey.So(colcommon.GetInfoFromCache[hccsCache](n, colcommon.GetCacheKey(&HccsCollector{})), - convey.ShouldNotBeEmpty) - convey.So(colcommon.GetInfoFromCache[netInfoCache](n, colcommon.GetCacheKey(&NetworkCollector{})), - convey.ShouldNotBeEmpty) - convey.So(colcommon.GetInfoFromCache[chipCache](n, colcommon.GetCacheKey(&BaseInfoCollector{})), - convey.ShouldNotBeEmpty) - convey.So(colcommon.GetInfoFromCache[opticalCache](n, colcommon.GetCacheKey(&OpticalCollector{})), - convey.ShouldNotBeEmpty) - 
convey.So(colcommon.GetInfoFromCache[pcieCache](n, colcommon.GetCacheKey(&PcieCollector{})), - convey.ShouldNotBeEmpty) - convey.So(colcommon.GetInfoFromCache[roceCache](n, colcommon.GetCacheKey(&RoceCollector{})), - convey.ShouldNotBeEmpty) - convey.So(colcommon.GetInfoFromCache[sioCache](n, colcommon.GetCacheKey(&SioCollector{})), - convey.ShouldNotBeEmpty) - - }) -} - -// TestUpdatePrometheus test UpdatePrometheus -func TestUpdatePrometheus(t *testing.T) { - n := mockNewNpuCollector() - - convey.Convey("TestUpdatePrometheus", t, func() { - - ch := make(chan prometheus.Metric, maxMetricsCount) - - patches := gomonkey.NewPatches() - defer patches.Reset() - containerInfos := mockGetContainerNPUInfo() - chips := mockGetNPUChipList() - - mockDdrCache(n, chips, colcommon.GetCacheKey(&DdrCollector{})) - mockHbmCache(n, chips, colcommon.GetCacheKey(&HbmCollector{})) - mockHccsCache(n, chips, colcommon.GetCacheKey(&HccsCollector{})) - mockNetInfoCache(n, chips, colcommon.GetCacheKey(&NetworkCollector{})) - mockChipCache(n, chips, colcommon.GetCacheKey(&BaseInfoCollector{})) - mockOpticalCache(n, chips, colcommon.GetCacheKey(&OpticalCollector{})) - mockPcieCache(n, chips, colcommon.GetCacheKey(&PcieCollector{})) - mockRoceCache(n, chips, colcommon.GetCacheKey(&RoceCollector{})) - mockSioCache(n, chips, colcommon.GetCacheKey(&SioCollector{})) - - for _, c := range collectorChain { - c.UpdatePrometheus(ch, n, containerInfos, chips) - } - - t.Logf("TestUpdatePrometheus len(ch):%v", len(ch)) - convey.So(ch, convey.ShouldNotBeEmpty) - }) -} - -// TestUpdateTelegraf test UpdateTelegraf -func TestUpdateTelegraf(t *testing.T) { - n := mockNewNpuCollector() - - convey.Convey("TestUpdatePrometheus", t, func() { - - patches := gomonkey.NewPatches() - defer patches.Reset() - containerInfos := mockGetContainerNPUInfo() - chips := mockGetNPUChipList() - - mockDdrCache(n, chips, colcommon.GetCacheKey(&DdrCollector{})) - mockHbmCache(n, chips, colcommon.GetCacheKey(&HbmCollector{})) - 
mockHccsCache(n, chips, colcommon.GetCacheKey(&HccsCollector{})) - mockNetInfoCache(n, chips, colcommon.GetCacheKey(&NetworkCollector{})) - mockChipCache(n, chips, colcommon.GetCacheKey(&BaseInfoCollector{})) - mockOpticalCache(n, chips, colcommon.GetCacheKey(&OpticalCollector{})) - mockPcieCache(n, chips, colcommon.GetCacheKey(&PcieCollector{})) - mockRoceCache(n, chips, colcommon.GetCacheKey(&RoceCollector{})) - mockSioCache(n, chips, colcommon.GetCacheKey(&SioCollector{})) - fieldsMap := make(map[string]map[string]interface{}) - - for _, c := range collectorChain { - c.UpdateTelegraf(fieldsMap, n, containerInfos, chips) - } - - t.Logf("fieldsMap len(ch):%v", len(fieldsMap)) - convey.So(fieldsMap, convey.ShouldNotBeEmpty) - }) -} - -func mockRoceCache(n *colcommon.NpuCollector, chips []colcommon.HuaWeiAIChip, cacheKey string) { - localCache := sync.Map{} - for _, chip := range chips { - localCache.Store(chip.PhyId, roceCache{chip: chip, timestamp: time.Now(), - extInfo: getMainStatInfo(mockRoceInfoMap())}) - } - colcommon.UpdateCache[roceCache](n, cacheKey, &localCache) -} - -func mockRoceInfoMap() map[string]int { - return map[string]int{ - macRxMacPauseNum: 0, - macTxMacPauseNum: 0, - macRxPfcPktNum: 0, - macTxPfcPktNum: 0, - macRxBadPktNum: 0, - macTxBadPktNum: 0, - roCERxAllPktNum: 0, - roCETxAllPktNum: 0, - roCERxErrPktNum: 0, - roCETxErrPktNum: 0, - roCERxCnpPktNum: 0, - roCETxCnpPktNum: 0, - macRxBadOctNum: 0, - macTxBadOctNum: 0, - roCEUnexpectedAckNum: 0, - roCEOutOfOrderNum: 0, - roCEVerificationErrNum: 0, - roCEQpStatusErrNum: 0, - roCENewPktRtyNum: 0, - roCEEcnDBNum: 0, - macRXFcsErrPktNum: 0, - } -} - -func mockDdrCache(n *colcommon.NpuCollector, chips []colcommon.HuaWeiAIChip, cacheKey string) { - localCache := sync.Map{} - for _, chip := range chips { - localCache.Store(chip.PhyId, ddrCache{chip: chip, timestamp: time.Now(), extInfo: mockMemoryInfo()}) - } - colcommon.UpdateCache[ddrCache](n, cacheKey, &localCache) -} - -func mockHccsCache(n 
*colcommon.NpuCollector, chips []colcommon.HuaWeiAIChip, cacheKey string) { - localCache := sync.Map{} - for _, chip := range chips { - localCache.Store(chip.PhyId, hccsCache{chip: chip, timestamp: time.Now(), - hccsStat: mockHccsStaticsInfo(), hccsBW: mockHccsBWInfo()}) - } - colcommon.UpdateCache[hccsCache](n, cacheKey, &localCache) -} - -func mockHccsBWInfo() *common.HccsBandwidthInfo { - return &common.HccsBandwidthInfo{ - ProfilingTime: 0, - RxBandwidth: []float64{0, 0, 0, 0, 0, 0, 0, 0}, - TxBandwidth: []float64{0, 0, 0, 0, 0, 0, 0, 0}, - TotalRxbw: 0, - TotalTxbw: 0, - } -} - -func mockHccsStaticsInfo() *common.HccsStatisticInfo { - return &common.HccsStatisticInfo{ - TxCnt: []uint64{0, 0, 0, 0, 0, 0, 0, 0}, - RxCnt: []uint64{0, 0, 0, 0, 0, 0, 0, 0}, - CrcErrCnt: []uint64{0, 0, 0, 0, 0, 0, 0, 0}, - } -} - -func mockSioCache(n *colcommon.NpuCollector, chips []colcommon.HuaWeiAIChip, cacheKey string) { - localCache := sync.Map{} - for _, chip := range chips { - localCache.Store(chip.PhyId, sioCache{chip: chip, timestamp: time.Now(), extInfo: mockSioInfo()}) - } - colcommon.UpdateCache[sioCache](n, cacheKey, &localCache) -} - -func mockSioInfo() *common.SioCrcErrStatisticInfo { - return &common.SioCrcErrStatisticInfo{ - TxErrCnt: 0, - RxErrCnt: 0, - } -} -func mockPcieCache(n *colcommon.NpuCollector, chips []colcommon.HuaWeiAIChip, cacheKey string) { - localCache := sync.Map{} - for _, chip := range chips { - pcieInfo := mockPcieInfo() - localCache.Store(chip.PhyId, pcieCache{chip: chip, timestamp: time.Now(), extInfo: &pcieInfo}) - } - colcommon.UpdateCache[pcieCache](n, cacheKey, &localCache) -} - -func mockPcieInfo() common.PCIEBwStat { - return common.PCIEBwStat{ - PcieRxPBw: common.PcieStatValue{PcieMinBw: int32(0), PcieMaxBw: int32(0), PcieAvgBw: int32(0)}, - PcieRxNPBw: common.PcieStatValue{PcieMinBw: int32(0), PcieMaxBw: int32(0), PcieAvgBw: int32(0)}, - PcieRxCPLBw: common.PcieStatValue{PcieMinBw: int32(0), PcieMaxBw: int32(0), PcieAvgBw: int32(0)}, - 
PcieTxPBw: common.PcieStatValue{PcieMinBw: int32(0), PcieMaxBw: int32(0), PcieAvgBw: int32(0)}, - PcieTxNPBw: common.PcieStatValue{PcieMinBw: int32(0), PcieMaxBw: int32(0), PcieAvgBw: int32(0)}, - PcieTxCPLBw: common.PcieStatValue{PcieMinBw: int32(0), PcieMaxBw: int32(0), PcieAvgBw: int32(0)}, - } -} - -func mockOpticalCache(n *colcommon.NpuCollector, chips []colcommon.HuaWeiAIChip, cacheKey string) { - localCache := sync.Map{} - for _, chip := range chips { - localCache.Store(chip.PhyId, opticalCache{chip: chip, timestamp: time.Now(), - extInfo: getMainOptInfo(mockOpticalInfo())}) - } - colcommon.UpdateCache[opticalCache](n, cacheKey, &localCache) -} - -func mockOpticalInfo() map[string]string { - return map[string]string{ - txPower0: "1 mW", - txPower1: "1 mW", - txPower2: "1 mW", - txPower3: "1 mW", - rxPower0: "1 mW", - rxPower1: "1 mW", - rxPower2: "1 mW", - rxPower3: "1 mW", - voltage: "1 mV", - temperature: "50 C", - present: "1.0", - } -} - -func mockHbmCache(n *colcommon.NpuCollector, chips []colcommon.HuaWeiAIChip, cacheKey string) { - localCache := sync.Map{} - for _, chip := range chips { - localCache.Store(chip.PhyId, hbmCache{chip: chip, timestamp: time.Now(), extInfo: mockHbmAggregateInfo(), - hbmUtilization: 0}, - ) - } - colcommon.UpdateCache[hbmCache](n, cacheKey, &localCache) -} - -func mockNetInfoCache(n *colcommon.NpuCollector, chips []colcommon.HuaWeiAIChip, cacheKey string) { - localCache := sync.Map{} - for _, chip := range chips { - localCache.Store(chip.PhyId, netInfoCache{chip: chip, timestamp: time.Now(), extInfo: mockNetInfo()}) - } - colcommon.UpdateCache[netInfoCache](n, cacheKey, &localCache) -} - -func mockNetInfo() *common.NpuNetInfo { - return &common.NpuNetInfo{ - LinkStatusInfo: &common.LinkStatusInfo{LinkState: "0"}, - BandwidthInfo: &common.BandwidthInfo{RxValue: 0, TxValue: 0}, - LinkStatInfo: &common.LinkStatInfo{LinkUPNum: 0}, - LinkSpeedInfo: &common.LinkSpeedInfo{Speed: 0}, - } -} - -func mockChipCache(n 
*colcommon.NpuCollector, chips []colcommon.HuaWeiAIChip, cacheKey string) { - localCache := sync.Map{} - for _, chip := range chips { - localCache.Store(chip.PhyId, chipCache{chip: chip, timestamp: time.Now(), - HealthStatus: "Healthy", - ErrorCodes: []int64{0}, - Utilization: 0, - OverallUtilization: 0, - VectorUtilization: 0, - Temperature: 0, - Power: 0, - Voltage: 0, - AICoreCurrentFreq: 0, - NetHealthStatus: "Healthy", - DevProcessInfo: mockProcessInfo(), - }) - } - colcommon.UpdateCache[chipCache](n, cacheKey, &localCache) -} - -func mockProcessInfo() *common.DevProcessInfo { - return &common.DevProcessInfo{ - ProcNum: 1, - DevProcArray: []common.DevProcInfo{{Pid: 0, MemUsage: 0}}, - } -} - -func mockMemoryInfo() *common.MemoryInfo { - return &common.MemoryInfo{ - MemorySize: 0, - MemoryAvailable: 0, - Frequency: 0, - Utilization: 0, - } -} - -func mockHbmAggregateInfo() *common.HbmAggregateInfo { - return &common.HbmAggregateInfo{ - HbmInfo: &common.HbmInfo{ - MemorySize: 1, - Frequency: 1, - Usage: 1, - Temp: 1, - BandWidthUtilRate: 1, - }, - ECCInfo: &common.ECCInfo{ - EnableFlag: 1, - }, - } -} - -func mockNewNpuCollector() *colcommon.NpuCollector { - tc := newNpuCollectorTestCase{ - cacheTime: time.Duration(num5) * time.Second, - updateTime: time.Duration(num5) * time.Second, - deviceParser: &container.DevicesParser{}, - dmgr: &devmanager.DeviceManager{}, - } - c := colcommon.NewNpuCollector(tc.cacheTime, tc.updateTime, tc.deviceParser, tc.dmgr) - return c -} - -type newNpuCollectorTestCase struct { - cacheTime time.Duration - updateTime time.Duration - deviceParser *container.DevicesParser - dmgr *devmanager.DeviceManager -} - -func mockGetNPUChipList() []colcommon.HuaWeiAIChip { - chips := make([]colcommon.HuaWeiAIChip, 0) - for id := int32(0); id < maxChipNum; id++ { - chip := colcommon.HuaWeiAIChip{ - CardId: id, - PhyId: id, - DeviceID: id, - LogicID: id, - ChipInfo: &common.ChipInfo{ - Name: api.Ascend910, - Type: "Ascend", - Version: "V1", - }, - 
} - - chips = append(chips, chip) - } - return chips -} - -func mockGetContainerNPUInfo() map[int32]container.DevicesInfo { - containsInfo := make(map[int32]container.DevicesInfo) - for id := int32(0); id < maxChipNum; id++ { - - containerInfo := container.DevicesInfo{ - ID: strconv.Itoa(int(id)), - Name: mockContainerName, - Devices: []int{int(id)}, - } - containsInfo[id] = containerInfo - } - return containsInfo -} - -func init() { - logger.HwLogConfig = &hwlog.LogConfig{ - OnlyToStdout: true, - } - logger.InitLogger("Prometheus") - - initChain() -} - -func initChain() { - collectorChain = []colcommon.MetricsCollector{ - &HccsCollector{}, - &BaseInfoCollector{}, - &SioCollector{}, - &VersionCollector{}, - &HbmCollector{}, - &DdrCollector{}, - &VnpuCollector{}, - &PcieCollector{}, - &NetworkCollector{}, - &RoceCollector{}, - &OpticalCollector{}, - } -} - -func createChip() colcommon.HuaWeiAIChip { - return colcommon.HuaWeiAIChip{ - CardId: 0, - PhyId: 0, - DeviceID: 0, - LogicID: 0, - ChipInfo: &common.ChipInfo{ - Name: api.Ascend910, - Type: "Ascend", - Version: "V1", - }, - } -} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/common_utils.go b/mind-cluster/component/npu-exporter/collector/metrics/common_utils.go deleted file mode 100644 index 7a0697d..0000000 --- a/mind-cluster/component/npu-exporter/collector/metrics/common_utils.go +++ /dev/null @@ -1,193 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package metrics offer common utils for collector -package metrics - -import ( - "math" - "reflect" - "strconv" - "strings" - "time" - - "github.com/prometheus/client_golang/prometheus" - - "ascend-common/common-utils/hwlog" - "ascend-common/devmanager/common" - colcommon "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" - "huawei.com/npu-exporter/v6/utils" - "huawei.com/npu-exporter/v6/utils/logger" -) - -func validateNum(num float64) bool { - if num == -1 || num == math.MaxUint32 || float32(num) == math.MaxUint32 { - return false - } - - return true -} - -func doUpdateTelegrafWithValidateNum(fieldMap map[string]interface{}, desc *prometheus.Desc, - value float64, extInfo string) { - if validateNum(value) { - doUpdateTelegraf(fieldMap, desc, value, extInfo) - } -} - -func doUpdateTelegraf(fieldMap map[string]interface{}, desc *prometheus.Desc, value interface{}, extInfo string) { - fieldMap[utils.GetDescName(desc)+extInfo] = value -} - -func doUpdateMetricWithValidateNum(ch chan<- prometheus.Metric, timestamp time.Time, value float64, - cardLabel []string, desc *prometheus.Desc) { - if validateNum(value) { - doUpdateMetric(ch, timestamp, value, cardLabel, desc) - } -} -func doUpdateMetric(ch chan<- prometheus.Metric, timestamp time.Time, value interface{}, - cardLabel []string, desc *prometheus.Desc) { - var finalValue float64 - - switch value.(type) { - case int: - finalValue = float64(value.(int)) - case int32: - finalValue = float64(value.(int32)) - case int64: - finalValue = float64(value.(int64)) - case uint32: - finalValue = float64(value.(uint32)) - case uint64: - finalValue = float64(value.(uint64)) - case float32: - finalValue = float64(value.(float32)) - case float64: - finalValue = value.(float64) - default: - logger.Errorf("invalid param in function doUpdateMetric,"+ - "metrics name is (%v), 
value type is (%T),value is (%v)", utils.GetDescName(desc), value, value) - } - // collect failed, set value to -1 - if finalValue == common.FailedValue { - finalValue = common.FailedMetricValue - } - ch <- prometheus.NewMetricWithTimestamp(timestamp, - prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, finalValue, cardLabel...)) -} - -func getContainerInfoWithDefault(cNameArray []string) (containerName, namespaceValue, podNameValue string) { - if len(cNameArray) == colcommon.ContainerNameLen { - namespaceValue = cNameArray[colcommon.NameSpaceIdx] - podNameValue = cNameArray[colcommon.PodNameIdx] - containerName = cNameArray[colcommon.ConNameIdx] - } - return containerName, namespaceValue, podNameValue -} - -func geenGeneralCardLabel(chip *colcommon.HuaWeiAIChip, containerMap map[int32]container.DevicesInfo) []string { - - containerInfo := geenContainerInfo(chip, containerMap) - - containerName, namespaceValue, podNameValue := getContainerInfoWithDefault(getContainerNameArray(containerInfo)) - cardLabel := collectCardLabelValue(chip, namespaceValue, podNameValue, containerName) - return cardLabel -} - -func geenContainerInfo(chip *colcommon.HuaWeiAIChip, containerMap map[int32]container.DevicesInfo) container.DevicesInfo { - deviceID := chip.DeviceID - if chip.VDevActivityInfo != nil && chip.VDevActivityInfo.IsVirtualDev { - deviceID = int32(chip.VDevActivityInfo.VDevID) - } - containerInfo, ok := containerMap[deviceID] - if !ok { - containerInfo = container.DevicesInfo{} - } - return containerInfo -} -func collectCardLabelValue(chip *colcommon.HuaWeiAIChip, namespaceValue, podNameValue, containerName string) []string { - - return []string{strconv.FormatInt(int64(chip.DeviceID), colcommon.Base), common.GetNpuName(chip.ChipInfo), chip.VDieID, - chip.PCIeBusInfo, namespaceValue, podNameValue, containerName} -} - -func getContainerNameArray(devInfo container.DevicesInfo) []string { - if devInfo.Name == "" { - return nil - } - - return 
strings.Split(devInfo.Name, "_") -} - -func getFieldMap(fieldsMap map[string]map[string]interface{}, devTagKey int32) map[string]interface{} { - devTagKeyStr := strconv.Itoa(int(devTagKey)) - if fieldsMap[devTagKeyStr] == nil { - fieldsMap[devTagKeyStr] = make(map[string]interface{}) - } - return fieldsMap[devTagKeyStr] -} - -func handleErr(err error, domain string, logicID int32) { - if err != nil { - logErrMetricsWithLimit(domain, logicID, err) - } else { - hwlog.ResetErrCnt(domain, logicID) - } -} - -func logErrMetricsWithLimit(metric string, logicID int32, err error) { - logger.LogfWithOptions(logger.ErrorLevel, logger.LogOptions{ - Domain: metric, - ID: logicID}, - "logicID(%d),%v", logicID, err) -} - -func validateNotNilForEveryElement(objs ...interface{}) bool { - for _, v := range objs { - val := reflect.ValueOf(v) - if val.Kind() != reflect.Ptr { - return false - } - if val.IsNil() { - return false - } - } - return true -} -func logForUnSupportDevice(isSupport bool, devType string, group string, extInfo string) { - if !isSupport { - logger.Infof("devType %v does not support [%v], %v", devType, group, extInfo) - } -} - -func updateFrame[T any](cacheKey string, n *colcommon.NpuCollector, containerMap map[int32]container.DevicesInfo, - chips []colcommon.HuaWeiAIChip, callBack func(chipWithVnpu colcommon.HuaWeiAIChip, cache T, cardLabel []string)) { - - caches := colcommon.GetInfoFromCache[T](n, cacheKey) - if len(caches) == 0 { - logger.Debugf("cacheKey(%v) not found", cacheKey) - return - } - for _, chip := range chips { - cardLabel := geenGeneralCardLabel(&chip, containerMap) - cache, ok := caches[chip.PhyId] - if !ok { - logger.Warnf("cacheKey(%v) not found, chip.PhyId(%v)", cacheKey, chip.PhyId) - continue - } - - callBack(chip, cache, cardLabel) - } -} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/common_utils_test.go b/mind-cluster/component/npu-exporter/collector/metrics/common_utils_test.go deleted file mode 100644 index 
9cb88bd..0000000 --- a/mind-cluster/component/npu-exporter/collector/metrics/common_utils_test.go +++ /dev/null @@ -1,165 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package metrics offer common utils for collector -package metrics - -import ( - "math" - "testing" - "time" - - "github.com/prometheus/client_golang/prometheus" - "github.com/smartystreets/goconvey/convey" - - "ascend-common/devmanager/common" - colcommon "huawei.com/npu-exporter/v6/collector/common" -) - -const ( - invalidNum = -1 - num100 = 100 -) - -// TestValidateNum test numerical verification -func TestValidateNum(t *testing.T) { - convey.Convey("TestValidateNum", t, func() { - convey.Convey("return true when the num is valid", func() { - convey.So(validateNum(0), convey.ShouldBeTrue) - convey.So(validateNum(num100), convey.ShouldBeTrue) - }) - - convey.Convey("return false when the num is invalid", func() { - convey.So(validateNum(invalidNum), convey.ShouldBeFalse) - convey.So(validateNum(math.MaxUint32), convey.ShouldBeFalse) - }) - }) -} - -// TestDoUpdateTelegraf test update telegraf -func TestDoUpdateTelegraf(t *testing.T) { - convey.Convey("TestDoUpdateTelegraf", t, func() { - fieldMap := make(map[string]interface{}) - desc := prometheus.NewDesc("test_metric", "", nil, nil) - - convey.Convey("update when num is valid", func() { - doUpdateTelegrafWithValidateNum(fieldMap, desc, num100, "_suffix") - 
convey.So(fieldMap["test_metric_suffix"], convey.ShouldEqual, num100) - }) - - convey.Convey("don't update when num is invalid", func() { - doUpdateTelegrafWithValidateNum(fieldMap, desc, -1, "_suffix") - convey.So(fieldMap, convey.ShouldBeEmpty) - }) - }) -} - -// TestDoUpdateMetric test update prometheus -func TestDoUpdateMetric(t *testing.T) { - const ( - num10 = 10 - num100 = 100 - negaNum = -5 - floatNum = 3.14 - ) - convey.Convey("TestDoUpdateMetric", t, func() { - ch := make(chan prometheus.Metric, 1) - desc := prometheus.NewDesc("test_metric", "", []string{"label"}, nil) - - convey.Convey("convert the various numeric types correctly", func() { - testCases := []struct { - input interface{} - expected float64 - }{ - {int(num10), num10}, - {int32(negaNum), negaNum}, - {uint64(num100), num100}, - {float32(floatNum), floatNum}, - } - - for _, tc := range testCases { - doUpdateMetric(ch, time.Now(), tc.input, []string{"label"}, desc) - m := <-ch - convey.So(m, convey.ShouldNotBeEmpty) - } - }) - }) -} - -// TestContainerInfo test container information processing -func TestContainerInfo(t *testing.T) { - convey.Convey("TestContainerInfo", t, func() { - convey.Convey("correctly split the array of container names", func() { - testCases := []struct { - input []string - expected []string - }{ - {[]string{"ns", "pod", "container"}, []string{"container", "ns", "pod"}}, - {[]string{"short"}, []string{"", "", ""}}, - } - - for _, tc := range testCases { - c, ns, pod := getContainerInfoWithDefault(tc.input) - convey.So([]string{c, ns, pod}, convey.ShouldResemble, tc.expected) - } - }) - }) -} - -// TestCardLabel test card label generation -func TestCardLabel(t *testing.T) { - convey.Convey("TestCardLabel", t, func() { - chip := &colcommon.HuaWeiAIChip{ - DeviceID: 0, - ChipInfo: &common.ChipInfo{Name: "1", Type: "1", Version: "1"}, - VDieID: "die1", - PCIeBusInfo: "0000:00:01.0", - } - - expected := []string{ - "0", - "1-1-1", - "die1", - "0000:00:01.0", - "test-ns", - 
"test-pod", - "test-container", - } - - convey.Convey("correctly generate an array of tags", func() { - labels := collectCardLabelValue(chip, "test-ns", "test-pod", "test-container") - convey.So(labels, convey.ShouldResemble, expected) - }) - }) -} - -// TestNilValidation test null pointer validation -func TestNilValidation(t *testing.T) { - convey.Convey("TestNilValidation", t, func() { - var nilPtr *int - val := 10 - - convey.Convey("all non null pointers should return true", func() { - convey.So(validateNotNilForEveryElement(&val), convey.ShouldBeTrue) - }) - - convey.Convey("a null pointer should return false", func() { - convey.So(validateNotNilForEveryElement(nilPtr), convey.ShouldBeFalse) - }) - - convey.Convey("non pointer types should return false", func() { - convey.So(validateNotNilForEveryElement(val), convey.ShouldBeFalse) - }) - }) -} diff --git a/mind-cluster/component/npu-exporter/collector/testdata/prometheus_metrics b/mind-cluster/component/npu-exporter/collector/testdata/prometheus_metrics deleted file mode 100644 index 8f51362..0000000 --- a/mind-cluster/component/npu-exporter/collector/testdata/prometheus_metrics +++ /dev/null @@ -1,166 +0,0 @@ -# HELP machine_npu_nums Amount of npu installed on the machine. 
-# TYPE machine_npu_nums gauge -machine_npu_nums 8 -# HELP npu_chip_info_aicore_current_freq the npu ai core current frequency, unit is 'MHz' -# TYPE npu_chip_info_aicore_current_freq gauge -npu_chip_info_aicore_current_freq{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_aicore_current_freq{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_aicore_current_freq{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_aicore_current_freq{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_aicore_current_freq{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_aicore_current_freq{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_aicore_current_freq{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_aicore_current_freq{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -# HELP npu_chip_info_bandwidth_rx the npu interface receive speed, unit is 'MB/s' -# TYPE npu_chip_info_bandwidth_rx gauge -npu_chip_info_bandwidth_rx{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_bandwidth_rx{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 
-npu_chip_info_bandwidth_rx{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_bandwidth_rx{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_bandwidth_rx{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_bandwidth_rx{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_bandwidth_rx{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_bandwidth_rx{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -# HELP npu_chip_info_bandwidth_tx the npu interface transport speed, unit is 'MB/s' -# TYPE npu_chip_info_bandwidth_tx gauge -npu_chip_info_bandwidth_tx{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_bandwidth_tx{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_bandwidth_tx{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_bandwidth_tx{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_bandwidth_tx{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_bandwidth_tx{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 
-npu_chip_info_bandwidth_tx{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_bandwidth_tx{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -# HELP npu_chip_info_error_code the npu error code -# TYPE npu_chip_info_error_code gauge -npu_chip_info_error_code{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_error_code{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_error_code{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_error_code{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_error_code{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_error_code{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_error_code{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_error_code{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -# HELP npu_chip_info_hbm_total_memory the npu hbm total memory -# TYPE npu_chip_info_hbm_total_memory gauge -npu_chip_info_hbm_total_memory{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_hbm_total_memory{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 
-npu_chip_info_hbm_total_memory{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_hbm_total_memory{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_hbm_total_memory{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_hbm_total_memory{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_hbm_total_memory{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_hbm_total_memory{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -# HELP npu_chip_info_hbm_used_memory the npu hbm used memory -# TYPE npu_chip_info_hbm_used_memory gauge -npu_chip_info_hbm_used_memory{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_hbm_used_memory{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_hbm_used_memory{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_hbm_used_memory{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_hbm_used_memory{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_hbm_used_memory{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 
-npu_chip_info_hbm_used_memory{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_hbm_used_memory{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -# HELP npu_chip_info_health_status the npu health status -# TYPE npu_chip_info_health_status gauge -npu_chip_info_health_status{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 -npu_chip_info_health_status{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 -npu_chip_info_health_status{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 -npu_chip_info_health_status{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 -npu_chip_info_health_status{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 -npu_chip_info_health_status{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 -npu_chip_info_health_status{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 -npu_chip_info_health_status{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 -# HELP npu_chip_info_link_status the npu link status -# TYPE npu_chip_info_link_status gauge -npu_chip_info_link_status{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_link_status{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 
1606402000 -npu_chip_info_link_status{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_link_status{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_link_status{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_link_status{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_link_status{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_link_status{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -# HELP npu_chip_info_name the Ascend npu name with value '1' -# TYPE npu_chip_info_name gauge -npu_chip_info_name{container_name="",id="0",name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 -npu_chip_info_name{container_name="",id="1",name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 -npu_chip_info_name{container_name="",id="2",name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 -npu_chip_info_name{container_name="",id="3",name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 -npu_chip_info_name{container_name="",id="4",name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 -npu_chip_info_name{container_name="",id="5",name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 -npu_chip_info_name{container_name="",id="6",name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 
-npu_chip_info_name{container_name="",id="7",name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 -# HELP npu_chip_info_network_status the npu network health status -# TYPE npu_chip_info_network_status gauge -npu_chip_info_network_status{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_network_status{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_network_status{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_network_status{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_network_status{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_network_status{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_network_status{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_network_status{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -# HELP npu_chip_info_power the npu power -# TYPE npu_chip_info_power gauge -npu_chip_info_power{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_power{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_power{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 
-npu_chip_info_power{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_power{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_power{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_power{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_power{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -# HELP npu_chip_info_temperature the npu temperature -# TYPE npu_chip_info_temperature gauge -npu_chip_info_temperature{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_temperature{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_temperature{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_temperature{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_temperature{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_temperature{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_temperature{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_temperature{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 
1606402000 -# HELP npu_chip_info_total_memory the npu total memory -# TYPE npu_chip_info_total_memory gauge -npu_chip_info_total_memory{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_total_memory{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_total_memory{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_total_memory{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_total_memory{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_total_memory{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_total_memory{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_total_memory{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -# HELP npu_chip_info_used_memory the npu used memory -# TYPE npu_chip_info_used_memory gauge -npu_chip_info_used_memory{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_used_memory{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_used_memory{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_used_memory{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 
-npu_chip_info_used_memory{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_used_memory{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_used_memory{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_used_memory{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -# HELP npu_chip_info_utilization the ai core utilization -# TYPE npu_chip_info_utilization gauge -npu_chip_info_utilization{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_utilization{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_utilization{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_utilization{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_utilization{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_utilization{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_utilization{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_utilization{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -# HELP npu_chip_info_voltage the npu voltage -# TYPE npu_chip_info_voltage gauge 
-npu_chip_info_voltage{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_voltage{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_voltage{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_voltage{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_voltage{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_voltage{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_voltage{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_voltage{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -# HELP npu_exporter_version_info exporter version with value '1' -# TYPE npu_exporter_version_info gauge -npu_exporter_version_info{exporterVersion=""} 1 diff --git a/mind-cluster/component/npu-exporter/collector/testdata/prometheus_metrics2 b/mind-cluster/component/npu-exporter/collector/testdata/prometheus_metrics2 deleted file mode 100644 index bd501ee..0000000 --- a/mind-cluster/component/npu-exporter/collector/testdata/prometheus_metrics2 +++ /dev/null @@ -1,6 +0,0 @@ -# HELP machine_npu_nums Amount of npu installed on the machine. 
-# TYPE machine_npu_nums gauge -machine_npu_nums 0 -# HELP npu_exporter_version_info exporter version with value '1' -# TYPE npu_exporter_version_info gauge -npu_exporter_version_info{exporterVersion=""} 1 diff --git a/mind-cluster/component/npu-exporter/go.mod b/mind-cluster/component/npu-exporter/go.mod deleted file mode 100644 index 0d84960..0000000 --- a/mind-cluster/component/npu-exporter/go.mod +++ /dev/null @@ -1,63 +0,0 @@ -module huawei.com/npu-exporter/v6 - -go 1.18 - -require ( - ascend-common v0.0.0 - github.com/agiledragon/gomonkey/v2 v2.8.0 - github.com/golang/protobuf v1.5.3 - github.com/influxdata/telegraf v1.26.3 - github.com/prometheus/client_golang v1.15.0 - github.com/smartystreets/goconvey v1.6.4 - github.com/stretchr/testify v1.8.2 - google.golang.org/grpc v1.57.2 - google.golang.org/protobuf v1.30.0 - k8s.io/cri-api v0.25.13 -) - -require ( - github.com/BurntSushi/toml v1.2.1 // indirect - github.com/alecthomas/participle v0.4.1 // indirect - github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137 // indirect - github.com/awnumar/memcall v0.1.2 // indirect - github.com/awnumar/memguard v0.22.3 // indirect - github.com/benbjohnson/clock v1.3.3 // indirect - github.com/beorn7/perks v1.0.1 // indirect - github.com/blues/jsonata-go v1.5.4 // indirect - github.com/cespare/xxhash/v2 v2.2.0 // indirect - github.com/coreos/go-semver v0.3.1 // indirect - github.com/davecgh/go-spew v1.1.1 // indirect - github.com/fatih/color v1.15.0 // indirect - github.com/fsnotify/fsnotify v1.6.0 // indirect - github.com/gobwas/glob v0.2.3 // indirect - github.com/gogo/protobuf v1.3.2 // indirect - github.com/golang/snappy v0.0.4 // indirect - github.com/google/go-cmp v0.6.0 // indirect - github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 // indirect - github.com/gosnmp/gosnmp v1.35.0 // indirect - github.com/influxdata/toml v0.0.0-20190415235208-270119a8ce65 // indirect - github.com/jtolds/gls v4.20.0+incompatible // indirect - 
github.com/mattn/go-colorable v0.1.13 // indirect - github.com/mattn/go-isatty v0.0.17 // indirect - github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect - github.com/naoina/go-stringutil v0.1.0 // indirect - github.com/philhofer/fwd v1.1.2 // indirect - github.com/pmezard/go-difflib v1.0.0 // indirect - github.com/prometheus/client_model v0.3.0 // indirect - github.com/prometheus/common v0.42.0 // indirect - github.com/prometheus/procfs v0.9.0 // indirect - github.com/prometheus/prometheus v0.42.0 // indirect - github.com/rogpeppe/go-internal v1.11.0 // indirect - github.com/sleepinggenius2/gosmi v0.4.4 // indirect - github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d // indirect - github.com/tinylib/msgp v1.1.8 // indirect - golang.org/x/crypto v0.31.0 // indirect - golang.org/x/net v0.25.0 // indirect - golang.org/x/sys v0.28.0 // indirect - golang.org/x/text v0.21.0 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20230525234030-28d5490b6b19 // indirect - gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/apimachinery v0.26.2 // indirect -) - -replace ascend-common => ../ascend-common diff --git a/mind-cluster/component/npu-exporter/go.sum b/mind-cluster/component/npu-exporter/go.sum deleted file mode 100644 index d638dd1..0000000 --- a/mind-cluster/component/npu-exporter/go.sum +++ /dev/null @@ -1,561 +0,0 @@ -cloud.google.com/go v0.110.1 h1:oDJ19Fu9TX9Xs06iyCw4yifSqZ7JQ8BeuVHcTmWQlOA= -cloud.google.com/go/bigquery v1.51.1 h1:qI/8vkBbzLkv0BJmzE7ajA6uZqQC+C31MAwgb+vJe2U= -cloud.google.com/go/compute v1.19.1 h1:am86mquDUgjGNWxiGn+5PGLbmgiWXlE/yNWpIpNvuXY= -cloud.google.com/go/compute/metadata v0.2.3 h1:mg4jlk7mCAj6xXp9UJ4fjI9VUI5rubuGBW5aJ7UnBMY= -cloud.google.com/go/iam v1.0.0 h1:hlQJMovyJJwYjZcTohUH4o1L8Z8kYz+E+W/zktiLCBc= -cloud.google.com/go/monitoring v1.13.0 h1:2qsrgXGVoRXpP7otZ14eE1I568zAa92sJSDPyOJvwjM= -cloud.google.com/go/pubsub v1.30.1 h1:RdzTlwhswvROjPIoTfnSJ9tEp0LY2S5ATX90anOw7E8= 
-cloud.google.com/go/storage v1.29.0 h1:6weCgzRvMg7lzuUurI4697AqIRPU1SvzHhynwpW31jI= -code.cloudfoundry.org/clock v1.0.0 h1:kFXWQM4bxYvdBw2X8BbBeXwQNgfoWv1vqAk2ZZyBN2o= -collectd.org v0.5.0 h1:y4uFSAuOmeVhG3GCRa3/oH+ysePfO/+eGJNfd0Qa3d8= -github.com/Azure/azure-amqp-common-go/v4 v4.0.0 h1:mV5O74KYmonn0ZXtwfMjGUtZ9Z+Hv7AIFVS1s03sRvo= -github.com/Azure/azure-event-hubs-go/v3 v3.4.0 h1:LtH0nHkXivyV/GajOu5ZFC5sb/5KZ8j+9U8UsfHVTOo= -github.com/Azure/azure-kusto-go v0.8.0 h1:AeO6VBRGzB1BhmWeheSyN+WSrx+1wmhHm47vzptitdw= -github.com/Azure/azure-pipeline-go v0.2.3 h1:7U9HBg1JFK3jHl5qmo4CTZKFTVgMwdFHMVtCdfBE21U= -github.com/Azure/azure-sdk-for-go v65.0.0+incompatible h1:HzKLt3kIwMm4KeJYTdx9EbjRYTySD/t8i1Ee/W5EGXw= -github.com/Azure/azure-sdk-for-go/sdk/azcore v0.21.1 h1:qoVeMsc9/fh/yhxVaA0obYjVH/oI/ihrOoMwsLS9KSA= -github.com/Azure/azure-sdk-for-go/sdk/azidentity v0.13.2 h1:mM/yraAumqMMIYev6zX0oxHqX6hreUs5wXf76W47r38= -github.com/Azure/azure-sdk-for-go/sdk/internal v0.9.1 h1:sLZ/Y+P/5RRtsXWylBjB5lkgixYfm0MQPiwrSX//JSo= -github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/monitor/armmonitor v0.4.1 h1:P6UDRqlbywdpvhpVZeiB5p+DuhMTrVD4xfvPW55bs8M= -github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources v0.3.1 h1:EXTDtCSTfPauGawsG+Ae/W46B1PkrgzuKNrcFqy4ljM= -github.com/Azure/azure-sdk-for-go/sdk/storage/azblob v0.3.0 h1:Px2UA+2RvSSvv+RvJNuUB6n7rs5Wsel4dXLe90Um2n4= -github.com/Azure/azure-storage-blob-go v0.15.0 h1:rXtgp8tN1p29GvpGgfJetavIG0V7OgcSXPpwp3tx6qk= -github.com/Azure/azure-storage-queue-go v0.0.0-20191125232315-636801874cdd h1:b3wyxBl3vvr15tUAziPBPK354y+LSdfPCpex5oBttHo= -github.com/Azure/go-amqp v0.18.0 h1:95bTiJq0oxjK1RUlt5T3HF/THj6jWTRZpSXMPSOJLz8= -github.com/Azure/go-ansiterm v0.0.0-20210617225240-d185dfc1b5a1 h1:UQHMgLO+TxOElx5B5HZ4hJQsoJ/PvUvKRhJHDQXO8P8= -github.com/Azure/go-autorest v14.2.0+incompatible h1:V5VMDjClD3GiElqLWO7mz2MxNAK/vTfRHdAubSIPRgs= -github.com/Azure/go-autorest/autorest v0.11.28 
h1:ndAExarwr5Y+GaHE6VCaY1kyS/HwwGGyuimVhWsHOEM= -github.com/Azure/go-autorest/autorest/adal v0.9.23 h1:Yepx8CvFxwNKpH6ja7RZ+sKX+DWYNldbLiALMC3BTz8= -github.com/Azure/go-autorest/autorest/azure/auth v0.5.12 h1:wkAZRgT/pn8HhFyzfe9UnqOjJYqlembgCTi72Bm/xKk= -github.com/Azure/go-autorest/autorest/azure/cli v0.4.5 h1:0W/yGmFdTIT77fvdlGZ0LMISoLHFJ7Tx4U0yeB+uFs4= -github.com/Azure/go-autorest/autorest/date v0.3.0 h1:7gUk1U5M/CQbp9WoqinNzJar+8KY+LPI6wiWrP/myHw= -github.com/Azure/go-autorest/autorest/to v0.4.0 h1:oXVqrxakqqV1UZdSazDOPOLvOIz+XA683u8EctwboHk= -github.com/Azure/go-autorest/autorest/validation v0.3.1 h1:AgyqjAd94fwNAoTjl/WQXg4VvFeRFpO+UhNyRXqF1ac= -github.com/Azure/go-autorest/logger v0.2.1 h1:IG7i4p/mDa2Ce4TRyAO8IHnVhAVF3RFU+ZtXWSmf4Tg= -github.com/Azure/go-autorest/tracing v0.6.0 h1:TYi4+3m5t6K48TGI9AUdb+IzbnSxvnvUMfuitfgcfuo= -github.com/Azure/go-ntlmssp v0.0.0-20220621081337-cb9428e4ac1e h1:NeAW1fUYUEWhft7pkxDf6WoUvEZJ/uOKsvtpjLnn8MU= -github.com/AzureAD/microsoft-authentication-library-for-go v0.4.0 h1:WVsrXCnHlDDX8ls+tootqRE87/hL9S/g4ewig9RsD/c= -github.com/BurntSushi/toml v1.2.1 h1:9F2/+DoOYIOksmaJFPw1tGFy1eDnIJXg+UHjuD8lTak= -github.com/BurntSushi/toml v1.2.1/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ= -github.com/ClickHouse/clickhouse-go v1.5.4 h1:cKjXeYLNWVJIx2J1K6H2CqyRmfwVJVY1OV1coaaFcI0= -github.com/Masterminds/goutils v1.1.1 h1:5nUrii3FMTL5diU80unEVvNevw1nH4+ZV4DSLVJLSYI= -github.com/Masterminds/semver v1.5.0 h1:H65muMkzWKEuNDnfl9d70GUjFniHKHRbFPGBuZ3QEww= -github.com/Masterminds/sprig v2.22.0+incompatible h1:z4yfnGrZ7netVz+0EDJ0Wi+5VZCSYp4Z0m2dk6cEM60= -github.com/Mellanox/rdmamap v0.0.0-20191106181932-7c3c4763a6ee h1:atI/FFjXh6hIVlPE1Jup9m8N4B9q/OSbMUe2EBahs+w= -github.com/Microsoft/go-winio v0.6.0 h1:slsWYD/zyx7lCXoZVlvQrj0hPTM1HI4+v1sIda2yDvg= -github.com/Shopify/sarama v1.38.1 h1:lqqPUPQZ7zPqYlWpTh+LQ9bhYNu2xJL6k1SJN4WVe2A= -github.com/aerospike/aerospike-client-go/v5 v5.11.0 h1:z3ZmDSm3I10VMXXIIrsFCFq3IenwFqTCnLNyvnFVzrk= 
-github.com/agiledragon/gomonkey/v2 v2.8.0 h1:u2K2nNGyk0ippzklz1CWalllEB9ptD+DtSXeCX5O000= -github.com/agiledragon/gomonkey/v2 v2.8.0/go.mod h1:ap1AmDzcVOAz1YpeJ3TCzIgstoaWLA6jbbgxfB4w2iY= -github.com/alecthomas/go-thrift v0.0.0-20170109061633-7914173639b2/go.mod h1:CxCgO+NdpMdi9SsTlGbc0W+/UNxO3I0AabOEJZ3w61w= -github.com/alecthomas/kong v0.2.1/go.mod h1:+inYUSluD+p4L8KdviBSgzcqEjUQOfC5fQDRFuc36lI= -github.com/alecthomas/participle v0.4.1 h1:P2PJWzwrSpuCWXKnzqvw0b0phSfH1kJo4p2HvLynVsI= -github.com/alecthomas/participle v0.4.1/go.mod h1:T8u4bQOSMwrkTWOSyt8/jSFPEnRtd0FKFMjVfYBlqPs= -github.com/alecthomas/repr v0.0.0-20181024024818-d37bc2a10ba1/go.mod h1:xTS7Pm1pD1mvyM075QCDSRqH6qRLXylzS24ZTpRiSzQ= -github.com/alecthomas/repr v0.0.0-20210301060118-828286944d6a/go.mod h1:2kn6fqh/zIyPLmm3ugklbEi5hg5wS435eygvNfaDQL8= -github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= -github.com/alecthomas/units v0.0.0-20190924025748-f65c72e2690d/go.mod h1:rBZYJk541a8SKzHPHnH3zbiI+7dagKZ0cgpgrD7Fyho= -github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137 h1:s6gZFSlWYmbqAuRjVTiNNhvNRfY2Wxp9nhfyel4rklc= -github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137/go.mod h1:OMCwj8VM1Kc9e19TLln2VL61YJF0x1XFtfdL4JdbSyE= -github.com/aliyun/alibaba-cloud-sdk-go v1.62.193 h1:Cwd5cNwrQqtOzOJ1vqswYe3amU3vOz3v0wQF8WizmXI= -github.com/amir/raidman v0.0.0-20170415203553-1ccc43bfb9c9 h1:FXrPTd8Rdlc94dKccl7KPmdmIbVh/OjelJ8/vgMRzcQ= -github.com/andybalholm/brotli v1.0.5 h1:8uQZIdzKmjc/iuPu7O2ioW48L81FgatrcpfFmiq/cCs= -github.com/antchfx/jsonquery v1.3.1 h1:kh3599hMLpygvcxoENcj99eCvnS++JjRX10LjNYhK58= -github.com/antchfx/xmlquery v1.3.15 h1:aJConNMi1sMha5G8YJoAIF5P+H+qG1L73bSItWHo8Tw= -github.com/antchfx/xpath v1.2.5-0.20230505064641-588960cceeac h1:Et7H7mEPWuivbFEXi3dWa8hobnvF380TS2mq7JmgjEI= -github.com/apache/arrow/go/arrow v0.0.0-20211112161151-bc219186db40 h1:q4dksr6ICHXqG5hm0ZW5IHyeEJXoIJSOZeBLmWPNeIQ= 
-github.com/apache/arrow/go/v12 v12.0.0 h1:xtZE63VWl7qLdB0JObIXvvhGjoVNrQ9ciIHG2OK5cmc= -github.com/apache/iotdb-client-go v0.12.2-0.20220722111104-cd17da295b46 h1:28HyUQcr8ZCyCAatR0gkf9PuLr52U2T+66tx5Th0nxI= -github.com/apache/thrift v0.18.1 h1:lNhK/1nqjbwbiOPDBPFJVKxgDEGSepKuTh6OLiXW8kg= -github.com/aristanetworks/glog v0.0.0-20191112221043-67e8567f59f3 h1:Bmjk+DjIi3tTAU0wxGaFbfjGUqlxxSXARq9A96Kgoos= -github.com/aristanetworks/goarista v0.0.0-20190325233358-a123909ec740 h1:FD4/ikKOFxwP8muWDypbmBWc634+YcAs3eBrYAmRdZY= -github.com/armon/go-metrics v0.4.1 h1:hR91U9KYmb6bLBYLQjyM+3j+rcd/UhE+G78SFnF8gJA= -github.com/awnumar/memcall v0.1.2 h1:7gOfDTL+BJ6nnbtAp9+HQzUFjtP1hEseRQq8eP055QY= -github.com/awnumar/memcall v0.1.2/go.mod h1:S911igBPR9CThzd/hYQQmTc9SWNu3ZHIlCGaWsWsoJo= -github.com/awnumar/memguard v0.22.3 h1:b4sgUXtbUjhrGELPbuC62wU+BsPQy+8lkWed9Z+pj0Y= -github.com/awnumar/memguard v0.22.3/go.mod h1:mmGunnffnLHlxE5rRgQc3j+uwPZ27eYb61ccr8Clz2Y= -github.com/aws/aws-sdk-go-v2 v1.18.0 h1:882kkTpSFhdgYRKVZ/VCgf7sd0ru57p2JCxz4/oN5RY= -github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.4.10 h1:dK82zF6kkPeCo8J1e+tGx4JdvDIQzj7ygIoLg8WMuGs= -github.com/aws/aws-sdk-go-v2/config v1.18.8 h1:lDpy0WM8AHsywOnVrOHaSMfpaiV2igOw8D7svkFkXVA= -github.com/aws/aws-sdk-go-v2/credentials v1.13.20 h1:oZCEFcrMppP/CNiS8myzv9JgOzq2s0d3v3MXYil/mxQ= -github.com/aws/aws-sdk-go-v2/feature/dynamodb/attributevalue v1.2.0 h1:8kvinmbIDObqsWegKP0JjeanYPiA4GUVpAtciNWE+jw= -github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.13.2 h1:jOzQAesnBFDmz93feqKnsTHsXrlwWORNZMFHMV+WLFU= -github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.7.1 h1:p9Dys1g2YdaqMalnp6AwCA+tpMMdJNGw5YYKP/u3sUk= -github.com/aws/aws-sdk-go-v2/internal/configsources v1.1.32 h1:dpbVNUjczQ8Ae3QKHbpHBpfvaVkRdesxpTOe9pTouhU= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.4.26 h1:QH2kOS3Ht7x+u0gHCh06CXL/h6G8LQJFpZfFBYBNboo= -github.com/aws/aws-sdk-go-v2/internal/ini v1.3.28 h1:KeTxcGdNnQudb46oOl4d90f2I33DF/c6q3RnZAmvQdQ= 
-github.com/aws/aws-sdk-go-v2/service/cloudwatch v1.25.9 h1:7jgW378oM948BxuOBarXeeaKSrRaCj7didsdeSwYGGo= -github.com/aws/aws-sdk-go-v2/service/cloudwatchlogs v1.20.9 h1:sXs+JjIwgKA27t+5O8YgXl0cmZpEmctyDVO5y6cMdqA= -github.com/aws/aws-sdk-go-v2/service/dynamodb v1.17.3 h1:2oB4ikNEMLaPtu6lbNFJyTSayBILvrOfa2VfOffcuvU= -github.com/aws/aws-sdk-go-v2/service/dynamodbstreams v1.4.0 h1:QbFWJr2SAyVYvyoOHvJU6sCGLnqNT94ZbWElJMEI1JY= -github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.9.10 h1:dpiPHgmFstgkLG07KaYAewvuptq5kvo52xn7tVSrtrQ= -github.com/aws/aws-sdk-go-v2/service/internal/endpoint-discovery v1.7.23 h1:5AwQnYQT3ZX/N7hPTAx4ClWyucaiqr2esQRMNbJIby0= -github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.9.26 h1:uUt4XctZLhl9wBE1L8lobU3bVN8SNUP7T+olb0bWBO4= -github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.9.0 h1:0BOlTqnNnrEO04oYKzDxMMe68t107pmIotn18HtVonY= -github.com/aws/aws-sdk-go-v2/service/kinesis v1.17.8 h1:9Kk24woetm1Tm4cAZNoJStJW1VQAeh92lLD9XZ4176g= -github.com/aws/aws-sdk-go-v2/service/s3 v1.19.0 h1:5mRAms4TjSTOGYsqKYte5kHr1PzpMJSyLThjF3J+hw0= -github.com/aws/aws-sdk-go-v2/service/sso v1.12.8 h1:5cb3D6xb006bPTqEfCNaEA6PPEfBXxxy4NNeX/44kGk= -github.com/aws/aws-sdk-go-v2/service/ssooidc v1.14.8 h1:NZaj0ngZMzsubWZbrEFSB4rgSQRbFq38Sd6KBxHuOIU= -github.com/aws/aws-sdk-go-v2/service/sts v1.18.9 h1:Qf1aWwnsNkyAoqDqmdM3nHwN78XQjec27LjM6b9vyfI= -github.com/aws/aws-sdk-go-v2/service/timestreamwrite v1.16.0 h1:HHVOprdnZxhM6F5JgljW8nCklfwUyOlbd/wuca6vORA= -github.com/aws/smithy-go v1.13.5 h1:hgz0X/DX0dGqTYpGALqXJoRKRj5oQ7150i5FdTePzO8= -github.com/awslabs/kinesis-aggregation/go v0.0.0-20210630091500-54e17340d32f h1:Pf0BjJDga7C98f0vhw+Ip5EaiE07S3lTKpIYPNS0nMo= -github.com/benbjohnson/clock v1.3.3 h1:g+rSsSaAzhHJYcIQE78hJ3AhyjjtQvleKDjlhdBnIhc= -github.com/benbjohnson/clock v1.3.3/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= -github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= -github.com/beorn7/perks 
v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= -github.com/blues/jsonata-go v1.5.4 h1:XCsXaVVMrt4lcpKeJw6mNJHqQpWU751cnHdCFUq3xd8= -github.com/blues/jsonata-go v1.5.4/go.mod h1:uns2jymDrnI7y+UFYCqsRTEiAH22GyHnNXrkupAVFWI= -github.com/bmatcuk/doublestar/v3 v3.0.0 h1:TQtVPlDnAYwcrVNB2JiGuMc++H5qzWZd9PhkNo5WyHI= -github.com/bufbuild/protocompile v0.4.0 h1:LbFKd2XowZvQ/kajzguUp2DC9UEIQhIq77fZZlaQsNA= -github.com/caio/go-tdigest v3.1.0+incompatible h1:uoVMJ3Q5lXmVLCCqaMGHLBWnbGoN6Lpu7OAUPR60cds= -github.com/cenkalti/backoff v2.2.1+incompatible h1:tNowT99t7UNflLxfYYSlKYsBpXdEet03Pg2g16Swow4= -github.com/cenkalti/backoff/v4 v4.2.0 h1:HN5dHm3WBOgndBH6E8V0q2jIYIR3s9yglV8k/+MN3u4= -github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= -github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/cisco-ie/nx-telemetry-proto v0.0.0-20230117155933-f64c045c77df h1:GmrltUp5Qf5XhT+LmqMDizsgm/6VHTSxPWRdrq21yRo= -github.com/cloudflare/golz4 v0.0.0-20150217214814-ef862a3cdc58 h1:F1EaeKL/ta07PY/k9Os/UFtwERei2/XzGemhpGnBKNg= -github.com/containerd/containerd v1.6.18 h1:qZbsLvmyu+Vlty0/Ex5xc0z2YtKpIsb5n45mAMI+2Ns= -github.com/coocood/freecache v1.2.3 h1:lcBwpZrwBZRZyLk/8EMyQVXRiFl663cCuMOrjCALeto= -github.com/coreos/go-semver v0.3.1 h1:yi21YpKnrx1gt5R+la8n5WgS0kCrsPp33dmEyHReZr4= -github.com/coreos/go-semver v0.3.1/go.mod h1:irMmmIw/7yzSRPWryHsK7EYSg09caPQL03VsM8rvUec= -github.com/couchbase/go-couchbase v0.1.1 h1:ClFXELcKj/ojyoTYbsY34QUrrYCBi/1G749sXSCkdhk= -github.com/couchbase/gomemcached v0.1.3 h1:HIc5qMYNbuhB7zNaiEtj61DCYkquAwrQlf64q7JzdEY= -github.com/couchbase/goutils v0.1.0 h1:0WLlKJilu7IBm98T8nS9+J36lBFVLRUSIUtyD/uWpAE= -github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= -github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 
-github.com/denisenkom/go-mssqldb v0.12.3 h1:pBSGx9Tq67pBOTLmxNuirNTeB8Vjmf886Kx+8Y+8shw= -github.com/devigned/tab v0.1.1 h1:3mD6Kb1mUOYeLpJvTVSDwSg5ZsfSxfvxGRTxRsJsITA= -github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78= -github.com/digitalocean/go-libvirt v0.0.0-20220811165305-15feff002086 h1:FTREXo+EVmU9nOCaQ46PvH0hs1Rt2/diCoTAtxzDxrA= -github.com/dimchansky/utfbom v1.1.1 h1:vV6w1AhK4VMnhBno/TPVCoK9U/LP0PkLCS9tbxHdi/U= -github.com/djherbis/times v1.5.0 h1:79myA211VwPhFTqUk8xehWrsEO+zcIZj0zT8mXPVARU= -github.com/docker/distribution v2.8.2+incompatible h1:T3de5rq0dB1j30rp0sA2rER+m322EBzniBPB6ZIzuh8= -github.com/docker/docker v23.0.4+incompatible h1:Kd3Bh9V/rO+XpTP/BLqM+gx8z7+Yb0AA2Ibj+nNo4ek= -github.com/docker/go-connections v0.4.0 h1:El9xVISelRB7BuFusrZozjnkIM5YnzCViNKohAFqRJQ= -github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= -github.com/doclambda/protobufquery v0.0.0-20220727165953-0da287796ee9 h1:677nbAF3nq56BEZ2R/VMl0wROQqJo4vJ/ZWuzm+vsUU= -github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= -github.com/dynatrace-oss/dynatrace-metric-utils-go v0.5.0 h1:wHGPJSXvwKQVf/XfhjUPyrhpcPKWNy8F3ikH+eiwoBg= -github.com/eapache/go-resiliency v1.3.0 h1:RRL0nge+cWGlxXbUzJ7yMcq6w2XBEr19dCN6HECGaT0= -github.com/eapache/go-xerial-snappy v0.0.0-20230111030713-bf00bc1b83b6 h1:8yY/I9ndfrgrXUbOGObLHKBR4Fl3nZXwM2c7OYTT8hM= -github.com/eapache/queue v1.1.0 h1:YOEu7KNc61ntiQlcEeUIoDTJ2o8mQznoNvUhiigpIqc= -github.com/eclipse/paho.golang v0.10.0 h1:oUGPjRwWcZQRgDD9wVDV7y7i7yBSxts3vcvcNJo8B4Q= -github.com/eclipse/paho.mqtt.golang v1.4.2 h1:66wOzfUHSSI1zamx7jR6yMEI5EuHnT1G6rNA5PM12m4= -github.com/emicklei/go-restful/v3 v3.10.1 h1:rc42Y5YTp7Am7CS630D7JmhRjq4UlEUuEKfrDac4bSQ= -github.com/fatih/color v1.15.0 h1:kOqh6YHBtK8aywxGerMG2Eq3H6Qgoqeo13Bk2Mv/nBs= -github.com/fatih/color v1.15.0/go.mod h1:0h5ZqXfHYED7Bhv2ZJamyIOUej9KtShiJESRwBDUSsw= 
-github.com/form3tech-oss/jwt-go v3.2.5+incompatible h1:/l4kBbb4/vGSsdtB5nUe8L7B9mImVMaBPw9L/0TBHU8= -github.com/fsnotify/fsnotify v1.6.0 h1:n+5WquG0fcWoWp6xPWfHdbskMCQaFnG6PfBrh1Ky4HY= -github.com/fsnotify/fsnotify v1.6.0/go.mod h1:sl3t1tCWJFWoRz9R8WJCbQihKKwmorjAbSClcnxKAGw= -github.com/gabriel-vasile/mimetype v1.4.0 h1:Cn9dkdYsMIu56tGho+fqzh7XmvY2YyGU0FnbhiOsEro= -github.com/go-asn1-ber/asn1-ber v1.5.4 h1:vXT6d/FNDiELJnLb6hGNa309LMsrCoYFvpwHDF0+Y1A= -github.com/go-ldap/ldap/v3 v3.4.4 h1:qPjipEpt+qDa6SI/h1fzuGWoRUY+qqQ9sOZq67/PYUs= -github.com/go-logfmt/logfmt v0.6.0 h1:wGYYu3uicYdqXVgoYbvnkrPVXkuLM1p1ifugDMEdRi4= -github.com/go-logr/logr v1.2.3 h1:2DntVwHkVopvECVRSlL5PSo9eG+cAkDCuckLubN+rq0= -github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= -github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= -github.com/go-openapi/jsonpointer v0.19.6 h1:eCs3fxoIi3Wh6vtgmLTOjdhSpiqphQ+DaPn38N2ZdrE= -github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE= -github.com/go-openapi/swag v0.22.3 h1:yMBqmnQ0gyZvEb/+KzuWZOXgllrXT4SADYbvDaXHv/g= -github.com/go-redis/redis/v7 v7.4.1 h1:PASvf36gyUpr2zdOUS/9Zqc80GbM+9BDyiJSJDDOrTI= -github.com/go-redis/redis/v8 v8.11.5 h1:AcZZR7igkdvfVmQTPnu9WE37LRrO/YrBH5zWyjDC0oI= -github.com/go-sql-driver/mysql v1.6.0 h1:BCTh4TKNUYmOmMUcQ3IipzF5prigylS7XXjEkfCHuOE= -github.com/go-stack/stack v1.8.1 h1:ntEHSVwIt7PNXNpgPmVfMrNhLtgjlmnZha2kOpuRiDw= -github.com/go-stomp/stomp v2.1.4+incompatible h1:D3SheUVDOz9RsjVWkoh/1iCOwD0qWjyeTZMUZ0EXg2Y= -github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y= -github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8= -github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU= -github.com/gofrs/uuid v4.2.0+incompatible h1:yyYWMnhkhrKwwr8gAOcOCYxOOscHgDS9yZgBrnJfGa0= -github.com/gofrs/uuid/v5 v5.0.0 h1:p544++a97kEL+svbcFbCQVM9KFu0Yo25UoISXGNNH9M= 
-github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= -github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= -github.com/golang-jwt/jwt v3.2.1+incompatible h1:73Z+4BJcrTC+KczS6WvTPvRGOp1WmfEP4Q1lOd9Z/+c= -github.com/golang-jwt/jwt/v4 v4.5.0 h1:7cYmW1XlMY7h7ii7UhUyChSgS5wUJEnm9uZVTGqOWzg= -github.com/golang-sql/civil v0.0.0-20190719163853-cb61b32ac6fe h1:lXe2qZdvpiX5WZkZR4hgp4KJVfY3nMkvmwbVkpv1rVY= -github.com/golang-sql/sqlexp v0.1.0 h1:ZCD6MBpcuOVfGVqsEmY5/4FtYiKz6tSyUv9LPEDei6A= -github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= -github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= -github.com/golang/protobuf v1.3.5/go.mod h1:6O5/vntMXwX2lRkT1hjjk0nAC1IDOTvTlVgjlRvqsdk= -github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= -github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= -github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= -github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= -github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= -github.com/google/flatbuffers v23.3.3+incompatible h1:5PJI/WbJkaMTvpGxsHVKG/LurN/KnWXNyGpwSCDgen0= -github.com/google/gnostic v0.6.9 h1:ZK/5VhkoX835RikCHpSUJV9a+S3e1zLh59YnyWeBW+0= -github.com/google/gnxi v0.0.0-20221016143401-2aeceb5a2901 h1:xlsMG0I0F6Ou3a4zRWu3cThivTt2N2V1cZafIloTBTU= -github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= -github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= -github.com/google/go-github/v32 v32.1.0 h1:GWkQOdXqviCPx7Q7Fj+KyPoGm4SwHRh8rheoPhd27II= -github.com/google/go-querystring v1.1.0 h1:AnCroh3fv4ZBgVIf1Iwtovgjaw/GiKJo8M8yD/fhyJ8= 
-github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= -github.com/google/s2a-go v0.1.3 h1:FAgZmpLl/SXurPEZyCMPBIiiYeTbqfjlbdnCNTAkbGE= -github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= -github.com/googleapis/enterprise-certificate-proxy v0.2.3 h1:yk9/cqRKtT9wXZSsRH9aurXEpJX+U6FLtpYTdC3R06k= -github.com/googleapis/gax-go/v2 v2.8.0 h1:UBtEZqx1bjXtOQ5BVTkuYghXrr3N4V123VKJK67vJZc= -github.com/gopcua/opcua v0.3.7 h1:iGjLW3D+ztnjtZQPKsJ0nwibHyDw1m11NfqOU8KSFQ8= -github.com/gophercloud/gophercloud v1.2.0 h1:1oXyj4g54KBg/kFtCdMM6jtxSzeIyg8wv4z1HoGPp1E= -github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 h1:EGx4pi6eqNxGaHF6qqu48+N2wcFQ5qg5FXgOdqsJ5d8= -github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= -github.com/gorilla/mux v1.8.0 h1:i40aqfkR1h2SlN9hojwV5ZA91wcXFOvkdNIeFDP5koI= -github.com/gorilla/websocket v1.5.0 h1:PPwGk2jz7EePpoHN/+ClbZu8SPxiqlu12wZP/3sWmnc= -github.com/gosnmp/gosnmp v1.35.0 h1:EuWWNPxTCdAUx2/NbQcSa3WdNxjzpy4Phv57b4MWpJM= -github.com/gosnmp/gosnmp v1.35.0/go.mod h1:2AvKZ3n9aEl5TJEo/fFmf/FGO4Nj4cVeEc5yuk88CYc= -github.com/grid-x/modbus v0.0.0-20211113184042-7f2251c342c9 h1:Q7e9kXS3sRbTjsNDKazbcbDSGAKjFdk096M3qYbwNpE= -github.com/grid-x/serial v0.0.0-20211107191517-583c7356b3aa h1:Rsn6ARgNkXrsXJIzhkE4vQr5Gbx2LvtEMv4BJOK4LyU= -github.com/gwos/tcg/sdk v0.0.0-20220621192633-df0eac0a1a4c h1:pVr0TkSFnMP4BWSsEak/4bxD8/K+foJ9V8DGyZ6PIDE= -github.com/hailocab/go-hostpool v0.0.0-20160125115350-e80d13ce29ed h1:5upAirOpQc1Q53c0bnx2ufif5kANL7bfZWcc6VJWJd8= -github.com/harlow/kinesis-consumer v0.3.6-0.20211204214318-c2b9f79d7ab6 h1:38nI+nE+oUmLmlNjuByhvnmuBrcQVLNkOJhSSM4eJv0= -github.com/hashicorp/consul/api v1.20.0 h1:9IHTjNVSZ7MIwjlW3N3a7iGiykCMDpxZu8jsxFJh0yc= -github.com/hashicorp/errwrap v1.1.0 h1:OxrOeh75EUXMY8TBjag2fzXGZ40LB6IKw45YeGUDY2I= -github.com/hashicorp/go-cleanhttp v0.5.2 
h1:035FKYIWjmULyFRBKPs8TBQoi0x6d9G4xc9neXJWAZQ= -github.com/hashicorp/go-hclog v1.4.0 h1:ctuWFGrhFha8BnnzxqeRGidlEcQkDyL5u8J8t5eA11I= -github.com/hashicorp/go-immutable-radix v1.3.1 h1:DKHmCUm2hRBK510BaiZlwvpD40f8bJFeZnpfm2KLowc= -github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo= -github.com/hashicorp/go-rootcerts v1.0.2 h1:jzhAVGtqPKbwpyCPELlgNWhE1znq+qwJtW5Oi2viEzc= -github.com/hashicorp/go-uuid v1.0.3 h1:2gKiV6YVmrJ1i2CKKa9obLvRieoRGviZFL26PcT/Co8= -github.com/hashicorp/golang-lru v0.6.0 h1:uL2shRDx7RTrOrTCUZEGP/wJUFiUI8QT6E7z5o8jga4= -github.com/hashicorp/packer-plugin-sdk v0.3.1 h1:Gr/mnihsdUcPfGiruFL93BQkiFh3EFPwyxxTWkwvRsQ= -github.com/hashicorp/serf v0.10.1 h1:Z1H2J60yRKvfDYAOZLd2MU0ND4AH/WDz7xYHDWQsIPY= -github.com/huandu/xstrings v1.3.2 h1:L18LIDzqlW6xN2rEkpdV8+oL/IXWJ1APd+vsdYy4Wdw= -github.com/imdario/mergo v0.3.13 h1:lFzP57bqS/wsqKssCGmtLAb8A0wKjLGrve2q3PPVcBk= -github.com/influxdata/go-syslog/v3 v3.0.0 h1:jichmjSZlYK0VMmlz+k4WeOQd7z745YLsvGMqwtYt4I= -github.com/influxdata/influxdb-observability/common v0.3.3 h1:fzsgJKiV/bucNPRYggLE1F6UgpePQaYh72Lqj1rPEmI= -github.com/influxdata/influxdb-observability/influx2otel v0.3.3 h1:KWesgMC0sqRLfvPZXnCzJauCZ82XoHtKTFJVKmEk63M= -github.com/influxdata/influxdb-observability/otel2influx v0.3.3 h1:zdesvjHJYXccZ4vd6hP6vXwbd6YbAj7AGMhOjk9pt0k= -github.com/influxdata/line-protocol/v2 v2.2.1 h1:EAPkqJ9Km4uAxtMRgUubJyqAr6zgWM0dznKMLRauQRE= -github.com/influxdata/tail v1.0.1-0.20210707231403-b283181d1fa7 h1:0rQOs1VHLVFpAAOIR0mJEvVOIaMYFgYdreeVbgI9sII= -github.com/influxdata/telegraf v1.26.3 h1:wawD3VTdnPDbHnJ1RBGgCf0YB7vlxREZ70rvEepHdGs= -github.com/influxdata/telegraf v1.26.3/go.mod h1:w+VUZ4NRDzfhRmhEdBbbNZBNT7E8qRkLiL73j/pD0ug= -github.com/influxdata/toml v0.0.0-20190415235208-270119a8ce65 h1:vvyMtD5LTJc1W9sQKjDkAWdcg0478CszSdzlHtiAXCY= -github.com/influxdata/toml v0.0.0-20190415235208-270119a8ce65/go.mod h1:zApaNFpP/bTpQItGZNNUMISDMDAnTXu9UqJ4yT3ocz8= -github.com/influxdata/wlog 
v0.0.0-20160411224016-7c63b0a71ef8 h1:W2IgzRCb0L9VzMujq/QuTaZUKcH8096jWwP519mHN6Q= -github.com/intel/iaevents v1.1.0 h1:FzxMBfXk/apG2EUXUCfaq3gUQ+q+TgZ1HNMjjUILUGE= -github.com/jackc/chunkreader/v2 v2.0.1 h1:i+RDz65UE+mmpjTfyz0MoVTnzeYxroil2G82ki7MGG8= -github.com/jackc/pgconn v1.13.0 h1:3L1XMNV2Zvca/8BYhzcRFS70Lr0WlDg16Di6SFGAbys= -github.com/jackc/pgio v1.0.0 h1:g12B9UwVnzGhueNavwioyEEpAmqMe1E/BN9ES+8ovkE= -github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM= -github.com/jackc/pgproto3/v2 v2.3.1 h1:nwj7qwf0S+Q7ISFfBndqeLwSwxs+4DPsbRFjECT1Y4Y= -github.com/jackc/pgservicefile v0.0.0-20200714003250-2b9c44734f2b h1:C8S2+VttkHFdOOCXJe+YGfa4vHYwlt4Zx+IVXQ97jYg= -github.com/jackc/pgtype v1.12.0 h1:Dlq8Qvcch7kiehm8wPGIW0W3KsCCHJnRacKW0UM8n5w= -github.com/jackc/pgx/v4 v4.17.1 h1:tASdE79tX9LOQu3MMvioWT6YaZkf58ZhmLHhV4sv5WM= -github.com/jackc/puddle v1.3.0 h1:eHK/5clGOatcjX3oWGBO/MpxpbHzSwud5EWTSCI+MX0= -github.com/jaegertracing/jaeger v1.38.0 h1:rDQ36TnSxUX4gTskMQzEdpieS0BGYdfXXnUJmGnNMGw= -github.com/james4k/rcon v0.0.0-20120923215419-8fbb8268b60a h1:JxcWget6X/VfBMKxPIc28Jel37LGREut2fpV+ObkwJ0= -github.com/jcmturner/aescts/v2 v2.0.0 h1:9YKLH6ey7H4eDBXW8khjYslgyqG2xZikXP0EQFKrle8= -github.com/jcmturner/dnsutils/v2 v2.0.0 h1:lltnkeZGL0wILNvrNiVCR6Ro5PGU/SeBvVO/8c/iPbo= -github.com/jcmturner/gofork v1.7.6 h1:QH0l3hzAU1tfT3rZCnW5zXl+orbkNMMRGJfdJjHVETg= -github.com/jcmturner/gokrb5/v8 v8.4.3 h1:iTonLeSJOn7MVUtyMT+arAn5AKAPrkilzhGw8wE/Tq8= -github.com/jcmturner/rpc/v2 v2.0.3 h1:7FXXj8Ti1IaVFpSAziCZWNzbNuZmnvw/i6CqLNdWfZY= -github.com/jeremywohl/flatten/v2 v2.0.0-20211013061545-07e4a09fb8e4 h1:eA9wi6ZzpIRobvXkn/S2Lyw1hr2pc71zxzOPl7Xjs4w= -github.com/jhump/protoreflect v1.15.1 h1:HUMERORf3I3ZdX05WaQ6MIpd/NJ434hTp5YiKgfCL6c= -github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg= -github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= -github.com/josharian/native v1.0.0 
h1:Ts/E8zCSEsG17dUqv7joXJFybuMLjQfWE04tsBODTxk= -github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA= -github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= -github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo= -github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU= -github.com/karrick/godirwalk v1.16.2 h1:eY2INUWoB2ZfpF/kXasyjWJ3Ncuof6qZuNWYZFN3kAI= -github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 h1:Z9n2FFNUXsshfwJMBgNA0RU6/i7WVaAegv3PtuIHPMs= -github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= -github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= -github.com/klauspost/asmfmt v1.3.2 h1:4Ri7ox3EwapiOjCki+hw14RyKk201CN4rzyCJRFLpK4= -github.com/klauspost/compress v1.16.5 h1:IFV2oUNUzZaz+XyusxpLzpzS8Pt5rh0Z16For/djlyI= -github.com/klauspost/cpuid/v2 v2.2.4 h1:acbojRNwl3o09bUq+yDCtZFc1aiwaAAxtcn8YkZXnvk= -github.com/knadh/koanf v1.5.0 h1:q2TSd/3Pyc/5yP9ldIrSdIz26MCcyNQzW0pEAugLPNs= -github.com/kolo/xmlrpc v0.0.0-20220921171641-a4b6fa1dd06b h1:udzkj9S/zlT5X367kqJis0QP7YMxobob6zhzq6Yre00= -github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= -github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= -github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= -github.com/leodido/ragel-machinery v0.0.0-20181214104525-299bdde78165 h1:bCiVCRCs1Heq84lurVinUPy19keqGEe4jh5vtK37jcg= -github.com/linkedin/goavro/v2 v2.12.0 h1:rIQQSj8jdAUlKQh6DttK8wCRv4t4QO09g1C4aBWXslg= -github.com/logzio/azure-monitor-metrics-receiver v1.0.0 h1:TAzhIZL2ueyyc81qIw8FGg4nUbts4Hvc3oOxSobY1IA= -github.com/lufia/plan9stats v0.0.0-20220913051719-115f729f3c8c h1:VtwQ41oftZwlMnOEbMWQtSEUgU64U4s+GHk7hZK+jtY= -github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0VQdvPDY= 
-github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= -github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= -github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= -github.com/mattn/go-ieproxy v0.0.1 h1:qiyop7gCflfhwCzGyeT0gro3sF9AIg9HU98JORTkqfI= -github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= -github.com/mattn/go-isatty v0.0.17 h1:BTarxUcIeDqL27Mc+vyvdWYSL28zpIhv3RoTdsLMPng= -github.com/mattn/go-isatty v0.0.17/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= -github.com/matttproud/golang_protobuf_extensions v1.0.4 h1:mmDVorXM7PCGKw94cs5zkfA9PSy5pEvNWRP0ET0TIVo= -github.com/matttproud/golang_protobuf_extensions v1.0.4/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4= -github.com/mdlayher/apcupsd v0.0.0-20220319200143-473c7b5f3c6a h1:JOlLsLUQnokTyWWwEvOVoKH3XUl6oDMP8jisO54l6J8= -github.com/mdlayher/genetlink v1.2.0 h1:4yrIkRV5Wfk1WfpWTcoOlGmsWgQj3OtQN9ZsbrE+XtU= -github.com/mdlayher/netlink v1.6.0 h1:rOHX5yl7qnlpiVkFWoqccueppMtXzeziFjWAjLg6sz0= -github.com/mdlayher/socket v0.2.3 h1:XZA2X2TjdOwNoNPVPclRCURoX/hokBY8nkTmRZFEheM= -github.com/microsoft/ApplicationInsights-Go v0.4.4 h1:G4+H9WNs6ygSCe6sUyxRc2U81TI5Es90b2t/MwX5KqY= -github.com/miekg/dns v1.1.51 h1:0+Xg7vObnhrz/4ZCZcZh7zPXlmU0aveS2HDBd0m0qSo= -github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 h1:AMFGa4R4MiIpspGNG7Z948v4n35fFGB3RR3G/ry4FWs= -github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3 h1:+n/aFZefKZp7spd8DFdX7uMikMLXX4oubIzJF4kv/wI= -github.com/minio/highwayhash v1.0.2 h1:Aak5U0nElisjDCfPSG79Tgzkn2gl66NxOMspRrKnA/g= -github.com/mitchellh/copystructure v1.2.0 h1:vpKXTN4ewci03Vljg/q9QvCGUDttBOGBIa15WveJJGw= -github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y= -github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= -github.com/mitchellh/mapstructure v1.5.0 
h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= -github.com/mitchellh/reflectwalk v1.0.2 h1:G2LzWKi524PWgd3mLHV8Y5k7s6XUvT0Gef6zxSIeXaQ= -github.com/moby/ipvs v1.1.0 h1:ONN4pGaZQgAx+1Scz5RvWV4Q7Gb+mvfRh3NsPS+1XQQ= -github.com/moby/patternmatcher v0.5.0 h1:YCZgJOeULcxLw1Q+sVR636pmS7sPEn1Qo2iAN6M7DBo= -github.com/moby/sys/sequential v0.5.0 h1:OPvI35Lzn9K04PBbCLW0g4LcFAJgHsvXsRyewg5lXtc= -github.com/moby/term v0.0.0-20221128092401-c43b287e0e0f h1:J/7hjLaHLD7epG0m6TBMGmp4NQ+ibBYLfeyJWdAIFLA= -github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= -github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= -github.com/montanaflynn/stats v0.6.6 h1:Duep6KMIDpY4Yo11iFsvyqJDyfzLF9+sndUKT+v64GQ= -github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A= -github.com/multiplay/go-ts3 v1.1.0 h1:OWOjRxBCRds+FbpyM1JKSscRbbmYr/IIrh6V78CM5Xw= -github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= -github.com/naoina/go-stringutil v0.1.0 h1:rCUeRUHjBjGTSHl0VC00jUPLz8/F9dDzYI70Hzifhks= -github.com/naoina/go-stringutil v0.1.0/go.mod h1:XJ2SJL9jCtBh+P9q5btrd/Ylo8XwT/h1USek5+NqSA0= -github.com/nats-io/jwt/v2 v2.3.0 h1:z2mA1a7tIf5ShggOFlR1oBPgd6hGqcDYsISxZByUzdI= -github.com/nats-io/nats-server/v2 v2.9.9 h1:bmj0RhvHOc8+z5/RuhI38GqPwtkFAHQuU3e99FVA/TI= -github.com/nats-io/nats.go v1.24.0 h1:CRiD8L5GOQu/DcfkmgBcTTIQORMwizF+rPk6T0RaHVQ= -github.com/nats-io/nkeys v0.3.0 h1:cgM5tL53EvYRU+2YLXIK0G2mJtK12Ft9oeooSZMA2G8= -github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw= -github.com/netsampler/goflow2 v1.3.3 h1:uheCMgWwbaHnVdsvc2bqbdQe93E73pVF77WGu/kPE7U= -github.com/newrelic/newrelic-telemetry-sdk-go v0.8.1 h1:6OX5VXMuj2salqNBc41eXKz6K+nV6OB/hhlGnAKCbwU= -github.com/nsqio/go-nsq v1.1.0 h1:PQg+xxiUjA7V+TLdXw7nVrJ5Jbl3sN86EhGCQj4+FYE= -github.com/olivere/elastic v6.2.37+incompatible 
h1:UfSGJem5czY+x/LqxgeCBgjDn6St+z8OnsCuxwD3L0U= -github.com/open-telemetry/opentelemetry-collector-contrib/pkg/pdatautil v0.73.0 h1:b62Oq3dniQm3eg8OcnBnlZCyZ4O85iyKPFuCIeYNCKk= -github.com/openconfig/gnmi v0.9.1 h1:hVOdLTaRjdy68oCGJbkf2vrmnUoQ5xbINqBOAMix4xM= -github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= -github.com/opencontainers/image-spec v1.1.0-rc2 h1:2zx/Stx4Wc5pIPDvIxHXvXtQFW/7XWJGmnM7r3wg034= -github.com/opencontainers/runc v1.1.5 h1:L44KXEpKmfWDcS02aeGm8QNTFXTo2D+8MYGDIJ/GDEs= -github.com/opensearch-project/opensearch-go/v2 v2.2.0 h1:6RicCBiqboSVtLMjSiKgVQIsND4I3sxELg9uwWe/TKM= -github.com/opentracing/opentracing-go v1.2.1-0.20220228012449-10b1cf09e00b h1:FfH+VrHHk6Lxt9HdVS0PXzSXFyS2NbZKXv33FYPol0A= -github.com/p4lang/p4runtime v1.3.0 h1:3fUhHj0JtsGcL2Bh0uxpACdBJBDqpZyLgj93tqKzoJY= -github.com/pborman/ansi v1.0.0 h1:OqjHMhvlSuCCV5JT07yqPuJPQzQl+WXsiZ14gZsqOrQ= -github.com/philhofer/fwd v1.1.2 h1:bnDivRJ1EWPjUIRXV5KfORO897HTbpFAQddBdE8t7Gw= -github.com/philhofer/fwd v1.1.2/go.mod h1:qkPdfjR2SIEbspLqpe1tO4n5yICnr2DY7mqEx2tUTP0= -github.com/pierrec/lz4/v4 v4.1.17 h1:kV4Ip+/hUBC+8T6+2EgburRtkE9ef4nbY3f4dFhGjMc= -github.com/pion/dtls/v2 v2.2.6 h1:yXMxKr0Skd+Ub6A8UqXTRLSywskx93ooMRHsQUtd+Z4= -github.com/pion/logging v0.2.2 h1:M9+AIj/+pxNsDfAT64+MAVgJO0rsyLnoJKCqf//DoeY= -github.com/pion/transport/v2 v2.0.2 h1:St+8o+1PEzPT51O9bv+tH/KYYLMNR5Vwm5Z3Qkjsywg= -github.com/pion/udp/v2 v2.0.1 h1:xP0z6WNux1zWEjhC7onRA3EwwSliXqu1ElUZAQhUP54= -github.com/pkg/browser v0.0.0-20210911075715-681adbf594b8 h1:KoWmjvw+nsYOo29YJK9vDA65RGE3NrOnUtO7a+RF9HU= -github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= -github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= -github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/power-devops/perfstat 
v0.0.0-20220216144756-c35f1ee13d7c h1:NRoLoZvkBTKvR5gQLgA3e0hqjkY9u1wm+iOL45VN/qI= -github.com/prometheus-community/pro-bing v0.1.0 h1:zjzLGhfNPP0bP1OlzGB+SJcguOViw7df12LPg2vUJh8= -github.com/prometheus/client_golang v1.15.0 h1:5fCgGYogn0hFdhyhLbw7hEsWxufKtY9klyvdNfFlFhM= -github.com/prometheus/client_golang v1.15.0/go.mod h1:e9yaBhRPU2pPNsZwE+JdQl0KEt1N9XgF6zxWmaC0xOk= -github.com/prometheus/client_model v0.3.0 h1:UBgGFHqYdG/TPFD1B1ogZywDqEkwp3fBMvqdiQ7Xew4= -github.com/prometheus/client_model v0.3.0/go.mod h1:LDGWKZIo7rky3hgvBe+caln+Dr3dPggB5dvjtD7w9+w= -github.com/prometheus/common v0.42.0 h1:EKsfXEYo4JpWMHH5cg+KOUWeuJSov1Id8zGR8eeI1YM= -github.com/prometheus/common v0.42.0/go.mod h1:xBwqVerjNdUDjgODMpudtOMwlOwf2SaTr1yjz4b7Zbc= -github.com/prometheus/procfs v0.9.0 h1:wzCHvIvM5SxWqYvwgVL7yJY8Lz3PKn49KQtpgMYJfhI= -github.com/prometheus/procfs v0.9.0/go.mod h1:+pB4zwohETzFnmlpe6yd2lSc+0/46IYZRB/chUwxUZY= -github.com/prometheus/prometheus v0.42.0 h1:G769v8covTkOiNckXFIwLx01XE04OE6Fr0JPA0oR2nI= -github.com/prometheus/prometheus v0.42.0/go.mod h1:Pfqb/MLnnR2KK+0vchiaH39jXxvLMBk+3lnIGP4N7Vk= -github.com/rabbitmq/amqp091-go v1.8.0 h1:GBFy5PpLQ5jSVVSYv8ecHGqeX7UTLYR4ItQbDCss9MM= -github.com/rcrowley/go-metrics v0.0.0-20201227073835-cf1acfcdf475 h1:N/ElC8H3+5XpJzTSTfLsJV/mx9Q9g7kxmchpfZyxgzM= -github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE= -github.com/riemann/riemann-go-client v0.5.1-0.20211206220514-f58f10cdce16 h1:bGXoxRwUpPTCaQ86DRE+3wqE9vh3aH8W0HH5L/ygOFM= -github.com/robbiet480/go.nut v0.0.0-20220219091450-bd8f121e1fa1 h1:YmFqprZILGlF/X3tvMA4Rwn3ySxyE3hGUajBHkkaZbM= -github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= -github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA= -github.com/safchain/ethtool v0.3.0 h1:gimQJpsI6sc1yIqP/y8GYgiXn/NjgvpM0RNoWLVVmP0= -github.com/samuel/go-zookeeper v0.0.0-20200724154423-2164a8ac840e 
h1:CGjiMQ0wMH4wtNWrlj6kiTbkPt2F3rbYnhGX6TWLfco= -github.com/shirou/gopsutil/v3 v3.23.3 h1:Syt5vVZXUDXPEXpIBt5ziWsJ4LdSAAxF4l/xZeQgSEE= -github.com/shoenig/go-m1cpu v0.1.4 h1:SZPIgRM2sEF9NJy50mRHu9PKGwxyyTTJIWvCtgVbozs= -github.com/showwin/speedtest-go v1.4.2 h1:3YjBajURQTJCv/rVwJsd5UtCYlaiqCihg5NhPxJapk8= -github.com/signalfx/com_signalfx_metrics_protobuf v0.0.3 h1:32k2QLgsKhcEs55q4REPKyIadvid5FPy2+VMgvbmKJ0= -github.com/signalfx/gohistogram v0.0.0-20160107210732-1ccfd2ff5083 h1:WsShHmu12ZztYPfh9b+I+VjYD1o8iOHhB67WZCMEEE8= -github.com/signalfx/golib/v3 v3.3.50 h1:TTBpfzsO00F8ep6rhLgBmRIPUpRqBenacezjE4xCweI= -github.com/signalfx/sapm-proto v0.12.0 h1:OtOe+Jm8L61Ml8K6X8a89zc8/RlaaMRElCImeGKR/Ew= -github.com/sirupsen/logrus v1.9.0 h1:trlNQbNUG3OdDrDil03MCb1H2o9nJ1x4/5LYw7byDE0= -github.com/sleepinggenius2/gosmi v0.4.4 h1:xgu+Mt7CptuB10IPt3SVXBAA9tARToT4B9xGzjjxQX8= -github.com/sleepinggenius2/gosmi v0.4.4/go.mod h1:l8OniPmd3bJzw0MXP2/qh7AhP/e+bTY2CNivIhsnDT0= -github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d h1:zE9ykElWQ6/NYmHa3jpm/yHnI4xSofP+UP6SpjHcSeM= -github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc= -github.com/smartystreets/goconvey v1.6.4 h1:fv0U8FUIMPNf1L9lnHLvLhgicrIVChEkdzIKYqbNC9s= -github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA= -github.com/snowflakedb/gosnowflake v1.6.13 h1:r8iozak/p3P2jYfjF3EbeteqMMzPWjwmVrdENJDW6EI= -github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= -github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= -github.com/stretchr/objx v0.5.0 h1:1zr/of2m5FGMsad5YfcqgdqdWrIhu+EBEJRhR1U7z/c= -github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= -github.com/stretchr/testify v1.2.2/go.mod 
h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= -github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= -github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= -github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ8= -github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= -github.com/testcontainers/testcontainers-go v0.18.0 h1:8RXrcIQv5xX/uBOSmZd297gzvA7F0yuRA37/918o7Yg= -github.com/thomasklein94/packer-plugin-libvirt v0.3.4 h1:K+NkHFcZuiUTp4ZiDdBhWRMZiSMdsXwGuzyg4THKDAU= -github.com/tidwall/gjson v1.14.4 h1:uo0p8EbA09J7RQaflQ1aBRffTR7xedD2bcIVSYxLnkM= -github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= -github.com/tidwall/pretty v1.2.0 h1:RWIZEg2iJ8/g6fDDYzMpobmaoGh5OLl4AXtGUGPcqCs= -github.com/tinylib/msgp v1.1.8 h1:FCXC1xanKO4I8plpHGH2P7koL/RzZs12l/+r7vakfm0= -github.com/tinylib/msgp v1.1.8/go.mod h1:qkpG+2ldGg4xRFmx+jfTvZPxfGFhi64BcnL9vkCm/Tw= -github.com/tklauser/go-sysconf v0.3.11 h1:89WgdJhk5SNwJfu+GKyYveZ4IaJ7xAkecBo+KdJV0CM= -github.com/tklauser/numcpus v0.6.0 h1:kebhY2Qt+3U6RNK7UqpYNA+tJ23IBEGKkB7JQBfDYms= -github.com/uber/jaeger-client-go v2.30.0+incompatible h1:D6wyKGCecFaSRUpo8lCVbaOOb6ThwMmTEbhRwtKR97o= -github.com/uber/jaeger-lib v2.4.1+incompatible h1:td4jdvLcExb4cBISKIpHuGoVXh+dVKhn2Um6rjCsSsg= -github.com/vapourismo/knx-go v0.0.0-20220829185957-fb5458a5389d h1:BJMc7MNW/p80cCkC46JimNuowOWCnSSW5IHjtUrXzNk= -github.com/vishvananda/netlink v1.2.1-beta.2 h1:Llsql0lnQEbHj0I1OuKyp8otXp0r3q0mPkuhwHfStVs= -github.com/vishvananda/netns v0.0.4 h1:Oeaw1EM2JMxD51g9uhtC0D7erkIjgmj8+JZc26m1YX8= -github.com/vjeantet/grok v1.0.1 h1:2rhIR7J4gThTgcZ1m2JY4TrJZNgjn985U28kT2wQrJ4= -github.com/vmware/govmomi v0.28.1-0.20220921224932-b4b508abf208 h1:IDVzGQ2aczmTEfTos4hzmFw20tGQ4zZsVnel9C6VEpA= 
-github.com/wavefronthq/wavefront-sdk-go v0.13.0 h1:3s9maJmzI4orW+hiVBfCNp/SIu8ISXi6rtewmDGzheE= -github.com/wvanbergen/kafka v0.0.0-20171203153745-e2edea948ddf h1:TOV5PC6fIWwFOFra9xJfRXZcL2pLhMI8oNuDugNxg9Q= -github.com/wvanbergen/kazoo-go v0.0.0-20180202103751-f72d8611297a h1:ILoU84rj4AQ3q6cjQvtb9jBjx4xzR/Riq/zYhmDQiOk= -github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= -github.com/xdg-go/pbkdf2 v1.0.0 h1:Su7DPu48wXMwC3bs7MCNG+z4FhcyEuz5dlvchbq0B0c= -github.com/xdg-go/scram v1.1.2 h1:FHX5I5B4i4hKRVRBCFRxq1iQRej7WO3hhBuJf+UUySY= -github.com/xdg-go/stringprep v1.0.4 h1:XLI/Ng3O1Atzq0oBs3TWm+5ZVgkq2aqdlvP9JtoZ6c8= -github.com/xdg/scram v1.0.5 h1:TuS0RFmt5Is5qm9Tm2SoD89OPqe4IRiFtyFY4iwWXsw= -github.com/xdg/stringprep v1.0.3 h1:cmL5Enob4W83ti/ZHuZLuKD/xqJfus4fVPwE+/BDm+4= -github.com/youmark/pkcs8 v0.0.0-20201027041543-1326539a0a0a h1:fZHgsYlfvtyqToslyjUt3VOPF4J7aK/3MPcK7xp3PDk= -github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= -github.com/yuin/gopher-lua v0.0.0-20200816102855-ee81675732da h1:NimzV1aGyq29m5ukMK0AMWEhFaL/lrEOaephfuoiARg= -github.com/yusufpapurcu/wmi v1.2.2 h1:KBNDSne4vP5mbSWnJbO+51IMOXJB67QiYCSBrubbPRg= -github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0= -go.mongodb.org/mongo-driver v1.11.2 h1:+1v2rDQUWNcGW7/7E0Jvdz51V38XXxJfhzbV17aNHCw= -go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0= -go.opentelemetry.io/collector v0.73.0 h1:oEBFtf5WcXiIPGXcjOM5gSQ3GNh/3d6pHf0IThhGmfw= -go.opentelemetry.io/collector/component v0.73.0 h1:ka24yVJoVETCru+l5Fm85xGc2y0HwvGfYwyRe7qmjq0= -go.opentelemetry.io/collector/confmap v0.73.0 h1:tC8x8sDk7JQ3QcbosqrxLe756sYcg4iUdTXsx7Ie4CM= -go.opentelemetry.io/collector/consumer v0.73.0 h1:gy89oaG198A7KGbXIsMIdN4lWVQqqSdx6dsBCfzLujU= 
-go.opentelemetry.io/collector/featuregate v0.73.0 h1:hpHKXmRiJqMLefIzXwIuqDo9df2HcI/66IAKLo+g7nc= -go.opentelemetry.io/collector/pdata v1.0.0-rcv0011 h1:7lT0vseP89mHtUpvgmWYRvQZ0eY+SHbVsnXY20xkoMg= -go.opentelemetry.io/collector/semconv v0.73.0 h1:gF4f6z1q8YfWzzo/gPKysjFmmM4Pv4nC2bWrTPxTPaE= -go.opentelemetry.io/otel v1.14.0 h1:/79Huy8wbf5DnIPhemGB+zEPVwnN6fuQybr/SRXa6hM= -go.opentelemetry.io/otel/metric v0.37.0 h1:pHDQuLQOZwYD+Km0eb657A25NaRzy0a+eLyKfDXedEs= -go.opentelemetry.io/otel/sdk v1.14.0 h1:PDCppFRDq8A1jL9v6KMI6dYesaq+DFcDZvjsoGvxGzY= -go.opentelemetry.io/otel/sdk/metric v0.37.0 h1:haYBBtZZxiI3ROwSmkZnI+d0+AVzBWeviuYQDeBWosU= -go.opentelemetry.io/otel/trace v1.14.0 h1:wp2Mmvj41tDsyAJXiWDWpfNsOiIyd38fy85pyKcFq/M= -go.uber.org/atomic v1.10.0 h1:9qC72Qh0+3MqyJbAn8YU5xVq1frD8bn3JtD2oXtafVQ= -go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= -go.uber.org/zap v1.24.0 h1:FiJd5l1UOLj0wCgbSE0rwwXHzEdAZS6hiiSnxJN/D60= -golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= -golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= -golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/crypto v0.31.0 h1:ihbySMvVjLAeSH1IbfcRTkD/iNscyz8rGzjF/E5hV6U= -golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= -golang.org/x/exp v0.0.0-20230307190834-24139beb5833 h1:SChBja7BCQewoTAU7IgvucQKMIXrEpFxNMs0spT3/5s= -golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= -golang.org/x/mod v0.7.0/go.mod 
h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= -golang.org/x/mod v0.17.0 h1:zY54UmvipHiNd+pm+m0x9KhZ9hl1/7QNMyxXbc6ICqA= -golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= -golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= -golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= -golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= -golang.org/x/net v0.3.0/go.mod h1:MBQ8lrhLObU/6UmLb4fmbmk5OcyYmqtbGd/9yIeKjEE= -golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac= -golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= -golang.org/x/oauth2 v0.7.0 h1:qe6s0zUXlPX80/dITx3440hWZ7GwMwgDDyrSGTPJG/g= -golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ= -golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 
-golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20191001151750-bb3f8db39f24/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.3.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA= -golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= -golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= -golang.org/x/term v0.3.0/go.mod h1:q750SLmJuPmVoN1blW3UFBPREJfb1KmY3vwxfr+nFDA= -golang.org/x/term v0.27.0 h1:WP60Sv1nlK1T6SupCHbXzSaN0b9wUmsPoRS9b61A23Q= -golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= -golang.org/x/text v0.5.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= -golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo= -golang.org/x/text 
v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= -golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4= -golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= -golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= -golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= -golang.org/x/tools v0.4.0/go.mod h1:UE5sM2OK9E/d67R0ANs2xJizIymRP5gJU295PvKXxjQ= -golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d h1:vU5i/LfpvrRCpgM/VPfJLg5KjxD3E+hfT1SH+d9zLwg= -golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 h1:H2TDz8ibqkAF6YGhCdN3jS9O0/s90v0rJh3X/OLHEUk= -golang.zx2c4.com/wireguard v0.0.0-20211209221555-9c9e7e272434 h1:3zl8RkJNQ8wfPRomwv/6DBbH2Ut6dgMaWTxM0ZunWnE= -golang.zx2c4.com/wireguard/wgctrl v0.0.0-20211230205640-daad0b7ba671 h1:tJAYx7pB6b5bNqi7XatStqFT2zFAxhXcGDq1R6FqqjU= -google.golang.org/api v0.121.0 h1:8Oopoo8Vavxx6gt+sgs8s8/X60WBAtKQq6JqnkF+xow= -google.golang.org/appengine v1.6.7 h1:FZR1q0exgwxzPzp/aF+VccGrSfxfPpkBqjIIEq3ru6c= -google.golang.org/genproto v0.0.0-20230526161137-0005af68ea54 
h1:9NWlQfY2ePejTmfwUH1OWwmznFa+0kKcHGPDvcPza9M= -google.golang.org/genproto/googleapis/api v0.0.0-20230525234035-dd9d682886f9 h1:m8v1xLLLzMe1m5P+gCTF8nJB9epwZQUBERm20Oy1poQ= -google.golang.org/genproto/googleapis/rpc v0.0.0-20230525234030-28d5490b6b19 h1:0nDDozoAU19Qb2HwhXadU8OcsiO/09cnTqhUtq2MEOM= -google.golang.org/genproto/googleapis/rpc v0.0.0-20230525234030-28d5490b6b19/go.mod h1:66JfowdXAEgad5O9NnYcsNPLCPZJD++2L9X0PCMODrA= -google.golang.org/grpc v1.57.2 h1:uw37EN34aMFFXB2QPW7Tq6tdTbind1GpRxw5aOX3a5k= -google.golang.org/grpc v1.57.2/go.mod h1:Sd+9RMTACXwmub0zcNY2c4arhtrbBYD1AUHI/dt16Mo= -google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= -google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= -google.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cng= -google.golang.org/protobuf v1.30.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= -gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= -gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= -gopkg.in/fatih/pool.v2 v2.0.0 h1:xIFeWtxifuQJGk/IEPKsTduEKcKvPmhoiVDGpC40nKg= -gopkg.in/fsnotify.v1 v1.4.7 h1:xOHLXZwVvI9hhs+cLKq5+I5onOuwQLhQwiu63xxlHs4= -gopkg.in/gorethink/gorethink.v3 v3.0.5 h1:e2Uc/Xe+hpcVQFsj6MuHlYog3r0JYpnTzwDj/y2O4MU= -gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= -gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA= -gopkg.in/olivere/elastic.v5 v5.0.86 h1:xFy6qRCGAmo5Wjx96srho9BitLhZl2fcnpuidPwduXM= -gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= -gopkg.in/tomb.v2 v2.0.0-20161208151619-d5d1b5820637 h1:yiW+nvdHb9LVqSHQBXfZCieqV4fzYhNBql77zY0ykqs= -gopkg.in/yaml.v2 v2.2.2/go.mod 
h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= -gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -honnef.co/go/tools v0.2.2 h1:MNh1AVMyVX23VUHE2O27jm6lNj3vjO5DexS4A1xvnzk= -k8s.io/api v0.26.2 h1:dM3cinp3PGB6asOySalOZxEG4CZ0IAdJsrYZXE/ovGQ= -k8s.io/apimachinery v0.26.2 h1:da1u3D5wfR5u2RpLhE/ZtZS2P7QvDgLZTi9wrNZl/tQ= -k8s.io/apimachinery v0.26.2/go.mod h1:ats7nN1LExKHvJ9TmwootT00Yz05MuYqPXEXaVeOy5I= -k8s.io/client-go v0.26.2 h1:s1WkVujHX3kTp4Zn4yGNFK+dlDXy1bAAkIl+cFAiuYI= -k8s.io/cri-api v0.25.13 h1:FaVci3+y5COQuyAFWUckdfOxRpD+m0cnaW2q0OPVm1Q= -k8s.io/cri-api v0.25.13/go.mod h1:yKsLus3raCZ+WbR2m5hS+3hUs5BgSldj2CFJTWyx48M= -k8s.io/klog v1.0.0 h1:Pt+yjF5aB1xDSVbau4VsWe+dQNzA0qv1LlXdC2dF6Q8= -k8s.io/klog/v2 v2.90.1 h1:m4bYOKall2MmOiRaR1J+We67Do7vm9KiQVlT96lnHUw= -k8s.io/kube-openapi v0.0.0-20230303024457-afdc3dddf62d h1:VcFq5n7wCJB2FQMCIHfC+f+jNcGgNMar1uKd6rVlifU= -k8s.io/utils v0.0.0-20230308161112-d77c459e9343 h1:m7tbIjXGcGIAtpmQr7/NAi7RsWoW3E7Zcm4jI1HicTc= -layeh.com/radius v0.0.0-20221205141417-e7fbddd11d68 h1:2NDro2Jzkrqfngy/sA5GVnChs7fx8EzcQKFi/lI2cfg= -lukechampine.com/uint128 v1.2.0 h1:mBi/5l91vocEN8otkC5bDLhi2KdCticRiwbdB0O+rjI= -modernc.org/cc/v3 v3.40.0 h1:P3g79IUS/93SYhtoeaHW+kRCIrYaxJ27MFPv+7kaTOw= -modernc.org/ccgo/v3 v3.16.13 h1:Mkgdzl46i5F/CNR/Kj80Ri59hC8TKAhZrYSaqvkwzUw= -modernc.org/libc v1.22.3 h1:D/g6O5ftAfavceqlLOFwaZuA5KYafKwmr30A6iSqoyY= -modernc.org/mathutil v1.5.0 h1:rV0Ko/6SfM+8G+yKiyI830l3Wuz1zRutdslNoQ0kfiQ= -modernc.org/memory v1.5.0 h1:N+/8c5rE6EqugZwHii4IFsaJ7MUhoWX07J5tC/iI5Ds= -modernc.org/opt v0.1.3 h1:3XOZf2yznlhC+ibLltsDGzABUGVx8J6pnFMS3E4dcq4= -modernc.org/sqlite v1.21.0 h1:4aP4MdUf15i3R3M2mx6Q90WHKz3nZLoz96zlB6tNdow= -modernc.org/strutil v1.1.3 
h1:fNMm+oJklMGYfU9Ylcywl0CO5O6nTfaowNsh2wpPjzY= -modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y= -sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= -sigs.k8s.io/structured-merge-diff/v4 v4.2.3 h1:PRbqxJClWWYMNV1dhaG4NsibJbArud9kFxnAMREiWFE= -sigs.k8s.io/yaml v1.3.0 h1:a2VclLzOGrwOHDiV8EfBGhvjHvP46CtW5j6POvhYGGo= diff --git a/mind-cluster/component/npu-exporter/platforms/inputs/all/npu.go b/mind-cluster/component/npu-exporter/platforms/inputs/all/npu.go deleted file mode 100644 index 1318957..0000000 --- a/mind-cluster/component/npu-exporter/platforms/inputs/all/npu.go +++ /dev/null @@ -1,20 +0,0 @@ -//go:build !custom || inputs || inputs.npu - -/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package all this for register -package all - -import _ "github.com/influxdata/telegraf/plugins/inputs/npu" // register plugin diff --git a/mind-cluster/component/npu-exporter/platforms/inputs/npu/README.md b/mind-cluster/component/npu-exporter/platforms/inputs/npu/README.md deleted file mode 100644 index 72fc73e..0000000 --- a/mind-cluster/component/npu-exporter/platforms/inputs/npu/README.md +++ /dev/null @@ -1,107 +0,0 @@ -# npu plugin of telegraf -## 使用介绍 -该插件代码可根据以下两种方法来使用(选择其一即可): - -### 1、源码集成使用(适合未安装Telegraf的情况) -对应官方文档:https://docs.influxdata.com/telegraf/v1.26/configure_plugins/external_plugins/write_external_plugin/ -#### **编译步骤:** -拉取telegraf v1.26.0分支源码 -```shell -git clone -b v1.26.0 https://github.com/influxdata/telegraf.git -``` -拉取插件源码 -```shell -git clone -b [latest_tag] https://gitcode.com/Ascend/mind-cluster.git -# [latest_tag]此tag请自行修改,建议采用仓库的最新标签,否则可能导致引用函数失效 -``` -将插件代码集成到telegraf源码中(其中路径按实际修改) -```shell -cp -r mind-cluster/component/npu-exporter/platforms/inputs/npu telegraf/plugins/inputs -``` -将插件注册到telegraf(其中路径按实际修改) -```shell -cp -r mind-cluster/component/npu-exporter/platforms/inputs/all/npu.go telegraf/plugins/inputs/all -``` -将telegraf源码中的Makefile里的“CGO_ENABLED=0”改为“CGO_ENABLED=1” -```shell -cd telegraf -sed -i s"/CGO_ENABLED=0/CGO_ENABLED=1/" Makefile -``` - -将如下内容加入到telegraf源码的go.mod的文件里 -注意:[latest_tag]请自行修改为commitID/分支名称/tag名称中的一种,建议采用仓库的最新标签,否则可能导致引用函数失效 -```go.mod -require huawei.com/npu-exporter/v6 v6.0.0-RC1 - -replace huawei.com/npu-exporter/v6 => gitcode.com/Ascend/mind-cluster.git/component/npu-exporter/v6 [latest_tag] -replace ascend-common => gitcode.com/Ascend/mind-cluster.git/component/ascend-common [latest_tag] -``` - -然后执行 -```shell -go mod tidy -``` -接着编译telegraf -```shell -make all -``` -运行前请先创建日志目录:(该日志是插件调用底层api将记录的日志) -```shell -mkdir -m 750 /var/log/mindx-dl/npu-exporter -``` -源码集成时,该日志可通过hwlog.LogConfig{}结构体来配置,该结构体的详细信息如下 -```go -type LogConfig struct { - // log file path, default 
"/var/log/mindx-dl/npu-exporter/npu-plugin.log" in npu plugin - LogFileName string - // only write to std out, default value: false - OnlyToStdout bool - // only write to file, default value: false - OnlyToFile bool - // log level, -1-debug, 0-info, 1-warning, 2-error 3-critical default value: 0 - LogLevel int - // size of a single log file (MB), default value: 2MB in npu plugin - FileMaxSize int - // MaxLineLength Max length of each log line, default value: 256 - MaxLineLength int - // maximum number of backup log files, set as 2 in npu plugin - MaxBackups int - // maximum number of days for backup log files, default value: 2 - MaxAge int - // whether backup files need to be compressed, default value: false - IsCompress bool - // expiration time for log cache, default value: 1s - ExpiredTime int - // Size of log cache space, default: 2048 - CacheSize int -} -``` -#### **使用示例:** -使用插件中提供的配置文件运行telegraf -```shell -./telegraf --config path_to_plugins/inputs/npu/sample.conf -``` - -### 2、二进制集成,使用telegraf的execd机制(适合已安装Telegraf的情况) -对应官方文档:https://docs.influxdata.com/telegraf/v1.26/configure_plugins/external_plugins/shim/ - -从[MindCluster社区](https://www.hiascend.com/developer/download/community/result?module=cluster)获取npu-exporter软件包,并从中解压出npu-exporter二进制文件 - -### 使用 -运行前请先创建日志目录:(该日志是插件调用底层api将记录的日志) -```shell -mkdir -m 750 /var/log/mindx-dl/npu-exporter -``` -先编写配置文件,如test.conf -``` -[[inputs.execd]] - command = ["path_to_npu_plugin/npu-exporter", "-platform=Telegraf"] - signal = "none" - -[[outputs.file]] - files=["stdout"] -``` -然后运行telegraf -```shell -./telegraf --config path_to_config_file/test.conf -``` \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/platforms/inputs/npu/npu.go b/mind-cluster/component/npu-exporter/platforms/inputs/npu/npu.go deleted file mode 100644 index 4c200e0..0000000 --- a/mind-cluster/component/npu-exporter/platforms/inputs/npu/npu.go +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright(C) 2021-2023. 
Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package npu this for parse and pack -package npu - -import ( - _ "embed" - "strings" - - "github.com/influxdata/telegraf" - "github.com/influxdata/telegraf/plugins/inputs" - - "ascend-common/api" - "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" - "huawei.com/npu-exporter/v6/utils/logger" -) - -//go:embed sample.conf -var sampleConfig string - -const ( - num2 = 2 -) - -// WatchNPU npu watch struct -type WatchNPU struct { - collector *common.NpuCollector -} - -// SampleConfig used to return sampleConfig -func (*WatchNPU) SampleConfig() string { - return sampleConfig -} - -// Gather used to gather information from dcmi info and hccn tool info -func (npu *WatchNPU) Gather(acc telegraf.Accumulator) error { - - fieldsMap := make(map[string]map[string]interface{}) - const devName = "ascend" - - devTagValue := "" - if cardType := npu.collector.Dmgr.GetDevType(); cardType == api.Ascend910A3 || cardType == api.Ascend910B || - cardType == api.Ascend910A { - devTagValue = strings.ToLower(api.Ascend910) - } else { - devTagValue = strings.ToLower(cardType) - } - logger.DynamicConfigure(logger.Config{Acc: acc}) - - containerMap := common.GetContainerNPUInfo(npu.collector) - chips := common.GetChipListWithVNPU(npu.collector) - - fieldsMap = npu.gatherChain(fieldsMap, common.ChainForSingleGoroutine, containerMap, chips) - fieldsMap = 
npu.gatherChain(fieldsMap, common.ChainForMultiGoroutine, containerMap, chips) - fieldsMap = npu.gatherChain(fieldsMap, common.ChainForCustomPlugin, containerMap, chips) - - generalFields := fieldsMap[common.GeneralDevTagKey] - acc.AddFields(devName, generalFields, map[string]string{"device": devTagValue}) - - // after the report is completed, deleted to avoid repeated reporting in the for loop - delete(fieldsMap, common.GeneralDevTagKey) - for key, fields := range fieldsMap { - - ids := strings.Split(key, "_") - devTag := map[string]string{"device": devTagValue + "-" + ids[0]} - if len(ids) >= num2 { - devTag["vdev_id"] = ids[1] - } - - acc.AddFields(devName, fields, devTag) - } - - return nil -} - -func (npu *WatchNPU) gatherChain(fieldsMap map[string]map[string]interface{}, chain []common.MetricsCollector, - containerMap map[int32]container.DevicesInfo, chips []common.HuaWeiAIChip) map[string]map[string]interface{} { - - for _, collector := range chain { - fieldsMap = collector.UpdateTelegraf(fieldsMap, npu.collector, containerMap, chips) - } - return fieldsMap -} - -func init() { - inputs.Add("npu", func() telegraf.Input { - return &WatchNPU{ - collector: common.Collector, - } - }) -} diff --git a/mind-cluster/component/npu-exporter/platforms/inputs/npu/npu_test.go b/mind-cluster/component/npu-exporter/platforms/inputs/npu/npu_test.go deleted file mode 100644 index c8adef4..0000000 --- a/mind-cluster/component/npu-exporter/platforms/inputs/npu/npu_test.go +++ /dev/null @@ -1,174 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package npu this for parse and pack -package npu - -import ( - "fmt" - "strings" - "testing" - "time" - - "github.com/agiledragon/gomonkey/v2" - "github.com/influxdata/telegraf" - "github.com/smartystreets/goconvey/convey" - - "ascend-common/api" - "ascend-common/common-utils/hwlog" - "ascend-common/devmanager" - "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" - "huawei.com/npu-exporter/v6/collector/metrics" - "huawei.com/npu-exporter/v6/utils/logger" -) - -const ( - num5 = 5 -) - -func init() { - logger.HwLogConfig = &hwlog.LogConfig{ - OnlyToStdout: true, - } - logger.InitLogger("Prometheus") - initChain() -} - -func initChain() { - common.ChainForSingleGoroutine = []common.MetricsCollector{ - &metrics.VersionCollector{}, - } -} - -func mockNewNpuCollector() *common.NpuCollector { - tc := newNpuCollectorTestCase{ - cacheTime: time.Duration(num5), - updateTime: time.Duration(num5), - deviceParser: &container.DevicesParser{}, - dmgr: &devmanager.DeviceManager{}, - } - c := common.NewNpuCollector(tc.cacheTime, tc.updateTime, tc.deviceParser, tc.dmgr) - return c -} - -// TestGather verifies different device type scenarios -func TestGather(t *testing.T) { - tests := []struct { - name string - deviceType string - expectedTag string - }{ - {name: api.Ascend910A3, - deviceType: api.Ascend910A3, - expectedTag: api.Ascend910, - }, - {name: api.Ascend310P, - deviceType: api.Ascend310P, - expectedTag: api.Ascend310P, - }, - } - npu := &WatchNPU{ - collector: mockNewNpuCollector(), - } - acc := &MockAccumulator{} - - 
for _, tt := range tests { - convey.Convey(tt.name, t, func() { - patches := gomonkey.NewPatches() - defer patches.Reset() - - patches.ApplyMethodReturn(npu.collector.Dmgr, "GetDevType", tt.deviceType) - patches.ApplyFuncReturn(common.GetContainerNPUInfo, nil) - patches.ApplyFuncReturn(common.GetChipListWithVNPU, nil) - patches.ApplyMethodReturn(common.ChainForSingleGoroutine[0], "UpdateTelegraf", - map[string]map[string]interface{}{ - common.GeneralDevTagKey: {"npu_exporter_version_info": "7.0.0"}, - "0": {"npu_chip_info_power": "1"}, - "1_100": {"npu_chip_info_voltage": "1"}, - }) - - err := npu.Gather(acc) - convey.So(err, convey.ShouldBeNil) - convey.So(acc.fields["ascend,device="+strings.ToLower(tt.expectedTag)], convey.ShouldNotBeEmpty) - }) - } -} - -// TestGatherChain tests the gatherChain method of WatchNPU -func TestGatherChain(t *testing.T) { - npu := &WatchNPU{} - fieldsMap := make(map[string]map[string]interface{}) - chain := []common.MetricsCollector{&metrics.VersionCollector{}} - - convey.Convey("TestGatherChain", t, func() { - result := npu.gatherChain(fieldsMap, chain, nil, nil) - logger.Infof("result:%v", result) - convey.So(len(result), convey.ShouldEqual, 1) - }) -} - -type newNpuCollectorTestCase struct { - cacheTime time.Duration - updateTime time.Duration - deviceParser *container.DevicesParser - dmgr *devmanager.DeviceManager -} - -// MockAccumulator is a mock implementation of telegraf.Accumulator -type MockAccumulator struct { - fields map[string]map[string]interface{} -} - -func (m *MockAccumulator) AddFields(measurement string, fields map[string]interface{}, tags map[string]string, - t ...time.Time) { - if m.fields == nil { - m.fields = make(map[string]map[string]interface{}) - } - pairs := make([]string, 0, len(tags)) - for k, v := range tags { - pairs = append(pairs, fmt.Sprintf("%s=%v", k, v)) - } - metricKey := measurement + "," + strings.Join(pairs, ",") - m.fields[metricKey] = fields -} - -func (m *MockAccumulator) 
AddGauge(measurement string, fields map[string]interface{}, tags map[string]string, - t ...time.Time) { -} - -func (m *MockAccumulator) AddCounter(measurement string, fields map[string]interface{}, tags map[string]string, - t ...time.Time) { -} - -func (m *MockAccumulator) AddSummary(measurement string, fields map[string]interface{}, tags map[string]string, - t ...time.Time) { -} - -func (m *MockAccumulator) AddHistogram(measurement string, fields map[string]interface{}, tags map[string]string, - t ...time.Time) { -} - -func (m *MockAccumulator) AddMetric(metric telegraf.Metric) { -} - -func (m *MockAccumulator) SetPrecision(precision time.Duration) { -} - -func (m *MockAccumulator) AddError(err error) { -} - -func (m *MockAccumulator) WithTracking(maxTracked int) telegraf.TrackingAccumulator { - return nil -} diff --git a/mind-cluster/component/npu-exporter/platforms/inputs/npu/sample.conf b/mind-cluster/component/npu-exporter/platforms/inputs/npu/sample.conf deleted file mode 100644 index 11fe998..0000000 --- a/mind-cluster/component/npu-exporter/platforms/inputs/npu/sample.conf +++ /dev/null @@ -1,9 +0,0 @@ -[agent] - interval="20s" - flush_interval="20s" - -[[inputs.npu]] - npu_log_level = 1 - -[[outputs.file]] - files=["stdout"] \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/platforms/prom/prometheus_collector.go b/mind-cluster/component/npu-exporter/platforms/prom/prometheus_collector.go deleted file mode 100644 index 088eeb9..0000000 --- a/mind-cluster/component/npu-exporter/platforms/prom/prometheus_collector.go +++ /dev/null @@ -1,103 +0,0 @@ -/* Copyright(C) 2021-2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package prometheus for prometheus collector -package prom - -import ( - "github.com/prometheus/client_golang/prometheus" - - "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" - "huawei.com/npu-exporter/v6/utils" - "huawei.com/npu-exporter/v6/utils/logger" -) - -// CollectorForPrometheus Entry point for collecting and converting -type CollectorForPrometheus struct { - collector *common.NpuCollector -} - -// NewPrometheusCollector create an instance of prometheus Collector -func NewPrometheusCollector(collector *common.NpuCollector) *CollectorForPrometheus { - promCollector := &CollectorForPrometheus{ - collector: collector, - } - return promCollector -} - -// Describe desc metrics of prometheus -func (*CollectorForPrometheus) Describe(ch chan<- *prometheus.Desc) { - if ch == nil { - logger.Error("ch is nil ") - return - } - const cacheSize = 100 - tempCh := make(chan *prometheus.Desc, cacheSize) - done := make(chan bool) - - go func() { - seenMetrics := make(map[string]struct{}) - for desc := range tempCh { - if desc == nil { - continue - } - descKey := utils.GetDescName(desc) - if _, exists := seenMetrics[descKey]; exists { - logger.Warnf("duplicate metric description detected, keeping first declaration, ignoring duplicate: %s", desc) - continue - } - seenMetrics[descKey] = struct{}{} - ch <- desc - } - // tempCh closed - done <- true - }() - - describeChain(tempCh, common.ChainForSingleGoroutine) - describeChain(tempCh, common.ChainForMultiGoroutine) - describeChain(tempCh, common.ChainForCustomPlugin) - - 
close(tempCh) - - <-done -} - -func describeChain(ch chan<- *prometheus.Desc, chain []common.MetricsCollector) { - for _, collector := range chain { - if collector != nil { - collector.Describe(ch) - } - } -} - -// Collect update metrics of prometheus -func (n *CollectorForPrometheus) Collect(ch chan<- prometheus.Metric) { - containerMap := common.GetContainerNPUInfo(n.collector) - chips := common.GetChipListWithVNPU(n.collector) - collectChain(ch, n, containerMap, chips, common.ChainForSingleGoroutine) - collectChain(ch, n, containerMap, chips, common.ChainForMultiGoroutine) - collectChain(ch, n, containerMap, chips, common.ChainForCustomPlugin) -} - -func collectChain(ch chan<- prometheus.Metric, n *CollectorForPrometheus, containerMap map[int32]container.DevicesInfo, - chips []common.HuaWeiAIChip, chain []common.MetricsCollector) { - if ch == nil { - logger.Error("ch is nil") - return - } - for _, collector := range chain { - collector.UpdatePrometheus(ch, n.collector, containerMap, chips) - } -} diff --git a/mind-cluster/component/npu-exporter/platforms/prom/prometheus_collector_test.go b/mind-cluster/component/npu-exporter/platforms/prom/prometheus_collector_test.go deleted file mode 100644 index 331ca66..0000000 --- a/mind-cluster/component/npu-exporter/platforms/prom/prometheus_collector_test.go +++ /dev/null @@ -1,159 +0,0 @@ -/* -Copyright(C) 2021-2025. Huawei Technologies Co.,Ltd. All rights reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package prometheus for prometheus collector -package prom - -import ( - "strconv" - "testing" - "time" - - "github.com/agiledragon/gomonkey/v2" - "github.com/prometheus/client_golang/prometheus" - "github.com/smartystreets/goconvey/convey" - - "ascend-common/common-utils/hwlog" - "ascend-common/devmanager" - "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" - "huawei.com/npu-exporter/v6/collector/metrics" - "huawei.com/npu-exporter/v6/utils/logger" -) - -const ( - maxMetricsCount = 2000 - num5 = 5 - mockContainerName = "mockContainerName" - maxChipNum int32 = 8 -) - -func TestDescribe(t *testing.T) { - - convey.Convey("test prometheus desc ", t, func() { - collector := NewPrometheusCollector(nil) - - convey.Convey("test prometheus desc when ch is nil", func() { - collector.Describe(nil) - }) - convey.Convey("test prometheus desc when ch is not nil", func() { - ch := make(chan *prometheus.Desc, maxMetricsCount) - collector.Describe(ch) - t.Logf("Describe len(ch):%v", len(ch)) - - convey.So(ch, convey.ShouldNotBeEmpty) - }) - - }) -} - -func TestCollect(t *testing.T) { - convey.Convey("test prometheus collect ", t, func() { - npuCollector := mockNewNpuCollector() - collector := NewPrometheusCollector(npuCollector) - - convey.Convey("test prometheus collect when ch is nil", func() { - collector.Collect(nil) - }) - convey.Convey("test prometheus collect when ch is not nil", func() { - - ch := make(chan prometheus.Metric, maxMetricsCount) - - patches := gomonkey.NewPatches() - collector.Collect(ch) - - patches.ApplyFuncReturn(common.GetChipListWithVNPU, mockGetNPUChipList()) - patches.ApplyFuncReturn(common.GetContainerNPUInfo, mockGetContainerNPUInfo()) - - t.Logf("Describe len(ch):%v", len(ch)) - convey.So(ch, convey.ShouldNotBeEmpty) - }) - }) -} - -func mockNewNpuCollector() *common.NpuCollector { - tc := newNpuCollectorTestCase{ - cacheTime: time.Duration(num5), - updateTime: time.Duration(num5), - 
deviceParser: &container.DevicesParser{}, - dmgr: &devmanager.DeviceManager{}, - } - c := common.NewNpuCollector(tc.cacheTime, tc.updateTime, tc.deviceParser, tc.dmgr) - return c -} - -type newNpuCollectorTestCase struct { - cacheTime time.Duration - updateTime time.Duration - deviceParser *container.DevicesParser - dmgr *devmanager.DeviceManager -} - -func mockGetNPUChipList() []common.HuaWeiAIChip { - chips := make([]common.HuaWeiAIChip, 0) - for id := int32(0); id < maxChipNum; id++ { - chip := common.HuaWeiAIChip{ - CardId: id, - PhyId: id, - DeviceID: id, - LogicID: id, - } - - chips = append(chips, chip) - } - return chips -} - -func mockGetContainerNPUInfo() map[int32]container.DevicesInfo { - containsInfo := make(map[int32]container.DevicesInfo) - for id := int32(0); id < maxChipNum; id++ { - - containerInfo := container.DevicesInfo{ - ID: strconv.Itoa(int(id)), - Name: mockContainerName, - Devices: []int{int(id)}, - } - containsInfo[id] = containerInfo - } - return containsInfo -} - -func init() { - logger.HwLogConfig = &hwlog.LogConfig{ - OnlyToStdout: true, - } - logger.InitLogger("Prometheus") - - initChain() -} - -func initChain() { - common.ChainForSingleGoroutine = []common.MetricsCollector{ - &metrics.HccsCollector{}, - &metrics.BaseInfoCollector{}, - &metrics.SioCollector{}, - &metrics.VersionCollector{}, - &metrics.HbmCollector{}, - &metrics.DdrCollector{}, - &metrics.VnpuCollector{}, - &metrics.PcieCollector{}, - } - common.ChainForMultiGoroutine = []common.MetricsCollector{ - &metrics.NetworkCollector{}, - &metrics.RoceCollector{}, - &metrics.OpticalCollector{}, - } -} diff --git a/mind-cluster/component/npu-exporter/plugins/README.md b/mind-cluster/component/npu-exporter/plugins/README.md deleted file mode 100644 index 5690dac..0000000 --- a/mind-cluster/component/npu-exporter/plugins/README.md +++ /dev/null @@ -1,388 +0,0 @@ -## 自定义插件开发说明 - -用户可参考提供的demo,或将代码拷贝到plugins目录下,重新编译部署,下面对demo中各文件进行说明 - -- `dcmi.go` 
、`dcmi_interface_api.h`:用户自定义NPU指标的接口声明与cgo实现,用于对接驱动dcmi接口,具体可参考demo实现,全部dcmi接口续参考驱动的dcmi接口文档。 -- `custom_metrics.go` 实现`MetricCollector`的接口,用于指标采集与上报,需要实现下面的接口,具体可参考demo实现: - - Describe:prometheus上报指标前,需要先定义指标的,该接口用于prometheus的指标定义 - - CollectToCache: 指标采集方法,每个采集周期都会执行,从外部获取数据,并传入到内部缓存中 - - UpdatePrometheus: 按照prometheus的格式,将缓存中的数据返回 - - UpdateTelagraf:按照telagraf的格式,将缓存中的数据返回。 - - IsSupporterd:检测当前环境,判断是否支持当前设备的检测。 - - PreCollect:正式开始采集前执行一次,可用于设备初始化。可以为空。 - - PostCollect:采集结束后执行一次,可用于数据的回收。可以为空。 -- `register.go`,提供插件注册函数,在npu-exporter启动时完成插件注册并完成dcmi接口初始化,**RegisterPlugin函数签名不要修改**,自定义插件通过`AddPluginCollector`接口注册,指标名称需要与`pluginConfiguration.json`中的指标组名称保持一致 - -对于插件指标组内定义的指标名称,不要与现有代码中已定义的插件指标(当前NPU指标、插件指标)重名 - -自定义插件采集时间超过10s后,npu-exporter会打印日志,提示插件采集时间过长,执行下一个插件采集。 - -### 编译部署 - -插件开发完后,执行Npu-exporter代码目录下的`build/build.sh`完成编译,需要提前准备go开发环境。 - -编译完成后,会在output目录下生成新的二进制文件与相关配置文件,根据需要打开或关闭相应开关,根据安装部署章节的安装指导,重新作镜像部署即可 - - - -`dcmi.go` - -```go -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package plugins this for dcmi interface -package plugins - -// #cgo LDFLAGS: -ldl -/* - #include - #include - #include - #include - - #include "dcmi_interface_api.h" - - static void *dcmiHandle; - #define SO_NOT_FOUND -99999 - #define FUNCTION_NOT_FOUND -99998 - #define SUCCESS 0 - #define ERROR_UNKNOWN -99997 - #define CALL_FUNC(name,...) 
if(name##_func==NULL){return FUNCTION_NOT_FOUND;}return name##_func(__VA_ARGS__); - - static int (*dcmi_get_device_health_func)(int card_id, int device_id, unsigned int *health); - int dcmi_get_device_health(int card_id, int device_id, unsigned int *health){ - CALL_FUNC(dcmi_get_device_health,card_id,device_id,health) - } - - // load .so files and functions - static int dcmiLoad_dl(const char* dcmiLibPath){ - if (dcmiLibPath == NULL) { - fprintf (stderr,"lib path is null\n"); - return SO_NOT_FOUND; - } - dcmiHandle = dlopen(dcmiLibPath,RTLD_LAZY | RTLD_GLOBAL); - if (dcmiHandle == NULL){ - fprintf (stderr,"%s\n",dlerror()); - return SO_NOT_FOUND; - } - - dcmi_get_device_health_func = dlsym(dcmiHandle,"dcmi_get_device_health"); - - return SUCCESS; - } - - static int dcmiShutDown(void){ - if (dcmiHandle == NULL) { - return SUCCESS; - } - return (dlclose(dcmiHandle) ? ERROR_UNKNOWN : SUCCESS); - } -*/ -import "C" -import ( - "fmt" - - "unsafe" - - "ascend-common/common-utils/utils" - "ascend-common/devmanager/common" -) - -const ( - dcmiLibraryName = "libdcmi.so" -) - -// DcLoad load dcmi symbol -func DcLoad() error { - dcmiLibPath, err := utils.GetDriverLibPath(dcmiLibraryName) - if err != nil { - return err - } - cDcmiTemplateName := C.CString(dcmiLibPath) - defer C.free(unsafe.Pointer(cDcmiTemplateName)) - if retCode := C.dcmiLoad_dl(cDcmiTemplateName); retCode != C.SUCCESS { - return fmt.Errorf("dcmi lib load failed, error code: %d", int32(retCode)) - } - return nil -} - -// DcShutDown clean the dynamically loaded resource -func DcShutDown() error { - if retCode := C.dcmiShutDown(); retCode != C.SUCCESS { - return fmt.Errorf("dcmi shut down failed, error code: %d", int32(retCode)) - } - - return nil -} - -// DcGetDeviceHealth get device health -func DcGetDeviceHealth(cardID, deviceID int32) (int32, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.RetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - 
var health C.uint - if retCode := C.dcmi_get_device_health(C.int(cardID), C.int(deviceID), - &health); int32(retCode) != common.Success { - return common.RetError, fmt.Errorf("get device (cardID: %d, deviceID: %d) health state failed, ret "+ - "code: %d, health code: %d", cardID, deviceID, int32(retCode), int64(health)) - } - if common.IsGreaterThanOrEqualInt32(int64(health)) { - return common.RetError, fmt.Errorf("get wrong health state , device (cardID: %d, deviceID: %d) "+ - "health: %d", cardID, deviceID, int64(health)) - } - return int32(health), nil -} - -``` - - - -`dcmi_interface_api.h` - -```c++ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#ifndef __DCMI_INTERFACE_API_H__ -#define __DCMI_INTERFACE_API_H__ - -#ifdef __cplusplus -#if __cplusplus -extern "C" { -#endif -#endif /* __cplusplus */ - -#define DCMIDLLEXPORT static - -DCMIDLLEXPORT int dcmi_get_device_health(int card_id, int device_id, unsigned int *health); - -#ifdef __cplusplus -#if __cplusplus -} -#endif -#endif /* __cplusplus */ - -#endif /* __DCMI_INTERFACE_API_H__ */ -``` - - - -`custom_metrics.go` - -```go -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package plugins for custom metrics -package plugins - -import ( - "strings" - "sync" - "time" - - "github.com/prometheus/client_golang/prometheus" - - "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" - "huawei.com/npu-exporter/v6/utils/logger" -) - -var ( - PluginInfoDesc = prometheus.NewDesc("plugin_info", "exporter custom plugin info", - []string{"plugin_label"}, nil) - - PluginNpuInfoDesc = prometheus.NewDesc("npu_plugin_info", "exporter custom npu plugin info", - []string{"npu_plugin_label"}, nil) -) - -const ( - pluginInfoKey = "pluginInfoKey" - pluginInfoValue = 1.11111 - pluginLabel = "pluginLabel" - npuPluginLabel = "npuPluginInfoKey" - npuPluginInfoKey = "npuPluginInfoKey" - pluginName = "MyPlugin" -) - -// PluginInfoCollector collect custom plugin info -type PluginInfoCollector struct { - common.MetricsCollectorAdapter - Cache sync.Map -} - -// Describe description of the metric -func (c *PluginInfoCollector) Describe(ch chan<- *prometheus.Desc) { - // add desc - logger.Debug("PluginInfoCollector Describe") - ch <- PluginInfoDesc - ch <- PluginNpuInfoDesc -} - -// CollectToCache collect the metric to cache -func (c *PluginInfoCollector) CollectToCache(n *common.NpuCollector, chipList []common.HuaWeiAIChip) { - // collect metric to cache - logger.Debug("PluginInfoCollector CollectToCache") - c.Cache.Store(pluginInfoKey, pluginInfoValue) - health, err := DcGetDeviceHealth(0, 0) - if err != nil { - logger.Error(err) - return - } - c.Cache.Store(npuPluginInfoKey, health) -} - -// UpdatePrometheus update 
prometheus metric -func (c *PluginInfoCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *common.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []common.HuaWeiAIChip) { - logger.Debug("PluginInfoCollector UpdatePrometheus") - // get metric from cache - pluginCache, _ := c.Cache.Load(pluginInfoKey) - npuPluginCache, _ := c.Cache.Load(npuPluginInfoKey) - // update plugin info - ch <- prometheus.NewMetricWithTimestamp(time.Now(), - prometheus.MustNewConstMetric(PluginInfoDesc, prometheus.GaugeValue, pluginCache.(float64), pluginLabel)) - // update npu plugin info - value := float64(npuPluginCache.(int32)) - ch <- prometheus.NewMetricWithTimestamp(time.Now(), - prometheus.MustNewConstMetric(PluginNpuInfoDesc, prometheus.GaugeValue, value, npuPluginLabel)) - -} - -// UpdateTelegraf update telegraf metric -func (c *PluginInfoCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *common.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []common.HuaWeiAIChip) map[string]map[string]interface{} { - logger.Debug("PluginInfoCollector UpdateTelegraf") - // get metric from cache - pluginCache, _ := c.Cache.Load(pluginInfoKey) - npuPluginCache, _ := c.Cache.Load(npuPluginInfoKey) - // update plugin info - if fieldsMap[common.GeneralDevTagKey] == nil { - fieldsMap[common.GeneralDevTagKey] = make(map[string]interface{}) - } - doUpdateTelegraf(fieldsMap[common.GeneralDevTagKey], PluginInfoDesc, pluginCache.(float64), "") - // update npu plugin info - const NpuLogicID = "1" - value := float64(npuPluginCache.(int32)) - if fieldsMap[NpuLogicID] == nil { - fieldsMap[NpuLogicID] = make(map[string]interface{}) - } - doUpdateTelegraf(fieldsMap[NpuLogicID], PluginNpuInfoDesc, value, "") - return fieldsMap -} - -// PreCollect pre handle before collect -func (c *PluginInfoCollector) PreCollect(n *common.NpuCollector, chipList []common.HuaWeiAIChip) { - logger.Debug("PluginInfoCollector PreCollect") -} - -// PostCollect post 
handle after collect -func (c *PluginInfoCollector) PostCollect(n *common.NpuCollector) { - logger.Debug("PluginInfoCollector PostCollect") -} - -// IsSupported Check whether the current hardware supports this metric -func (c *PluginInfoCollector) IsSupported(n *common.NpuCollector) bool { - logger.Debug("PluginInfoCollector IsSupported") - return true -} - -// getDescName parse metrics name from prometheus.Desc object -func getDescName(desc *prometheus.Desc) string { - str := desc.String() - startIndex := strings.Index(str, "fqName: ") + len("fqName: ") - readfqName := str[startIndex:] - - endIndex := strings.Index(readfqName, ",") - if endIndex != -1 { - readfqName = readfqName[:endIndex] - } - - readfqName = strings.Trim(readfqName, "\"") - return readfqName -} - -func doUpdateTelegraf(fieldMap map[string]interface{}, desc *prometheus.Desc, value interface{}, extInfo string) { - fieldMap[getDescName(desc)+extInfo] = value -} - - -``` - - - -`register.go` - -```go -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package plugins for custom metrics -package plugins - -import ( - "huawei.com/npu-exporter/v6/collector/config" - "huawei.com/npu-exporter/v6/utils/logger" -) - -// RegisterPlugin register plugin collector -func RegisterPlugin() { - err := config.AddPluginCollector(pluginName, &PluginInfoCollector{}) - if err != nil { - logger.Errorf("add plugin failed: %v\n", err) - } - logger.Infof("add plugin ok: %v\n", pluginName) - err = DcLoad() - if err != nil { - logger.Errorf("dcmi init failed: %v\n", err) - return - } -} - -``` - diff --git a/mind-cluster/component/npu-exporter/plugins/collector_for_text_file.go b/mind-cluster/component/npu-exporter/plugins/collector_for_text_file.go deleted file mode 100644 index db462a4..0000000 --- a/mind-cluster/component/npu-exporter/plugins/collector_for_text_file.go +++ /dev/null @@ -1,358 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package plugins for custom metrics -package plugins - -import ( - "encoding/json" - "fmt" - "os" - "sort" - "strings" - "sync" - "time" - - "github.com/prometheus/client_golang/prometheus" - - "ascend-common/common-utils/hwlog" - "ascend-common/common-utils/utils" - "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/config" - "huawei.com/npu-exporter/v6/collector/container" - npuutils "huawei.com/npu-exporter/v6/utils" - "huawei.com/npu-exporter/v6/utils/logger" -) - -var ( - metricDesc *prometheus.Desc - labelKeys []string // a list of tag keys extracted from the datalist - jsonFilePath string - isSupported bool - currentVersion versionInfo -) - -const ( - size100k = 100 * 1024 - maxLabelSize = 10 - num1000 = 1000 - maxDataListSize = 128 - maxMetricNameSize = 128 - maxDescSize = 1024 - fileMetricsDisabledMsg = "file metrics collection will be disabled" - skipCurrentCollectionMsg = "will skip current collection and report cached metrics" - excludedPermission = 0111 // file should not have any execute permission -) - -type versionInfo struct { - name string - desc string - version string -} - -// TextMetricData represents the JSON structure -type TextMetricData struct { - Version string `json:"version"` - Desc string `json:"desc"` - Name string `json:"name"` - Timestamp int64 `json:"timestamp"` - DataList []DataItem `json:"data_list"` -} - -// DataItem represents each item in data_list -type DataItem struct { - Label map[string]string `json:"label"` - Value float64 `json:"value"` -} - -// InitTextMetricsDesc init text metric -func InitTextMetricsDesc(filePath string) { - if filePath == "" { - return - } - paths := strings.Split(filePath, ",") - if len(paths) > 1 { - logger.Warnf("multiple file paths detected in filePath: %s, only the first file will be used", filePath) - jsonFilePath = strings.TrimSpace(paths[0]) - } else { - jsonFilePath = filePath - } - if utils.IsDir(jsonFilePath) { - logger.Errorf("file path %s is a 
directory, only support specify file path", filePath) - return - } - fileData, err := waitForFile(jsonFilePath, time.Minute) - if err != nil { - logger.Warnf("read json file %s failed, %s: %v", jsonFilePath, fileMetricsDisabledMsg, err) - return - } - var metricsData TextMetricData - if err := json.Unmarshal(fileData, &metricsData); err != nil { - logger.Warnf("unmarshal json file %s failed, %s: %v, "+ - "Possible causes:\n1. The file is not in JSON format\n2. File size is more than 100KB ", jsonFilePath, fileMetricsDisabledMsg, err) - return - } - - if err := isDataOk(&metricsData); err != nil { - logger.Warnf("%v, %s", err, fileMetricsDisabledMsg) - return - } - - desc := metricsData.Desc - labelKeys = make([]string, 0, len(metricsData.DataList[0].Label)) - for key := range metricsData.DataList[0].Label { - labelKeys = append(labelKeys, key) - } - sort.Strings(labelKeys) - logger.Infof("init text metric succeeded, metricName: %v, version: %v, desc: %v, labels: %v", - metricsData.Name, metricsData.Version, desc, labelKeys) - - metricDesc = prometheus.NewDesc(metricsData.Name, desc, labelKeys, nil) - isSupported = true - currentVersion = versionInfo{ - name: metricsData.Name, - desc: desc, - version: metricsData.Version, - } - err = config.AddPluginCollector("text", &TextMetricsInfoCollector{}) - if err != nil { - logger.Errorf("%v", err) - } -} - -func isDataOk(metricsData *TextMetricData) error { - if len(metricsData.DataList) == 0 { - return fmt.Errorf("dataList is empty in json file %s", jsonFilePath) - } - if len(metricsData.DataList) > maxDataListSize { - return fmt.Errorf("size of dataList(%d) is more than max allowed dataList size(%d) in json file %s", - len(metricsData.DataList), maxDataListSize, jsonFilePath) - } - if len(metricsData.DataList[0].Label) > maxLabelSize { - return fmt.Errorf("size of first item's Label(%d) is more than max allowed label size(%d) in json file %s", - len(metricsData.DataList[0].Label), maxLabelSize, jsonFilePath) - } - if 
metricsData.Name == "" { - return fmt.Errorf("name field is empty in json file %s", jsonFilePath) - } - if len(metricsData.Name) > maxMetricNameSize { - return fmt.Errorf("length of metric name should not larger than %d, but current is %d", - maxMetricNameSize, len(metricsData.Name)) - } - if metricsData.Desc == "" { - return fmt.Errorf("desc field is empty in json file %s", jsonFilePath) - } - if len(metricsData.Desc) > maxDescSize { - return fmt.Errorf("length of metric desc should not larger than %d, but current is %d", - maxDescSize, len(metricsData.Desc)) - } - if metricsData.Version == "" { - return fmt.Errorf("version field is empty in json file %s", jsonFilePath) - } - // only support 1.0 version currently - if metricsData.Version != "1.0" { - return fmt.Errorf("version should be 1.0, but current is %s", metricsData.Version) - } - if metricsData.Timestamp <= 0 { - return fmt.Errorf("timestamp field is empty or not correct in json file %s", jsonFilePath) - } - return nil -} - -// waitForFile wait for file to exist -func waitForFile(filePath string, timeout time.Duration) ([]byte, error) { - const tickerDuration = 100 - deadline := time.Now().Add(timeout) - ticker := time.NewTicker(tickerDuration * time.Millisecond) - defer ticker.Stop() - once := sync.Once{} - - for { - fileData, err := utils.ReadLimitBytes(filePath, size100k) - err2 := checkFile(filePath) - if err2 != nil { - hwlog.RunLog.Errorf("check file err, %s: %v", filePath, err2) - } - if err2 != nil && !os.IsNotExist(err2) { - return nil, err2 - } - - if err == nil && err2 == nil && len(fileData) > 0 { - logger.Infof("successfully read json file %s", filePath) - return fileData, nil - } - if os.IsNotExist(err) || len(fileData) == 0 { - once.Do(func() { - logger.Warnf("file [%v] is not exist or file is empty, will wait 1 minute", filePath) - }) - if time.Now().After(deadline) { - return nil, fmt.Errorf("file %s does not exist or file is empty after waiting %v", filePath, timeout) - } - select { - 
case <-ticker.C: - continue - } - } - return nil, err - } -} - -func checkFile(filePath string) error { - absFilePath, err := utils.CheckPath(filePath) - if err != nil { - return err - } - if err = utils.DoCheckOwnerAndPermission(absFilePath, excludedPermission, 0); err != nil { - logger.Errorf("file permission should not included %04o: %v", excludedPermission, err) - return err - } - return nil -} - -// TextMetricsInfoCollector collect custom plugin info -type TextMetricsInfoCollector struct { - common.MetricsCollectorAdapter - Cache sync.Map -} - -// Describe description of the metric -func (c *TextMetricsInfoCollector) Describe(ch chan<- *prometheus.Desc) { - // add desc - if metricDesc != nil { - ch <- metricDesc - } -} - -// CollectToCache collect the metric to cache -func (c *TextMetricsInfoCollector) CollectToCache(n *common.NpuCollector, chipList []common.HuaWeiAIChip) { - // collect metric to cache - logger.Debugf("TextMetricsInfoCollector CollectToCache") - - fileData, err := utils.ReadLimitBytes(jsonFilePath, size100k) - if err != nil { - logger.LogfWithOptions(logger.WarnLevel, logger.LogOptions{Domain: "textMetrics", ID: "readFileErr"}, - "read json file %s failed: %v", jsonFilePath, err) - return - } - hwlog.ResetErrCnt("textMetrics", "readFileErr") - - var metricsData TextMetricData - if err := json.Unmarshal(fileData, &metricsData); err != nil { - logger.LogfWithOptions(logger.WarnLevel, logger.LogOptions{Domain: "textMetrics", ID: "unmarshalFileErr"}, - "unmarshal json file %s failed: %v", jsonFilePath, err) - return - } - hwlog.ResetErrCnt("textMetrics", "unmarshalFileErr") - - if err := isDataOk(&metricsData); err != nil { - logger.LogfWithOptions(logger.WarnLevel, logger.LogOptions{Domain: "textMetrics", ID: "dataNotOk"}, - "%v, %s", err, skipCurrentCollectionMsg) - return - } - hwlog.ResetErrCnt("textMetrics", "dataNotOk") - - if versionChanged(metricsData) { - logger.LogfWithOptions(logger.ErrorLevel, logger.LogOptions{Domain: "textMetrics", 
ID: "versionChanged"}, - "json file base info changed, old: %v, new: %v", currentVersion, - versionInfo{name: metricsData.Name, desc: metricsData.Desc, version: metricsData.Version}) - return - } - hwlog.ResetErrCnt("textMetrics", "versionChanged") - - c.Cache.Store(common.GetCacheKey(c), metricsData) -} - -func versionChanged(data TextMetricData) bool { - if currentVersion.name != data.Name || currentVersion.desc != data.Desc || - currentVersion.version != data.Version { - return true - } - return false -} - -// UpdatePrometheus update prometheus metric -func (c *TextMetricsInfoCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *common.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []common.HuaWeiAIChip) { - logger.Debug("TextMetricsInfoCollector UpdatePrometheus") - if metricDesc == nil { - logger.Warnf("metricDesc is not initialized, skip UpdatePrometheus") - return - } - cacheKey := common.GetCacheKey(c) - data, ok := c.Cache.Load(cacheKey) - if !ok { - logger.Debugf("cache key %s not found", cacheKey) - return - } - - textMetricsData, ok := data.(TextMetricData) - if !ok { - logger.Warnf("cache data type mismatch for key %s", cacheKey) - return - } - - timestamp := time.Unix(0, textMetricsData.Timestamp*num1000) - - for _, item := range textMetricsData.DataList { - labelValues := make([]string, len(labelKeys)) - for i, key := range labelKeys { - if value, ok := item.Label[key]; ok { - labelValues[i] = value - } else { - labelValues[i] = "" - } - } - - ch <- prometheus.NewMetricWithTimestamp(timestamp, - prometheus.MustNewConstMetric(metricDesc, prometheus.GaugeValue, item.Value, labelValues...)) - } -} - -// UpdateTelegraf update telegraf metric -func (c *TextMetricsInfoCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *common.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []common.HuaWeiAIChip) map[string]map[string]interface{} { - logger.Debug("TextMetricsInfoCollector UpdateTelegraf") 
- - if metricDesc == nil { - logger.Warnf("metricDesc is not initialized, skip UpdateTelegraf") - return fieldsMap - } - - cacheKey := common.GetCacheKey(c) - data, ok := c.Cache.Load(cacheKey) - if !ok { - logger.Debugf("cache key %s not found", cacheKey) - return fieldsMap - } - - textMetricData, ok := data.(TextMetricData) - if !ok { - logger.Warnf("cache data type mismatch for key %s", cacheKey) - return fieldsMap - } - - for _, item := range textMetricData.DataList { - if fieldsMap[common.GeneralDevTagKey] == nil { - fieldsMap[common.GeneralDevTagKey] = make(map[string]interface{}) - } - npuutils.DoUpdateTelegraf(fieldsMap[common.GeneralDevTagKey], metricDesc, item.Value, "") - } - - return fieldsMap -} - -// IsSupported Check whether the current hardware supports this metric -func (c *TextMetricsInfoCollector) IsSupported(n *common.NpuCollector) bool { - return isSupported -} diff --git a/mind-cluster/component/npu-exporter/plugins/register.go b/mind-cluster/component/npu-exporter/plugins/register.go deleted file mode 100644 index e9b5f41..0000000 --- a/mind-cluster/component/npu-exporter/plugins/register.go +++ /dev/null @@ -1,21 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package plugins for custom metrics -package plugins - -// RegisterPlugin register plugin collector -func RegisterPlugin() { - -} diff --git a/mind-cluster/component/npu-exporter/utils/logger/general_logger.go b/mind-cluster/component/npu-exporter/utils/logger/general_logger.go deleted file mode 100644 index 3f1e19c..0000000 --- a/mind-cluster/component/npu-exporter/utils/logger/general_logger.go +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package logger for general collector -package logger - -import ( - "context" - "fmt" - - "ascend-common/common-utils/hwlog" -) - -const ( - maxLogLineLength = 1024 - defaultLogFile = "/var/log/mindx-dl/npu-exporter/npu-exporter.log" -) - -type generalLogger struct { -} - -// dynamicConfigure configures the logger -func (c *generalLogger) dynamicConfigure(Config) { -} - -// log logs with specified level -func (c *generalLogger) log(ctx context.Context, level Level, args ...interface{}) { - fn, ok := logFuncs[level] - if !ok { - hwlog.RunLog.Warnf("unknown log level: %v", level) - return - } - - fn(hwlog.DeepIncrease(ctx), args...) 
-} - -// logf logs with specified level and format -func (c *generalLogger) logf(ctx context.Context, level Level, format string, args ...interface{}) { - fn, ok := logfFuncs[level] - if !ok { - hwlog.RunLog.Warnf("unknown log level: %v", level) - return - } - - fn(hwlog.DeepIncrease(ctx), format, args...) -} - -func (c *generalLogger) logfWithOptions(ctx context.Context, level Level, opts LogOptions, format string, - args ...interface{}) { - - if opts.MaxCounts == 0 { - opts.MaxCounts = hwlog.ProblemOccurMaxNumbers - } - - if needPrint, extraErrLog := hwlog.IsNeedPrintWithSpecifiedCounts(opts.Domain, opts.ID, opts.MaxCounts); needPrint { - format = fmt.Sprintf("%s %s", format, extraErrLog) - fn, ok := logfFuncs[level] - if !ok { - hwlog.RunLog.Warnf("unknown log level: %v", level) - return - } - - fn(hwlog.DeepIncrease(ctx), format, args...) - } -} diff --git a/mind-cluster/component/npu-exporter/utils/logger/logger.go b/mind-cluster/component/npu-exporter/utils/logger/logger.go deleted file mode 100644 index 723e070..0000000 --- a/mind-cluster/component/npu-exporter/utils/logger/logger.go +++ /dev/null @@ -1,174 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package logger for general collector -package logger - -import ( - "context" - "errors" - "fmt" - - "github.com/influxdata/telegraf" - - "ascend-common/common-utils/hwlog" -) - -// the method mapping table (avoid rebuilding with every call) -var ( - logFuncs = map[Level]logFunc{} - logfFuncs = map[Level]logfFunc{} -) - -const ( - // DebugLevel Debug level - DebugLevel Level = iota - 1 - // InfoLevel Info level - InfoLevel - // WarnLevel Warn level - WarnLevel - // ErrorLevel Error level - ErrorLevel - - // PrometheusPlatform Prometheus platform - PrometheusPlatform = "Prometheus" - // TelegrafPlatform Telegraf platform - TelegrafPlatform = "Telegraf" -) - -// HwLogConfig default log file -var HwLogConfig = &hwlog.LogConfig{ - LogFileName: defaultLogFile, - ExpiredTime: hwlog.DefaultExpiredTime, - CacheSize: hwlog.DefaultCacheSize, - MaxLineLength: maxLogLineLength, -} - -// Level log level -type Level int - -// logFunc log function -type logFunc func(ctx context.Context, args ...interface{}) - -// logfFunc logf function -type logfFunc func(ctx context.Context, format string, args ...interface{}) - -var ( - // logger Unified log printer - logger UnifiedLogger -) - -// InitLogger initialize the log manager -func InitLogger(platform string) error { - - if platform == TelegrafPlatform { - logger = &telegrafLogger{} - HwLogConfig.LogFileName = defaultTelegrafLogPath - HwLogConfig.OnlyToFile = true - } else if platform == PrometheusPlatform { - logger = &generalLogger{} - } else { - return errors.New("platform is not supported:" + platform) - } - - if err := hwlog.InitRunLogger(HwLogConfig, context.Background()); err != nil { - fmt.Printf("hwlog init failed, error is %v\n", err) - return err - } - - logFuncs = map[Level]logFunc{ - DebugLevel: hwlog.RunLog.DebugWithCtx, - InfoLevel: hwlog.RunLog.InfoWithCtx, - WarnLevel: hwlog.RunLog.WarnWithCtx, - ErrorLevel: hwlog.RunLog.ErrorWithCtx, - } - - logfFuncs = map[Level]logfFunc{ - DebugLevel: 
hwlog.RunLog.DebugfWithCtx, - InfoLevel: hwlog.RunLog.InfofWithCtx, - WarnLevel: hwlog.RunLog.WarnfWithCtx, - ErrorLevel: hwlog.RunLog.ErrorfWithCtx, - } - return nil -} - -// LogOptions options for log -type LogOptions struct { - Domain string - ID interface{} - MaxCounts int -} - -// Config config for telegraf -type Config struct { - Acc telegraf.Accumulator -} - -// UnifiedLogger unified logger interface -type UnifiedLogger interface { - dynamicConfigure(Config) - log(ctx context.Context, level Level, args ...interface{}) - logf(ctx context.Context, level Level, format string, args ...interface{}) - logfWithOptions(ctx context.Context, level Level, opts LogOptions, format string, args ...interface{}) -} - -// Debug print log info with debug level -func Debug(args ...interface{}) { - logger.log(hwlog.DeepIncrease(context.Background()), DebugLevel, args...) -} - -// Info print log info with info level -func Info(args ...interface{}) { - logger.log(hwlog.DeepIncrease(context.Background()), InfoLevel, args...) -} - -// Warn print log info with warn level -func Warn(args ...interface{}) { - logger.log(hwlog.DeepIncrease(context.Background()), WarnLevel, args...) -} - -// Error print log info with error level -func Error(args ...interface{}) { - logger.log(hwlog.DeepIncrease(context.Background()), ErrorLevel, args...) -} - -// Debugf print log info with debug level -func Debugf(format string, args ...interface{}) { - logger.logf(hwlog.DeepIncrease(context.Background()), DebugLevel, format, args...) -} - -// Infof print log info with info level -func Infof(format string, args ...interface{}) { - logger.logf(hwlog.DeepIncrease(context.Background()), InfoLevel, format, args...) -} - -// Warnf print log info with warn level -func Warnf(format string, args ...interface{}) { - logger.logf(hwlog.DeepIncrease(context.Background()), WarnLevel, format, args...) 
-} - -// Errorf print log info with error level -func Errorf(format string, args ...interface{}) { - logger.logf(hwlog.DeepIncrease(context.Background()), ErrorLevel, format, args...) -} - -// LogfWithOptions print log info with error level -func LogfWithOptions(level Level, opts LogOptions, format string, args ...interface{}) { - logger.logfWithOptions(hwlog.DeepIncrease(context.Background()), level, opts, format, args...) -} - -// DynamicConfigure configure the logger -func DynamicConfigure(config Config) { - logger.dynamicConfigure(config) -} diff --git a/mind-cluster/component/npu-exporter/utils/logger/logger_test.go b/mind-cluster/component/npu-exporter/utils/logger/logger_test.go deleted file mode 100644 index a08ad4b..0000000 --- a/mind-cluster/component/npu-exporter/utils/logger/logger_test.go +++ /dev/null @@ -1,119 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package logger for general collector -package logger - -import ( - "errors" - "testing" - - "ascend-common/common-utils/hwlog" -) - -// TestInitLogger tests the InitLogger function -func TestInitLogger(t *testing.T) { - tests := []struct { - name string - platform string - expected error - }{ - { - name: "Telegraf Platform", - platform: TelegrafPlatform, - expected: nil, - }, - { - name: "Prometheus Platform", - platform: PrometheusPlatform, - expected: nil, - }, - { - name: "Unsupported Platform", - platform: "Unsupported", - expected: errors.New("platform is not supported:Unsupported"), - }, - } - - HwLogConfig.LogLevel = 0 - HwLogConfig.MaxBackups = hwlog.DefaultMaxBackups - HwLogConfig.LogFileName = defaultLogFile - HwLogConfig.MaxAge = hwlog.DefaultMinSaveAge - - var noExistLevel Level = 5 - var args = "mock" - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - err := InitLogger(tt.platform) - if tt.expected == nil && err != nil { - t.Errorf("InitLogger(%s) = %v, want %v", tt.platform, err, tt.expected) - } else if tt.expected != nil && err.Error() != tt.expected.Error() { - t.Errorf("InitLogger(%s) = %v, want %v", tt.platform, err, tt.expected) - } - - logger.log(nil, DebugLevel, args) - logger.log(nil, InfoLevel, args) - logger.log(nil, WarnLevel, args) - logger.log(nil, noExistLevel, args) - logger.logfWithOptions(nil, DebugLevel, LogOptions{}, "test logf with options %s", "arg") - - logger.logf(nil, DebugLevel, args) - logger.logf(nil, InfoLevel, args) - logger.logf(nil, WarnLevel, args) - logger.logf(nil, noExistLevel, args) - logger.logfWithOptions(nil, DebugLevel, LogOptions{}, "test logf with options %s", "arg") - - }) - } -} - -func TestLoggerMethods(t *testing.T) { - - tests := []struct { - name string - method func(...interface{}) - level Level - args []interface{} - }{ - {"test Debug", Debug, DebugLevel, []interface{}{"debug message"}}, - {"test Info", Info, InfoLevel, []interface{}{"info message"}}, - {"test Warn", Warn, 
WarnLevel, []interface{}{"warn message"}}, - {"test Error", Error, ErrorLevel, []interface{}{"error message"}}, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - test.method(test.args...) - }) - } - - testsF := []struct { - name string - method func(string, ...interface{}) - level Level - format string - args []interface{} - }{ - {"test Debugf", Debugf, DebugLevel, "debug message %d", []interface{}{1}}, - {"test Infof", Infof, InfoLevel, "info message %d", []interface{}{1}}, - {"test Warnf", Warnf, WarnLevel, "warn message %d", []interface{}{1}}, - {"test Errorf", Errorf, ErrorLevel, "error message %d", []interface{}{1}}, - } - - for _, test := range testsF { - t.Run(test.name, func(t *testing.T) { - test.method(test.format, test.args...) - }) - } -} diff --git a/mind-cluster/component/npu-exporter/utils/logger/telegraf_logger.go b/mind-cluster/component/npu-exporter/utils/logger/telegraf_logger.go deleted file mode 100644 index 56c2ac5..0000000 --- a/mind-cluster/component/npu-exporter/utils/logger/telegraf_logger.go +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package logger for general collector -package logger - -import ( - "context" - "errors" - "fmt" - "strings" - - "github.com/influxdata/telegraf" - - "ascend-common/common-utils/hwlog" -) - -var defaultTelegrafLogPath = "/var/log/mindx-dl/npu-exporter/npu-plugin.log" -var dangerousChars = map[string]string{ - "\n": "\\n", - "\r": "\\r", - "\t": "\\t", -} - -type telegrafLogger struct { - acc telegraf.Accumulator -} - -// dynamicConfigure configures the logger -func (c *telegrafLogger) dynamicConfigure(config Config) { - c.acc = config.Acc -} - -// log logs with specified level -func (c *telegrafLogger) log(ctx context.Context, level Level, args ...interface{}) { - c.logf(hwlog.DeepIncrease(ctx), level, "%s", args...) -} - -// logf logs with specified level and format -func (c *telegrafLogger) logf(ctx context.Context, level Level, format string, args ...interface{}) { - sanitized := format - for char, replacement := range dangerousChars { - sanitized = strings.ReplaceAll(sanitized, char, replacement) - } - if level < InfoLevel || c.acc == nil { - fn, ok := logfFuncs[level] - if !ok { - hwlog.RunLog.Warnf("unknown log level: %v", level) - return - } - - fn(hwlog.DeepIncrease(ctx), sanitized, args...) - return - } - - c.acc.AddError(errors.New(fmt.Sprintf(sanitized, args...))) -} - -// LogfWithOptions print log info with options -func (c *telegrafLogger) logfWithOptions(ctx context.Context, level Level, opts LogOptions, format string, - args ...interface{}) { - - if opts.MaxCounts == 0 { - opts.MaxCounts = hwlog.ProblemOccurMaxNumbers - } - - if needPrint, extraErrLog := hwlog.IsNeedPrintWithSpecifiedCounts(opts.Domain, opts.ID, opts.MaxCounts); needPrint { - format = fmt.Sprintf("%s %s", format, extraErrLog) - c.logf(hwlog.DeepIncrease(ctx), level, format, args...) 
- } -} diff --git a/mind-cluster/component/npu-exporter/utils/utils.go b/mind-cluster/component/npu-exporter/utils/utils.go deleted file mode 100644 index b5da97c..0000000 --- a/mind-cluster/component/npu-exporter/utils/utils.go +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package utils for common utils -package utils - -import ( - "strings" - - "github.com/prometheus/client_golang/prometheus" -) - -// GetDescName parse metrics name from prometheus.Desc object -func GetDescName(desc *prometheus.Desc) string { - if desc == nil { - return "" - } - str := desc.String() - startIndex := strings.Index(str, "fqName: ") - if startIndex == -1 { - return "" - } - readfqName := str[startIndex+len("fqName: "):] - - endIndex := strings.Index(readfqName, ",") - if endIndex == -1 { - return "" - } - readfqName = readfqName[:endIndex] - - readfqName = strings.Trim(readfqName, "\"") - return readfqName -} - -// DoUpdateTelegraf update telegraf -func DoUpdateTelegraf(fieldMap map[string]interface{}, desc *prometheus.Desc, value interface{}, extInfo string) { - if fieldMap == nil { - return - } - fieldMap[GetDescName(desc)+extInfo] = value -} diff --git a/mind-cluster/component/npu-exporter/utils/utils_test.go b/mind-cluster/component/npu-exporter/utils/utils_test.go deleted file mode 100644 index 1a91d29..0000000 --- a/mind-cluster/component/npu-exporter/utils/utils_test.go +++ 
/dev/null @@ -1,103 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package utils for common utils -package utils - -import ( - "testing" - - "github.com/agiledragon/gomonkey/v2" - "github.com/prometheus/client_golang/prometheus" - "github.com/smartystreets/goconvey/convey" -) - -const ( - emptyString = "" - testMetricName = "test_metric" - testMetricName2 = "another_metric" - invalidDescStr = "invalid description" - noCommaDescStr = "fqName: test_metric" - normalDescStr = `fqName: "test_metric", help: "test help"` - normalDescStr2 = `fqName: another_metric, help: "another help"` - noQuoteDescStr = `fqName: test_metric, help: "test help"` - testHelp = "test help" -) - -func TestGetDescName(t *testing.T) { - convey.Convey("should return empty string when desc is nil", t, testGetDescNameNil) - convey.Convey("should return empty string when desc.String does not contain fqName prefix", t, - testGetDescNameNoFqName) - convey.Convey("should return empty string when desc.String does not contain comma", t, - testGetDescNameNoComma) - convey.Convey("should return metric name when desc.String contains valid format", t, - testGetDescNameValidFormat) -} - -func testGetDescNameNil() { - result := GetDescName(nil) - convey.So(result, convey.ShouldEqual, emptyString) -} - -func testGetDescNameNoFqName() { - desc := prometheus.NewDesc(testMetricName, testHelp, nil, nil) - patch := 
gomonkey.ApplyMethodReturn(desc, "String", invalidDescStr) - defer patch.Reset() - - result := GetDescName(desc) - convey.So(result, convey.ShouldEqual, emptyString) -} - -func testGetDescNameNoComma() { - desc := prometheus.NewDesc(testMetricName, testHelp, nil, nil) - patch := gomonkey.ApplyMethodReturn(desc, "String", noCommaDescStr) - defer patch.Reset() - - result := GetDescName(desc) - convey.So(result, convey.ShouldEqual, emptyString) -} - -func testGetDescNameValidFormat() { - testCases := []struct { - name string - descStr string - expected string - }{ - { - name: "should return metric name when desc.String contains normal format with quotes", - descStr: normalDescStr, - expected: testMetricName, - }, - { - name: "should return metric name when desc.String contains normal format without quotes", - descStr: noQuoteDescStr, - expected: testMetricName, - }, - { - name: "should return correct metric name when desc.String contains another metric", - descStr: normalDescStr2, - expected: testMetricName2, - }, - } - - for _, tc := range testCases { - desc := prometheus.NewDesc(testMetricName, testHelp, nil, nil) - patch := gomonkey.ApplyMethodReturn(desc, "String", tc.descStr) - - result := GetDescName(desc) - convey.So(result, convey.ShouldEqual, tc.expected) - - patch.Reset() - } -} diff --git a/mind-cluster/component/npu-exporter/versions/version.go b/mind-cluster/component/npu-exporter/versions/version.go deleted file mode 100644 index 63dba00..0000000 --- a/mind-cluster/component/npu-exporter/versions/version.go +++ /dev/null @@ -1,23 +0,0 @@ -/* Copyright(C) 2021. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package versions record the program version -package versions - -var ( - // BuildVersion record the program build version - BuildVersion string - // BuildName record the program build name - BuildName string -) From 9ffb4dfe9dab61f9bf59d900f90cdb609221909b Mon Sep 17 00:00:00 2001 From: daniel1210 <8622091+daniel1210@user.noreply.gitee.com> Date: Wed, 21 Jan 2026 15:29:06 +0800 Subject: [PATCH 04/10] add README Prerequisites Signed-off-by: daniel1210 <8622091+daniel1210@user.noreply.gitee.com> Signed-off-by: ashergaga <1214443299@qq.com> --- README.md | 4 ++++ README_cn.md | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/README.md b/README.md index 4c0dc48..6a669e2 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,10 @@ Memory slicing is supported based on virtualization template, lease available te [ascend-docker-runtime](https://gitcode.com/Ascend/mind-cluster/tree/master/component/ascend-docker-runtime) +```bash +git submodule add https://gitcode.com/Ascend/mind-cluster.git +``` + ## Compile ```bash diff --git a/README_cn.md b/README_cn.md index 156ca53..a4988a4 100644 --- a/README_cn.md +++ b/README_cn.md @@ -10,6 +10,11 @@ Ascend device plugin 是用来支持在 [HAMi](https://github.com/Project-HAMi/H 部署 [ascend-docker-runtime](https://gitcode.com/Ascend/mind-cluster/tree/master/component/ascend-docker-runtime) +克隆子模块 mind-cluster +```bash +git submodule add https://gitcode.com/Ascend/mind-cluster.git +``` + ## 编译 ```bash From 5d6e5da6e4f810449d4bb20f493949f6e7fef175 Mon Sep 17 00:00:00 2001 From: daniel1210 <8622091+daniel1210@user.noreply.gitee.com> 
Date: Tue, 20 Jan 2026 16:34:38 +0800 Subject: [PATCH 05/10] feat: upgrade mind-cluster dependency and support Ascend 910C SuperPod device registration Signed-off-by: daniel1210 <8622091+daniel1210@user.noreply.gitee.com> Signed-off-by: ashergaga <1214443299@qq.com> --- cmd/main.go | 13 +- go.mod | 4 +- internal/manager/manager.go | 7 +- internal/server/server.go | 11 +- .../component/ascend-common/README.md | 8 + .../ascend-common/api/ascend-operator/LICENSE | 201 ++ .../api/ascend-operator/README.md | 164 ++ .../apis/batch/v1/ascendjob_types.go | 85 + .../apis/batch/v1/constants.go | 53 + .../ascend-operator/apis/batch/v1/defaults.go | 137 + .../ascend-operator/apis/batch/v1/register.go | 52 + .../apis/batch/v1/zz_generated.deepcopy.go | 137 + .../apis/batch/v1/zz_generated.defaults.go | 53 + .../client/clientset/versioned/clientset.go | 114 + .../clientset/versioned/scheme/register.go | 39 + .../versioned/typed/batch/v1/client.go | 110 + .../clientset/versioned/typed/batch/v1/job.go | 221 ++ .../externalversions/batch/interface.go | 49 + .../externalversions/batch/v1/interface.go | 48 + .../externalversions/batch/v1/job.go | 99 + .../informers/externalversions/factory.go | 207 ++ .../informers/externalversions/generic.go | 71 + .../internalinterfaces/factory_interfaces.go | 40 + .../listers/batch/v1/expansion_generated.go | 26 + .../client/listers/batch/v1/job.go | 108 + .../component/ascend-common/api/consts.go | 222 ++ .../ascend-common/api/default_name.go | 188 ++ .../ascend-common/api/publicfault.go | 32 + .../ascend-common/api/slownet/fault_net.go | 77 + .../ascend-common/api/superpoddevice.go | 36 + .../component/ascend-common/api/type.go | 30 + .../common-utils/cache/lrucache.go | 394 +++ .../common-utils/cache/lrucache_test.go | 304 +++ .../ascend-common/common-utils/hwlog/api.go | 310 +++ .../common-utils/hwlog/api_test.go | 165 ++ .../common-utils/hwlog/hwlog_adaptor.go | 174 ++ .../common-utils/hwlog/hwlog_adaptor_test.go | 126 + 
.../common-utils/hwlog/log_limiter.go | 156 ++ .../common-utils/hwlog/logger.go | 242 ++ .../common-utils/hwlog/logger_test.go | 217 ++ .../ascend-common/common-utils/hwlog/rolog.go | 447 ++++ .../common-utils/hwlog/rolog_test.go | 687 +++++ .../ascend-common/common-utils/hwlog/types.go | 49 + .../ascend-common/common-utils/hwlog/utils.go | 98 + .../common-utils/hwlog/utils_test.go | 38 + .../common-utils/limiter/limit_handler.go | 226 ++ .../limiter/limit_handler_test.go | 119 + .../common-utils/limiter/limit_listener.go | 161 ++ .../limiter/limit_listener_test.go | 125 + .../common-utils/limiter/limit_writer.go | 64 + .../common-utils/limiter/limit_writer_test.go | 37 + .../common-utils/rand/rand_linux.go | 71 + .../common-utils/rand/rand_linux_test.go | 54 + .../ascend-common/common-utils/rand/random.go | 28 + .../common-utils/rand/random_test.go | 32 + .../ascend-common/common-utils/utils/env.go | 35 + .../common-utils/utils/env_test.go | 51 + .../ascend-common/common-utils/utils/file.go | 176 ++ .../common-utils/utils/file_check.go | 240 ++ .../common-utils/utils/file_check_test.go | 194 ++ .../common-utils/utils/file_test.go | 169 ++ .../common-utils/utils/file_watcher.go | 85 + .../common-utils/utils/file_watcher_test.go | 81 + .../common-utils/utils/interface.go | 29 + .../common-utils/utils/interface_test.go | 36 + .../common-utils/utils/ip_utils.go | 98 + .../common-utils/utils/ip_utils_test.go | 182 ++ .../ascend-common/common-utils/utils/path.go | 382 +++ .../common-utils/utils/path_test.go | 232 ++ .../common-utils/utils/pwd_util.go | 75 + .../common-utils/utils/pwd_util_test.go | 59 + .../ascend-common/common-utils/utils/slice.go | 129 + .../common-utils/utils/slice_test.go | 536 ++++ .../common-utils/utils/strings.go | 75 + .../common-utils/utils/strings_test.go | 84 + .../ascend-common/devmanager/a310mgr.go | 25 + .../ascend-common/devmanager/a310pmgr.go | 35 + .../ascend-common/devmanager/a910mgr.go | 31 + .../devmanager/common/constants.go | 272 
++ .../ascend-common/devmanager/common/types.go | 435 ++++ .../ascend-common/devmanager/common/utils.go | 305 +++ .../devmanager/common/utils_test.go | 163 ++ .../devmanager/dcmi/constants.go | 78 + .../ascend-common/devmanager/dcmi/dcmi.go | 2213 +++++++++++++++++ .../devmanager/dcmi/dcmi_interface_api.h | 596 +++++ .../ascend-common/devmanager/devmanager.go | 1197 +++++++++ .../devmanager/devmanager_910a3_mock.go | 30 + .../devmanager/devmanager_910a3_mock_err.go | 43 + .../devmanager/devmanager_hccs_test.go | 166 ++ .../devmanager/devmanager_mock.go | 370 +++ .../devmanager/devmanager_mock_err.go | 369 +++ .../devmanager/devmanager_test.go | 78 + .../devmanager/hccn/hccn_tool.go | 335 +++ .../devmanager/hccn/hccn_tool_test.go | 49 + mind-cluster/component/ascend-common/go.mod | 55 + mind-cluster/component/ascend-common/go.sum | 492 ++++ .../component/npu-exporter/.gitignore | 1 + mind-cluster/component/npu-exporter/LICENSE | 201 ++ mind-cluster/component/npu-exporter/README.md | 42 + .../component/npu-exporter/build/Dockerfile | 21 + .../npu-exporter/build/Dockerfile-310P-1usoc | 31 + .../component/npu-exporter/build/build.sh | 80 + .../component/npu-exporter/build/build_ch.sh | 74 + .../build/metricConfiguration.json | 13 + .../build/npu-exporter-310P-1usoc.yaml | 167 ++ .../npu-exporter/build/npu-exporter.yaml | 140 ++ .../build/pluginConfiguration.json | 4 + .../npu-exporter/build/run_for_310P_1usoc.sh | 32 + .../component/npu-exporter/build/test.sh | 75 + .../npu-exporter/cmd/npu-exporter/main.go | 545 ++++ .../common/collector_for_container.go | 109 + .../common/collector_for_container_test.go | 137 + .../collector/common/constants.go | 140 ++ .../collector/common/metrics_collector.go | 192 ++ .../common/metrics_collector_test.go | 231 ++ .../collector/common/npu_collector.go | 423 ++++ .../collector/common/npu_collector_test.go | 547 ++++ .../npu-exporter/collector/common/types.go | 50 + .../collector/config/metrics_config.go | 208 ++ 
.../collector/config/metrics_config_test.go | 216 ++ .../collector/container/isula/isula_api.pb.go | 870 +++++++ .../collector/container/isula/isula_api.proto | 118 + .../container/isula/isula_api_grpc.pb.go | 107 + .../container/isula/isula_container.go | 39 + .../collector/container/isula/isulad.pb.go | 278 +++ .../collector/container/isula/isulad.proto | 35 + .../container/isula/isulad_grpc.pb.go | 105 + .../collector/container/parser.go | 630 +++++ .../collector/container/parser_test.go | 1027 ++++++++ .../collector/container/runtime_ops.go | 413 +++ .../collector/container/runtime_ops_test.go | 568 +++++ .../npu-exporter/collector/container/utils.go | 133 + .../collector/container/utils_test.go | 329 +++ .../collector/container/v1/containerd.pb.go | 310 +++ .../collector/container/v1/containerd.proto | 62 + .../collector/container/v1/spec.go | 59 + .../collector/metrics/collector_for_ddr.go | 142 ++ .../collector/metrics/collector_for_hbm.go | 228 ++ .../metrics/collector_for_hbm_test.go | 115 + .../collector/metrics/collector_for_hccs.go | 312 +++ .../metrics/collector_for_hccs_test.go | 150 ++ .../metrics/collector_for_network.go | 190 ++ .../collector/metrics/collector_for_npu.go | 453 ++++ .../metrics/collector_for_optical.go | 200 ++ .../collector/metrics/collector_for_pcie.go | 234 ++ .../collector/metrics/collector_for_roce.go | 263 ++ .../collector/metrics/collector_for_sio.go | 120 + .../metrics/collector_for_version.go | 56 + .../collector/metrics/collector_for_vnpu.go | 169 ++ .../metrics/collector_for_vnpu_test.go | 202 ++ .../collector/metrics/collector_test.go | 548 ++++ .../collector/metrics/common_utils.go | 193 ++ .../collector/metrics/common_utils_test.go | 165 ++ .../collector/testdata/prometheus_metrics | 166 ++ .../collector/testdata/prometheus_metrics2 | 6 + mind-cluster/component/npu-exporter/go.mod | 63 + mind-cluster/component/npu-exporter/go.sum | 561 +++++ .../npu-exporter/platforms/inputs/all/npu.go | 20 + 
.../platforms/inputs/npu/README.md | 107 + .../npu-exporter/platforms/inputs/npu/npu.go | 104 + .../platforms/inputs/npu/npu_test.go | 174 ++ .../platforms/inputs/npu/sample.conf | 9 + .../platforms/prom/prometheus_collector.go | 103 + .../prom/prometheus_collector_test.go | 159 ++ .../component/npu-exporter/plugins/README.md | 388 +++ .../plugins/collector_for_text_file.go | 358 +++ .../npu-exporter/plugins/register.go | 21 + .../utils/logger/general_logger.go | 76 + .../npu-exporter/utils/logger/logger.go | 174 ++ .../npu-exporter/utils/logger/logger_test.go | 119 + .../utils/logger/telegraf_logger.go | 82 + .../component/npu-exporter/utils/utils.go | 52 + .../npu-exporter/utils/utils_test.go | 103 + .../npu-exporter/versions/version.go | 23 + 174 files changed, 32608 insertions(+), 13 deletions(-) create mode 100644 mind-cluster/component/ascend-common/README.md create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/LICENSE create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/README.md create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/ascendjob_types.go create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/constants.go create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/defaults.go create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/register.go create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/zz_generated.deepcopy.go create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/zz_generated.defaults.go create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/clientset.go create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/scheme/register.go create mode 100644 
mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/typed/batch/v1/client.go create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/typed/batch/v1/job.go create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/interface.go create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/v1/interface.go create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/v1/job.go create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/factory.go create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/generic.go create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/internalinterfaces/factory_interfaces.go create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/listers/batch/v1/expansion_generated.go create mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/listers/batch/v1/job.go create mode 100644 mind-cluster/component/ascend-common/api/consts.go create mode 100644 mind-cluster/component/ascend-common/api/default_name.go create mode 100644 mind-cluster/component/ascend-common/api/publicfault.go create mode 100644 mind-cluster/component/ascend-common/api/slownet/fault_net.go create mode 100644 mind-cluster/component/ascend-common/api/superpoddevice.go create mode 100644 mind-cluster/component/ascend-common/api/type.go create mode 100644 mind-cluster/component/ascend-common/common-utils/cache/lrucache.go create mode 100644 mind-cluster/component/ascend-common/common-utils/cache/lrucache_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/api.go create mode 100644 
mind-cluster/component/ascend-common/common-utils/hwlog/api_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/hwlog_adaptor.go create mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/hwlog_adaptor_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/log_limiter.go create mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/logger.go create mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/logger_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/rolog.go create mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/rolog_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/types.go create mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/utils.go create mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/utils_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/limiter/limit_handler.go create mode 100644 mind-cluster/component/ascend-common/common-utils/limiter/limit_handler_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/limiter/limit_listener.go create mode 100644 mind-cluster/component/ascend-common/common-utils/limiter/limit_listener_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/limiter/limit_writer.go create mode 100644 mind-cluster/component/ascend-common/common-utils/limiter/limit_writer_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/rand/rand_linux.go create mode 100644 mind-cluster/component/ascend-common/common-utils/rand/rand_linux_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/rand/random.go create mode 100644 mind-cluster/component/ascend-common/common-utils/rand/random_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/env.go create mode 100644 
mind-cluster/component/ascend-common/common-utils/utils/env_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/file.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/file_check.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/file_check_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/file_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/file_watcher.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/file_watcher_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/interface.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/interface_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/ip_utils.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/ip_utils_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/path.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/path_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/pwd_util.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/pwd_util_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/slice.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/slice_test.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/strings.go create mode 100644 mind-cluster/component/ascend-common/common-utils/utils/strings_test.go create mode 100644 mind-cluster/component/ascend-common/devmanager/a310mgr.go create mode 100644 mind-cluster/component/ascend-common/devmanager/a310pmgr.go create mode 100644 mind-cluster/component/ascend-common/devmanager/a910mgr.go create mode 100644 mind-cluster/component/ascend-common/devmanager/common/constants.go create 
mode 100644 mind-cluster/component/ascend-common/devmanager/common/types.go create mode 100644 mind-cluster/component/ascend-common/devmanager/common/utils.go create mode 100644 mind-cluster/component/ascend-common/devmanager/common/utils_test.go create mode 100644 mind-cluster/component/ascend-common/devmanager/dcmi/constants.go create mode 100644 mind-cluster/component/ascend-common/devmanager/dcmi/dcmi.go create mode 100644 mind-cluster/component/ascend-common/devmanager/dcmi/dcmi_interface_api.h create mode 100644 mind-cluster/component/ascend-common/devmanager/devmanager.go create mode 100644 mind-cluster/component/ascend-common/devmanager/devmanager_910a3_mock.go create mode 100644 mind-cluster/component/ascend-common/devmanager/devmanager_910a3_mock_err.go create mode 100644 mind-cluster/component/ascend-common/devmanager/devmanager_hccs_test.go create mode 100644 mind-cluster/component/ascend-common/devmanager/devmanager_mock.go create mode 100644 mind-cluster/component/ascend-common/devmanager/devmanager_mock_err.go create mode 100644 mind-cluster/component/ascend-common/devmanager/devmanager_test.go create mode 100644 mind-cluster/component/ascend-common/devmanager/hccn/hccn_tool.go create mode 100644 mind-cluster/component/ascend-common/devmanager/hccn/hccn_tool_test.go create mode 100644 mind-cluster/component/ascend-common/go.mod create mode 100644 mind-cluster/component/ascend-common/go.sum create mode 100644 mind-cluster/component/npu-exporter/.gitignore create mode 100644 mind-cluster/component/npu-exporter/LICENSE create mode 100644 mind-cluster/component/npu-exporter/README.md create mode 100644 mind-cluster/component/npu-exporter/build/Dockerfile create mode 100644 mind-cluster/component/npu-exporter/build/Dockerfile-310P-1usoc create mode 100644 mind-cluster/component/npu-exporter/build/build.sh create mode 100644 mind-cluster/component/npu-exporter/build/build_ch.sh create mode 100644 
mind-cluster/component/npu-exporter/build/metricConfiguration.json create mode 100644 mind-cluster/component/npu-exporter/build/npu-exporter-310P-1usoc.yaml create mode 100644 mind-cluster/component/npu-exporter/build/npu-exporter.yaml create mode 100644 mind-cluster/component/npu-exporter/build/pluginConfiguration.json create mode 100644 mind-cluster/component/npu-exporter/build/run_for_310P_1usoc.sh create mode 100644 mind-cluster/component/npu-exporter/build/test.sh create mode 100644 mind-cluster/component/npu-exporter/cmd/npu-exporter/main.go create mode 100644 mind-cluster/component/npu-exporter/collector/common/collector_for_container.go create mode 100644 mind-cluster/component/npu-exporter/collector/common/collector_for_container_test.go create mode 100644 mind-cluster/component/npu-exporter/collector/common/constants.go create mode 100644 mind-cluster/component/npu-exporter/collector/common/metrics_collector.go create mode 100644 mind-cluster/component/npu-exporter/collector/common/metrics_collector_test.go create mode 100644 mind-cluster/component/npu-exporter/collector/common/npu_collector.go create mode 100644 mind-cluster/component/npu-exporter/collector/common/npu_collector_test.go create mode 100644 mind-cluster/component/npu-exporter/collector/common/types.go create mode 100644 mind-cluster/component/npu-exporter/collector/config/metrics_config.go create mode 100644 mind-cluster/component/npu-exporter/collector/config/metrics_config_test.go create mode 100644 mind-cluster/component/npu-exporter/collector/container/isula/isula_api.pb.go create mode 100644 mind-cluster/component/npu-exporter/collector/container/isula/isula_api.proto create mode 100644 mind-cluster/component/npu-exporter/collector/container/isula/isula_api_grpc.pb.go create mode 100644 mind-cluster/component/npu-exporter/collector/container/isula/isula_container.go create mode 100644 mind-cluster/component/npu-exporter/collector/container/isula/isulad.pb.go create mode 100644 
mind-cluster/component/npu-exporter/collector/container/isula/isulad.proto create mode 100644 mind-cluster/component/npu-exporter/collector/container/isula/isulad_grpc.pb.go create mode 100644 mind-cluster/component/npu-exporter/collector/container/parser.go create mode 100644 mind-cluster/component/npu-exporter/collector/container/parser_test.go create mode 100644 mind-cluster/component/npu-exporter/collector/container/runtime_ops.go create mode 100644 mind-cluster/component/npu-exporter/collector/container/runtime_ops_test.go create mode 100644 mind-cluster/component/npu-exporter/collector/container/utils.go create mode 100644 mind-cluster/component/npu-exporter/collector/container/utils_test.go create mode 100644 mind-cluster/component/npu-exporter/collector/container/v1/containerd.pb.go create mode 100644 mind-cluster/component/npu-exporter/collector/container/v1/containerd.proto create mode 100644 mind-cluster/component/npu-exporter/collector/container/v1/spec.go create mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_ddr.go create mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_hbm.go create mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_hbm_test.go create mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_hccs.go create mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_hccs_test.go create mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_network.go create mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_npu.go create mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_optical.go create mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_pcie.go create mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_roce.go create mode 100644 
mind-cluster/component/npu-exporter/collector/metrics/collector_for_sio.go create mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_version.go create mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_vnpu.go create mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_vnpu_test.go create mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_test.go create mode 100644 mind-cluster/component/npu-exporter/collector/metrics/common_utils.go create mode 100644 mind-cluster/component/npu-exporter/collector/metrics/common_utils_test.go create mode 100644 mind-cluster/component/npu-exporter/collector/testdata/prometheus_metrics create mode 100644 mind-cluster/component/npu-exporter/collector/testdata/prometheus_metrics2 create mode 100644 mind-cluster/component/npu-exporter/go.mod create mode 100644 mind-cluster/component/npu-exporter/go.sum create mode 100644 mind-cluster/component/npu-exporter/platforms/inputs/all/npu.go create mode 100644 mind-cluster/component/npu-exporter/platforms/inputs/npu/README.md create mode 100644 mind-cluster/component/npu-exporter/platforms/inputs/npu/npu.go create mode 100644 mind-cluster/component/npu-exporter/platforms/inputs/npu/npu_test.go create mode 100644 mind-cluster/component/npu-exporter/platforms/inputs/npu/sample.conf create mode 100644 mind-cluster/component/npu-exporter/platforms/prom/prometheus_collector.go create mode 100644 mind-cluster/component/npu-exporter/platforms/prom/prometheus_collector_test.go create mode 100644 mind-cluster/component/npu-exporter/plugins/README.md create mode 100644 mind-cluster/component/npu-exporter/plugins/collector_for_text_file.go create mode 100644 mind-cluster/component/npu-exporter/plugins/register.go create mode 100644 mind-cluster/component/npu-exporter/utils/logger/general_logger.go create mode 100644 mind-cluster/component/npu-exporter/utils/logger/logger.go create mode 100644 
mind-cluster/component/npu-exporter/utils/logger/logger_test.go create mode 100644 mind-cluster/component/npu-exporter/utils/logger/telegraf_logger.go create mode 100644 mind-cluster/component/npu-exporter/utils/utils.go create mode 100644 mind-cluster/component/npu-exporter/utils/utils_test.go create mode 100644 mind-cluster/component/npu-exporter/versions/version.go diff --git a/cmd/main.go b/cmd/main.go index c77c244..b233829 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -17,7 +17,6 @@ package main import ( - "context" "flag" "fmt" "os" @@ -29,7 +28,7 @@ import ( "github.com/Project-HAMi/ascend-device-plugin/internal/server" "github.com/Project-HAMi/ascend-device-plugin/version" "github.com/fsnotify/fsnotify" - "huawei.com/npu-exporter/v6/common-utils/hwlog" + "huawei.com/npu-exporter/utils/logger" "k8s.io/klog/v2" "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" ) @@ -117,11 +116,11 @@ func main() { checkFlags() klog.Infof("version: %s", version.GetVersion()) klog.Infof("using config file: %s", *configFile) - config := &hwlog.LogConfig{ - OnlyToStdout: true, - LogLevel: *hwLoglevel, - } - err := hwlog.InitRunLogger(config, context.Background()) + + logger.HwLogConfig.OnlyToStdout = true + logger.HwLogConfig.LogLevel = *hwLoglevel + + err := logger.InitLogger("Prometheus") if err != nil { klog.Fatalf("init huawei run logger failed, %v", err) } diff --git a/go.mod b/go.mod index 86c9b48..6ea6f74 100644 --- a/go.mod +++ b/go.mod @@ -3,6 +3,7 @@ module github.com/Project-HAMi/ascend-device-plugin go 1.22.2 require ( + ascend-common v0.0.0 github.com/Project-HAMi/HAMi v0.0.0 github.com/fsnotify/fsnotify v1.7.0 google.golang.org/grpc v1.63.2 @@ -57,6 +58,7 @@ require ( ) replace ( + ascend-common => ./mind-cluster/component/ascend-common github.com/Project-HAMi/HAMi v0.0.0 => github.com/Project-HAMi/HAMi v0.0.0-20250901013025-61c6cbe7d480 - huawei.com/npu-exporter/v6 => gitee.com/ascend/ascend-npu-exporter/v6 v6.0.0-RC3 + huawei.com/npu-exporter => 
./mind-cluster/component/npu-exporter ) diff --git a/internal/manager/manager.go b/internal/manager/manager.go index db92bd8..d070ea9 100644 --- a/internal/manager/manager.go +++ b/internal/manager/manager.go @@ -20,9 +20,10 @@ import ( "fmt" "sort" + "ascend-common/devmanager" + "ascend-common/devmanager/dcmi" + "github.com/Project-HAMi/ascend-device-plugin/internal" - "huawei.com/npu-exporter/v6/devmanager" - "huawei.com/npu-exporter/v6/devmanager/dcmi" "k8s.io/klog/v2" ) @@ -45,7 +46,7 @@ type AscendManager struct { } func NewAscendManager() (*AscendManager, error) { - mgr, err := devmanager.AutoInit("") + mgr, err := devmanager.AutoInit("", 30) if err != nil { return nil, err } diff --git a/internal/server/server.go b/internal/server/server.go index 67dcb8d..22f0927 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -44,6 +44,7 @@ const ( // PodAllocAnno = "huawei.com/AscendDevices" NodeLockAscend = "hami.io/mutex.lock" Ascend910Prefix = "Ascend910" + Ascend910CType = "Ascend910C" ) var ( @@ -191,10 +192,16 @@ func (ps *PluginServer) registerKubelet() error { return nil } -func (ps *PluginServer) getDeviceNetworkID(idx int) (int, error) { +func (ps *PluginServer) getDeviceNetworkID(idx int, deviceType string) (int, error) { + // For Ascend910C devices, all modules (dies) are interconnected via HCCS + if deviceType == Ascend910CType { + return 0, nil + } + if idx > 3 { return 1, nil } + return 0, nil } @@ -214,7 +221,7 @@ func (ps *PluginServer) registerHAMi() error { Health: dev.Health, } if strings.HasPrefix(device.Type, Ascend910Prefix) { - NetworkID, err := ps.getDeviceNetworkID(i) + NetworkID, err := ps.getDeviceNetworkID(i, device.Type) if err != nil { return fmt.Errorf("get networkID error: %v", err) } diff --git a/mind-cluster/component/ascend-common/README.md b/mind-cluster/component/ascend-common/README.md new file mode 100644 index 0000000..fa7f1b8 --- /dev/null +++ b/mind-cluster/component/ascend-common/README.md @@ -0,0 +1,8 
@@ +# AscendCommon + +# 组件介绍 +提供公共代码给其他组件使用,组件包括NPU-Exporter等。 + +# 说明 + +1. 编译NPU-Exporter等组件时,AscendCommon要放在同一目录下 \ No newline at end of file diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/LICENSE b/mind-cluster/component/ascend-common/api/ascend-operator/LICENSE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/mind-cluster/component/ascend-common/api/ascend-operator/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/README.md b/mind-cluster/component/ascend-common/api/ascend-operator/README.md new file mode 100644 index 0000000..20c2f61 --- /dev/null +++ b/mind-cluster/component/ascend-common/api/ascend-operator/README.md @@ -0,0 +1,164 @@ +# ascend-operator-apis + +## 介绍 + +ascend-operator-apis旨在为用户提供AscendJob API,及其Clientsets, Listers、Informers。使用户能轻松对AscendJob进行CRUD操作。 + +## 接口说明 + +1. 创建clientsets + + ```go + NewForConfig(c *rest.Config)(*Clientset, error) + ``` + + | Parameters | Input/Output | Parameter Type | Description | + | ---------- | ------------ | -------------- | ------------------------------------------------------------ | + | c | Input | *rest.Config | 客户端配置文件,由k8s提供的接口生成。包括cluster host、证书等信息 | + | - | Output | *Clientset | Client集合,包括AscendJob client和discovery client | + | - | Output | error | 错误信息 | + +2. 创建AscendJob + + ```go + Create(ctx context.Context, job *v1.AscendJob, opts metav1.CreateOptions)(*v1.AscendJob, error) + ``` + + | Parameters | Input/Output | Parameter Type | Description | + | ---------- | ------------ | -------------------- | ----------------- | + | ctx | Input | context.Context | 上下文,协程控制 | + | job | Input | *v1.AscendJob | AscendJob对象指针 | + | opts | Input | metav1.CreateOptions | 创建选项 | + | - | Output | *v1.AscendJob | AscendJob对象指针 | + | - | Output | error | 错误信息 | + +3.
获取AscendJob + + ```go + Get(ctx context.Context, name string, opts metav1.GetOptions)(*v1.AscendJob, error) + ``` + + | Parameters | Input/Output | Parameter Type | Description | + | ---------- | ------------ | -------------------- | ----------------- | + | ctx | Input | context.Context | 上下文,协程控制 | + | name | Input | string | AscendJob名称 | + | opts | Input | metav1.GetOptions | 获取选项 | + | - | Output | *v1.AscendJob | AscendJob对象指针 | + | - | Output | error | 错误信息 | + +4. 列举AscendJob + + ```go + List(ctx context.Context, opts metav1.ListOptions)(*v1.AscendJobList, error) + ``` + + | Parameters | Input/Output | Parameter Type | Description | + | ---------- | ------------ | -------------------- | --------------------- | + | ctx | Input | context.Context | 上下文,协程控制 | + | opts | Input | metav1.ListOptions | 列举选项 | + | - | Output | *v1.AscendJobList | AscendJobList对象指针 | + | - | Output | error | 错误信息 | + +5. 观察AscendJob + + ```go + Watch(ctx context.Context, opts metav1.ListOptions)(watch.Interface, error) + ``` + + | Parameters | Input/Output | Parameter Type | Description | + | ---------- | ------------ | ------------------ | ---------------- | + | ctx | Input | context.Context | 上下文,协程控制 | + | opts | Input | metav1.ListOptions | 列举选项 | + | - | Output | watch.Interface | watch类接口 | + | - | Output | error | 错误信息 | + +6. 更新AscendJob + + ```go + Update(ctx context.Context, job *v1.AscendJob, opts metav1.UpdateOptions)(*v1.AscendJob, error) + ``` + + | Parameters | Input/Output | Parameter Type | Description | + | ---------- | ------------ | -------------------- | ----------------- | + | ctx | Input | context.Context | 上下文,协程控制 | + | job | Input | *v1.AscendJob | AscendJob对象指针 | + | opts | Input | metav1.UpdateOptions | 更新选项 | + | - | Output | *v1.AscendJob | AscendJob对象指针 | + | - | Output | error | 错误信息 | + +7.
更新AscendJob状态 + + ```go + UpdateStatus(ctx context.Context, job *v1.AscendJob, opts metav1.UpdateOptions)(*v1.AscendJob, error) + ``` + + | Parameters | Input/Output | Parameter Type | Description | + | ---------- | ------------ | -------------------- | ----------------- | + | ctx | Input | context.Context | 上下文,协程控制 | + | job | Input | *v1.AscendJob | AscendJob对象指针 | + | opts | Input | metav1.UpdateOptions | 更新选项 | + | - | Output | *v1.AscendJob | AscendJob对象指针 | + | - | Output | error | 错误信息 | + +8. 补丁AscendJob + + ```go + Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions, subresources ...string) (*v1.AscendJob, error) + ``` + + | Parameters | Input/Output | Parameter Type | Description | + | ------------ | ------------ | --------------- | ----------------- | + | ctx | Input | context.Context | 上下文,协程控制 | + | name | Input | string | AscendJob名称 | + | pt | Input | types.PatchType | patch类型 | + | data | Input | []byte | patch信息 | + | subresources | Input | ...string | 子信息 | + | - | Output | *v1.AscendJob | AscendJob对象指针 | + | - | Output | error | 错误信息 | + +9. 删除AscendJob + + ```go + Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error + ``` + + | Parameters | Input/Output | Parameter Type | Description | + | ---------- | ------------ | -------------------- | ---------------- | + | ctx | Input | context.Context | 上下文,协程控制 | + | name | Input | string | AscendJob名称 | + | opts | Input | metav1.DeleteOptions | 删除选项 | + | - | Output | error | 错误信息 | + +10. 批量删除AscendJob + + ```go + DeleteCollection(ctx context.Context,opts metav1.DeleteOptions, listOpts metav1.ListOptions) error + ``` + + | Parameters | Input/Output | Parameter Type | Description | + | ---------- | ------------ | -------------------- | ---------------- | + | ctx | Input | context.Context | 上下文,协程控制 | + | opts | Input | metav1.DeleteOptions | 删除选项 | + | listOpts | Input | metav1.ListOptions | 列举选项 | + | - | Output | error | 错误信息 | + +11. 
创建informerFactory + + ```go + NewSharedInformerFactory(client versioned.Interface, defaultResync time.Duration) sharedInformerFactory + ``` + + | Parameters | Input/Output | Parameter Type | Description | + | ------------- | ------------ | --------------------- | ------------------ | + | client | Input | versioned.Interface | client类接口 | + | defaultResync | Input | time.Duration | 默认的重新同步时间 | + | - | Output | sharedInformerFactory | informer类接口 | + +12. 创建informer + + ```go + sharedInformerFactory.Batch().V1().Jobs().Informer() + ``` + + + diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/ascendjob_types.go b/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/ascendjob_types.go new file mode 100644 index 0000000..7bd1d65 --- /dev/null +++ b/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/ascendjob_types.go @@ -0,0 +1,85 @@ +/* +Copyright 2023 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package v1 is used to define AscendJob object and its initialization. +package v1 + +import ( + commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// AscendJob is the Schema for the AscendJob API +type AscendJob struct { + // Standard Kubernetes type metadata. + metav1.TypeMeta `json:",inline"` + + // +optional + metav1.ObjectMeta `json:"metadata,omitempty"` + + // Specification of the desired state of the AscendJob. 
+ // +optional + Spec AscendJobSpec `json:"spec,omitempty"` + + // Most recently observed status of the AscendJob. + // Populated by the system. + // Read-only. + // +optional + Status commonv1.JobStatus `json:"status,omitempty"` +} + +// AscendJobSpec defines the desired state of AscendJob +type AscendJobSpec struct { + // RunPolicy encapsulates various runtime policies of the distributed training + // job, for example how to clean up resources and how long the job can stay + // active. + // +kubebuilder:validation:Optional + RunPolicy commonv1.RunPolicy `json:"runPolicy"` + + // SuccessPolicy defines the policy to mark the AscendJob as succeeded. + // Default to "", using the default rules. + // +optional + SuccessPolicy *SuccessPolicy `json:"successPolicy,omitempty"` + + // SchedulerName defines the job scheduler with gang-scheduling enabled + SchedulerName string `json:"schedulerName,omitempty"` + + /* A map of ReplicaType (type) to ReplicaSpec (value). Specifies the ML cluster configuration. + For example, + { + "Scheduler": ReplicaSpec, + "Worker": ReplicaSpec, + } + */ + ReplicaSpecs map[commonv1.ReplicaType]*commonv1.ReplicaSpec `json:"replicaSpecs"` +} + +// AscendJobList contains a list of AscendJob +type AscendJobList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []AscendJob `json:"items"` +} + +// SuccessPolicy is the success policy.
+type SuccessPolicy string + +const ( + // SuccessPolicyDefault is the default policy of success + SuccessPolicyDefault SuccessPolicy = "" + // SuccessPolicyAllWorkers is the 'AllWorkers' policy of success + SuccessPolicyAllWorkers SuccessPolicy = "AllWorkers" +) diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/constants.go b/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/constants.go new file mode 100644 index 0000000..9341682 --- /dev/null +++ b/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/constants.go @@ -0,0 +1,53 @@ +/* +Copyright 2023 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1 + +import ( + "github.com/kubeflow/common/pkg/apis/common/v1" +) + +const ( + // GroupName is the group name used in this package. + GroupName = "mindxdl.gitee.com" + + // FrameworkKey is the key of the label + FrameworkKey = "framework" + + // DefaultPort is default value of the port.
+ DefaultPort = 2222 + + // MindSporeFrameworkName is the name of ML Framework + MindSporeFrameworkName = "mindspore" + // MindSporeReplicaTypeScheduler is the type for Scheduler of distribute ML + MindSporeReplicaTypeScheduler v1.ReplicaType = "Scheduler" + + // PytorchFrameworkName is the name of ML Framework + PytorchFrameworkName = "pytorch" + // PytorchReplicaTypeMaster is the type for Scheduler of distribute ML + PytorchReplicaTypeMaster v1.ReplicaType = "Master" + + // TensorflowFrameworkName is the name of ML Framework + TensorflowFrameworkName = "tensorflow" + // TensorflowReplicaTypeChief is the type for Scheduler of distribute ML + TensorflowReplicaTypeChief v1.ReplicaType = "Chief" + + // ReplicaTypeWorker this is also used for non-distributed AscendJob + ReplicaTypeWorker v1.ReplicaType = "Worker" + + // DefaultRestartPolicy is default RestartPolicy for MSReplicaSpec. + DefaultRestartPolicy = v1.RestartPolicyNever +) diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/defaults.go b/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/defaults.go new file mode 100644 index 0000000..4d5c124 --- /dev/null +++ b/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/defaults.go @@ -0,0 +1,137 @@ +/* +Copyright 2023 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package v1 + +import ( + "errors" + "fmt" + "strings" + + commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" + "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/runtime" + + "ascend-common/api" +) + +// Int32 is a helper routine that allocates a new int32 value +// to store v and returns a pointer to it. +func Int32(v int32) *int32 { + return &v +} + +// addDefaultingFuncs is used to register default funcs +func addDefaultingFuncs(scheme *runtime.Scheme) error { + return RegisterDefaults(scheme) +} + +// setDefaultPort sets the default ports for mindxdl container. +func setDefaultPort(spec *v1.PodSpec) { + index := 0 + for i, container := range spec.Containers { + if container.Name == api.DefaultContainerName { + index = i + break + + } + } + hasASJobPort := false + for _, port := range spec.Containers[index].Ports { + if port.Name == api.DefaultPortName { + hasASJobPort = true + break + } + } + if !hasASJobPort { + spec.Containers[index].Ports = append(spec.Containers[index].Ports, v1.ContainerPort{ + Name: api.DefaultPortName, + ContainerPort: DefaultPort, + }) + } +} + +func setDefaultReplicas(spec *commonv1.ReplicaSpec) { + if spec.Replicas == nil { + spec.Replicas = Int32(1) + } + if spec.RestartPolicy == "" { + spec.RestartPolicy = DefaultRestartPolicy + } +} + +// setTypeNamesToCamelCase sets the name of all replica types from any case to correct case. +func setTypeNamesToCamelCase(job *AscendJob) { + setTypeNameToCamelCase(job, MindSporeReplicaTypeScheduler) + setTypeNameToCamelCase(job, ReplicaTypeWorker) + setTypeNameToCamelCase(job, PytorchReplicaTypeMaster) + setTypeNameToCamelCase(job, TensorflowReplicaTypeChief) +} + +// setTypeNameToCamelCase sets the name of the replica type from any case to correct case. +// E.g. from ps to PS; from WORKER to Worker. 
+func setTypeNameToCamelCase(job *AscendJob, typ commonv1.ReplicaType) { + for t := range job.Spec.ReplicaSpecs { + if strings.EqualFold(string(t), string(typ)) && t != typ { + spec := job.Spec.ReplicaSpecs[t] + delete(job.Spec.ReplicaSpecs, t) + job.Spec.ReplicaSpecs[typ] = spec + return + } + } +} + +// SetDefaultsAscendJob sets any unspecified values to defaults. +func SetDefaultsAscendJob(job *AscendJob) { + // Set default cleanpod policy to None. + if job == nil { + return + } + + if job.Spec.RunPolicy.CleanPodPolicy == nil { + running := commonv1.CleanPodPolicyNone + job.Spec.RunPolicy.CleanPodPolicy = &running + } + // Set default success policy to "". + if job.Spec.SuccessPolicy == nil { + defaultPolicy := SuccessPolicyDefault + job.Spec.SuccessPolicy = &defaultPolicy + } + + // Update the key of replicaSpecs to camel case. + setTypeNamesToCamelCase(job) + + for rt, spec := range job.Spec.ReplicaSpecs { + // Set default replicas to 1. + setDefaultReplicas(spec) + // Set default port to ml container. + if rt == MindSporeReplicaTypeScheduler || rt == PytorchReplicaTypeMaster || rt == TensorflowReplicaTypeChief { + setDefaultPort(&spec.Template.Spec) + } + } +} + +// GetJobFramework gets the framework name of the AscendJob from its labels +func GetJobFramework(job *AscendJob) (string, error) { + if job == nil || job.Labels == nil { + return "", errors.New("job or job labels is nil") + } + frame, ok := job.Labels[FrameworkKey] + if !ok { + return "", fmt.Errorf("job<%s-%s> label framework is not set", job.Namespace, job.Name) + } + return frame, nil +} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/register.go b/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/register.go new file mode 100644 index 0000000..5813e39 --- /dev/null +++ b/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/register.go @@ -0,0 +1,52 @@ +/* +Copyright 2023 Huawei Technologies Co., Ltd.
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1 + +import ( + "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" +) + +var ( + // SchemeGroupVersion is the group version used to register these objects. + SchemeGroupVersion = schema.GroupVersion{Group: GroupName, Version: "v1"} + // SchemeBuilder points to a list of functions added to Scheme. + SchemeBuilder = runtime.NewSchemeBuilder(addKnownTypes) + // AddToScheme adds the types in this group-version to the given scheme. + AddToScheme = SchemeBuilder.AddToScheme +) + +// Resource takes an unqualified resource and returns a Group-qualified GroupResource. +func Resource(resource string) schema.GroupResource { + return SchemeGroupVersion.WithResource(resource).GroupResource() +} + +// addKnownTypes adds the set of types defined in this package to the supplied scheme. 
+func addKnownTypes(scheme *runtime.Scheme) error { + scheme.AddKnownTypes(SchemeGroupVersion, + &AscendJob{}, + &AscendJobList{}, + ) + + v1.AddToGroupVersion(scheme, SchemeGroupVersion) + return nil +} + +func init() { + SchemeBuilder.Register(addDefaultingFuncs) +} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/zz_generated.deepcopy.go b/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/zz_generated.deepcopy.go new file mode 100644 index 0000000..695038b --- /dev/null +++ b/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/zz_generated.deepcopy.go @@ -0,0 +1,137 @@ +//go:build !ignore_autogenerated +// +build !ignore_autogenerated + +/* +Copyright 2023 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by controller-gen. DO NOT EDIT. + +package v1 + +import ( + commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" + "k8s.io/apimachinery/pkg/runtime" +) + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. A nil in is a no-op. +func (in *AscendJob) DeepCopyInto(out *AscendJob) { + if in == nil { + return + } + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AscendJob. 
+func (in *AscendJob) DeepCopy() *AscendJob { + if in == nil { + return nil + } + out := new(AscendJob) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *AscendJob) DeepCopyObject() runtime.Object { + if in == nil { + return nil + } + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. A nil in is a no-op. +func (in *AscendJobList) DeepCopyInto(out *AscendJobList) { + if in == nil { + return + } + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]AscendJob, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AscendJobList. +func (in *AscendJobList) DeepCopy() *AscendJobList { + if in == nil { + return nil + } + out := new(AscendJobList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *AscendJobList) DeepCopyObject() runtime.Object { + if in == nil { + return nil + } + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *AscendJobSpec) DeepCopyInto(out *AscendJobSpec) { + if in == nil { + return + } + *out = *in + in.RunPolicy.DeepCopyInto(&out.RunPolicy) + if in.SuccessPolicy != nil { + in, out := &in.SuccessPolicy, &out.SuccessPolicy + *out = new(SuccessPolicy) + **out = **in + } + if in.ReplicaSpecs != nil { + in, out := &in.ReplicaSpecs, &out.ReplicaSpecs + *out = make(map[commonv1.ReplicaType]*commonv1.ReplicaSpec, len(*in)) + for key, val := range *in { + var outVal *commonv1.ReplicaSpec + if val == nil { + (*out)[key] = nil + } else { + in, out := &val, &outVal + *out = new(commonv1.ReplicaSpec) + (*in).DeepCopyInto(*out) + } + (*out)[key] = outVal + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AscendJobSpec. +func (in *AscendJobSpec) DeepCopy() *AscendJobSpec { + if in == nil { + return nil + } + out := new(AscendJobSpec) + in.DeepCopyInto(out) + return out +} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/zz_generated.defaults.go b/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/zz_generated.defaults.go new file mode 100644 index 0000000..e9b774a --- /dev/null +++ b/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/zz_generated.defaults.go @@ -0,0 +1,53 @@ +//go:build !ignore_autogenerated +// +build !ignore_autogenerated + +/* +Copyright 2023 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +// Code generated by defaulter-gen. DO NOT EDIT. + +package v1 + +import ( + "errors" + + runtime "k8s.io/apimachinery/pkg/runtime" +) + +// RegisterDefaults adds defaulters functions to the given scheme. +// Public to allow building arbitrary schemes. +// All generated defaulters are covering - they call all nested defaulters. +func RegisterDefaults(scheme *runtime.Scheme) error { + if scheme == nil { + return errors.New("scheme is nil") + } + scheme.AddTypeDefaultingFunc(&AscendJob{}, func(obj interface{}) { SetObjectDefaults_AscendJob(obj.(*AscendJob)) }) + scheme.AddTypeDefaultingFunc(&AscendJobList{}, func(obj interface{}) { SetObjectDefaults_AscendJobList(obj.(*AscendJobList)) }) + return nil +} + +func SetObjectDefaults_AscendJob(in *AscendJob) { + SetDefaultsAscendJob(in) +} + +func SetObjectDefaults_AscendJobList(in *AscendJobList) { + if in == nil { + return + } + for i := range in.Items { + a := &in.Items[i] + SetObjectDefaults_AscendJob(a) + } +} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/clientset.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/clientset.go new file mode 100644 index 0000000..0d4add4 --- /dev/null +++ b/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/clientset.go @@ -0,0 +1,114 @@ +/* +Copyright 2023 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +// Package versioned is used to define the ClientSet interface and struct, and its initialization. +package versioned + +import ( + "fmt" + "net/http" + + "k8s.io/client-go/discovery" + "k8s.io/client-go/rest" + "k8s.io/client-go/util/flowcontrol" + + "ascend-common/api/ascend-operator/client/clientset/versioned/typed/batch/v1" +) + +// Interface is the interface definition for Clientset. +type Interface interface { + Discovery() discovery.DiscoveryInterface + BatchV1() v1.BatchV1Interface +} + +// Clientset contains the clients for groups. Each group has exactly one +// version included in a Clientset. +type Clientset struct { + *discovery.DiscoveryClient + batchV1 *v1.BatchV1Client +} + +// BatchV1 retrieves the BatchV1Client +func (c *Clientset) BatchV1() v1.BatchV1Interface { + if c == nil { + return nil + } + return c.batchV1 +} + +// Discovery retrieves the DiscoveryClient +func (c *Clientset) Discovery() discovery.DiscoveryInterface { + if c == nil { + return nil + } + return c.DiscoveryClient +} + +// NewForConfig creates a new Clientset for the given config; a nil config yields an error. +// If config's RateLimiter is not set and QPS and Burst are acceptable, +// NewForConfigAndClient will generate a rate-limiter on its own shallow copy of c. +// NewForConfig is equivalent to NewForConfigAndClient(c, httpClient), +// where httpClient was generated with rest.HTTPClientFor(c). +func NewForConfig(c *rest.Config) (*Clientset, error) { + if c == nil { + return nil, fmt.Errorf("nil pointer") + } + // share the transport between all clients + httpClient, err := rest.HTTPClientFor(c) + if err != nil { + return nil, err + } + return NewForConfigAndClient(c, httpClient) +} + +// NewForConfigAndClient creates a new Clientset for the given config and http client. +// Note the http client provided takes precedence over the configured transport values. +// If config's RateLimiter is not set and QPS and Burst are acceptable, +// NewForConfigAndClient will generate a rate-limiter in configShallowCopy. 
+func NewForConfigAndClient(c *rest.Config, httpClient *http.Client) (*Clientset, error) { + if c == nil || httpClient == nil { + return nil, fmt.Errorf("nil pointer") + } + configShallowCopy := *c + if configShallowCopy.RateLimiter == nil && configShallowCopy.QPS > 0 { + if configShallowCopy.Burst <= 0 { + return nil, fmt.Errorf("burst is required to be greater than 0 " + + "when RateLimiter is not set and QPS is set to greater than 0") + } + configShallowCopy.RateLimiter = flowcontrol.NewTokenBucketRateLimiter(configShallowCopy.QPS, configShallowCopy.Burst) + } + + var cs Clientset + var err error + cs.batchV1, err = v1.NewForConfigAndClient(&configShallowCopy, httpClient) + if err != nil { + return nil, err + } + cs.DiscoveryClient, err = discovery.NewDiscoveryClientForConfigAndClient(&configShallowCopy, httpClient) + if err != nil { + return nil, err + } + return &cs, nil +} + +// New creates a new Clientset for the given RESTClient. +func New(c rest.Interface) *Clientset { + var cs Clientset + cs.batchV1 = v1.New(c) + + cs.DiscoveryClient = discovery.NewDiscoveryClient(c) + return &cs +} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/scheme/register.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/scheme/register.go new file mode 100644 index 0000000..58a99b0 --- /dev/null +++ b/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/scheme/register.go @@ -0,0 +1,39 @@ +/* +Copyright 2023 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package scheme is used to add runtime.Scheme +package scheme + +import ( + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/serializer" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + + "ascend-common/api/ascend-operator/apis/batch/v1" +) + +// RuntimeScheme is a Scheme object instance. +var RuntimeScheme = runtime.NewScheme() + +// Codecs is a CodecFactory object instance. +var Codecs = serializer.NewCodecFactory(RuntimeScheme) + +// ParameterCodec is a parameterCodec object instance. +var ParameterCodec = runtime.NewParameterCodec(RuntimeScheme) + +func init() { + utilruntime.Must(v1.AddToScheme(RuntimeScheme)) +} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/typed/batch/v1/client.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/typed/batch/v1/client.go new file mode 100644 index 0000000..7dd8264 --- /dev/null +++ b/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/typed/batch/v1/client.go @@ -0,0 +1,110 @@ +/* +Copyright 2023 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package v1 is used to define some client- and job-related interfaces, initialization operations, +// and method implementations. 
+package v1 + +import ( + "errors" + "net/http" + + "k8s.io/client-go/rest" + + "ascend-common/api/ascend-operator/apis/batch/v1" + "ascend-common/api/ascend-operator/client/clientset/versioned/scheme" +) + +// BatchV1Interface is a batch client interface. +type BatchV1Interface interface { + RESTClient() rest.Interface + JobsGetter +} + +// BatchV1Client is a client structure. +type BatchV1Client struct { + restClient rest.Interface +} + +// Jobs returns a JobInterface object instance. +func (c *BatchV1Client) Jobs(namespace string) JobInterface { + if c == nil { + return nil + } + return newJobs(c, namespace) +} + +// RESTClient returns a RESTClient that is used to communicate +// with API server by this client implementation. +func (c *BatchV1Client) RESTClient() rest.Interface { + if c == nil { + return nil + } + return c.restClient +} + +// NewForConfig creates a new BatchV1Client for the given config. +// NewForConfig is equivalent to NewForConfigAndClient(c, httpClient), +// where httpClient was generated with rest.HTTPClientFor(c). +func NewForConfig(c *rest.Config) (*BatchV1Client, error) { + if c == nil { + return nil, errors.New(nilPointError) + } + config := *c + if err := setConfigDefaults(&config); err != nil { + return nil, err + } + httpClient, err := rest.HTTPClientFor(&config) + if err != nil { + return nil, err + } + return NewForConfigAndClient(&config, httpClient) +} + +func setConfigDefaults(config *rest.Config) error { + gv := v1.SchemeGroupVersion + config.GroupVersion = &gv + config.APIPath = "/apis" + config.NegotiatedSerializer = scheme.Codecs.WithoutConversion() + + if config.UserAgent == "" { + config.UserAgent = rest.DefaultKubernetesUserAgent() + } + + return nil +} + +// NewForConfigAndClient creates a new BatchV1Client for the given config and http client. +// Note the http client provided takes precedence over the configured transport values. 
+func NewForConfigAndClient(c *rest.Config, h *http.Client) (*BatchV1Client, error) { + if c == nil || h == nil { + return nil, errors.New(nilPointError) + } + config := *c + if err := setConfigDefaults(&config); err != nil { + return nil, err + } + client, err := rest.RESTClientForConfigAndClient(&config, h) + if err != nil { + return nil, err + } + return &BatchV1Client{restClient: client}, nil +} + +// New creates a new BatchV1Client for the given RESTClient. +func New(c rest.Interface) *BatchV1Client { + return &BatchV1Client{restClient: c} +} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/typed/batch/v1/job.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/typed/batch/v1/job.go new file mode 100644 index 0000000..a6527ad --- /dev/null +++ b/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/typed/batch/v1/job.go @@ -0,0 +1,221 @@ +/* +Copyright 2023 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package v1 + +import ( + "context" + "errors" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/watch" + "k8s.io/client-go/rest" + + "ascend-common/api" + "ascend-common/api/ascend-operator/apis/batch/v1" + "ascend-common/api/ascend-operator/client/clientset/versioned/scheme" +) + +const ( + nilPointError = "nil pointer" +) + +// JobsGetter has a method to return a JobInterface. +// A group's client should implement this interface. +type JobsGetter interface { + Jobs(namespace string) JobInterface +} + +// JobInterface has methods to work with Job resources. +type JobInterface interface { + Create(ctx context.Context, job *v1.AscendJob, opts metav1.CreateOptions) (*v1.AscendJob, error) + Update(ctx context.Context, job *v1.AscendJob, opts metav1.UpdateOptions) (*v1.AscendJob, error) + UpdateStatus(ctx context.Context, job *v1.AscendJob, opts metav1.UpdateOptions) (*v1.AscendJob, error) + Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error + DeleteCollection(ctx context.Context, opts metav1.DeleteOptions, listOpts metav1.ListOptions) error + Get(ctx context.Context, name string, opts metav1.GetOptions) (*v1.AscendJob, error) + List(ctx context.Context, opts metav1.ListOptions) (*v1.AscendJobList, error) + Watch(ctx context.Context, opts metav1.ListOptions) (watch.Interface, error) + Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions, + subresources ...string) (result *v1.AscendJob, err error) + // JobExpansion +} + +// jobs implements JobInterface +type jobs struct { + client rest.Interface + ns string +} + +func (j *jobs) Create(ctx context.Context, job *v1.AscendJob, opts metav1.CreateOptions) (*v1.AscendJob, error) { + if j == nil { + return nil, errors.New(nilPointError) + } + result := &v1.AscendJob{} + err := j.client.Post(). + Namespace(j.ns). + Resource(api.AscendJobsLowerCase). 
+ VersionedParams(&opts, scheme.ParameterCodec). + Body(job). + Do(ctx). + Into(result) + return result, err +} + +func (j *jobs) Update(ctx context.Context, job *v1.AscendJob, opts metav1.UpdateOptions) (*v1.AscendJob, + error) { + if j == nil || job == nil { + return nil, errors.New(nilPointError) + } + result := &v1.AscendJob{} + err := j.client.Put(). + Namespace(j.ns). + Resource(api.AscendJobsLowerCase). + Name(job.Name). + VersionedParams(&opts, scheme.ParameterCodec). + Body(job). + Do(ctx). + Into(result) + return result, err +} + +func (j *jobs) UpdateStatus(ctx context.Context, job *v1.AscendJob, opts metav1.UpdateOptions) (*v1.AscendJob, + error) { + if j == nil || job == nil { + return nil, errors.New(nilPointError) + } + result := &v1.AscendJob{} + err := j.client.Put(). + Namespace(j.ns). + Resource(api.AscendJobsLowerCase). + Name(job.Name). + SubResource("status"). + VersionedParams(&opts, scheme.ParameterCodec). + Body(job). + Do(ctx). + Into(result) + return result, err +} + +func (j *jobs) Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error { + if j == nil { + return errors.New(nilPointError) + } + return j.client.Delete(). + Namespace(j.ns). + Resource(api.AscendJobsLowerCase). + Name(name). + Body(&opts). + Do(ctx). + Error() +} + +func (j *jobs) DeleteCollection(ctx context.Context, opts metav1.DeleteOptions, listOpts metav1.ListOptions) error { + if j == nil { + return errors.New(nilPointError) + } + var timeout time.Duration + if listOpts.TimeoutSeconds != nil { + timeout = time.Duration(*listOpts.TimeoutSeconds) * time.Second + } + return j.client.Delete(). + Namespace(j.ns). + Resource(api.AscendJobsLowerCase). + VersionedParams(&listOpts, scheme.ParameterCodec). + Timeout(timeout). + Body(&opts). + Do(ctx). 
+ Error() +} + +func (j *jobs) Get(ctx context.Context, name string, opts metav1.GetOptions) (*v1.AscendJob, error) { + if j == nil { + return nil, errors.New(nilPointError) + } + result := &v1.AscendJob{} + err := j.client.Get(). + Namespace(j.ns). + Resource(api.AscendJobsLowerCase). + Name(name). + VersionedParams(&opts, scheme.ParameterCodec). + Do(ctx). + Into(result) + return result, err +} + +func (j *jobs) List(ctx context.Context, opts metav1.ListOptions) (*v1.AscendJobList, error) { + if j == nil { + return nil, errors.New(nilPointError) + } + var timeout time.Duration + if opts.TimeoutSeconds != nil { + timeout = time.Duration(*opts.TimeoutSeconds) * time.Second + } + result := &v1.AscendJobList{} + err := j.client.Get(). + Namespace(j.ns). + Resource(api.AscendJobsLowerCase). + VersionedParams(&opts, scheme.ParameterCodec). + Timeout(timeout). + Do(ctx). + Into(result) + return result, err +} + +func (j *jobs) Watch(ctx context.Context, opts metav1.ListOptions) (watch.Interface, error) { + if j == nil { + return nil, errors.New(nilPointError) + } + var timeout time.Duration + if opts.TimeoutSeconds != nil { + timeout = time.Duration(*opts.TimeoutSeconds) * time.Second + } + opts.Watch = true + return j.client.Get(). + Namespace(j.ns). + Resource(api.AscendJobsLowerCase). + VersionedParams(&opts, scheme.ParameterCodec). + Timeout(timeout). + Watch(ctx) +} + +func (j *jobs) Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions, + subresources ...string) (*v1.AscendJob, error) { + if j == nil { + return nil, errors.New(nilPointError) + } + result := &v1.AscendJob{} + err := j.client.Patch(pt). + Namespace(j.ns). + Resource(api.AscendJobsLowerCase). + Name(name). + SubResource(subresources...). + VersionedParams(&opts, scheme.ParameterCodec). + Body(data). + Do(ctx). 
+ Into(result) + return result, err +} + +// newJobs returns a Jobs +func newJobs(c *BatchV1Client, namespace string) *jobs { + return &jobs{ + client: c.RESTClient(), + ns: namespace, + } +} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/interface.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/interface.go new file mode 100644 index 0000000..78b5d12 --- /dev/null +++ b/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/interface.go @@ -0,0 +1,49 @@ +/* +Copyright 2023 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package batch is used to define interfaces. +package batch + +import ( + "ascend-common/api/ascend-operator/client/informers/externalversions/batch/v1" + "ascend-common/api/ascend-operator/client/informers/externalversions/internalinterfaces" +) + +// Interface provides access to each of this group's versions. +type Interface interface { + // V1 provides access to shared informers for resources in V1alpha1. + V1() v1.Interface +} + +type group struct { + factory internalinterfaces.SharedInformerFactory + namespace string + tweakListOptions internalinterfaces.TweakListOptionsFunc +} + +// New returns a new Interface. 
+func New(f internalinterfaces.SharedInformerFactory, namespace string, + tweakListOptions internalinterfaces.TweakListOptionsFunc) Interface { + return &group{factory: f, namespace: namespace, tweakListOptions: tweakListOptions} +} + +// V1 returns a new v1.Interface. +func (g *group) V1() v1.Interface { + if g == nil { + return nil + } + return v1.New(g.factory, g.namespace, g.tweakListOptions) +} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/v1/interface.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/v1/interface.go new file mode 100644 index 0000000..a4f0466 --- /dev/null +++ b/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/v1/interface.go @@ -0,0 +1,48 @@ +/* +Copyright 2023 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package v1 is used to implement job informer-related methods. +package v1 + +import ( + "ascend-common/api/ascend-operator/client/informers/externalversions/internalinterfaces" +) + +// Interface provides access to all the informers in this group version. +type Interface interface { + // Jobs returns a JobInformer. + Jobs() JobInformer +} + +type version struct { + factory internalinterfaces.SharedInformerFactory + namespace string + tweakListOptions internalinterfaces.TweakListOptionsFunc +} + +// New returns a new Interface. 
+func New(f internalinterfaces.SharedInformerFactory, namespace string, + tweakListOptions internalinterfaces.TweakListOptionsFunc) Interface { + return &version{factory: f, namespace: namespace, tweakListOptions: tweakListOptions} +} + +// Jobs returns a JobInformer. +func (v *version) Jobs() JobInformer { + if v == nil { + return nil + } + return &jobInformer{factory: v.factory, namespace: v.namespace, tweakListOptions: v.tweakListOptions} +} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/v1/job.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/v1/job.go new file mode 100644 index 0000000..e5f0b1c --- /dev/null +++ b/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/v1/job.go @@ -0,0 +1,99 @@ +/* +Copyright 2023 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package v1 + +import ( + "context" + "time" + + "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/watch" + "k8s.io/client-go/tools/cache" + + batchv1 "ascend-common/api/ascend-operator/apis/batch/v1" + "ascend-common/api/ascend-operator/client/clientset/versioned" + "ascend-common/api/ascend-operator/client/informers/externalversions/internalinterfaces" + batchlister "ascend-common/api/ascend-operator/client/listers/batch/v1" +) + +// JobInformer provides access to a shared informer and lister for +// Jobs. +type JobInformer interface { + Informer() cache.SharedIndexInformer + Lister() batchlister.JobLister +} + +type jobInformer struct { + factory internalinterfaces.SharedInformerFactory + tweakListOptions internalinterfaces.TweakListOptionsFunc + namespace string +} + +// NewJobInformer constructs a new informer for Job type. +// Always prefer using an informer factory to get a shared informer instead of getting an independent +// one. This reduces memory footprint and number of connections to the server. +func NewJobInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, + indexers cache.Indexers) cache.SharedIndexInformer { + return NewFilteredJobInformer(client, namespace, resyncPeriod, indexers, nil) +} + +// NewFilteredJobInformer constructs a new informer for Job type. +// Always prefer using an informer factory to get a shared informer instead of getting an independent +// one. This reduces memory footprint and number of connections to the server. 
+func NewFilteredJobInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, + indexers cache.Indexers, tweakListOptions internalinterfaces.TweakListOptionsFunc) cache.SharedIndexInformer { + return cache.NewSharedIndexInformer( + &cache.ListWatch{ + ListFunc: func(options v1.ListOptions) (runtime.Object, error) { + if tweakListOptions != nil { + tweakListOptions(&options) + } + return client.BatchV1().Jobs(namespace).List(context.TODO(), options) + }, + WatchFunc: func(options v1.ListOptions) (watch.Interface, error) { + if tweakListOptions != nil { + tweakListOptions(&options) + } + return client.BatchV1().Jobs(namespace).Watch(context.TODO(), options) + }, + }, + &batchv1.AscendJob{}, + resyncPeriod, + indexers, + ) +} + +func (f *jobInformer) defaultInformer(client versioned.Interface, + resyncPeriod time.Duration) cache.SharedIndexInformer { + return NewFilteredJobInformer(client, f.namespace, resyncPeriod, cache.Indexers{ + cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}, f.tweakListOptions) +} + +func (f *jobInformer) Informer() cache.SharedIndexInformer { + if f == nil || f.factory == nil { + return nil + } + return f.factory.InformerFor(&batchv1.AscendJob{}, f.defaultInformer) +} + +func (f *jobInformer) Lister() batchlister.JobLister { + if f == nil { + return nil + } + return batchlister.NewJobLister(f.Informer().GetIndexer()) +} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/factory.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/factory.go new file mode 100644 index 0000000..5fec15f --- /dev/null +++ b/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/factory.go @@ -0,0 +1,207 @@ +/* +Copyright 2023 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package externalversions + +import ( + "reflect" + "sync" + "time" + + "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/tools/cache" + + "ascend-common/api/ascend-operator/client/clientset/versioned" + "ascend-common/api/ascend-operator/client/informers/externalversions/batch" + "ascend-common/api/ascend-operator/client/informers/externalversions/internalinterfaces" +) + +// SharedInformerFactory provides shared informers for resources in all known +// API group versions. +type SharedInformerFactory interface { + internalinterfaces.SharedInformerFactory + ForResource(resource schema.GroupVersionResource) (GenericInformer, error) + WaitForCacheSync(stopCh <-chan struct{}) map[reflect.Type]bool + + Batch() batch.Interface +} + +// SharedInformerOption defines the functional option type for SharedInformerFactory. +type SharedInformerOption func(*sharedInformerFactory) *sharedInformerFactory + +type sharedInformerFactory struct { + client versioned.Interface + namespace string + tweakListOptions internalinterfaces.TweakListOptionsFunc + lock sync.Mutex + defaultResync time.Duration + customResync map[reflect.Type]time.Duration + + informers map[reflect.Type]cache.SharedIndexInformer + // startedInformers is used for tracking which informers have been started. + // This allows Start() to be called multiple times safely. + startedInformers map[reflect.Type]bool +} + +// WithCustomResyncConfig sets a custom resync period for the specified informer types. 
+func WithCustomResyncConfig(resyncConfig map[v1.Object]time.Duration) SharedInformerOption {
+	return func(f *sharedInformerFactory) *sharedInformerFactory {
+		if f != nil {
+			// The override map may be missing on a hand-built factory; create it on demand.
+			if f.customResync == nil {
+				f.customResync = make(map[reflect.Type]time.Duration)
+			}
+			// Overrides are keyed by the dynamic type of each sample object.
+			for obj, period := range resyncConfig {
+				f.customResync[reflect.TypeOf(obj)] = period
+			}
+		}
+		return f
+	}
+}
+
+// WithTweakListOptions installs a function that may adjust the list options
+// used by every lister/watcher created through the configured factory.
+func WithTweakListOptions(tweakListOptions internalinterfaces.TweakListOptionsFunc) SharedInformerOption {
+	return func(f *sharedInformerFactory) *sharedInformerFactory {
+		if f != nil {
+			f.tweakListOptions = tweakListOptions
+		}
+		// A nil factory is passed through unchanged.
+		return f
+	}
+}
+
+// WithNamespace restricts the configured factory to a single namespace.
+func WithNamespace(namespace string) SharedInformerOption {
+	return func(f *sharedInformerFactory) *sharedInformerFactory {
+		if f != nil {
+			f.namespace = namespace
+		}
+		// A nil factory is passed through unchanged.
+		return f
+	}
+}
+
+// NewSharedInformerFactory builds a factory with no extra options, which
+// leaves it watching all namespaces.
+func NewSharedInformerFactory(client versioned.Interface, defaultResync time.Duration) SharedInformerFactory {
+	factory := NewSharedInformerFactoryWithOptions(client, defaultResync)
+	return factory
+}
+
+// NewSharedInformerFactoryWithOptions constructs a new instance of a SharedInformerFactory with additional options.
+func NewSharedInformerFactoryWithOptions(client versioned.Interface, defaultResync time.Duration,
+	options ...SharedInformerOption) SharedInformerFactory {
+	// Start from a factory that watches all namespaces with empty caches.
+	factory := &sharedInformerFactory{
+		client:           client,
+		namespace:        v1.NamespaceAll,
+		defaultResync:    defaultResync,
+		informers:        make(map[reflect.Type]cache.SharedIndexInformer),
+		startedInformers: make(map[reflect.Type]bool),
+		customResync:     make(map[reflect.Type]time.Duration),
+	}
+
+	// Apply all options. A nil option is skipped, and an option that returns nil
+	// keeps the current factory: otherwise the function would return a non-nil
+	// interface wrapping a nil pointer, i.e. a factory that silently does nothing.
+	for _, opt := range options {
+		if opt == nil {
+			continue
+		}
+		if updated := opt(factory); updated != nil {
+			factory = updated
+		}
+	}
+
+	return factory
+}
+
+// Start runs every requested informer that has not been started yet, each in
+// its own goroutine bound to stopCh. It is a no-op on a nil receiver.
+func (f *sharedInformerFactory) Start(stopCh <-chan struct{}) {
+	if f == nil {
+		return
+	}
+	f.lock.Lock()
+	defer f.lock.Unlock()
+
+	if f.startedInformers == nil {
+		f.startedInformers = make(map[reflect.Type]bool)
+	}
+
+	for informerType, informer := range f.informers {
+		// Remember what was started so that repeated Start calls do not run an informer twice.
+		if !f.startedInformers[informerType] {
+			go informer.Run(stopCh)
+			f.startedInformers[informerType] = true
+		}
+	}
+}
+
+// WaitForCacheSync waits until the cache of every started informer has synced
+// (or stopCh is closed) and returns the per-type result. A nil receiver yields
+// an empty result.
+func (f *sharedInformerFactory) WaitForCacheSync(stopCh <-chan struct{}) map[reflect.Type]bool {
+	// Snapshot the started informers under the lock so the potentially long
+	// waits below happen without holding it.
+	started := func() map[reflect.Type]cache.SharedIndexInformer {
+		if f == nil {
+			return nil
+		}
+		f.lock.Lock()
+		defer f.lock.Unlock()
+
+		snapshot := map[reflect.Type]cache.SharedIndexInformer{}
+		for informerType, informer := range f.informers {
+			if f.startedInformers[informerType] {
+				snapshot[informerType] = informer
+			}
+		}
+		return snapshot
+	}()
+
+	res := map[reflect.Type]bool{}
+	for informerType, informer := range started {
+		res[informerType] = cache.WaitForCacheSync(stopCh, informer.HasSynced)
+	}
+	return res
+}
+
+// InformerFor returns the SharedIndexInformer for obj, creating it with newFunc
+// and the factory's client on first use.
+func (f *sharedInformerFactory) InformerFor(obj runtime.Object,
+	newFunc internalinterfaces.NewInformerFunc) cache.SharedIndexInformer {
+	if f == nil {
+		return nil
+	}
+
+	f.lock.Lock()
+	defer f.lock.Unlock()
+
+	// Informers are cached per dynamic type of obj; reuse one when it exists.
+	key := reflect.TypeOf(obj)
+	if cached, ok := f.informers[key]; ok {
+		return cached
+	}
+
+	// A per-type override wins over the factory-wide default resync period.
+	resync := f.defaultResync
+	if custom, ok := f.customResync[key]; ok {
+		resync = custom
+	}
+
+	created := newFunc(f.client, resync)
+	// The cache map may be missing on a hand-built factory; create it on demand.
+	if f.informers == nil {
+		f.informers = make(map[reflect.Type]cache.SharedIndexInformer)
+	}
+	f.informers[key] = created
+
+	return created
+}
+
+// Batch returns the batch group interface bound to this factory, its namespace
+// and its list-option tweak; it returns nil on a nil receiver.
+func (f *sharedInformerFactory) Batch() batch.Interface {
+	if f != nil {
+		return batch.New(f, f.namespace, f.tweakListOptions)
+	}
+	return nil
+}
diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/generic.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/generic.go
new file mode 100644
index 0000000..95db6d0
--- /dev/null
+++ b/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/generic.go
@@ -0,0 +1,71 @@
+/*
+Copyright 2023 Huawei Technologies Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package externalversions
+
+import (
+	"errors"
+	"fmt"
+
+	"k8s.io/apimachinery/pkg/runtime/schema"
+	"k8s.io/client-go/tools/cache"
+
+	"ascend-common/api"
+	"ascend-common/api/ascend-operator/apis/batch/v1"
+)
+
+// GenericInformer is type of SharedIndexInformer which will locate and delegate to other
+// sharedInformers based on type
+type GenericInformer interface {
+	Informer() cache.SharedIndexInformer
+	Lister() cache.GenericLister
+}
+
+// genericInformer pairs a shared informer with the group/resource it serves.
+type genericInformer struct {
+	informer cache.SharedIndexInformer
+	resource schema.GroupResource
+}
+
+// Informer returns the SharedIndexInformer, or nil on a nil receiver.
+func (f *genericInformer) Informer() cache.SharedIndexInformer {
+	if f == nil {
+		return nil
+	}
+	return f.informer
+}
+
+// Lister returns a GenericLister over the informer's indexer. It returns nil
+// when the receiver or its informer is nil instead of dereferencing a nil informer.
+func (f *genericInformer) Lister() cache.GenericLister {
+	if f == nil || f.informer == nil {
+		return nil
+	}
+	return cache.NewGenericLister(f.informer.GetIndexer(), f.resource)
+}
+
+// ForResource gives generic access to a shared informer of the matching type.
+// Only the AscendJob resource of the batch v1 group is known; any other
+// resource, or a nil receiver, yields an error.
+func (f *sharedInformerFactory) ForResource(resource schema.GroupVersionResource) (GenericInformer, error) {
+	if f == nil {
+		return nil, errors.New("nil pointer")
+	}
+	if resource == v1.SchemeGroupVersion.WithResource(api.AscendJobsLowerCase) {
+		return &genericInformer{resource: resource.GroupResource(), informer: f.Batch().V1().Jobs().Informer()}, nil
+	}
+
+	return nil, fmt.Errorf("no informer found for %v", resource)
+}
diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/internalinterfaces/factory_interfaces.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/internalinterfaces/factory_interfaces.go
new file mode 100644
index 0000000..5602b78
--- /dev/null
+++ b/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/internalinterfaces/factory_interfaces.go
@@ -0,0 +1,40 @@ +/* +Copyright 2019 Bloomberg Finance LP. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package internalinterfaces is used to define informer-related interfaces. +package internalinterfaces + +import ( + "time" + + "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/tools/cache" + + "ascend-common/api/ascend-operator/client/clientset/versioned" +) + +// NewInformerFunc takes versioned.Interface and time.Duration to return a SharedIndexInformer. +type NewInformerFunc func(versioned.Interface, time.Duration) cache.SharedIndexInformer + +// SharedInformerFactory a small interface to allow for adding an informer without an import cycle +type SharedInformerFactory interface { + Start(stopCh <-chan struct{}) + InformerFor(obj runtime.Object, newFunc NewInformerFunc) cache.SharedIndexInformer +} + +// TweakListOptionsFunc is a function that transforms a v1.ListOptions. +type TweakListOptionsFunc func(*v1.ListOptions) diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/listers/batch/v1/expansion_generated.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/listers/batch/v1/expansion_generated.go new file mode 100644 index 0000000..9ed431c --- /dev/null +++ b/mind-cluster/component/ascend-common/api/ascend-operator/client/listers/batch/v1/expansion_generated.go @@ -0,0 +1,26 @@ +/* +Copyright 2024 Huawei Technologies Co., Ltd. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package v1 is used to define job-related interfaces. +package v1 + +// JobListerExpansion allows custom methods to be added to +// JobLister. +type JobListerExpansion interface{} + +// JobNamespaceListerExpansion allows custom methods to be added to +// JobNamespaceLister. +type JobNamespaceListerExpansion interface{} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/listers/batch/v1/job.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/listers/batch/v1/job.go new file mode 100644 index 0000000..084a913 --- /dev/null +++ b/mind-cluster/component/ascend-common/api/ascend-operator/client/listers/batch/v1/job.go @@ -0,0 +1,108 @@ +/* +Copyright 2024 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package v1
+
+import (
+	"errors"
+
+	k8serr "k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/client-go/tools/cache"
+
+	"ascend-common/api/ascend-operator/apis/batch/v1"
+)
+
+// errUnexpectedType is returned when the indexer holds an object that is not an AscendJob.
+var errUnexpectedType = errors.New("unexpected object type in job indexer")
+
+// JobLister helps list Jobs.
+// All objects returned here must be treated as read-only.
+type JobLister interface {
+	// List lists all Jobs in the indexer.
+	// Objects returned here must be treated as read-only.
+	List(selector labels.Selector) (ret []*v1.AscendJob, err error)
+	// Jobs returns an object that can list and get Jobs.
+	Jobs(namespace string) JobNamespaceLister
+	JobListerExpansion
+}
+
+// jobLister implements the JobLister interface.
+type jobLister struct {
+	indexer cache.Indexer
+}
+
+// NewJobLister returns a new JobLister.
+func NewJobLister(indexer cache.Indexer) JobLister {
+	return &jobLister{indexer: indexer}
+}
+
+// List lists all Jobs in the indexer that match selector.
+// It returns an error on a nil receiver, when listing fails, or when the
+// indexer contains an object that is not an AscendJob (such objects are skipped).
+func (s *jobLister) List(selector labels.Selector) ([]*v1.AscendJob, error) {
+	if s == nil {
+		return nil, errors.New("nil pointer")
+	}
+	var ret []*v1.AscendJob
+	sawForeign := false
+	err := cache.ListAll(s.indexer, selector, func(m interface{}) {
+		var ok bool
+		if ret, ok = appendJob(ret, m); !ok {
+			sawForeign = true
+		}
+	})
+	if err == nil && sawForeign {
+		err = errUnexpectedType
+	}
+	return ret, err
+}
+
+// Jobs returns an object that can list and get Jobs.
+func (s *jobLister) Jobs(namespace string) JobNamespaceLister {
+	if s == nil {
+		return nil
+	}
+	return jobNamespaceLister{indexer: s.indexer, namespace: namespace}
+}
+
+// JobNamespaceLister helps list and get Jobs.
+// All objects returned here must be treated as read-only.
+type JobNamespaceLister interface {
+	// List lists all Jobs in the indexer for a given namespace.
+	// Objects returned here must be treated as read-only.
+	List(selector labels.Selector) (ret []*v1.AscendJob, err error)
+	// Get retrieves the Job from the indexer for a given namespace and name.
+	// Objects returned here must be treated as read-only.
+	Get(name string) (*v1.AscendJob, error)
+	JobNamespaceListerExpansion
+}
+
+// jobNamespaceLister implements the JobNamespaceLister
+// interface.
+type jobNamespaceLister struct {
+	indexer   cache.Indexer
+	namespace string
+}
+
+// List lists all Jobs in the indexer for a given namespace.
+// It returns an error when listing fails or when the indexer contains an
+// object that is not an AscendJob (such objects are skipped).
+func (s jobNamespaceLister) List(selector labels.Selector) ([]*v1.AscendJob, error) {
+	var ret []*v1.AscendJob
+	sawForeign := false
+	err := cache.ListAllByNamespace(s.indexer, s.namespace, selector, func(m interface{}) {
+		var ok bool
+		if ret, ok = appendJob(ret, m); !ok {
+			sawForeign = true
+		}
+	})
+	if err == nil && sawForeign {
+		err = errUnexpectedType
+	}
+	return ret, err
+}
+
+// Get retrieves the Job from the indexer for a given namespace and name.
+// It returns a NotFound error when the key is absent and an error, rather than
+// a panic, when the stored object is not an AscendJob.
+func (s jobNamespaceLister) Get(name string) (*v1.AscendJob, error) {
+	obj, exists, err := s.indexer.GetByKey(s.namespace + "/" + name)
+	if err != nil {
+		return nil, err
+	}
+	if !exists {
+		return nil, k8serr.NewNotFound(v1.Resource("job"), name)
+	}
+	job, ok := obj.(*v1.AscendJob)
+	if !ok {
+		return nil, errUnexpectedType
+	}
+	return job, nil
+}
+
+// appendJob appends obj to jobs when it is an *v1.AscendJob and reports whether it was.
+func appendJob(jobs []*v1.AscendJob, obj interface{}) ([]*v1.AscendJob, bool) {
+	job, ok := obj.(*v1.AscendJob)
+	if !ok {
+		return jobs, false
+	}
+	return append(jobs, job), true
+}
diff --git a/mind-cluster/component/ascend-common/api/consts.go b/mind-cluster/component/ascend-common/api/consts.go
new file mode 100644
index 0000000..01881ce
--- /dev/null
+++ b/mind-cluster/component/ascend-common/api/consts.go
@@ -0,0 +1,222 @@
+// Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved.
+ +// Package api common const +package api + +// Env +const ( + NodeNameEnv = "NODE_NAME" + + // PtWorldSizeEnv the total number of npu used for the task for PyTorch + PtWorldSizeEnv = "WORLD_SIZE" + // PtLocalWorldSizeEnv number of npu used per pod for PyTorch + PtLocalWorldSizeEnv = "LOCAL_WORLD_SIZE" + // PtLocalRankEnv logic id List of npu used by pod for PyTorch + PtLocalRankEnv = "LOCAL_RANK" + + // TfWorkerSizeEnv the total number of npu used for the task for TensorFlow + TfWorkerSizeEnv = "CM_WORKER_SIZE" + // TfLocalWorkerEnv number of npu used per pod for TensorFlow + TfLocalWorkerEnv = "CM_LOCAL_WORKER" + + // MsWorkerNumEnv the total number of npu used for the task for MindSpore + MsWorkerNumEnv = "MS_WORKER_NUM" + // MsLocalWorkerEnv number of npu used per pod for MindSpore + MsLocalWorkerEnv = "MS_LOCAL_WORKER" +) + +// NameSpace +const ( + DLNamespace = "mindx-dl" + ClusterNS = "cluster-system" + KubeNS = "kube-system" +) + +// Node +const ( + // NPUChipMemoryLabel label value is npu chip memory + NPUChipMemoryLabel = "mind-cluster/npu-chip-memory" + + // NodeSNAnnotation annotation value is node sn + NodeSNAnnotation = "product-serial-number" + // BaseDevInfoAnno annotation value is device base info + BaseDevInfoAnno = "baseDeviceInfos" + + // AcceleratorTypeKey the node label key of accelerator type + AcceleratorTypeKey = "accelerator-type" + // AcceleratorTypeModule910A3SuperPod for 910A3-SuperPod hardware + AcceleratorTypeModule910A3SuperPod = "module-a3-16-super-pod" +) + +// Pod +const ( + // PodUsedHardwareTypeAnno annotation value is the hardware type that real used in pod + PodUsedHardwareTypeAnno = "mind-cluster/hardware-type" + // PodRankIndexAnno annotation value is rank index of the pod + PodRankIndexAnno = "hccl/rankIndex" + // SuperPodIDAnno annotation key of the super pod id + SuperPodIDAnno = "super-pod-id" + + // Hotswitch Annotations + + // InHotSwitchFlowKey in hot switch flow key + InHotSwitchFlowKey = "inHotSwitchFlow" + // 
InHotSwitchFlowValue in hot switch flow true + InHotSwitchFlowValue = "true" + // BackupNewPodNameKey backup new pod name key + BackupNewPodNameKey = "backupNewPodName" + // BackupSourcePodNameKey backup source pod name key + BackupSourcePodNameKey = "backupSourcePodName" + // NeedOperatorOpeKey need operator ope key + NeedOperatorOpeKey = "needOperatorOpe" + // NeedVolcanoOpeKey need volcano ope key + NeedVolcanoOpeKey = "needVolcanoOpe" + // OpeTypeDelete ope type delete + OpeTypeDelete = "delete" + // OpeTypeCreate ope type create + OpeTypeCreate = "create" + // PodTypeKey pod type key + PodTypeKey = "podType" + // PodTypeBackup pod type backup + PodTypeBackup = "backup" + // DefaultRetryTimes default retry times + DefaultRetryTimes = 3 + // MasterPodRank master pod rank + MasterPodRank = "0" +) + +// PodGroup +const ( + // AtlasTaskLabel label value task kind, eg. ascend-910, ascend-{xxx}b + AtlasTaskLabel = "ring-controller.atlas" +) + +// ConfigMap +const ( + // DeviceInfoCMDataKey device-info-cm data key, record device info + DeviceInfoCMDataKey = "DeviceInfoCfg" + // SwitchInfoCMDataKey device-info-cm data key, record switch info + SwitchInfoCMDataKey = "SwitchInfoCfg" + // NodeInfoCMDataKey node-info-cm data key, record node info + NodeInfoCMDataKey = "NodeInfo" + // PubFaultCMDataKey public fault cm data key, record public fault info + PubFaultCMDataKey = "PublicFault" + + // CIMCMLabelKey cm label key, who uses these cms + CIMCMLabelKey = "mx-consumer-cim" + // PubFaultCMLabelKey public fault cm label key + PubFaultCMLabelKey = "mc-consumer-publicfault" +) + +const ( + // FaultJobCmName fault job cm name + FaultJobCmName = "fault-job-info" +) + +const ( + // PodScheduleLabel pod schedule label + PodScheduleLabel = "pod-rescheduling" + // ProcessScheduleLabel process schedule label + ProcessScheduleLabel = "process-recover-enable" + // RecoverStrategyKey recover strategy key in job annotation + RecoverStrategyKey = "recover-strategy" +) + +// process 
schedule strategy +const ( + // RecoverStrategy recover strategy + RecoverStrategy = "recover" + // RetryStrategy retry strategy + RetryStrategy = "retry" + // InPlaceStrategy recover in place strategy + InPlaceStrategy = "recover-in-place" + // DumpStrategy dump strategy + DumpStrategy = "dump" + // ExitStrategy exit strategy + ExitStrategy = "exit" + // ElasticTraining elastic-training strategy + ElasticTraining = "elastic-training" +) + +// process schedule common env +const ( + // ProcessRecoverEnv process recover env + ProcessRecoverEnv = "PROCESS_RECOVER" + // ElasticRecoverEnv elastic process recover env + ElasticRecoverEnv = "ELASTIC_PROCESS_RECOVER_ENABLE" + // EnableRestartEnv enable restart env + EnableRestartEnv = "ENABLE_RESTART_FAULT_PROCESS" +) + +// process schedule pytorch env +const ( + // HighAvailableEnv high available env + HighAvailableEnv = "HIGH_AVAILABILITY" + // PtCloseWatchDogKey pt close watch dog key + PtCloseWatchDogKey = "HCCL_ASYNC_ERROR_HANDLING" + // PtCloseWatchDogValue pt close watch dog value + PtCloseWatchDogValue = "0" +) + +// process schedule ms env +const ( + // MsRecoverEnv ms recover env + MsRecoverEnv = "MS_ENABLE_TFT" + // EnableMS enable ms + EnableMS = "MINDIO_FOR_MINDSPORE" + // MsDumpStrategy ms dump strategy + MsDumpStrategy = "TTP:1" + // MsUceStrategy ms uce strategy + MsUceStrategy = "UCE:1" + // MsArfStrategy ms arf strategy + MsArfStrategy = "ARF:1" + // MsHcceStrategy ms hcce strategy + MsHcceStrategy = "HCCE:1" + // MsRscStrategy ms rsc strategy + MsRscStrategy = "RSC:1" + // MsCloseWatchDogKey ms close watch dog key + MsCloseWatchDogKey = "MS_ENABLE_THM" + // MsCloseWatchDogValue ms close watch dog value + MsCloseWatchDogValue = `{HCCL_WATCHDOG:0}` +) + +const ( + //EnableFunc Enable Func + EnableFunc = "on" + // EnableFlag enable flag + EnableFlag = "1" + // PytorchFramework framework + PytorchFramework = "pytorch" + // MindSporeFramework framework + MindSporeFramework = "mindspore" +) + +const ( + // 
RescheduleInPlaceKey reschedule in place key + RescheduleInPlaceKey = "reschedule-in-place" + // RescheduleInPlaceValue reschedule in place value + RescheduleInPlaceValue = "true" +) + +const ( + // DeviceResetTimeout device reset timeout + DeviceResetTimeout = "deviceResetTimeout" + // DefaultDeviceResetTimeout default device reset timeout is 60 seconds + DefaultDeviceResetTimeout = 60 + // MinDeviceResetTimeout min device reset timeout is 10 seconds + MinDeviceResetTimeout = 10 + // MaxDeviceResetTimeout max device reset timeout is 600 seconds + MaxDeviceResetTimeout = 600 +) + +const ( + // SubHealthyStrategy config in pod group label for subHealthy fault strategy + SubHealthyStrategy = "subHealthyStrategy" + // SubHealthyHotSwitch strategy name of hot switch + SubHealthyHotSwitch = "hotSwitch" +) + +const ( + // MinAvailableKey decide minAvailable of task + MinAvailableKey = "huawei.com/schedule_minAvailable" +) diff --git a/mind-cluster/component/ascend-common/api/default_name.go b/mind-cluster/component/ascend-common/api/default_name.go new file mode 100644 index 0000000..7f0ae6c --- /dev/null +++ b/mind-cluster/component/ascend-common/api/default_name.go @@ -0,0 +1,188 @@ +// Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. 
+ +// Package api common brand moniker +package api + +// common +const ( + // Pod910DeviceAnno annotation value is for generating 910 hccl rank table + Pod910DeviceAnno = "ascend.kubectl.kubernetes.io/ascend-910-configuration" + + // ResourceNamePrefix pre resource name + ResourceNamePrefix = "huawei.com/" + // PodRealAlloc pod annotation key, means pod real mount device + PodRealAlloc = "AscendReal" + + // PodAnnotationAscendReal pod annotation ascend real + PodAnnotationAscendReal = "huawei.com/AscendReal" + + // Ascend brand name + Ascend = "Ascend" + // AscendJob job kind is AscendJob + AscendJob = "AscendJob" + // AscendJobsLowerCase for ascend jobs lowercase + AscendJobsLowerCase = "ascendjobs" + + // AscendOperator ascend-Operator + AscendOperator = "ascend-Operator" +) + +// common 910 +const ( + // Ascend910 for 910 chip + Ascend910 = "Ascend910" + // Ascend910Lowercase for 910 chip lowercase + Ascend910Lowercase = "ascend910" + // HuaweiAscend910 ascend 910 chip with prefix + HuaweiAscend910 = "huawei.com/Ascend910" + // Ascend910MinuxPrefix name prefix of ascend 910 chip + Ascend910MinuxPrefix = "Ascend910-" + // Ascend910MinuxCase minus type of ascend 910 chip + Ascend910MinuxCase = "ascend-910" + // Ascend910No 910 chip number + Ascend910No = "910" +) + +// common 910 A1 +const ( + // Ascend910A ascend 910A chip + Ascend910A = "Ascend910" + // Ascend910APattern regular expression for 910A + Ascend910APattern = `^910` +) + +// common 910 A2 +const ( + // Ascend910B ascend 910B chip + Ascend910B = "Ascend910B" + // Ascend910BPattern regular expression for 910B + Ascend910BPattern = `^(910B\d{1}|A2G\d{1})` +) + +// common 910 A3 +const ( + // Ascend910A3 ascend Ascend910A3 chip + Ascend910A3 = "Ascend910A3" +) + +// common 310 +const ( + // Ascend310 ascend 310 chip + Ascend310 = "Ascend310" + // Ascend310Lowercase ascend 310 chip lowercase + Ascend310Lowercase = "ascend310" + // Ascend310No 310 chip number + Ascend310No = "310" + // HuaweiAscend310 
ascend 310 chip with prefix + HuaweiAscend310 = "huawei.com/Ascend310" + // Ascend310MinuxPrefix name prefix of ascend 310 chip + Ascend310MinuxPrefix = "Ascend310-" +) + +// common 310B +const ( + // Ascend310B ascend 310B chip + Ascend310B = "Ascend310B" + // Ascend310BNo 310B chip number + Ascend310BNo = "310B" +) + +// common 310P +const ( + // Ascend310P ascend 310P chip + Ascend310P = "Ascend310P" + // Ascend310PLowercase ascend 310P chip lowercase + Ascend310PLowercase = "ascend310P" + // Ascend310PNo 310P chip number + Ascend310PNo = "310P" + // Ascend310PPattern regular expression for 310P + Ascend310PPattern = `^(310P\d{0,1}|I2\d{0,1})` + // HuaweiAscend310P ascend 310P chip with prefix + HuaweiAscend310P = "huawei.com/Ascend310P" + // Ascend310PMinuxPrefix name prefix of ascend 310P chip + Ascend310PMinuxPrefix = "Ascend310P-" +) + +// device plugin +const ( + // Use310PMixedInsert use 310P Mixed insert + Use310PMixedInsert = "use310PMixedInsert" + // Ascend310PMix dp use310PMixedInsert parameter usage + Ascend310PMix = "ascend310P-V, ascend310P-VPro, ascend310P-IPro" + // A300IA2Label the value of the A300I A2 node label + A300IA2Label = "card-910b-infer" + // A300IDuoLabel the value of the A300I Duo node label + A300IDuoLabel = "card-300i-duo" + //UseAscendDocker UseAscendDocker parameter + UseAscendDocker = "useAscendDocker" +) + +// docker runtime +const ( + // AscendDockerRuntime ascend-docker-runtime + AscendDockerRuntime = "ascend-docker-runtime" + // AscendDockerHook ascend-docker-hook + AscendDockerHook = "ascend-docker-hook" + // AscendDockerDestroy ascend-docker-destroy + AscendDockerDestroy = "ascend-docker-destroy" + // AscendDockerCli ascend-docker-cli + AscendDockerCli = "ascend-docker-cli" + + // AscendDockerRuntimeEnv env variable + AscendDockerRuntimeEnv = "ASCEND_DOCKER_RUNTIME" + // AscendVisibleDevicesEnv env variable + AscendVisibleDevicesEnv = "ASCEND_VISIBLE_DEVICES" + // AscendRuntimeOptionsEnv env variable + 
AscendRuntimeOptionsEnv = "ASCEND_RUNTIME_OPTIONS" + // AscendRuntimeMountsEnv env variable + AscendRuntimeMountsEnv = "ASCEND_RUNTIME_MOUNTS" + // AscendAllowLinkEnv env variable + AscendAllowLinkEnv = "ASCEND_ALLOW_LINK" + // AscendVnpuSpescEnv env variable + AscendVnpuSpescEnv = "ASCEND_VNPU_SPECS" + + // RunTimeLogDir dir path of runtime + RunTimeLogDir = "/var/log/ascend-docker-runtime/" + // HookRunLogPath run log path of hook + HookRunLogPath = "/var/log/ascend-docker-runtime/hook-run.log" + // InstallHelperRunLogPath run log path of install helper + InstallHelperRunLogPath = "/var/log/ascend-docker-runtime/install-helper-run.log" + // RunTimeRunLogPath run log path of runtime + RunTimeRunLogPath = "/var/log/ascend-docker-runtime/runtime-run.log" + + // RunTimeDConfigPath config path + RunTimeDConfigPath = "/etc/ascend-docker-runtime.d" +) + +// npu exporter +const ( + // DevicePathPattern device path pattern + DevicePathPattern = `^/dev/davinci\d+$` + // HccsBWProfilingTimeStr preset parameter name + HccsBWProfilingTimeStr = "hccsBWProfilingTime" + // Hccs log options domain value + Hccs = "hccs" + // Prefix pre statistic info + Prefix = "npu_chip_info_hccs_statistic_info_" + // BwPrefix pre bandwidth info + BwPrefix = "npu_chip_info_hccs_bandwidth_info_" + // AscendDeviceInfo + AscendDeviceInfo = "ASCEND_VISIBLE_DEVICES" +) + +const ( + // AscendJobKind is the kind name + AscendJobKind = "AscendJob" + // DefaultContainerName the default container name for AscendJob. + DefaultContainerName = "ascend" + // DefaultPortName is name of the port used to communicate between other process. + DefaultPortName = "ascendjob-port" + // ControllerName is the name of controller,used in log. 
+ ControllerName = "ascendjob-controller" + // OperatorName name of operator + OperatorName = "ascend-operator" + // LogModuleName name of log module + LogModuleName = "hwlog" + // OperatorLogFilePath Operator log file name + OperatorLogFilePath = "/var/log/mindx-dl/ascend-operator/ascend-operator.log" +) diff --git a/mind-cluster/component/ascend-common/api/publicfault.go b/mind-cluster/component/ascend-common/api/publicfault.go new file mode 100644 index 0000000..8561145 --- /dev/null +++ b/mind-cluster/component/ascend-common/api/publicfault.go @@ -0,0 +1,32 @@ +// Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. + +// Package api structs for public fault +package api + +// PubFaultInfo struct for public fault input +type PubFaultInfo struct { + Id string `json:"id"` + TimeStamp int64 `json:"timestamp"` + Version string `json:"version"` + Resource string `json:"resource"` + Faults []Fault `json:"faults"` +} + +// Fault public fault cm item Fault +type Fault struct { + FaultId string `json:"faultId"` + FaultType string `json:"faultType"` + FaultCode string `json:"faultCode"` + FaultTime int64 `json:"faultTime"` + Assertion string `json:"assertion"` + FaultLocation map[string]string `json:"faultLocation"` + Influence []Influence `json:"influence"` + Description string `json:"description"` +} + +// Influence public fault cm item Influence +type Influence struct { + NodeName string `json:"nodeName"` + NodeSN string `json:"nodeSN"` + DeviceIds []int32 `json:"deviceIds"` +} diff --git a/mind-cluster/component/ascend-common/api/slownet/fault_net.go b/mind-cluster/component/ascend-common/api/slownet/fault_net.go new file mode 100644 index 0000000..eacde6a --- /dev/null +++ b/mind-cluster/component/ascend-common/api/slownet/fault_net.go @@ -0,0 +1,77 @@ +// Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. 
+
+// Package slownet holds path helpers shared by net fault detection.
+package slownet
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+
+	"ascend-common/common-utils/hwlog"
+	"ascend-common/common-utils/utils"
+)
+
+const (
+	rasNetRootPathKey = "RAS_NET_ROOT_PATH"
+	netFaultSubPath   = "cluster"
+	detectConf        = "cathelper.conf"
+)
+
+// GetRasNetRootPath reads the net fault detection root directory from the
+// RAS_NET_ROOT_PATH environment variable. It fails when the variable is empty,
+// does not name a directory, or is rejected by utils.CheckPath; on success the
+// path returned by utils.CheckPath is used.
+func GetRasNetRootPath() (string, error) {
+	envRoot := os.Getenv(rasNetRootPathKey)
+	switch {
+	case envRoot == "":
+		return "", fmt.Errorf("env %s not exists, please config it before starting", rasNetRootPathKey)
+	case !utils.IsDir(envRoot):
+		return "", fmt.Errorf("env %s=%s, which is not dir", rasNetRootPathKey, envRoot)
+	}
+	checkedRoot, err := utils.CheckPath(envRoot)
+	if err != nil {
+		return "", fmt.Errorf("env %s=%s, which is invalid, err: %v", rasNetRootPathKey, envRoot, err)
+	}
+	return checkedRoot, nil
+}
+
+// GetPingListFilePath builds the ping list file path of one server inside a
+// super pod: <root>/cluster/super-pod-<id>/ping_list_<serverIndex>.json.
+// The resulting path itself is not validated here.
+func GetPingListFilePath(superPodId, serverIndex string) (string, error) {
+	root, err := GetRasNetRootPath()
+	if err != nil {
+		return "", err
+	}
+	podDir := fmt.Sprintf("super-pod-%s", superPodId)
+	listFile := fmt.Sprintf("ping_list_%s.json", serverIndex)
+	return filepath.Join(root, netFaultSubPath, podDir, listFile), nil
+}
+
+// GetSuperPodInfoFilePath builds <root>/cluster/<prefix>-<id>/<prefix>-<id>.json
+// and returns it once utils.CheckPath accepts it; failures are logged and returned.
+func GetSuperPodInfoFilePath(superPodID, superPodPrefix string) (string, error) {
+	root, err := GetRasNetRootPath()
+	if err != nil {
+		hwlog.RunLog.Errorf("get ras net root path failed, err : %v", err)
+		return "", err
+	}
+	// The directory and the JSON file inside it share the same base name.
+	podName := fmt.Sprintf("%s-%s", superPodPrefix, superPodID)
+	infoPath := filepath.Join(root, netFaultSubPath, podName, fmt.Sprintf("%s.json", podName))
+	_, checkErr := utils.CheckPath(infoPath)
+	if checkErr != nil {
+		hwlog.RunLog.Errorf("file path is invalid, err: %v", checkErr)
+		return "", checkErr
+	}
+	return infoPath, nil
+}
+
+// GetConfigPathForDetect returns the config file path used by network fault
+// detection for a super pod, <root>/cluster/super-pod-<id>/cathelper.conf,
+// once utils.CheckPath accepts it; failures are logged and returned.
+func GetConfigPathForDetect(superPodId string) (string, error) {
+	root, err := GetRasNetRootPath()
+	if err != nil {
+		hwlog.RunLog.Errorf("get ras net root path failed, err: %v", err)
+		return "", err
+	}
+	podDir := fmt.Sprintf("super-pod-%s", superPodId)
+	cfgPath := filepath.Join(root, netFaultSubPath, podDir, detectConf)
+	_, checkErr := utils.CheckPath(cfgPath)
+	if checkErr != nil {
+		hwlog.RunLog.Errorf("file path is invalid, err: %v", checkErr)
+		return "", checkErr
+	}
+	return cfgPath, nil
+}
diff --git a/mind-cluster/component/ascend-common/api/superpoddevice.go b/mind-cluster/component/ascend-common/api/superpoddevice.go
new file mode 100644
index 0000000..4039dcb
--- /dev/null
+++ b/mind-cluster/component/ascend-common/api/superpoddevice.go
@@ -0,0 +1,36 @@
+// Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved.
+
+// Package api structs for SuperPodDevice
+package api
+
+import "k8s.io/apimachinery/pkg/util/sets"
+
+// NpuBaseInfo is the base info of npu
+type NpuBaseInfo struct {
+	IP            string
+	SuperDeviceID uint32
+}
+
+// NodeDevice node device info
+type NodeDevice struct {
+	NodeName   string
+	ServerID   string
+	ServerType string            `json:"-"`
+	DeviceMap  map[string]string // key: dev phyID, value: superPod device id
+}
+
+// SuperPodDevice super node device info, key is superPodID, value is NodeDevice
+type SuperPodDevice struct {
+	Version       string
+	SuperPodID    string
+	NodeDeviceMap map[string]*NodeDevice
+}
+
+// SuperPodFaultInfos super pod fault info
+type SuperPodFaultInfos struct {
+	SdIds      []string
+	FaultNodes sets.String
+	NodeNames  []string
+	FaultTimes int64
+	JobId      string `json:"JobId,omitempty"`
+}
diff --git a/mind-cluster/component/ascend-common/api/type.go b/mind-cluster/component/ascend-common/api/type.go
new file mode 100644
index 0000000..9a2cde1
--- /dev/null
+++ b/mind-cluster/component/ascend-common/api/type.go
@@ -0,0 +1,30 @@
+// Copyright (c) Huawei Technologies Co.,
Ltd. 2025-2025. All rights reserved.
+
+// Package api common structs for task reset info and device fault info
+package api
+
+// ResetCmInfo is the reset config info of a task; RankList holds one DevFaultnfo per entry
+type ResetCmInfo struct {
+	RankList            []*DevFaultnfo
+	UpdateTime          int64
+	RetryTime           int
+	FaultFlushing       bool
+	GracefulExit        int
+	RestartFaultProcess bool
+}
+
+// DevFaultnfo is the device info of a task: a rank id plus the embedded FaultInfo
+type DevFaultnfo struct {
+	RankId int
+	FaultInfo
+}
+
+// FaultInfo is the fault info of a device
+type FaultInfo struct {
+	LogicId       int32
+	Status        string
+	Policy        string
+	InitialPolicy string
+	ErrorCode     []int64
+	ErrorCodeHex  string
+}
diff --git a/mind-cluster/component/ascend-common/common-utils/cache/lrucache.go b/mind-cluster/component/ascend-common/common-utils/cache/lrucache.go
new file mode 100644
index 0000000..0c0d420
--- /dev/null
+++ b/mind-cluster/component/ascend-common/common-utils/cache/lrucache.go
@@ -0,0 +1,394 @@
+/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+// Package cache implements a memory-based LRU local cache
+package cache
+
+import (
+	"container/list"
+	"errors"
+	"fmt"
+	"math"
+	"sync"
+	"time"
+)
+
+const (
+	segmentCount               = 16
+	int64One     int64         = 1
+	int64Zero    int64         = 0
+	negInt64One  int64         = -1
+	intTwo                     = 2
+	hashInit     uint32        = 2166136261
+	prime32      uint32        = 16777619
+	twentyYears  time.Duration = 20 * 365 * 24 * time.Hour
+)
+
+var (
+	notInitErr = errors.New("not initializes")
+	paraErr    = errors.New("parameter error")
+)
+
+type cacheEle struct {
+	key        string
+	data       interface{}
+	expireTime int64
+}
+
+type lruCache struct {
+	maxSize   int
+	elemIndex map[string]*list.Element
+	*list.List
+	mu sync.Mutex
+}
+
+// ConcurrencyLRUCache is a memory-based LRU local cache split into 16 segments, each guarded by its own mutex.
+// LRU order is kept per bucket only, so eviction is not a true LRU over the whole cache;
+// it is just a cheap way to bound the cache size
+type ConcurrencyLRUCache struct {
+	segment    int
+	cacheBuket [segmentCount]*lruCache
+}
+
+// Set creates or updates an element using key
+// key: the identity of an element
+// value: new value of the element
+// expireTime: time to live; accepted range is -1ns (never expires) up to twentyYears, otherwise paraErr
+func (cl *ConcurrencyLRUCache) Set(key string, value interface{}, expireTime time.Duration) error {
+	if cl == nil || cl.cacheBuket[0] == nil {
+		return notInitErr
+	}
+	if expireTime < time.Duration(negInt64One) || expireTime > twentyYears {
+		return paraErr
+	}
+	cacheIndex := cl.index(key)
+	if cacheIndex < 0 || cacheIndex >= segmentCount {
+		return errors.New("index out of valid value")
+	}
+	return cl.cacheBuket[cacheIndex].setValue(key, value, expireTime)
+}
+
+// Get gets the value of a cached element by key. If the key does not exist or has expired, it returns nil and an error
+// key: the identity of an element
+// return:
+// value: the cached value, nil if the key does not exist
+// err: error info, nil when a value is returned
+func (cl *ConcurrencyLRUCache) Get(key string) (interface{}, error) {
+	if cl == nil || cl.cacheBuket[0] == nil {
+		return nil, notInitErr
+	}
+	cacheIndex := cl.index(key)
+	if cacheIndex < 0 || cacheIndex >= segmentCount {
+		return nil, errors.New("index out of valid value")
+	}
+	return cl.cacheBuket[cacheIndex].getValue(key)
+}
+
+// Delete deletes the value by key, no error returned
+func (cl *ConcurrencyLRUCache) Delete(key string) {
+	if cl == nil || cl.cacheBuket[0] == nil {
+		return
+	}
+	cacheIndex := cl.index(key)
+	if cacheIndex < 0 || cacheIndex >= segmentCount {
+		return
+	}
+	cl.cacheBuket[cacheIndex].delValue(key)
+}
+
+// SetIfNX sets the new value and returns true if the key does not exist or has expired, otherwise returns false
+func (cl *ConcurrencyLRUCache) SetIfNX(key string, value interface{}, expireTime time.Duration) bool {
+	if cl == nil || cl.cacheBuket[0] == nil {
+		return false
+	}
+	if expireTime < time.Duration(negInt64One) || expireTime > twentyYears {
+		return false
+	}
+	cacheIndex := cl.index(key)
+	if cacheIndex < 0 || cacheIndex >= segmentCount {
+		return false
+	}
+	return cl.cacheBuket[cacheIndex].setIfNotExist(key, value, expireTime)
+}
+
+// INCR adds one to the value (must be int64) of the key; a missing or expired key restarts at 1
+func (cl *ConcurrencyLRUCache) INCR(key string, expireTime time.Duration) (int64, error) {
+	if err := validate(cl, expireTime); err != nil {
+		return 0, err
+	}
+	cacheIndex := cl.index(key)
+	if cacheIndex < 0 || cacheIndex >= segmentCount {
+		return 0, errors.New("index out of valid value")
+	}
+	return cl.cacheBuket[cacheIndex].increment(key, expireTime)
+}
+
+// DECR subtracts one from the value (must be int64) of the key; a missing or expired key restarts at -1
+func (cl *ConcurrencyLRUCache) DECR(key string, expireTime time.Duration) (int64, error) {
+	if err := validate(cl, expireTime); err != nil {
+		return 0, err
+	}
+	cacheIndex := cl.index(key)
+	if cacheIndex < 0 || cacheIndex >= segmentCount {
+		return 0, errors.New("index out of valid value")
+	}
+	return cl.cacheBuket[cacheIndex].decrement(key, expireTime)
+}
+
+func validate(cl *ConcurrencyLRUCache, expireTime time.Duration) error {
+	if cl == nil || cl.cacheBuket[0] == nil {
+		return notInitErr
+	}
+	if expireTime <= 0 && expireTime != time.Duration(negInt64One) {
+		return paraErr
+	}
+	return nil
+}
+
+// index hashes the key byte by byte and masks the hash with segment-1 to pick a bucket
+func (cl *ConcurrencyLRUCache) index(key string) int {
+	var hash = hashInit
+	for i := 0; i < len(key); i++ {
+		hash *= prime32
+		hash ^= uint32(key[i])
+	}
+	return int(hash & (uint32(cl.segment) - 1))
+}
+
+// New creates an instance of ConcurrencyLRUCache, or returns nil when maxEntries <= 0
+// maxEntries: requested capacity; each of the 16 buckets holds ceil(maxEntries/16) elements
+func New(maxEntries int) *ConcurrencyLRUCache {
+	if maxEntries <= 0 {
+		return nil
+	}
+	size := maxEntries / segmentCount
+	remain := maxEntries % segmentCount
+	if remain > 0 {
+		size += 1
+	}
+	var cache [segmentCount]*lruCache
+	for i := 0; i < segmentCount; i++ {
+		cache[i] = &lruCache{
+			maxSize:   size,
+			elemIndex: make(map[string]*list.Element, segmentCount),
+			List:      list.New(),
+			mu:        sync.Mutex{},
+		}
+	}
+	return &ConcurrencyLRUCache{
+		segment:    segmentCount,
+		cacheBuket: cache,
+	}
+}
+
+func (c *lruCache) setValue(key string, value interface{}, expireTime time.Duration) error {
+	if c == nil || c.elemIndex == nil {
+		return notInitErr
+	}
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	v, ok := c.elemIndex[key]
+	if !ok {
+		// the key is not cached yet
+		c.setInner(key, value, expireTime)
+		return nil
+	}
+	ele, ok := v.Value.(*cacheEle)
+	if !ok {
+		c.safeDeleteByKey(key, v)
+		return errors.New("cacheElement convert failed")
+	}
+	c.MoveToFront(v)
+	pkgElement(ele, value, expireTime)
+	return nil
+}
+
+func pkgElement(ele *cacheEle, value interface{}, expireTime time.Duration) {
+	ele.data = value
+	if expireTime == time.Duration(negInt64One) {
+		ele.expireTime = negInt64One
+		return
+	}
+	ele.expireTime = time.Now().UnixNano() + int64(expireTime)
+}
+
+func (c *lruCache) getValue(key string) (interface{}, error) {
+	if c == nil || c.elemIndex == nil {
+		return nil, notInitErr
+	}
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	v, ok := c.elemIndex[key]
+	if !ok {
+		return nil, errors.New("no value found")
+	}
+	c.MoveToFront(v)
+	ele, ok := v.Value.(*cacheEle)
+	if !ok {
+		c.safeDeleteByKey(key, v)
+		return nil, errors.New("cacheElement convert failed")
+	}
+	if ele.expireTime != negInt64One && time.Now().UnixNano() > ele.expireTime {
+		// the element has expired: drop it
+		c.safeDeleteByKey(key, v)
+		return nil, errors.New("the key was expired")
+	}
+	return ele.data, nil
+}
+
+// delValue deletes the element of key if it is cached
+func (c *lruCache) delValue(key string) {
+	if c == nil || c.elemIndex == nil {
+		return
+	}
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	if v, ok := c.elemIndex[key]; ok {
+		c.safeDeleteByKey(key, v)
+	}
+}
+
+func (c *lruCache) setIfNotExist(key string, value interface{}, expireTime time.Duration) bool {
+	if c == nil || c.elemIndex == nil {
+		return false
+	}
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	v, ok := c.elemIndex[key]
+	if !ok {
+		// the key is not cached yet
+		c.setInner(key, value, expireTime)
+		return true
+	}
+	ele, ok := v.Value.(*cacheEle)
+	if !ok {
+		c.safeDeleteByKey(key, v)
+		return false
+	}
+	c.MoveToFront(v)
+	if ele.expireTime == negInt64One || time.Now().UnixNano() < ele.expireTime {
+		return false
+	}
+	// the element has expired: reuse it for the new value
+	pkgElement(ele, value, expireTime)
+	return true
+}
+
+func (c *lruCache) increment(key string, expireTime time.Duration) (int64, error) {
+	if c == nil || c.elemIndex == nil {
+		return 0, notInitErr
+	}
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	v, ok := c.elemIndex[key]
+	if !ok {
+		c.setInner(key, int64One, expireTime)
+		return int64One, nil
+	}
+	ele, ok := v.Value.(*cacheEle)
+	if !ok {
+		c.safeDeleteByKey(key, v)
+		c.setInner(key, int64One, expireTime)
+		return int64One, nil
+	}
+	c.MoveToFront(v)
+	if ele.expireTime == negInt64One || time.Now().UnixNano() < ele.expireTime {
+		newValue, ok := ele.data.(int64)
+		if !ok || newValue == math.MaxInt64 {
+			return 0, fmt.Errorf("the cache value is not valid, ok:%v", ok)
+		}
+		newValue++
+		pkgElement(ele, newValue, expireTime)
+		return newValue, nil
+	}
+	// the element has expired: restart the counter
+	pkgElement(ele, int64One, expireTime)
+	return int64One, nil
+}
+
+func (c *lruCache) decrement(key string, expireTime time.Duration) (int64, error) {
+	if c == nil || c.elemIndex == nil {
+		return 0, notInitErr
+	}
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	v, ok := c.elemIndex[key]
+	if !ok {
+		// the key is not cached yet
+		c.setInner(key, negInt64One, expireTime)
+		return negInt64One, nil
+	}
+	ele, ok := v.Value.(*cacheEle)
+	if !ok {
+		c.safeDeleteByKey(key, v)
+		c.setInner(key, negInt64One, expireTime)
+		return negInt64One, nil
+	}
+	c.MoveToFront(v)
+	if ele.expireTime == negInt64One || time.Now().UnixNano() < ele.expireTime {
+		newValue, ok := ele.data.(int64)
+		if !ok || newValue == math.MinInt64 {
+			return 0, fmt.Errorf("the cache value is not valid, ok:%v", ok)
+		}
+		newValue--
+		pkgElement(ele, newValue, expireTime)
+		return newValue, nil
+	}
+	// the element has expired: restart the counter
+	pkgElement(ele, negInt64One, expireTime)
+	return negInt64One, nil
+}
+
+func (c *lruCache) setInner(key string, value interface{}, expireTime time.Duration) {
+	if c == nil {
+		return
+	}
+	if c.Len()+1 > c.maxSize {
+		c.safeRemoveOldest()
+	}
+	newElem := &cacheEle{
+		key:        key,
+		data:       value,
+		expireTime: negInt64One,
+	}
+	if expireTime != time.Duration(negInt64One) {
+		newElem.expireTime = time.Now().UnixNano() + int64(expireTime)
+	}
+	e := c.PushFront(newElem)
+	c.elemIndex[key] = e
+}
+
+func (c *lruCache) safeDeleteByKey(key string, v *list.Element) {
+	if c == nil {
+		return
+	}
+	c.List.Remove(v)
+	delete(c.elemIndex, key)
+}
+
+func (c *lruCache) safeRemoveOldest() {
+	if c == nil {
+		return
+	}
+	v := c.List.Back()
+	if v == nil {
+		return
+	}
+	c.List.Remove(v)
+	ele, ok := v.Value.(*cacheEle)
+	if !ok {
+		return
+	}
+	delete(c.elemIndex, ele.key)
+}
diff --git a/mind-cluster/component/ascend-common/common-utils/cache/lrucache_test.go b/mind-cluster/component/ascend-common/common-utils/cache/lrucache_test.go
new file mode 100644
index 0000000..a8b5ea0
--- /dev/null
+++ b/mind-cluster/component/ascend-common/common-utils/cache/lrucache_test.go
@@ -0,0 +1,304 @@
+/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+// Package cache implements a memory-based LRU local cache
+package cache
+
+import (
+	"container/list"
+	"fmt"
+	"math"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/smartystreets/goconvey/convey"
+)
+
+const (
+	cacheTime      = 500
+	goRoutineCount = 10
+)
+
+func TestSet(t *testing.T) {
+	cache := New(1)
+	convey.Convey("test lru cacheTime", t, func() {
+		cache.Set("testkey1", "1", cacheTime*time.Millisecond)
+		v, err := cache.Get("testkey1")
+		convey.So(err, convey.ShouldEqual, nil)
+		convey.So(v, convey.ShouldEqual, "1")
+		time.Sleep(cacheTime * time.Millisecond)
+		v, err = cache.Get("testkey1")
+		convey.So(v, convey.ShouldEqual, nil)
+		convey.So(err, convey.ShouldNotEqual, nil)
+	})
+	convey.Convey("test set twice", t, func() {
+		cache.Set("testkey1", "1", time.Minute)
+		v, err := cache.Get("testkey1")
+		convey.So(err, convey.ShouldEqual, nil)
+		convey.So(v, convey.ShouldEqual, "1")
+		cache.Set("testkey1", "2", time.Minute)
+		v, err = cache.Get("testkey1")
+		convey.So(err, convey.ShouldEqual, nil)
+		convey.So(v, convey.ShouldEqual, "2")
+	})
+	convey.Convey("SET failed", t, func() {
+		c := &lruCache{}
+		err := c.setValue("test", "1", time.Minute)
+		convey.So(err.Error(), convey.ShouldEqual, "not initializes")
+		_, err = c.getValue("test")
+		convey.So(err.Error(), convey.ShouldEqual, "not initializes")
+	})
+	convey.Convey("SET not expired", t, func() {
+		cache.Set("testkey2", "1", time.Second)
+		err := cache.Set("testkey2", "1", time.Duration(negInt64One))
+		convey.So(err, convey.ShouldEqual, nil)
+		v, err := cache.Get("testkey2")
+		convey.So(err, convey.ShouldEqual, nil)
+		convey.So(v, convey.ShouldEqual, "1")
+	})
+	convey.Convey("SET parameter error", t, func() {
+		err := cache.Set("testkey2", "1", -time.Second)
+		convey.So(err.Error(), convey.ShouldEqual, "parameter error")
+	})
+}
+
+func TestDelete(t *testing.T) {
+	cache := New(1)
+	convey.Convey("test lru delete", t, func() {
+		cache.Set("testkey1", "1", time.Minute)
+		v, err := cache.Get("testkey1")
+		convey.So(err, convey.ShouldEqual, nil)
+		convey.So(v, convey.ShouldEqual, "1")
+		cache.Delete("testkey1")
+		v, err = cache.Get("testkey1")
+		convey.So(v, convey.ShouldEqual, nil)
+		convey.So(err, convey.ShouldNotEqual, nil)
+	})
+	convey.Convey("Delete no thing", t, func() {
+		c := &lruCache{}
+		c.delValue("test")
+	})
+}
+
+func TestSetIfNX(t *testing.T) {
+	cache := New(1)
+	convey.Convey("SetIfNX set parameter error", t, func() {
+		r := cache.SetIfNX("testkey1", "1", -time.Millisecond)
+		convey.So(r, convey.ShouldEqual, false)
+	})
+	convey.Convey("SetIfNX set success", t, func() {
+		r := cache.SetIfNX("testkey1", "1", cacheTime*time.Millisecond)
+		convey.So(r, convey.ShouldEqual, true)
+	})
+	convey.Convey("SetIfNX set success failed", t, func() {
+		r := cache.SetIfNX("testkey1", "1", cacheTime*time.Millisecond)
+		convey.So(r, convey.ShouldEqual, false)
+	})
+	time.Sleep(cacheTime * time.Millisecond)
+	convey.Convey("SetIfNX set success", t, func() {
+		r := cache.SetIfNX("testkey1", "1", time.Second)
+		convey.So(r, convey.ShouldEqual, true)
+	})
+	convey.Convey("SetIfNX expireTime -1", t, func() {
+		r := cache.SetIfNX("testkey", "1", time.Duration(negInt64One))
+		convey.So(r, convey.ShouldEqual, true)
+		r = cache.SetIfNX("testkey", "1", time.Duration(negInt64One))
+		convey.So(r, convey.ShouldEqual, false)
+	})
+
+}
+
+func TestSetIfNXConcurrencyTest(t *testing.T) {
+	cache := New(1)
+	convey.Convey("SetIfNX concurrency test", t, func() {
+		var count = 0
+		count = testSetIfNX(cache, count)
+		convey.So(count, convey.ShouldEqual, 1)
+	})
+}
+
+func testSetIfNX(cache *ConcurrencyLRUCache, count int) int {
+	l := sync.Mutex{}
+	wg := sync.WaitGroup{}
+	wg.Add(goRoutineCount)
+	for i := 0; i < goRoutineCount; i++ {
+		go func() {
+			defer wg.Done()
+			r := cache.SetIfNX("testkey2", "1", time.Second)
+			if r {
+				l.Lock()
+				count++
+				l.Unlock()
+			}
+		}()
+	}
+	wg.Wait()
+	return count
+}
+
+func TestINCRConcurrencyTest(t *testing.T) {
+	cache := New(1)
+	convey.Convey("INCR concurrency test", t, func() {
+		max := testIncr(cache)
+		convey.So(max, convey.ShouldEqual, goRoutineCount)
+	})
+}
+
+func testIncr(cache *ConcurrencyLRUCache) int64 {
+	var max = int64Zero
+	l := sync.Mutex{}
+	wg := sync.WaitGroup{}
+	wg.Add(goRoutineCount)
+	for i := 0; i < goRoutineCount; i++ {
+		go func() {
+			defer wg.Done()
+			r, err := cache.INCR("testkey1", time.Second)
+			if err != nil {
+				return
+			}
+			l.Lock()
+			if r > max {
+				max = r
+			}
+			l.Unlock()
+		}()
+	}
+	wg.Wait()
+	return max
+}
+
+func TestDECRConcurrencyTest(t *testing.T) {
+	cache := New(1)
+	cache.Set("testkey1", int64(goRoutineCount), time.Minute)
+	convey.Convey("DECR concurrency test", t, func() {
+		min := testDecr(cache)
+		convey.So(min, convey.ShouldEqual, 0)
+	})
+}
+
+func testDecr(cache *ConcurrencyLRUCache) int64 {
+	var min = int64(math.MaxInt)
+	l := sync.Mutex{}
+	wg := sync.WaitGroup{}
+	wg.Add(goRoutineCount)
+	for i := 0; i < goRoutineCount; i++ {
+		go func() {
+			defer wg.Done()
+			r, err := cache.DECR("testkey1", time.Second)
+			if err != nil {
+				return
+			}
+			l.Lock()
+			if r < min {
+				min = r
+			}
+			l.Unlock()
+		}()
+	}
+	wg.Wait()
+	return min
+}
+
+func TestINCR(t *testing.T) {
+	cache := New(1)
+	convey.Convey("not initializes", t, func() {
+		c := &lruCache{}
+		_, err := c.increment("test", time.Minute)
+		convey.So(err, convey.ShouldEqual, notInitErr)
+	})
+	convey.Convey("parameter error", t, func() {
+		_, err := cache.INCR("testkey", -time.Minute)
+		convey.So(err, convey.ShouldEqual, paraErr)
+	})
+	convey.Convey("INCR success", t, func() {
+		r, err := cache.INCR("testkey", time.Minute)
+		convey.So(r, convey.ShouldEqual, 1)
+		convey.So(err, convey.ShouldEqual, nil)
+		r, err = cache.INCR("testkey", time.Minute)
+		convey.So(r, convey.ShouldEqual, intTwo)
+	})
+
+	convey.Convey("INCR success when exits", t, func() {
+		cache.Set("testkey1", int64Zero, cacheTime*time.Millisecond)
+		r, err := cache.INCR("testkey1", cacheTime*time.Millisecond)
+		convey.So(r, convey.ShouldEqual, 1)
+		convey.So(err, convey.ShouldEqual, nil)
+		time.Sleep(cacheTime * time.Millisecond)
+		r, err = cache.INCR("testkey1", time.Minute)
+		convey.So(r, convey.ShouldEqual, 1)
+	})
+}
+
+func TestDECR(t *testing.T) {
+	cache := New(1)
+	convey.Convey("not initializes", t, func() {
+		c := &lruCache{}
+		_, err := c.decrement("test", time.Minute)
+		convey.So(err, convey.ShouldEqual, notInitErr)
+	})
+	convey.Convey("parameter error", t, func() {
+		_, err := cache.DECR("testkey1", -time.Minute)
+		convey.So(err, convey.ShouldEqual, paraErr)
+	})
+	convey.Convey("DECR success", t, func() {
+		r, err := cache.DECR("testkey1", time.Minute)
+		convey.So(r, convey.ShouldEqual, negInt64One)
+		convey.So(err, convey.ShouldEqual, nil)
+		cache.Set("testkey1", int64One, time.Minute)
+		r, err = cache.DECR("testkey1", time.Minute)
+		convey.So(r, convey.ShouldEqual, 0)
+		convey.So(err, convey.ShouldEqual, nil)
+	})
+	convey.Convey("Decr success when exits", t, func() {
+		cache.Set("testkey2", int64One, cacheTime*time.Millisecond)
+		r, err := cache.DECR("testkey2", cacheTime*time.Millisecond)
+		convey.So(r, convey.ShouldEqual, 0)
+		convey.So(err, convey.ShouldEqual, nil)
+		time.Sleep(cacheTime * time.Millisecond)
+		r, err = cache.DECR("testkey2", time.Minute)
+		convey.So(err, convey.ShouldEqual, nil)
+		convey.So(r, convey.ShouldEqual, negInt64One)
+	})
+}
+
+func TestLRU(t *testing.T) {
+	convey.Convey("not initializes", t, func() {
+		c := &lruCache{
+			maxSize:   intTwo,
+			elemIndex: make(map[string]*list.Element, segmentCount),
+			List:      list.New(),
+			mu:        sync.Mutex{},
+		}
+		c.setValue("test", "1", time.Minute)
+		c.setValue("test1", "1", time.Minute)
+		c.setValue("test2", "1", time.Minute)
+		_, err := c.getValue("test")
+		convey.So(err.Error(), convey.ShouldEqual, "no value found")
+	})
+}
+
+func BenchmarkSetIfNx(b *testing.B) {
+	cache := New(1)
+	for n := 0; n < b.N; n++ {
+		cache.SetIfNX(fmt.Sprintf("key%d", n), "xx", time.Second)
+	}
+}
+
+func BenchmarkINCR(b *testing.B) {
+	cache := New(1)
+	for n := 0; n < b.N; n++ {
+		cache.INCR("sdds", time.Second)
+	}
+}
diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/api.go b/mind-cluster/component/ascend-common/common-utils/hwlog/api.go
new file mode 100644
index 0000000..65de3e7
--- /dev/null
+++ b/mind-cluster/component/ascend-common/common-utils/hwlog/api.go
@@ -0,0 +1,310 @@
+/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+// Package hwlog provides the capability of processing Huawei log rules.
+package hwlog
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"log"
+	"os"
+	"path"
+)
+
+const (
+	logDebugLv = iota - 1
+	logInfoLv
+	logWarnLv
+	logErrorLv
+	logCriticalLv
+)
+
+type logger struct {
+	lgDebug    *log.Logger
+	lgInfo     *log.Logger
+	lgWarn     *log.Logger
+	lgError    *log.Logger
+	lgCritical *log.Logger
+	lgCtrl     *LogLimiter
+	lgLevel    int
+	lgMaxLine  int
+}
+
+func (lg *logger) initLogWriter(w io.Writer) {
+	lg.lgDebug = log.New(w, "[DEBUG] ", log.Ldate|log.Lmicroseconds)
+	lg.lgInfo = log.New(w, "[INFO] ", log.Ldate|log.Lmicroseconds)
+	lg.lgWarn = log.New(w, "[WARN] ", log.Ldate|log.Lmicroseconds)
+	lg.lgError = log.New(w, "[ERROR] ", log.Ldate|log.Lmicroseconds)
+	lg.lgCritical = log.New(w, "[Critical] ", log.Ldate|log.Lmicroseconds)
+}
+
+func (lg *logger) setLoggerLevel(lv int) {
+	if lv < minLogLevel || lv > maxLogLevel {
+		lg.lgLevel = 0
+		return
+	}
+	lg.lgLevel = lv
+}
+
+func (lg *logger) setLoggerMaxLine(lml int) {
+	if lml <= 0 || lml > maxEachLineLen {
+		lg.lgMaxLine = defaultMaxEachLineLen
+		return
+	}
+	lg.lgMaxLine = lml
+}
+
+func (lg *logger) setLoggerWriter(config *LogConfig) {
+	rollLogger := &Logs{
+		FileName:   config.LogFileName,
+		Capacity:   config.FileMaxSize, // megabytes
+		SaveVolume: config.MaxBackups,
+		SaveTime:   config.MaxAge, // days
+	}
+	logWriter := &LogLimiter{
+		Logs:        rollLogger,
+		ExpiredTime: config.ExpiredTime, // seconds
+		CacheSize:   config.CacheSize,
+	}
+	if config.OnlyToStdout {
+		lg.initLogWriter(os.Stdout)
+		return
+	}
+	lg.lgCtrl = logWriter // keep the limiter for FlushMem in every mode that writes to the file
+	if config.OnlyToFile {
+		lg.initLogWriter(logWriter)
+		return
+	}
+	writer := io.MultiWriter(os.Stdout, logWriter)
+	lg.initLogWriter(writer)
+}
+
+func (lg *logger) setLogger(config *LogConfig) error {
+	if err := validateLogConfigFiled(config); err != nil {
+		return err
+	}
+	lg.setLoggerWriter(config)
+	lg.setLoggerLevel(config.LogLevel)
+	lg.setLoggerMaxLine(config.MaxLineLength)
+	msg := fmt.Sprintf("%s's logger init success", path.Base(config.LogFileName))
+	// skip change file mode and fs notify
+	if config.OnlyToStdout {
+		// stdout-only mode has no log file, so the init message and chmod below do not apply
+		return nil
+	}
+	lg.Info(msg)
+	if err := os.Chmod(config.LogFileName, LogFileMode); err != nil {
+		lg.Errorf("change file mode failed: %v", err)
+		return fmt.Errorf("set log file mode failed")
+	}
+	return nil
+}
+
+func (lg *logger) isInit() bool {
+	return lg.lgDebug != nil && lg.lgInfo != nil && lg.lgWarn != nil && lg.lgError != nil && lg.lgCritical != nil
+}
+
+// Debug records an unformatted debug log without context
+func (lg *logger) Debug(args ...interface{}) {
+	lg.DebugWithCtx(nil, args...)
+}
+
+// Debugf records a formatted debug log without context
+func (lg *logger) Debugf(format string, args ...interface{}) {
+	lg.DebugfWithCtx(nil, format, args...)
+}
+
+// DebugWithCtx records an unformatted debug log with context; skipped when lgLevel is above debug
+func (lg *logger) DebugWithCtx(ctx context.Context, args ...interface{}) {
+	if lg.lgLevel > logDebugLv {
+		return
+	}
+	if lg.validate() {
+		printHelper(lg.lgDebug, fmt.Sprint(args...), lg.lgMaxLine, ctx)
+	}
+}
+
+// DebugfWithCtx records a formatted debug log with context; skipped when lgLevel is above debug
+func (lg *logger) DebugfWithCtx(ctx context.Context, format string, args ...interface{}) {
+	if lg.lgLevel > logDebugLv {
+		return
+	}
+	if lg.validate() {
+		printHelper(lg.lgDebug, fmt.Sprintf(format, args...), lg.lgMaxLine, ctx)
+	}
+}
+
+// Info records an unformatted info log without context
+func (lg *logger) Info(args ...interface{}) {
+	lg.InfoWithCtx(nil, args...)
+}
+
+// Infof records a formatted info log without context
+func (lg *logger) Infof(format string, args ...interface{}) {
+	lg.InfofWithCtx(nil, format, args...)
+}
+
+// InfoWithCtx records an unformatted info log with context; if you have no ctx, please use Info
+func (lg *logger) InfoWithCtx(ctx context.Context, args ...interface{}) {
+	if lg.lgLevel > logInfoLv {
+		return
+	}
+	if lg.validate() {
+		printHelper(lg.lgInfo, fmt.Sprint(args...), lg.lgMaxLine, ctx)
+	}
+}
+
+// InfofWithCtx records a formatted info log with context; if you have no ctx, please use Infof
+func (lg *logger) InfofWithCtx(ctx context.Context, format string, args ...interface{}) {
+	if lg.lgLevel > logInfoLv {
+		return
+	}
+	if lg.validate() {
+		printHelper(lg.lgInfo, fmt.Sprintf(format, args...), lg.lgMaxLine, ctx)
+	}
+}
+
+// Warn records an unformatted warn log without context
+func (lg *logger) Warn(args ...interface{}) {
+	lg.WarnWithCtx(nil, args...)
+}
+
+// Warnf records a formatted warn log without context
+func (lg *logger) Warnf(format string, args ...interface{}) {
+	lg.WarnfWithCtx(nil, format, args...)
+}
+
+// WarnWithCtx records an unformatted warn log with context; if you have no ctx, please use Warn
+func (lg *logger) WarnWithCtx(ctx context.Context, args ...interface{}) {
+	if lg.lgLevel > logWarnLv {
+		return
+	}
+	if lg.validate() {
+		printHelper(lg.lgWarn, fmt.Sprint(args...), lg.lgMaxLine, ctx)
+	}
+}
+
+// WarnfWithCtx records a formatted warn log with context; if you have no ctx, please use Warnf
+func (lg *logger) WarnfWithCtx(ctx context.Context, format string, args ...interface{}) {
+	if lg.lgLevel > logWarnLv {
+		return
+	}
+	if lg.validate() {
+		printHelper(lg.lgWarn, fmt.Sprintf(format, args...), lg.lgMaxLine, ctx)
+	}
+}
+
+// WarnfWithLimit records a warn log for a limited number of times (ProblemOccurMaxNumbers); domain is the logType
+// of msg, id is a unique identifier within this logType, you can reset the counter by calling ResetErrCnt
+func (lg *logger) WarnfWithLimit(domain string, id interface{}, format string, args ...interface{}) {
+	if needPrint, extraErrLog := IsNeedPrintWithSpecifiedCounts(domain, id, ProblemOccurMaxNumbers); needPrint {
+		format = fmt.Sprintf("%s %s", format, extraErrLog)
+		lg.WarnfWithCtx(nil, format, args...)
+	}
+}
+
+// Error records an unformatted error log without context
+func (lg *logger) Error(args ...interface{}) {
+	lg.ErrorWithCtx(nil, args...)
+}
+
+// Errorf records a formatted error log without context
+func (lg *logger) Errorf(format string, args ...interface{}) {
+	lg.ErrorfWithCtx(nil, format, args...)
+}
+
+// ErrorfWithLimit records an error log for a limited number of times (ProblemOccurMaxNumbers); domain is the logType
+// of msg, id is a unique identifier within this logType, you can reset the counter by calling ResetErrCnt
+func (lg *logger) ErrorfWithLimit(domain string, id interface{}, format string, args ...interface{}) {
+	if needPrint, extraErrLog := IsNeedPrintWithSpecifiedCounts(domain, id, ProblemOccurMaxNumbers); needPrint {
+		format = fmt.Sprintf("%s %s", format, extraErrLog)
+		lg.ErrorfWithCtx(nil, format, args...)
+	}
+}
+
+// ErrorfWithSpecifiedCounts records an error log for a specified number of times; domain is the logType of msg,
+// id is a unique identifier within this logType, maxCounts is the max print count,
+// you can reset the counter by calling ResetErrCnt
+func (lg *logger) ErrorfWithSpecifiedCounts(domain string, id interface{}, maxCounts int,
+	format string, args ...interface{}) {
+	if needPrint, extraErrLog := IsNeedPrintWithSpecifiedCounts(domain, id, maxCounts); needPrint {
+		format = fmt.Sprintf("%s %s", format, extraErrLog)
+		lg.ErrorfWithCtx(nil, format, args...)
+	}
+}
+
+// ErrorWithCtx records an unformatted error log with context; if you have no ctx, please use Error
+func (lg *logger) ErrorWithCtx(ctx context.Context, args ...interface{}) {
+	if lg.lgLevel > logErrorLv {
+		return
+	}
+	if lg.validate() {
+		printHelper(lg.lgError, fmt.Sprint(args...), lg.lgMaxLine, ctx)
+	}
+}
+
+// ErrorfWithCtx records a formatted error log with context; if you have no ctx, please use Errorf
+func (lg *logger) ErrorfWithCtx(ctx context.Context, format string, args ...interface{}) {
+	if lg.lgLevel > logErrorLv {
+		return
+	}
+	if lg.validate() {
+		printHelper(lg.lgError, fmt.Sprintf(format, args...), lg.lgMaxLine, ctx)
+	}
+}
+
+// Critical records an unformatted critical log without context
+func (lg *logger) Critical(args ...interface{}) {
+	lg.CriticalWithCtx(nil, args...)
+}
+
+// Criticalf records a formatted critical log without context
+func (lg *logger) Criticalf(format string, args ...interface{}) {
+	lg.CriticalfWithCtx(nil, format, args...)
+}
+
+// CriticalWithCtx records an unformatted critical log with context; if you have no ctx, please use Critical
+func (lg *logger) CriticalWithCtx(ctx context.Context, args ...interface{}) {
+	if lg.lgLevel > logCriticalLv {
+		return
+	}
+	if lg.validate() {
+		printHelper(lg.lgCritical, fmt.Sprint(args...), lg.lgMaxLine, ctx)
+	}
+}
+
+// CriticalfWithCtx records a formatted critical log with context; if you have no ctx, please use Criticalf
+func (lg *logger) CriticalfWithCtx(ctx context.Context, format string, args ...interface{}) {
+	if lg.lgLevel > logCriticalLv {
+		return
+	}
+	if lg.validate() {
+		printHelper(lg.lgCritical, fmt.Sprintf(format, args...), lg.lgMaxLine, ctx)
+	}
+}
+
+func (lg *logger) validate() bool {
+	if lg == nil || !lg.isInit() {
+		fmt.Println("Fatal function's logger is nil")
+		return false
+	}
+	return true
+}
+
+// FlushMem flushes lgCtrl; NOTE(review): lgCtrl stays nil in stdout-only mode, confirm Flush tolerates a nil receiver
+func (lg *logger) FlushMem() error {
+	return lg.lgCtrl.Flush()
+}
diff --git
a/mind-cluster/component/ascend-common/common-utils/hwlog/api_test.go b/mind-cluster/component/ascend-common/common-utils/hwlog/api_test.go
new file mode 100644
index 0000000..ecdcef6
--- /dev/null
+++ b/mind-cluster/component/ascend-common/common-utils/hwlog/api_test.go
@@ -0,0 +1,165 @@
+/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+// Package hwlog test file for the logger methods
+package hwlog
+
+import (
+	"fmt"
+	"io/fs"
+	"os"
+	"path"
+	"path/filepath"
+	"testing"
+
+	"github.com/agiledragon/gomonkey/v2"
+	"github.com/smartystreets/goconvey/convey"
+
+	"ascend-common/common-utils/utils"
+)
+
+func TestNewLogger(t *testing.T) {
+	convey.Convey("test api", t, func() {
+		convey.Convey("test setLogger func", func() {
+			lgConfig := &LogConfig{
+				OnlyToStdout: true,
+			}
+			lg := new(logger)
+			err := lg.setLogger(lgConfig)
+			convey.So(err, convey.ShouldBeNil)
+			// file-only mode: utils.CheckPath and os.Chmod are patched to always succeed
+			mockPathCheck := gomonkey.ApplyFunc(utils.CheckPath, func(_ string) (string, error) {
+				return "", nil
+			})
+			mockMkdir := gomonkey.ApplyFunc(os.Chmod, func(_ string, _ fs.FileMode) error {
+				return nil
+			})
+			defer mockPathCheck.Reset()
+			defer mockMkdir.Reset()
+			lgConfig = &LogConfig{
+				LogFileName: path.Join(filepath.Dir(os.Args[0]), "t.log"),
+				OnlyToFile:  true,
+				MaxBackups:  DefaultMaxBackups,
+				MaxAge:      DefaultMinSaveAge,
+				CacheSize:   DefaultCacheSize,
+				ExpiredTime: DefaultExpiredTime,
+			}
+			err = lg.setLogger(lgConfig)
+			convey.So(err, convey.ShouldBeNil)
+		})
+	})
+}
+
+func TestLoggerPrint(t *testing.T) {
+	convey.Convey("test api", t, func() {
+		convey.Convey("test logger print func", func() {
+			lgConfig := &LogConfig{
+				OnlyToStdout: true,
+				LogLevel:     -1, // -1 is the value of logDebugLv
+			}
+			lg := new(logger)
+			err := lg.setLogger(lgConfig)
+			convey.So(err, convey.ShouldBeNil)
+			lg.Debug("test debug")
+			lg.Debugf("test debugf")
+			lg.Info("test info")
+			lg.Infof("test infof")
+			lg.Warn("test warn")
+			lg.Warnf("test warnf")
+			lg.Error("test error")
+			lg.Errorf("test errorf")
+			lg.Critical("test critical")
+			lg.Criticalf("test criticalf")
+			lg.setLoggerLevel(maxLogLevel + 1) // out-of-range level: setLoggerLevel falls back to level 0
+			lg.Debug("test debug")
+			lg.Debugf("test debugf")
+			lg.Info("test info")
+			lg.Infof("test infof")
+			lg.Warn("test warn")
+			lg.Warnf("test warnf")
+			lg.Error("test error")
+			lg.Errorf("test errorf")
+			lg.Critical("test critical")
+			lg.Criticalf("test criticalf")
+		})
+	})
+}
+func TestLoggerPrintWithLimit(t *testing.T) {
+	convey.Convey("test api", t, func() {
+		convey.Convey("test logger print func with limit", func() {
+			lgConfig := &LogConfig{
+				OnlyToStdout: true,
+				LogLevel:     -1, // -1 is the value of logDebugLv
+			}
+			lg := new(logger)
+			err := lg.setLogger(lgConfig)
+			convey.So(err, convey.ShouldBeNil)
+			domain := "hccs"
+			logicId := 1
+
+			errFormat := "collect failed ,err:%v"
+			collectErr := fmt.Errorf("detail errs,logicId(%d)", logicId)
+			lg.ErrorfWithLimit(domain, logicId, errFormat, collectErr)
+			lg.ErrorfWithLimit(domain, logicId, errFormat, collectErr)
+			lg.ErrorfWithLimit(domain, logicId, errFormat, collectErr)
+			lg.ErrorfWithLimit(domain, logicId, errFormat, collectErr)
+			ResetErrCnt(domain, logicId) // reset the (domain, id) counter before logging again
+			lg.ErrorfWithLimit(domain, logicId, errFormat, collectErr)
+			lg.ErrorfWithLimit(domain, logicId, errFormat, collectErr)
+		})
+	})
+}
+
+func TestWarnfWithLimit(t *testing.T) {
+	convey.Convey("test api", t, func() {
+		convey.Convey("test warn logger print func with limit", func() {
+			lgConfig := &LogConfig{
+				OnlyToStdout: true,
+				LogLevel:     -1, // -1 is the value of logDebugLv
+			}
+			lg := new(logger)
+			err := lg.setLogger(lgConfig)
+			convey.So(err, convey.ShouldBeNil)
+			domain := "hccs"
+			logicId := 1
+
+			errFormat := "collect failed ,err:%v"
+			collectErr := fmt.Errorf("detail errs,logicId(%d)", logicId)
+			lg.WarnfWithLimit(domain, logicId, errFormat, collectErr)
+			lg.WarnfWithLimit(domain, logicId, errFormat, collectErr)
+			lg.WarnfWithLimit(domain, logicId, errFormat, collectErr)
+			lg.WarnfWithLimit(domain, logicId, errFormat, collectErr)
+			ResetErrCnt(domain, logicId) // reset the (domain, id) counter before logging again
+			lg.WarnfWithLimit(domain, logicId, errFormat, collectErr)
+			lg.WarnfWithLimit(domain, logicId, errFormat, collectErr)
+		})
+	})
+}
+
+func TestValidate(t *testing.T) {
+	convey.Convey("test api", t, func() {
+		convey.Convey("test validate", func() {
+			lg := new(logger)
+			res := lg.validate() // no level logger created yet, so isInit is false
+			convey.So(res, convey.ShouldBeFalse)
+			lgConfig := &LogConfig{
+				OnlyToStdout: true,
+			}
+			err := lg.setLogger(lgConfig)
+			convey.So(err, convey.ShouldBeNil)
+			res = lg.validate()
+			convey.So(res, convey.ShouldBeTrue)
+		})
+	})
+}
diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/hwlog_adaptor.go b/mind-cluster/component/ascend-common/common-utils/hwlog/hwlog_adaptor.go
new file mode 100644
index 0000000..5e5c567
--- /dev/null
+++ b/mind-cluster/component/ascend-common/common-utils/hwlog/hwlog_adaptor.go
@@ -0,0 +1,174 @@
+/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/ + +// Package hwlog provides the capability of processing Huawei log rules. +package hwlog + +import ( + "context" + "errors" +) + +// RunLog run logger +var RunLog *logger + +// InitRunLogger initialize run logger +func InitRunLogger(config *LogConfig, ctx context.Context) error { + if config == nil { + return errors.New("run logger config is nil") + } + if RunLog != nil && RunLog.isInit() { + RunLog.Warn("run logger is been initialized") + return nil + } + RunLog = new(logger) + if RunLog == nil { + return errors.New("malloc new logger flied") + } + if err := RunLog.setLogger(config); err != nil { + return err + } + if !RunLog.isInit() { + return errors.New("run logger init failed") + } + return nil +} + +// OpLog operate logger +var OpLog *logger + +// InitOperateLogger initialize operate logger +func InitOperateLogger(config *LogConfig, ctx context.Context) error { + if config == nil { + return errors.New("operate logger config is nil") + } + if OpLog != nil && OpLog.isInit() { + OpLog.Warn("operate logger is been initialized") + return nil + } + OpLog = new(logger) + if OpLog == nil { + return errors.New("malloc new logger flied") + } + if err := OpLog.setLogger(config); err != nil { + return err + } + if !OpLog.isInit() { + return errors.New("operate logger init failed") + } + return nil +} + +// SecLog security logger +var SecLog *logger + +// InitSecurityLogger initialize security logger +func InitSecurityLogger(config *LogConfig, ctx context.Context) error { + if config == nil { + return errors.New("security logger config is nil") + } + if SecLog != nil && SecLog.isInit() { + SecLog.Warn("security logger is been initialized") + return nil + } + SecLog = new(logger) + if SecLog == nil { + return errors.New("malloc new logger flied") + } + if err := SecLog.setLogger(config); err != nil { + return err + } + if !SecLog.isInit() { + return errors.New("security logger init failed") + } + return nil +} + +// UserLog user logger +var UserLog *logger + +// 
InitUserLogger initialize user logger +func InitUserLogger(config *LogConfig, ctx context.Context) error { + if config == nil { + return errors.New("user logger config is nil") + } + if UserLog != nil && UserLog.isInit() { + UserLog.Warn("user logger is been initialized") + return nil + } + UserLog = new(logger) + if UserLog == nil { + return errors.New("malloc new logger flied") + } + if err := UserLog.setLogger(config); err != nil { + return err + } + if !UserLog.isInit() { + return errors.New("user logger init failed") + } + return nil +} + +// DebugLog debug logger +var DebugLog *logger + +// InitDebugLogger initialize debug logger +func InitDebugLogger(config *LogConfig, ctx context.Context) error { + if config == nil { + return errors.New("debug logger config is nil") + } + if DebugLog != nil && DebugLog.isInit() { + DebugLog.Warn("debug logger is been initialized") + return nil + } + DebugLog = new(logger) + if DebugLog == nil { + return errors.New("malloc new logger flied") + } + if err := DebugLog.setLogger(config); err != nil { + return err + } + if !DebugLog.isInit() { + return errors.New("debug logger init failed") + } + return nil +} + +// CustomLogger custom logger +type CustomLogger struct { + *logger +} + +// NewCustomLogger create a new custom logger +func NewCustomLogger(config *LogConfig, ctx context.Context) (*CustomLogger, error) { + if config == nil { + return nil, errors.New("custom logger config is nil") + } + log := new(logger) + if err := log.setLogger(config); err != nil { + return nil, err + } + if !log.isInit() { + return nil, errors.New("logger init failed") + } + return &CustomLogger{logger: log}, nil +} + +// SetCustomLogger set custom logger +func SetCustomLogger(log *logger) *CustomLogger { + if log == nil { + return nil + } + return &CustomLogger{logger: log} +} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/hwlog_adaptor_test.go b/mind-cluster/component/ascend-common/common-utils/hwlog/hwlog_adaptor_test.go 
new file mode 100644 index 0000000..a32e9be --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/hwlog/hwlog_adaptor_test.go @@ -0,0 +1,126 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package hwlog test file +package hwlog + +import ( + "context" + "errors" + "testing" + + "github.com/smartystreets/goconvey/convey" +) + +func TestInitRunLogger(t *testing.T) { + convey.Convey("test hwlog adaptor", t, func() { + convey.Convey("test init run log", func() { + ctx, cancel := context.WithCancel(context.TODO()) + err := InitRunLogger(nil, ctx) + convey.So(err, convey.ShouldBeError, errors.New("run logger config is nil")) + lgConfig := &LogConfig{OnlyToStdout: true} + err = InitRunLogger(lgConfig, ctx) + convey.So(err, convey.ShouldBeNil) + // repeat initialize + err = InitRunLogger(lgConfig, ctx) + convey.So(err, convey.ShouldBeNil) + cancel() + }) + }) +} + +func TestNewCustomLogger(t *testing.T) { + convey.Convey("test hwlog adaptor", t, func() { + convey.Convey("test init custom log", func() { + ctx, cancel := context.WithCancel(context.TODO()) + _, err := NewCustomLogger(nil, ctx) + convey.So(err, convey.ShouldBeError, errors.New("custom logger config is nil")) + lgConfig := &LogConfig{OnlyToStdout: true} + _, err = NewCustomLogger(lgConfig, ctx) + convey.So(err, convey.ShouldBeNil) + // repeat initialize + _, err = NewCustomLogger(lgConfig, ctx) + convey.So(err, 
convey.ShouldBeNil) + cancel() + }) + }) +} + +func TestInitOperateLogger(t *testing.T) { + convey.Convey("test hwlog adaptor", t, func() { + convey.Convey("test init operate log", func() { + ctx, cancel := context.WithCancel(context.TODO()) + err := InitOperateLogger(nil, ctx) + convey.So(err, convey.ShouldBeError, errors.New("operate logger config is nil")) + lgConfig := &LogConfig{OnlyToStdout: true} + err = InitOperateLogger(lgConfig, ctx) + convey.So(err, convey.ShouldBeNil) + // repeat initialize + err = InitOperateLogger(lgConfig, ctx) + convey.So(err, convey.ShouldBeNil) + cancel() + }) + }) +} + +func TestInitSecurityLogger(t *testing.T) { + convey.Convey("test hwlog adaptor", t, func() { + convey.Convey("test init security log", func() { + ctx, cancel := context.WithCancel(context.TODO()) + err := InitSecurityLogger(nil, ctx) + convey.So(err, convey.ShouldBeError, errors.New("security logger config is nil")) + lgConfig := &LogConfig{OnlyToStdout: true} + err = InitSecurityLogger(lgConfig, ctx) + convey.So(err, convey.ShouldBeNil) + // repeat initialize + err = InitSecurityLogger(lgConfig, ctx) + convey.So(err, convey.ShouldBeNil) + cancel() + }) + }) +} + +func TestInitUserLogger(t *testing.T) { + convey.Convey("test hwlog adaptor", t, func() { + convey.Convey("test init user log", func() { + ctx, cancel := context.WithCancel(context.TODO()) + err := InitUserLogger(nil, ctx) + convey.So(err, convey.ShouldBeError, errors.New("user logger config is nil")) + lgConfig := &LogConfig{OnlyToStdout: true} + err = InitUserLogger(lgConfig, ctx) + convey.So(err, convey.ShouldBeNil) + // repeat initialize + err = InitUserLogger(lgConfig, ctx) + convey.So(err, convey.ShouldBeNil) + cancel() + }) + }) +} + +func TestInitDebugLogger(t *testing.T) { + convey.Convey("test hwlog adaptor", t, func() { + convey.Convey("test init debug log", func() { + ctx, cancel := context.WithCancel(context.TODO()) + err := InitDebugLogger(nil, ctx) + convey.So(err, convey.ShouldBeError, 
errors.New("debug logger config is nil")) + lgConfig := &LogConfig{OnlyToStdout: true} + err = InitDebugLogger(lgConfig, ctx) + convey.So(err, convey.ShouldBeNil) + // repeat initialize + err = InitDebugLogger(lgConfig, ctx) + convey.So(err, convey.ShouldBeNil) + cancel() + }) + }) +} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/log_limiter.go b/mind-cluster/component/ascend-common/common-utils/hwlog/log_limiter.go new file mode 100644 index 0000000..88cfb9d --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/hwlog/log_limiter.go @@ -0,0 +1,156 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package hwlog provides the capability of processing Huawei log rules. 
+package hwlog + +import ( + "fmt" + "sync" + "time" + + "ascend-common/common-utils/cache" +) + +const ( + // MaxCacheSize indicates the maximum log cache size + MaxCacheSize = 100 * 1024 + // MaxExpiredTime indicates the maximum log cache expired time + MaxExpiredTime = 60 * 60 + // DefaultCacheSize indicates the default log cache size + DefaultCacheSize = 10 * 1024 + // DefaultExpiredTime indicates the default log cache expired time + DefaultExpiredTime = 1 + cutPreLen = 46 + // ProblemOccurMaxNumbers indicates the maximum number of times that the same problem can occur + ProblemOccurMaxNumbers = 3 +) + +var ( + errorMap sync.Map +) + +// LogLimiter encapsulates Logs and provides the log traffic limiting capability +// to prevent too many duplicate logs. +type LogLimiter struct { + // Logs is a log rotate instance + Logs *Logs + logCache *cache.ConcurrencyLRUCache + logMu sync.Mutex + doOnce sync.Once + + logExpiredTime time.Duration + // CacheSize indicates the size of log cache + CacheSize int + // ExpiredTime indicates the expired time of log cache + ExpiredTime int +} + +// Write implements io.Writer. It encapsulates the Write method of Los and uses +// the lru cache to prevent duplicate log writing. +func (l *LogLimiter) Write(d []byte) (int, error) { + if l == nil { + return 0, fmt.Errorf("log limiter pointer does not exist") + } + + l.logMu.Lock() + defer l.logMu.Unlock() + + if l.ExpiredTime == 0 || l.CacheSize == 0 { + return l.Logs.Write(d) + } + + l.doOnce.Do(func() { + l.validateLimiterConf() + l.logCache = cache.New(l.CacheSize) + l.logExpiredTime = time.Duration(int64(l.ExpiredTime) * int64(time.Second)) + }) + + if l.logCache == nil { + l.logCache = cache.New(DefaultCacheSize) + } + if !l.logCache.SetIfNX(string(d[cutPreLen:]), "v", l.logExpiredTime) { + return 0, nil + } + + return l.Logs.Write(d) +} + +// Close implements io.Closer. It encapsulates the Close method of Logs. 
+func (l *LogLimiter) Close() error { + if l == nil { + return fmt.Errorf("log limiter pointer does not exist") + } + + l.logMu.Lock() + defer l.logMu.Unlock() + + return l.Logs.Close() +} + +// Flush encapsulates the Flush method of Logs. +func (l *LogLimiter) Flush() error { + if l == nil { + return fmt.Errorf("log limiter pointer does not exist") + } + + l.logMu.Lock() + defer l.logMu.Unlock() + + return l.Logs.Flush() +} + +// validateLimiterConf verifies the external input parameters in the LogLimiter. +func (l *LogLimiter) validateLimiterConf() { + if l.CacheSize < 0 || l.CacheSize > MaxCacheSize { + l.CacheSize = DefaultCacheSize + } + if l.ExpiredTime < 0 || l.ExpiredTime > MaxExpiredTime { + l.ExpiredTime = DefaultExpiredTime + } +} + +func getKey(domain string, id interface{}) string { + return fmt.Sprintf("%d_%s", id, domain) +} + +// IsNeedPrintWithSpecifiedCounts check whether print the error message, +// if the error message (domain_id as a unique identifier) has been printed +// for problemOccurMaxNumbers times, return false +func IsNeedPrintWithSpecifiedCounts(domain string, id interface{}, problemOccurMaxNumbers int) (bool, string) { + key := getKey(domain, id) + cnt, _ := errorMap.LoadOrStore(key, 0) + intCnt, ok := cnt.(int) + extraErrLog := "" + if !ok { + // the counter type is abnormal, print by default + return true, extraErrLog + } + if intCnt >= problemOccurMaxNumbers { + return false, extraErrLog + } + intCnt += 1 + errorMap.Store(key, intCnt) + if intCnt == problemOccurMaxNumbers { + extraErrLog = fmt.Sprintf(".The error log has been printed for %v times "+ + "and will not be printed any more", problemOccurMaxNumbers) + } + return true, extraErrLog + +} + +// ResetErrCnt reset the error count +func ResetErrCnt(domain string, id interface{}) { + errorMap.Delete(getKey(domain, id)) +} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/logger.go b/mind-cluster/component/ascend-common/common-utils/hwlog/logger.go new file 
mode 100644 index 0000000..f659fbc --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/hwlog/logger.go @@ -0,0 +1,242 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package hwlog provides the capability of processing Huawei log rules. +package hwlog + +import ( + "errors" + "fmt" + "os" + "path" + "regexp" + "strings" + + "github.com/fsnotify/fsnotify" + + "ascend-common/common-utils/utils" +) + +const ( + // DefaultFileMaxSize the default maximum size of a single log file is 20 MB + DefaultFileMaxSize = 20 + // DefaultMinSaveAge the minimum storage duration of backup logs is 7 days + DefaultMinSaveAge = 7 + // DefaultMaxSaveAge the maximum storage duration of backup logs is 700 days + DefaultMaxSaveAge = 700 + // DefaultMaxBackups the default number of backup log + DefaultMaxBackups = 30 + // LogFileMode log file mode + LogFileMode os.FileMode = 0640 + // BackupLogFileMode backup log file mode + BackupLogFileMode os.FileMode = 0400 + // LogDirMode log dir mode + LogDirMode = 0750 + backUpLogRegex = `^.+-[0-9]{4}-[0-9]{2}-[0-9T]{5}-[0-9]{2}-[0-9]{2}\.[0-9]{2,4}` + bitsize = 64 + stackDeep = 3 + pathLen = 2 + minLogLevel = -1 + maxLogLevel = 3 + maxEachLineLen = 1048576 + defaultMaxEachLineLen = 256 +) + +// LogConfig log module config +type LogConfig struct { + // log file path + LogFileName string + // only write to std out, default value: false + OnlyToStdout bool + // only 
write to file, default value: false + OnlyToFile bool + // log level, -1-debug, 0-info, 1-warning, 2-error 3-critical default value: 0 + LogLevel int + // size of a single log file (MB), default value: 20MB + FileMaxSize int + // MaxLineLength Max length of each log line, default value: 256 + MaxLineLength int + // maximum number of backup log files, default value: 30 + MaxBackups int + // maximum number of days for backup log files, default value: 7 + MaxAge int + // whether backup files need to be compressed, default value: false + IsCompress bool + // expiration time for log cache, default value: 1s + ExpiredTime int + // Size of log cache space, default: 10240 + CacheSize int +} + +var reg = regexp.MustCompile(backUpLogRegex) + +type validateFunc func(config *LogConfig) error + +func checkDir(fileDir string) error { + if !utils.IsExist(fileDir) { + if err := os.MkdirAll(fileDir, LogDirMode); err != nil { + return fmt.Errorf("create dirs failed") + } + return nil + } + if err := os.Chmod(fileDir, LogDirMode); err != nil { + return fmt.Errorf("change log dir mode failed") + } + return nil +} + +func createFile(filePath string) error { + fileName := path.Base(filePath) + if !utils.IsExist(filePath) { + f, err := os.OpenFile(filePath, os.O_RDWR|os.O_CREATE|os.O_TRUNC, LogFileMode) + if err != nil { + return fmt.Errorf("create file(%s) failed", fileName) + } + defer func() { + if err := f.Close(); err != nil { + fmt.Printf("close file failed: %v\n", err) + return + } + }() + } + return nil +} + +func checkAndCreateLogFile(filePath string) error { + if !utils.IsFile(filePath) { + return fmt.Errorf("config path is not file") + } + fileDir := path.Dir(filePath) + if err := checkDir(fileDir); err != nil { + return err + } + if err := createFile(filePath); err != nil { + return err + } + return nil +} + +func validateLogConfigFileMaxSize(config *LogConfig) error { + if config.FileMaxSize == 0 { + config.FileMaxSize = DefaultFileMaxSize + return nil + } + if 
config.FileMaxSize < 0 || config.FileMaxSize > DefaultFileMaxSize { + return fmt.Errorf("the size of a single log file range is (0, 20] MB") + } + + return nil +} + +func validateLogConfigBackups(config *LogConfig) error { + if config.MaxBackups <= 0 || config.MaxBackups > DefaultMaxBackups { + return fmt.Errorf("the number of backup log file range is (0, 30]") + } + return nil +} + +func validateLogConfigMaxAge(config *LogConfig) error { + fmt.Printf("MaxAge %s", config.MaxAge) + if config.MaxAge < DefaultMinSaveAge || config.MaxAge > DefaultMaxSaveAge { + return fmt.Errorf("the maxage of backup logs range is [7,700]") + } + return nil +} + +func validateLogLevel(config *LogConfig) error { + if config.LogLevel < minLogLevel || config.LogLevel > maxLogLevel { + return fmt.Errorf("the log level range should be [-1, 3]") + } + return nil +} + +func validateMaxLineLength(config *LogConfig) error { + if config.MaxLineLength == 0 { + config.MaxLineLength = defaultMaxEachLineLen + return nil + } + if config.MaxLineLength < 0 || config.MaxLineLength > maxEachLineLen { + return fmt.Errorf("the max length of each log line should be in the range (0, 1048576]") + } + return nil +} + +func getValidateFuncList() []validateFunc { + var funcList []validateFunc + funcList = append(funcList, validateLogConfigFileMaxSize, validateLogConfigBackups, validateMaxLineLength, + validateLogConfigMaxAge, validateLogLevel, validateLogConfigLimiter) + return funcList +} + +func validateLogConfigFiled(config *LogConfig) error { + if config.OnlyToStdout { + return nil + } + if _, err := utils.CheckPath(config.LogFileName); err != nil && err != os.ErrNotExist { + return fmt.Errorf("config log path is not absolute path: %v", err) + } + if strings.Contains(config.LogFileName, "..") || strings.Contains(config.LogFileName, "./") { + return errors.New("log path include invalid char") + } + + if err := checkAndCreateLogFile(config.LogFileName); err != nil { + return err + } + validateFuncList := 
getValidateFuncList() + for _, vaFunc := range validateFuncList { + if err := vaFunc(config); err != nil { + return err + } + } + + return nil +} + +func validateLogConfigLimiter(config *LogConfig) error { + if config.ExpiredTime < 0 || config.ExpiredTime > MaxExpiredTime { + return fmt.Errorf("the expired time of log cache range is [0, 3600], the value 0 disables the limiter") + } + if config.CacheSize < 0 || config.CacheSize > MaxCacheSize { + return fmt.Errorf("the size of log cache range is [0, 102400], the value 0 disables the limiter") + } + return nil +} + +func changeFileMode(l *logger, event fsnotify.Event, logFileFullPath string) { + if l == nil { + fmt.Println("changeFileMode logger is nil") + return + } + var logMode = LogFileMode + logPath := path.Dir(logFileFullPath) + changedFileName := path.Base(event.Name) + if isTargetLog(changedFileName) { + logMode = BackupLogFileMode + } + changedLogFilePath := path.Join(logPath, changedFileName) + if !utils.IsExist(changedLogFilePath) { + return + } + fPath, err := utils.CheckPath(changedLogFilePath) + if err != nil { + l.Errorf("wrong file path: %v", err) + return + } + if errChmod := os.Chmod(fPath, logMode); errChmod != nil { + l.Errorf("set file mode failed, filename: %s", changedFileName) + } +} +func isTargetLog(fileName string) bool { + return reg.MatchString(fileName) +} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/logger_test.go b/mind-cluster/component/ascend-common/common-utils/hwlog/logger_test.go new file mode 100644 index 0000000..f91b663 --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/hwlog/logger_test.go @@ -0,0 +1,217 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package hwlog test file +package hwlog + +import ( + "errors" + "io/fs" + "os" + "testing" + + "github.com/agiledragon/gomonkey/v2" + "github.com/fsnotify/fsnotify" + "github.com/smartystreets/goconvey/convey" + + "ascend-common/common-utils/utils" +) + +func TestCheckDir(t *testing.T) { + convey.Convey("test logger", t, func() { + convey.Convey("test check dir func", func() { + mockStat := gomonkey.ApplyFunc(os.Stat, func(_ string) (fs.FileInfo, error) { + return nil, os.ErrNotExist + }) + mockMkDir := gomonkey.ApplyFunc(os.MkdirAll, func(_ string, _ fs.FileMode) error { + return nil + }) + defer mockStat.Reset() + defer mockMkDir.Reset() + err := checkDir("log") + convey.So(err, convey.ShouldBeNil) + }) + }) +} + +func TestCreateFile(t *testing.T) { + convey.Convey("test logger", t, func() { + convey.Convey("test create file", func() { + mockExist := gomonkey.ApplyFunc(utils.IsExist, func(_ string) bool { + return false + }) + mockCreate := gomonkey.ApplyFunc(os.Create, func(_ string) (*os.File, error) { + return nil, nil + }) + defer mockExist.Reset() + defer mockCreate.Reset() + err := createFile("log") + convey.So(err, convey.ShouldBeNil) + }) + }) +} + +func TestCheckAndCreateLogFile(t *testing.T) { + convey.Convey("test logger", t, func() { + convey.Convey("test checkAndCreateLogFile func", func() { + mockCreate := gomonkey.ApplyFunc(createFile, func(_ string) error { + return nil + }) + defer mockCreate.Reset() + err := checkAndCreateLogFile("log") + convey.So(err, convey.ShouldBeNil) + }) + }) +} + +func TestValidateLogConfigFileMaxSize(t 
*testing.T) { + convey.Convey("test logger", t, func() { + convey.Convey("test validate max size func", func() { + conf := &LogConfig{} + err := validateLogConfigFileMaxSize(conf) + convey.So(err, convey.ShouldBeNil) + convey.So(conf.FileMaxSize, convey.ShouldEqual, DefaultFileMaxSize) + conf.FileMaxSize = -1 + err = validateLogConfigFileMaxSize(conf) + convey.So(err, convey.ShouldBeError) + conf.FileMaxSize = DefaultFileMaxSize + 1 + err = validateLogConfigFileMaxSize(conf) + convey.So(err, convey.ShouldBeError) + }) + }) +} + +func TestValidateLogConfigBackups(t *testing.T) { + convey.Convey("test logger", t, func() { + convey.Convey("test validate backups func", func() { + conf := &LogConfig{MaxBackups: DefaultMaxBackups} + err := validateLogConfigBackups(conf) + convey.So(err, convey.ShouldBeNil) + conf.MaxBackups = 0 + err = validateLogConfigBackups(conf) + convey.So(err, convey.ShouldBeError) + conf.FileMaxSize = DefaultMaxBackups + 1 + err = validateLogConfigBackups(conf) + convey.So(err, convey.ShouldBeError) + }) + }) +} + +func TestValidateLogConfigMaxAge(t *testing.T) { + convey.Convey("test logger", t, func() { + convey.Convey("test validate max age func", func() { + conf := &LogConfig{MaxAge: DefaultMinSaveAge} + err := validateLogConfigMaxAge(conf) + convey.So(err, convey.ShouldBeNil) + conf.MaxAge = 0 + err = validateLogConfigMaxAge(conf) + convey.So(err, convey.ShouldBeError) + }) + }) +} + +func TestValidateLogLevel(t *testing.T) { + convey.Convey("test logger", t, func() { + convey.Convey("test validate log level func", func() { + conf := &LogConfig{} + err := validateLogLevel(conf) + convey.So(err, convey.ShouldBeNil) + conf.LogLevel = minLogLevel - 1 + err = validateLogLevel(conf) + convey.So(err, convey.ShouldBeError) + conf.LogLevel = maxLogLevel + 1 + err = validateLogLevel(conf) + convey.So(err, convey.ShouldBeError) + }) + }) +} + +func TestValidateMaxLineLength(t *testing.T) { + convey.Convey("test logger", t, func() { + 
convey.Convey("test validate max line length func", func() { + conf := &LogConfig{} + err := validateMaxLineLength(conf) + convey.So(err, convey.ShouldBeNil) + convey.So(conf.MaxLineLength, convey.ShouldEqual, defaultMaxEachLineLen) + conf.MaxLineLength = -1 + err = validateMaxLineLength(conf) + convey.So(err, convey.ShouldNotBeNil) + conf.MaxLineLength = maxEachLineLen + 1 + err = validateMaxLineLength(conf) + convey.So(err, convey.ShouldNotBeNil) + }) + }) +} + +func TestValidateLogConfigFiled(t *testing.T) { + convey.Convey("test logger", t, func() { + convey.Convey("test validate config filed func", func() { + mockCheckPath := gomonkey.ApplyFunc(utils.CheckPath, func(_ string) (string, error) { + return "", nil + }) + mockCheckAndCreate := gomonkey.ApplyFunc(checkAndCreateLogFile, func(_ string) error { + return nil + }) + defer mockCheckPath.Reset() + defer mockCheckAndCreate.Reset() + conf := &LogConfig{ + MaxBackups: DefaultMaxBackups, + MaxAge: DefaultMinSaveAge, + CacheSize: DefaultCacheSize, + ExpiredTime: DefaultExpiredTime, + } + err := validateLogConfigFiled(conf) + convey.So(err, convey.ShouldBeNil) + }) + convey.Convey("test validate config filed func, log file is relative path", func() { + mockCheckPath := gomonkey.ApplyFunc(utils.CheckPath, func(_ string) (string, error) { + return "", nil + }) + mockCheckAndCreate := gomonkey.ApplyFunc(checkAndCreateLogFile, func(_ string) error { + return nil + }) + defer mockCheckPath.Reset() + defer mockCheckAndCreate.Reset() + conf := &LogConfig{ + MaxBackups: DefaultMaxBackups, + MaxAge: DefaultMinSaveAge, + CacheSize: DefaultCacheSize, + ExpiredTime: DefaultExpiredTime, + LogFileName: "../", + } + err := validateLogConfigFiled(conf) + expErr := errors.New("log path include invalid char") + convey.So(err, convey.ShouldResemble, expErr) + }) + }) +} + +func TestChangeFileMode(t *testing.T) { + convey.Convey("test logger", t, func() { + convey.Convey("test changeFileMode func", func() { + changeFileMode(nil, 
fsnotify.Event{}, "log") + mockExist := gomonkey.ApplyFunc(utils.IsExist, func(_ string) bool { + return true + }) + mockChmod := gomonkey.ApplyFunc(os.Chmod, func(_ string, _ fs.FileMode) error { + return nil + }) + defer mockExist.Reset() + defer mockChmod.Reset() + lg := new(logger) + evt := fsnotify.Event{Name: "run-2022-01-01T00-00-00.123.log"} + changeFileMode(lg, evt, "log") + }) + }) +} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/rolog.go b/mind-cluster/component/ascend-common/common-utils/hwlog/rolog.go new file mode 100644 index 0000000..cc07bb2 --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/hwlog/rolog.go @@ -0,0 +1,447 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package hwlog provides the capability of processing Huawei log rules. +package hwlog + +import ( + "errors" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "sort" + "strings" + "sync" + "time" +) + +const ( + oneDaySeconds = 24 * 60 * 60 + defaultCapacity = 20 + timeFormat = "2006-01-02T15-04-05.000" + kilobytes = 1024 + defaultDirPermission = 0750 + defaultFilePermission = 0600 + defaultBackupPermission = 0400 + maxCapacity = 20 + minSaveVolume = 1 + maxSaveVolume = 30 + maxSaveTime = 700 + minSaveTime = 7 +) + +// Logs is an io.WriteCloser. +type Logs struct { + file *os.File + mutex sync.Mutex + rmOnce sync.Once + + // FileName is the file where logs are written. 
+ FileName string `json:"filename" yaml:"filename"` + + // Capacity is the maximum size, in megabytes, of the log file before + // it is rotated; values outside 1-19 fall back to 20 megabytes. + Capacity int `json:"capacity" yaml:"capacity"` + + // SaveTime is the maximum number of days for retaining old log + // files. It calculates the retention time based on the timestamp + // of the old log file name and the current time. + SaveTime int `json:"savetime" yaml:"savetime"` + + // SaveVolume is the maximum number of old log files that can be + // retained. It saves all old files by default. + SaveVolume int `json:"savevolume" yaml:"savevolume"` + + // LocalOrUTC selects the timestamp used in backup file names: true + // uses the computer's local time, false (the default) uses UTC. + LocalOrUTC bool `json:"localorutc" yaml:"localorutc"` + + length int64 + rmCh chan bool +} + +// logFile is a struct that is used to return filename and +// timestamp. +type logFile struct { + fileInfo os.FileInfo + timeStamp time.Time +} + +var ( + // mByte is used to convert capacity into bytes. + mByte = kilobytes * kilobytes +) + +// Write implements io.Writer. If a write would not cause the size of +// the log file to exceed Capacity, the log file is written normally. +// If a write would cause the size of the log file to exceed Capacity, +// but the write length is less than Capacity, the log file is closed, +// renamed to include a timestamp of the current time, and a new log +// is created using the original log file name. If the length of a write +// is greater than the Capacity, an error is returned. 
+func (l *Logs) Write(d []byte) (int, error) { + if l == nil { + return 0, fmt.Errorf("logs pointer does not exist") + } + + l.mutex.Lock() + defer l.mutex.Unlock() + + writeLenth := int64(len(d)) + if writeLenth > l.maxLenth() { + return 0, fmt.Errorf("the write lenth %d is greater than the maximum file size %d", + writeLenth, l.maxLenth(), + ) + } + + if l.file == nil { + if err := l.openOrCreateFile(writeLenth); err != nil { + return 0, err + } + } + fileInfo, err := l.file.Stat() + if err != nil { + return 0, err + } + l.length = fileInfo.Size() + if writeLenth+l.length > l.maxLenth() { + if err := l.roll(); err != nil { + return 0, err + } + } + + n, err := l.file.Write(d) + if err != nil { + return 0, err + } + l.length += int64(n) + return n, err +} + +// Roll causes Logs to close the existing log file and create a new log +// file immediately. The purpose of this function is to provide rotation +// outside the normal rotation rule, e.g. in response to SIGHUP. After +// rotation, the deletion of the old log files is initiated. +func (l *Logs) Roll() error { + if l == nil { + return fmt.Errorf("logs pointer does not exist") + } + + l.mutex.Lock() + defer l.mutex.Unlock() + return l.roll() +} + +// Close implements io.Closer. It closses the current log file. +func (l *Logs) Close() error { + if l == nil { + return fmt.Errorf("logs pointer does not exist") + } + + l.mutex.Lock() + defer l.mutex.Unlock() + + return l.close() +} + +// Flush persist the contents of the current memory. +func (l *Logs) Flush() error { + if l == nil { + return fmt.Errorf("logs pointer does not exist") + } + + l.mutex.Lock() + defer l.mutex.Unlock() + if l.file == nil { + return nil + } + return l.file.Sync() +} + +// maxLenth return the number of bytes of the maximum log size +// before rotating. 
+func (l *Logs) maxLenth() int64 { + if l.Capacity > 0 && l.Capacity < maxCapacity { + return int64(l.Capacity) * int64(mByte) + } + return int64(defaultCapacity * mByte) +} + +// fileName return the name of the log file. +func (l *Logs) fileName() string { + if l.FileName != "" { + return l.FileName + } + logName := filepath.Base(os.Args[0]) + "-mindx-dl.log" + return filepath.Join(os.TempDir(), logName) +} + +// openOrCreateFile opens the log file if it exists and the +// current write would not exceed the Capacity. It will create +// a new file if there is no such file or the write would exceed +// the Capacity. +func (l *Logs) openOrCreateFile(writeLen int64) error { + l.remove() + + name := l.fileName() + message, err := os.Stat(name) + if os.IsNotExist(err) { + return l.create() + } + + if err != nil { + return fmt.Errorf("failed to get log file message: %v", err) + } + + if writeLen+message.Size() >= l.maxLenth() { + return l.roll() + } + + f, err := os.OpenFile(name, os.O_APPEND|os.O_WRONLY, defaultFilePermission) + if err != nil { + return l.create() + } + l.file = f + l.length = message.Size() + return nil +} + +// create creates a new log file for writing, and backs up the +// old log file. The file is closed when this method is invoked +// by default. 
+func (l *Logs) create() error { + if err := os.MkdirAll(l.getDir(), defaultDirPermission); err != nil { + return fmt.Errorf("unable to create directory for new log file: %v", err) + } + + fileName, fileMode := l.fileName(), os.FileMode(defaultFilePermission) + if message, err := os.Stat(fileName); err == nil { + fileMode = message.Mode() + backupName := l.backup() + if err := os.Rename(fileName, backupName); err != nil { + return fmt.Errorf("failed to rename the log file: %v", err) + } + if err := os.Chmod(backupName, defaultBackupPermission); err != nil { + return fmt.Errorf("failed to change backup log file permission: %v", err) + } + } + newFile, err := os.OpenFile(fileName, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, fileMode) + if err != nil { + return fmt.Errorf("unable to open new log file: %v", err) + } + l.length, l.file = 0, newFile + return nil +} + +// backup generates a backup file name based on the original file +// name and inserts a timestamp between the file name and extension. +// The timestamp uses the UTC time by default. +func (l *Logs) backup() string { + prefix, extension := l.getPreAndExt() + return filepath.Join(l.getDir(), fmt.Sprintf("%s%s%s", prefix, l.getTimestamp(), extension)) +} + +// getDir returns the directory for the current filename. +func (l *Logs) getDir() string { + return filepath.Dir(l.fileName()) +} + +// getPreAndExt returns the prefix name and extension name +// from Logs's filename. +func (l *Logs) getPreAndExt() (string, string) { + name := filepath.Base(l.fileName()) + extension := filepath.Ext(name) + prefix := name[:len(name)-len(extension)] + "-" + return prefix, extension +} + +// getTimestamp returns the timestamp of current time, and +// uses UTC time by default. +func (l *Logs) getTimestamp() string { + t := time.Now() + if !l.LocalOrUTC { + t = t.UTC() + } + return t.Format(timeFormat) +} + +// roll rotates the log file, close the existing log file and +// create a new one immediately. 
After rotating, this method +// deletes the old log files according to the configuration. +func (l *Logs) roll() error { + if err := l.close(); err != nil { + return err + } + if err := l.create(); err != nil { + return err + } + l.remove() + return nil +} + +// close closes the file if it is open. +func (l *Logs) close() error { + if l.file == nil { + return nil + } + err := l.file.Sync() + if err != nil { + return err + } + err = l.file.Close() + l.file = nil + return err +} + +// remove delete outdated log files, starting the remove +// goroutine if necessary. +func (l *Logs) remove() { + l.rmOnce.Do(func() { + l.rmCh = make(chan bool, 1) + go l.removeRun() + }) + select { + case l.rmCh <- true: + default: + } +} + +// removeRun manages the deletion of the old log files after +// rotating, which runs in a goroutine. +func (l *Logs) removeRun() { + for range l.rmCh { + if err := l.removeRunOnce(); err != nil { + fmt.Println("failed to remove runonce: ", err) + } + } +} + +// removeRunOnce performs removal of outdated log files. +// Old log files are removed if the number of old files +// exceed the Capacity or the retention time of old files +// is greater than SaveTime. 
+func (l *Logs) removeRunOnce() error { + if l.SaveVolume == 0 && l.SaveTime == 0 { + return nil + } + + if err := checkParam(l.SaveVolume, l.SaveTime); err != nil { + return err + } + + oldFiles, err := l.oldFilesList() + if err != nil { + return err + } + + var removeFiles []logFile + if l.SaveTime > 0 { + delTime := time.Now().Unix() - int64(l.SaveTime)*oneDaySeconds + var remainingFiles []logFile + for _, f := range oldFiles { + if f.timeStamp.Unix() <= delTime { + removeFiles = append(removeFiles, f) + continue + } + remainingFiles = append(remainingFiles, f) + } + oldFiles = remainingFiles + } + + if l.SaveVolume > 0 && l.SaveVolume < len(oldFiles) { + saved := make(map[string]struct{}, len(oldFiles)) + var remainingFiles []logFile + for _, f := range oldFiles { + saved[f.fileInfo.Name()] = struct{}{} + if l.SaveVolume >= len(saved) { + remainingFiles = append(remainingFiles, f) + continue + } + removeFiles = append(removeFiles, f) + } + oldFiles = remainingFiles + } + + for _, f := range removeFiles { + rmError := os.Remove(filepath.Join(l.getDir(), f.fileInfo.Name())) + if rmError != nil { + err = rmError + } + } + return err +} + +// oldFilesList returns the list of backup log files sorted +// by ModTime. These backup log files are stored in the same +// directory as the current log file. 
+func (l *Logs) oldFilesList() ([]logFile, error) { + logFiles, err := ioutil.ReadDir(l.getDir()) + if err != nil { + return nil, fmt.Errorf("unable to open the log file directory: %v", err) + } + + prefix, extension := l.getPreAndExt() + + var oldFiles []logFile + + for _, file := range logFiles { + if file.IsDir() { + continue + } + if timeStamp, err := l.extractTime(file.Name(), prefix, extension); err == nil { + oldFiles = append(oldFiles, logFile{fileInfo: file, timeStamp: timeStamp}) + continue + } + } + sort.Slice(oldFiles, func(i, j int) bool { + if i < 0 || i > len(oldFiles) || j < 0 || j > len(oldFiles) { + return false + } + return oldFiles[i].timeStamp.After(oldFiles[j].timeStamp) + }) + + return oldFiles, nil +} + +// extractTime extracts the formatted time from file name by +// stripping the prefix and extension of the file name. This +// prevents fileName from being confused with time.parse. +func (l *Logs) extractTime(name, prefix, extension string) (time.Time, error) { + if !strings.HasSuffix(name, extension) { + return time.Time{}, errors.New("unmatched extension") + } + + if !strings.HasPrefix(name, prefix) { + return time.Time{}, errors.New("unmatched prefix") + } + + timeStamp := name[len(prefix) : len(name)-len(extension)] + return time.Parse(timeFormat, timeStamp) +} + +// checkParam checks whether the parameters are correct +func checkParam(volume int, time int) error { + if volume != 0 { + if volume < minSaveVolume || volume > maxSaveVolume { + return fmt.Errorf("the value of savevolume is incorrect") + } + } + if time != 0 { + if time < minSaveTime || time > maxSaveTime { + return fmt.Errorf("the value of savetime is incorrect") + } + } + return nil +} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/rolog_test.go b/mind-cluster/component/ascend-common/common-utils/hwlog/rolog_test.go new file mode 100644 index 0000000..67807bd --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/hwlog/rolog_test.go @@ 
-0,0 +1,687 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package hwlog provides the capability of processing Huawei log rules. +package hwlog + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "testing" + "time" + + "github.com/agiledragon/gomonkey/v2" + "github.com/smartystreets/goconvey/convey" +) + +const ( + testDirPermission = 0700 + testFilePermission = 0600 + testMByte = 1 + testCapacity = 10 + testCapacity2 = 100 + testCapacity3 = 5 + testSaveTime = 10 + testSaveTime2 = 7 + testSaveVolume = 3 + testSaveVolume2 = 1 + fileCountOne = 1 + fileCountTwo = 2 + fileCountFour = 4 + waitTime = 50 + oneDayHour = 24 + sevenDays = 7 + fourteenDays = 14 + twentyOneDays = 21 + testYear = 2014 + testMonth = 5 + testDay = 4 + testHour = 14 + testMin = 44 + testSec = 33 + testNsec = 555000000 +) + +// TestCreate for test the function of create log file +func TestCreate(t *testing.T) { + convey.Convey("TestCreate", t, func() { + dir := makeTempDir("TestCrate") + defer os.RemoveAll(dir) + l := &Logs{ + FileName: getLogFile(dir), + } + defer l.Close() + + input := []byte("foobarfoobar!") + fileWrite(input, l) + existWithContent(input, getLogFile(dir)) + fileCount(fileCountOne, dir) + }) +} + +// TestOpenFile for test the function of open log file +func TestOpenFile(t *testing.T) { + convey.Convey("TestOpenFile", t, func() { + dir := makeTempDir("TestOpenFile") + defer 
os.RemoveAll(dir) + fileName := getLogFile(dir) + data := []byte("foo!") + err := ioutil.WriteFile(fileName, data, testFilePermission) + convey.So(err, convey.ShouldBeNil) + existWithContent(data, fileName) + + l := &Logs{ + FileName: fileName, + } + defer l.Close() + + b := []byte("boo!") + fileWrite(b, l) + existWithContent(append(data, b...), fileName) + fileCount(fileCountOne, dir) + }) +} + +// TestWriteTooLong for test the processing of the overlong write error +func TestWriteTooLong(t *testing.T) { + convey.Convey("TestWriteTooLong", t, func() { + mByte = testMByte + dir := makeTempDir("TestWriteTooLong") + defer os.RemoveAll(dir) + + l := &Logs{ + FileName: getLogFile(dir), + Capacity: testCapacity3, + } + defer l.Close() + + b := []byte("barrrrrrrrrrrrrrrrr!") + n, err := l.Write(b) + convey.So(err, convey.ShouldNotBeNil) + convey.So(0, convey.ShouldEqual, n) + convey.So(err.Error(), convey.ShouldEqual, fmt.Sprintf( + "the write lenth %d is greater than the maximum file size %d", len(b), l.Capacity)) + _, err = os.Stat(getLogFile(dir)) + convey.So(err, shouldNotBeExist) + }) +} + +// TestMakeLogDir for test the function of make log file directory +func TestMakeLogDir(t *testing.T) { + convey.Convey("TestMakeLogDir", t, func() { + dir := time.Now().Format("TestMakeLogDir" + timeFormat) + dir = filepath.Join(os.TempDir(), dir) + defer os.RemoveAll(dir) + + fileName := getLogFile(dir) + l := &Logs{ + FileName: fileName, + } + defer l.Close() + + b := []byte("boo!") + fileWrite(b, l) + existWithContent(b, getLogFile(dir)) + fileCount(fileCountOne, dir) + }) +} + +// TestDefaultFileName for test default log file name +func TestDefaultFileName(t *testing.T) { + convey.Convey("TestDefaultFileName", t, func() { + dir := os.TempDir() + fileName := filepath.Join(dir, filepath.Base(os.Args[0])+"-mindx-dl.log") + defer os.Remove(fileName) + + l := &Logs{} + defer l.Close() + + b := []byte("boo!") + fileWrite(b, l) + existWithContent(b, fileName) + }) +} + +// 
TestAutoRoll for test the automatic log rolling +func TestAutoRoll(t *testing.T) { + convey.Convey("TestAutoRoll", t, func() { + mByte = testMByte + dir := makeTempDir("TestAutoRoll") + defer os.RemoveAll(dir) + currentTime := time.Now() + + fileName := getLogFile(dir) + l := &Logs{ + FileName: fileName, + Capacity: testCapacity, + } + defer l.Close() + + b := []byte("aoo!") + fileWrite(b, l) + existWithContent(b, fileName) + fileCount(fileCountOne, dir) + + patch1 := gomonkey.ApplyFunc(time.Now, func() time.Time { + time1 := currentTime + return time1.Add(time.Hour * oneDayHour * sevenDays) + }) + defer patch1.Reset() + + b2 := []byte("foooooo!") + fileWrite(b2, l) + existWithContent(b2, fileName) + existWithContent(b, getBackupFile(dir, time.Now())) + fileCount(fileCountTwo, dir) + }) +} + +// TestFirstWriteRoll for test the log rolling on first write +func TestFirstWriteRoll(t *testing.T) { + convey.Convey("TestFirstWriteRoll", t, func() { + mByte = testMByte + dir := makeTempDir("TestFirstWriteRoll") + defer os.RemoveAll(dir) + currentTime := time.Now() + + fileName := getLogFile(dir) + l := &Logs{ + FileName: fileName, + Capacity: testCapacity, + } + defer l.Close() + + start := []byte("boooooo!") + err := ioutil.WriteFile(fileName, start, testFilePermission) + convey.So(err, convey.ShouldBeNil) + patch1 := gomonkey.ApplyFunc(time.Now, func() time.Time { + time1 := currentTime + return time1.Add(time.Hour * oneDayHour * sevenDays) + }) + defer patch1.Reset() + + b := []byte("fooo!") + fileWrite(b, l) + existWithContent(b, fileName) + existWithContent(start, getBackupFile(dir, time.Now())) + fileCount(fileCountTwo, dir) + }) +} + +// TestSaveVolumeCase1 for test the deleting log files that exceed the volume +func TestSaveVolumeCase1(t *testing.T) { + convey.Convey("TestSaveVolumeCase1", t, func() { + mByte = testMByte + dir := makeTempDir("TestSaveVolumeCase1") + defer os.RemoveAll(dir) + currentTime := time.Now() + + fileName := getLogFile(dir) + l := &Logs{ + 
FileName: fileName, + Capacity: testCapacity, + SaveVolume: testSaveVolume2, + } + defer l.Close() + + b := []byte("boo!") + fileWrite(b, l) + existWithContent(b, fileName) + fileCount(fileCountOne, dir) + + patch1 := gomonkey.ApplyFunc(time.Now, func() time.Time { + time1 := currentTime + return time1.Add(time.Hour * oneDayHour * sevenDays) + }) + b2 := []byte("foooooo!") + fileWrite(b2, l) + secondFileName := getBackupFile(dir, time.Now()) + existWithContent(b, secondFileName) + existWithContent(b2, fileName) + fileCount(fileCountTwo, dir) + + patch1.Reset() + patch2 := gomonkey.ApplyFunc(time.Now, func() time.Time { + time2 := currentTime + return time2.Add(time.Hour * oneDayHour * fourteenDays) + }) + defer patch2.Reset() + b3 := []byte("baaaaaar!") + fileWrite(b3, l) + thirdFileName := getBackupFile(dir, time.Now()) + existWithContent(b2, thirdFileName) + existWithContent(b3, fileName) + <-time.After(time.Millisecond * waitTime) + fileCount(fileCountTwo, dir) + existWithContent(b2, thirdFileName) + convey.So(secondFileName, shouldNotExist) + }) +} + +// TestSaveVolumeCase2 for test the deleting log files that exceed the volume when a non-log file exists +func TestSaveVolumeCase2(t *testing.T) { + convey.Convey("TestSaveVolumeCase2", t, func() { + mByte = testMByte + dir := makeTempDir("TestSaveVolumeCase2") + defer os.RemoveAll(dir) + currentTime := time.Now() + + fileName := getLogFile(dir) + l := &Logs{FileName: fileName, Capacity: testCapacity, SaveVolume: testSaveVolume2} + defer l.Close() + + b := []byte("boo!") + fileWrite(b, l) + patch1 := gomonkey.ApplyFunc(time.Now, func() time.Time { + time1 := currentTime + return time1.Add(time.Hour * oneDayHour * sevenDays) + }) + b2 := []byte("baaaaaar!") + fileWrite(b2, l) + secondFileName := getBackupFile(dir, time.Now()) + + patch1.Reset() + patch2 := gomonkey.ApplyFunc(time.Now, func() time.Time { + time2 := currentTime + return time2.Add(time.Hour * oneDayHour * fourteenDays) + }) + notLogFile := 
getLogFile(dir) + ".foo" + err := ioutil.WriteFile(notLogFile, []byte("data"), testFilePermission) + convey.So(err, convey.ShouldBeNil) + notLogFileDir := getBackupFile(dir, time.Now()) + err = os.Mkdir(notLogFileDir, testDirPermission) + convey.So(err, convey.ShouldBeNil) + + patch2.Reset() + patch3 := gomonkey.ApplyFunc(time.Now, func() time.Time { + time3 := currentTime + return time3.Add(time.Hour * oneDayHour * twentyOneDays) + }) + defer patch3.Reset() + thirdFileName := getBackupFile(dir, time.Now()) + b3 := []byte("baaaaaaz!") + fileWrite(b3, l) + existWithContent(b2, thirdFileName) + <-time.After(time.Millisecond * waitTime) + fileCount(fileCountFour, dir) + existWithContent(b3, fileName) + convey.So(secondFileName, shouldNotExist) + convey.So(notLogFile, shouldExist) + convey.So(notLogFileDir, shouldExist) + }) +} + +// TestCleanupExistingBackupFiles fot test the clearing the current backup log files +func TestCleanupExistingBackupFiles(t *testing.T) { + convey.Convey("TestCleanupExistingBackupFiles", t, func() { + mByte = testMByte + dir := makeTempDir("TestCleanupExistingBackupFiles") + defer os.RemoveAll(dir) + currentTime := time.Now() + + data := []byte("data") + backup := getBackupFile(dir, time.Now()) + err := ioutil.WriteFile(backup, data, testFilePermission) + convey.So(err, convey.ShouldBeNil) + + patch1 := gomonkey.ApplyFunc(time.Now, func() time.Time { + time1 := currentTime + return time1.Add(time.Hour * oneDayHour * sevenDays) + }) + backup = getBackupFile(dir, time.Now()) + err = ioutil.WriteFile(backup, data, testFilePermission) + convey.So(err, convey.ShouldBeNil) + fileName := getLogFile(dir) + err = ioutil.WriteFile(fileName, data, testFilePermission) + convey.So(err, convey.ShouldBeNil) + + l := &Logs{ + FileName: fileName, + Capacity: testCapacity, + SaveVolume: testSaveVolume2, + } + defer l.Close() + + patch1.Reset() + patch2 := gomonkey.ApplyFunc(time.Now, func() time.Time { + time2 := currentTime + return time2.Add(time.Hour * 
oneDayHour * fourteenDays) + }) + defer patch2.Reset() + b2 := []byte("foooooo!") + fileWrite(b2, l) + + <-time.After(time.Millisecond * waitTime) + + fileCount(fileCountTwo, dir) + }) +} + +// TestSaveTime for test the deleting log files that exceed the time +func TestSaveTime(t *testing.T) { + convey.Convey("TestSaveTime", t, func() { + mByte = testMByte + dir := makeTempDir("TestSaveTime") + defer os.RemoveAll(dir) + currentTime := time.Now() + + fileName := getLogFile(dir) + l := &Logs{ + FileName: fileName, + Capacity: testCapacity, + SaveTime: testSaveTime2, + } + defer l.Close() + + patch1 := gomonkey.ApplyFunc(time.Now, func() time.Time { + time1 := currentTime + return time1.Add(time.Hour * oneDayHour * sevenDays) + }) + b := []byte("zoo!") + fileWrite(b, l) + existWithContent(b, fileName) + fileCount(fileCountOne, dir) + + patch1.Reset() + patch2 := gomonkey.ApplyFunc(time.Now, func() time.Time { + time2 := currentTime + return time2.Add(time.Hour * oneDayHour * fourteenDays) + }) + b2 := []byte("foooooo!") + fileWrite(b2, l) + existWithContent(b, getBackupFile(dir, time.Now())) + + <-time.After(waitTime * time.Millisecond) + + fileCount(fileCountTwo, dir) + existWithContent(b2, fileName) + existWithContent(b, getBackupFile(dir, time.Now())) + + patch2.Reset() + patch3 := gomonkey.ApplyFunc(time.Now, func() time.Time { + time3 := currentTime + return time3.Add(time.Hour * oneDayHour * twentyOneDays) + }) + defer patch3.Reset() + b3 := []byte("baaaaar!") + fileWrite(b3, l) + existWithContent(b2, getBackupFile(dir, time.Now())) + + <-time.After(waitTime * time.Millisecond) + + fileCount(fileCountTwo, dir) + existWithContent(b3, fileName) + existWithContent(b2, getBackupFile(dir, time.Now())) + }) +} + +// TestOldLogFilesList for test the obtaining the list of old log files +func TestOldLogFilesList(t *testing.T) { + convey.Convey("TestOldLogFilesList", t, func() { + mByte = testMByte + dir := makeTempDir("TestOldLogFiles") + defer os.RemoveAll(dir) + 
currentTime := time.Now() + + fileName := getLogFile(dir) + data := []byte("data") + err := ioutil.WriteFile(fileName, data, testDirPermission) + convey.So(err, convey.ShouldBeNil) + t1, err := time.Parse(timeFormat, currentTime.UTC().Format(timeFormat)) + convey.So(err, convey.ShouldBeNil) + backup := getBackupFile(dir, currentTime) + err = ioutil.WriteFile(backup, data, testDirPermission) + convey.So(err, convey.ShouldBeNil) + + patch := gomonkey.ApplyFunc(time.Now, func() time.Time { + time1 := currentTime + return time1.Add(time.Hour * oneDayHour * sevenDays) + }) + defer patch.Reset() + t2, err := time.Parse(timeFormat, time.Now().UTC().Format(timeFormat)) + convey.So(err, convey.ShouldBeNil) + backup2 := getBackupFile(dir, time.Now()) + err = ioutil.WriteFile(backup2, data, testDirPermission) + convey.So(err, convey.ShouldBeNil) + + l := &Logs{FileName: fileName} + files, err := l.oldFilesList() + convey.So(err, convey.ShouldBeNil) + convey.So(fileCountTwo, convey.ShouldEqual, len(files)) + convey.So(t2, convey.ShouldEqual, files[0].timeStamp) + convey.So(t1, convey.ShouldEqual, files[1].timeStamp) + }) +} + +// TestExtractTime for test obtaining log file timestamp +func TestExtractTime(t *testing.T) { + convey.Convey("TestExtractTime", t, func() { + l := &Logs{FileName: "/var/log/myfoo/foo.log"} + prefix, extention := l.getPreAndExt() + + tests := []struct { + fileName string + want time.Time + wantErr bool + }{ + {"foo-2014-05-04T14-44-33.555.log", time.Date( + testYear, testMonth, testDay, testHour, testMin, testSec, testNsec, time.UTC), false}, + {"foo-2014-05-04T14-44-33.555", time.Time{}, true}, + {"2014-05-04T14-44-33.555.log", time.Time{}, true}, + {"foo.log", time.Time{}, true}, + } + + for _, test := range tests { + got, err := l.extractTime(test.fileName, prefix, extention) + convey.So(got, convey.ShouldEqual, test.want) + convey.So(err != nil, convey.ShouldEqual, test.wantErr) + } + }) +} + +// TestLocalTime for test the situation that current 
time is the local time +func TestLocalTime(t *testing.T) { + convey.Convey("TestLocalTime", t, func() { + mByte = testMByte + dir := makeTempDir("TestLocalTime") + defer os.RemoveAll(dir) + currentTime := time.Now() + + l := &Logs{ + FileName: getLogFile(dir), + Capacity: testCapacity, + LocalOrUTC: true, + } + defer l.Close() + + patch := gomonkey.ApplyFunc(time.Now, func() time.Time { + return currentTime + }) + defer patch.Reset() + b := []byte("boo!") + fileWrite(b, l) + + b2 := []byte("fooooooo!") + fileWrite(b2, l) + existWithContent(b2, getLogFile(dir)) + existWithContent(b, getBackupFileLocal(dir, currentTime)) + }) +} + +// TestRoll for test rolling +func TestRoll(t *testing.T) { + convey.Convey("TestRoll", t, func() { + dir := makeTempDir("TestRotate") + defer os.RemoveAll(dir) + currentTime := time.Now() + + fileName := getLogFile(dir) + l := &Logs{ + FileName: fileName, + SaveVolume: testSaveVolume2, + Capacity: testCapacity2, // megabytes + } + defer l.Close() + + patch1 := gomonkey.ApplyFunc(time.Now, func() time.Time { + time1 := currentTime + return time1.Add(time.Hour * oneDayHour * sevenDays) + }) + b := []byte("boo!") + fileWrite(b, l) + existWithContent(b, fileName) + fileCount(fileCountOne, dir) + + patch1.Reset() + patch2 := gomonkey.ApplyFunc(time.Now, func() time.Time { + time2 := currentTime + return time2.Add(time.Hour * oneDayHour * fourteenDays) + }) + err := l.Roll() + convey.So(err, convey.ShouldBeNil) + + <-time.After(waitTime * time.Millisecond) + + filename2 := getBackupFile(dir, time.Now()) + existWithContent(b, filename2) + existWithContent([]byte{}, fileName) + fileCount(fileCountTwo, dir) + + patch2.Reset() + patch3 := gomonkey.ApplyFunc(time.Now, func() time.Time { + time3 := currentTime + return time3.Add(time.Hour * oneDayHour * twentyOneDays) + }) + defer patch3.Reset() + err = l.Roll() + convey.So(err, convey.ShouldBeNil) + + <-time.After(waitTime * time.Millisecond) + + filename3 := getBackupFile(dir, time.Now()) + 
existWithContent([]byte{}, filename3) + existWithContent([]byte{}, fileName) + fileCount(fileCountTwo, dir) + + b2 := []byte("foooooo!") + fileWrite(b2, l) + existWithContent(b2, fileName) + }) +} + +// TestJson for test JSON conversion +func TestJson(t *testing.T) { + convey.Convey("TestJson", t, func() { + data := []byte(` + { + "filename": "foo", + "capacity": 10, + "savetime": 10, + "savevolume": 3, + "localorutc": true + }`[1:]) + + l := Logs{} + err := json.Unmarshal(data, &l) + convey.So(err, convey.ShouldBeNil) + convey.So("foo", convey.ShouldEqual, l.FileName) + convey.So(testCapacity, convey.ShouldEqual, l.Capacity) + convey.So(testSaveTime, convey.ShouldEqual, l.SaveTime) + convey.So(testSaveVolume, convey.ShouldEqual, l.SaveVolume) + convey.So(true, convey.ShouldEqual, l.LocalOrUTC) + }) +} + +// makeTempDir creates a file in the OS temp directory to keep parallel test +func makeTempDir(name string) string { + dir := time.Now().Format(name + timeFormat) + dir = filepath.Join(os.TempDir(), dir) + err := os.Mkdir(dir, testDirPermission) + convey.So(err, convey.ShouldBeNil) + return dir +} + +// existWithContent checks that the given file exists and has the correct content +func existWithContent(content []byte, dir string) { + info, err := os.Stat(dir) + convey.So(err, convey.ShouldBeNil) + convey.So(int64(len(content)), convey.ShouldEqual, info.Size()) + + b, err := ioutil.ReadFile(dir) + convey.So(err, convey.ShouldBeNil) + convey.So(content, convey.ShouldResemble, b) +} + +// getLogFile returns the log file name in the given directory for the current fake time +func getLogFile(dir string) string { + return filepath.Join(dir, "foobar.log") +} + +func getBackupFile(dir string, t time.Time) string { + return filepath.Join(dir, "foobar-"+t.UTC().Format(timeFormat)+".log") +} + +func getBackupFileLocal(dir string, t time.Time) string { + return filepath.Join(dir, "foobar-"+t.Format(timeFormat)+".log") +} + +// fileCount checks that the number of files in the 
directory is exp. +func fileCount(exp int, dir string) { + files, err := ioutil.ReadDir(dir) + convey.So(err, convey.ShouldBeNil) + convey.So(len(files), convey.ShouldEqual, exp) +} + +func fileWrite(b []byte, l *Logs) { + n, err := l.Write(b) + convey.So(err, convey.ShouldBeNil) + convey.So(len(b), convey.ShouldEqual, n) +} + +func shouldNotBeExist(actual interface{}, expected ...interface{}) string { + err, ok := actual.(error) + if !ok { + return "incorrect parameter type" + } + if os.IsNotExist(err) { + return "" + } + return "File exists, but should not have been created" +} +func shouldNotExist(actual interface{}, expected ...interface{}) string { + path, ok := actual.(string) + if !ok { + return "incorrect parameter type" + } + _, err := os.Stat(path) + if os.IsNotExist(err) { + return "" + } + return fmt.Sprintf("expected to get os.IsNotExist, but instead got %v", err) +} + +func shouldExist(actual interface{}, expected ...interface{}) string { + path, ok := actual.(string) + if !ok { + return "incorrect parameter type" + } + _, err := os.Stat(path) + if err != nil { + return fmt.Sprintf("expected file to exist, but got error from os.Stat: %v", err) + } + return "" +} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/types.go b/mind-cluster/component/ascend-common/common-utils/hwlog/types.go new file mode 100644 index 0000000..e97c80b --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/hwlog/types.go @@ -0,0 +1,49 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package hwlog provides the capability of processing Huawei log rules. +package hwlog + +import "errors" + +// ContextKey especially for context value +// to solve problem of "should not use basic type untyped string as key in context.WithValue" +type ContextKey string + +// String the implement of String method +func (c ContextKey) String() string { + return string(c) +} + +const ( + // UserID used for context value key of "ID" + UserID ContextKey = "UserID" + // ReqID used for context value key of "requestID" + ReqID ContextKey = "RequestID" + // extraDeepKey used for context value key of "extraDeepKey" + extraDeepKey ContextKey = "extraDeepKey" +) + +// SelfLogWriter used this to replace some opensource log +type SelfLogWriter struct { +} + +// Write implement the interface of io.writer +func (l *SelfLogWriter) Write(p []byte) (int, error) { + if RunLog == nil { + return -1, errors.New("hwlog is not initialized") + } + RunLog.Info(string(p)) + return len(p), nil +} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/utils.go b/mind-cluster/component/ascend-common/common-utils/hwlog/utils.go new file mode 100644 index 0000000..40955f4 --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/hwlog/utils.go @@ -0,0 +1,98 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package hwlog provides the capability of processing Huawei log rules. +package hwlog + +import ( + "bytes" + "context" + "fmt" + "log" + "runtime" + "strings" +) + +// printHelper helper function for log printing +func printHelper(lg *log.Logger, msg string, maxLogLength int, ctx ...context.Context) { + str := getCallerInfo(ctx...) + trimMsg := strings.Replace(msg, "\r", " ", -1) + trimMsg = strings.Replace(trimMsg, "\n", " ", -1) + runeArr := []rune(trimMsg) + if length := len(runeArr); length > maxLogLength { + trimMsg = string(runeArr[:maxLogLength]) + } + lg.Println(str + trimMsg) +} + +// getCallerInfo gets the caller's information +func getCallerInfo(ctx ...context.Context) string { + var deep = stackDeep + var userID interface{} + var traceID interface{} + for _, c := range ctx { + if c == nil { + deep++ + continue + } + userID = c.Value(UserID) + traceID = c.Value(ReqID) + if val := c.Value(extraDeepKey); val != nil { + currentVal, _ := val.(int) // security type assertions, invalid values are automatically zeroed + deep += currentVal + } + } + var funcName string + pc, codePath, codeLine, ok := runtime.Caller(deep) + if ok { + funcName = runtime.FuncForPC(pc).Name() + } + p := strings.Split(codePath, "/") + l := len(p) + if l == pathLen { + funcName = p[l-1] + } else if l > pathLen { + funcName = fmt.Sprintf("%s/%s", p[l-pathLen], p[l-1]) + } + callerPath := fmt.Sprintf("%s:%d", funcName, codeLine) + goroutineID := getGoroutineID() + str := fmt.Sprintf("%-8s%s ", goroutineID, callerPath) + if userID != nil || traceID != nil { + str = fmt.Sprintf("%s{%#v}-{%#v} ", str, userID, traceID) + } + return str +} + +// getCallerGoroutineID gets the goroutineID +func getGoroutineID() string { + b := make([]byte, bitsize, bitsize) + b = b[:runtime.Stack(b, false)] + b = bytes.TrimPrefix(b, []byte("goroutine ")) + b = b[:bytes.IndexByte(b, ' ')] + return string(b) +} + +// DeepIncrease increases the stack depth by 1 +func DeepIncrease(ctx context.Context) 
context.Context { + if ctx == nil { + return context.WithValue(context.Background(), extraDeepKey, 1) + } + + var currentVal int + if val := ctx.Value(extraDeepKey); val != nil { + currentVal, _ = val.(int) // security type assertions, invalid values are automatically zeroed + } + + return context.WithValue(ctx, extraDeepKey, currentVal+1) +} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/utils_test.go b/mind-cluster/component/ascend-common/common-utils/hwlog/utils_test.go new file mode 100644 index 0000000..ca2bda2 --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/hwlog/utils_test.go @@ -0,0 +1,38 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package hwlog test file +package hwlog + +import ( + "context" + "testing" + + "github.com/smartystreets/goconvey/convey" +) + +func TestUtilsFunc(t *testing.T) { + convey.Convey("test utils", t, func() { + convey.Convey("test utils func", func() { + lg := new(logger) + conf := &LogConfig{OnlyToStdout: true} + userCtx := context.TODO() + userCtx = context.WithValue(userCtx, UserID, 0) + userCtx = context.WithValue(userCtx, ReqID, 0) + err := lg.setLogger(conf) + convey.So(err, convey.ShouldBeNil) + printHelper(lg.lgInfo, "test", defaultMaxEachLineLen) + }) + }) +} diff --git a/mind-cluster/component/ascend-common/common-utils/limiter/limit_handler.go b/mind-cluster/component/ascend-common/common-utils/limiter/limit_handler.go new file mode 100644 index 0000000..fdab9a8 --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/limiter/limit_handler.go @@ -0,0 +1,226 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package limiter implement a token bucket limiter +package limiter + +import ( + "context" + "errors" + "fmt" + "math" + "net/http" + "regexp" + "strconv" + "strings" + "syscall" + "time" + + "ascend-common/common-utils/cache" + "ascend-common/common-utils/hwlog" + "ascend-common/common-utils/utils" +) + +const ( + kilo = 1000.0 + // DefaultDataLimit default http body limit size + DefaultDataLimit = 1024 * 1024 * 10 + defaultMaxConcurrency = 1024 + maxStringLen = 20 + // DefaultCacheSize default cache size + DefaultCacheSize = 1024 * 100 + arrLen = 2 + // IPReqLimitReg ip request limit regex string + IPReqLimitReg = "^[1-9]\\d{0,2}/[1-9]\\d{0,2}$" +) + +type limitHandler struct { + concurrency chan struct{} + httpHandler http.Handler + log bool + method string + limitBytes int64 + ipExpiredTime time.Duration + ipCache *cache.ConcurrencyLRUCache +} + +// HandlerConfig the configuration of the limitHandler +type HandlerConfig struct { + // PrintLog whether you need print access log, when use gin framework, suggest to set false,otherwise set true + PrintLog bool + // Method only allow setting http method pass + Method string + // LimitBytes set the max http body size + LimitBytes int64 + // TotalConCurrency set the program total concurrent http request + TotalConCurrency int + // IPConCurrency set the signle IP concurrent http request "2/1sec" + IPConCurrency string + // CacheSize the local cacheSize + CacheSize int +} + +// StatusResponseWriter the writer record the http status +type StatusResponseWriter struct { + http.ResponseWriter + http.Hijacker + Status int +} + +// WriteHeader override the WriteHeader method +func (w *StatusResponseWriter) WriteHeader(status int) { + w.ResponseWriter.WriteHeader(status) + w.Status = status +} + +// ServeHTTP implement http.Handler +func (h *limitHandler) ServeHTTP(w http.ResponseWriter, req *http.Request) { + req.Body = http.MaxBytesReader(w, req.Body, h.limitBytes) + ctx := initContext(req) + path := req.URL.Path + 
clientUserAgent := req.UserAgent() + clientIP := utils.ClientIP(req) + if clientIP != "" && h.ipCache != nil { + if !h.ipCache.SetIfNX(fmt.Sprintf("key-%s", clientIP), "v", h.ipExpiredTime) { + hwlog.RunLog.WarnfWithCtx(ctx, "Single IP request reject:%s: %s <%3d> |%15s |%s |%d ", req.Method, + path, http.StatusServiceUnavailable, clientIP, clientUserAgent, syscall.Getuid()) + http.Error(w, "503 too busy", http.StatusServiceUnavailable) + return + } + } + select { + case _, ok := <-h.concurrency: + if !ok { + // channel closed and no need return token + return + } + if h.method != "" && req.Method != h.method { + http.NotFound(w, req) + // recover token to the bucket + h.concurrency <- struct{}{} + return + } + hwlog.RunLog.Debugf("token count:%d", len(h.concurrency)) + start := time.Now() + statusRes := newResponse(w) + h.httpHandler.ServeHTTP(statusRes, req) + stop := time.Since(start) + h.concurrency <- struct{}{} + latency := int(math.Ceil(float64(stop.Nanoseconds()) / kilo / kilo)) + if h.log { + hwlog.RunLog.InfofWithCtx(ctx, "%s %s: %s <%3d> (%dms) |%15s |%s |%d", req.Proto, req.Method, path, + statusRes.Status, latency, clientIP, clientUserAgent, syscall.Getuid()) + } + default: + hwlog.RunLog.WarnfWithCtx(ctx, "Total reject request:%s: %s <%3d> |%15s |%s |%d ", req.Method, path, + http.StatusServiceUnavailable, clientIP, clientUserAgent, syscall.Getuid()) + http.Error(w, "503 too busy", http.StatusServiceUnavailable) + } +} + +func newResponse(w http.ResponseWriter) *StatusResponseWriter { + jk, ok := w.(http.Hijacker) + if !ok { + hwlog.RunLog.Warn("hijack not implement") + } + statusRes := &StatusResponseWriter{ + ResponseWriter: w, + Status: http.StatusOK, + Hijacker: jk, + } + return statusRes +} + +func initContext(req *http.Request) context.Context { + ctx := context.Background() + reqID := req.Header.Get(hwlog.ReqID.String()) + if reqID != "" { + ctx = context.WithValue(context.Background(), hwlog.ReqID, reqID) + } + id := 
req.Header.Get(hwlog.UserID.String()) + if id != "" { + ctx = context.WithValue(ctx, hwlog.UserID, id) + } + return ctx +} + +// NewLimitHandler new a bucket-token limiter +func NewLimitHandler(maxConcur, maxConcurrency int, handler http.Handler, printLog bool) (http.Handler, error) { + return NewLimitHandlerWithMethod(maxConcur, maxConcurrency, handler, printLog, "") +} + +// NewLimitHandlerWithMethod new a bucket-token limiter with specific http method +func NewLimitHandlerWithMethod(maxConcur, maxConcurrency int, handler http.Handler, printLog bool, + httpMethod string) (http.Handler, error) { + if maxConcur < 1 || maxConcur > maxConcurrency { + return nil, errors.New("maxConcurrency parameter error") + } + conchan := make(chan struct{}, maxConcur) + return createHandler(conchan, handler, printLog, httpMethod, DefaultDataLimit), nil +} + +func createHandler(ch chan struct{}, handler http.Handler, printLog bool, + httpMethod string, bodySizeLimit int64) *limitHandler { + h := &limitHandler{ + concurrency: ch, + httpHandler: handler, + log: printLog, + method: httpMethod, + limitBytes: bodySizeLimit, + ipExpiredTime: time.Duration(-1), + } + for i := 0; i < cap(ch); i++ { + h.concurrency <- struct{}{} + } + return h +} + +// NewLimitHandlerV2 new a bucket-token limiter which contains limit request by IP +func NewLimitHandlerV2(handler http.Handler, conf *HandlerConfig) (http.Handler, error) { + if conf == nil { + return nil, errors.New("parameter error") + } + if conf.TotalConCurrency < 1 || conf.TotalConCurrency > defaultMaxConcurrency { + return nil, errors.New("totalConCurrency parameter error") + } + if len(conf.Method) > maxStringLen { + return nil, errors.New("method parameter error") + } + if conf.CacheSize <= 0 { + hwlog.RunLog.Info("use default cache size") + conf.CacheSize = DefaultCacheSize + } + reg := regexp.MustCompile(IPReqLimitReg) + if !reg.Match([]byte(conf.IPConCurrency)) { + return nil, errors.New("IPConCurrency parameter error") + } + conchan 
:= make(chan struct{}, conf.TotalConCurrency) + h := createHandler(conchan, handler, conf.PrintLog, conf.Method, conf.LimitBytes) + arr := strings.Split(conf.IPConCurrency, "/") + if len(arr) != arrLen || arr[0] == "0" { + return nil, errors.New("IPConCurrency parameter error") + } + arr1, err := strconv.ParseInt(arr[1], 0, 0) + if err != nil { + return nil, fmt.Errorf("IPConCurrency parameter(%s) error, parse to int failed: %v", arr[1], err) + } + arr0, err := strconv.ParseInt(arr[0], 0, 0) + if err != nil || arr0 == 0 { + return nil, fmt.Errorf("IPConCurrency parameter(%s) error,parse to int failed: %v", arr[0], err) + } + h.ipExpiredTime = time.Duration(arr1 * int64(time.Second) / arr0) + h.ipCache = cache.New(DefaultCacheSize) + return h, nil + +} diff --git a/mind-cluster/component/ascend-common/common-utils/limiter/limit_handler_test.go b/mind-cluster/component/ascend-common/common-utils/limiter/limit_handler_test.go new file mode 100644 index 0000000..69dbb8e --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/limiter/limit_handler_test.go @@ -0,0 +1,119 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package limiter implement a token bucket limiter +package limiter + +import ( + "context" + "net/http" + "net/url" + "testing" + + "github.com/agiledragon/gomonkey/v2" + "github.com/smartystreets/goconvey/convey" + + "ascend-common/common-utils/hwlog" +) + +func init() { + config := hwlog.LogConfig{ + OnlyToStdout: true, + } + hwlog.InitRunLogger(&config, context.TODO()) +} +func TestServeHTTP(t *testing.T) { + convey.Convey("test limitHandler serveHTTP", t, func() { + h, w, r := initVarable() + convey.Convey("header contains reqID and userID,", func() { + mock := gomonkey.ApplyMethodFunc(h.httpHandler, "ServeHTTP", func(http.ResponseWriter, + *http.Request) { + return + }) + defer mock.Reset() + h.ServeHTTP(w.ResponseWriter, r) + convey.So(len(h.concurrency), convey.ShouldEqual, 1) + }) + convey.Convey("token channel close,", func() { + mock := gomonkey.ApplyFunc(http.Error, func(http.ResponseWriter, string, int) { + return + }) + defer mock.Reset() + _, ok := <-h.concurrency + if !ok { + return + } + h.ServeHTTP(w.ResponseWriter, r) + convey.So(len(h.concurrency), convey.ShouldEqual, 0) + }) + }) +} + +func initVarable() (*limitHandler, StatusResponseWriter, *http.Request) { + lh, err := NewLimitHandler(1, len2, http.DefaultServeMux, false) + if err != nil { + return nil, StatusResponseWriter{}, nil + } + v, ok := lh.(*limitHandler) + if !ok { + return nil, StatusResponseWriter{}, nil + } + w := StatusResponseWriter{ + ResponseWriter: nil, + Status: 0, + } + r := &http.Request{ + URL: &url.URL{ + Path: "test.com", + }, + Header: map[string][]string{"userID": {"1"}, "reqID": {"requestIDxxxx"}}, + Method: "GET", + } + return v, w, r +} + +func TestNewLimitHandlerV2(t *testing.T) { + conf := &HandlerConfig{ + PrintLog: false, + Method: "", + LimitBytes: DefaultDataLimit, + TotalConCurrency: defaultMaxConcurrency, + IPConCurrency: "2/1", + CacheSize: DefaultCacheSize, + } + convey.Convey("normal situation,no err return", t, func() { + _, err := 
NewLimitHandlerV2(http.DefaultServeMux, conf) + convey.So(err, convey.ShouldEqual, nil) + }) + convey.Convey("IPConCurrency parameter error", t, func() { + conf.IPConCurrency = "2021/1" + _, err := NewLimitHandlerV2(http.DefaultServeMux, conf) + convey.So(err, convey.ShouldNotEqual, nil) + }) + convey.Convey("cacheSize parameter error", t, func() { + conf.CacheSize = 0 + _, err := NewLimitHandlerV2(http.DefaultServeMux, conf) + convey.So(err, convey.ShouldNotEqual, nil) + }) + convey.Convey("method parameter error", t, func() { + conf.Method = "20/iajsdkjas2jhjdklsjkldjsdfasd1" + _, err := NewLimitHandlerV2(http.DefaultServeMux, conf) + convey.So(err, convey.ShouldNotEqual, nil) + }) + convey.Convey("TotalConCurrency parameter error", t, func() { + conf.TotalConCurrency = 0 + _, err := NewLimitHandlerV2(http.DefaultServeMux, conf) + convey.So(err, convey.ShouldNotEqual, nil) + }) +} diff --git a/mind-cluster/component/ascend-common/common-utils/limiter/limit_listener.go b/mind-cluster/component/ascend-common/common-utils/limiter/limit_listener.go new file mode 100644 index 0000000..b81d511 --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/limiter/limit_listener.go @@ -0,0 +1,161 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package limiter implement a token bucket limit listener, refer to "golang.org/x/net/netutil" and +// change the acquire method, if acquire failed, return false immediately +package limiter + +import ( + "errors" + "fmt" + "net" + "strings" + "sync" + "time" + + "ascend-common/common-utils/cache" + "ascend-common/common-utils/hwlog" +) + +const ( + maxConnection = 1024 + maxIPConnection = 512 + + largeMaxConnection = 16384 +) + +func commonLimitListener(l net.Listener, totalConnLimit, IPConnLimit, cacheSize int) (net.Listener, error) { + if IPConnLimit < 0 || IPConnLimit > maxIPConnection { + return nil, errors.New("the parameter IPConnLimit is illegal") + } + bucket := make(chan struct{}, totalConnLimit) + ll := &localLimitListener{ + Listener: l, + buckets: bucket, + ipConnLimit: int64(IPConnLimit), + } + if cacheSize > 0 { + ll.ipCache = cache.New(cacheSize) + } + return ll, nil +} + +// LimitListener returns a Listener that accepts at most n connections at the same time +func LimitListener(l net.Listener, totalConnLimit, IPConnLimit, cacheSize int) (net.Listener, error) { + if totalConnLimit < 0 || totalConnLimit > maxConnection { + return nil, errors.New("the parameter totalConnLimit is illegal") + } + return commonLimitListener(l, totalConnLimit, IPConnLimit, cacheSize) +} + +type localLimitListener struct { + net.Listener + buckets chan struct{} + closeOnce sync.Once + ipCache *cache.ConcurrencyLRUCache + ipConnLimit int64 +} + +// acquire acquires the limiting semaphore. 
Returns true if successfully +// accquired, false if the listener is closed or reach the max limit +func (l *localLimitListener) acquire() bool { + select { + case l.buckets <- struct{}{}: + return true + default: + return false + } +} +func (l *localLimitListener) release() { <-l.buckets } + +// Accept implement net.Listener interface +func (l *localLimitListener) Accept() (net.Conn, error) { + c, err := l.Listener.Accept() + if err != nil { + return nil, err + } + // ip connection limit + ip, cacheKey := getIpAndKey(c) + if ip != "" && l.ipCache != nil { + if counts, err := l.ipCache.INCR(cacheKey, -1); err == nil && counts > l.ipConnLimit { + hwlog.RunLog.Warn("ip connections reach max limit, connection will to force closed") + return closeImmediately(c, l.ipCache), nil + } + } + // total tcp connection limit + if l.acquire() { + return &limitListenerConn{Conn: c, release: l.release, ipCache: l.ipCache}, nil + } + hwlog.RunLog.Warn("limit forbidden, connection will to force closed") + return closeImmediately(c, l.ipCache), nil + +} + +func getIpAndKey(c net.Conn) (string, string) { + ipWithPort := c.RemoteAddr().String() + if ipWithPort != "" { + s := strings.Split(ipWithPort, ":") + return s[0], fmt.Sprintf("key-conn-%s", s[0]) + } + return "", "" +} + +func closeImmediately(c net.Conn, lruCache *cache.ConcurrencyLRUCache) net.Conn { + // once the connection reach the max limit, force close the connection + tcpConn, ok := c.(*net.TCPConn) + if ok { + if err := tcpConn.SetLinger(0); err != nil { + hwlog.RunLog.Warnf("Error when setting linger: %s", err) + } + } + + err := c.Close() + if err != nil { + hwlog.RunLog.Warn(err) + } + return &limitListenerConn{Conn: c, release: func() {}, ipCache: lruCache} +} + +// Close implement net.Listener interface +func (l *localLimitListener) Close() error { + err := l.Listener.Close() + l.closeOnce.Do(func() { close(l.buckets) }) + return err +} + +type limitListenerConn struct { + net.Conn + releaseOnce sync.Once + release 
func() + ipCache *cache.ConcurrencyLRUCache +} + +// Close override net.Conn interface +func (l *limitListenerConn) Close() error { + err := l.Conn.Close() + if err != nil { + hwlog.RunLog.Debugf("close grpc connect failed: %v", err) + return fmt.Errorf("close grpc connect failed: %v", err) + } + l.releaseOnce.Do(l.release) + ip, cacheKey := getIpAndKey(l.Conn) + if ip != "" && l.ipCache != nil { + d, err := l.ipCache.DECR(cacheKey, time.Hour) + if err != nil { + hwlog.RunLog.Error(err) + } + hwlog.RunLog.Debugf("decrement ip connections %d", d) + } + return err +} diff --git a/mind-cluster/component/ascend-common/common-utils/limiter/limit_listener_test.go b/mind-cluster/component/ascend-common/common-utils/limiter/limit_listener_test.go new file mode 100644 index 0000000..631e1bb --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/limiter/limit_listener_test.go @@ -0,0 +1,125 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package limiter implement a token bucket limiter +package limiter + +import ( + "errors" + "net" + "testing" + + "github.com/agiledragon/gomonkey/v2" + "github.com/smartystreets/goconvey/convey" +) + +const ( + len2 = 2 +) + +func TestLimitListenerAccept(t *testing.T) { + convey.Convey("test Accept function", t, func() { + + limitLor, err := LimitListener(&mockLicener{}, len2, len2, DefaultCacheSize) + if err != nil { + return + } + l, ok := limitLor.(*localLimitListener) + if !ok { + return + } + mock2 := gomonkey.ApplyFunc(getIpAndKey, func(net.Conn) (string, string) { + return "127.0.0.1", "key-127.0.0.1" + }) + defer mock2.Reset() + convey.Convey("acquire token success", func() { + _, err = l.Accept() + convey.So(err, convey.ShouldEqual, nil) + }) + + convey.Convey("accept failed", func() { + mock := gomonkey.ApplyMethodFunc(l.Listener, "Accept", func() (net.Conn, error) { + return nil, errors.New("mock error") + }) + defer mock.Reset() + con, err := l.Accept() + convey.So(err, convey.ShouldNotEqual, nil) + convey.So(con, convey.ShouldEqual, nil) + }) + + convey.Convey("acquire token failed", func() { + mock := gomonkey.ApplyPrivateMethod(l, "acquire", func(*localLimitListener) bool { + return false + }) + defer mock.Reset() + con, err := l.Accept() + convey.So(err, convey.ShouldEqual, nil) + conm, ok := con.(*limitListenerConn) + if !ok { + return + } + convey.So(conm.release, convey.ShouldNotEqual, nil) + }) + + }) +} + +type mockLicener struct { +} + +func (l *mockLicener) Accept() (net.Conn, error) { + return &net.TCPConn{}, nil +} + +func (l *mockLicener) Addr() net.Addr { + return &net.IPAddr{ + IP: []byte("127.0.0.1"), + Zone: "", + } +} + +func (l *mockLicener) Close() error { + return nil +} + +func TestGetIpAndKey(t *testing.T) { + convey.Convey("test getIp function", t, func() { + c := net.TCPConn{} + mock := gomonkey.ApplyMethodFunc(&c, "RemoteAddr", func() net.Addr { + return &net.IPAddr{ + IP: []byte("127.0.0.1"), + Zone: "", + } + }) + 
defer mock.Reset() + ip, _ := getIpAndKey(&c) + convey.So(ip, convey.ShouldNotEqual, "") + }) +} + +func TestLimitListener(t *testing.T) { + convey.Convey("test new listener function success", t, func() { + l, err := LimitListener(&mockLicener{}, maxConnection, maxIPConnection, DefaultDataLimit) + convey.So(l, convey.ShouldNotEqual, nil) + convey.So(err, convey.ShouldEqual, nil) + }) + convey.Convey("test new listener function", t, func() { + _, err := LimitListener(&mockLicener{}, maxConnection+1, maxIPConnection, DefaultDataLimit) + convey.So(err, convey.ShouldNotEqual, nil) + }) + convey.Convey("test new listener function", t, func() { + _, err := LimitListener(&mockLicener{}, maxConnection, maxIPConnection+1, DefaultDataLimit) + convey.So(err, convey.ShouldNotEqual, nil) + }) +} diff --git a/mind-cluster/component/ascend-common/common-utils/limiter/limit_writer.go b/mind-cluster/component/ascend-common/common-utils/limiter/limit_writer.go new file mode 100644 index 0000000..9117d07 --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/limiter/limit_writer.go @@ -0,0 +1,64 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package limiter implement a writer limiter +package limiter + +import ( + "bytes" + "errors" + + "ascend-common/common-utils/hwlog" +) + +const defaultLimit = 1024 + +// LimitedWriter limit the size of written data +type LimitedWriter struct { + buffer *bytes.Buffer + limit int + size int +} + +// NewLimitedWriter create a LimitedWriter +func NewLimitedWriter(limit int) *LimitedWriter { + if limit <= 0 { + hwlog.RunLog.Warnf("limit: %v is invalid, set default limit: %v", limit, defaultLimit) + limit = defaultLimit + } + return &LimitedWriter{ + buffer: &bytes.Buffer{}, + limit: limit, + } +} + +// Write write bytes to buffer +func (lw *LimitedWriter) Write(p []byte) (int, error) { + if lw.size+len(p) > lw.limit { + return 0, errors.New("buffer limit exceeded") + } + n, err := lw.buffer.Write(p) + if err == nil { + lw.size += n + } + return n, err +} + +// GetBufferBytes get buffer bytes +func (lw *LimitedWriter) GetBufferBytes() []byte { + if lw.buffer == nil { + return []byte{} + } + return lw.buffer.Bytes() +} diff --git a/mind-cluster/component/ascend-common/common-utils/limiter/limit_writer_test.go b/mind-cluster/component/ascend-common/common-utils/limiter/limit_writer_test.go new file mode 100644 index 0000000..9a308f3 --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/limiter/limit_writer_test.go @@ -0,0 +1,37 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package limiter implement a writer limiter +package limiter + +import ( + "io" + "testing" + + "github.com/smartystreets/goconvey/convey" +) + +func TestLimitWriterWrite(t *testing.T) { + convey.Convey("test limiter Writer write function", t, func() { + data := []byte("test") + limitBuffer := NewLimitedWriter(len(data)) + + n, err := limitBuffer.Write(data) + convey.So(err, convey.ShouldBeNil) + convey.So(n, convey.ShouldEqual, len(data)) + n, err = limitBuffer.Write(data) + convey.So(err, convey.ShouldEqual, io.EOF) + convey.So(n, convey.ShouldEqual, 0) + }) +} diff --git a/mind-cluster/component/ascend-common/common-utils/rand/rand_linux.go b/mind-cluster/component/ascend-common/common-utils/rand/rand_linux.go new file mode 100644 index 0000000..1a97a1b --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/rand/rand_linux.go @@ -0,0 +1,71 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package rand implement the security rand +package rand + +import ( + "errors" + "fmt" + "io" + "os" + "runtime" + "sync" + "time" +) + +const ( + maxReadSize = 1<<25 - 1 +) + +// A randomReader satisfies reads by reading the file named name. 
+type randomReader struct { + f io.Reader + mu sync.Mutex +} + +func init() { + Reader = &randomReader{} +} + +func warnBlocked() { + fmt.Println("mindx-security/rand: blocked for 60 seconds waiting to read random data from the kernel") +} + +var supportOs = "linux" + +// Read implements the interface of io.Reader +func (r *randomReader) Read(b []byte) (int, error) { + t := time.AfterFunc(time.Minute, warnBlocked) + defer t.Stop() + if len(b) > maxReadSize { + return 0, errors.New("byte size is too large") + } + r.mu.Lock() + defer r.mu.Unlock() + if runtime.GOOS != supportOs { + return 0, errors.New("not supported") + } + f, err := os.Open("/dev/random") + if err != nil { + return 0, err + } + defer func() { + err = f.Close() + if err != nil { + fmt.Println("close random file failed") + } + }() + return f.Read(b) +} diff --git a/mind-cluster/component/ascend-common/common-utils/rand/rand_linux_test.go b/mind-cluster/component/ascend-common/common-utils/rand/rand_linux_test.go new file mode 100644 index 0000000..b02d9d6 --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/rand/rand_linux_test.go @@ -0,0 +1,54 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package rand implement the security rand +package rand + +import ( + "testing" + + "github.com/agiledragon/gomonkey/v2" + "github.com/smartystreets/goconvey/convey" +) + +const ( + illegalSize = 1 << 25 +) + +func TestInnerRead(t *testing.T) { + convey.Convey("test random read func", t, func() { + reader := &randomReader{} + convey.Convey("read size too large, err returned", func() { + bs := make([]byte, illegalSize, illegalSize) + r, err := reader.Read(bs) + convey.So(err.Error(), convey.ShouldEqual, "byte size is too large") + convey.So(r, convey.ShouldEqual, 0) + }) + convey.Convey("windows,err returned", func() { + mock := gomonkey.ApplyGlobalVar(&supportOs, "windows") + defer mock.Reset() + bs := make([]byte, 1, 1) + r, err := reader.Read(bs) + convey.So(err.Error(), convey.ShouldEqual, "not supported") + convey.So(r, convey.ShouldEqual, 0) + }) + convey.Convey("normal situation,no err returned", func() { + // the length of byte is one, to prevent block when generate random + bs := make([]byte, 1, 1) + r, err := reader.Read(bs) + convey.So(err, convey.ShouldEqual, nil) + convey.So(r, convey.ShouldEqual, 1) + }) + }) +} diff --git a/mind-cluster/component/ascend-common/common-utils/rand/random.go b/mind-cluster/component/ascend-common/common-utils/rand/random.go new file mode 100644 index 0000000..353d868 --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/rand/random.go @@ -0,0 +1,28 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// Reader is the source of security random bytes used by Read. It stays nil
// until some other code (for example a platform-specific init) assigns it.
var Reader io.Reader

// readerUnsetError is returned by Read while Reader is still nil.
type readerUnsetError struct{}

// Error implements the error interface.
func (readerUnsetError) Error() string {
	return "rand: no random reader is available"
}

// Read fills b completely from Reader using io.ReadFull and returns the
// number of bytes written into b.
//
// When Reader has not been set, Read returns an error instead of panicking on
// the nil interface.
func Read(b []byte) (int, error) {
	if Reader == nil {
		return 0, readerUnsetError{}
	}
	return io.ReadFull(Reader, b)
}
+*/ + +// Package rand implement the security rand +package rand + +import ( + "testing" + + "github.com/smartystreets/goconvey/convey" +) + +func TestRead(t *testing.T) { + convey.Convey("package function test,normal situation", t, func() { + // the length of byte is one, to prevent block when generate random + bs := make([]byte, 1, 1) + l, err := Read(bs) + convey.So(err, convey.ShouldEqual, nil) + convey.So(l, convey.ShouldEqual, 1) + }) +} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/env.go b/mind-cluster/component/ascend-common/common-utils/utils/env.go new file mode 100644 index 0000000..4402375 --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/utils/env.go @@ -0,0 +1,35 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
// GetCurrentUid returns the numeric uid of the user running the process.
//
// It fails when the current user cannot be looked up, or when the reported
// uid is not a decimal number that fits into 32 unsigned bits. Negative or
// oversized values are rejected rather than silently wrapped by the uint32
// conversion.
func GetCurrentUid() (uint32, error) {
	userInfo, err := user.Current()
	if err != nil {
		return 0, fmt.Errorf("get current user info failed: %v", err)
	}
	// bitSize 32 makes ParseUint report a range error for anything that
	// would not survive the conversion below.
	uid, err := strconv.ParseUint(userInfo.Uid, 10, 32)
	if err != nil {
		return 0, fmt.Errorf("convert uid to int failed: %v", err)
	}
	return uint32(uid), nil
}
+*/ + +// Package utils env test +package utils + +import ( + "fmt" + "os/user" + "testing" + + "github.com/agiledragon/gomonkey/v2" + "github.com/smartystreets/goconvey/convey" +) + +func TestGetCurrentUid(t *testing.T) { + convey.Convey("test func GetCurrentUid success", t, func() { + var p1 = gomonkey.ApplyFuncReturn(user.Current, &user.User{Uid: "0"}, nil) + defer p1.Reset() + uid, err := GetCurrentUid() + convey.So(err, convey.ShouldBeNil) + convey.So(uid, convey.ShouldEqual, 0) + }) + convey.Convey("test func GetCurrentUid failed, get current user info failed", t, func() { + var p1 = gomonkey.ApplyFuncReturn(user.Current, nil, testErr) + defer p1.Reset() + uid, err := GetCurrentUid() + expErr := fmt.Errorf("get current user info failed: %v", testErr) + convey.So(err, convey.ShouldResemble, expErr) + convey.So(uid, convey.ShouldEqual, 0) + }) + convey.Convey("test func GetCurrentUid failed, uid is invalid", t, func() { + var p1 = gomonkey.ApplyFuncReturn(user.Current, &user.User{Uid: "invalid uid"}, nil) + defer p1.Reset() + uid, err := GetCurrentUid() + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldContainSubstring, "convert uid to int failed") + convey.So(uid, convey.ShouldEqual, 0) + }) +} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/file.go b/mind-cluster/component/ascend-common/common-utils/utils/file.go new file mode 100644 index 0000000..253e2b5 --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/utils/file.go @@ -0,0 +1,176 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package utils provides the util func +package utils + +import ( + "errors" + "fmt" + "io" + "io/ioutil" + "os" + "path/filepath" + "reflect" + "strings" +) + +const ( + // FileMode file privilege + FileMode = 0600 + // Size10M bytes of 10M + Size10M = 10 * 1024 * 1024 + maxSize = 1024 * 1024 * 1024 +) + +// ReadLimitBytes read limit length of contents from file path +func ReadLimitBytes(path string, limitLength int) ([]byte, error) { + if limitLength < 0 || limitLength > maxSize { + return nil, errors.New("the limit length is not valid") + } + + key, err := CheckPath(path) + if err != nil { + return nil, err + } + file, err := os.OpenFile(key, os.O_RDONLY, FileMode) + if err != nil { + return nil, errors.New(fmt.Sprintf("open file with read-only and %04o mode failed", FileMode)) + } + defer file.Close() + buf := make([]byte, limitLength, limitLength) + l, err := file.Read(buf) + if err != nil { + return nil, fmt.Errorf("read file failed: %v", err) + } + return buf[0:l], nil +} + +// LoadFile load file content +func LoadFile(filePath string) ([]byte, error) { + if filePath == "" { + return nil, nil + } + absPath, err := filepath.Abs(filePath) + if err != nil { + return nil, fmt.Errorf("the filePath is invalid: %v", err) + } + if !IsExist(absPath) { + return nil, nil + } + + return ReadLimitBytes(absPath, Size10M) +} + +func closeFile(file *os.File) { + if file == nil { + return + } + if err := file.Close(); err != nil { + return + } + return +} + +// CopyFile copy file +func CopyFile(src, dst string) error { + src, err := CheckPath(src) + if err != nil { + 
return err + } + if IsExist(dst) { + dst, err = CheckPath(dst) + if err != nil { + return err + } + } + + srcFile, err := os.Open(src) + if err != nil { + return err + } + defer closeFile(srcFile) + + srcInfo, err := os.Stat(src) + if err != nil { + return err + } + + dstFile, err := os.OpenFile(dst, os.O_RDWR|os.O_CREATE|os.O_TRUNC, srcInfo.Mode()) + if err != nil { + return err + } + defer closeFile(dstFile) + + if _, err = io.Copy(dstFile, srcFile); err != nil { + return err + } + return os.Chmod(dst, srcInfo.Mode()) +} + +// CopyDir recursively copy files +func CopyDir(src string, dst string) error { + var ( + err error + fds []os.FileInfo = nil + dstInfo os.FileInfo + ) + + if dstInfo, err = os.Stat(src); err != nil { + return err + } + if err = os.MkdirAll(dst, dstInfo.Mode()); err != nil { + return err + } + if subFolder(src, dst) { + return errors.New("the destination directory is a subdirectory of the source directory") + } + if fds, err = ioutil.ReadDir(src); err != nil { + return err + } + for _, fd := range fds { + srcFile := filepath.Join(src, fd.Name()) + dstFile := filepath.Join(dst, fd.Name()) + if fd.IsDir() { + if err = CopyDir(srcFile, dstFile); err != nil { + return err + } + } else { + if err = CopyFile(srcFile, dstFile); err != nil { + return err + } + } + } + return nil +} + +func subFolder(src, dst string) bool { + if src == dst { + return true + } + srcReal, err := filepath.EvalSymlinks(src) + if err != nil { + return false + } + dstReal, err := filepath.EvalSymlinks(dst) + if err != nil { + return false + } + srcList := strings.Split(srcReal, string(os.PathSeparator)) + dstList := strings.Split(dstReal, string(os.PathSeparator)) + if len(srcList) > len(dstList) { + return false + } + return reflect.DeepEqual(srcList, dstList[:len(srcList)]) +} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/file_check.go b/mind-cluster/component/ascend-common/common-utils/utils/file_check.go new file mode 100644 index 0000000..4134245 
const (
	// notValidPath is the placeholder path returned together with an error.
	notValidPath           = "not-valid-file-path"
	maxAllowFileSize int64 = 1024 * 100 // in megabytes
	oneMegabytes     int64 = 1024 * 1024
	// DefaultWhiteList default white list in string
	DefaultWhiteList = "-_./~"
	// DefaultStringLength default string max length
	DefaultStringLength = 256
	// DefaultPathLength default path max length
	DefaultPathLength = 4096
)

// RealFileChecker validates that path names a regular file of at most size
// megabytes and returns its absolute path.
//
// checkParent extends the ownership/permission checks to every parent
// directory; allowLink permits paths that resolve through symlinks. On any
// failure the returned path is the notValidPath placeholder.
func RealFileChecker(path string, checkParent, allowLink bool, size int64) (string, error) {
	realPath, fileInfo, err := realPathChecker(path, checkParent, allowLink)
	if err != nil {
		return notValidPath, err
	}
	if fileInfo.IsDir() {
		return notValidPath, fmt.Errorf("invalid dir")
	}
	if !fileInfo.Mode().IsRegular() {
		return notValidPath, fmt.Errorf("invalid regular file")
	}
	if size > maxAllowFileSize || size < 0 {
		return notValidPath, fmt.Errorf("invalid size")
	}
	if fileInfo.Size() > size*oneMegabytes {
		return notValidPath, fmt.Errorf("size too large")
	}
	return realPath, nil
}

// RealDirChecker validates that path names a directory and returns its
// absolute path; the flags have the same meaning as for RealFileChecker.
func RealDirChecker(path string, checkParent, allowLink bool) (string, error) {
	realPath, fileInfo, err := realPathChecker(path, checkParent, allowLink)
	if err != nil {
		return notValidPath, err
	}
	if !fileInfo.IsDir() {
		return notValidPath, fmt.Errorf("is not dir")
	}
	return realPath, nil
}

// PathStringChecker validates path purely as a string (length, character
// set, nesting depth) without touching the file system, and returns the
// absolute form.
func PathStringChecker(path string) (string, error) {
	realPath, err := filepath.Abs(path)
	if err != nil {
		return notValidPath, err
	}
	if len(realPath) > DefaultPathLength {
		return notValidPath, fmt.Errorf("path over max path length")
	}
	if !stringChecker(realPath, 0, DefaultPathLength) {
		return notValidPath, fmt.Errorf("invalid path")
	}
	if err = pathDepthChecker(realPath, 0); err != nil {
		return notValidPath, err
	}
	return realPath, nil
}

// VerifyFile verify the file after it is opened.
//
// It checks that size (in megabytes) is within the allowed range, that the
// file is not larger than size, that the handle is not reported as a symlink,
// and that the file is owned by the effective user of the process.
func VerifyFile(file *os.File, size int64) error {
	fileInfo, err := file.Stat()
	if err != nil {
		return err
	}
	if size > maxAllowFileSize || size < 0 {
		return fmt.Errorf("invalid size")
	}
	if fileInfo.Size() > size*oneMegabytes {
		return fmt.Errorf("file size error %v", fileInfo.Size())
	}
	if (fileInfo.Mode() & fs.ModeSymlink) != 0 {
		return fmt.Errorf("file is softlink")
	}
	// Checked assertion: an unexpected Sys() value must produce an error,
	// not a panic.
	stat, ok := fileInfo.Sys().(*syscall.Stat_t)
	if !ok {
		return fmt.Errorf("can not get stat %v", file.Name())
	}
	if stat.Uid != uint32(os.Geteuid()) {
		return fmt.Errorf("file owner incorrect")
	}
	return nil
}

// SafeChmod after the verification is complete, run the chmod command.
// The mode is changed through the opened handle that was verified.
func SafeChmod(path string, size int64, mode os.FileMode) error {
	file, err := os.Open(path)
	if err != nil {
		return err
	}
	defer file.Close()
	if err = VerifyFile(file, size); err != nil {
		return err
	}
	return file.Chmod(mode)
}

// realPathChecker runs the string and file-system checks shared by
// RealFileChecker and RealDirChecker and returns the absolute path together
// with its FileInfo.
func realPathChecker(path string, checkParent, allowLink bool) (string, os.FileInfo, error) {
	realPath, err := filepath.Abs(path)
	if err != nil {
		return notValidPath, nil, err
	}
	if len(realPath) > DefaultPathLength {
		return notValidPath, nil, fmt.Errorf("path over max path length")
	}
	if !stringChecker(realPath, 0, DefaultPathLength) {
		return notValidPath, nil, fmt.Errorf("invalid path")
	}
	if err = fileChecker(realPath, true, checkParent, allowLink, 0); err != nil {
		return notValidPath, nil, err
	}
	fileInfo, err := os.Stat(realPath)
	if err != nil {
		return notValidPath, nil, err
	}
	return realPath, fileInfo, nil
}

// fileChecker validates path and, when checkParent is set, every ancestor up
// to "/". deep is the current recursion depth; more than 99 levels is an
// error.
func fileChecker(path string, allowDir, checkParent, allowLink bool, deep int) error {
	const maxDepth int = 99
	if deep > maxDepth {
		return fmt.Errorf("over maxDepth %d", maxDepth)
	}
	fileInfo, err := normalFileCheck(path, allowDir, allowLink)
	if err != nil {
		return err
	}
	if err = checkOwnerAndPermission(fileInfo, path); err != nil {
		return err
	}
	if path != "/" && checkParent {
		return fileChecker(filepath.Dir(path), true, true, allowLink, deep+1)
	}
	return nil
}

// pathDepthChecker walks from path up to "/" and fails when that takes more
// than 99 steps.
func pathDepthChecker(path string, deep int) error {
	const maxDepth int = 99
	if deep > maxDepth {
		return fmt.Errorf("over maxDepth %d", maxDepth)
	}
	if path != "/" {
		return pathDepthChecker(filepath.Dir(path), deep+1)
	}
	return nil
}

// checkOwnerAndPermission rejects entries that are writable by group or
// others, and entries owned by anyone other than root or the real user of
// the process.
func checkOwnerAndPermission(fileInfo os.FileInfo, filePath string) error {
	const groupOrOtherWrite os.FileMode = 0022
	perm := fileInfo.Mode().Perm()
	if perm&groupOrOtherWrite != 0 {
		return fmt.Errorf("write permission not right %v %v", filePath, perm.String())
	}
	stat, ok := fileInfo.Sys().(*syscall.Stat_t)
	if !ok {
		return fmt.Errorf("can not get stat %v", filePath)
	}
	if !(int(stat.Uid) == 0 || int(stat.Uid) == os.Getuid()) {
		return fmt.Errorf("owner not right %v %v", filePath, int(stat.Uid))
	}
	return nil
}

// normalFileCheck stats filePath and rejects missing paths, symlinked paths
// (unless allowLink), anything that is not a regular file (or directory when
// allowDir), and entries with the setuid or setgid bit.
func normalFileCheck(filePath string, allowDir, allowLink bool) (os.FileInfo, error) {
	realPath, err := filepath.EvalSymlinks(filePath)
	if err != nil || (realPath != filePath && !allowLink) {
		return nil, fmt.Errorf("symlinks or not existed, failed %v, %v", filePath, err)
	}
	fileInfo, err := os.Stat(filePath)
	if err != nil {
		return nil, fmt.Errorf("get file stat failed %v", err)
	}
	if allowDir && !fileInfo.Mode().IsRegular() && !fileInfo.IsDir() {
		return nil, fmt.Errorf("not regular file/dir %v", filePath)
	}
	if !allowDir && !fileInfo.Mode().IsRegular() {
		return nil, fmt.Errorf("not regular file %v", filePath)
	}
	if fileInfo.Mode()&os.ModeSetuid != 0 {
		return nil, fmt.Errorf("setuid not allowed %v", filePath)
	}
	if fileInfo.Mode()&os.ModeSetgid != 0 {
		return nil, fmt.Errorf("setgid not allowed %v", filePath)
	}
	return fileInfo, nil
}

// isValidCode reports whether c is an ASCII letter or digit.
func isValidCode(c rune) bool {
	return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || ('0' <= c && c <= '9')
}

// isInWhiteList reports whether c is one of the DefaultWhiteList characters.
func isInWhiteList(c rune) bool {
	return strings.Contains(DefaultWhiteList, string(c))
}

// stringChecker reports whether text is strictly longer than minLength,
// strictly shorter than maxLength, and made up only of ASCII alphanumerics
// and DefaultWhiteList characters.
func stringChecker(text string, minLength, maxLength int) bool {
	if len(text) <= minLength || len(text) >= maxLength {
		return false
	}
	for _, char := range text {
		if !isValidCode(char) && !isInWhiteList(char) {
			return false
		}
	}
	return true
}
100644 index 0000000..3c8e065 --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/utils/file_check_test.go @@ -0,0 +1,194 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package mindxcheckutils is a check utils package +package utils + +import ( + "os" + "strings" + "testing" +) + +func TestNormalFileCheckRegularFile(t *testing.T) { + tmpDir, filePath, err := createTestFile(t, "test_file.txt") + defer removeTmpDir(t, tmpDir) + err = os.Symlink(filePath, tmpDir+"/syslink") + if err != nil { + t.Fatalf("create symlink failed %q: %s", filePath, err) + } + + if _, err = normalFileCheck(tmpDir, true, false); err != nil { + t.Fatalf("check allow dir failed %q: %s", tmpDir+"/__test__", err) + } + + if _, err = normalFileCheck(tmpDir, false, false); !strings.Contains(err.Error(), "not regular file") { + t.Fatalf("check not allow dir failed %q: %s", tmpDir+"/__test__", err) + } + + if _, err = normalFileCheck("/dev/zero", true, false); !strings.Contains(err.Error(), "not regular file/dir") { + t.Fatalf("check /dev/zero failed %q: %s", tmpDir+"/__test__", err) + } + + if _, err = normalFileCheck(tmpDir+"/syslink", false, false); !strings.Contains(err.Error(), "symlinks") { + t.Fatalf("check symlinks failed %q: %s", tmpDir+"/syslink", err) + } + + if _, err = normalFileCheck(filePath, false, false); err != nil { + t.Fatalf("check failed %q: %s", filePath, err) + } + + if _, err = 
normalFileCheck(tmpDir+"/notexisted", false, false); !strings.Contains(err.Error(), "not existed") { + t.Fatalf("check symlinks failed %q: %s", tmpDir+"/syslink", err) + } +} + +func TestRealFileChecker(t *testing.T) { + tmpDir, filePath, err := createTestFile(t, "test_file.txt") + if err != nil { + t.Fatalf("create file failed %q: %s", filePath, err) + } + defer removeTmpDir(t, tmpDir) + const permission os.FileMode = 0700 + err = os.WriteFile(filePath, []byte("hello\n"), permission) + if err != nil { + t.Fatalf("create file failed %q: %s", filePath, err) + } + if _, err = RealFileChecker(filePath, false, true, 0); err == nil { + t.Fatalf("size check wrong 0 %q: %s", filePath, err) + } + if _, err = RealFileChecker(filePath, false, true, 1); err != nil { + t.Fatalf("size check wrong 1 %q: %s", filePath, err) + } +} + +func TestRealFileCheckerInside(t *testing.T) { + tmpDir, filePath, err := createTestFile(t, "test_file.txt") + if err != nil { + t.Fatalf("create file failed %q: %s", filePath, err) + } + defer removeTmpDir(t, tmpDir) + const permission os.FileMode = 0700 + const deep int = 100 + err = os.WriteFile(filePath, []byte("hello\n"), permission) + if err != nil { + t.Fatalf("create file failed %q: %s", filePath, err) + } + if err = fileChecker(filePath, false, false, false, deep); err == nil { + t.Fatalf("size check wrong 0 %q: %s", filePath, err) + } +} + +func TestRealDirChecker(t *testing.T) { + tmpDir, filePath, err := createTestFile(t, "test_file.txt") + if err != nil { + t.Fatalf("create file failed %q: %s", filePath, err) + } + defer removeTmpDir(t, tmpDir) + if _, err = RealDirChecker(filePath, false, true); err == nil { + t.Fatalf("should be dir 0 %q: %s", filePath, err) + } + if _, err = RealDirChecker(tmpDir, false, true); err != nil { + t.Fatalf("should be dir 1 %q: %s", filePath, err) + } +} + +func TestVerifyFile(t *testing.T) { + tmpDir, filePath, err := createTestFile(t, "test_file.txt") + if err != nil { + t.Fatalf("create file failed %q: 
%s", filePath, err) + } + defer removeTmpDir(t, tmpDir) + err = os.Symlink(filePath, tmpDir+"/syslink") + if err != nil { + t.Fatalf("create symlink failed %q: %s", filePath, err) + } + file, err := os.Open(filePath) + if err != nil { + t.Fatalf("open file failed") + } + defer file.Close() + linkFile, err := os.Open(tmpDir + "/syslink") + if err != nil { + t.Fatalf("open file failed") + } + defer linkFile.Close() + const permission os.FileMode = 0700 + err = os.WriteFile(filePath, []byte("hello\n"), permission) + if err != nil { + t.Fatalf("create file failed %q: %s", filePath, err) + } + if err = VerifyFile(file, 0); err == nil { + t.Fatalf("size check wrong 0 %q: %s", filePath, err) + } + if err = VerifyFile(file, 1); err != nil { + t.Fatalf("size check wrong 1 %q: %s", filePath, err) + } + if err = VerifyFile(linkFile, 1); err != nil && !strings.Contains(err.Error(), "symlinks") { + t.Fatalf("check symlinks failed %q: %s", tmpDir+"/syslink", err) + } +} + +func TestStringChecker(t *testing.T) { + if ok := stringChecker("0123456789abcABC", 0, DefaultStringLength); !ok { + t.Fatalf("failed on regular letters") + } + const testSize = 3 + if ok := stringChecker("123", 0, testSize); ok { + t.Fatalf("failed on max length") + } + if ok := stringChecker("1234", 0, testSize); ok { + t.Fatalf("failed on max length") + } + if ok := stringChecker("12", 0, testSize); !ok { + t.Fatalf("failed on max length") + } + if ok := stringChecker("", 0, testSize); ok { + t.Fatalf("failed on min length") + } + if ok := stringChecker("123", testSize, DefaultStringLength); ok { + t.Fatalf("failed on min length") + } + if ok := stringChecker("123%", 0, DefaultStringLength); ok { + t.Fatalf("failed on strange words") + } + if ok := stringChecker("123.-/~", 0, DefaultStringLength); !ok { + t.Fatalf("failed on strange words") + } +} + +func createTestFile(t *testing.T, fileName string) (string, string, error) { + const fileMode os.FileMode = 0600 + tmpDir := os.TempDir() + const permission 
os.FileMode = 0700 + if os.MkdirAll(tmpDir+"/__test__", permission) != nil { + t.Fatalf("MkdirAll failed %q", tmpDir+"/__test__") + } + f, err := os.Create(tmpDir + "/__test__" + fileName) + if err != nil { + t.Fatalf("create file failed %q: %s", tmpDir+"/__test__", err) + } + defer f.Close() + err = f.Chmod(fileMode) + if err != nil { + t.Fatalf("change file mode failed %q: %s", tmpDir+"/__test__", err) + } + return tmpDir + "/__test__", tmpDir + "/__test__" + fileName, err +} + +func removeTmpDir(t *testing.T, tmpDir string) { + if os.RemoveAll(tmpDir) != nil { + t.Logf("removeall %v", tmpDir) + } +} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/file_test.go b/mind-cluster/component/ascend-common/common-utils/utils/file_test.go new file mode 100644 index 0000000..8f91417 --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/utils/file_test.go @@ -0,0 +1,169 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
// TestReadLimitBytes covers ReadLimitBytes with an empty path, a readable
// file, an invalid limit, a stubbed CheckPath failure and a stubbed
// (*os.File).Read failure.
func TestReadLimitBytes(t *testing.T) {
	convey.Convey("test ReadLimitBytes func", t, func() {
		convey.Convey("should return nil given empty string", func() {
			emptyString := ""
			const limitLength = 10
			res, err := ReadLimitBytes(emptyString, limitLength)
			convey.So(res, convey.ShouldBeNil)
			convey.So(err, convey.ShouldBeError)
		})

		convey.Convey("should not return nil given valid path", func() {
			const limitLength = 10
			res, err := ReadLimitBytes("../../go.mod", limitLength)
			convey.So(res, convey.ShouldNotBeNil)
			convey.So(err, convey.ShouldBeNil)
		})

		convey.Convey("should return nil given invalid limit length", func() {
			const limitLength = -1
			res, err := ReadLimitBytes("../../go.mod", limitLength)
			convey.So(res, convey.ShouldBeNil)
			convey.So(err.Error(), convey.ShouldEqual, "the limit length is not valid")
		})

		convey.Convey("should return nil when check path failed", func() {
			// replace CheckPath so that path validation always fails
			checkStub := gomonkey.ApplyFunc(CheckPath, func(path string) (string, error) {
				return "", errors.New("check failed")
			})
			defer checkStub.Reset()
			const limitLength = 10
			res, err := ReadLimitBytes("../../go.mod", limitLength)
			convey.So(res, convey.ShouldBeNil)
			convey.So(err.Error(), convey.ShouldEqual, "check failed")
		})

		convey.Convey("should return nil when read file failed", func() {
			// file is only used to obtain the *os.File type for the method patch
			var file *os.File
			checkStub := gomonkey.ApplyMethod(reflect.TypeOf(file), "Read",
				func(_ *os.File, _ []byte) (int, error) {
					return 0, errors.New("read file failed")
				})
			defer checkStub.Reset()
			const limitLength = 10
			res, err := ReadLimitBytes("../../go.mod", limitLength)
			convey.So(res, convey.ShouldBeNil)
			convey.So(err.Error(), convey.ShouldEqual, "read file failed: read file failed")
		})
	})
}

// TestLoadFile covers LoadFile with an empty path, a missing path, a readable
// file, a stubbed filepath.Abs failure and a stubbed ReadLimitBytes failure.
func TestLoadFile(t *testing.T) {
	convey.Convey("test LoadFile func", t, func() {
		// NOTE: despite its title, this case asserts that both the result
		// and the error are nil.
		convey.Convey("should return error given empty path", func() {
			res, err := LoadFile("")
			convey.So(res, convey.ShouldBeNil)
			convey.So(err, convey.ShouldBeNil)
		})

		convey.Convey("should return nil given path not existing", func() {
			res, err := LoadFile("xxxx")
			convey.So(res, convey.ShouldBeNil)
			convey.So(err, convey.ShouldBeNil)
		})

		convey.Convey("should not return nil given valid path", func() {
			res, err := LoadFile("../../go.mod")
			convey.So(res, convey.ShouldNotBeNil)
			convey.So(err, convey.ShouldBeNil)
		})

		convey.Convey("should return nil given invalid path", func() {
			absStub := gomonkey.ApplyFunc(filepath.Abs, func(path string) (string, error) {
				return "", errors.New("the path is invalid")
			})
			defer absStub.Reset()
			res, err := LoadFile("../../go.mod")
			convey.So(res, convey.ShouldBeNil)
			convey.So(err.Error(), convey.ShouldEqual, "the filePath is invalid: the path is invalid")
		})

		convey.Convey("should return nil when read file failed", func() {
			readStub := gomonkey.ApplyFunc(ReadLimitBytes, func(path string, limitLength int) ([]byte, error) {
				return nil, errors.New("read file failed")
			})
			defer readStub.Reset()
			res, err := LoadFile("../../go.mod")
			convey.So(res, convey.ShouldBeNil)
			convey.So(err.Error(), convey.ShouldEqual, "read file failed")
		})
	})
}

// TestCopyDir covers CopyDir with an empty source, a file as source, a real
// directory copy into ../utils_test, and a file as destination.
func TestCopyDir(t *testing.T) {
	convey.Convey("test CopyDir func", t, func() {
		convey.Convey("should return error given empty src path", func() {
			err := CopyDir("", "")
			convey.So(err, convey.ShouldNotBeNil)
		})
		convey.Convey("should return error given file src path", func() {
			err := CopyDir("../../go.mod", "")
			convey.So(err, convey.ShouldNotBeNil)
		})
		// this case creates ../utils_test, which the later cases rely on
		convey.Convey("should return nil given dir src path", func() {
			err := CopyDir("../utils", "../utils_test")
			convey.So(err, convey.ShouldBeNil)
		})
		convey.Convey("should return error given file dst path", func() {
			err := CopyDir("../utils", "../utils_test/file_test.go")
			convey.So(err, convey.ShouldNotBeNil)
		})
	})
}

// TestCopyFile covers CopyFile with empty, directory and regular-file
// arguments, then removes ../utils_test.
func TestCopyFile(t *testing.T) {
	convey.Convey("test CopyFile func", t, func() {
		convey.Convey("should return error given empty src file path", func() {
			err := CopyFile("", "../utils_test/file_test.go")
			convey.So(err, convey.ShouldNotBeNil)
		})
		convey.Convey("should return error given empty dst path", func() {
			err := CopyFile("../utils_test/file_test.go", "")
			convey.So(err, convey.ShouldNotBeNil)
		})
		convey.Convey("should return error given dir scr path", func() {
			err := CopyFile("../utils", "../utils_test/file_test.go")
			convey.So(err, convey.ShouldNotBeNil)
		})
		convey.Convey("should return error given dir dst path", func() {
			err := CopyFile("../utils/file_test.go", "../utils_test")
			convey.So(err, convey.ShouldNotBeNil)
		})
		convey.Convey("should return nil given file scr and dst path", func() {
			err := CopyFile("../utils/file_test.go", "../utils_test/file_test.go")
			convey.So(err, convey.ShouldBeNil)
		})
	})
	// clean up the directory tree produced by the copy tests
	if err := os.RemoveAll("../utils_test"); err != nil {
		fmt.Print("remove util_test file failed")
	}
}
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package utils offer utils for file watcher +package utils + +import ( + "fmt" + "os" + + "github.com/fsnotify/fsnotify" +) + +// FileWatcher struct file watcher +type FileWatcher struct { + watcher *fsnotify.Watcher +} + +// NewFileWatcher new FileWatcher +func NewFileWatcher() (*FileWatcher, error) { + watcher, err := fsnotify.NewWatcher() + if err != nil { + return nil, err + } + return &FileWatcher{watcher: watcher}, nil +} + +// WatchFile add file to watch +func (fw *FileWatcher) WatchFile(filePath string) error { + if _, err := os.Stat(filePath); err != nil { + return err + } + if _, err := PathStringChecker(filePath); err != nil { + return err + } + return fw.watcher.Add(filePath) +} + +// Events get event channel +func (fw *FileWatcher) Events() chan fsnotify.Event { + if fw == nil || fw.watcher == nil { + return nil + } + return fw.watcher.Events +} + +// Errors get error channel +func (fw *FileWatcher) Errors() chan error { + if fw == nil || fw.watcher == nil { + return nil + } + return fw.watcher.Errors +} + +// Close to close the file watcher +func (fw *FileWatcher) Close() error { + if fw == nil || fw.watcher == nil { + return nil + } + return fw.watcher.Close() +} + +// GetFileWatcherChan get eventCh and errCh for file watcher +func GetFileWatcherChan(filePath string) (*FileWatcher, error) { + watcher, err := NewFileWatcher() + if err != nil { + return nil, fmt.Errorf("new file watcher failed, error: %v", err) + } + if err = watcher.WatchFile(filePath); err != nil { + return nil, fmt.Errorf("watch file <%s> failed, error: %v", filePath, err) + 
} + fmt.Printf("watching file <%s>...\n", filePath) + return watcher, nil +} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/file_watcher_test.go b/mind-cluster/component/ascend-common/common-utils/utils/file_watcher_test.go new file mode 100644 index 0000000..32220da --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/utils/file_watcher_test.go @@ -0,0 +1,81 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package utils test for file watcher utils +package utils + +import ( + "errors" + "fmt" + "os" + "testing" + + "github.com/agiledragon/gomonkey/v2" + "github.com/fsnotify/fsnotify" + "github.com/smartystreets/goconvey/convey" +) + +var testErr = errors.New("test error") + +const ( + testFilePath = "./test.txt" + errFilePath = "./not_exist_file.txt" +) + +func TestGetFileWatcherChan(t *testing.T) { + prepareTestFile(t) + defer removeFile() + + p1 := gomonkey.ApplyFuncReturn(PathStringChecker, "", nil) + defer p1.Reset() + convey.Convey("test func GetFileWatcherChan success", t, func() { + _, err := GetFileWatcherChan(testFilePath) + convey.So(err, convey.ShouldBeNil) + }) + convey.Convey("test func GetFileWatcherChan failed, new watcher err", t, func() { + p2 := gomonkey.ApplyFuncReturn(fsnotify.NewWatcher, nil, testErr) + defer p2.Reset() + _, err := GetFileWatcherChan(testFilePath) + expErr := fmt.Errorf("new file watcher failed, error: %v", testErr) + convey.So(err, convey.ShouldResemble, expErr) + }) + convey.Convey("test func GetFileWatcherChan failed, file does not exist", t, func() { + _, err := GetFileWatcherChan(errFilePath) + expErr := fmt.Sprintf("watch file <%s> failed", errFilePath) + convey.So(err.Error(), convey.ShouldContainSubstring, expErr) + }) + convey.Convey("test func GetFileWatcherChan failed, watcher is nil", t, func() { + var watcher = &FileWatcher{} + eventCh := watcher.Events() + convey.So(eventCh, convey.ShouldBeNil) + errCh := watcher.Errors() + convey.So(errCh, convey.ShouldBeNil) + err := watcher.Close() + convey.So(err, convey.ShouldBeNil) + }) +} + +func prepareTestFile(t *testing.T) { + const mode644 = 0644 + err := os.WriteFile(testFilePath, []byte("file context"), mode644) + if err != nil { + t.Error(err) + } +} + +func removeFile() { + if err := os.Remove(testFilePath); err != nil && errors.Is(err, os.ErrNotExist) { + fmt.Printf("remove file %s failed, %v\n", testFilePath, err) + } +} diff --git 
// IsNil reports whether i is nil.
//
// It returns true both for an untyped nil interface and for an interface that
// wraps a nil value of a nillable kind (chan, func, interface, map, pointer,
// slice or unsafe pointer). Values of every other kind can never be nil, so
// false is returned for them.
func IsNil(i interface{}) bool {
	if i == nil {
		return true
	}
	v := reflect.ValueOf(i)
	// reflect.Value.IsNil panics for kinds that cannot be nil, so the kind is
	// checked up front instead of relying on recover.
	switch v.Kind() {
	case reflect.Chan, reflect.Func, reflect.Interface, reflect.Map,
		reflect.Ptr, reflect.Slice, reflect.UnsafePointer:
		return v.IsNil()
	default:
		return false
	}
}
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package utils offer the some utils for certificate handling +package utils + +import ( + "testing" + + "github.com/smartystreets/goconvey/convey" +) + +func TestIsNil(t *testing.T) { + var a interface{} // type = nil, data = nil + var b interface{} = (*int)(nil) // type is *int , data = nil + var c interface{} = "dd" + convey.Convey("test IsNil func, type and data is both nil", t, func() { + convey.So(a == nil, convey.ShouldEqual, true) + convey.So(b == nil, convey.ShouldEqual, false) + convey.So(c == nil, convey.ShouldEqual, false) + convey.So(IsNil(a), convey.ShouldEqual, true) + convey.So(IsNil(b), convey.ShouldEqual, true) + convey.So(IsNil(c), convey.ShouldEqual, false) + }) +} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/ip_utils.go b/mind-cluster/component/ascend-common/common-utils/utils/ip_utils.go new file mode 100644 index 0000000..f3ed96e --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/utils/ip_utils.go @@ -0,0 +1,98 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// ClientIP returns a best-effort client address for r.
//
// Sources are consulted in this order and the first non-empty value wins:
//  1. the first comma-separated entry of the X-Forwarded-For header,
//  2. the X-Real-Ip header,
//  3. the host part of r.RemoteAddr (which must be in "host:port" form).
//
// An empty string is returned when r is nil or none of the sources yields a
// value.
//
// NOTE(review): header values are returned as-is without being parsed as IP
// addresses; they are supplied by the peer, so callers should only rely on
// them when a trusted proxy sets these headers.
func ClientIP(r *http.Request) string {
	if r == nil {
		return ""
	}
	// Only the first entry of the forwarded chain is considered.
	forwarded := r.Header.Get("X-Forwarded-For")
	if idx := strings.IndexByte(forwarded, ','); idx >= 0 {
		forwarded = forwarded[:idx]
	}
	if ip := strings.TrimSpace(forwarded); ip != "" {
		return ip
	}
	if ip := strings.TrimSpace(r.Header.Get("X-Real-Ip")); ip != "" {
		return ip
	}
	ip, _, err := net.SplitHostPort(strings.TrimSpace(r.RemoteAddr))
	if err != nil {
		return ""
	}
	return ip
}
parsedIp.IsMulticast() { + return errors.New("is multicast ip") + } + return nil +} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/ip_utils_test.go b/mind-cluster/component/ascend-common/common-utils/utils/ip_utils_test.go new file mode 100644 index 0000000..6ad93ab --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/utils/ip_utils_test.go @@ -0,0 +1,182 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package utils offer the some utils for certificate handling +package utils + +import ( + "net/http" + "testing" + + "github.com/smartystreets/goconvey/convey" +) + +const ( + localhost = "127.0.0.1" + localhostLoop = "0.0.0.0" +) + +func TestClientIP(t *testing.T) { + convey.Convey("test ClientIP func", t, func() { + convey.Convey("get IP from X-Forwarded-For", func() { + ip := ClientIP(mockRequest(map[string][]string{"X-Forwarded-For": {localhost, localhostLoop}})) + convey.So(ip, convey.ShouldEqual, localhost) + }) + convey.Convey("get IP from X-Real-Ip", func() { + ip := ClientIP(mockRequest(map[string][]string{"X-Forwarded-For": {}, + "X-Real-Ip": {localhost}})) + convey.So(ip, convey.ShouldEqual, localhost) + }) + convey.Convey("get IP from RemoteAddr", func() { + ip := ClientIP(mockRequest(map[string][]string{"X-Forwarded-For": {}, + "X-Real-Ip": {}})) + convey.So(ip, convey.ShouldEqual, localhost) + }) + convey.Convey("get IP from RemoteAddr failed", func() { + ip := ClientIP(&http.Request{RemoteAddr: localhost}) + convey.So(ip, convey.ShouldEqual, "") + }) + convey.Convey("get IP failed", func() { + ip := ClientIP(&http.Request{}) + convey.So(ip, convey.ShouldEqual, "") + }) + }) +} + +func mockRequest(header map[string][]string) *http.Request { + return &http.Request{ + Method: "GET", + URL: nil, + Proto: "HTTP", + ProtoMajor: 0, + ProtoMinor: 0, + Header: header, + ContentLength: 0, + Close: false, + Host: "www.test.com", + RemoteAddr: "127.0.0.1:8080", + } +} + +func TestCheckDomain(t *testing.T) { + convey.Convey("CheckDomain function test suite", t, func() { + testDomainFormatValidation() + testLocalUsageConstraints() + testParameterCombinations() + }) +} + +// Test domain format validation +func testDomainFormatValidation() { + convey.Convey("Validate domain format rules", func() { + convey.Convey("Valid domain should pass validation", func() { + err := CheckDomain("example.com", false) + convey.So(err, convey.ShouldBeNil) + }) + + 
convey.Convey("Domain with special characters should be rejected", func() { + err := CheckDomain("example@com", false) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldContainSubstring, "domain does not match allowed regex") + }) + + convey.Convey("Domain starting with hyphen should be rejected", func() { + err := CheckDomain("-example.com", false) + convey.So(err, convey.ShouldNotBeNil) + }) + }) +} + +// Test local usage constraints +func testLocalUsageConstraints() { + convey.Convey("Validate constraints for local usage (forLocalUsage=true)", func() { + convey.Convey("All-digit domain should be rejected", func() { + err := CheckDomain("123456", true) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldContainSubstring, "domain can not be all digits") + }) + + convey.Convey("Domain containing 'localhost' should be rejected", func() { + err := CheckDomain("my-localhost.com", true) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldContainSubstring, "domain can not contain localhost") + }) + + convey.Convey("Valid local domain should pass validation", func() { + err := CheckDomain("local-app.example", true) + convey.So(err, convey.ShouldBeNil) + }) + }) +} + +// Test parameter combinations +func testParameterCombinations() { + convey.Convey("Validate parameter combinations", func() { + convey.Convey("All-digit restriction ignored when forLocalUsage=false", func() { + err := CheckDomain("123456", false) + convey.So(err, convey.ShouldBeNil) + }) + + convey.Convey("DNS check skipped when forLocalUsage=false", func() { + err := CheckDomain("unresolvable.test", false) + convey.So(err, convey.ShouldBeNil) + }) + }) +} + +func TestIsHostValid(t *testing.T) { + tests := []struct { + name string + ip string + wantErr bool + errMsg string + }{ + { + name: "invalid IP format but domain", ip: "not.an.ip", + wantErr: false, + }, + { + name: "valid IPv4", ip: "192.168.1.1", wantErr: false, + }, + { 
+ name: "valid IPv6", ip: "2001:0db8:85a3:0000:0000:8a2e:0370:7334", + wantErr: false, + }, + { + name: "unspecified IPv4", ip: "0.0.0.0", + wantErr: true, errMsg: "is all zeros ip", + }, + { + name: "unspecified IPv6", ip: "::", + wantErr: true, errMsg: "is all zeros ip", + }, + { + name: "IPv6 multicast", ip: "ff02::1", + wantErr: true, errMsg: "is multicast ip", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := IsHostValid(tt.ip) + if (err != nil) != tt.wantErr { + t.Errorf("IsIPValid() error = %v, wantErr %v", err, tt.wantErr) + return + } + if err != nil && err.Error() != tt.errMsg { + t.Errorf("IsIPValid() error = %v, wantErrMsg %v", + err.Error(), tt.errMsg) + } + }) + } +} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/path.go b/mind-cluster/component/ascend-common/common-utils/utils/path.go new file mode 100644 index 0000000..b3150b9 --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/utils/path.go @@ -0,0 +1,382 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
// IsSoftlink reports whether path itself is a symbolic link.
//
// The path is inspected with os.Lstat, which describes the link rather than
// the file it points to. Opening the file and calling Stat on the handle
// would follow the link and therefore never observe fs.ModeSymlink.
// An error is returned when the path cannot be inspected (for example when it
// does not exist).
func IsSoftlink(path string) (bool, error) {
	info, err := os.Lstat(path)
	if err != nil {
		return false, err
	}
	return info.Mode()&fs.ModeSymlink != 0, nil
}
absolute path +func CheckPath(path string) (string, error) { + if path == "" { + return path, nil + } + origin := path + for !IsLexist(path) { + path = filepath.Dir(path) + if path == "." { + return "", os.ErrNotExist + } + } + absPath, err := filepath.Abs(path) + if err != nil { + return "", fmt.Errorf("get the absolute path failed: %v", err) + } + resoledPath, err := filepath.EvalSymlinks(absPath) + if err != nil { + if strings.Contains(err.Error(), "no such file or directory") { + return "", os.ErrNotExist + } + return "", fmt.Errorf("get the symlinks path failed: %v", err) + } + if absPath != resoledPath { + return "", errors.New("can't support symlinks") + } + // get the original full path + absOrigin, err := filepath.Abs(origin) + if err != nil { + return "", fmt.Errorf("get the absolute path failed: %v", err) + } + return absOrigin, nil +} + +// MakeSureDir create directory. The last element of path should end with slash, or it will be omitted. +func MakeSureDir(path string) error { + dir := filepath.Dir(path) + if IsExist(dir) { + return nil + } + + if err := os.MkdirAll(dir, dirMode); err != nil { + return fmt.Errorf("create directory failed: %v", err) + } + + return nil +} + +// CheckMode check input file mode whether includes invalid mode. +// For example, if read operation of group and other is forbidden, then call CheckMode(inputFileMode, 0044). +// All operations are forbidden for group and other, then call CheckMode(inputFileMode, 0077). 
+// Write operation is forbidden for group and other by default, with calling CheckMode(inputFileMode) +func CheckMode(mode os.FileMode, optional ...os.FileMode) bool { + var targetMode os.FileMode + if len(optional) > 0 { + targetMode = optional[0] + } else { + targetMode = DefaultWriteFileMode + } + checkMode := uint32(mode) & uint32(targetMode) + return checkMode == 0 +} + +// CheckOwnerAndPermission check path owner and permission +func CheckOwnerAndPermission(verifyPath string, mode os.FileMode, uid uint32) (string, error) { + if verifyPath == "" { + return verifyPath, errors.New("empty path") + } + absPath, err := filepath.Abs(verifyPath) + if err != nil { + return "", fmt.Errorf("abs failed %v", err) + } + resoledPath, err := filepath.EvalSymlinks(absPath) + if err != nil { + return "", fmt.Errorf("evalSymlinks failed %v", err) + } + // if symlinks + if absPath != resoledPath { + // check symlinks its self owner + pathInfo, err := os.Lstat(absPath) + if err != nil { + return "", fmt.Errorf("lstat failed, %v", err) + } + stat, ok := pathInfo.Sys().(*syscall.Stat_t) + if !ok || stat.Uid != uid { + return "", errors.New("symlinks owner may not root") + } + } + pathInfo, err := os.Stat(resoledPath) + if err != nil { + return "", fmt.Errorf("stat failed %v", err) + } + stat, ok := pathInfo.Sys().(*syscall.Stat_t) + if !ok || stat.Uid != uid || !CheckMode(pathInfo.Mode(), mode) { + return "", errors.New("check uid or mode failed") + } + return resoledPath, nil +} + +// DoCheckOwnerAndPermission check path owner and permission +func DoCheckOwnerAndPermission(path string, mode os.FileMode, uid uint32) error { + if !IsExist(path) { + return nil + } + pathInfo, err := os.Stat(path) + if err != nil { + return fmt.Errorf("stat failed %v", err) + } + stat, ok := pathInfo.Sys().(*syscall.Stat_t) + if !ok || stat.Uid != uid || !CheckMode(pathInfo.Mode(), mode) { + return fmt.Errorf("check uid or mode failed : %v", path) + } + return nil +} + +func checkAbsPath(libPath 
string) (string, error) { + absLibPath, err := CheckOwnerAndPermission(libPath, DefaultWriteFileMode, rootUID) + if err != nil { + return "", fmt.Errorf("%s: %v", libPath, err) + } + count := 0 + fPath := absLibPath + for { + if count >= maxPathDepth { + break + } + count++ + if fPath == "/" { + return absLibPath, nil + } + fPath = filepath.Dir(fPath) + if _, err := CheckOwnerAndPermission(fPath, DefaultWriteFileMode, rootUID); err != nil { + return "", fmt.Errorf("%s: %v", fPath, err) + } + } + return "", errors.New("absolute path check failed") +} + +func checkLibsPath(libraryPaths []string) (string, error) { + errs := make([]string, 0, len(libraryPaths)) + for _, libraryAbsName := range libraryPaths { + absLibPath, err := checkAbsPath(libraryAbsName) + if err == nil { + return absLibPath, nil + } + errs = append(errs, fmt.Sprintf("%s;", err.Error())) + } + return "", fmt.Errorf("lib path is invalid, %v", errs) +} + +func getLibFromEnv(libraryName string) (string, error) { + ldLibraryPath := os.Getenv(LdLibPath) + if len(ldLibraryPath) > maxPathLength { + return "", fmt.Errorf("invalid library path env") + } + libraryPaths := strings.Split(ldLibraryPath, ":") + targetLibs := make([]string, 0, len(ldLibraryPath)) + for _, libraryPath := range libraryPaths { + libraryAbsName := path.Join(libraryPath, libraryName) + if len(libraryAbsName) > maxPathLength || !IsLexist(libraryAbsName) { + continue + } + targetLibs = append(targetLibs, libraryAbsName) + } + if len(libraryPaths) == 0 { + return "", errors.New("file path no exist or too long") + } + return checkLibsPath(targetLibs) +} + +func trimSpaceTable(data string) string { + data = strings.Replace(data, " ", "", -1) + data = strings.Replace(data, "\t", "", -1) + data = strings.Replace(data, "\n", "", -1) + return data +} + +func parserLibPath(line, libraryName string) string { + ldInfo := strings.Split(line, "=>") + if len(ldInfo) < ldSplitLen { + return "" + } + libNames := strings.Split(ldInfo[ldLibNameIndex], " 
") + for index, libName := range libNames { + if index >= maxPathDepth { + break + } + if len(libName) == 0 { + continue + } + if name := trimSpaceTable(libName); name != libraryName { + continue + } + return trimSpaceTable(ldInfo[ldLibPathIndex]) + } + return "" +} + +func parseLibFromLdCmd(libraryName string) (string, error) { + ldCmd := exec.Command(ldCommand, ldParam) + grepCmd := exec.Command(grepCommand, libraryName) + ldCmdStdout, err := ldCmd.StdoutPipe() + if err != nil { + return "", fmt.Errorf("command exec failed: %v", err) + } + grepCmd.Stdin = ldCmdStdout + stdout, err := grepCmd.StdoutPipe() + if err != nil { + return "", fmt.Errorf("get pipe failed: %v", err) + } + if err = grepCmd.Start(); err != nil { + return "", fmt.Errorf("command exec failed: %v", err) + } + if err = ldCmd.Run(); err != nil { + return "", fmt.Errorf("command exec failed: %v", err) + } + defer func() { + if err = grepCmd.Wait(); err != nil { + log.Printf("command exec failed, %v", err) + } + }() + reader := bufio.NewReader(stdout) + count := 0 + line := "" + for { + if count >= maxPathLength { + err = errors.New("too many items in command stdout") + break + } + count++ + line, err = reader.ReadString('\n') + if err != nil || io.EOF == err { + break + } + if libPath := parserLibPath(line, libraryName); libPath != "" { + return libPath, nil + } + } + return "", fmt.Errorf("can't find valid lib: %v", err) +} + +func getLibFromLdCmd(libraryName string) (string, error) { + libraryAbsName, err := parseLibFromLdCmd(libraryName) + if err != nil { + return "", err + } + var absLibPath string + if absLibPath, err = checkAbsPath(libraryAbsName); err == nil { + return absLibPath, nil + } + return "", fmt.Errorf("driver lib is not exist or it's permission is invalid, %v", err) +} + +// GetDriverLibPath get driver lib path from ld config +func GetDriverLibPath(libraryName string) (string, error) { + var libPath string + var envErr, cmdErr error + if libPath, envErr = 
getLibFromEnv(libraryName); envErr == nil { + return libPath, nil + } + if libPath, cmdErr = getLibFromLdCmd(libraryName); cmdErr == nil { + return libPath, nil + } + return "", fmt.Errorf("cannot found valid driver lib, fromEnv: %v, fromLdCmd: %v", envErr, cmdErr) +} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/path_test.go b/mind-cluster/component/ascend-common/common-utils/utils/path_test.go new file mode 100644 index 0000000..4e2346f --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/utils/path_test.go @@ -0,0 +1,232 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package utils provides the util func +package utils + +import ( + "errors" + "fmt" + "os" + "path/filepath" + "syscall" + "testing" + "time" + + "github.com/agiledragon/gomonkey/v2" + "github.com/smartystreets/goconvey/convey" +) + +func TestIsDir(t *testing.T) { + convey.Convey("test logger", t, func() { + convey.Convey("test IsDir func", func() { + res := IsDir("/tmp/") + convey.So(res, convey.ShouldBeTrue) + res = IsDir("/utils/") + convey.So(res, convey.ShouldBeTrue) + res = IsDir("") + convey.So(res, convey.ShouldBeFalse) + }) + }) +} + +func TestIsFile(t *testing.T) { + convey.Convey("test IsFile func", t, func() { + res := IsFile("/tmp/") + convey.So(res, convey.ShouldBeFalse) + res = IsFile("") + convey.So(res, convey.ShouldBeFalse) + }) +} + +func TestIsExist(t *testing.T) { + convey.Convey("test IsExist func", t, func() { + res := IsExist("/xxxx/") + convey.So(res, convey.ShouldBeFalse) + }) +} + +func TestIsLexist(t *testing.T) { + convey.Convey("test IsLexist func", t, func() { + res := IsLexist("/xxxx/") + convey.So(res, convey.ShouldBeFalse) + }) +} + +func TestCheckPath(t *testing.T) { + convey.Convey("test CheckPath func", t, func() { + convey.Convey("should return itself given empty string", func() { + res, err := CheckPath("") + convey.So(res, convey.ShouldBeEmpty) + convey.So(err, convey.ShouldBeNil) + }) + + convey.Convey("should return error given not exist path", func() { + res, err := CheckPath("xxxxxxx") + convey.So(res, convey.ShouldBeEmpty) + convey.So(err.Error(), convey.ShouldEqual, "file does not exist") + }) + + convey.Convey("should return resolve path given normal path", func() { + res, err := CheckPath("../../go.mod") + convey.So(res, convey.ShouldNotBeEmpty) + convey.So(err, convey.ShouldBeNil) + }) + + convey.Convey("should return err when get abs path failed", func() { + absStub := gomonkey.ApplyFunc(filepath.Abs, func(path string) (string, error) { + return "", errors.New("abs failed") + }) + defer absStub.Reset() + 
res, err := CheckPath("../../go.mod") + convey.So(res, convey.ShouldBeEmpty) + convey.So(err.Error(), convey.ShouldEqual, "get the absolute path failed: abs failed") + }) + + convey.Convey("should return err when get eval symbol link failed", func() { + symStub := gomonkey.ApplyFunc(filepath.EvalSymlinks, func(path string) (string, error) { + return "", errors.New("symlinks path failed") + }) + defer symStub.Reset() + res, err := CheckPath("../../go.mod") + convey.So(res, convey.ShouldBeEmpty) + convey.So(err.Error(), convey.ShouldEqual, "get the symlinks path failed: symlinks path failed") + }) + + convey.Convey("should return err given symbol link", func() { + symStub := gomonkey.ApplyFunc(filepath.EvalSymlinks, func(path string) (string, error) { + return "xxx", nil + }) + defer symStub.Reset() + res, err := CheckPath("../../go.mod") + convey.So(res, convey.ShouldBeEmpty) + convey.So(err.Error(), convey.ShouldEqual, "can't support symlinks") + }) + + }) +} + +func TestMakeSureDir(t *testing.T) { + convey.Convey("test MakeSureDir func", t, func() { + convey.Convey("normal situation, no err returned", func() { + err := MakeSureDir("./testdata/tmp/test") + convey.So(err, convey.ShouldEqual, nil) + }) + convey.Convey("abnormal situation,err returned", func() { + mock := gomonkey.ApplyFunc(os.MkdirAll, func(name string, perm os.FileMode) error { + return fmt.Errorf("error") + }) + defer mock.Reset() + err := MakeSureDir("./xxxx/xxx") + convey.So(err.Error(), convey.ShouldEqual, "create directory failed: error") + }) + }) +} + +func TestGetDriverLibPath(t *testing.T) { + convey.Convey("test GetDriverLibPath func", t, func() { + convey.Convey("should return itself given empty string", func() { + err := os.Setenv(LdLibPath, "") + convey.So(err, convey.ShouldBeNil) + res, err := GetDriverLibPath("") + convey.So(res, convey.ShouldBeEmpty) + convey.So(err, convey.ShouldBeError) + }) + + convey.Convey("should return path when getLibFromEnv succeed", func() { + envStub := 
gomonkey.ApplyFunc(getLibFromEnv, func(libraryName string) (string, error) { + return "/test", nil + }) + defer envStub.Reset() + res, err := GetDriverLibPath("") + convey.So(res, convey.ShouldEqual, "/test") + convey.So(err, convey.ShouldBeNil) + }) + + convey.Convey("should return path when getLibFromEnv failed but getLibFromLdCmd succeed", func() { + envStub := gomonkey.ApplyFunc(getLibFromEnv, func(libraryName string) (string, error) { + return "", errors.New("failed") + }) + defer envStub.Reset() + cmdStub := gomonkey.ApplyFunc(getLibFromLdCmd, func(libraryName string) (string, error) { + return "/test", nil + }) + defer cmdStub.Reset() + res, err := GetDriverLibPath("") + convey.So(res, convey.ShouldEqual, "/test") + convey.So(err, convey.ShouldBeNil) + }) + + }) +} + +type mockFileInfo struct { + mode os.FileMode + sys interface{} +} + +func (m *mockFileInfo) Name() string { return "mock" } +func (m *mockFileInfo) Size() int64 { return 0 } +func (m *mockFileInfo) Mode() os.FileMode { return m.mode } +func (m *mockFileInfo) ModTime() time.Time { return time.Now() } +func (m *mockFileInfo) IsDir() bool { return false } +func (m *mockFileInfo) Sys() interface{} { return m.sys } + +func TestDoCheckOwnerAndPermission(t *testing.T) { + var testPath = "/test" + var testMode os.FileMode = 0660 + var excludePermissions os.FileMode = 0002 + patch := gomonkey.NewPatches() + defer patch.Reset() + convey.Convey("should return nil when path is not exist", t, func() { + patch.ApplyFuncReturn(IsExist, false) + defer patch.Reset() + err := DoCheckOwnerAndPermission(testPath, excludePermissions, rootUID) + convey.So(err, convey.ShouldBeNil) + }) + + patch.ApplyFuncReturn(IsExist, true) + convey.Convey("should return err when stat failed", t, func() { + patch.ApplyFuncReturn(os.Stat, nil, os.ErrNotExist) + defer patch.Reset() + err := DoCheckOwnerAndPermission(testPath, excludePermissions, rootUID) + convey.So(err.Error(), convey.ShouldContainSubstring, "stat failed") + }) + + 
convey.Convey("should return err when get uid failed", t, func() { + patch.ApplyFuncReturn(os.Stat, &mockFileInfo{mode: testMode, sys: "invalid-type"}, nil) + defer patch.Reset() + + err := DoCheckOwnerAndPermission(testPath, excludePermissions, rootUID) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldContainSubstring, "check uid or mode failed") + }) + + convey.Convey("should return err when permission check failure", t, func() { + patch.ApplyFuncReturn(os.Stat, &mockFileInfo{mode: testMode, sys: &syscall.Stat_t{Uid: rootUID}}, nil) + patch.ApplyFuncReturn(CheckMode, false) + defer patch.Reset() + err := DoCheckOwnerAndPermission(testPath, excludePermissions, rootUID) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldContainSubstring, "check uid or mode failed") + }) + + convey.Convey("should return nil where all checks pass", t, func() { + patch.ApplyFuncReturn(os.Stat, &mockFileInfo{mode: testMode, sys: &syscall.Stat_t{Uid: rootUID}}, nil) + patch.ApplyFuncReturn(CheckMode, true) + defer patch.Reset() + err := DoCheckOwnerAndPermission(testPath, excludePermissions, rootUID) + convey.So(err, convey.ShouldBeNil) + }) +} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/pwd_util.go b/mind-cluster/component/ascend-common/common-utils/utils/pwd_util.go new file mode 100644 index 0000000..49c2f36 --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/utils/pwd_util.go @@ -0,0 +1,75 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package utils this file for password handler +package utils + +import ( + "bytes" + "errors" + "regexp" +) + +const ( + lowercaseCharactersRegex = `[a-z]{1,}` + uppercaseCharactersRegex = `[A-Z]{1,}` + baseNumberRegex = `[0-9]{1,}` + specialCharactersRegex = `[!\"#$%&'()*+,\-. /:;<=>?@[\\\]^_\x60{|}~]{1,}` + passWordRegex = `^[a-zA-Z0-9!\"#$%&'()*+,\-. /:;<=>?@[\\\]^_\x60{|}~]{8,64}$` + minComplexCount = 2 +) + +// CheckPassWordComplexity check password complexity +func CheckPassWordComplexity(s []byte) error { + complexCheckRegexArr := []string{ + lowercaseCharactersRegex, + uppercaseCharactersRegex, + baseNumberRegex, + specialCharactersRegex, + } + complexCount := 0 + for _, pattern := range complexCheckRegexArr { + if matched, err := regexp.Match(pattern, s); matched && err == nil { + complexCount++ + } + } + if complexCount < minComplexCount { + return errors.New("password complex not meet the requirement") + } + return nil +} + +// ValidatePassWord validate password +func ValidatePassWord(userName string, passWord []byte) error { + if err := commonCheckForPassWord(userName, passWord); err != nil { + return err + } + return CheckPassWordComplexity(passWord) +} + +func commonCheckForPassWord(userName string, passWord []byte) error { + if matched, err := regexp.Match(passWordRegex, passWord); err != nil || !matched { + return errors.New("password not meet requirement") + } + var userNameByte []byte = []byte(userName) + if bytes.Equal(userNameByte, passWord) { + return errors.New("password cannot equals username") + } + var reverseUserName = 
ReverseString(userName) + var reverseUserNameByte []byte = []byte(reverseUserName) + if bytes.Equal(reverseUserNameByte, passWord) { + return errors.New("password cannot equal reversed username") + } + return nil +} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/pwd_util_test.go b/mind-cluster/component/ascend-common/common-utils/utils/pwd_util_test.go new file mode 100644 index 0000000..808c231 --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/utils/pwd_util_test.go @@ -0,0 +1,59 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package utils this file for password handler +package utils + +import ( + "testing" + + "github.com/smartystreets/goconvey/convey" +) + +var ( + truePasswd = []byte("aA0!\"#$%&'()*+,-. 
/:;<=>?@[\\]^_`{|}~") + falsePasswd1 = []byte("userName") + falsePasswd2 = []byte("12345678") + falsePasswd3 = []byte("1234567") + falsePasswd4 = []byte("emaNresu.") + falsePasswd5 = []byte("不支持特殊字符测试test") +) + +// TestCommonCheckForPassWord test common check for passWord +func TestCommonCheckForPassWord(t *testing.T) { + convey.Convey("correct password", t, func() { + err := ValidatePassWord("userName", truePasswd) + convey.So(err, convey.ShouldBeNil) + }) + convey.Convey("username == password", t, func() { + err := ValidatePassWord("userName", falsePasswd1) + convey.So(err.Error(), convey.ShouldEqual, "password cannot equals username") + }) + convey.Convey("complex not meet the requirement", t, func() { + err := ValidatePassWord("userName", falsePasswd2) + convey.So(err.Error(), convey.ShouldEqual, "password complex not meet the requirement") + }) + convey.Convey("password too short", t, func() { + err := ValidatePassWord("userName", falsePasswd3) + convey.So(err.Error(), convey.ShouldEqual, "password not meet requirement") + }) + convey.Convey("username equal reverse password", t, func() { + err := ValidatePassWord(".userName", falsePasswd4) + convey.So(err.Error(), convey.ShouldEqual, "password cannot equal reversed username") + }) + convey.Convey("test special ", t, func() { + err := ValidatePassWord("userName", falsePasswd5) + convey.So(err.Error(), convey.ShouldEqual, "password not meet requirement") + }) +} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/slice.go b/mind-cluster/component/ascend-common/common-utils/utils/slice.go new file mode 100644 index 0000000..f673bc1 --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/utils/slice.go @@ -0,0 +1,129 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
// hex is the numeric base used when parsing hexadecimal strings
const hex = 16

// stringTool groups helpers that work on string slices; use it through StringTool
type stringTool struct{}

// StringTool slice for string tool
var StringTool stringTool

// HexStringToInt parses every element of sources as a base-16 integer and returns the set of
// distinct values. Elements that fail to parse are reported on stdout and skipped. Because
// an explicit base is passed to strconv.ParseInt, a "0x" prefix is not accepted.
func (s stringTool) HexStringToInt(sources []string) map[int64]struct{} {
	// parse at the width of the int64 result rather than the platform int size
	const int64Bits = 64
	intMap := make(map[int64]struct{}, len(sources))
	for _, source := range sources {
		num, err := strconv.ParseInt(source, hex, int64Bits)
		if err != nil {
			fmt.Printf("parse hex to int failed, skip it. error: %v\n", err)
			continue
		}
		intMap[num] = struct{}{}
	}
	return intMap
}

// Contains check whether slice contains target
func Contains[T comparable](sources []T, target T) bool {
	return slices.Contains(sources, target)
}

// Remove returns slice without its first element equal to target. When an element is removed
// the result is a newly allocated slice, so the caller's slice is left untouched; when target
// is absent the input slice itself is returned.
func Remove[T comparable](slice []T, target T) []T {
	idx := slices.Index(slice, target)
	if idx < 0 {
		return slice
	}
	result := make([]T, 0, len(slice)-1)
	result = append(result, slice[:idx]...)
	return append(result, slice[idx+1:]...)
}

// RemoveDuplicates returns the elements of slice in their original order with every repeated
// value dropped. The result is always non-nil.
func RemoveDuplicates[T comparable](slice []T) []T {
	existMap := make(map[T]struct{}, len(slice))
	result := make([]T, 0, len(slice))
	for _, item := range slice {
		if _, ok := existMap[item]; !ok {
			existMap[item] = struct{}{}
			result = append(result, item)
		}
	}
	return result
}

// SameElementInMap reports whether at least one element of targets is a key of sources
func SameElementInMap[T comparable](sources map[T]struct{}, targets []T) bool {
	for _, target := range targets {
		if _, ok := sources[target]; ok {
			return true
		}
	}
	return false
}

// RemoveEleSli returns the elements of source that do not occur in target, keeping their
// order. The result is always non-nil.
func RemoveEleSli[T comparable](source, target []T) []T {
	sliMap := make(map[T]struct{}, len(target))
	for _, item := range target {
		sliMap[item] = struct{}{}
	}

	result := make([]T, 0, len(source))
	for _, ele := range source {
		if _, ok := sliMap[ele]; !ok {
			result = append(result, ele)
		}
	}
	return result
}

// RemoveElementsNotInSecond returns the elements of slice1 that also occur in slice2, keeping
// the order of slice1. The result is always non-nil.
func RemoveElementsNotInSecond[T comparable](slice1, slice2 []T) []T {
	sliMap := make(map[T]struct{}, len(slice2))
	for _, item := range slice2 {
		sliMap[item] = struct{}{}
	}

	result := make([]T, 0, len(slice1))
	for _, item := range slice1 {
		if _, ok := sliMap[item]; ok {
			result = append(result, item)
		}
	}
	return result
}

// CheckSliceSupport returns an error naming the first element of elements that is not present
// in expects, or nil when every element is supported.
func CheckSliceSupport(elements []int64, expects []int64) error {
	for _, e := range elements {
		if !slices.Contains(expects, e) {
			return fmt.Errorf("element %v is not contained in %v", e, expects)
		}
	}
	return nil
}
const (
	// decimal1A is 0x1A in decimal
	decimal1A = 26
	// decimalFF is 0xFF in decimal
	decimalFF = 255
	// decimal10 is 0x10 in decimal
	decimal10 = 16
	// decimalNegFF is -0xFF in decimal
	decimalNegFF = -255
)

// buildHexStringToIntTestCase returns the table used by TestHexStringToInt.
// HexStringToInt parses with an explicit base of 16, so inputs carrying a "0x" prefix are
// rejected and skipped; several cases below depend on exactly that.
func buildHexStringToIntTestCase() []struct {
	name     string
	input    []string
	expected map[int64]struct{}
} {
	return []struct {
		name     string
		input    []string
		expected map[int64]struct{}
	}{
		{
			name:  "01 - Valid hex strings",
			input: []string{"1A", "FF", "10"},
			expected: map[int64]struct{}{
				decimal1A: {},
				decimalFF: {},
				decimal10: {},
			},
		},
		{
			name:     "02 - Invalid hex strings",
			input:    []string{"xyz", "ghijk"},
			expected: map[int64]struct{}{},
		},
		{
			name:     "03 - Empty input array",
			input:    []string{},
			expected: map[int64]struct{}{},
		},
		{
			name: "04 - Duplicate values should be deduplicated",
			// only "1A" parses (to 26); both "0x"-prefixed spellings are rejected
			input: []string{"0x1A", "1A", "0x1a"},
			expected: map[int64]struct{}{
				decimal1A: {},
			},
		},
		{
			name: "05 - Mixed valid and invalid inputs",
			// every element is rejected: three carry a "0x" prefix, two are not hex at all
			input:    []string{"0x1A", "xyz", "0xFF", "invalid", "0x10"},
			expected: map[int64]struct{}{},
		},
		{
			name: "06 - Negative hex numbers",
			// "-0x1A" is rejected because of its prefix; "-FF" parses to -255
			input: []string{"-0x1A", "-FF"},
			expected: map[int64]struct{}{
				decimalNegFF: {},
			},
		},
	}
}
func(t *testing.T) { + result := StringTool.HexStringToInt(tt.input) + for i := range tt.expected { + fmt.Println(i) + } + if len(result) != len(tt.expected) { + t.Errorf("Expected map length %d, but got %d", len(tt.expected), len(result)) + return + } + for key := range tt.expected { + if _, exists := result[key]; !exists { + t.Errorf("Expected key %d not found in result", key) + } + } + for key := range result { + if _, exists := tt.expected[key]; !exists { + t.Errorf("Unexpected key %d found in result", key) + } + } + }) + } +} + +func TestSameElementInMap(t *testing.T) { + for _, tt := range buildSameElementInMapTestCase() { + t.Run(tt.name, func(t *testing.T) { + result := SameElementInMap(tt.sources, tt.targets) + if result != tt.expected { + t.Errorf("SameElementInMap() = %v, expected %v", result, tt.expected) + } + }) + } +} + +func buildSameElementInMapTestCase() []struct { + name string + sources map[int]struct{} + targets []int + expected bool +} { + return []struct { + name string + sources map[int]struct{} + targets []int + expected bool + }{ + { + name: "01 There are identical elements present", + sources: map[int]struct{}{1: {}, 2: {}, 3: {}}, + targets: []int{4, 5, 2}, + expected: true, + }, + { + name: "02 There are no identical elements present\n", + sources: map[int]struct{}{1: {}, 2: {}, 3: {}}, + targets: []int{4, 5, 6}, + expected: false, + }, + { + name: "03 target is nil", + sources: map[int]struct{}{1: {}, 2: {}}, + targets: []int{}, + expected: false, + }, + { + name: "04 source is nil", + sources: map[int]struct{}{}, + targets: []int{1, 2, 3}, + expected: false, + }, + { + name: "05 source and target are both nil", + sources: map[int]struct{}{}, + targets: []int{}, + expected: false, + }, + } +} + +func TestSameElementInMap_StringType(t *testing.T) { + sources := map[string]struct{}{ + "apple": {}, + "banana": {}, + "orange": {}, + } + targets := []string{"grape", "apple", "kiwi"} + result := SameElementInMap(sources, targets) + if 
!result { + t.Errorf("SameElementInMap() with string type should return true, got false") + } + targetsNoMatch := []string{"grape", "kiwi", "mango"} + resultNoMatch := SameElementInMap(sources, targetsNoMatch) + if resultNoMatch { + t.Errorf("SameElementInMap() with string type should return false, got true") + } +} + +func TestContains(t *testing.T) { + for _, tt := range buildContainsTestCase() { + t.Run(tt.name, func(t *testing.T) { + switch s1 := tt.source.(type) { + case []int: + s2 := tt.target.(int) + result := Contains(s1, s2) + if !reflect.DeepEqual(result, tt.expected) { + t.Errorf("Contains() = %v, want %v", result, tt.expected) + } + case []string: + s2 := tt.target.(string) + result := Contains(s1, s2) + if !reflect.DeepEqual(result, tt.expected) { + t.Errorf("Contains() = %v, want %v", result, tt.expected) + } + default: + t.Errorf("unsupported type") + } + }) + } +} + +func buildContainsTestCase() []struct { + name string + source interface{} + target interface{} + expected bool +} { + return []struct { + name string + source interface{} + target interface{} + expected bool + }{ + { + name: "01 contains for int type", + source: []int{1, 2, 3, 4}, + target: 1, + expected: true, + }, + { + name: "02 not contains for int type", + source: []int{1, 2, 3, 4}, + target: 0, + expected: false, + }, + { + name: "03 contains for string type", + source: []string{"1", "2", "3", "4"}, + target: "1", + expected: true, + }, + { + name: "04 not contains for string type", + source: []string{"1", "2", "3", "4"}, + target: "0", + expected: false, + }, + { + name: "05 empty source slice", + source: []int{}, + target: 1, + expected: false, + }, + } +} + +func TestRemove(t *testing.T) { + for _, tt := range buildRemoveTestCase() { + t.Run(tt.name, func(t *testing.T) { + switch s1 := tt.source.(type) { + case []int: + s2 := tt.target.(int) + result := Remove(s1, s2) + expected := tt.expected.([]int) + if !reflect.DeepEqual(result, expected) { + t.Errorf("Contains() = %v, 
want %v", result, expected) + } + case []string: + s2 := tt.target.(string) + result := Remove(s1, s2) + expected := tt.expected.([]string) + if !reflect.DeepEqual(result, expected) { + t.Errorf("RemoveElementsNotInSecond() = %v, want %v", result, expected) + } + default: + t.Errorf("unsupported type") + } + }) + } +} + +func buildRemoveTestCase() []struct { + name string + source interface{} + target interface{} + expected interface{} +} { + return []struct { + name string + source interface{} + target interface{} + expected interface{} + }{ + { + name: "01 contains for int type", + source: []int{1, 2, 3, 4}, + target: 1, + expected: []int{2, 3, 4}, + }, + { + name: "02 not contains for int type", + source: []int{1, 2, 3, 4}, + target: 0, + expected: []int{1, 2, 3, 4}, + }, + { + name: "03 contains for string type", + source: []string{"1", "2", "3", "4"}, + target: "1", + expected: []string{"2", "3", "4"}, + }, + { + name: "04 not contains for string type", + source: []string{"1", "2", "3", "4"}, + target: "0", + expected: []string{"1", "2", "3", "4"}, + }, + { + name: "05 empty source slice", + source: []int{}, + target: 1, + expected: []int{}, + }, + } +} + +func buildRemoveElementsNotInSecondTestCase() []struct { + name string + slice1 interface{} + slice2 interface{} + expected interface{} +} { + return []struct { + name string + slice1 interface{} + slice2 interface{} + expected interface{} + }{ + { + name: "01 Basic functionality - integer slices with partial overlap", + slice1: []int{1, 2, 3, 4}, + slice2: []int{2, 4, 6, 8}, + expected: []int{2, 4}, + }, + { + name: "02 Empty first slice", + slice1: []int{}, + slice2: []int{1, 2, 3}, + expected: []int{}, + }, + { + name: "03 Empty second slice", + slice1: []int{1, 2, 3}, + slice2: []int{}, + expected: []int{}, + }, + { + name: "04 Both slices empty", + slice1: []int{}, + slice2: []int{}, + expected: []int{}, + }, + { + name: "05 No intersection between slices", + slice1: []int{1, 2, 3}, + slice2: []int{4, 
5, 6}, + expected: []int{}, + }, + { + name: "06 String type test", + slice1: []string{"1", "2", "3"}, + slice2: []string{"2", "3", "4"}, + expected: []string{"2", "3"}, + }, + } +} + +func TestRemoveElementsNotInSecond(t *testing.T) { + for _, tt := range buildRemoveElementsNotInSecondTestCase() { + t.Run(tt.name, func(t *testing.T) { + switch s1 := tt.slice1.(type) { + case []int: + s2 := tt.slice2.([]int) + expected := tt.expected.([]int) + result := RemoveElementsNotInSecond(s1, s2) + if !reflect.DeepEqual(result, expected) { + t.Errorf("RemoveElementsNotInSecond() = %v, want %v", result, expected) + } + case []string: + s2 := tt.slice2.([]string) + expected := tt.expected.([]string) + result := RemoveElementsNotInSecond(s1, s2) + if !reflect.DeepEqual(result, expected) { + t.Errorf("RemoveElementsNotInSecond() = %v, want %v", result, expected) + } + default: + t.Errorf("unsupported type") + } + }) + } +} + +func buildRemoveEleSliTestCase() []struct { + name string + source interface{} + target interface{} + expected interface{} +} { + return []struct { + name string + source interface{} + target interface{} + expected interface{} + }{ + { + name: "01 int type", + source: []int{1, 2, 3, 4, 5}, + target: []int{2, 4}, + expected: []int{1, 3, 5}, + }, + { + name: "02 source is empty for int type", + source: []int{}, + target: []int{1, 2}, + expected: []int{}, + }, + { + name: "03 target is empty for int type", + source: []int{1, 2, 3}, + target: []int{}, + expected: []int{1, 2, 3}, + }, + { + name: "04 source and target are both empty for int type", + source: []int{}, + target: []int{}, + expected: []int{}, + }, + { + name: "05 string type", + source: []string{"a", "b", "c", "d"}, + target: []string{"b", "d"}, + expected: []string{"a", "c"}, + }, + } +} + +func TestRemoveEleSli(t *testing.T) { + for _, tt := range buildRemoveEleSliTestCase() { + t.Run(tt.name, func(t *testing.T) { + switch s1 := tt.source.(type) { + case []int: + s2 := tt.target.([]int) + expected 
:= tt.expected.([]int) + result := RemoveEleSli(s1, s2) + if !reflect.DeepEqual(result, expected) { + t.Errorf("RemoveEleSli() = %v, want %v", result, expected) + } + case []string: + s2 := tt.target.([]string) + expected := tt.expected.([]string) + result := RemoveEleSli(s1, s2) + if !reflect.DeepEqual(result, expected) { + t.Errorf("RemoveEleSli() = %v, want %v", result, expected) + } + default: + t.Errorf("unsupported type") + } + }) + } +} + +func buildRemoveDuplicatesCase() []struct { + name string + input interface{} + expected interface{} +} { + return []struct { + name string + input interface{} + expected interface{} + }{ + { + name: "01 empty slice for int type", + input: []int{}, + expected: []int{}, + }, + { + name: "02 no duplicates for int type", + input: []int{1, 2, 3}, + expected: []int{1, 2, 3}, + }, + { + name: "03 with duplicates for int type", + input: []int{1, 2, 2, 3, 1, 4}, + expected: []int{1, 2, 3, 4}, + }, + { + name: "04 with duplicates for string type", + input: []string{"1", "3", "3", "4"}, + expected: []string{"1", "3", "4"}, + }, + } +} + +func TestRemoveDuplicates(t *testing.T) { + for _, tt := range buildRemoveDuplicatesCase() { + t.Run(tt.name, func(t *testing.T) { + switch s1 := tt.input.(type) { + case []int: + expected := tt.expected.([]int) + result := RemoveDuplicates(s1) + if !reflect.DeepEqual(result, expected) { + t.Errorf("RemoveDuplicates() = %v, want %v", result, expected) + } + case []string: + expected := tt.expected.([]string) + result := RemoveDuplicates(s1) + if !reflect.DeepEqual(result, expected) { + t.Errorf("RemoveDuplicates() = %v, want %v", result, expected) + } + default: + t.Errorf("unsupported type") + } + }) + } +} + +func TestCheckSliceSupport(t *testing.T) { + convey.Convey("test TestCheckSliceSupport, check ok", t, func() { + elements := []int64{1, 2} + expects := []int64{1, 2, 3} + err := CheckSliceSupport(elements, expects) + convey.So(err, convey.ShouldBeNil) + }) + convey.Convey("test 
TestCheckSliceSupport, check fail", t, func() { + elements := []int64{1, 2, 4} + expects := []int64{1, 2, 3} + err := CheckSliceSupport(elements, expects) + convey.So(err, convey.ShouldNotBeNil) + }) +} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/strings.go b/mind-cluster/component/ascend-common/common-utils/utils/strings.go new file mode 100644 index 0000000..c3d98aa --- /dev/null +++ b/mind-cluster/component/ascend-common/common-utils/utils/strings.go @@ -0,0 +1,75 @@ +/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
const (
	// maskLen is the number of leading characters replaced by ReplacePrefix
	maskLen = 2
)

// ReplacePrefix replaces the first maskLen characters of source with prefix; an empty prefix
// defaults to "****". When source has maskLen characters or fewer, only prefix is returned.
// Characters are counted as runes, so multi-byte UTF-8 input is handled correctly.
func ReplacePrefix(source, prefix string) string {
	if prefix == "" {
		prefix = "****"
	}
	runes := []rune(source)
	if len(runes) <= maskLen {
		return prefix
	}
	return prefix + string(runes[maskLen:])
}

// MaskPrefix mask string prefix with ****
func MaskPrefix(source string) string {
	return ReplacePrefix(source, "")
}

// GetSha256Code returns the 32-byte SHA-256 digest of data. If writing to the hash fails,
// the error is printed and nil is returned.
func GetSha256Code(data []byte) []byte {
	hash256 := sha256.New()
	if _, err := hash256.Write(data); err != nil {
		fmt.Println(err)
		return nil
	}
	return hash256.Sum(nil)
}

// ReverseString returns s with its runes in reverse order
func ReverseString(s string) string {
	runes := []rune(s)
	for start, end := 0, len(runes)-1; start < end; start, end = start+1, end-1 {
		runes[start], runes[end] = runes[end], runes[start]
	}
	return string(runes)
}

// IsDigitString reports whether s is non-empty and consists only of digits. Digits are
// recognised with unicode.IsDigit, so decimal digits outside ASCII are accepted as well.
func IsDigitString(s string) bool {
	if s == "" {
		return false
	}
	for _, c := range s {
		if !unicode.IsDigit(c) {
			return false
		}
	}
	return true
}
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package utils provides the util func +package utils + +import ( + "testing" + + "github.com/smartystreets/goconvey/convey" +) + +const byteLength = 32 + +func TestReplacePrefix(t *testing.T) { + convey.Convey("relative path", t, func() { + path := ReplacePrefix("./testdata/cert/ca.crt", "****") + convey.So(path, convey.ShouldEqual, "****testdata/cert/ca.crt") + }) + convey.Convey("abconvey.Solute path", t, func() { + path := ReplacePrefix("/testdata/cert/ca.crt", "****") + convey.So(path, convey.ShouldEqual, "****estdata/cert/ca.crt") + }) + convey.Convey("path length less than 2", t, func() { + path := ReplacePrefix("/", "****") + convey.So(path, convey.ShouldEqual, "****") + }) + convey.Convey("empty string", t, func() { + path := ReplacePrefix("", "****") + convey.So(path, convey.ShouldEqual, "****") + }) + +} + +func TestMaskPrefix(t *testing.T) { + convey.Convey("relative path", t, func() { + path := MaskPrefix("./testdata/cert/ca.crt") + convey.So(path, convey.ShouldEqual, "****testdata/cert/ca.crt") + }) + convey.Convey("abconvey.Solute path", t, func() { + path := MaskPrefix("/testdata/cert/ca.crt") + convey.So(path, convey.ShouldEqual, "****estdata/cert/ca.crt") + }) + convey.Convey("path length less than 2", t, func() { + path := MaskPrefix("/") + convey.So(path, convey.ShouldEqual, "****") + }) + convey.Convey("empty string", t, func() { + path := MaskPrefix("") + convey.So(path, convey.ShouldEqual, "****") + }) + +} + +func TestGetSha256Code(t *testing.T) { + convey.Convey("test sha256", t, func() { + hashs := GetSha256Code([]byte("this is a 
test sentence")) + convey.So(len(hashs), convey.ShouldEqual, byteLength) + }) +} + +func TestIsDigitString(t *testing.T) { + convey.Convey("test IsDigitString", t, func() { + convey.Convey("case IsDigitString is true", func() { + str := "123" + convey.ShouldBeTrue(IsDigitString(str)) + }) + convey.Convey("case IsDigitString is false", func() { + str := "123a" + convey.ShouldBeFalse(IsDigitString(str)) + }) + }) +} diff --git a/mind-cluster/component/ascend-common/devmanager/a310mgr.go b/mind-cluster/component/ascend-common/devmanager/a310mgr.go new file mode 100644 index 0000000..081f167 --- /dev/null +++ b/mind-cluster/component/ascend-common/devmanager/a310mgr.go @@ -0,0 +1,25 @@ +/* Copyright(C) 2021. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package devmanager this Ascend310 device manager +package devmanager + +import ( + "ascend-common/devmanager/dcmi" +) + +// A310Manager Ascend310 device manager +type A310Manager struct { + dcmi.DcManager +} diff --git a/mind-cluster/component/ascend-common/devmanager/a310pmgr.go b/mind-cluster/component/ascend-common/devmanager/a310pmgr.go new file mode 100644 index 0000000..b32d1fa --- /dev/null +++ b/mind-cluster/component/ascend-common/devmanager/a310pmgr.go @@ -0,0 +1,35 @@ +/* Copyright(C) 2021. Huawei Technologies Co.,Ltd. All rights reserved. 
+ Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package devmanager this Ascend310P device manager +package devmanager + +import ( + "ascend-common/devmanager/dcmi" +) + +// A310PManager Ascend310P device manager +type A310PManager struct { + dcmi.DcManager +} + +// DcGetDevicePowerInfo query power by mcu interface for 310P +func (d *A310PManager) DcGetDevicePowerInfo(cardID, deviceID int32) (float32, error) { + return d.DcGetMcuPowerInfo(cardID) +} + +// DcGetMcuPowerInfo this function is only for Ascend310P +func (d *A310PManager) DcGetMcuPowerInfo(cardID int32) (float32, error) { + return dcmi.FuncDcmiMcuGetPowerInfo(cardID) +} diff --git a/mind-cluster/component/ascend-common/devmanager/a910mgr.go b/mind-cluster/component/ascend-common/devmanager/a910mgr.go new file mode 100644 index 0000000..1bb2beb --- /dev/null +++ b/mind-cluster/component/ascend-common/devmanager/a910mgr.go @@ -0,0 +1,31 @@ +/* Copyright(C) 2021. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package devmanager this Ascend910 device manager +package devmanager + +import ( + "ascend-common/devmanager/common" + "ascend-common/devmanager/dcmi" +) + +// A910Manager Ascend910 device manager +type A910Manager struct { + dcmi.DcManager +} + +// DcGetHbmInfo get HBM information, only for Ascend910 +func (d *A910Manager) DcGetHbmInfo(cardID, deviceID int32) (*common.HbmInfo, error) { + return dcmi.FuncDcmiGetDeviceHbmInfo(cardID, deviceID) +} diff --git a/mind-cluster/component/ascend-common/devmanager/common/constants.go b/mind-cluster/component/ascend-common/devmanager/common/constants.go new file mode 100644 index 0000000..e39ddac --- /dev/null +++ b/mind-cluster/component/ascend-common/devmanager/common/constants.go @@ -0,0 +1,272 @@ +/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
// DeviceType pairs a numeric device type code with its display name
type DeviceType struct {
	// Code is the numeric device type code
	Code int32
	// Name is the device type name
	Name string
}

var (
	// ProfilingTime for getting PCIe bandwidth
	ProfilingTime int

	// HccsBWProfilingTime for getting hccs bandwidth
	HccsBWProfilingTime int

	// a3BoardIds is the set of A3 board IDs
	a3BoardIds = sets.NewInt32(A900A3SuperPodBin1BoardId, A900A3SuperPodBin2BoardId,
		A900A3SuperPodBin3BoardId, A800IA3BoardId)

	// a900A3SuperPodMainBoardIds is the set of A900 A3 Super Pod main board IDs
	a900A3SuperPodMainBoardIds = sets.NewInt32(A900A3SuperPodMainBoardId1, A900A3SuperPodMainBoardId2)

	// a9000A3SuperPodMainBoardIds is the set of A9000 A3 Super Pod main board IDs
	a9000A3SuperPodMainBoardIds = sets.NewInt32(A9000A3SuperPodMainBoardId1, A9000A3SuperPodMainBoardId2)
)

// DeviceType values for utilization queries
var (
	// AICore Ascend310 & Ascend910
	AICore = DeviceType{Code: 2, Name: "AICore"}
	// HbmUtilization utilization rate of hbm
	HbmUtilization = DeviceType{Code: 6, Name: "Hbm"}
	// VectorCore Ascend310P
	VectorCore = DeviceType{Code: 12, Name: "VectorCore"}
	// Overall is the overall utilization rate of the NPU
	Overall = DeviceType{Code: 13, Name: "Overall"}
)

// DeviceType values for frequency queries
var (
	// AICoreCurrentFreq Ascend310 & Ascend910 & Ascend910B & Ascend310P
	AICoreCurrentFreq = DeviceType{Code: 7, Name: "AICore Current"}
)

const (
	// Success for interface return code
	Success = 0
	// DeviceNotReadyErrCodeStr for dcmi interface device not ready err code string
	DeviceNotReadyErrCodeStr = "-8012"
	// DeviceNotReadyErrCode for dcmi interface device not ready err code
	DeviceNotReadyErrCode = -8012
	// CardDropFaultCode card drop fault code
	CardDropFaultCode = 0x40F84E00
	// RetError return error when the function failed
	RetError = -1
	// Percent constant of 100
	Percent = 100
	// MaxErrorCodeCount number of error codes
	MaxErrorCodeCount = 128
	// UnRetError is the error value for unsigned int results (math.MaxUint32)
	UnRetError = math.MaxUint32
	// Abnormal is the status string for an abnormal state
	Abnormal = "Abnormal"
	// ChannelStateOk means out band channel is ok for resetting
	ChannelStateOk = 1

	// HiAIMaxCardID max card id for Ascend chip
	HiAIMaxCardID = math.MaxInt32

	// HiAIMaxCardNum max card number
	HiAIMaxCardNum = 64

	// HiAIMaxDeviceNum max device number
	HiAIMaxDeviceNum = 4

	// NpuType present npu chip
	NpuType = 0

	// ReduceOnePercent is the factor 0.01 used in calculations
	ReduceOnePercent = 0.01
	// ReduceTenth is the factor 0.1 used in calculations
	ReduceTenth = 0.1
	// DefaultTemperatureWhenQueryFailed when get temperature failed, use this value
	DefaultTemperatureWhenQueryFailed = -275

	// Ascend310P ascend 310P chip
	Ascend310P = "Ascend310P"
	// Ascend910 ascend 910 chip
	Ascend910 = "Ascend910"
	// Ascend910B ascend 910B chip
	Ascend910B = "Ascend910B"
	// Ascend910A3 ascend 910A3 chip
	Ascend910A3 = "Ascend910A3"
	// Atlas200ISoc 200 soc env
	Atlas200ISoc = "Atlas 200I SoC A1"

	// DcmiApiTimeout dcmi interface timeout seconds
	DcmiApiTimeout = 1

	// SubscribeAllDevice subscribe all device ID
	SubscribeAllDevice = -1
	// MinVDevID min value of virtual device id
	MinVDevID = 100
	// MaxVDevID max value of virtual device id
	MaxVDevID = 1124

	// InvalidID invalid ID
	InvalidID = 0xffffffff

	// FailedMetricValue for failed metric value
	FailedMetricValue = -1

	// FailedValue for failed value
	FailedValue = 0xffffffff

	// MaxErrorCodeLen max length of error code for Prometheus
	MaxErrorCodeLen = 10
)

const (
	// BootStartFinish chip hot reset finish
	BootStartFinish = 16
)

const (
	// FaultRecover device fault recover
	FaultRecover = int8(0)
	// FaultOccur device fault occur
	FaultOccur = int8(1)
	// FaultOnce once device fault
	FaultOnce = int8(2)
)
// MaxErrorCodeCount number of error codes + MaxErrorCodeCount = 128 + // UnRetError return unsigned int error + UnRetError = math.MaxUint32 + // Abnormal status of Abnormal + Abnormal = "Abnormal" + // ChannelStateOk means out band channel is ok for resetting + ChannelStateOk = 1 + + // HiAIMaxCardID max card id for Ascend chip + HiAIMaxCardID = math.MaxInt32 + + // HiAIMaxCardNum max card number + HiAIMaxCardNum = 64 + + // HiAIMaxDeviceNum max device number + HiAIMaxDeviceNum = 4 + + // NpuType present npu chip + NpuType = 0 + + // ReduceOnePercent for calculation reduce one percent + ReduceOnePercent = 0.01 + // ReduceTenth for calculation reduce one tenth + ReduceTenth = 0.1 + // DefaultTemperatureWhenQueryFailed when get temperature failed, use this value + DefaultTemperatureWhenQueryFailed = -275 + + // Ascend310P ascend 310P chip + Ascend310P = "Ascend310P" + // Ascend910 ascend 910 chip + Ascend910 = "Ascend910" + // Ascend910B ascend 910B chip + Ascend910B = "Ascend910B" + // Ascend910A3 ascend Ascend910A3 chip + Ascend910A3 = "Ascend910A3" + // Atlas200ISoc 200 soc env + Atlas200ISoc = "Atlas 200I SoC A1" + + // DcmiApiTimeout dcmi interface timeout seconds + DcmiApiTimeout = 1 + + // SubscribeAllDevice subscribe all device ID + SubscribeAllDevice = -1 + // MinVDevID min value of virtual device id + MinVDevID = 100 + // MaxVDevID max value of virtual device id + MaxVDevID = 1124 + + // InvalidID invalid ID + InvalidID = 0xffffffff + + // FailedMetricValue for failed metric value + FailedMetricValue = -1 + + // FailedValue for failed value + FailedValue = 0xffffffff + + // MaxErrorCodeLen max length of error code for Prometheus + MaxErrorCodeLen = 10 +) + +const ( + // BootStartFinish chip hot reset finish + BootStartFinish = 16 +) + +const ( + // FaultRecover device fault recover + FaultRecover = int8(0) + // FaultOccur device fault occur + FaultOccur = int8(1) + // FaultOnce once device fault + FaultOnce = int8(2) +) + +const ( + // AMPMode for AMP chip 
work mode + AMPMode = "AMP" + // SMPMode for SMP chip work mode + SMPMode = "SMP" + + // NetworkInit init status + NetworkInit = 6 + // NetworkSuccess chip network is healthy + NetworkSuccess = 0 + + // MaxProcNum process number in device side + MaxProcNum = 32 + // UnitMB MB + UnitMB float64 = 1024 * 1024 + + // Chip910 chip name 910 + Chip910 = "910" + + // A300IA2BoardId board id of A300I A2 and 910proB + A300IA2BoardId = 0x28 + + // A300IA2GB64BoardId board id of A300I A2 64GB + A300IA2GB64BoardId = 0x29 + + // A900A3SuperPodBin1BoardId board id of A900/A9000 A3 SuperPod Bin1 + A900A3SuperPodBin1BoardId = 0xb0 + + // A900A3SuperPodBin2BoardId board id of A900/A9000 A3 SuperPod Bin2 + A900A3SuperPodBin2BoardId = 0xb1 + + // A900A3SuperPodBin3BoardId board id of A900/A9000 A3 SuperPod Bin3 + A900A3SuperPodBin3BoardId = 0xb2 + + // A800IA3BoardId board id of A800I A3 + A800IA3BoardId = 0xb3 + + // A900A3SuperPodMainBoardId1 board id of A900 A3 SuperPod MainBoard1 + A900A3SuperPodMainBoardId1 = 0x18 + + // A900A3SuperPodMainBoardId2 board id of A900 A3 SuperPod MainBoard2 + A900A3SuperPodMainBoardId2 = 0x19 + + // A800IA3MainBoardId A800I A3 MainBoardId + A800IA3MainBoardId = 0x14 + + // A9000A3SuperPodMainBoardId1 board id of A9000 A3 SuperPod MainBoard1 + A9000A3SuperPodMainBoardId1 = 0x1C + + // A9000A3SuperPodMainBoardId2 board id of A9000 A3 SuperPod MainBoard2 + A9000A3SuperPodMainBoardId2 = 0x1D +) + +// log limit domains for metrics +const ( + // DomainForLogicIdErr domain for faild to get cardId and deviceId by logicID + DomainForLogicIdErr = "logicID" +) + +// DcmiDeviceType used to represent the dcmi device type +type DcmiDeviceType int32 + +const ( + // DcmiDeviceTypeDDR represents the component type DCMI_DEVICE_TYPE_DDR + DcmiDeviceTypeDDR DcmiDeviceType = 0 + // DcmiDeviceTypeSRAM represents the component type DCMI_DEVICE_TYPE_SRAM + DcmiDeviceTypeSRAM DcmiDeviceType = 1 + // DcmiDeviceTypeHBM represents the component type DCMI_DEVICE_TYPE_HBM + 
DcmiDeviceTypeHBM DcmiDeviceType = 2 + // DcmiDeviceTypeNPU represents the component type DCMI_DEVICE_TYPE_NPU + DcmiDeviceTypeNPU DcmiDeviceType = 3 + // DcmiDeviceTypeNONE represents the component type DCMI_DEVICE_TYPE_NONE + DcmiDeviceTypeNONE DcmiDeviceType = 0xff +) + +const ( + // ErrMsgInitCardListFailed is used where initialization of the card list fails + ErrMsgInitCardListFailed = "get card list failed for init" + // ErrMsgGetBoardInfoFailed is used where there is a failure in getting board info + ErrMsgGetBoardInfoFailed = "get board info failed, no card found" +) + +const ( + // MaxHccspingMeshAddr is the max number of hccsping addresses + MaxHccspingMeshAddr = 1024 + // MinPktSize is the min packet size + MinPktSize = 1792 + // MaxPktSize is the max packet size + MaxPktSize = 3000 + // MinPktSendNum is the min packet send number + MinPktSendNum = 1 + // MaxPktSendNum is the max packet send number + MaxPktSendNum = 1000 + // MinPktInterval is the min packet interval + MinPktInterval = 1 + // MaxPktInterval is the max packet interval + MaxPktInterval = 1000 + // MinTaskInterval is the min task interval + MinTaskInterval = 1 + // MaxTaskInterval is the max task interval + MaxTaskInterval = 60 + // InternalPingMeshTaskID is the inner ping mesh task id + InternalPingMeshTaskID uint = 0 + // ExternalPingMeshTaskID is the outer ping mesh task id + ExternalPingMeshTaskID uint = 1 + // DefaultPingMeshPortID is the default ping mesh port + DefaultPingMeshPortID = 0 + // DefaultPktSize is the default packet size + DefaultPktSize = 1792 + // DefaultPktSendNum is the default packet send number + DefaultPktSendNum = 10 + // DefaultPktInterval is the default packet interval + DefaultPktInterval = 10 + // DefaultTimeout is the default timeout + DefaultTimeout = 1 +) diff --git a/mind-cluster/component/ascend-common/devmanager/common/types.go b/mind-cluster/component/ascend-common/devmanager/common/types.go new file mode 100644 index 0000000..870c716 --- /dev/null +++ 
b/mind-cluster/component/ascend-common/devmanager/common/types.go @@ -0,0 +1,435 @@ +/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package common define common types +package common + +// MemoryInfo memory information struct +type MemoryInfo struct { + MemorySize uint64 `json:"memory_size"` + MemoryAvailable uint64 `json:"memory_available"` + Frequency uint32 `json:"memory_frequency"` + Utilization uint32 `json:"memory_utilization"` +} + +// HbmInfo high bandwidth memory info +type HbmInfo struct { + MemorySize uint64 `json:"memory_size"` // total size,MB + Frequency uint32 `json:"hbm_frequency"` // frequency MHz + Usage uint64 `json:"memory_usage"` // memory usage,MB + Temp int32 `json:"hbm_temperature"` // temperature + BandWidthUtilRate uint32 `json:"hbm_bandwidth_util"` // bandwidth utilization +} + +// HbmAggregateInfo more comprehensive high bandwidth memory information with ecc information +type HbmAggregateInfo struct { + *HbmInfo + ECCInfo *ECCInfo `json:"hbm_ecc_info"` // ECC information +} + +// ChipInfo chip info +type ChipInfo struct { + Type string `json:"chip_type"` + Name string `json:"chip_name"` + Version string `json:"chip_version"` + NpuName string `json:"npu_name"` + AICoreCnt int `json:"aicore_cnt"` +} + +// ChipBaseInfo all id of chip +type ChipBaseInfo struct { + PhysicID int32 + LogicID int32 + CardID int32 + DeviceID int32 +} + +// CgoCreateVDevOut create virtual device 
output info +type CgoCreateVDevOut struct { + VDevID uint32 + PcieBus uint32 + PcieDevice uint32 + PcieFunc uint32 + VfgID uint32 + Reserved []uint8 +} + +// CgoCreateVDevRes create virtual device input info +type CgoCreateVDevRes struct { + VDevID uint32 + VfgID uint32 + TemplateName string + Reserved []uint8 +} + +// CgoBaseResource base resource info +type CgoBaseResource struct { + Token uint64 + TokenMax uint64 + TaskTimeout uint64 + VfgID uint32 + VipMode uint8 + Reserved []uint8 +} + +// CgoComputingResource compute resource info +type CgoComputingResource struct { + // accelator resource + Aic float32 + Aiv float32 + Dsa uint16 + Rtsq uint16 + Acsq uint16 + Cdqm uint16 + CCore uint16 + Ffts uint16 + Sdma uint16 + PcieDma uint16 + + // memory resource, MB as unit + MemorySize uint64 + + // id resource + EventID uint32 + NotifyID uint32 + StreamID uint32 + ModelID uint32 + + // cpu resource + TopicScheduleAicpu uint16 + HostCtrlCPU uint16 + HostAicpu uint16 + DeviceAicpu uint16 + TopicCtrlCPUSlot uint16 + + Reserved []uint8 +} + +// CgoMediaResource media resource info +type CgoMediaResource struct { + Jpegd float32 + Jpege float32 + Vpc float32 + Vdec float32 + Pngd float32 + Venc float32 + Reserved []uint8 +} + +// CgoVDevQueryInfo virtual resource special info +type CgoVDevQueryInfo struct { + Name string + Status uint32 + IsContainerUsed uint32 + Vfid uint32 + VfgID uint32 + ContainerID uint64 + Base CgoBaseResource + Computing CgoComputingResource + Media CgoMediaResource +} + +// CgoVDevQueryStru virtual resource info +type CgoVDevQueryStru struct { + VDevID uint32 + QueryInfo CgoVDevQueryInfo +} + +// CgoSocFreeResource soc free resource info +type CgoSocFreeResource struct { + VfgNum uint32 + VfgBitmap uint32 + Base CgoBaseResource + Computing CgoComputingResource + Media CgoMediaResource +} + +// CgoSocTotalResource soc total resource info +type CgoSocTotalResource struct { + VDevNum uint32 + VDevID []uint32 + VfgNum uint32 + VfgBitmap uint32 + Base 
CgoBaseResource + Computing CgoComputingResource + Media CgoMediaResource +} + +// CgoSuperPodInfo super pod info +type CgoSuperPodInfo struct { + SdId uint32 + ScaleType uint32 + SuperPodId uint32 + ServerId uint32 + Reserve []uint32 +} + +// VirtualDevInfo virtual device infos +type VirtualDevInfo struct { + TotalResource CgoSocTotalResource + FreeResource CgoSocFreeResource + VDevInfo []CgoVDevQueryStru + VDevActivityInfo []VDevActivityInfo +} + +// DevFaultInfo device's fault info +type DevFaultInfo struct { + EventID int64 + LogicID int32 + ModuleType int8 // ModuleType prototype is dcmi node_type + ModuleID int8 // ModuleID prototype is dcmi node_id + SubModuleType int8 // SubModuleType prototype is dcmi sub_node_type + SubModuleID int8 // SubModuleID prototype is dcmi sub_node_id + Severity int8 + Assertion int8 + AlarmRaisedTime int64 +} + +// DevProcessInfo device process info +type DevProcessInfo struct { + DevProcArray []DevProcInfo + ProcNum int32 +} + +// DevProcInfo process info in device side +type DevProcInfo struct { + Pid int32 + // the total amount of memory occupied by the device side OS and allocated by the business, unit is MB + MemUsage float64 +} + +// BoardInfo board info of device +type BoardInfo struct { + BoardId uint32 + PcbId uint32 + BomId uint32 + SlotId uint32 +} + +// VDevActivityInfo vNPU activity info for 310P +type VDevActivityInfo struct { + VDevID uint32 + VDevAiCoreRate uint32 + VDevTotalMem uint64 + VDevUsedMem uint64 + VDevAiCore float64 + IsVirtualDev bool +} + +// PCIEBwStat contains pcie bandwidth +type PCIEBwStat struct { + PcieRxPBw PcieStatValue + PcieRxNPBw PcieStatValue + PcieRxCPLBw PcieStatValue + PcieTxPBw PcieStatValue + PcieTxNPBw PcieStatValue + PcieTxCPLBw PcieStatValue +} + +// PcieStatValue pcie stat three value, like [min_bw,max_bw,avg_bw] +type PcieStatValue struct { + PcieMinBw int32 + PcieMaxBw int32 + PcieAvgBw int32 +} + +// DeviceNetworkHealth dcmi_get_device_network_health api return value +type 
DeviceNetworkHealth struct { + HealthCode uint32 + RetCode int32 +} + +// ECCInfo dcmi_get_device_ecc_info api return value +type ECCInfo struct { + EnableFlag int32 + SingleBitErrorCnt int64 + DoubleBitErrorCnt int64 + TotalSingleBitErrorCnt int64 + TotalDoubleBitErrorCnt int64 + SingleBitIsolatedPagesCnt int64 + DoubleBitIsolatedPagesCnt int64 +} + +// NpuNetInfo network info of npu +type NpuNetInfo struct { + // The optical info + OpticalInfo *OpticalInfo + // The transfer rate of network port + LinkSpeedInfo *LinkSpeedInfo + // Historical link statistics of network ports + LinkStatInfo *LinkStatInfo + // Statistics about packets + StatInfo *StatInfo + // Network port real-time bandwidth + BandwidthInfo *BandwidthInfo + // LinkStatusInfo refers to the link state + LinkStatusInfo *LinkStatusInfo +} + +// BandwidthInfo contains network port real-time bandwidth +type BandwidthInfo struct { + // TxValue transform speed + TxValue float64 `json:"tx_value"` + // RxValue receive speed + RxValue float64 `json:"rx_value"` +} + +// HccsStatisticInfo contains hccs statistic info +type HccsStatisticInfo struct { + TxCnt []uint64 + RxCnt []uint64 + CrcErrCnt []uint64 + retryCnt []uint64 + reservedFieldCnt []uint64 +} + +// HccsBandwidthInfo contains hccs bandwidth info +type HccsBandwidthInfo struct { + ProfilingTime uint32 + TotalTxbw float64 + TotalRxbw float64 + TxBandwidth []float64 + RxBandwidth []float64 +} + +// SioCrcErrStatisticInfo contains sio crc error statistic info +type SioCrcErrStatisticInfo struct { + TxErrCnt int64 + RxErrCnt int64 + Reserved []uint32 +} + +// StatInfo the statistics about packets +type StatInfo struct { + // Total number of pause frames received by the MAC + MacRxPauseNum float64 + // Total number of pause frames sent by MAC + MacTxPauseNum float64 + // Total number of PFC frames received by MAC + MacRxPfcPktNum float64 + // Total number of PFC frames sent by MAC + MacTxPfcPktNum float64 + // Total number of bad packets received by MAC + 
MacRxBadPktNum float64 + // Total number of bad packets sent by MAC + MacTxBadPktNum float64 + // The total number of packets received by the RoCE network card + RoceRxAllPktNum float64 + // The total number of packets sent by the RoCE network card + RoceTxAllPktNum float64 + // The number of bad packets received by the RoCE network card + RoceRxErrPktNum float64 + // The number of bad packets sent by the RoCE network card + RoceTxErrPktNum float64 + // The number of CNP type packets received by the RoCE network card + RoceRxCnpPktNum float64 + // The number of CNP type packets sent by the RoCE network card + RoceTxCnpPktNum float64 + // Number of RoCE network card retry messages + RoceNewPktRtyNum float64 + // Total number of bytes of bad packets sent by MAC + MacTxBadOctNum float64 + // Total number of bytes of bad packets received by MAC + MacRxBadOctNum float64 + // The number of unexpected ACK messages received by the RoCE network card + RoceUnexpectedAckNum float64 + // The number of out-of-order packets received by the RoCE network card + RoceOutOfOrderNum float64 + // The number of packets with domain segment verification errors received by the RoCE network card + RoceVerificationErrNum float64 + // The number of messages generated by abnormal QP connection status received by the RoCE network card + RoceQpStatusErrNum float64 + // The number of ecn + RoceEcnDBNum float64 + // The number of err info + MacRXFcsErrPktNum float64 +} + +// LinkStatInfo refers to the historical link statistics, including the times of link-up +type LinkStatInfo struct { + // The times of link-up + LinkUPNum float64 +} + +// LinkStatusInfo refers to the link state +type LinkStatusInfo struct { + // The state of link + LinkState string +} + +// LinkSpeedInfo the transfer rate of network port +type LinkSpeedInfo struct { + // The rate of network port + Speed float64 +} + +// OpticalInfo indicates the optical module information +type OpticalInfo struct { + // Optical module status, 
indicating whether it is in place (present) + OpticalState float64 + // Power sent by No.0 optical module + OpticalTxPower0 float64 + // Power sent by No.1 optical module + OpticalTxPower1 float64 + // Power sent by No.2 optical module + OpticalTxPower2 float64 + // Power sent by No.3 optical module + OpticalTxPower3 float64 + // Reception power of No.0 optical module + OpticalRxPower0 float64 + // Reception power of No.1 optical module + OpticalRxPower1 float64 + // Reception power of No.2 optical module + OpticalRxPower2 float64 + // Reception power of No.3 optical module + OpticalRxPower3 float64 + // Optical module voltage + OpticalVcc float64 + // Optical module temperature + OpticalTemp float64 +} + +// HccspingMeshOperate refers to the operation of hccsping mesh +type HccspingMeshOperate struct { + DstAddr string + PktSize int + PktSendNum int + PktInterval int + Timeout int + TaskInterval int + TaskId int +} + +// HccspingMeshInfo refers to the result of hccsping mesh +type HccspingMeshInfo struct { + DstAddr []string + SucPktNum []uint + FailPktNum []uint + MaxTime []int + MinTime []int + AvgTime []int + TP95Time []int + ReplyStatNum []int + PingTotalNum []int + DestNum int +} + +// ElabelInfo elabel information structure +type ElabelInfo struct { + ProductName string + Model string + Manufacturer string + ManufacturerDate string + SerialNumber string +} diff --git a/mind-cluster/component/ascend-common/devmanager/common/utils.go b/mind-cluster/component/ascend-common/devmanager/common/utils.go new file mode 100644 index 0000000..87e14df --- /dev/null +++ b/mind-cluster/component/ascend-common/devmanager/common/utils.go @@ -0,0 +1,305 @@ +/* Copyright(C) 2021. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package common this for util method +package common + +import ( + "fmt" + "math" + "regexp" + "strings" + + "ascend-common/api" + "ascend-common/common-utils/hwlog" +) + +var ( + reg910A = regexp.MustCompile(api.Ascend910APattern) + reg910B = regexp.MustCompile(api.Ascend910BPattern) + reg310P = regexp.MustCompile(api.Ascend310PPattern) +) + +// IsGreaterThanOrEqualInt32 check num range +func IsGreaterThanOrEqualInt32(num int64) bool { + if num >= int64(math.MaxInt32) { + return true + } + + return false +} + +// IsValidUtilizationRate valid utilization rate is 0-100 +func IsValidUtilizationRate(num uint32) bool { + if num > uint32(Percent) || num < 0 { + return false + } + + return true +} + +// IsValidChipInfo valid chip info is or not empty +func IsValidChipInfo(chip *ChipInfo) bool { + return chip.Name != "" || chip.Type != "" || chip.Version != "" +} + +// IsValidBoardInfo check whether the board info is valid +func IsValidBoardInfo(board *BoardInfo) bool { + return board.BoardId != InvalidID || board.PcbId != InvalidID || + board.BomId != InvalidID || board.SlotId != InvalidID +} + +// IsValidMainBoardInfo check whether the mainBoardId is valid +func IsValidMainBoardInfo(mainBoardId uint32) bool { + return mainBoardId != InvalidID +} + +// IsValidCardID valid card id +func IsValidCardID(cardID int32) bool { + // for cardID, please watch the maximum value of the driver is changed in the future version + return cardID >= 0 && cardID < HiAIMaxCardID +} + +// IsValidDeviceID valid device id +func IsValidDeviceID(deviceID int32) bool { + return deviceID 
>= 0 && deviceID < HiAIMaxDeviceNum +} + +// IsValidLogicIDOrPhyID valid logic id +func IsValidLogicIDOrPhyID(id int32) bool { + return id >= 0 && id < HiAIMaxCardNum*HiAIMaxDeviceNum +} + +// IsValidCardIDAndDeviceID check two params both needs meet the requirement +func IsValidCardIDAndDeviceID(cardID, deviceID int32) bool { + if !IsValidCardID(cardID) { + return false + } + + return IsValidDeviceID(deviceID) +} + +// IsValidDevNumInCard valid devNum in card +func IsValidDevNumInCard(num int32) bool { + return num > 0 && num <= HiAIMaxDeviceNum +} + +// IsValidVDevID valid vir device id +func IsValidVDevID(vDevID uint32) bool { + return vDevID >= MinVDevID && vDevID < MaxVDevID +} + +// IsValidPortID valid port id +func IsValidPortID(portID int) bool { + return portID == DefaultPingMeshPortID +} + +// IsValidTaskID valid task id +func IsValidTaskID(taskID uint) bool { + return taskID == InternalPingMeshTaskID || taskID == ExternalPingMeshTaskID +} + +// IsValidHccspingMeshOperate valid hccsping mesh operate +func IsValidHccspingMeshOperate(operate HccspingMeshOperate) error { + if len(operate.DstAddr) > MaxHccspingMeshAddr { + return fmt.Errorf("dst addr length %d is invalid, should not be greater than %d", len(operate.DstAddr), + MaxHccspingMeshAddr) + } + if operate.PktSize < MinPktSize || operate.PktSize > MaxPktSize { + return fmt.Errorf("pkt size %d is invalid, should be between %d and %d", operate.PktSize, MinPktSize, MaxPktSize) + } + if operate.PktSendNum < MinPktSendNum || operate.PktSendNum > MaxPktSendNum { + return fmt.Errorf("pkt send num %d is invalid, should be between %d and %d", operate.PktSendNum, + MinPktSendNum, MaxPktSendNum) + } + if operate.PktInterval < MinPktInterval || operate.PktInterval > MaxPktInterval { + return fmt.Errorf("pkt interval %d is invalid, should be between %d and %d", operate.PktInterval, + MinPktInterval, MaxPktInterval) + } + if operate.TaskInterval < MinTaskInterval || operate.TaskInterval > MaxTaskInterval { + return 
fmt.Errorf("task interval %d is invalid, should be between %d and %d", operate.TaskInterval, + MinTaskInterval, MaxTaskInterval) + } + if !IsValidTaskID(uint(operate.TaskId)) { + return fmt.Errorf("task id %d is invalid", operate.TaskId) + } + return nil +} + +// GetDeviceTypeByChipName get device type by chipName +func GetDeviceTypeByChipName(chipName string) string { + if reg310P.MatchString(chipName) { + return api.Ascend310P + } + if strings.Contains(chipName, api.Ascend310BNo) { + return api.Ascend310B + } + if strings.Contains(chipName, api.Ascend310No) { + return api.Ascend310 + } + if reg910B.MatchString(chipName) { + return api.Ascend910B + } + if reg910A.MatchString(chipName) { + return api.Ascend910A + } + return "" +} + +func get910TemplateNameList() map[string]struct{} { + return map[string]struct{}{"vir16": {}, "vir08": {}, "vir04": {}, "vir02": {}, "vir01": {}} +} + +func get910BTemplateNameList() map[string]struct{} { + return map[string]struct{}{ + "vir03_1c_8g": {}, "vir05_1c_8g": {}, "vir05_1c_16g": {}, + "vir06_1c_16g": {}, "vir10_3c_16g": {}, "vir10_3c_16g_nm": {}, + "vir10_3c_32g": {}, "vir10_4c_16g_m": {}, "vir12_3c_32g": {}} +} + +func get310PTemplateNameList() map[string]struct{} { + return map[string]struct{}{"vir04": {}, "vir02": {}, "vir01": {}, "vir04_3c": {}, "vir02_1c": {}, + "vir04_4c_dvpp": {}, "vir04_3c_ndvpp": {}} +} + +// IsValidTemplateName check template name meet the requirement +func IsValidTemplateName(devType, templateName string) bool { + isTemplateNameValid := false + switch devType { + case api.Ascend310P: + _, isTemplateNameValid = get310PTemplateNameList()[templateName] + case api.Ascend910A: + _, isTemplateNameValid = get910TemplateNameList()[templateName] + case api.Ascend910B: + _, isTemplateNameValid = get910BTemplateNameList()[templateName] + default: + } + return isTemplateNameValid +} + +// RemoveDuplicate remove duplicate device +func RemoveDuplicate(list *[]string) []string { + listValueMap := 
make(map[string]string, len(*list)) + var rmDupValueList []string + for _, value := range *list { + listValueMap[value] = value + } + for _, value := range listValueMap { + rmDupValueList = append(rmDupValueList, value) + } + return rmDupValueList +} + +// GetNpuName get npu name eg: name-type-version +func GetNpuName(chipInfo *ChipInfo) string { + if chipInfo == nil { + return "" + } + if len(chipInfo.Name) == 0 && len(chipInfo.Type) == 0 && len(chipInfo.Version) == 0 { + return "" + } + return fmt.Sprintf("%s-%s-%s", chipInfo.Name, chipInfo.Type, chipInfo.Version) +} + +// SetExternalParams transmit npu-exporter's startup parameters +func SetExternalParams(profilingTime int) { + ProfilingTime = profilingTime +} + +// SetHccsBWProfilingTime set hccs bw profiling time +func SetHccsBWProfilingTime(hccsbwProfilingTime int) { + HccsBWProfilingTime = hccsbwProfilingTime +} + +// DeepCopyChipInfo copy chip info deeply +func DeepCopyChipInfo(chipInfo *ChipInfo) *ChipInfo { + if chipInfo == nil { + return nil + } + + return &ChipInfo{ + Type: chipInfo.Type, + Name: chipInfo.Name, + Version: chipInfo.Version, + } +} + +// DeepCopyVDevActivityInfo copy VDevActivityInfo deeply +func DeepCopyVDevActivityInfo(vDevActivityInfo *VDevActivityInfo) *VDevActivityInfo { + if vDevActivityInfo == nil { + return nil + } + + return &VDevActivityInfo{ + VDevID: vDevActivityInfo.VDevID, + VDevAiCoreRate: vDevActivityInfo.VDevAiCoreRate, + VDevTotalMem: vDevActivityInfo.VDevTotalMem, + VDevUsedMem: vDevActivityInfo.VDevUsedMem, + VDevAiCore: vDevActivityInfo.VDevAiCore, + IsVirtualDev: vDevActivityInfo.IsVirtualDev, + } +} + +// DeepCopySlice Deep copy slice +func deepCopySlice(slice interface{}) interface{} { + + switch v := slice.(type) { + case []int: + newSlice := make([]int, len(v)) + copy(newSlice, v) + return newSlice + case []uint32: + newSlice := make([]uint32, len(v)) + copy(newSlice, v) + return newSlice + case []float64: + newSlice := make([]float64, len(v)) + copy(newSlice, v) 
+ return newSlice + default: + hwlog.RunLog.Warn("Unsupported slice type") + return slice + } +} + +// GetDevType get device type by chip name,boardId +func GetDevType(chipName string, boardId uint32) string { + var devType string + if Is910A3Chip(boardId) { + devType = api.Ascend910A3 + } else { + devType = GetDeviceTypeByChipName(chipName) + } + return devType +} + +// Is910A3Chip current chip is 910A3 or not,include A900A3 and A9000A3 +func Is910A3Chip(boardId uint32) bool { + return a3BoardIds.Has(int32(boardId)) +} + +// IsA900A3SuperPod current product is A900A3 super pod or not +func IsA900A3SuperPod(mainBoardId uint32) bool { + return a900A3SuperPodMainBoardIds.Has(int32(mainBoardId)) +} + +// IsA9000A3SuperPod current product is A9000A3 super pod or not +func IsA9000A3SuperPod(mainBoardId uint32) bool { + return a9000A3SuperPodMainBoardIds.Has(int32(mainBoardId)) +} + +// Is800IA3Chip current chip is 800IA3 or not +func Is800IA3Chip(mainBoardId uint32) bool { + return mainBoardId == A800IA3MainBoardId +} diff --git a/mind-cluster/component/ascend-common/devmanager/common/utils_test.go b/mind-cluster/component/ascend-common/devmanager/common/utils_test.go new file mode 100644 index 0000000..548a1c0 --- /dev/null +++ b/mind-cluster/component/ascend-common/devmanager/common/utils_test.go @@ -0,0 +1,163 @@ +/* Copyright(C) 2021. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package common + +import ( + "fmt" + "strings" + "testing" + + "github.com/smartystreets/goconvey/convey" +) + +// TestDeepCopyHccsBandwidthInfo TestDeepCopySlice +func TestDeepCopyHccsBandwidthInfo(t *testing.T) { + + convey.Convey("should copy a new []int", t, func() { + slice := []int{1, 2} + newSlice := deepCopySlice(slice) + convey.So(&newSlice, convey.ShouldNotEqual, &slice) + }) + + convey.Convey("should copy a new []int32", t, func() { + slice := []uint32{1, 2} + + newSlice := deepCopySlice(slice) + convey.So(&newSlice, convey.ShouldNotEqual, &slice) + }) + + convey.Convey("should copy a new []float64", t, func() { + slice := []float64{1, 2} + newSlice := deepCopySlice(slice) + convey.So(&newSlice, convey.ShouldNotEqual, &slice) + }) +} + +func TestIsValidPortID(t *testing.T) { + convey.Convey("Given a port ID", t, func() { + convey.Convey("01-When the port ID is invalid, should return false", func() { + portID1 := 1 + convey.So(IsValidPortID(portID1), convey.ShouldBeFalse) + }) + + convey.Convey("02-When the port ID is the default, should return true", func() { + portID3 := DefaultPingMeshPortID + convey.So(IsValidPortID(portID3), convey.ShouldBeTrue) + }) + }) +} + +func TestIsValidTaskID(t *testing.T) { + convey.Convey("Given a task ID", t, func() { + convey.Convey("01-When the task ID is valid, should return true", func() { + taskID1 := InternalPingMeshTaskID + convey.So(IsValidTaskID(taskID1), convey.ShouldBeTrue) + + taskID2 := ExternalPingMeshTaskID + convey.So(IsValidTaskID(taskID2), convey.ShouldBeTrue) + }) + + convey.Convey("02-When the task ID is invalid, should return false", func() { + const taskID3 = 3 + convey.So(IsValidTaskID(taskID3), convey.ShouldBeFalse) + }) + }) +} + +func defaultHccspingMeshOperate() HccspingMeshOperate { + return HccspingMeshOperate{ + DstAddr: "1111", + PktSize: MinPktSize, + PktSendNum: MinPktSendNum, + PktInterval: MinPktInterval, + TaskInterval: MinTaskInterval, + TaskId: int(InternalPingMeshTaskID), + } 
+} + +func check(op HccspingMeshOperate, expectedErr error) { + err := IsValidHccspingMeshOperate(op) + convey.So(err, convey.ShouldResemble, expectedErr) +} + +func expectedError(pattern string, current, min, max int) error { + return fmt.Errorf(pattern, current, min, max) +} + +func TestIsValidHccspingMeshOperate01(t *testing.T) { + convey.Convey("Given a pingmesh operate", t, func() { + op := defaultHccspingMeshOperate() + convey.Convey("01-When operation valid, should return nil", func() { + check(op, nil) + }) + var expectedErr error + convey.Convey("01-When the dst addr is invalid, should return error", func() { + op.DstAddr = strings.Repeat("a", MaxHccspingMeshAddr+1) + expectedErr = fmt.Errorf("dst addr length %d is invalid, should not be greater than %d", len(op.DstAddr), + MaxHccspingMeshAddr) + check(op, expectedErr) + }) + op.DstAddr = "1111" + convey.Convey("02-When the pkt size is invalid, should return error", func() { + pattern := "pkt size %d is invalid, should be between %d and %d" + op.PktSize = MinPktSize - 1 + check(op, expectedError(pattern, op.PktSize, MinPktSize, MaxPktSize)) + op.PktSize = MaxPktSize + 1 + check(op, expectedError(pattern, op.PktSize, MinPktSize, MaxPktSize)) + }) + op.PktSize = MinPktSize + convey.Convey("03-When the pkt send num is invalid, should return error", func() { + pattern := "pkt send num %d is invalid, should be between %d and %d" + op.PktSendNum = MinPktSendNum - 1 + check(op, expectedError(pattern, op.PktSendNum, MinPktSendNum, MaxPktSendNum)) + op.PktSendNum = MaxPktSendNum + 1 + check(op, expectedError(pattern, op.PktSendNum, MinPktSendNum, MaxPktSendNum)) + }) + op.TaskInterval = MinTaskInterval + convey.Convey("06-When the task id is invalid, should return error", func() { + op.TaskId = int(ExternalPingMeshTaskID) + 1 + expectedErr = fmt.Errorf("task id %d is invalid", op.TaskId) + check(op, expectedErr) + }) + }) +} + +func TestIsValidHccspingMeshOperate02(t *testing.T) { + convey.Convey("Given a pingmesh 
operate", t, func() { + op := defaultHccspingMeshOperate() + convey.Convey("04-When the pkt interval is invalid, should return error", func() { + pattern := "pkt interval %d is invalid, should be between %d and %d" + op.PktInterval = MinPktInterval - 1 + check(op, expectedError(pattern, op.PktInterval, MinPktInterval, MaxPktInterval)) + op.PktInterval = MaxPktInterval + 1 + check(op, expectedError(pattern, op.PktInterval, MinPktInterval, MaxPktInterval)) + }) + op.PktInterval = MinPktInterval + convey.Convey("05-When the task interval is invalid, should return error", func() { + pattern := "task interval %d is invalid, should be between %d and %d" + op.TaskInterval = MinTaskInterval - 1 + check(op, expectedError(pattern, op.TaskInterval, MinTaskInterval, MaxTaskInterval)) + op.TaskInterval = MaxTaskInterval + 1 + check(op, expectedError(pattern, op.TaskInterval, MinTaskInterval, MaxTaskInterval)) + }) + op.TaskInterval = MinTaskInterval + var expectedErr error + convey.Convey("06-When the task id is invalid, should return error", func() { + op.TaskId = int(ExternalPingMeshTaskID) + 1 + expectedErr = fmt.Errorf("task id %d is invalid", op.TaskId) + check(op, expectedErr) + }) + }) +} diff --git a/mind-cluster/component/ascend-common/devmanager/dcmi/constants.go b/mind-cluster/component/ascend-common/devmanager/dcmi/constants.go new file mode 100644 index 0000000..bd68af3 --- /dev/null +++ b/mind-cluster/component/ascend-common/devmanager/dcmi/constants.go @@ -0,0 +1,78 @@ +/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
type (
	// MainCmd main command enum
	MainCmd uint32

	// VDevMngSubCmd virtual device manager sub command
	VDevMngSubCmd uint32

	// DieType present chip die type
	DieType int32
)

// Buffer and array sizes used when exchanging data with the dcmi driver.
const (
	// dcmiMaxVdevNum is max number of vdevice, value is from driver specification
	dcmiMaxVdevNum = 32
	// dcmiMaxReserveNum is max number of reserve, value is from driver specification
	dcmiMaxReserveNum = 8
	// dcmiVDevResNameLen length of vnpu resource name
	dcmiVDevResNameLen = 16
	// dcmiHccsMaxPcsNum max pcs number for hccs
	dcmiHccsMaxPcsNum = 16

	maxChipNameLen = 32
	productTypeLen = 64
	dcmiVersionLen = 32
)

// Main command identifiers.
const (
	// MainCmdChipInf main cmd chip inf
	MainCmdChipInf MainCmd = 12
	// MainCmdHccs main cmd of hccs
	MainCmdHccs MainCmd = 16
	// MainCmdVDevMng virtual device manager
	MainCmdVDevMng MainCmd = 52
	// MainCmdSio SIO status between die
	MainCmdSio MainCmd = 56
)

// Sub command identifiers. All of them share the VDevMngSubCmd type, including
// the ones whose names start with Cinf, Sio and Hccs.
const (
	// VmngSubCmdGetVDevResource get virtual device resource info
	VmngSubCmdGetVDevResource VDevMngSubCmd = 0
	// VmngSubCmdGetTotalResource get total resource info
	VmngSubCmdGetTotalResource VDevMngSubCmd = 1
	// VmngSubCmdGetFreeResource get free resource info
	VmngSubCmdGetFreeResource VDevMngSubCmd = 2
	// VmngSubCmdGetVDevActivity get virtual device activity info
	VmngSubCmdGetVDevActivity VDevMngSubCmd = 5
	// CinfSubCmdGetSPodInfo get super pod info
	CinfSubCmdGetSPodInfo VDevMngSubCmd = 1
	// SioSubCmdCrcErrStatistics get SIO err statistics info
	SioSubCmdCrcErrStatistics VDevMngSubCmd = 0
	// HccsSubCmdGetStatisticInfo get statistic info
	HccsSubCmdGetStatisticInfo VDevMngSubCmd = 3
	// HccsSubCmdGetStatisticInfoU64 get statistic info in u64
	HccsSubCmdGetStatisticInfoU64 VDevMngSubCmd = 5
)

// Die identifiers.
const (
	// NDIE NDie ID, only Ascend910 has it
	NDIE DieType = 0
	// VDIE VDie ID, it can be the uuid of chip
	VDIE DieType = 1
	// DieIDCount die id array max length
	DieIDCount = 5
)

const (
	// ipAddrTypeV6 ip address type of IPv6
	ipAddrTypeV6 = 1

	agentdrvProfDataNum = 3
)
HccsSubCmdGetStatisticInfo get statistic info + HccsSubCmdGetStatisticInfo VDevMngSubCmd = 3 + // HccsSubCmdGetStatisticInfoU64 get statistic info in u64 + HccsSubCmdGetStatisticInfoU64 VDevMngSubCmd = 5 + + // NDIE NDie ID, only Ascend910 has + NDIE DieType = 0 + // VDIE VDie ID, it can be the uuid of chip + VDIE DieType = 1 + // DieIDCount die id array max length + DieIDCount = 5 + + // ipAddrTypeV6 ip address type of IPv6 + ipAddrTypeV6 = 1 + + agentdrvProfDataNum = 3 +) diff --git a/mind-cluster/component/ascend-common/devmanager/dcmi/dcmi.go b/mind-cluster/component/ascend-common/devmanager/dcmi/dcmi.go new file mode 100644 index 0000000..834397c --- /dev/null +++ b/mind-cluster/component/ascend-common/devmanager/dcmi/dcmi.go @@ -0,0 +1,2213 @@ +/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package dcmi this for dcmi manager +package dcmi + +// #cgo LDFLAGS: -ldl +/* + #include <stddef.h> + #include <dlfcn.h> + #include <stdlib.h> + #include <stdio.h> + + #include "dcmi_interface_api.h" + + static void *dcmiHandle; + #define SO_NOT_FOUND -99999 + #define FUNCTION_NOT_FOUND -99998 + #define SUCCESS 0 + #define ERROR_UNKNOWN -99997 + #define CALL_FUNC(name,...) 
if(name##_func==NULL){return FUNCTION_NOT_FOUND;}return name##_func(__VA_ARGS__); + + // dcmi + static int (*dcmi_init_func)(); + static int dcmi_init_new(){ + CALL_FUNC(dcmi_init) + } + + static int (*dcmi_get_card_num_list_func)(int *card_num,int *card_list,int list_length); + static int dcmi_get_card_num_list_new(int *card_num,int *card_list,int list_length){ + CALL_FUNC(dcmi_get_card_num_list,card_num,card_list,list_length) + } + + static int (*dcmi_get_device_num_in_card_func)(int card_id,int *device_num); + static int dcmi_get_device_num_in_card_new(int card_id,int *device_num){ + CALL_FUNC(dcmi_get_device_num_in_card,card_id,device_num) + } + + static int (*dcmi_get_device_logic_id_func)(int *device_logic_id,int card_id,int device_id); + static int dcmi_get_device_logic_id_new(int *device_logic_id,int card_id,int device_id){ + CALL_FUNC(dcmi_get_device_logic_id,device_logic_id,card_id,device_id) + } + + static int (*dcmi_create_vdevice_func)(int card_id, int device_id, struct dcmi_create_vdev_res_stru *vdev, + struct dcmi_create_vdev_out *out); + int dcmi_create_vdevice(int card_id, int device_id, struct dcmi_create_vdev_res_stru *vdev, + struct dcmi_create_vdev_out *out){ + CALL_FUNC(dcmi_create_vdevice,card_id,device_id,vdev,out) + } + + static int (*dcmi_get_device_info_func)(int card_id, int device_id, enum dcmi_main_cmd main_cmd, + unsigned int sub_cmd,void *buf, unsigned int *size); + int dcmi_get_device_info(int card_id, int device_id, enum dcmi_main_cmd main_cmd, unsigned int sub_cmd, void *buf, + unsigned int *size){ + CALL_FUNC(dcmi_get_device_info,card_id,device_id,main_cmd,sub_cmd,buf,size) + } + + static int (*dcmi_get_hccs_link_bandwidth_info_func)(int card_id, int device_id, +struct dcmi_hccs_bandwidth_info *hccs_bandwidth_info); + int dcmi_get_hccs_link_bandwidth_info(int card_id, int device_id, +struct dcmi_hccs_bandwidth_info *hccs_bandwidth_info){ + CALL_FUNC(dcmi_get_hccs_link_bandwidth_info,card_id,device_id,hccs_bandwidth_info) + } + + 
static int (*dcmi_set_destroy_vdevice_func)(int card_id,int device_id, unsigned int VDevid); + int dcmi_set_destroy_vdevice(int card_id,int device_id, unsigned int VDevid){ + CALL_FUNC(dcmi_set_destroy_vdevice,card_id,device_id,VDevid) + } + + static int (*dcmi_get_device_type_func)(int card_id,int device_id,enum dcmi_unit_type *device_type); + int dcmi_get_device_type(int card_id,int device_id,enum dcmi_unit_type *device_type){ + CALL_FUNC(dcmi_get_device_type,card_id,device_id,device_type) + } + + static int (*dcmi_get_device_health_func)(int card_id, int device_id, unsigned int *health); + int dcmi_get_device_health(int card_id, int device_id, unsigned int *health){ + CALL_FUNC(dcmi_get_device_health,card_id,device_id,health) + } + + static int (*dcmi_get_device_utilization_rate_func)(int card_id, int device_id, int input_type, + unsigned int *utilization_rate); + int dcmi_get_device_utilization_rate(int card_id, int device_id, int input_type, unsigned int *utilization_rate){ + CALL_FUNC(dcmi_get_device_utilization_rate,card_id,device_id,input_type,utilization_rate) + } + + static int (*dcmi_get_device_temperature_func)(int card_id, int device_id, int *temperature); + int dcmi_get_device_temperature(int card_id, int device_id, int *temperature){ + CALL_FUNC(dcmi_get_device_temperature,card_id,device_id,temperature) + } + + static int (*dcmi_get_device_voltage_func)(int card_id, int device_id, unsigned int *voltage); + int dcmi_get_device_voltage(int card_id, int device_id, unsigned int *voltage){ + CALL_FUNC(dcmi_get_device_voltage,card_id,device_id,voltage) + } + + static int (*dcmi_get_device_power_info_func)(int card_id, int device_id, int *power); + int dcmi_get_device_power_info(int card_id, int device_id, int *power){ + CALL_FUNC(dcmi_get_device_power_info,card_id,device_id,power) + } + + static int (*dcmi_get_device_frequency_func)(int card_id, int device_id, enum dcmi_freq_type input_type, + unsigned int *frequency); + int dcmi_get_device_frequency(int 
card_id, int device_id, enum dcmi_freq_type input_type, unsigned int *frequency){ + CALL_FUNC(dcmi_get_device_frequency,card_id,device_id,input_type,frequency) + } + + static int (*dcmi_get_device_memory_info_v3_func)(int card_id, int device_id, + struct dcmi_get_memory_info_stru *memory_info); + int dcmi_get_device_memory_info_v3(int card_id, int device_id, struct dcmi_get_memory_info_stru *memory_info){ + CALL_FUNC(dcmi_get_device_memory_info_v3,card_id,device_id,memory_info) + } + + static int (*dcmi_get_device_hbm_info_func)(int card_id, int device_id, struct dcmi_hbm_info *hbm_info); + int dcmi_get_device_hbm_info(int card_id, int device_id, struct dcmi_hbm_info *hbm_info){ + CALL_FUNC(dcmi_get_device_hbm_info,card_id,device_id,hbm_info) + } + + static int (*dcmi_get_device_errorcode_v2_func)(int card_id, int device_id, int *error_count, + unsigned int *error_code_list, unsigned int list_len); + int dcmi_get_device_errorcode_v2(int card_id, int device_id, int *error_count, + unsigned int *error_code_list, unsigned int list_len){ + CALL_FUNC(dcmi_get_device_errorcode_v2,card_id,device_id,error_count,error_code_list,list_len) + } + + static int (*dcmi_get_device_chip_info_func)(int card_id, int device_id, struct dcmi_chip_info *chip_info); + int dcmi_get_device_chip_info(int card_id, int device_id, struct dcmi_chip_info *chip_info){ + CALL_FUNC(dcmi_get_device_chip_info,card_id,device_id,chip_info) + } + + static int (*dcmi_get_device_chip_info_v2_func)(int card_id, int device_id, struct dcmi_chip_info_v2 *chip_info); + int dcmi_get_device_chip_info_v2(int card_id, int device_id, struct dcmi_chip_info_v2 *chip_info){ + CALL_FUNC(dcmi_get_device_chip_info_v2,card_id,device_id,chip_info) + } + + static int (*dcmi_get_device_phyid_from_logicid_func)(unsigned int logicid, unsigned int *phyid); + int dcmi_get_device_phyid_from_logicid(unsigned int logicid, unsigned int *phyid){ + CALL_FUNC(dcmi_get_device_phyid_from_logicid,logicid,phyid) + } + + static int 
(*dcmi_get_device_logicid_from_phyid_func)(unsigned int phyid, unsigned int *logicid); + int dcmi_get_device_logicid_from_phyid(unsigned int phyid, unsigned int *logicid){ + CALL_FUNC(dcmi_get_device_logicid_from_phyid,phyid,logicid) + } + + static int (*dcmi_get_device_ip_func)(int card_id, int device_id, enum dcmi_port_type input_type, int port_id, + struct dcmi_ip_addr *ip, struct dcmi_ip_addr *mask); + int dcmi_get_device_ip(int card_id, int device_id, enum dcmi_port_type input_type, int port_id, + struct dcmi_ip_addr *ip, struct dcmi_ip_addr *mask){ + CALL_FUNC(dcmi_get_device_ip,card_id,device_id,input_type,port_id,ip,mask) + } + + static int (*dcmi_get_device_network_health_func)(int card_id, int device_id, + enum dcmi_rdfx_detect_result *result); + int dcmi_get_device_network_health(int card_id, int device_id, enum dcmi_rdfx_detect_result *result){ + CALL_FUNC(dcmi_get_device_network_health,card_id,device_id,result) + } + + static int (*dcmi_get_card_list_func)(int *card_num, int *card_list, int list_len); + int dcmi_get_card_list(int *card_num, int *card_list, int list_len){ + CALL_FUNC(dcmi_get_card_list,card_num,card_list,list_len) + } + + static int (*dcmi_get_device_id_in_card_func)(int card_id, int *device_id_max, int *mcu_id, int *cpu_id); + int dcmi_get_device_id_in_card(int card_id, int *device_id_max, int *mcu_id, int *cpu_id){ + CALL_FUNC(dcmi_get_device_id_in_card,card_id,device_id_max,mcu_id,cpu_id) + } + + static int (*dcmi_get_memory_info_func)(int card_id, int device_id, + struct dcmi_memory_info_stru *device_memory_info); + int dcmi_get_memory_info(int card_id, int device_id, struct dcmi_memory_info_stru *device_memory_info){ + CALL_FUNC(dcmi_get_memory_info,card_id,device_id,device_memory_info) + } + + static int (*dcmi_get_device_errorcode_func)(int card_id, int device_id, int *error_count, unsigned int *error_code, + int *error_width); + int dcmi_get_device_errorcode(int card_id, int device_id, int *error_count, unsigned int *error_code, 
+ int *error_width){ + CALL_FUNC(dcmi_get_device_errorcode,card_id,device_id,error_count,error_code,error_width) + } + + static int (*dcmi_get_card_id_device_id_from_logicid_func)(int *card_id, int *device_id, + unsigned int device_logic_id); + int dcmi_get_card_id_device_id_from_logicid(int *card_id, int *device_id, unsigned int device_logic_id){ + CALL_FUNC(dcmi_get_card_id_device_id_from_logicid,card_id,device_id,device_logic_id) + } + + static int (*dcmi_mcu_get_power_info_func)(int card_id, int *power); + static int dcmi_mcu_get_power_info_new(int card_id, int *power){ + CALL_FUNC(dcmi_mcu_get_power_info,card_id,power) + } + + static int (*dcmi_get_product_type_func)(int card_id, int device_id, char *product_type_str, int buf_size); + int dcmi_get_product_type(int card_id, int device_id, char *product_type_str, int buf_size){ + CALL_FUNC(dcmi_get_product_type,card_id,device_id,product_type_str,buf_size) + } + + static int (*dcmi_get_card_elabel_v2_func)(int card_id, struct dcmi_elabel_info *elabel_info); + int dcmi_get_card_elabel_v2(int card_id, struct dcmi_elabel_info *elabel_info){ + CALL_FUNC(dcmi_get_card_elabel_v2,card_id,elabel_info) + } + + static int (*dcmi_set_device_reset_func)(int card_id, int device_id, enum dcmi_reset_channel channel_type); + int dcmi_set_device_reset(int card_id, int device_id, enum dcmi_reset_channel channel_type){ + CALL_FUNC(dcmi_set_device_reset,card_id,device_id,channel_type) + } + + static int (*dcmi_get_device_outband_channel_state_func)(int card_id, int device_id, int* channel_state); + int dcmi_get_device_outband_channel_state(int card_id, int device_id, int* channel_state){ + CALL_FUNC(dcmi_get_device_outband_channel_state,card_id,device_id,channel_state) + } + + static int (*dcmi_pre_reset_soc_func)(int card_id, int device_id); + int dcmi_pre_reset_soc(int card_id, int device_id){ + CALL_FUNC(dcmi_pre_reset_soc,card_id,device_id) + } + + static int (*dcmi_rescan_soc_func)(int card_id, int device_id); + int 
dcmi_rescan_soc(int card_id, int device_id){ + CALL_FUNC(dcmi_rescan_soc,card_id,device_id) + } + + static int (*dcmi_get_netdev_brother_device_func)(int card_id, int device_id, int* brother_card_id); + int dcmi_get_netdev_brother_device(int card_id, int device_id, int* brother_card_id){ + CALL_FUNC(dcmi_get_netdev_brother_device,card_id,device_id,brother_card_id) + } + + static int (*dcmi_get_device_boot_status_func)(int card_id, int device_id, enum dcmi_boot_status *boot_status); + int dcmi_get_device_boot_status(int card_id, int device_id, enum dcmi_boot_status *boot_status){ + CALL_FUNC(dcmi_get_device_boot_status,card_id,device_id,boot_status) + } + + void goEventFaultCallBack(struct dcmi_dms_fault_event); + static void event_handler(struct dcmi_event *fault_event) { + goEventFaultCallBack(fault_event->event_t.dms_event); + } + + static int (*dcmi_subscribe_fault_event_func)(int card_id, int device_id, struct dcmi_event_filter filter, + void (*f_name)(struct dcmi_event *fault_event)); + int dcmi_subscribe_fault_event(int card_id, int device_id, struct dcmi_event_filter filter){ + CALL_FUNC(dcmi_subscribe_fault_event,card_id,device_id,filter,event_handler) + } + + static int (*dcmi_get_npu_work_mode_func)(int card_id, unsigned char *work_mode); + int dcmi_get_npu_work_mode(int card_id, unsigned char *work_mode){ + CALL_FUNC(dcmi_get_npu_work_mode,card_id,work_mode) + } + + static int (*dcmi_get_device_die_v2_func)(int card_id, int device_id, enum dcmi_die_type input_type, + struct dcmi_die_id *die_id); + int dcmi_get_device_die_v2(int card_id, int device_id, enum dcmi_die_type input_type, struct dcmi_die_id *die_id){ + CALL_FUNC(dcmi_get_device_die_v2,card_id,device_id,input_type,die_id) + } + + static int (*dcmi_get_device_resource_info_func)(int card_id, int device_id, struct dcmi_proc_mem_info *proc_info, + int *proc_num); + int dcmi_get_device_resource_info(int card_id, int device_id, struct dcmi_proc_mem_info *proc_info, int *proc_num){ + 
CALL_FUNC(dcmi_get_device_resource_info,card_id,device_id,proc_info,proc_num) + } + + static int (*dcmi_get_device_pcie_info_v2_func)(int card_id, int device_id, struct dcmi_pcie_info_all *pcie_info); + int dcmi_get_device_pcie_info_v2(int card_id, int device_id, struct dcmi_pcie_info_all *pcie_info){ + CALL_FUNC(dcmi_get_device_pcie_info_v2,card_id,device_id,pcie_info) + } + + static int (*dcmi_get_device_board_info_func)(int card_id, int device_id, struct dcmi_board_info *board_info); + int dcmi_get_device_board_info(int card_id, int device_id, struct dcmi_board_info *board_info){ + CALL_FUNC(dcmi_get_device_board_info,card_id,device_id,board_info) + } + + static int (*dcmi_get_pcie_link_bandwidth_info_func)(int card_id, int device_id, + struct dcmi_pcie_link_bandwidth_info *pcie_link_bandwidth_info); + int dcmi_get_pcie_link_bandwidth_info(int card_id, int device_id, + struct dcmi_pcie_link_bandwidth_info *pcie_link_bandwidth_info){ + CALL_FUNC(dcmi_get_pcie_link_bandwidth_info,card_id,device_id,pcie_link_bandwidth_info) + } + + static int (*dcmi_get_dcmi_version_func)(char *dcmi_ver, int buf_size); + int dcmi_get_dcmi_version(char *dcmi_ver, int buf_size){ + CALL_FUNC(dcmi_get_dcmi_version,dcmi_ver,buf_size) + } + + static int (*dcmi_get_device_ecc_info_func)(int card_id, int device_id, enum dcmi_device_type input_type, + struct dcmi_ecc_info *device_ecc_info); + int dcmi_get_device_ecc_info(int card_id, int device_id, enum dcmi_device_type input_type, + struct dcmi_ecc_info *device_ecc_info){ + CALL_FUNC(dcmi_get_device_ecc_info,card_id,device_id,input_type,device_ecc_info) + } + + static int (*dcmi_get_mainboard_id_func)(int card_id, int device_id, unsigned int *mainboard_id); + int dcmi_get_mainboard_id(int card_id, int device_id, unsigned int *mainboard_id){ + CALL_FUNC(dcmi_get_mainboard_id,card_id,device_id,mainboard_id) + } + + static int (*dcmi_start_hccsping_mesh_func)(int card_id, int device_id, int port_id, +struct dcmi_hccsping_mesh_operate 
*hccsping_mesh); + int dcmi_start_hccsping_mesh(int card_id, int device_id, int port_id, +struct dcmi_hccsping_mesh_operate *hccsping_mesh){ + CALL_FUNC(dcmi_start_hccsping_mesh,card_id,device_id,port_id,hccsping_mesh) +} + static int (*dcmi_stop_hccsping_mesh_func)(int card_id, int device_id, int port_id, unsigned int task_id); + int dcmi_stop_hccsping_mesh(int card_id, int device_id, int port_id, unsigned int task_id){ + CALL_FUNC(dcmi_stop_hccsping_mesh,card_id,device_id,port_id,task_id) + } + + static int (*dcmi_get_hccsping_mesh_info_func)(int card_id, int device_id, int port_id, unsigned int task_id, +struct dcmi_hccsping_mesh_info *hccsping_mesh_info); + int dcmi_get_hccsping_mesh_info(int card_id, int device_id, int port_id, unsigned int task_id, +struct dcmi_hccsping_mesh_info *hccsping_mesh_info){ + CALL_FUNC(dcmi_get_hccsping_mesh_info,card_id,device_id,port_id,task_id,hccsping_mesh_info) +} + + static int (*dcmi_get_hccsping_mesh_state_func)(int card_id, int device_id, int port_id, unsigned int task_id, +unsigned int *state); + int dcmi_get_hccsping_mesh_state(int card_id, int device_id, int port_id, unsigned int task_id, +unsigned int *state){ + CALL_FUNC(dcmi_get_hccsping_mesh_state,card_id,device_id,port_id,task_id,state) +} + + static int (*dcmi_get_spod_node_status_func)(int card_id, int device_id, unsigned int sdid, unsigned int *status); + int dcmi_get_spod_node_status(int card_id, int device_id, unsigned int sdid, unsigned int *status){ + CALL_FUNC(dcmi_get_spod_node_status,card_id,device_id,sdid,status) + } + + static int (*dcmi_set_spod_node_status_func)(int card_id, int device_id, unsigned int sdid, unsigned int status); + int dcmi_set_spod_node_status(int card_id, int device_id, unsigned int sdid, unsigned int status){ + CALL_FUNC(dcmi_set_spod_node_status,card_id,device_id,sdid,status) + } + + // load .so files and functions + static int dcmiInit_dl(const char* dcmiLibPath){ + if (dcmiLibPath == NULL) { + fprintf (stderr,"lib path is 
null\n"); + return SO_NOT_FOUND; + } + dcmiHandle = dlopen(dcmiLibPath,RTLD_LAZY | RTLD_GLOBAL); + if (dcmiHandle == NULL){ + fprintf (stderr,"%s\n",dlerror()); + return SO_NOT_FOUND; + } + + dcmi_init_func = dlsym(dcmiHandle,"dcmi_init"); + + dcmi_get_card_num_list_func = dlsym(dcmiHandle,"dcmi_get_card_num_list"); + + dcmi_get_device_num_in_card_func = dlsym(dcmiHandle,"dcmi_get_device_num_in_card"); + + dcmi_get_device_logic_id_func = dlsym(dcmiHandle,"dcmi_get_device_logic_id"); + + dcmi_create_vdevice_func = dlsym(dcmiHandle,"dcmi_create_vdevice"); + + dcmi_get_device_info_func = dlsym(dcmiHandle,"dcmi_get_device_info"); + + dcmi_set_destroy_vdevice_func = dlsym(dcmiHandle,"dcmi_set_destroy_vdevice"); + + dcmi_get_device_type_func = dlsym(dcmiHandle,"dcmi_get_device_type"); + + dcmi_get_device_health_func = dlsym(dcmiHandle,"dcmi_get_device_health"); + + dcmi_get_device_utilization_rate_func = dlsym(dcmiHandle,"dcmi_get_device_utilization_rate"); + + dcmi_get_device_temperature_func = dlsym(dcmiHandle,"dcmi_get_device_temperature"); + + dcmi_get_device_voltage_func = dlsym(dcmiHandle,"dcmi_get_device_voltage"); + + dcmi_get_device_power_info_func = dlsym(dcmiHandle,"dcmi_get_device_power_info"); + + dcmi_get_device_frequency_func = dlsym(dcmiHandle,"dcmi_get_device_frequency"); + + dcmi_get_device_memory_info_v3_func = dlsym(dcmiHandle,"dcmi_get_device_memory_info_v3"); + + dcmi_get_device_hbm_info_func = dlsym(dcmiHandle,"dcmi_get_device_hbm_info"); + + dcmi_get_device_errorcode_v2_func = dlsym(dcmiHandle,"dcmi_get_device_errorcode_v2"); + + dcmi_get_device_chip_info_func = dlsym(dcmiHandle,"dcmi_get_device_chip_info"); + + dcmi_get_device_chip_info_v2_func = dlsym(dcmiHandle,"dcmi_get_device_chip_info_v2"); + + dcmi_get_device_phyid_from_logicid_func = dlsym(dcmiHandle,"dcmi_get_device_phyid_from_logicid"); + + dcmi_get_device_logicid_from_phyid_func = dlsym(dcmiHandle,"dcmi_get_device_logicid_from_phyid"); + + dcmi_get_device_ip_func = 
dlsym(dcmiHandle,"dcmi_get_device_ip"); + + dcmi_get_device_network_health_func = dlsym(dcmiHandle,"dcmi_get_device_network_health"); + + dcmi_get_card_list_func = dlsym(dcmiHandle,"dcmi_get_card_list"); + + dcmi_get_device_id_in_card_func = dlsym(dcmiHandle,"dcmi_get_device_id_in_card"); + + dcmi_get_memory_info_func = dlsym(dcmiHandle,"dcmi_get_memory_info"); + + dcmi_get_device_errorcode_func = dlsym(dcmiHandle,"dcmi_get_device_errorcode"); + + dcmi_get_card_id_device_id_from_logicid_func = dlsym(dcmiHandle,"dcmi_get_card_id_device_id_from_logicid"); + + dcmi_mcu_get_power_info_func = dlsym(dcmiHandle,"dcmi_mcu_get_power_info"); + + dcmi_get_product_type_func = dlsym(dcmiHandle,"dcmi_get_product_type"); + + dcmi_get_card_elabel_v2_func = dlsym(dcmiHandle,"dcmi_get_card_elabel_v2"); + + dcmi_set_device_reset_func = dlsym(dcmiHandle,"dcmi_set_device_reset"); + + dcmi_get_device_outband_channel_state_func = dlsym(dcmiHandle,"dcmi_get_device_outband_channel_state"); + + dcmi_pre_reset_soc_func = dlsym(dcmiHandle,"dcmi_pre_reset_soc"); + + dcmi_rescan_soc_func = dlsym(dcmiHandle,"dcmi_rescan_soc"); + + dcmi_get_netdev_brother_device_func = dlsym(dcmiHandle,"dcmi_get_netdev_brother_device"); + + dcmi_get_device_boot_status_func = dlsym(dcmiHandle,"dcmi_get_device_boot_status"); + + dcmi_subscribe_fault_event_func = dlsym(dcmiHandle,"dcmi_subscribe_fault_event"); + + dcmi_get_npu_work_mode_func = dlsym(dcmiHandle, "dcmi_get_npu_work_mode"); + + dcmi_get_device_die_v2_func = dlsym(dcmiHandle, "dcmi_get_device_die_v2"); + + dcmi_get_device_resource_info_func = dlsym(dcmiHandle, "dcmi_get_device_resource_info"); + + dcmi_get_device_pcie_info_v2_func = dlsym(dcmiHandle, "dcmi_get_device_pcie_info_v2"); + + dcmi_get_device_board_info_func = dlsym(dcmiHandle, "dcmi_get_device_board_info"); + + dcmi_get_pcie_link_bandwidth_info_func = dlsym(dcmiHandle, "dcmi_get_pcie_link_bandwidth_info"); + + dcmi_get_dcmi_version_func = dlsym(dcmiHandle,"dcmi_get_dcmi_version"); + + 
dcmi_get_device_ecc_info_func = dlsym(dcmiHandle,"dcmi_get_device_ecc_info"); + + dcmi_get_mainboard_id_func = dlsym(dcmiHandle, "dcmi_get_mainboard_id"); + + dcmi_get_hccs_link_bandwidth_info_func = dlsym(dcmiHandle,"dcmi_get_hccs_link_bandwidth_info"); + + dcmi_start_hccsping_mesh_func = dlsym(dcmiHandle,"dcmi_start_hccsping_mesh"); + + dcmi_stop_hccsping_mesh_func = dlsym(dcmiHandle,"dcmi_stop_hccsping_mesh"); + + dcmi_get_hccsping_mesh_info_func = dlsym(dcmiHandle,"dcmi_get_hccsping_mesh_info"); + + dcmi_get_hccsping_mesh_state_func = dlsym(dcmiHandle,"dcmi_get_hccsping_mesh_state"); + + dcmi_get_spod_node_status_func = dlsym(dcmiHandle,"dcmi_get_spod_node_status"); + + dcmi_set_spod_node_status_func = dlsym(dcmiHandle,"dcmi_set_spod_node_status"); + + return SUCCESS; + } + + static int dcmiShutDown(void){ + if (dcmiHandle == NULL) { + return SUCCESS; + } + return (dlclose(dcmiHandle) ? ERROR_UNKNOWN : SUCCESS); + } +*/ +import "C" +import ( + "errors" + "fmt" + "math" + "net" + "strconv" + "strings" + "time" + "unsafe" + + "ascend-common/common-utils/hwlog" + "ascend-common/common-utils/utils" + "ascend-common/devmanager/common" +) + +// CDcmiMemoryInfoV3 the c struct of memoryInfo for v3 +type CDcmiMemoryInfoV3 = C.struct_dcmi_get_memory_info_stru + +// CDcmiMemoryInfoV1 the c struct of memoryInfo for v1 +type CDcmiMemoryInfoV1 = C.struct_dcmi_memory_info_stru + +// DcDriverInterface interface for dcmi +type DcDriverInterface interface { + DcInit() error + DcShutDown() error + + DcGetDcmiVersion() (string, error) + DcGetDeviceCount() (int32, error) + DcGetLogicIDList() (int32, []int32, error) + DcGetDeviceHealth(int32, int32) (int32, error) + DcGetDeviceNetWorkHealth(int32, int32) (uint32, error) + DcGetDeviceUtilizationRate(int32, int32, common.DeviceType) (int32, error) + DcGetDeviceTemperature(int32, int32) (int32, error) + DcGetDeviceVoltage(int32, int32) (float32, error) + DcGetDevicePowerInfo(int32, int32) (float32, error) + DcGetDeviceFrequency(int32, 
int32, common.DeviceType) (uint32, error) + DcGetMemoryInfo(int32, int32) (*common.MemoryInfo, error) + DcGetHbmInfo(int32, int32) (*common.HbmInfo, error) + DcGetDeviceErrorCode(int32, int32) (int32, int64, error) + DcGetChipInfo(int32, int32) (*common.ChipInfo, error) + DcGetPhysicIDFromLogicID(int32) (int32, error) + DcGetLogicIDFromPhysicID(int32) (int32, error) + DcGetDeviceLogicID(int32, int32) (int32, error) + DcGetDeviceIPAddress(int32, int32, int32) (string, error) + DcGetMcuPowerInfo(int32) (float32, error) + DcGetDieID(int32, int32, DieType) (string, error) + DcGetPCIeBusInfo(int32, int32) (string, error) + + DcGetCardList() (int32, []int32, error) + DcGetDeviceNumInCard(int32) (int32, error) + DcSetDestroyVirtualDevice(int32, int32, uint32) error + DcCreateVirtualDevice(int32, int32, common.CgoCreateVDevRes) (common.CgoCreateVDevOut, error) + DcGetDeviceVDevResource(int32, int32, uint32) (common.CgoVDevQueryStru, error) + DcGetDeviceTotalResource(int32, int32) (common.CgoSocTotalResource, error) + DcGetDeviceFreeResource(int32, int32) (common.CgoSocFreeResource, error) + DcGetVDevActivityInfo(int32, int32, uint32) (common.VDevActivityInfo, error) + DcVGetDeviceInfo(int32, int32) (common.VirtualDevInfo, error) + DcGetCardIDDeviceID(int32) (int32, int32, error) + DcCreateVDevice(int32, common.CgoCreateVDevRes) (common.CgoCreateVDevOut, error) + DcGetVDeviceInfo(int32) (common.VirtualDevInfo, error) + DcDestroyVDevice(int32, uint32) error + DcGetProductType(int32, int32) (string, error) + DcGetNpuWorkMode(int32) (int, error) + DcSetDeviceReset(int32, int32) error + DcGetBrotherCardID(int32, int32) (int32, error) + DcPreResetSoc(int32, int32) error + DcGetOutBandChannelState(int32, int32) error + DcSetDeviceResetOutBand(int32, int32) error + DcRescanSoc(int32, int32) error + DcGetDeviceBootStatus(int32) (int, error) + DcGetSuperPodInfo(int32, int32) (common.CgoSuperPodInfo, error) + + DcGetDeviceAllErrorCode(int32, int32) (int32, []int64, error) + 
DcSubscribeDeviceFaultEvent(int32, int32) error + DcSetFaultEventCallFunc(func(common.DevFaultInfo)) + DcGetDevProcessInfo(int32, int32) (*common.DevProcessInfo, error) + DcGetDeviceBoardInfo(int32, int32) (common.BoardInfo, error) + DcGetPCIEBandwidth(int32, int32, int) (common.PCIEBwStat, error) + DcGetDeviceEccInfo(int32, int32, common.DcmiDeviceType) (*common.ECCInfo, error) + DcGetSioInfo(int32, int32) (common.SioCrcErrStatisticInfo, error) + DcGetHccsStatisticInfo(int32, int32) (common.HccsStatisticInfo, error) + DcGetHccsStatisticInfoU64(int32, int32) (common.HccsStatisticInfo, error) + DcGetDeviceMainBoardInfo(int32, int32) (uint32, error) + DcGetHccsBandwidthInfo(int32, int32, int) (common.HccsBandwidthInfo, error) + + DcStartHccsPingMesh(int32, int32, int, common.HccspingMeshOperate) error + DcStopHccsPingMesh(int32, int32, int, uint) error + DcGetHccsPingMeshInfo(int32, int32, int, uint) (*common.HccspingMeshInfo, error) + DcGetHccsPingMeshState(int32, int32, int, uint) (int, error) + DcGetSuperPodStatus(int32, int32, uint32) (int, error) + DcSetSuperPodStatus(int32, int32, uint32, uint32) error + DcGetCardElabelV2(int32) (common.ElabelInfo, error) +} + +const ( + dcmiLibraryName = "libdcmi.so" + templateNameLen = 32 + ipAddrListLen = 1024 + hcclpingMeshMaxNum = 48 +) + +var faultEventCallFunc func(common.DevFaultInfo) = nil +var ( + dcmiErrMap = map[int32]string{ + -8001: "The input parameter is incorrect", + -8002: "Permission error", + -8003: "The memory interface operation failed", + -8004: "The security function failed to be executed", + -8005: "Internal errors", + -8006: "Response timed out", + -8007: "Invalid deviceID", + -8008: "The device does not exist", + -8009: "ioctl returns failed", + -8010: "The message failed to be sent", + -8011: "Message reception failed", + -8012: "Not ready yet,please try again", + -8013: "This API is not supported in containers", + -8014: "The file operation failed", + -8015: "Reset failed", + -8016: "Reset cancels", 
+ -8017: "Upgrading", + -8020: "Device resources are occupied", + -8022: "Partition consistency check,inconsistent partitions were found", + -8023: "The configuration information does not exist", + -8255: "Device ID/function is not supported", + -99997: "dcmi shutdown failed", + -99998: "The called function is missing,please upgrade the driver", + -99999: "dcmi libdcmi.so failed to load", + } +) + +// DcManager for manager dcmi interface +type DcManager struct{} + +// DcStartHccsPingMesh start hccs ping mesh +func (d *DcManager) DcStartHccsPingMesh(cardID int32, deviceID int32, portID int, + operate common.HccspingMeshOperate) error { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + if !common.IsValidPortID(portID) { + return fmt.Errorf("portID(%d) is invalid", portID) + } + if err := common.IsValidHccspingMeshOperate(operate); err != nil { + return fmt.Errorf("operate(%v) is invalid, err: %v", operate, err) + } + dtsAddrLsit := [ipAddrListLen]C.char{0} + for i := 0; i < len(operate.DstAddr) && i < len(dtsAddrLsit); i++ { + dtsAddrLsit[i] = C.char(operate.DstAddr[i]) + } + + op := C.struct_dcmi_hccsping_mesh_operate{ + dst_addr_list: dtsAddrLsit, + pkt_size: C.int(operate.PktSize), + pkt_send_num: C.int(operate.PktSendNum), + pkt_interval: C.int(operate.PktInterval), + timeout: C.int(operate.Timeout), + task_interval: C.int(operate.TaskInterval), + task_id: C.int(operate.TaskId), + } + if retCode := C.dcmi_start_hccsping_mesh(C.int(cardID), C.int(deviceID), C.int(portID), + &op); retCode != common.Success { + return fmt.Errorf("dcmi start hccs ping mesh failed cardID(%d) deviceID(%d) error code: %d", + cardID, deviceID, int32(retCode)) + } + + return nil +} + +// DcStopHccsPingMesh stop hccs ping mesh +func (d *DcManager) DcStopHccsPingMesh(cardID int32, deviceID int32, portID int, taskID uint) error { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return 
fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + if !common.IsValidPortID(portID) { + return fmt.Errorf("portID(%d) is invalid", portID) + } + if !common.IsValidTaskID(taskID) { + return fmt.Errorf("taskID(%d) is invalid", taskID) + } + if retCode := C.dcmi_stop_hccsping_mesh(C.int(cardID), C.int(deviceID), C.int(portID), + C.uint(taskID)); retCode != common.Success { + return fmt.Errorf("dcmi stop hccs ping mesh failed cardID(%d) deviceID(%d) error code: %d", + cardID, deviceID, int32(retCode)) + } + return nil +} + +// DcGetHccsPingMeshInfo get hccs ping mesh info +func (d *DcManager) DcGetHccsPingMeshInfo(cardID int32, deviceID int32, portID int, + taskID uint) (*common.HccspingMeshInfo, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return nil, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + if !common.IsValidPortID(portID) { + return nil, fmt.Errorf("portID(%d) is invalid", portID) + } + if !common.IsValidTaskID(taskID) { + return nil, fmt.Errorf("taskID(%d) is invalid", taskID) + } + var info C.struct_dcmi_hccsping_mesh_info + if retCode := C.dcmi_get_hccsping_mesh_info(C.int(cardID), C.int(deviceID), C.int(portID), C.uint(taskID), + &info); retCode != common.Success { + return nil, fmt.Errorf("dcmi get hccs ping mesh info failed cardID(%d) deviceID(%d) error code: %d", + cardID, deviceID, int32(retCode)) + } + return convertHccspingMeshInfo(&info) +} + +func convertHccspingMeshInfo(cInfo *C.struct_dcmi_hccsping_mesh_info) (*common.HccspingMeshInfo, error) { + if int(cInfo.dest_num) > hcclpingMeshMaxNum { + return nil, fmt.Errorf("dest_num(%d) is invalid, should not be greater than %d", int(cInfo.dest_num), + hcclpingMeshMaxNum) + } + info := &common.HccspingMeshInfo{} + for i := 0; i < int(cInfo.dest_num); i++ { + info.DstAddr = append(info.DstAddr, convertToString(cInfo.dst_addr[i])) + info.SucPktNum = append(info.SucPktNum, uint(cInfo.suc_pkt_num[i])) + info.FailPktNum = 
append(info.FailPktNum, uint(cInfo.fail_pkt_num[i])) + info.MaxTime = append(info.MaxTime, int(cInfo.max_time[i])) + info.MinTime = append(info.MinTime, int(cInfo.min_time[i])) + info.AvgTime = append(info.AvgTime, int(cInfo.avg_time[i])) + info.TP95Time = append(info.TP95Time, int(cInfo.tp95_time[i])) + info.ReplyStatNum = append(info.ReplyStatNum, int(cInfo.reply_stat_num[i])) + info.PingTotalNum = append(info.PingTotalNum, int(cInfo.ping_total_num[i])) + } + info.DestNum = int(cInfo.dest_num) + return info, nil +} + +// DcGetHccsPingMeshState get hccs ping mesh state +func (d *DcManager) DcGetHccsPingMeshState(cardID int32, deviceID int32, portID int, taskID uint) (int, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.RetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + if !common.IsValidPortID(portID) { + return common.RetError, fmt.Errorf("portID(%d) is invalid", portID) + } + if !common.IsValidTaskID(taskID) { + return common.RetError, fmt.Errorf("taskID(%d) is invalid", taskID) + } + var state C.uint + if retCode := C.dcmi_get_hccsping_mesh_state(C.int(cardID), C.int(deviceID), C.int(portID), C.uint(taskID), + &state); retCode != common.Success { + return common.RetError, fmt.Errorf("dcmi get hccs ping mesh state failed cardID(%d) deviceID(%d) error "+ + "code: %d", cardID, deviceID, int32(retCode)) + } + return int(state), nil +} + +// DcInit load symbol and initialize dcmi +func (d *DcManager) DcInit() error { + dcmiLibPath, err := utils.GetDriverLibPath(dcmiLibraryName) + if err != nil { + return err + } + cDcmiTemplateName := C.CString(dcmiLibPath) + defer C.free(unsafe.Pointer(cDcmiTemplateName)) + if retCode := C.dcmiInit_dl(cDcmiTemplateName); retCode != C.SUCCESS { + return fmt.Errorf("dcmi lib load failed, error code: %d", int32(retCode)) + } + if retCode := C.dcmi_init_new(); retCode != C.SUCCESS { + return fmt.Errorf("dcmi init failed, error code: %d", int32(retCode)) + } + return 
nil +} + +// DcShutDown clean the dynamically loaded resource +func (d *DcManager) DcShutDown() error { + if retCode := C.dcmiShutDown(); retCode != C.SUCCESS { + return fmt.Errorf("dcmi shut down failed, error code: %d", int32(retCode)) + } + + return nil +} + +// DcGetCardList get card list +func (d *DcManager) DcGetCardList() (int32, []int32, error) { + var ids [common.HiAIMaxCardNum]C.int + var cNum C.int + if retCode := C.dcmi_get_card_list(&cNum, &ids[0], common.HiAIMaxCardNum); int32(retCode) != common. + Success { + return common.RetError, nil, fmt.Errorf("get card list failed, error code: %d", int32(retCode)) + } + // checking card's quantity + if cNum <= 0 || cNum > common.HiAIMaxCardNum { + return common.RetError, nil, fmt.Errorf("get error card quantity: %d", int32(cNum)) + } + var cardNum = int32(cNum) + var i int32 + var cardIDList []int32 + for i = 0; i < cardNum; i++ { + cardID := int32(ids[i]) + if cardID < 0 { + hwlog.RunLog.Errorf("get invalid card ID: %d", cardID) + continue + } + cardIDList = append(cardIDList, cardID) + } + return cardNum, cardIDList, nil +} + +// DcGetDeviceNumInCard get device number in the npu card +func (d *DcManager) DcGetDeviceNumInCard(cardID int32) (int32, error) { + if !common.IsValidCardID(cardID) { + return common.RetError, fmt.Errorf("cardID(%d) is invalid", cardID) + } + var deviceNum C.int + if retCode := C.dcmi_get_device_num_in_card_new(C.int(cardID), &deviceNum); int32(retCode) != common.Success { + return common.RetError, fmt.Errorf("get device count on the card failed, error code: %d", int32(retCode)) + } + if !common.IsValidDevNumInCard(int32(deviceNum)) { + return common.RetError, fmt.Errorf("get error device quantity: %d", int32(deviceNum)) + } + return int32(deviceNum), nil +} + +// DcGetDeviceLogicID get device logicID +func (d *DcManager) DcGetDeviceLogicID(cardID, deviceID int32) (int32, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.RetError, fmt.Errorf("cardID(%d) 
or deviceID(%d) is invalid", cardID, deviceID) + } + var logicID C.int + if retCode := C.dcmi_get_device_logic_id_new(&logicID, C.int(cardID), + C.int(deviceID)); int32(retCode) != common.Success { + return common.RetError, fmt.Errorf("failed to get logicID by cardID(%d) and deviceID(%d), error code: %d", + cardID, deviceID, int32(retCode)) + } + + // check whether logicID is invalid + if !common.IsValidLogicIDOrPhyID(int32(logicID)) { + return common.RetError, fmt.Errorf("get invalid logicID: %d", int32(logicID)) + } + return int32(logicID), nil +} + +// DcSetDestroyVirtualDevice destroy virtual device +func (d *DcManager) DcSetDestroyVirtualDevice(cardID, deviceID int32, vDevID uint32) error { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + if retCode := C.dcmi_set_destroy_vdevice(C.int(cardID), C.int(deviceID), + C.uint(vDevID)); int32(retCode) != common.Success { + return fmt.Errorf("destroy virtual device failed, error code: %d", int32(retCode)) + } + return nil +} + +func convertCreateVDevOut(cCreateVDevOut C.struct_dcmi_create_vdev_out) common.CgoCreateVDevOut { + cgoCreateVDevOut := common.CgoCreateVDevOut{ + VDevID: uint32(cCreateVDevOut.vdev_id), + PcieBus: uint32(cCreateVDevOut.pcie_bus), + PcieDevice: uint32(cCreateVDevOut.pcie_device), + PcieFunc: uint32(cCreateVDevOut.pcie_func), + VfgID: uint32(cCreateVDevOut.vfg_id), + } + return cgoCreateVDevOut +} + +// DcCreateVirtualDevice create virtual device +func (d *DcManager) DcCreateVirtualDevice(cardID, deviceID int32, vDevInfo common.CgoCreateVDevRes) (common. 
+ CgoCreateVDevOut, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.CgoCreateVDevOut{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + if len(vDevInfo.TemplateName) > templateNameLen { + return common.CgoCreateVDevOut{}, fmt.Errorf("the length of template name exceeds the upper limit") + } + cTemplateName := [templateNameLen]C.char{0} + for i := 0; i < len(vDevInfo.TemplateName); i++ { + cTemplateName[i] = C.char(vDevInfo.TemplateName[i]) + } + deviceCreateStr := C.struct_dcmi_create_vdev_res_stru{ + vdev_id: C.uint(vDevInfo.VDevID), + vfg_id: C.uint(vDevInfo.VfgID), + template_name: cTemplateName, + } + + var createVDevOut C.struct_dcmi_create_vdev_out + if retCode := C.dcmi_create_vdevice(C.int(cardID), C.int(deviceID), &deviceCreateStr, + &createVDevOut); int32(retCode) != common.Success { + return common.CgoCreateVDevOut{}, fmt.Errorf("create vdevice failed, error is: %d", int32(retCode)) + } + + return convertCreateVDevOut(createVDevOut), nil +} + +func convertToString(cgoArr [dcmiVDevResNameLen]C.char) string { + var charArr []rune + for _, v := range cgoArr { + if v == 0 { + break + } + charArr = append(charArr, rune(v)) + } + return string(charArr) +} + +func convertBaseResource(cBaseResource C.struct_dcmi_base_resource) common.CgoBaseResource { + baseResource := common.CgoBaseResource{ + Token: uint64(cBaseResource.token), + TokenMax: uint64(cBaseResource.token_max), + TaskTimeout: uint64(cBaseResource.task_timeout), + VfgID: uint32(cBaseResource.vfg_id), + VipMode: uint8(cBaseResource.vip_mode), + } + return baseResource +} + +func convertComputingResource(cComputingResource C.struct_dcmi_computing_resource) common.CgoComputingResource { + computingResource := common.CgoComputingResource{ + Aic: float32(cComputingResource.aic), + Aiv: float32(cComputingResource.aiv), + Dsa: uint16(cComputingResource.dsa), + Rtsq: uint16(cComputingResource.rtsq), + Acsq: uint16(cComputingResource.acsq), + 
Cdqm: uint16(cComputingResource.cdqm), + CCore: uint16(cComputingResource.c_core), + Ffts: uint16(cComputingResource.ffts), + Sdma: uint16(cComputingResource.sdma), + PcieDma: uint16(cComputingResource.pcie_dma), + MemorySize: uint64(cComputingResource.memory_size), + EventID: uint32(cComputingResource.event_id), + NotifyID: uint32(cComputingResource.notify_id), + StreamID: uint32(cComputingResource.stream_id), + ModelID: uint32(cComputingResource.model_id), + TopicScheduleAicpu: uint16(cComputingResource.topic_schedule_aicpu), + HostCtrlCPU: uint16(cComputingResource.host_ctrl_cpu), + HostAicpu: uint16(cComputingResource.host_aicpu), + DeviceAicpu: uint16(cComputingResource.device_aicpu), + TopicCtrlCPUSlot: uint16(cComputingResource.topic_ctrl_cpu_slot), + } + return computingResource +} + +func convertMediaResource(cMediaResource C.struct_dcmi_media_resource) common.CgoMediaResource { + mediaResource := common.CgoMediaResource{ + Jpegd: float32(cMediaResource.jpegd), + Jpege: float32(cMediaResource.jpege), + Vpc: float32(cMediaResource.vpc), + Vdec: float32(cMediaResource.vdec), + Pngd: float32(cMediaResource.pngd), + Venc: float32(cMediaResource.venc), + } + return mediaResource +} + +func convertVDevQueryInfo(cVDevQueryInfo C.struct_dcmi_vdev_query_info) common.CgoVDevQueryInfo { + name := convertToString(cVDevQueryInfo.name) + vDevQueryInfo := common.CgoVDevQueryInfo{ + Name: string(name), + Status: uint32(cVDevQueryInfo.status), + IsContainerUsed: uint32(cVDevQueryInfo.is_container_used), + Vfid: uint32(cVDevQueryInfo.vfid), + VfgID: uint32(cVDevQueryInfo.vfg_id), + ContainerID: uint64(cVDevQueryInfo.container_id), + Base: convertBaseResource(cVDevQueryInfo.base), + Computing: convertComputingResource(cVDevQueryInfo.computing), + Media: convertMediaResource(cVDevQueryInfo.media), + } + return vDevQueryInfo +} + +func convertVDevQueryStru(cVDevQueryStru C.struct_dcmi_vdev_query_stru) common.CgoVDevQueryStru { + vDevQueryStru := common.CgoVDevQueryStru{ + 
VDevID: uint32(cVDevQueryStru.vdev_id), + QueryInfo: convertVDevQueryInfo(cVDevQueryStru.query_info), + } + return vDevQueryStru +} + +// DcGetDeviceVDevResource get virtual device resource info +func (d *DcManager) DcGetDeviceVDevResource(cardID, deviceID int32, vDevID uint32) (common.CgoVDevQueryStru, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.CgoVDevQueryStru{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var cMainCmd = C.enum_dcmi_main_cmd(MainCmdVDevMng) + subCmd := VmngSubCmdGetVDevResource + var vDevResource C.struct_dcmi_vdev_query_stru + size := C.uint(unsafe.Sizeof(vDevResource)) + vDevResource.vdev_id = C.uint(vDevID) + if retCode := C.dcmi_get_device_info(C.int(cardID), C.int(deviceID), cMainCmd, C.uint(subCmd), + unsafe.Pointer(&vDevResource), &size); int32(retCode) != common.Success { + return common.CgoVDevQueryStru{}, fmt.Errorf("get device info failed, error is: %d", int32(retCode)) + } + return convertVDevQueryStru(vDevResource), nil +} + +func convertSocTotalResource(cSocTotalResource C.struct_dcmi_soc_total_resource) common.CgoSocTotalResource { + socTotalResource := common.CgoSocTotalResource{ + VDevNum: uint32(cSocTotalResource.vdev_num), + VfgNum: uint32(cSocTotalResource.vfg_num), + VfgBitmap: uint32(cSocTotalResource.vfg_bitmap), + Base: convertBaseResource(cSocTotalResource.base), + Computing: convertComputingResource(cSocTotalResource.computing), + Media: convertMediaResource(cSocTotalResource.media), + } + for i := uint32(0); i < uint32(cSocTotalResource.vdev_num) && i < dcmiMaxVdevNum; i++ { + socTotalResource.VDevID = append(socTotalResource.VDevID, uint32(cSocTotalResource.vdev_id[i])) + } + return socTotalResource +} + +// DcGetDeviceTotalResource get device total resource info +func (d *DcManager) DcGetDeviceTotalResource(cardID, deviceID int32) (common.CgoSocTotalResource, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return 
common.CgoSocTotalResource{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var cMainCmd = C.enum_dcmi_main_cmd(MainCmdVDevMng) + subCmd := VmngSubCmdGetTotalResource + var totalResource C.struct_dcmi_soc_total_resource + size := C.uint(unsafe.Sizeof(totalResource)) + if retCode := C.dcmi_get_device_info(C.int(cardID), C.int(deviceID), cMainCmd, C.uint(subCmd), + unsafe.Pointer(&totalResource), &size); int32(retCode) != common.Success { + return common.CgoSocTotalResource{}, fmt.Errorf("get device info failed, error is: %d", int32(retCode)) + } + if uint32(totalResource.vdev_num) > dcmiMaxVdevNum { + return common.CgoSocTotalResource{}, fmt.Errorf("get error virtual quantity: %d", + uint32(totalResource.vdev_num)) + } + + return convertSocTotalResource(totalResource), nil +} + +func convertSuperPodInfo(cSuperPodInfo C.struct_dcmi_spod_info) common.CgoSuperPodInfo { + superPodInfo := common.CgoSuperPodInfo{ + SdId: uint32(cSuperPodInfo.sdid), + ScaleType: uint32(cSuperPodInfo.scale_type), + SuperPodId: uint32(cSuperPodInfo.super_pod_id), + ServerId: uint32(cSuperPodInfo.server_id), + } + + for i := uint32(0); i < dcmiMaxReserveNum; i++ { + superPodInfo.Reserve = append(superPodInfo.Reserve, uint32(cSuperPodInfo.reserve[i])) + } + + return superPodInfo +} + +// DcGetSuperPodInfo get device total resource info +func (d *DcManager) DcGetSuperPodInfo(cardID, deviceID int32) (common.CgoSuperPodInfo, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.CgoSuperPodInfo{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + + var unitType C.enum_dcmi_unit_type + if retCode := C.dcmi_get_device_type(C.int(cardID), C.int(deviceID), &unitType); int32(retCode) != common.Success { + return common.CgoSuperPodInfo{}, fmt.Errorf("get device type failed, error is: %d", int32(retCode)) + } + if int32(unitType) != common.NpuType { + return common.CgoSuperPodInfo{}, fmt.Errorf("not support unit type: 
%d", int32(unitType)) + } + + var cMainCmd = C.enum_dcmi_main_cmd(MainCmdChipInf) + subCmd := CinfSubCmdGetSPodInfo + var sPodInfo C.struct_dcmi_spod_info + size := C.uint(unsafe.Sizeof(sPodInfo)) + if retCode := C.dcmi_get_device_info(C.int(cardID), C.int(deviceID), cMainCmd, C.uint(subCmd), + unsafe.Pointer(&sPodInfo), &size); int32(retCode) != common.Success { + return common.CgoSuperPodInfo{}, fmt.Errorf("get super pod info failed, error is: %d", int32(retCode)) + } + + return convertSuperPodInfo(sPodInfo), nil +} + +func convertSocFreeResource(cSocFreeResource C.struct_dcmi_soc_free_resource) common.CgoSocFreeResource { + socFreeResource := common.CgoSocFreeResource{ + VfgNum: uint32(cSocFreeResource.vfg_num), + VfgBitmap: uint32(cSocFreeResource.vfg_bitmap), + Base: convertBaseResource(cSocFreeResource.base), + Computing: convertComputingResource(cSocFreeResource.computing), + Media: convertMediaResource(cSocFreeResource.media), + } + return socFreeResource +} + +// DcGetDeviceFreeResource get device free resource info +func (d *DcManager) DcGetDeviceFreeResource(cardID, deviceID int32) (common.CgoSocFreeResource, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.CgoSocFreeResource{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var cMainCmd = C.enum_dcmi_main_cmd(MainCmdVDevMng) + subCmd := VmngSubCmdGetFreeResource + var freeResource C.struct_dcmi_soc_free_resource + size := C.uint(unsafe.Sizeof(freeResource)) + if retCode := C.dcmi_get_device_info(C.int(cardID), C.int(deviceID), cMainCmd, C.uint(subCmd), + unsafe.Pointer(&freeResource), &size); int32(retCode) != common.Success { + return common.CgoSocFreeResource{}, fmt.Errorf("get device info failed, error is: %d", int32(retCode)) + } + return convertSocFreeResource(freeResource), nil +} + +// DcGetVDevActivityInfo get vir device activity info by virtual device id +func (d *DcManager) DcGetVDevActivityInfo(cardID, deviceID int32, vDevID 
uint32) (common.VDevActivityInfo, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.VDevActivityInfo{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + if !common.IsValidVDevID(vDevID) { + return common.VDevActivityInfo{}, fmt.Errorf("vDevID(%d) invalid", vDevID) + } + var cMainCmd = C.enum_dcmi_main_cmd(MainCmdVDevMng) + subCmd := VmngSubCmdGetVDevActivity + var vDevActivityInfo C.struct_dcmi_vdev_query_stru + size := C.uint(unsafe.Sizeof(vDevActivityInfo)) + vDevActivityInfo.vdev_id = C.uint(vDevID) + if retCode := C.dcmi_get_device_info(C.int(cardID), C.int(deviceID), cMainCmd, C.uint(subCmd), + unsafe.Pointer(&vDevActivityInfo), &size); int32(retCode) != common.Success { + return common.VDevActivityInfo{}, fmt.Errorf("retCode: %d", int32(retCode)) + } + totalMemSize := uint64(vDevActivityInfo.query_info.computing.vdev_memory_total) + usedMemSize := totalMemSize - uint64(vDevActivityInfo.query_info.computing.vdev_memory_free) + if usedMemSize < 0 { + return common.VDevActivityInfo{}, errors.New("used memory value abnormal") + } + return common.VDevActivityInfo{ + VDevID: vDevID, + VDevAiCoreRate: uint32(vDevActivityInfo.query_info.computing.vdev_aicore_utilization), + VDevTotalMem: totalMemSize, + VDevUsedMem: usedMemSize, + IsVirtualDev: true, + }, nil +} + +// DcVGetDeviceInfo get vdevice resource info +func (d *DcManager) DcVGetDeviceInfo(cardID, deviceID int32) (common.VirtualDevInfo, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.VirtualDevInfo{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var unitType C.enum_dcmi_unit_type + if retCode := C.dcmi_get_device_type(C.int(cardID), C.int(deviceID), &unitType); int32(retCode) != common.Success { + return common.VirtualDevInfo{}, fmt.Errorf("get device type failed, error is: %d", int32(retCode)) + } + if int32(unitType) != common.NpuType { + return common.VirtualDevInfo{}, 
fmt.Errorf("not support unit type: %d", int32(unitType)) + } + + cgoDcmiSocTotalResource, err := d.DcGetDeviceTotalResource(cardID, deviceID) + if err != nil { + return common.VirtualDevInfo{}, fmt.Errorf("get device total resource failed, error is: %v", err) + } + + cgoDcmiSocFreeResource, err := d.DcGetDeviceFreeResource(cardID, deviceID) + if err != nil { + return common.VirtualDevInfo{}, fmt.Errorf("get device free resource failed, error is: %v", err) + } + dcmiVDevInfo := common.VirtualDevInfo{ + TotalResource: cgoDcmiSocTotalResource, + FreeResource: cgoDcmiSocFreeResource, + } + for _, vDevID := range cgoDcmiSocTotalResource.VDevID { + cgoVDevQueryStru, err := d.DcGetDeviceVDevResource(cardID, deviceID, vDevID) + if err != nil { + return common.VirtualDevInfo{}, fmt.Errorf("get device virtual resource failed, error is: %v", err) + } + dcmiVDevInfo.VDevInfo = append(dcmiVDevInfo.VDevInfo, cgoVDevQueryStru) + vDevActivityInfo, err := d.DcGetVDevActivityInfo(cardID, deviceID, vDevID) + if err != nil { + hwlog.RunLog.Warnf("get cur vDev's activity info failed, err: %s", err) + continue + } + vDevActivityInfo.VDevAiCore = float64(cgoVDevQueryStru.QueryInfo.Computing.Aic) + dcmiVDevInfo.VDevActivityInfo = append(dcmiVDevInfo.VDevActivityInfo, vDevActivityInfo) + } + return dcmiVDevInfo, nil +} + +// DcGetCardIDDeviceID get card id and device id from logic id +func (d *DcManager) DcGetCardIDDeviceID(logicID int32) (int32, int32, error) { + if !common.IsValidLogicIDOrPhyID(logicID) { + return common.RetError, common.RetError, fmt.Errorf("input invalid logicID: %d", logicID) + } + var cardID, deviceID C.int + if retCode := C.dcmi_get_card_id_device_id_from_logicid(&cardID, &deviceID, + C.uint(logicID)); int32(retCode) != common.Success { + return common.RetError, common.RetError, + fmt.Errorf("failed to get card id and device id by logicID(%d), errorcode is: %d", logicID, + int32(retCode)) + } + if !common.IsValidCardIDAndDeviceID(int32(cardID), int32(deviceID)) { + 
return common.RetError, common.RetError, fmt.Errorf("failed to get card id and device id, "+ + "cardID(%d) or deviceID(%d) is invalid", int32(cardID), int32(deviceID)) + } + + return int32(cardID), int32(deviceID), nil +} + +// DcCreateVDevice create virtual device by logic id +func (d *DcManager) DcCreateVDevice(logicID int32, vDevInfo common.CgoCreateVDevRes) (common. + CgoCreateVDevOut, error) { + if !common.IsValidLogicIDOrPhyID(logicID) { + return common.CgoCreateVDevOut{}, fmt.Errorf("input invalid logicID: %d", logicID) + } + cardID, deviceID, err := d.DcGetCardIDDeviceID(logicID) + if err != nil { + return common.CgoCreateVDevOut{}, fmt.Errorf("get card id and device id failed, error is: %v", err) + } + + createVDevOut, err := d.DcCreateVirtualDevice(cardID, deviceID, vDevInfo) + if err != nil { + return common.CgoCreateVDevOut{}, fmt.Errorf("create virtual device failed, error is: %v", err) + } + return createVDevOut, nil +} + +// DcGetVDeviceInfo get virtual device info by logic id +func (d *DcManager) DcGetVDeviceInfo(logicID int32) (common.VirtualDevInfo, error) { + if !common.IsValidLogicIDOrPhyID(logicID) { + return common.VirtualDevInfo{}, fmt.Errorf("input invalid logicID: %d", logicID) + } + cardID, deviceID, err := d.DcGetCardIDDeviceID(logicID) + if err != nil { + return common.VirtualDevInfo{}, fmt.Errorf("get card id and device id failed, error is: %v", err) + } + + dcmiVDevInfo, err := d.DcVGetDeviceInfo(cardID, deviceID) + if err != nil { + return common.VirtualDevInfo{}, fmt.Errorf("get virtual device info failed, error is: %v", err) + } + return dcmiVDevInfo, nil +} + +// DcDestroyVDevice destroy spec virtual device by logic id +func (d *DcManager) DcDestroyVDevice(logicID int32, vDevID uint32) error { + if !common.IsValidLogicIDOrPhyID(logicID) { + return fmt.Errorf("input invalid logicID: %d", logicID) + } + cardID, deviceID, err := d.DcGetCardIDDeviceID(logicID) + if err != nil { + return fmt.Errorf("get card id and device id failed, 
error is: %v", err) + } + + if err = d.DcSetDestroyVirtualDevice(cardID, deviceID, vDevID); err != nil { + return fmt.Errorf("destroy virtual device failed, error is: %v", err) + } + return nil +} + +// DcGetDeviceVoltage the accuracy is 0.01v. +func (d *DcManager) DcGetDeviceVoltage(cardID, deviceID int32) (float32, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.RetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var vol C.uint + if retCode := C.dcmi_get_device_voltage(C.int(cardID), C.int(deviceID), &vol); int32(retCode) != common.Success { + return common.RetError, fmt.Errorf("failed to obtain the voltage based on card_id(%d) and "+ + "device_id(%d), error code: %d", cardID, deviceID, int32(retCode)) + } + // the voltage's value is error if it's greater than or equal to MaxInt32 + if common.IsGreaterThanOrEqualInt32(int64(vol)) { + return common.RetError, fmt.Errorf("voltage value out of range(max is int32), "+ + "card_id(%d) and device_id(%d), voltage: %d", cardID, deviceID, int64(vol)) + } + + return float32(vol) * common.ReduceOnePercent, nil +} + +// DcGetDevicePowerInfo the accuracy is 0.1w, the result like: 8.2 +func (d *DcManager) DcGetDevicePowerInfo(cardID, deviceID int32) (float32, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.RetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var cpower C.int + if retCode := C.dcmi_get_device_power_info(C.int(cardID), C.int(deviceID), + &cpower); int32(retCode) != common.Success { + return common.RetError, fmt.Errorf("failed to obtain the power based on card_id(%d) and device_id(%d)"+ + ", error code: %d", cardID, deviceID, int32(retCode)) + } + parsedPower := float32(cpower) + if parsedPower < 0 { + return common.RetError, fmt.Errorf("get wrong device power, card_id(%d) and device_id(%d), power: %f", + cardID, deviceID, parsedPower) + } + + return parsedPower * 
common.ReduceTenth, nil + +} + +// DcGetDeviceFrequency get device frequency, unit MHz +// Ascend910B with frequency type: 2,6,7,9 +// Ascend910 with frequency type: 2,6,7,9 +// Ascend310 with frequency type: 1,2,6,7,9 +// Ascend310P with frequency type: 1,2,7,9,12 +// more information see common.DeviceType +func (d *DcManager) DcGetDeviceFrequency(cardID, deviceID int32, devType common.DeviceType) (uint32, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.UnRetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var cFrequency C.uint + if retCode := C.dcmi_get_device_frequency(C.int(cardID), C.int(deviceID), C.enum_dcmi_freq_type(devType.Code), + &cFrequency); int32(retCode) != common.Success { + return common.UnRetError, + buildDcmiErr(cardID, deviceID, fmt.Sprintf("frequency (name: %v, code:%d)", devType.Name, devType.Code), retCode) + } + // check whether cFrequency is too big + if common.IsGreaterThanOrEqualInt32(int64(cFrequency)) || int64(cFrequency) < 0 { + return common.UnRetError, fmt.Errorf("frequency value out of range [0, int32),card_id(%d) and device_id(%d), "+ + "frequency (name: %v, code:%d): %d", cardID, deviceID, devType.Name, devType.Code, int64(cFrequency)) + } + return uint32(cFrequency), nil +} + +// DcGetMemoryInfo use v3 interface to query memory info +func (d *DcManager) DcGetMemoryInfo(cardID, deviceID int32) (*common.MemoryInfo, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return nil, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var cmInfoV3 CDcmiMemoryInfoV3 + if retCode := C.dcmi_get_device_memory_info_v3(C.int(cardID), C.int(deviceID), + &cmInfoV3); int32(retCode) != common.Success { + return nil, fmt.Errorf("failed to obtain the memory info by v3 interface based on card_id("+ + "%d) and device_id(%d), error code: %d", cardID, deviceID, int32(retCode)) + } + + if uint64(cmInfoV3.memory_size) < 
uint64(cmInfoV3.memory_available) { + return nil, fmt.Errorf("failed to obtain the memory info by v3 interface based on card_id("+ + "%d) and device_id(%d), total memory is less than available memory", cardID, deviceID) + } + + return &common.MemoryInfo{ + MemorySize: uint64(cmInfoV3.memory_size), + MemoryAvailable: uint64(cmInfoV3.memory_available), + Frequency: uint32(cmInfoV3.freq), + Utilization: uint32(cmInfoV3.utiliza), + }, nil + +} + +// FuncDcmiGetDeviceHbmInfo dcmi_get_device_hbm_info function for outer invoke, only for Ascend910 +func FuncDcmiGetDeviceHbmInfo(cardID, deviceID int32) (*common.HbmInfo, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return nil, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var cHbmInfo C.struct_dcmi_hbm_info + if retCode := C.dcmi_get_device_hbm_info(C.int(cardID), C.int(deviceID), + &cHbmInfo); int32(retCode) != common.Success { + return nil, buildDcmiErr(cardID, deviceID, "high bandwidth memory info", retCode) + } + hbmTemp := int32(cHbmInfo.temp) + if hbmTemp < 0 { + return nil, fmt.Errorf("get wrong device HBM temporary, card_id(%d) and device_id(%d), HBM.temp: %d", + cardID, deviceID, hbmTemp) + } + return &common.HbmInfo{ + MemorySize: uint64(cHbmInfo.memory_size), + Frequency: uint32(cHbmInfo.freq), + Usage: uint64(cHbmInfo.memory_usage), + Temp: hbmTemp, + BandWidthUtilRate: uint32(cHbmInfo.bandwith_util_rate)}, nil +} + +// DcGetHbmInfo get HBM information A310/A310P not support +func (d *DcManager) DcGetHbmInfo(cardID, deviceID int32) (*common.HbmInfo, error) { + return &common.HbmInfo{ + MemorySize: 0, + Frequency: 0, + Usage: 0, + Temp: 0, + BandWidthUtilRate: 0}, nil +} + +// DcGetDeviceErrorCode get the error count and errorcode of the device,only return the first errorcode +func (d *DcManager) DcGetDeviceErrorCode(cardID, deviceID int32) (int32, int64, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.RetError, 
common.RetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, + deviceID) + } + var errCount C.int + var errCodeArray [common.MaxErrorCodeCount]C.uint + if retCode := C.dcmi_get_device_errorcode_v2(C.int(cardID), C.int(deviceID), &errCount, &errCodeArray[0], + common.MaxErrorCodeCount); int32(retCode) != common.Success { + return common.RetError, common.RetError, fmt.Errorf("failed to obtain the device errorcode based on "+ + "card_id(%d) and device_id(%d), error code: %d, error count: %d", cardID, deviceID, int32(retCode), + int32(errCount)) + } + + if int32(errCount) < 0 || int32(errCount) > common.MaxErrorCodeCount { + return common.RetError, common.RetError, fmt.Errorf("get wrong errorcode count, "+ + "card_id(%d) and device_id(%d), errorcode count: %d", cardID, deviceID, int32(errCount)) + } + + return int32(errCount), int64(errCodeArray[0]), nil +} + +// DcGetDeviceCount get device count +func (d *DcManager) DcGetDeviceCount() (int32, error) { + devNum, _, err := d.DcGetLogicIDList() + if err != nil { + return common.RetError, fmt.Errorf("get device count failed, error: %v", err) + } + return devNum, nil +} + +// DcGetLogicIDList get device logic id list +func (d *DcManager) DcGetLogicIDList() (int32, []int32, error) { + logicIDs := make([]int32, 0) + var totalNum int32 + _, cardList, err := d.DcGetCardList() + if err != nil { + return common.RetError, logicIDs, fmt.Errorf("get card list failed, error: %v", err) + } + for _, cardID := range cardList { + devNumInCard, err := d.DcGetDeviceNumInCard(cardID) + if err != nil { + return common.RetError, logicIDs, fmt.Errorf("get device num by cardID: %d failed, error: %v", + cardID, err) + } + totalNum += devNumInCard + if totalNum > common.HiAIMaxDeviceNum*common.HiAIMaxCardNum { + return common.RetError, nil, fmt.Errorf("get device num: %d greater than %d", + totalNum, common.HiAIMaxDeviceNum*common.HiAIMaxCardNum) + } + for devID := int32(0); devID < devNumInCard; devID++ { + logicID, err := 
d.DcGetDeviceLogicID(cardID, devID) + if err != nil { + return common.RetError, nil, fmt.Errorf("get device (cardID: %d, deviceID: %d) logic id "+ + "failed, error: %v", cardID, devID, err) + } + logicIDs = append(logicIDs, logicID) + } + } + return totalNum, logicIDs, nil +} + +// DcGetDeviceHealth get device health +func (d *DcManager) DcGetDeviceHealth(cardID, deviceID int32) (int32, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.RetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var health C.uint + if retCode := C.dcmi_get_device_health(C.int(cardID), C.int(deviceID), + &health); int32(retCode) != common.Success { + return common.RetError, fmt.Errorf("get device (cardID: %d, deviceID: %d) health state failed, ret "+ + "code: %d, health code: %d", cardID, deviceID, int32(retCode), int64(health)) + } + if common.IsGreaterThanOrEqualInt32(int64(health)) { + return common.RetError, fmt.Errorf("get wrong health state , device (cardID: %d, deviceID: %d) "+ + "health: %d", cardID, deviceID, int64(health)) + } + return int32(health), nil +} + +// DcGetDeviceUtilizationRate get device utils rate by id +func (d *DcManager) DcGetDeviceUtilizationRate(cardID, deviceID int32, devType common.DeviceType) (int32, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.RetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var rate C.uint + if retCode := C.dcmi_get_device_utilization_rate(C.int(cardID), C.int(deviceID), C.int(devType.Code), + &rate); int32(retCode) != common.Success { + return common.RetError, + buildDcmiErr(cardID, deviceID, fmt.Sprintf("utilization (name: %v, code:%d)", devType.Name, devType.Code), retCode) + } + if !common.IsValidUtilizationRate(uint32(rate)) { + return common.RetError, fmt.Errorf("get wrong device (cardID: %d, deviceID: %d) "+ + "utilization (name: %v, code:%d): %d", cardID, deviceID, devType.Name, devType.Code, 
uint32(rate)) + } + return int32(rate), nil +} + +// DcGetDeviceTemperature get the device temperature +func (d *DcManager) DcGetDeviceTemperature(cardID, deviceID int32) (int32, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.RetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var temp C.int + if retCode := C.dcmi_get_device_temperature(C.int(cardID), C.int(deviceID), + &temp); int32(retCode) != common.Success { + return common.RetError, fmt.Errorf("get device (cardID: %d, deviceID: %d) temperature failed, error "+ + "code is : %d", cardID, deviceID, int32(retCode)) + } + parsedTemp := int32(temp) + if parsedTemp < int32(common.DefaultTemperatureWhenQueryFailed) { + return common.RetError, fmt.Errorf("get wrong device temperature, devcie (cardID: %d, deviceID: %d), "+ + "temperature: %d", cardID, deviceID, parsedTemp) + } + return parsedTemp, nil +} + +func convertUCharToCharArr(cgoArr [maxChipNameLen]C.uchar) []byte { + var charArr []byte + for _, v := range cgoArr { + if v == 0 { + break + } + charArr = append(charArr, byte(v)) + } + return charArr +} + +// DcGetChipInfo get the chip info by cardID and deviceID +func (d *DcManager) DcGetChipInfo(cardID, deviceID int32) (*common.ChipInfo, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return nil, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var chipInfo C.struct_dcmi_chip_info_v2 + chip := &common.ChipInfo{} + if rCode := C.dcmi_get_device_chip_info_v2(C.int(cardID), C.int(deviceID), &chipInfo); int32(rCode) != common.Success { + hwlog.RunLog.Debugf("get device ChipInfo information failed, cardID(%d), deviceID(%d),"+ + " error code: %d", cardID, deviceID, int32(rCode)) + var oldChipInfo C.struct_dcmi_chip_info + if rCode = C.dcmi_get_device_chip_info(C.int(cardID), C.int(deviceID), &oldChipInfo); int32(rCode) != common.Success { + return nil, fmt.Errorf("get device ChipInfo information failed, 
cardID(%d), deviceID(%d),"+ + " error code: %d", cardID, deviceID, int32(rCode)) + } + chip.Name = string(convertUCharToCharArr(oldChipInfo.chip_name)) + chip.Type = string(convertUCharToCharArr(oldChipInfo.chip_type)) + chip.Version = string(convertUCharToCharArr(oldChipInfo.chip_ver)) + chip.AICoreCnt = int(oldChipInfo.aicore_cnt) + } else { + chip.Name = string(convertUCharToCharArr(chipInfo.chip_name)) + chip.Type = string(convertUCharToCharArr(chipInfo.chip_type)) + chip.Version = string(convertUCharToCharArr(chipInfo.chip_ver)) + chip.AICoreCnt = int(chipInfo.aicore_cnt) + chip.NpuName = string(convertUCharToCharArr(chipInfo.npu_name)) + } + if !common.IsValidChipInfo(chip) { + return nil, fmt.Errorf("get device ChipInfo information failed, chip info is empty,"+ + " cardID(%d), deviceID(%d)", cardID, deviceID) + } + + return chip, nil +} + +// DcGetPhysicIDFromLogicID get physicID from logicID +func (d *DcManager) DcGetPhysicIDFromLogicID(logicID int32) (int32, error) { + if !common.IsValidLogicIDOrPhyID(logicID) { + return common.RetError, fmt.Errorf("logicID(%d) is invalid", logicID) + } + var physicID C.uint + if rCode := C.dcmi_get_device_phyid_from_logicid(C.uint(logicID), &physicID); int32(rCode) != common.Success { + return common.RetError, fmt.Errorf("get physic id from logicID(%d) failed, error code: %d", logicID, int32(rCode)) + } + if !common.IsValidLogicIDOrPhyID(int32(physicID)) { + return common.RetError, fmt.Errorf("get wrong physicID(%d) from logicID(%d)", uint32(physicID), logicID) + } + return int32(physicID), nil +} + +// DcGetDeviceIPAddress get device IP address by cardID and deviceID +func (d *DcManager) DcGetDeviceIPAddress(cardID, deviceID, ipType int32) (string, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return "", fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var portType C.enum_dcmi_port_type = 1 + var portID C.int + var ipAddress C.struct_dcmi_ip_addr + var maskAddress 
C.struct_dcmi_ip_addr + if ipType == ipAddrTypeV6 { + ipAddress.ip_type = ipAddrTypeV6 + } + rCode := C.dcmi_get_device_ip(C.int(cardID), C.int(deviceID), portType, portID, &ipAddress, &maskAddress) + if int32(rCode) != common.Success { + return "", fmt.Errorf("get device IP address failed, cardID(%d), deviceID(%d), error code: %d", + cardID, deviceID, int32(rCode)) + } + if ipType == ipAddrTypeV6 { + return d.buildIPv6Addr(ipAddress) + } + return d.buildIPv4Addr(ipAddress) +} + +func (d *DcManager) buildIPv4Addr(ipAddress C.struct_dcmi_ip_addr) (string, error) { + deviceIP := make([]string, 0, net.IPv4len) + for key, val := range ipAddress.u_addr { + if key >= net.IPv4len { + break + } + deviceIP = append(deviceIP, fmt.Sprintf("%v", val)) + } + if netIP := net.ParseIP(strings.Join(deviceIP, ".")); netIP != nil { + return netIP.String(), nil + } + return "", fmt.Errorf("the device IPv4 address is invalid, value: %v", deviceIP) +} + +func (d *DcManager) buildIPv6Addr(ipAddress C.struct_dcmi_ip_addr) (string, error) { + deviceIP := make([]byte, 0, net.IPv6len) + for key, val := range ipAddress.u_addr { + if key >= net.IPv6len { + break + } + deviceIP = append(deviceIP, byte(val)) + } + if netIP := net.IP(deviceIP); netIP != nil { + return netIP.String(), nil + } + return "", fmt.Errorf("the device IPv6 address is invalid, value: %v", deviceIP) +} + +func callDcmiGetDeviceNetworkHealth(cardID, deviceID int32, result chan<- common.DeviceNetworkHealth) { + var healthCode C.enum_dcmi_rdfx_detect_result + rCode := C.dcmi_get_device_network_health(C.int(cardID), C.int(deviceID), &healthCode) + result <- common.DeviceNetworkHealth{HealthCode: uint32(healthCode), RetCode: int32(rCode)} +} + +// DcGetDeviceNetWorkHealth get device network health by cardID and deviceID +func (d *DcManager) DcGetDeviceNetWorkHealth(cardID, deviceID int32) (uint32, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.UnRetError, fmt.Errorf("cardID(%d) or 
deviceID(%d) is invalid", cardID, deviceID) + } + + result := make(chan common.DeviceNetworkHealth, 1) + go callDcmiGetDeviceNetworkHealth(cardID, deviceID, result) + select { + case res := <-result: + if res.RetCode != common.Success { + return common.UnRetError, fmt.Errorf("get device network healthCode failed, cardID(%d),"+ + " deviceID(%d), ret code: %d, health code: %d", cardID, deviceID, res.RetCode, res.HealthCode) + } + + if int32(res.HealthCode) < 0 || int32(res.HealthCode) > int32(math.MaxInt8) { + return common.UnRetError, fmt.Errorf("get wrong device network healthCode, cardID(%d), deviceID(%d),"+ + " error healthCode: %d", cardID, deviceID, int32(res.HealthCode)) + } + + return res.HealthCode, nil + // dcmi_get_device_network_health is occasionally blocked for a long time, because of retrying, + // after the card dropped. This method is used to interrupt the execution of the dcmi interface, + // if invoking time exceeds 1 second. + case <-time.After(common.DcmiApiTimeout * time.Second): + return common.UnRetError, fmt.Errorf("accessing dcmi_get_device_network_health interface timeout, "+ + "cardID(%d), deviceID(%d)", cardID, deviceID) + } +} + +// DcGetLogicIDFromPhysicID get logicID from physicID +func (d *DcManager) DcGetLogicIDFromPhysicID(physicID int32) (int32, error) { + if !common.IsValidLogicIDOrPhyID(physicID) { + return common.RetError, fmt.Errorf("physicID(%d) is invalid", physicID) + } + var logicID C.uint + if rCode := C.dcmi_get_device_logicid_from_phyid(C.uint(physicID), &logicID); int32(rCode) != common.Success { + return common.RetError, fmt.Errorf("get logicID from physicID(%d) failed, error code: %d", + physicID, int32(rCode)) + } + + if !common.IsValidLogicIDOrPhyID(int32(logicID)) { + return common.RetError, fmt.Errorf("get wrong logicID(%d) from physicID(%d)", uint32(logicID), physicID) + } + return int32(logicID), nil +} + +// FuncDcmiMcuGetPowerInfo dcmi_mcu_get_power_info_new function for outer invoke +func
FuncDcmiMcuGetPowerInfo(cardID int32) (float32, error) { + var power C.int + if retCode := C.dcmi_mcu_get_power_info_new(C.int(cardID), &power); int32(retCode) != common.Success { + return common.RetError, fmt.Errorf("mcu_get_power_info failed, error code is:%d", int32(retCode)) + } + parsedPower := float32(power) + if parsedPower < 0 { + return common.RetError, fmt.Errorf("get wrong mcu_get_power_info, cardID: %d, power: %f", cardID, + parsedPower) + } + return parsedPower * common.ReduceTenth, nil +} + +// DcGetMcuPowerInfo this function is only for Ascend310P, A910/A310 not support +func (d *DcManager) DcGetMcuPowerInfo(cardID int32) (float32, error) { + return 0, nil +} + +// DcGetProductType get product type by dcmi interface +func (d *DcManager) DcGetProductType(cardID, deviceID int32) (string, error) { + cProductType := C.CString(string(make([]byte, productTypeLen))) + defer C.free(unsafe.Pointer(cProductType)) + err := C.dcmi_get_product_type(C.int(cardID), C.int(deviceID), (*C.char)(cProductType), productTypeLen+1) + if err != 0 { + return "", fmt.Errorf("get product type failed, errCode: %d", int32(err)) + } + return C.GoString(cProductType), nil +} + +// DcGetNpuWorkMode get npu work mode, this function is only for Ascend910, A310/310P not support +func (d *DcManager) DcGetNpuWorkMode(cardID int32) (int, error) { + var cWorkMode C.uchar + err := C.dcmi_get_npu_work_mode(C.int(cardID), &cWorkMode) + if err != 0 { + return common.RetError, fmt.Errorf("get npu work mode failed, errCode: %d", int32(err)) + } + return int(cWorkMode), nil +} + +// DcSetDeviceReset reset spec device chip +func (d *DcManager) DcSetDeviceReset(cardID, deviceID int32) error { + var channelType C.enum_dcmi_reset_channel = C.INBAND_CHANNEL + return d.setDeviceReset(cardID, deviceID, channelType) +} + +// DcGetBrotherCardID get brother card id +func (d *DcManager) DcGetBrotherCardID(cardID, deviceID int32) (int32, error) { + var broCardID C.int + errCode := 
C.dcmi_get_netdev_brother_device(C.int(cardID), C.int(deviceID), &broCardID) + if errCode != common.Success { + return common.RetError, fmt.Errorf("unable to get brother card, errCode: %v", errCode) + } + return int32(broCardID), nil +} + +// DcGetOutBandChannelState get out band channel state +func (d *DcManager) DcGetOutBandChannelState(cardID, deviceID int32) error { + var channelState C.int + errCode := C.dcmi_get_device_outband_channel_state(C.int(cardID), C.int(deviceID), &channelState) + if errCode != common.Success { + return fmt.Errorf("get out band channel state error, errCode: %v", errCode) + } + if channelState != common.ChannelStateOk { + return fmt.Errorf("chip reset not support, channel state: %v", channelState) + } + return nil +} + +// DcPreResetSoc pre reset soc, used before reset out band +func (d *DcManager) DcPreResetSoc(cardID, deviceID int32) error { + errCode := C.dcmi_pre_reset_soc(C.int(cardID), C.int(deviceID)) + if errCode != common.Success { + return fmt.Errorf("pre reset failed, cardID: %v, deviceID: %v, errCode: %v", cardID, deviceID, errCode) + } + return nil +} + +// DcSetDeviceResetOutBand reset spec device chip out band +func (d *DcManager) DcSetDeviceResetOutBand(cardID, deviceID int32) error { + var channelType C.enum_dcmi_reset_channel = C.OUTBAND_CHANNEL + return d.setDeviceReset(cardID, deviceID, channelType) +} + +// DcRescanSoc trigger soc rescan, non-blocking +func (d *DcManager) DcRescanSoc(cardID, deviceID int32) error { + errCode := C.dcmi_rescan_soc(C.int(cardID), C.int(deviceID)) + if errCode != common.Success { + return fmt.Errorf("fail to rescan chip cardID %d, deviceID %v, errCode: %v", cardID, deviceID, errCode) + } + return nil +} + +func (d *DcManager) setDeviceReset(cardID, deviceID int32, channelType C.enum_dcmi_reset_channel) error { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + if errCode := 
C.dcmi_set_device_reset(C.int(cardID), C.int(deviceID), channelType); errCode != 0 { + return fmt.Errorf("cardID(%d) and deviceID(%d) hot reset errCode: %v", cardID, deviceID, errCode) + } + return nil +} + +// DcGetDeviceBootStatus get NPU boot status +func (d *DcManager) DcGetDeviceBootStatus(logicID int32) (int, error) { + if !common.IsValidLogicIDOrPhyID(logicID) { + return common.RetError, fmt.Errorf("input invalid logicID: %d", logicID) + } + cardID, deviceID, err := d.DcGetCardIDDeviceID(logicID) + if err != nil { + return common.RetError, fmt.Errorf("failed to get cardID and deviceID by logicID(%d)", logicID) + } + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.RetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var bootStatus C.enum_dcmi_boot_status = C.DCMI_BOOT_STATUS_FINISH + if errCode := C.dcmi_get_device_boot_status(C.int(cardID), C.int(deviceID), &bootStatus); errCode != 0 { + return common.RetError, fmt.Errorf("device boot status errCode: %v", errCode) + } + return int(bootStatus), nil +} + +// DcGetDeviceAllErrorCode get the error count and all error codes of the device +func (d *DcManager) DcGetDeviceAllErrorCode(cardID, deviceID int32) (int32, []int64, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.RetError, nil, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, + deviceID) + } + var errCount C.int + var errCodeArray [common.MaxErrorCodeCount]C.uint + retCode := C.dcmi_get_device_errorcode_v2(C.int(cardID), C.int(deviceID), &errCount, &errCodeArray[0], + common.MaxErrorCodeCount) + + var health C.uint + healthRetCode := C.dcmi_get_device_health(C.int(cardID), C.int(deviceID), &health) + + if int32(retCode) != common.Success && int32(healthRetCode) != common.DeviceNotReadyErrCode { + return common.RetError, nil, fmt.Errorf("failed to obtain the device errorcode based on cardID("+ + "%d) and deviceID(%d), error code: %d, error count: %d", 
cardID, deviceID, int32(retCode), int32(errCount)) + } + + errCodes := make([]int64, 0, len(errCodeArray)) + for _, errCode := range errCodeArray { + if int64(errCode) != 0 { + errCodes = append(errCodes, int64(errCode)) + } + } + + if int32(healthRetCode) == common.DeviceNotReadyErrCode { + hwlog.RunLog.Errorf("device errorcode v2 ret code: %d, device health ret code: %d, device not ready, "+ + "maybe a card drop fault occurred on cardID(%d) and deviceID(%d)", int32(retCode), int32(healthRetCode), + cardID, deviceID) + errCount += 1 + errCodes = append(errCodes, common.CardDropFaultCode) + } + + if int32(errCount) < 0 || int32(errCount) > common.MaxErrorCodeCount { + return common.RetError, nil, fmt.Errorf("get wrong errorcode count, "+ + "cardID(%d) and deviceID(%d), errorcode count: %d", cardID, deviceID, int32(errCount)) + } + + return int32(errCount), errCodes, nil +} + +// DcSubscribeDeviceFaultEvent subscribe device fault, callback with func 'faultEventCallFunc' +func (d *DcManager) DcSubscribeDeviceFaultEvent(cardID, deviceID int32) error { + if faultEventCallFunc == nil { + return errors.New("callFunc is invalid, can't start subscribe") + } + + var filter C.struct_dcmi_event_filter + if rCode := C.dcmi_subscribe_fault_event(C.int(cardID), C.int(deviceID), filter); int32(rCode) != common.Success { + return fmt.Errorf("subscribe fault event failed, cardID(%d) and deviceID(%d), error code: %d", + cardID, deviceID, int32(rCode)) + } + return nil +} + +// DcSetFaultEventCallFunc set fault event call back func +func (d *DcManager) DcSetFaultEventCallFunc(businessFunc func(common.DevFaultInfo)) { + faultEventCallFunc = businessFunc +} + +//export goEventFaultCallBack +func goEventFaultCallBack(event C.struct_dcmi_dms_fault_event) { + if faultEventCallFunc == nil { + hwlog.RunLog.Errorf("no fault event call back func") + return + } + // recovery event recorded fault event occurrence time, the recovery event time cannot be obtained. 
+ // Therefore, all event occurrence time is recorded as the current host time when the event is received. + devFaultInfo := common.DevFaultInfo{ + EventID: int64(event.event_id), + LogicID: int32(event.deviceid), + ModuleType: int8(event.node_type), + ModuleID: int8(event.node_id), + SubModuleType: int8(event.sub_node_type), + SubModuleID: int8(event.sub_node_id), + Severity: int8(event.severity), + Assertion: int8(event.assertion), + AlarmRaisedTime: time.Now().UnixMilli(), + } + faultEventCallFunc(devFaultInfo) +} + +// DcGetDieID get chip die ID, like VDieID or NDieID, only Ascend910 has NDieID +func (d *DcManager) DcGetDieID(cardID, deviceID int32, dcmiDieType DieType) (string, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return "", fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + + if dcmiDieType != VDIE && dcmiDieType != NDIE { + return "", fmt.Errorf("dcmi die type can only be one of %d or %d", VDIE, NDIE) + } + + var dieIDObj C.struct_dcmi_die_id + if retCode := C.dcmi_get_device_die_v2(C.int(cardID), C.int(deviceID), + C.enum_dcmi_die_type(dcmiDieType), &dieIDObj); int32(retCode) != common.Success { + return "", buildDcmiErr(cardID, deviceID, "chip die ID", retCode) + } + + const hexBase = 16 + dieIDStr := make([]string, DieIDCount) + + hwlog.RunLog.Debugf("cardID(%d), deviceID(%d) get die type(%d) value %v", cardID, deviceID, dcmiDieType, + dieIDObj.soc_die) + for i := 0; i < DieIDCount; i++ { + s := strconv.FormatUint(uint64(dieIDObj.soc_die[i]), hexBase) + // Each part of the die id consists of 8 characters, and if the length is not enough, + // zero is added at the beginning + dieIDStr[i] = fmt.Sprintf("%08s", s) + } + return strings.ToUpper(strings.Join(dieIDStr, "-")), nil +} + +// DcGetDevProcessInfo chip process info +func (d *DcManager) DcGetDevProcessInfo(cardID, deviceID int32) (*common.DevProcessInfo, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return nil, 
fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + + var procList [common.MaxProcNum]C.struct_dcmi_proc_mem_info + var procNum C.int + + if retCode := C.dcmi_get_device_resource_info(C.int(cardID), C.int(deviceID), &procList[0], + &procNum); int32(retCode) != common.Success { + return nil, buildDcmiErr(cardID, deviceID, "device resource", retCode) + } + + if int32(procNum) < 0 || int32(procNum) > common.MaxProcNum { + return nil, fmt.Errorf("get invalid proccess num (%d), cardID(%d) and deviceID(%d)", int32(procNum), cardID, + deviceID) + } + + return convertToDevResourceInfo(procList, int32(procNum)), nil +} + +func convertToDevResourceInfo(procList [common.MaxProcNum]C.struct_dcmi_proc_mem_info, + procNum int32) *common.DevProcessInfo { + if procNum < 0 || procNum > common.MaxProcNum { + hwlog.RunLog.Errorf("process num %v is not within in the range [0~%v]", procNum, common.MaxProcNum) + return nil + } + + info := new(common.DevProcessInfo) + if procNum == 0 { + return info + } + + info.ProcNum = procNum + for i := int32(0); i < procNum; i++ { + proc := common.DevProcInfo{ + Pid: int32(procList[i].proc_id), + MemUsage: float64(procList[i].proc_mem_usage) / common.UnitMB, // convert byte to MB + } + info.DevProcArray = append(info.DevProcArray, proc) + } + + return info +} + +// DcGetPCIeBusInfo pcie bus info +func (d *DcManager) DcGetPCIeBusInfo(cardID, deviceID int32) (string, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return "", fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + + var pcieInfo C.struct_dcmi_pcie_info_all + + if retCode := C.dcmi_get_device_pcie_info_v2(C.int(cardID), + C.int(deviceID), &pcieInfo); int32(retCode) != common.Success { + return "", buildDcmiErr(cardID, deviceID, "pcie bus", retCode) + } + + info := fmt.Sprintf("%04X:%02X:%02X.%-4X", int32(pcieInfo.domain), uint32(pcieInfo.bdf_busid), + uint32(pcieInfo.bdf_deviceid), uint32(pcieInfo.bdf_funcid)) + 
hwlog.RunLog.Debugf("pcie bus info is: '%s'", info) + + return strings.TrimRight(info, " "), nil +} + +// DcGetDeviceBoardInfo return board info of device +func (d *DcManager) DcGetDeviceBoardInfo(cardID, deviceID int32) (common.BoardInfo, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.BoardInfo{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + + var cBoardInfo C.struct_dcmi_board_info + + if retCode := C.dcmi_get_device_board_info(C.int(cardID), C.int(deviceID), + &cBoardInfo); int32(retCode) != common.Success { + return common.BoardInfo{}, buildDcmiErr(cardID, deviceID, "board info", retCode) + } + + return common.BoardInfo{ + BoardId: uint32(cBoardInfo.board_id), + PcbId: uint32(cBoardInfo.pcb_id), + BomId: uint32(cBoardInfo.bom_id), + SlotId: uint32(cBoardInfo.slot_id), + }, nil +} + +// DcGetPCIEBandwidth get pcie bandwidth value +func (d *DcManager) DcGetPCIEBandwidth(cardID, deviceID int32, profilingTime int) (common.PCIEBwStat, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.PCIEBwStat{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var dcmiPCIEBandwidth C.struct_dcmi_pcie_link_bandwidth_info + var pcieBandwidth common.PCIEBwStat + dcmiPCIEBandwidth.profiling_time = C.int(profilingTime) + retCode := C.dcmi_get_pcie_link_bandwidth_info(C.int(cardID), C.int(deviceID), &dcmiPCIEBandwidth) + if int32(retCode) != common.Success { + return pcieBandwidth, buildDcmiErr(cardID, deviceID, "PCIEBandwidth", retCode) + } + + pcieBandwidth.PcieRxPBw = d.convertPcieBw(dcmiPCIEBandwidth.rx_p_bw) + pcieBandwidth.PcieRxNPBw = d.convertPcieBw(dcmiPCIEBandwidth.rx_np_bw) + pcieBandwidth.PcieRxCPLBw = d.convertPcieBw(dcmiPCIEBandwidth.rx_cpl_bw) + + pcieBandwidth.PcieTxPBw = d.convertPcieBw(dcmiPCIEBandwidth.tx_p_bw) + pcieBandwidth.PcieTxNPBw = d.convertPcieBw(dcmiPCIEBandwidth.tx_np_bw) + pcieBandwidth.PcieTxCPLBw = 
d.convertPcieBw(dcmiPCIEBandwidth.tx_cpl_bw) + + return pcieBandwidth, nil +} + +func (d *DcManager) convertPcieBw(pcieBwArr [agentdrvProfDataNum]C.uint) common.PcieStatValue { + return common.PcieStatValue{ + PcieMinBw: int32(pcieBwArr[0]), + PcieMaxBw: int32(pcieBwArr[1]), + PcieAvgBw: int32(pcieBwArr[agentdrvProfDataNum-1]), + } +} + +// DcGetDcmiVersion return dcmi version +func (d *DcManager) DcGetDcmiVersion() (string, error) { + cDcmiVer := C.CString(string(make([]byte, dcmiVersionLen))) + defer C.free(unsafe.Pointer(cDcmiVer)) + if retCode := C.dcmi_get_dcmi_version((*C.char)(cDcmiVer), dcmiVersionLen+1); int32(retCode) != common.Success { + return "", fmt.Errorf("get dcmi version failed, errCode: %d", int32(retCode)) + } + return C.GoString(cDcmiVer), nil +} + +// DcGetDeviceEccInfo get ECC info +func (d *DcManager) DcGetDeviceEccInfo(cardID, deviceID int32, inputType common.DcmiDeviceType) ( + *common.ECCInfo, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return nil, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + dcmiDeviceType, err := d.getInputType(inputType) + if err != nil { + return nil, err + } + var deviceEccInfo C.struct_dcmi_ecc_info + if retCode := C.dcmi_get_device_ecc_info(C.int(cardID), C.int(deviceID), dcmiDeviceType, + &deviceEccInfo); retCode != 0 { + return nil, buildDcmiErr(cardID, deviceID, "dcmi device ECC", retCode) + } + eccInfo := &common.ECCInfo{ + EnableFlag: int32(deviceEccInfo.enable_flag), + SingleBitErrorCnt: int64(deviceEccInfo.single_bit_error_cnt), + DoubleBitErrorCnt: int64(deviceEccInfo.double_bit_error_cnt), + TotalSingleBitErrorCnt: int64(deviceEccInfo.total_single_bit_error_cnt), + TotalDoubleBitErrorCnt: int64(deviceEccInfo.total_double_bit_error_cnt), + SingleBitIsolatedPagesCnt: int64(deviceEccInfo.single_bit_isolated_pages_cnt), + DoubleBitIsolatedPagesCnt: int64(deviceEccInfo.double_bit_isolated_pages_cnt), + } + return eccInfo, nil +} + +// 
DcGetHccsStatisticInfo get HCCS statistic info +func (d *DcManager) DcGetHccsStatisticInfo(cardID, deviceID int32) (common.HccsStatisticInfo, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.HccsStatisticInfo{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var cMainCmd = C.enum_dcmi_main_cmd(MainCmdHccs) + subCmd := HccsSubCmdGetStatisticInfo + var hccsStatisticInfo C.struct_dcmi_hccs_statistic_info + // Use a secure function to get the address (for cleanCode) + addr, err := getAddrWithOffset(unsafe.Pointer(&hccsStatisticInfo), unsafe.Sizeof(hccsStatisticInfo), 0) + if err != nil { + return common.HccsStatisticInfo{}, fmt.Errorf("get hccsStatisticInfo addr failed, error is: %v", err) + } + size := C.uint(unsafe.Sizeof(hccsStatisticInfo)) + if retCode := C.dcmi_get_device_info(C.int(cardID), C.int(deviceID), cMainCmd, C.uint(subCmd), + addr, &size); int32(retCode) != common.Success { + return common.HccsStatisticInfo{}, buildDcmiErr(cardID, deviceID, "hccs statistic", retCode) + } + return convertHccsStatisticInfoStruct(hccsStatisticInfo), nil +} + +// DcGetHccsStatisticInfoU64 get HCCS statistic info +func (d *DcManager) DcGetHccsStatisticInfoU64(cardID, deviceID int32) (common.HccsStatisticInfo, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.HccsStatisticInfo{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var cMainCmd = C.enum_dcmi_main_cmd(MainCmdHccs) + subCmd := HccsSubCmdGetStatisticInfoU64 + var hccsStatisticInfo C.struct_dcmi_hccs_statistic_info_u64 + // Use a secure function to get the address (for cleanCode) + addr, err := getAddrWithOffset(unsafe.Pointer(&hccsStatisticInfo), unsafe.Sizeof(hccsStatisticInfo), 0) + if err != nil { + return common.HccsStatisticInfo{}, fmt.Errorf("get hccsStatisticInfo addr failed, error is: %v", err) + } + size := C.uint(unsafe.Sizeof(hccsStatisticInfo)) + if retCode := 
C.dcmi_get_device_info(C.int(cardID), C.int(deviceID), cMainCmd, C.uint(subCmd), + addr, &size); int32(retCode) != common.Success { + return common.HccsStatisticInfo{}, buildDcmiErr(cardID, deviceID, "hccs statistic", retCode) + } + return convertHccsStatisticInfoStructU64(hccsStatisticInfo), nil +} + +func convertHccsStatisticInfoStruct(hccsStatisticInfo C.struct_dcmi_hccs_statistic_info) common.HccsStatisticInfo { + cgoHccsStatisticInfo := common.HccsStatisticInfo{} + for i := uint32(0); i < dcmiHccsMaxPcsNum; i++ { + cgoHccsStatisticInfo.TxCnt = append(cgoHccsStatisticInfo.TxCnt, uint64(hccsStatisticInfo.tx_cnt[i])) + cgoHccsStatisticInfo.CrcErrCnt = append(cgoHccsStatisticInfo.CrcErrCnt, uint64(hccsStatisticInfo.crc_err_cnt[i])) + cgoHccsStatisticInfo.RxCnt = append(cgoHccsStatisticInfo.RxCnt, uint64(hccsStatisticInfo.rx_cnt[i])) + } + return cgoHccsStatisticInfo +} + +func convertHccsStatisticInfoStructU64(hccsStatisticInfo C.struct_dcmi_hccs_statistic_info_u64) common.HccsStatisticInfo { + cgoHccsStatisticInfo := common.HccsStatisticInfo{} + for i := uint32(0); i < dcmiHccsMaxPcsNum; i++ { + cgoHccsStatisticInfo.TxCnt = append(cgoHccsStatisticInfo.TxCnt, uint64(hccsStatisticInfo.tx_cnt[i])) + cgoHccsStatisticInfo.CrcErrCnt = append(cgoHccsStatisticInfo.CrcErrCnt, uint64(hccsStatisticInfo.crc_err_cnt[i])) + cgoHccsStatisticInfo.RxCnt = append(cgoHccsStatisticInfo.RxCnt, uint64(hccsStatisticInfo.rx_cnt[i])) + } + return cgoHccsStatisticInfo +} + +// DcGetHccsBandwidthInfo get HCCS bandwidth info +func (d *DcManager) DcGetHccsBandwidthInfo(cardID int32, deviceID int32, + profilingTime int) (common.HccsBandwidthInfo, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.HccsBandwidthInfo{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var hccsBandwidthInfo C.struct_dcmi_hccs_bandwidth_info + hccsBandwidthInfo.profiling_time = C.int(profilingTime) + if retCode := 
C.dcmi_get_hccs_link_bandwidth_info(C.int(cardID), C.int(deviceID), + &hccsBandwidthInfo); int32(retCode) != common.Success { + return common.HccsBandwidthInfo{}, buildDcmiErr(cardID, deviceID, "hccs bandwidth", retCode) + } + return convertHccsBandwidthInfoStruct(hccsBandwidthInfo), nil +} + +func convertHccsBandwidthInfoStruct(hccsBandwidthInfo C.struct_dcmi_hccs_bandwidth_info) common.HccsBandwidthInfo { + cgoHccsBWInfo := common.HccsBandwidthInfo{} + cgoHccsBWInfo.ProfilingTime = uint32(hccsBandwidthInfo.profiling_time) + cgoHccsBWInfo.TotalTxbw = float64(hccsBandwidthInfo.total_txbw) + cgoHccsBWInfo.TotalRxbw = float64(hccsBandwidthInfo.total_rxbw) + for i := uint32(0); i < dcmiHccsMaxPcsNum; i++ { + cgoHccsBWInfo.TxBandwidth = append(cgoHccsBWInfo.TxBandwidth, float64(hccsBandwidthInfo.tx_bandwidth[i])) + cgoHccsBWInfo.RxBandwidth = append(cgoHccsBWInfo.RxBandwidth, float64(hccsBandwidthInfo.rx_bandwidth[i])) + } + return cgoHccsBWInfo +} + +// DcGetSioInfo get SIO info +func (d *DcManager) DcGetSioInfo(cardID, deviceID int32) (common.SioCrcErrStatisticInfo, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.SioCrcErrStatisticInfo{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var cMainCmd = C.enum_dcmi_main_cmd(MainCmdSio) + subCmd := SioSubCmdCrcErrStatistics + var sioInfo C.struct_dcmi_sio_crc_err_statistic_info + // Use a secure function to get the address (for cleanCode) + addr, err := getAddrWithOffset(unsafe.Pointer(&sioInfo), unsafe.Sizeof(sioInfo), 0) + if err != nil { + return common.SioCrcErrStatisticInfo{}, fmt.Errorf("get sioInfo addr failed, error is: %v", err) + } + size := C.uint(unsafe.Sizeof(sioInfo)) + if retCode := C.dcmi_get_device_info(C.int(cardID), C.int(deviceID), cMainCmd, C.uint(subCmd), + addr, &size); int32(retCode) != common.Success { + return common.SioCrcErrStatisticInfo{}, buildDcmiErr(cardID, deviceID, "super pod sio", retCode) + } + return 
convertSioInfoStruct(sioInfo), nil +} + +func convertSioInfoStruct(sPodSioInfo C.struct_dcmi_sio_crc_err_statistic_info) common.SioCrcErrStatisticInfo { + cgoSPodSioInfo := common.SioCrcErrStatisticInfo{ + TxErrCnt: int64(sPodSioInfo.tx_error_count), + RxErrCnt: int64(sPodSioInfo.rx_error_count), + } + for i := uint32(0); i < dcmiMaxReserveNum; i++ { + cgoSPodSioInfo.Reserved = append(cgoSPodSioInfo.Reserved, uint32(sPodSioInfo.reserved[i])) + } + return cgoSPodSioInfo +} + +func (d *DcManager) getInputType(inputType common.DcmiDeviceType) (C.enum_dcmi_device_type, error) { + switch inputType { + case common.DcmiDeviceTypeDDR: + return C.DCMI_DEVICE_TYPE_DDR, nil + case common.DcmiDeviceTypeSRAM: + return C.DCMI_DEVICE_TYPE_SRAM, nil + case common.DcmiDeviceTypeHBM: + return C.DCMI_DEVICE_TYPE_HBM, nil + case common.DcmiDeviceTypeNPU: + return C.DCMI_DEVICE_TYPE_NPU, nil + case common.DcmiDeviceTypeNONE: + return C.DCMI_DEVICE_TYPE_NONE, nil + default: + return C.DCMI_DEVICE_TYPE_NONE, fmt.Errorf("invalid input type for getting device ecc info") + } +} + +// Define a safe function to get address offsets (for cleanCode) +func getAddrWithOffset(addr unsafe.Pointer, length uintptr, offset uintptr) (unsafe.Pointer, error) { + if offset > length { + return nil, fmt.Errorf("offset(%d) is invalid, length(%d)", offset, length) + } + return (unsafe.Pointer)(uintptr(addr) + offset), nil +} + +// DcGetDeviceMainBoardInfo return mainboardId of device +func (d *DcManager) DcGetDeviceMainBoardInfo(cardID, deviceID int32) (uint32, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return 0, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var cMainBoardId C.uint + if retCode := C.dcmi_get_mainboard_id(C.int(cardID), C.int(deviceID), + &cMainBoardId); int32(retCode) != common.Success { + return 0, buildDcmiErr(cardID, deviceID, "mainBoardId", retCode) + } + + return uint32(cMainBoardId), nil +} +func buildDcmiErr(cardID, deviceID int32, 
msg string, errCode C.int) error { + errDesc, ok := dcmiErrMap[int32(errCode)] + if !ok { + errDesc = "unknown error code" + } + return fmt.Errorf("cardID(%d),deviceID(%d):get %s info failed,error code: %v,error desc: %v", + cardID, deviceID, msg, errCode, errDesc) +} + +// DcGetSuperPodStatus get super pod status +func (d *DcManager) DcGetSuperPodStatus(cardID, deviceID int32, sdid uint32) (int, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return 0, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + var status C.uint + if retCode := C.dcmi_get_spod_node_status(C.int(cardID), C.int(deviceID), + C.unsigned(sdid), &status); int32(retCode) != common.Success { + return 0, buildDcmiErr(cardID, deviceID, "GetSuperPodStatus", retCode) + } + return int(status), nil +} + +// DcSetSuperPodStatus set super pod status +func (d *DcManager) DcSetSuperPodStatus(cardID, deviceID int32, sdid, status uint32) error { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + if retCode := C.dcmi_set_spod_node_status(C.int(cardID), C.int(deviceID), + C.uint(sdid), C.uint(status)); int32(retCode) != common.Success { + return buildDcmiErr(cardID, deviceID, "DcSetSuperPodStatus", retCode) + } + return nil +} + +// DcGetCardElabelV2 get card elabel information +func (d *DcManager) DcGetCardElabelV2(cardID int32) (common.ElabelInfo, error) { + if !common.IsValidCardID(cardID) { + return common.ElabelInfo{}, fmt.Errorf("cardID(%d) is invalid", cardID) + } + var elabelInfo C.struct_dcmi_elabel_info + if retCode := C.dcmi_get_card_elabel_v2(C.int(cardID), &elabelInfo); int32(retCode) != common.Success { + return common.ElabelInfo{}, fmt.Errorf("cardID(%d): get elabel info failed, error code: %v", cardID, retCode) + } + return common.ElabelInfo{ + ProductName: C.GoString(&elabelInfo.product_name[0]), + Model: C.GoString(&elabelInfo.model[0]), + Manufacturer: 
C.GoString(&elabelInfo.manufacturer[0]), + ManufacturerDate: C.GoString(&elabelInfo.manufacturer_date[0]), + SerialNumber: C.GoString(&elabelInfo.serial_number[0]), + }, nil +} diff --git a/mind-cluster/component/ascend-common/devmanager/dcmi/dcmi_interface_api.h b/mind-cluster/component/ascend-common/devmanager/dcmi/dcmi_interface_api.h new file mode 100644 index 0000000..7ffe468 --- /dev/null +++ b/mind-cluster/component/ascend-common/devmanager/dcmi/dcmi_interface_api.h @@ -0,0 +1,596 @@ +/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __DCMI_INTERFACE_API_H__ +#define __DCMI_INTERFACE_API_H__ + +#ifdef __cplusplus +#if __cplusplus +extern "C" { +#endif +#endif /* __cplusplus */ + +#define DCMIDLLEXPORT static + +#define MAX_CHIP_NAME_LEN 32 // Maximum length of chip name +#define TEMPLATE_NAME_LEN 32 +#define DIE_ID_COUNT 5 // Number of unsigned int elements in a die ID (soc_die[]) +#define AGENTDRV_PROF_DATA_NUM 3 +#define MAX_LENGTH 256 // Maximum length for elabel info fields + +/*----------------------------------------------* + * Structure description * + *----------------------------------------------*/ +struct dcmi_chip_info { + unsigned char chip_type[MAX_CHIP_NAME_LEN]; + unsigned char chip_name[MAX_CHIP_NAME_LEN]; + unsigned char chip_ver[MAX_CHIP_NAME_LEN]; + unsigned int aicore_cnt; +}; + +struct dcmi_chip_info_v2 { + unsigned char chip_type[MAX_CHIP_NAME_LEN]; + unsigned char chip_name[MAX_CHIP_NAME_LEN]; + unsigned char chip_ver[MAX_CHIP_NAME_LEN]; + unsigned int aicore_cnt; + unsigned char npu_name[MAX_CHIP_NAME_LEN]; +}; + +struct dcmi_pcie_info_all { + unsigned int venderid; /* vendor ID */ + unsigned int subvenderid; /* sub-vendor ID */ + unsigned int deviceid; /* device ID */ + unsigned int subdeviceid; /* sub-device ID */ + int domain; + unsigned int bdf_busid; + unsigned int bdf_deviceid; + unsigned int bdf_funcid; + unsigned char reserve[32]; /* the size of dcmi_pcie_info_all is 64 */ +}; + +struct dcmi_die_id { + unsigned int soc_die[DIE_ID_COUNT]; +}; + +struct dcmi_ecc_info { + int enable_flag; + unsigned int single_bit_error_cnt; + unsigned int double_bit_error_cnt; + unsigned int total_single_bit_error_cnt; + unsigned int total_double_bit_error_cnt; + unsigned int single_bit_isolated_pages_cnt; + unsigned int double_bit_isolated_pages_cnt; + unsigned int single_bit_next_isolated_pages_cnt; + unsigned int double_bit_next_isolated_pages_cnt; +}; + +struct dcmi_hbm_info { + unsigned long long memory_size; + unsigned int freq; + unsigned long long memory_usage; + int temp; + unsigned int bandwith_util_rate; +}; 
+ +struct dcmi_get_memory_info_stru { + unsigned long long memory_size; /* unit:MB */ + unsigned long long memory_available; /* free + hugepages_free * hugepagesize */ + unsigned int freq; + unsigned long hugepagesize; /* unit:KB */ + unsigned long hugepages_total; + unsigned long hugepages_free; + unsigned int utiliza; /* ddr memory info usages */ + unsigned char reserve[60]; /* the size of dcmi_memory_info is 96 */ +}; + +enum dcmi_ip_addr_type { + DCMI_IPADDR_TYPE_V4 = 0, /** IPv4 */ + DCMI_IPADDR_TYPE_V6 = 1, /** IPv6 */ + DCMI_IPADDR_TYPE_ANY = 2 /** IPv4+IPv6 ("dual-stack") */ +}; + +struct dcmi_ip_addr { + union { + unsigned char ip6[16]; + unsigned char ip4[4]; + } u_addr; + enum dcmi_ip_addr_type ip_type; +}; + +enum dcmi_unit_type { + NPU_TYPE = 0, + MCU_TYPE = 1, + CPU_TYPE = 2, + INVALID_TYPE = 0xFF +}; + +enum dcmi_rdfx_detect_result { + DCMI_RDFX_DETECT_OK = 0, + DCMI_RDFX_DETECT_SOCK_FAIL = 1, + DCMI_RDFX_DETECT_RECV_TIMEOUT = 2, + DCMI_RDFX_DETECT_UNREACH = 3, + DCMI_RDFX_DETECT_TIME_EXCEEDED = 4, + DCMI_RDFX_DETECT_FAULT = 5, + DCMI_RDFX_DETECT_INIT = 6, + DCMI_RDFX_DETECT_THREAD_ERR = 7, + DCMI_RDFX_DETECT_IP_SET = 8, + DCMI_RDFX_DETECT_MAX = 0xFF +}; + +enum dcmi_port_type { + DCMI_VNIC_PORT = 0, + DCMI_ROCE_PORT = 1, + DCMI_INVALID_PORT +}; + +enum dcmi_main_cmd { + DCMI_MAIN_CMD_DVPP = 0, + DCMI_MAIN_CMD_ISP, + DCMI_MAIN_CMD_TS_GROUP_NUM, + DCMI_MAIN_CMD_CAN, + DCMI_MAIN_CMD_UART, + DCMI_MAIN_CMD_UPGRADE = 5, + DCMI_MAIN_CMD_HCCS = 16, + DCMI_MAIN_CMD_TEMP = 50, + DCMI_MAIN_CMD_SVM = 51, + DCMI_MAIN_CMD_VDEV_MNG, + DCMI_MAIN_CMD_SIO = 56, + DCMI_MAIN_CMD_DEVICE_SHARE = 0x8001, + DCMI_MAIN_CMD_MAX +}; + +enum dcmi_freq_type { + DCMI_FREQ_DDR = 1, + DCMI_FREQ_CTRLCPU = 2, + DCMI_FREQ_HBM = 6, + DCMI_FREQ_AICORE_CURRENT_ = 7, + DCMI_FREQ_AICORE_MAX = 9, + DCMI_FREQ_VECTORCORE_CURRENT = 12 +}; + +enum dcmi_reset_channel { + OUTBAND_CHANNEL = 0, // out-of-band reset + INBAND_CHANNEL // in-band reset +}; + +enum dcmi_boot_status { + 
DCMI_BOOT_STATUS_UNINIT = 0, // not initialized + DCMI_BOOT_STATUS_BIOS, // BIOS starting + DCMI_BOOT_STATUS_OS, // OS starting + DCMI_BOOT_STATUS_FINISH // started +}; + +enum dcmi_device_type { + DCMI_DEVICE_TYPE_DDR, + DCMI_DEVICE_TYPE_SRAM, + DCMI_DEVICE_TYPE_HBM, + DCMI_DEVICE_TYPE_NPU, + DCMI_DEVICE_TYPE_NONE = 0xff +}; + +enum dcmi_event_type { + DCMI_DMS_FAULT_EVENT = 0, +}; + +enum dcmi_die_type { + NDIE, + VDIE +}; + +#define DCMI_VDEV_RES_NAME_LEN 16 +#define DCMI_VDEV_SIZE 20 +#define DCMI_VDEV_FOR_RESERVE 32 +#define DCMI_SOC_SPLIT_MAX 32 +#define DCMI_MAX_EVENT_NAME_LENGTH 256 +#define DCMI_MAX_EVENT_DATA_LENGTH 32 +#define DCMI_EVENT_FILTER_FLAG_EVENT_ID (1UL << 0) +#define DCMI_EVENT_FILTER_FLAG_SERVERITY (1UL << 1) +#define DCMI_EVENT_FILTER_FLAG_NODE_TYPE (1UL << 2) +#define DCMI_MAX_EVENT_RESV_LENGTH 32 +#define HCCS_MAX_PCS_NUM 16 +#define HCCS_RES_PCS_NUM 64 +#define IP_ADDR_LIST_LEN 1024 +#define HCCS_PING_MESH_MAX_NUM 48 +#define ADDR_MAX_LEN 16 + +struct dcmi_base_resource { + unsigned long long token; + unsigned long long token_max; + unsigned long long task_timeout; + unsigned int vfg_id; + unsigned char vip_mode; + unsigned char reserved[DCMI_VDEV_FOR_RESERVE - 1]; /* bytes aligned */ +}; + +/* total types of computing resource */ +struct dcmi_computing_resource { + /* accelerator resource */ + float aic; + float aiv; + unsigned short dsa; + unsigned short rtsq; + unsigned short acsq; + unsigned short cdqm; + unsigned short c_core; + unsigned short ffts; + unsigned short sdma; + unsigned short pcie_dma; + + /* memory resource, MB as unit */ + unsigned long long memory_size; + + /* id resource */ + unsigned int event_id; + unsigned int notify_id; + unsigned int stream_id; + unsigned int model_id; + + /* cpu resource */ + unsigned short topic_schedule_aicpu; + unsigned short host_ctrl_cpu; + unsigned short host_aicpu; + unsigned short device_aicpu; + unsigned short topic_ctrl_cpu_slot; + + /* vnpu resource */ + unsigned int vdev_aicore_utilization; 
+ unsigned long long vdev_memory_total; + unsigned long long vdev_memory_free; + + unsigned char reserved[DCMI_VDEV_FOR_RESERVE-DCMI_VDEV_SIZE]; +}; + +struct dcmi_media_resource { + /* dvpp resource */ + float jpegd; + float jpege; + float vpc; + float vdec; + float pngd; + float venc; + unsigned char reserved[DCMI_VDEV_FOR_RESERVE]; +}; + +struct dcmi_create_vdev_out { + unsigned int vdev_id; + unsigned int pcie_bus; + unsigned int pcie_device; + unsigned int pcie_func; + unsigned int vfg_id; + unsigned char reserved[DCMI_VDEV_FOR_RESERVE]; +}; + +struct dcmi_create_vdev_res_stru { + unsigned int vdev_id; + unsigned int vfg_id; + char template_name[TEMPLATE_NAME_LEN]; + unsigned char reserved[64]; +}; + +struct dcmi_vdev_query_info { + char name[DCMI_VDEV_RES_NAME_LEN]; + unsigned int status; + unsigned int is_container_used; + unsigned int vfid; + unsigned int vfg_id; + unsigned long long container_id; + struct dcmi_base_resource base; + struct dcmi_computing_resource computing; + struct dcmi_media_resource media; +}; + +/* for single search */ +struct dcmi_vdev_query_stru { + unsigned int vdev_id; + struct dcmi_vdev_query_info query_info; +}; + +struct dcmi_soc_free_resource { + unsigned int vfg_num; + unsigned int vfg_bitmap; + struct dcmi_base_resource base; + struct dcmi_computing_resource computing; + struct dcmi_media_resource media; +}; + +struct dcmi_soc_total_resource { + unsigned int vdev_num; + unsigned int vdev_id[DCMI_SOC_SPLIT_MAX]; + unsigned int vfg_num; + unsigned int vfg_bitmap; + struct dcmi_base_resource base; + struct dcmi_computing_resource computing; + struct dcmi_media_resource media; +}; + +struct dcmi_spod_info { + unsigned int sdid; + unsigned int scale_type; + unsigned int super_pod_id; + unsigned int server_id; + unsigned int reserve[8]; +}; + +struct dcmi_dms_fault_event { + unsigned int event_id; /* Event ID */ + unsigned short deviceid; /* Device ID */ + unsigned char node_type; /* Node type */ + unsigned char node_id; /* Node ID 
*/ + unsigned char sub_node_type; /* Subnode type */ + unsigned char sub_node_id; /* Subnode ID */ + unsigned char severity; /* Event severity. 0: warning; 1: minor; 2: major; 3: critical */ + unsigned char assertion; /* Event type. 0: fault recovery; 1: fault generation; 2: one-off event */ + int event_serial_num; /* Alarm serial number */ + int notify_serial_num; /* Notification serial number*/ + /* Time when the event occurs, presenting as the number of seconds that have elapsed since the Unix epoch. */ + unsigned long long alarm_raised_time; + char event_name[DCMI_MAX_EVENT_NAME_LENGTH]; /* Event description */ + char additional_info[DCMI_MAX_EVENT_DATA_LENGTH]; /* Additional event information */ + unsigned char resv[DCMI_MAX_EVENT_RESV_LENGTH]; /**< Reserves 32 bytes */ +}; + +struct dcmi_event { + enum dcmi_event_type type; /* Event type */ + union { + struct dcmi_dms_fault_event dms_event; /* Event content */ + } event_t; +}; + +struct dcmi_event_filter { + /* It can be used to enable one or all filter criteria. The filter criteria are as follows: + 0: disables the filter criteria. + DCMI_EVENT_FILTER_FLAG_EVENT_ID: receives only specified events. + DCMI_EVENT_FILTER_FLAG_SERVERITY: receives only the events of a specified level and higher levels. + DCMI_EVENT_FILTER_FLAG_NODE_TYPE: receives only events of a specified node type. */ + unsigned long long filter_flag; + /* Receives a specified event. For details, see the Health Management Error Definition. */ + unsigned int event_id; + /* Receives events of a specified level and higher levels. For details, + see the severity definition in the struct dcmi_dms_fault_event structure. */ + unsigned char severity; + /* Receives only events of a specified node type. For details, see the Health Management Error Definition. */ + unsigned char node_type; + unsigned char resv[DCMI_MAX_EVENT_RESV_LENGTH]; /* < Reserves 32 bytes. 
*/ +}; + +struct dcmi_proc_mem_info { + int proc_id; + // unit is byte + unsigned long proc_mem_usage; +}; + +struct dcmi_board_info { + unsigned int board_id; + unsigned int pcb_id; + unsigned int bom_id; + unsigned int slot_id; // slot_id indicates pcie slot ID of the chip +}; + +struct dcmi_pcie_link_bandwidth_info { + int profiling_time; + unsigned int tx_p_bw[AGENTDRV_PROF_DATA_NUM]; + unsigned int tx_np_bw[AGENTDRV_PROF_DATA_NUM]; + unsigned int tx_cpl_bw[AGENTDRV_PROF_DATA_NUM]; + unsigned int tx_np_lantency[AGENTDRV_PROF_DATA_NUM]; + unsigned int rx_p_bw[AGENTDRV_PROF_DATA_NUM]; + unsigned int rx_np_bw[AGENTDRV_PROF_DATA_NUM]; + unsigned int rx_cpl_bw[AGENTDRV_PROF_DATA_NUM]; +}; + +struct dcmi_hccs_statistic_info { + unsigned int tx_cnt[HCCS_MAX_PCS_NUM]; + unsigned int rx_cnt[HCCS_MAX_PCS_NUM]; + unsigned int crc_err_cnt[HCCS_MAX_PCS_NUM]; + unsigned int retry_cnt[HCCS_MAX_PCS_NUM]; + unsigned int reserved_field_cnt[HCCS_RES_PCS_NUM]; +}; + +struct dcmi_hccs_statistic_info_u64 { + unsigned long long tx_cnt[HCCS_MAX_PCS_NUM]; + unsigned long long rx_cnt[HCCS_MAX_PCS_NUM]; + unsigned long long crc_err_cnt[HCCS_MAX_PCS_NUM]; + unsigned long long retry_cnt[HCCS_MAX_PCS_NUM]; + unsigned long long reserved[HCCS_RES_PCS_NUM]; +}; + +struct dcmi_hccs_bandwidth_info { + int profiling_time; + double total_txbw; + double total_rxbw; + double tx_bandwidth[HCCS_MAX_PCS_NUM]; + double rx_bandwidth[HCCS_MAX_PCS_NUM]; +}; + +struct dcmi_sio_crc_err_statistic_info { + unsigned short tx_error_count; + unsigned short rx_error_count; + unsigned char reserved[8]; +}; + +struct dcmi_elabel_info { + char product_name[MAX_LENGTH]; + char model[MAX_LENGTH]; + char manufacturer[MAX_LENGTH]; + char manufacturer_date[MAX_LENGTH]; + char serial_number[MAX_LENGTH]; +}; + +struct dcmi_hccsping_mesh_operate { + char dst_addr_list[IP_ADDR_LIST_LEN]; + int pkt_size; + int pkt_send_num; + int pkt_interval; + int timeout; + int task_interval; + int task_id; +}; + +struct 
dcmi_hccsping_mesh_info { + char dst_addr[HCCS_PING_MESH_MAX_NUM][ADDR_MAX_LEN]; + unsigned int suc_pkt_num[HCCS_PING_MESH_MAX_NUM]; + unsigned int fail_pkt_num[HCCS_PING_MESH_MAX_NUM]; + long max_time[HCCS_PING_MESH_MAX_NUM]; + long min_time[HCCS_PING_MESH_MAX_NUM]; + long avg_time[HCCS_PING_MESH_MAX_NUM]; + long tp95_time[HCCS_PING_MESH_MAX_NUM]; + int reply_stat_num[HCCS_PING_MESH_MAX_NUM]; + unsigned long long ping_total_num[HCCS_PING_MESH_MAX_NUM]; + int dest_num; +}; + +#define DCMI_VERSION_1 +#define DCMI_VERSION_2 + +#if defined DCMI_VERSION_2 + +DCMIDLLEXPORT int dcmi_init(void); + +DCMIDLLEXPORT int dcmi_get_card_list(int *card_num, int *card_list, int list_len); + +DCMIDLLEXPORT int dcmi_get_device_num_in_card(int card_id, int *device_num); + +DCMIDLLEXPORT int dcmi_get_device_id_in_card(int card_id, int *device_id_max, int *mcu_id, int *cpu_id); + +DCMIDLLEXPORT int dcmi_get_device_type(int card_id, int device_id, enum dcmi_unit_type *device_type); + +DCMIDLLEXPORT int dcmi_get_device_pcie_info_v2(int card_id, int device_id, struct dcmi_pcie_info_all *pcie_info); + +DCMIDLLEXPORT int dcmi_get_device_chip_info(int card_id, int device_id, struct dcmi_chip_info *chip_info); + +DCMIDLLEXPORT int dcmi_get_device_chip_info_v2(int card_id, int device_id, struct dcmi_chip_info_v2 *chip_info); + +DCMIDLLEXPORT int dcmi_get_device_power_info(int card_id, int device_id, int *power); + +DCMIDLLEXPORT int dcmi_get_device_health(int card_id, int device_id, unsigned int *health); + +DCMIDLLEXPORT int dcmi_get_device_errorcode_v2( + int card_id, int device_id, int *error_count, unsigned int *error_code_list, unsigned int list_len); + +DCMIDLLEXPORT int dcmi_get_device_temperature(int card_id, int device_id, int *temperature); + +DCMIDLLEXPORT int dcmi_get_device_voltage(int card_id, int device_id, unsigned int *voltage); + +DCMIDLLEXPORT int dcmi_get_device_ecc_info(int card_id, int device_id, enum dcmi_device_type input_type, + struct dcmi_ecc_info *device_ecc_info); 
+ +DCMIDLLEXPORT int dcmi_get_device_frequency( + int card_id, int device_id, enum dcmi_freq_type input_type, unsigned int *frequency); + +DCMIDLLEXPORT int dcmi_get_device_hbm_info(int card_id, int device_id, struct dcmi_hbm_info *hbm_info); + +DCMIDLLEXPORT int dcmi_get_device_memory_info_v3(int card_id, int device_id, + struct dcmi_get_memory_info_stru *memory_info); + +DCMIDLLEXPORT int dcmi_get_device_utilization_rate( + int card_id, int device_id, int input_type, unsigned int *utilization_rate); + +DCMIDLLEXPORT int dcmi_get_device_info( + int card_id, int device_id, enum dcmi_main_cmd main_cmd, unsigned int sub_cmd, void *buf, unsigned int *size); + +DCMIDLLEXPORT int dcmi_get_device_ip(int card_id, int device_id, enum dcmi_port_type input_type, int port_id, + struct dcmi_ip_addr *ip, struct dcmi_ip_addr *mask); + +DCMIDLLEXPORT int dcmi_get_device_network_health(int card_id, int device_id, enum dcmi_rdfx_detect_result *result); + +DCMIDLLEXPORT int dcmi_get_device_logic_id(int *device_logic_id, int card_id, int device_id); + +DCMIDLLEXPORT int dcmi_create_vdevice(int card_id, int device_id, struct dcmi_create_vdev_res_stru *vdev, + struct dcmi_create_vdev_out *out); + +DCMIDLLEXPORT int dcmi_set_destroy_vdevice(int card_id, int device_id, unsigned int vdevid); + +DCMIDLLEXPORT int dcmi_get_device_phyid_from_logicid(unsigned int logicid, unsigned int *phyid); + +DCMIDLLEXPORT int dcmi_get_device_logicid_from_phyid(unsigned int phyid, unsigned int *logicid); + +DCMIDLLEXPORT int dcmi_get_card_id_device_id_from_logicid(int *card_id, int *device_id, unsigned int device_logic_id); + +DCMIDLLEXPORT int dcmi_get_card_id_device_id_from_phyid(int *card_id, int *device_id, unsigned int device_phy_id); + +DCMIDLLEXPORT int dcmi_get_product_type(int card_id, int device_id, char *product_type_str, int buf_size); + +DCMIDLLEXPORT int dcmi_set_device_reset(int card_id, int device_id, enum dcmi_reset_channel channel_type); + +DCMIDLLEXPORT int 
dcmi_get_device_outband_channel_state(int card_id, int device_id, int* channel_state); + +DCMIDLLEXPORT int dcmi_pre_reset_soc(int card_id, int device_id); + +DCMIDLLEXPORT int dcmi_rescan_soc(int card_id, int device_id); + +DCMIDLLEXPORT int dcmi_get_netdev_brother_device(int card_id, int device_id, int* brother_card_id); + +DCMIDLLEXPORT int dcmi_get_device_boot_status(int card_id, int device_id, enum dcmi_boot_status *boot_status); + +DCMIDLLEXPORT int dcmi_subscribe_fault_event(int card_id, int device_id, struct dcmi_event_filter filter); + +DCMIDLLEXPORT int dcmi_get_npu_work_mode(int card_id, unsigned char *work_mode); + +DCMIDLLEXPORT int dcmi_get_device_die_v2( + int card_id, int device_id, enum dcmi_die_type input_type, struct dcmi_die_id *die_id); + +DCMIDLLEXPORT int dcmi_get_device_resource_info (int card_id, int device_id, struct dcmi_proc_mem_info *proc_info, + int *proc_num); + +DCMIDLLEXPORT int dcmi_get_device_board_info (int card_id, int device_id, struct dcmi_board_info *board_info); + +DCMIDLLEXPORT int dcmi_get_pcie_link_bandwidth_info(int card_id, int device_id, + struct dcmi_pcie_link_bandwidth_info *pcie_link_bandwidth_info); + +DCMIDLLEXPORT int dcmi_get_dcmi_version (char *dcmi_ver, int buf_size); + +DCMIDLLEXPORT int dcmi_get_mainboard_id (int card_id, int device_id, unsigned int *mainboard_id); + +DCMIDLLEXPORT int dcmi_get_hccs_link_bandwidth_info (int card_id, int device_id, struct dcmi_hccs_bandwidth_info *hccs_bandwidth_info); + +DCMIDLLEXPORT int dcmi_start_hccsping_mesh(int card_id, int device_id, int port_id, struct dcmi_hccsping_mesh_operate *hccsping_mesh); + +DCMIDLLEXPORT int dcmi_stop_hccsping_mesh(int card_id, int device_id, int port_id, unsigned int task_id); + +DCMIDLLEXPORT int dcmi_get_hccsping_mesh_info(int card_id, int device_id, int port_id, unsigned int task_id, struct dcmi_hccsping_mesh_info *hccsping_mesh_reply); + +DCMIDLLEXPORT int dcmi_get_hccsping_mesh_state(int card_id, int device_id, int port_id, unsigned int 
task_id, unsigned int *state); + +DCMIDLLEXPORT int dcmi_get_spod_node_status(int card_id, int device_id, unsigned int sdid, unsigned int *status); + +DCMIDLLEXPORT int dcmi_set_spod_node_status(int card_id, int device_id, unsigned int sdid, unsigned int status); + +#endif + +#if defined DCMI_VERSION_1 +/* The following interfaces are V1 version interfaces. In order to ensure the compatibility is temporarily reserved, + * the later version will be deleted. Please switch to the V2 version interface as soon as possible */ + +struct dcmi_memory_info_stru { + unsigned long long memory_size; + unsigned int freq; + unsigned int utiliza; +}; + +DCMIDLLEXPORT int dcmi_get_memory_info(int card_id, int device_id, struct dcmi_memory_info_stru *device_memory_info); + +DCMIDLLEXPORT int dcmi_get_device_errorcode( + int card_id, int device_id, int *error_count, unsigned int *error_code, int *error_width); + +DCMIDLLEXPORT int dcmi_mcu_get_power_info(int card_id, int *power); + +DCMIDLLEXPORT int dcmi_get_card_elabel_v2(int card_id, struct dcmi_elabel_info *elabel_info); +#endif + +#ifdef __cplusplus +#if __cplusplus +} +#endif +#endif /* __cplusplus */ + +#endif /* __DCMI_INTERFACE_API_H__ */ diff --git a/mind-cluster/component/ascend-common/devmanager/devmanager.go b/mind-cluster/component/ascend-common/devmanager/devmanager.go new file mode 100644 index 0000000..fe21931 --- /dev/null +++ b/mind-cluster/component/ascend-common/devmanager/devmanager.go @@ -0,0 +1,1197 @@ +/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package devmanager this for device driver manager +package devmanager + +import ( + "errors" + "fmt" + "math" + "strings" + "sync" + "time" + + "ascend-common/api" + "ascend-common/common-utils/hwlog" + "ascend-common/devmanager/common" + "ascend-common/devmanager/dcmi" +) + +// DeviceInterface for common device interface +type DeviceInterface interface { + Init() error + ShutDown() error + GetDcmiVersion() string + GetDeviceCount() (int32, error) + GetCardList() (int32, []int32, error) + GetDeviceNumInCard(cardID int32) (int32, error) + GetDeviceList() (int32, []int32, error) + GetChipBaseInfos() ([]*common.ChipBaseInfo, error) + GetDeviceHealth(logicID int32) (uint32, error) + GetDeviceNetWorkHealth(logicID int32) (uint32, error) + GetDeviceUtilizationRate(logicID int32, deviceType common.DeviceType) (uint32, error) + GetDeviceTemperature(logicID int32) (int32, error) + GetDeviceVoltage(logicID int32) (float32, error) + GetDevicePowerInfo(logicID int32) (float32, error) + GetMcuPowerInfo(cardID int32) (float32, error) + GetDeviceFrequency(logicID int32, deviceType common.DeviceType) (uint32, error) + GetDeviceMemoryInfo(logicID int32) (*common.MemoryInfo, error) + GetDeviceHbmInfo(logicID int32) (*common.HbmInfo, error) + GetDeviceErrorCode(logicID int32) (int32, int64, error) + GetChipInfo(logicID int32) (*common.ChipInfo, error) + GetPhysicIDFromLogicID(logicID int32) (int32, error) + GetLogicIDFromPhysicID(physicID int32) (int32, error) + GetDeviceLogicID(cardID, deviceID int32) (int32, error) + GetCardIDDeviceID(logicID int32) (int32, int32, error) + GetDeviceIPAddress(logicID, ipType int32) (string, error) + CreateVirtualDevice(logicID int32, vDevInfo common.CgoCreateVDevRes) (common.CgoCreateVDevOut, error) + GetVirtualDeviceInfo(logicID int32) (common.VirtualDevInfo, error) + DestroyVirtualDevice(logicID int32, vDevID uint32) error + 
GetDevType() string + GetProductTypeArray() []string + GetProductType(cardID, deviceID int32) (string, error) + GetAllProductType() ([]string, error) + GetNpuWorkMode() string + SetDeviceReset(cardID, deviceID int32) error + GetBrotherCardID(int32, int32) (int32, error) + PreResetSoc(int32, int32) error + GetOutBandChannelState(int32, int32) error + SetDeviceResetOutBand(int32, int32) error + RescanSoc(int32, int32) error + GetDeviceBootStatus(logicID int32) (int, error) + GetDeviceAllErrorCode(logicID int32) (int32, []int64, error) + SubscribeDeviceFaultEvent(logicID int32) error + SetFaultEventCallFunc(func(common.DevFaultInfo)) error + GetDieID(logicID int32, dcmiDieType dcmi.DieType) (string, error) + GetDevProcessInfo(logicID int32) (*common.DevProcessInfo, error) + GetPCIeBusInfo(logicID int32) (string, error) + GetBoardInfo(logicID int32) (common.BoardInfo, error) + GetCardElabelV2(cardID int32) (common.ElabelInfo, error) + GetPCIEBandwidth(logicID int32, profilingTime int) (common.PCIEBwStat, error) + SetIsTrainingCard() error + IsTrainingCard() bool + GetValidChipInfo() (common.ChipInfo, error) + GetDeviceEccInfo(logicID int32, dcmiDeviceType common.DcmiDeviceType) (*common.ECCInfo, error) + GetSuperPodInfo(int32) (common.CgoSuperPodInfo, error) + GetSioInfo(logicID int32) (*common.SioCrcErrStatisticInfo, error) + GetHccsStatisticInfo(logicID int32) (*common.HccsStatisticInfo, error) + GetHccsStatisticInfoInU64(logicID int32) (*common.HccsStatisticInfo, error) + GetMainBoardId() uint32 + GetHccsBandwidthInfo(logicID int32) (*common.HccsBandwidthInfo, error) + + DcStartHccsPingMesh(int32, int32, int, common.HccspingMeshOperate) error + DcStopHccsPingMesh(int32, int32, int, uint) error + DcGetHccsPingMeshInfo(int32, int32, int, uint) (*common.HccspingMeshInfo, error) + DcGetHccsPingMeshState(int32, int32, int, uint) (int, error) + DcGetSuperPodStatus(int32, int32, uint32) (int, error) + DcSetSuperPodStatus(int32, int32, uint32, uint32) error +} + +const ( + 
// maxRetries max retry times when initializing the dcmi interface + maxRetries = 6 + // defaultRetryDelay delay between dcmi init retries, in seconds (GetDeviceManager multiplies it by time.Second) + defaultRetryDelay = 10 +) + +var ( + devManager *DeviceManager = nil + devManagerOnce sync.Once + idCache sync.Map +) + +// npuIdMapping groups a logic ID with its card ID and device ID +type npuIdMapping struct { + logicId int32 + cardId int32 + deviceId int32 +} + +// GetDeviceManager returns the global DeviceManager; the first call (guarded by sync.Once) initializes dcmi and re-reads the card list, sleeping between attempts, until resetTimeout seconds are used up +func GetDeviceManager(resetTimeout int) (*DeviceManager, error) { + devManagerOnce.Do(func() { + // a common dcmi manager is created to initialize the dcmi interface; a more specific manager can be assigned later + dcMgr := dcmi.DcManager{} + var retryDelay time.Duration = defaultRetryDelay + hwlog.RunLog.Infof("get card list from dcmi reset timeout is %d", resetTimeout) + for currentTime, retryCount := 0, 0; currentTime <= resetTimeout; currentTime += int(retryDelay) { + if err := dcMgr.DcInit(); err != nil { + hwlog.RunLog.Errorf("deviceManager init failed, prepare dcmi failed, err: %v", err) + return + } + cardNum, cardList, err := dcMgr.DcGetCardList() + if err == nil && int(cardNum) == len(cardList) { + hwlog.RunLog.Infof("deviceManager get cardList is %v, cardList length equal to cardNum: %v", + cardList, cardNum) + break + } + if diffTime := float64(resetTimeout - currentTime); diffTime > 0 { + retryDelay = time.Duration(math.Min(float64(defaultRetryDelay), diffTime)) + } + retryCount++ + hwlog.RunLog.Warnf("deviceManager get card list failed (attempt %d), cardNum=%d, cardList=%v, "+ + "err: %v", retryCount, cardNum, cardList, err) + if currentTime+int(retryDelay) <= resetTimeout { + if err = dcMgr.DcShutDown(); err != nil { + hwlog.RunLog.Errorf("deviceManager shut down failed, err: %v", err) + return + } + time.Sleep(retryDelay * time.Second) + continue + } + if int(cardNum) != len(cardList) { + hwlog.RunLog.Warnf("deviceManager get cardList is %v, but cardNum is %v, "+ + "please check whether the real number of npu matches the cardList", 
cardList, cardNum) + } + } + devManager = &DeviceManager{} + devManager.DcMgr = &dcMgr + dcmiVer, err := dcMgr.DcGetDcmiVersion() + if err != nil { + hwlog.RunLog.Warnf("deviceManager get dcmi version failed, err: %v", err) + } + hwlog.RunLog.Infof("the dcmi version is %s", dcmiVer) + devManager.dcmiVersion = dcmiVer + }) + if devManager == nil { + return nil, errors.New("device Manager is nil, may encounter an exception during initialization. " + + "You can check the system log to confirm") + } + return devManager, nil +} + +// DeviceManager common device manager for Ascend910/310P/310 +type DeviceManager struct { + // DcMgr for common dev manager + DcMgr dcmi.DcDriverInterface + // DevType the value is the same as the device type corresponding to the DcMgr variable. + // Options: api.Ascend310,api.Ascend310P,api.Ascend910 + DevType string + // ProductTypes product type in server, multi type will be in 310P mix scene + ProductTypes []string + // isTrainingCard whether the device is used for training + isTrainingCard bool + dcmiVersion string + // mainBoardId used to distinguish between A900A3SuperPod and A9000A3SuperPod + mainBoardId uint32 +} + +// GetProductTypeArray return product types +func (d *DeviceManager) GetProductTypeArray() []string { + return d.ProductTypes +} + +// GetDevType return dev type +func (d *DeviceManager) GetDevType() string { + return d.DevType +} + +// AutoInit auto detect npu chip type and return the corresponding processing object +func AutoInit(dType string, resetTimeout int) (*DeviceManager, error) { + chipInfo, boardInfo, err := getDeviceInfoForInit(resetTimeout) + if err != nil { + return nil, fmt.Errorf("auto init failed, err: %s", err) + } + var devMgr *DeviceManager + if devMgr, err = GetDeviceManager(resetTimeout); err != nil || devMgr == nil { + return nil, err + } + mainBoardId, err := getValidMainBoardInfo(devMgr.DcMgr) + if err != nil { + // Non-blocking when the main board ID is not found + hwlog.RunLog.Warn(err) + } + 
devMgr.mainBoardId = mainBoardId + var devType = common.GetDevType(chipInfo.Name, boardInfo.BoardId) + + switch devType { + case api.Ascend910A, api.Ascend910B, api.Ascend910A3: + devMgr.DcMgr = &A910Manager{} + case api.Ascend310P: + devMgr.DcMgr = &A310PManager{} + case api.Ascend310, api.Ascend310B: + devMgr.DcMgr = &A310Manager{} + default: + return nil, fmt.Errorf("unsupport device type (%s)", devType) + } + hwlog.RunLog.Infof("chipName: %v, devType: %v", chipInfo.Name, devType) + if dType != "" && devType != dType { + return nil, fmt.Errorf("the value of dType(%s) is inconsistent with the actual chip type(%s)", + dType, devType) + } + devMgr.DevType = devType + if err := devMgr.SetIsTrainingCard(); err != nil { + hwlog.RunLog.Errorf("auto recognize training card failed, err: %s", err) + } + + pTypes, err := devMgr.GetAllProductType() + if err != nil { + hwlog.RunLog.Debugf("auto init product types failed, err: %s", err) + } + devMgr.ProductTypes = pTypes + return devMgr, nil +} + +// getDeviceInfoForInit returns the first valid chip info and board info read through the global device manager +func getDeviceInfoForInit(resetTimeout int) (common.ChipInfo, common.BoardInfo, error) { + var mgr *DeviceManager + var err error + if mgr, err = GetDeviceManager(resetTimeout); err != nil || mgr == nil { + return common.ChipInfo{}, common.BoardInfo{}, fmt.Errorf("get chip info failed, err: %v", err) + } + dcMgr := mgr.DcMgr + chipInfo, err := getValidChipInfo(dcMgr) + if err != nil { + hwlog.RunLog.Error(err) + return common.ChipInfo{}, common.BoardInfo{}, err + } + boardInfo, err := getValidBoardInfo(dcMgr) + if err != nil { + hwlog.RunLog.Error(err) + return chipInfo, common.BoardInfo{}, err + } + + return chipInfo, boardInfo, nil +} + +// getValidChipInfo walks every device of every card and returns the first chip info accepted by common.IsValidChipInfo +func getValidChipInfo(dcMgr dcmi.DcDriverInterface) (common.ChipInfo, error) { + // get card list + cardNum, cardList, err := dcMgr.DcGetCardList() + if err != nil { + hwlog.RunLog.Error(err) + return common.ChipInfo{}, errors.New(common.ErrMsgInitCardListFailed) + } + if cardNum == 0 { + return common.ChipInfo{}, fmt.Errorf("get chip info failed, 
no card found") + } + // get device in card, then get chip info by cardID and deviceID + for _, cardID := range cardList { + devNum, err := dcMgr.DcGetDeviceNumInCard(cardID) + if err != nil || devNum == 0 { + hwlog.RunLog.Debugf("get device num by cardID(%d) failed, error: %v", cardID, err) + continue + } + for devID := int32(0); devID < devNum; devID++ { + chipInfo, err := dcMgr.DcGetChipInfo(cardID, devID) + if err != nil { + hwlog.RunLog.Debugf("get chip info failed by cardID(%d), deviceID(%d), error: %v", cardID, devID, + err) + continue + } + if !common.IsValidChipInfo(chipInfo) { + // err is always nil in this branch, so log the rejected value instead + hwlog.RunLog.Debugf("invalid chip info by cardID(%d), deviceID(%d), chipInfo: %+v", cardID, devID, + chipInfo) + continue + } + return *chipInfo, nil + } + } + return common.ChipInfo{}, errors.New("cannot get valid chip info") +} + +// getValidBoardInfo walks every device of every card and returns the first board info accepted by common.IsValidBoardInfo +func getValidBoardInfo(dcMgr dcmi.DcDriverInterface) (common.BoardInfo, error) { + // get card list + cardNum, cardList, err := dcMgr.DcGetCardList() + if err != nil { + hwlog.RunLog.Error(err) + return common.BoardInfo{}, errors.New(common.ErrMsgInitCardListFailed) + } + if cardNum == 0 { + return common.BoardInfo{}, errors.New(common.ErrMsgGetBoardInfoFailed) + } + // get device in card, then get board info by cardID and deviceID + for _, cardID := range cardList { + devNum, err := dcMgr.DcGetDeviceNumInCard(cardID) + if err != nil || devNum == 0 { + hwlog.RunLog.Debugf("get device num by cardID %d failed, error is: %v", cardID, err) + continue + } + for devID := int32(0); devID < devNum; devID++ { + boardInfo, err := dcMgr.DcGetDeviceBoardInfo(cardID, devID) + if err != nil { + hwlog.RunLog.Debugf("get board info failed by cardID(%d), deviceID(%d), error: %v", cardID, devID, + err) + continue + } + if !common.IsValidBoardInfo(&boardInfo) { + // err is always nil in this branch, so log the rejected value instead + hwlog.RunLog.Debugf("invalid board info by cardID(%d), deviceID(%d), boardInfo: %+v", cardID, devID, + boardInfo) + continue + } + return boardInfo, nil + } + } + return common.BoardInfo{}, errors.New("cannot get valid board info") 
+} + +// getValidMainBoardInfo walks every device of every card and returns the first main board ID accepted by common.IsValidMainBoardInfo +func getValidMainBoardInfo(dcMgr dcmi.DcDriverInterface) (uint32, error) { + // get card list + cardNum, cardList, err := dcMgr.DcGetCardList() + if err != nil { + hwlog.RunLog.Error(err) + return 0, errors.New(common.ErrMsgInitCardListFailed) + } + if cardNum == 0 { + return 0, errors.New(common.ErrMsgGetBoardInfoFailed) + } + // get device in card, then get main board id by cardID and deviceID + for _, cardID := range cardList { + devNum, err := dcMgr.DcGetDeviceNumInCard(cardID) + if err != nil || devNum == 0 { + hwlog.RunLog.Debugf("get device num by cardID %d failed, error is: %v", cardID, err) + continue + } + for devID := int32(0); devID < devNum; devID++ { + mainBoardId, err := dcMgr.DcGetDeviceMainBoardInfo(cardID, devID) + if err != nil { + hwlog.RunLog.Debug(err) + continue + } + if !common.IsValidMainBoardInfo(mainBoardId) { + // err is always nil in this branch, so log the rejected value instead + hwlog.RunLog.Warnf("invalid mainBoardId info by cardID(%d), deviceID(%d), mainBoardId: %d", cardID, devID, mainBoardId) + continue + } + return mainBoardId, nil + } + } + return 0, errors.New("cannot get main board id") +} + +// Init load symbol and initialize dcmi +func (d *DeviceManager) Init() error { + return d.DcMgr.DcInit() +} + +// ShutDown clean the dynamically loaded resource +func (d *DeviceManager) ShutDown() error { + return d.DcMgr.DcShutDown() +} + +// GetDeviceCount get npu device count +func (d *DeviceManager) GetDeviceCount() (int32, error) { + return d.DcMgr.DcGetDeviceCount() +} + +// GetCardList get all card list +func (d *DeviceManager) GetCardList() (int32, []int32, error) { + return d.DcMgr.DcGetCardList() +} + +// GetDeviceNumInCard get the number of devices in one card +func (d *DeviceManager) GetDeviceNumInCard(cardID int32) (int32, error) { + return d.DcMgr.DcGetDeviceNumInCard(cardID) +} + +// GetDeviceList get all device logicID list +func (d *DeviceManager) GetDeviceList() (int32, []int32, error) { + return d.DcMgr.DcGetLogicIDList() +} + +// GetDeviceHealth query npu device health status +func (d *DeviceManager) 
GetDeviceHealth(logicID int32) (uint32, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + hwlog.RunLog.Error(err) + return common.UnRetError, fmt.Errorf("failed to get health code by logicID(%d)", logicID) + } + healthCode, err := d.DcMgr.DcGetDeviceHealth(cardID, deviceID) + if err != nil { + hwlog.RunLog.Error(err) + return common.UnRetError, err + } + + return uint32(healthCode), nil +} + +// GetDeviceNetWorkHealth query npu device network health status +func (d *DeviceManager) GetDeviceNetWorkHealth(logicID int32) (uint32, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + hwlog.RunLog.Error(err) + return common.UnRetError, fmt.Errorf("failed to get network health code by logicID(%d)", logicID) + } + healthCode, err := d.DcMgr.DcGetDeviceNetWorkHealth(cardID, deviceID) + if err != nil { + hwlog.RunLog.Error(err) + return common.UnRetError, err + } + + return healthCode, nil +} + +// GetDeviceUtilizationRate get npu device utilization +func (d *DeviceManager) GetDeviceUtilizationRate(logicID int32, deviceType common.DeviceType) (uint32, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + hwlog.RunLog.Error(err) + return common.UnRetError, fmt.Errorf("failed to get utilization by logicID(%d)", logicID) + } + rate, err := d.DcMgr.DcGetDeviceUtilizationRate(cardID, deviceID, deviceType) + if err != nil { + return common.UnRetError, err + } + + return uint32(rate), nil +} + +// GetDeviceTemperature get npu device temperature +func (d *DeviceManager) GetDeviceTemperature(logicID int32) (int32, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + hwlog.RunLog.Error(err) + return common.RetError, fmt.Errorf("failed to get temperature by logicID(%d)", logicID) + } + temp, err := d.DcMgr.DcGetDeviceTemperature(cardID, deviceID) + if err != nil { + hwlog.RunLog.Error(err) + return common.RetError, fmt.Errorf("failed to get 
temperature by logicID(%d)", logicID) + } + + return temp, nil +} + +// GetDeviceVoltage get npu device voltage +func (d *DeviceManager) GetDeviceVoltage(logicID int32) (float32, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + hwlog.RunLog.Error(err) + return common.UnRetError, fmt.Errorf("failed to get voltage by logicID(%d)", logicID) + } + voltage, err := d.DcMgr.DcGetDeviceVoltage(cardID, deviceID) + if err != nil { + hwlog.RunLog.Error(err) + return common.UnRetError, fmt.Errorf("failed to get voltage by logicID(%d)", logicID) + } + + return voltage, nil +} + +// GetDevicePowerInfo get npu device power info +func (d *DeviceManager) GetDevicePowerInfo(logicID int32) (float32, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + hwlog.RunLog.Error(err) + return common.UnRetError, fmt.Errorf("failed to get power by logicID(%d)", logicID) + } + power, err := d.DcMgr.DcGetDevicePowerInfo(cardID, deviceID) + if err != nil { + hwlog.RunLog.Error(err) + return common.UnRetError, fmt.Errorf("failed to get power by logicID(%d)", logicID) + } + + return power, nil +} + +// GetDeviceFrequency get npu device work frequency +func (d *DeviceManager) GetDeviceFrequency(logicID int32, deviceType common.DeviceType) (uint32, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + hwlog.RunLog.Error(err) + return common.UnRetError, fmt.Errorf("failed to get frequency by logicID(%d)", logicID) + } + frequency, err := d.DcMgr.DcGetDeviceFrequency(cardID, deviceID, deviceType) + if err != nil { + hwlog.RunLog.Error(err) + return common.UnRetError, fmt.Errorf("failed to get frequency by logicID(%d)", logicID) + } + + return frequency, nil +} + +// GetDeviceMemoryInfo get npu memory information +func (d *DeviceManager) GetDeviceMemoryInfo(logicID int32) (*common.MemoryInfo, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + 
hwlog.RunLog.Error(err) + return nil, fmt.Errorf("failed to get memory info by logicID(%d)", logicID) + } + + // 910B and 910A3 don't have DDR module. Therefore, DDR information cannot be queried. + if d.DevType == api.Ascend910B || d.DevType == api.Ascend910A3 { + hwlog.RunLog.Debugf("%v doesn't have DDR module. Therefore, DDR information cannot be queried", d.DevType) + return nil, nil + } + + memInfo, err := d.DcMgr.DcGetMemoryInfo(cardID, deviceID) + if err != nil { + hwlog.RunLog.Error(err) + return nil, fmt.Errorf("failed to get memory info by logicID(%d)", logicID) + } + + return memInfo, nil +} + +// GetDeviceHbmInfo get npu HBM module memory and frequency information +func (d *DeviceManager) GetDeviceHbmInfo(logicID int32) (*common.HbmInfo, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + hwlog.RunLog.Error(err) + return nil, fmt.Errorf("failed to get hbm info by logicID(%d)", logicID) + } + hbmInfo, err := d.DcMgr.DcGetHbmInfo(cardID, deviceID) + if err != nil { + return nil, err + } + + return hbmInfo, nil +} + +// GetDeviceErrorCode get npu device error code +func (d *DeviceManager) GetDeviceErrorCode(logicID int32) (int32, int64, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + hwlog.RunLog.Error(err) + return common.RetError, common.RetError, fmt.Errorf("failed to get device error code by logicID(%d)", + logicID) + } + errCount, errCode, err := d.DcMgr.DcGetDeviceErrorCode(cardID, deviceID) + if err != nil { + hwlog.RunLog.Error(err) + return common.RetError, common.RetError, fmt.Errorf("failed to get device error code by logicID(%d)", + logicID) + } + + return errCount, errCode, nil +} + +// GetChipInfo get npu device error code +func (d *DeviceManager) GetChipInfo(logicID int32) (*common.ChipInfo, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + hwlog.RunLog.Error(err) + return nil, fmt.Errorf("failed to get cardID and deviceID by 
logicID(%d), error: %v", logicID, err) + } + chipInfo, err := d.DcMgr.DcGetChipInfo(cardID, deviceID) + if err != nil { + hwlog.RunLog.Error(err) + return nil, fmt.Errorf("failed to get chip info code by logicID(%d)", logicID) + } + + return chipInfo, nil +} + +// GetPhysicIDFromLogicID get device physic id from logic id +func (d *DeviceManager) GetPhysicIDFromLogicID(logicID int32) (int32, error) { + physicID, err := d.DcMgr.DcGetPhysicIDFromLogicID(logicID) + if err != nil { + hwlog.RunLog.Error(err) + return common.RetError, fmt.Errorf("failed to get physicID by logicID(%d)", logicID) + } + + return physicID, nil +} + +// GetLogicIDFromPhysicID get device logic id from physic id +func (d *DeviceManager) GetLogicIDFromPhysicID(physicID int32) (int32, error) { + logicID, err := d.DcMgr.DcGetLogicIDFromPhysicID(physicID) + if err != nil { + hwlog.RunLog.Error(err) + return common.RetError, fmt.Errorf("failed to get logicID by physicID(%d)", physicID) + } + + return logicID, nil +} + +// GetDeviceLogicID get device logic id from card id and device id +func (d *DeviceManager) GetDeviceLogicID(cardID, deviceID int32) (int32, error) { + return d.DcMgr.DcGetDeviceLogicID(cardID, deviceID) +} + +// GetDeviceIPAddress get device ip address +func (d *DeviceManager) GetDeviceIPAddress(logicID, ipType int32) (string, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + return "", fmt.Errorf("failed to get cardID and deviceID by logicID(%d), %w", logicID, err) + } + return d.DcMgr.DcGetDeviceIPAddress(cardID, deviceID, ipType) +} + +// CreateVirtualDevice create virtual device +func (d *DeviceManager) CreateVirtualDevice( + logicID int32, vDevInfo common.CgoCreateVDevRes) (common.CgoCreateVDevOut, error) { + if !common.IsValidTemplateName(d.DevType, vDevInfo.TemplateName) { + return common.CgoCreateVDevOut{}, fmt.Errorf("input invalid template name: %s", vDevInfo.TemplateName) + } + return d.DcMgr.DcCreateVDevice(logicID, vDevInfo) +} + +// 
GetVirtualDeviceInfo get virtual device info +func (d *DeviceManager) GetVirtualDeviceInfo(logicID int32) (common.VirtualDevInfo, error) { + cgoVDevInfo, err := d.DcMgr.DcGetVDeviceInfo(logicID) + if err != nil { + hwlog.RunLog.Debug(err) + return common.VirtualDevInfo{}, fmt.Errorf("get virtual device info failed, error is: %v "+ + "and vdev num is: %d", err, int32(cgoVDevInfo.TotalResource.VDevNum)) + } + for _, vDevInfo := range cgoVDevInfo.VDevInfo { + if !common.IsValidTemplateName(d.DevType, vDevInfo.QueryInfo.Name) { + return common.VirtualDevInfo{}, fmt.Errorf("vdevice id %d, it's template name is invalid: %s", + vDevInfo.VDevID, vDevInfo.QueryInfo.Name) + } + } + return cgoVDevInfo, nil +} + +// DestroyVirtualDevice destroy virtual device +func (d *DeviceManager) DestroyVirtualDevice(logicID int32, vDevID uint32) error { + return d.DcMgr.DcDestroyVDevice(logicID, vDevID) +} + +// GetMcuPowerInfo get mcu power info for cardID +func (d *DeviceManager) GetMcuPowerInfo(cardID int32) (float32, error) { + return d.DcMgr.DcGetMcuPowerInfo(cardID) +} + +// GetCardIDDeviceID get cardID and deviceID by logicID +func (d *DeviceManager) GetCardIDDeviceID(logicID int32) (int32, int32, error) { + return d.getCardIdAndDeviceId(logicID) +} + +// GetProductType get product type by cardID and deviceID +func (d *DeviceManager) GetProductType(cardID, deviceID int32) (string, error) { + return d.DcMgr.DcGetProductType(cardID, deviceID) +} + +// GetAllProductType get all product type +func (d *DeviceManager) GetAllProductType() ([]string, error) { + productTypes := make([]string, 0) + cardNum, cardList, err := d.GetCardList() + if err != nil || cardNum == 0 { + hwlog.RunLog.Errorf("failed to get card list, err: %v", err) + return productTypes, err + } + for _, cardID := range cardList { + devNum, err := d.GetDeviceNumInCard(cardID) + if err != nil { + hwlog.RunLog.Debugf("get device num by cardID(%d) failed, error: %v", cardID, err) + continue + } + if devNum == 0 { + 
hwlog.RunLog.Debugf("not found device on card %d", cardID) + continue + } + for devID := int32(0); devID < devNum; devID++ { + productType, err := d.GetProductType(cardID, devID) + if err != nil { + hwlog.RunLog.Debugf("get product type by card %d deviceID %d failed, err: %v", cardID, devID, err) + continue + } + productTypes = append(productTypes, productType) + break + } + } + if len(productTypes) != 0 { + productTypes = common.RemoveDuplicate(&productTypes) + } + return productTypes, nil +} + +// GetNpuWorkMode get work mode of NPU +func (d *DeviceManager) GetNpuWorkMode() string { + if d.DevType == api.Ascend910B || d.DevType == api.Ascend910A3 { + hwlog.RunLog.Warnf("only AMP mode is available on %s", d.DevType) + return common.AMPMode + } + + _, cardList, err := d.DcMgr.DcGetCardList() + if err != nil { + hwlog.RunLog.Error(err) + return "" + } + if len(cardList) > 0 { + mode, err := d.DcMgr.DcGetNpuWorkMode(cardList[0]) + if err != nil { + hwlog.RunLog.Error(err) + return "" + } + if mode == 0 { + return common.AMPMode + } + return common.SMPMode + } + return "" +} + +// SetDeviceReset reset spec device +func (d *DeviceManager) SetDeviceReset(cardID, deviceID int32) error { + return d.DcMgr.DcSetDeviceReset(cardID, deviceID) +} + +// GetBrotherCardID get brother card id +func (d *DeviceManager) GetBrotherCardID(cardID, deviceID int32) (int32, error) { + return d.DcMgr.DcGetBrotherCardID(cardID, deviceID) +} + +// GetOutBandChannelState get out band channel state +func (d *DeviceManager) GetOutBandChannelState(cardID, deviceID int32) error { + return d.DcMgr.DcGetOutBandChannelState(cardID, deviceID) +} + +// PreResetSoc pre reset soc, used before reset out band +func (d *DeviceManager) PreResetSoc(cardID, deviceID int32) error { + return d.DcMgr.DcPreResetSoc(cardID, deviceID) +} + +// SetDeviceResetOutBand reset spec device out band +func (d *DeviceManager) SetDeviceResetOutBand(cardID, deviceID int32) error { + return 
d.DcMgr.DcSetDeviceResetOutBand(cardID, deviceID) +} + +// RescanSoc trigger soc rescan, non-blocking +func (d *DeviceManager) RescanSoc(cardID, deviceID int32) error { + return d.DcMgr.DcRescanSoc(cardID, deviceID) +} + +// GetDeviceBootStatus get device boot status +func (d *DeviceManager) GetDeviceBootStatus(logicID int32) (int, error) { + return d.DcMgr.DcGetDeviceBootStatus(logicID) +} + +// GetDeviceAllErrorCode get npu device all error code +func (d *DeviceManager) GetDeviceAllErrorCode(logicID int32) (int32, []int64, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + hwlog.RunLog.Error(err) + return common.RetError, nil, fmt.Errorf("failed to get cardID in get device error code by logicID(%d)", + logicID) + } + errCount, errCodes, err := d.DcMgr.DcGetDeviceAllErrorCode(cardID, deviceID) + if err != nil { + hwlog.RunLog.Error(err) + return common.RetError, nil, fmt.Errorf("failed to get device error code by logicID(%d)", logicID) + } + return errCount, errCodes, nil +} + +// SubscribeDeviceFaultEvent get npu device error code by subscribe +func (d *DeviceManager) SubscribeDeviceFaultEvent(logicID int32) error { + var cardID, deviceID int32 + if logicID == common.SubscribeAllDevice { + cardID = common.SubscribeAllDevice + deviceID = common.SubscribeAllDevice + } else { + var err error + cardID, deviceID, err = d.getCardIdAndDeviceId(logicID) + if err != nil { + hwlog.RunLog.Error(err) + return fmt.Errorf("failed to get cardID in subscribe device error code by logicID(%d)", logicID) + } + } + if err := d.DcMgr.DcSubscribeDeviceFaultEvent(cardID, deviceID); err != nil { + hwlog.RunLog.Error(err) + return fmt.Errorf("failed to subscribe device error code by logicID(%d)", logicID) + } + return nil +} + +// SetFaultEventCallFunc set fault event call func +func (d *DeviceManager) SetFaultEventCallFunc(businessFunc func(common.DevFaultInfo)) error { + if businessFunc == nil { + return errors.New("business func can't be nil") + } 
+ d.DcMgr.DcSetFaultEventCallFunc(businessFunc) + return nil +} + +// GetDieID return die id by dcmi die type, vdie id or ndie id +func (d *DeviceManager) GetDieID(logicID int32, dcmiDieType dcmi.DieType) (string, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + hwlog.RunLog.Error(err) + return "", fmt.Errorf("failed to get cardID in get device error code by logicID(%d)", logicID) + } + + return d.DcMgr.DcGetDieID(cardID, deviceID, dcmiDieType) +} + +// GetDevProcessInfo get process and process memory in device side +func (d *DeviceManager) GetDevProcessInfo(logicID int32) (*common.DevProcessInfo, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + hwlog.RunLog.Error(err) + return nil, fmt.Errorf("failed to get cardID in get device error code by logicID(%d)", logicID) + } + + return d.DcMgr.DcGetDevProcessInfo(cardID, deviceID) +} + +// GetPCIeBusInfo pcie bus info +func (d *DeviceManager) GetPCIeBusInfo(logicID int32) (string, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + hwlog.RunLog.Error(err) + return "", fmt.Errorf("failed to get cardID in get device error code by logicID(%d)", logicID) + } + + return d.DcMgr.DcGetPCIeBusInfo(cardID, deviceID) +} + +// GetBoardInfo return board info of device +func (d *DeviceManager) GetBoardInfo(logicID int32) (common.BoardInfo, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + hwlog.RunLog.Error(err) + return common.BoardInfo{}, fmt.Errorf("failed to get cardID in "+ + "get device error code by logicID(%d)", logicID) + } + + return d.DcMgr.DcGetDeviceBoardInfo(cardID, deviceID) +} + +// GetCardElabelV2 get card elabel information +func (d *DeviceManager) GetCardElabelV2(cardID int32) (common.ElabelInfo, error) { + return d.DcMgr.DcGetCardElabelV2(cardID) +} + +// GetPCIEBandwidth get pcie bandwidth +func (d *DeviceManager) GetPCIEBandwidth(logicID int32, profilingTime int) 
(common.PCIEBwStat, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + hwlog.RunLog.Error(err) + return common.PCIEBwStat{}, fmt.Errorf("get cardID(deviceID) failed, error by logicID(%d)", logicID) + } + pciePCIEBw, err := d.DcMgr.DcGetPCIEBandwidth(cardID, deviceID, profilingTime) + if err != nil { + return common.PCIEBwStat{}, err + } + return pciePCIEBw, nil +} + +// SetIsTrainingCard identifies whether it is a training card according to the usage of card +func (d *DeviceManager) SetIsTrainingCard() error { + devType := d.GetDevType() + if strings.HasPrefix(devType, api.Ascend310) { + d.isTrainingCard = false + return nil + } + + boardInfo := common.BoardInfo{} + cardNum, cardList, err := d.GetCardList() + if err != nil || cardNum == 0 { + hwlog.RunLog.Errorf("failed to get card list when set 'IsTrainingCard' err: %v", err) + return err + } + for _, cardID := range cardList { + devNum, err := d.GetDeviceNumInCard(cardID) + if err != nil { + hwlog.RunLog.Warnf("get device num by cardID(%d) failed when set 'IsTrainingCard', error: %v", cardID, err) + continue + } + if devNum == 0 { + hwlog.RunLog.Warnf("not found device on card %d when set 'IsTrainingCard'", cardID) + continue + } + + for devID := int32(0); devID < devNum; devID++ { + boardInfo, err = d.DcMgr.DcGetDeviceBoardInfo(cardID, devID) + if err != nil { + hwlog.RunLog.Warnf("get board info by card %d deviceID %d failed, err: %v", cardID, devID, err) + continue + } + break + } + if err == nil { + break + } + } + + if devType == api.Ascend910B && + (boardInfo.BoardId == common.A300IA2BoardId || boardInfo.BoardId == common.A300IA2GB64BoardId) { + d.isTrainingCard = false + return nil + } + + d.isTrainingCard = true + return nil +} + +// IsTrainingCard return true if it is a training card +func (d *DeviceManager) IsTrainingCard() bool { + return d.isTrainingCard +} + +// GetDcmiVersion get dcmi version +func (d *DeviceManager) GetDcmiVersion() string { + return 
d.dcmiVersion +} + +// GetMainBoardId get mainBoardId +func (d *DeviceManager) GetMainBoardId() uint32 { + return d.mainBoardId +} + +// GetValidChipInfo find a valid chip info from all cards +func (d *DeviceManager) GetValidChipInfo() (common.ChipInfo, error) { + chipInfo, err := getValidChipInfo(d.DcMgr) + if err != nil { + hwlog.RunLog.Error("failed to get valid chip info") + return common.ChipInfo{}, err + } + return chipInfo, nil +} + +// GetDeviceEccInfo query device ECC info +func (d *DeviceManager) GetDeviceEccInfo(logicID int32, dcmiDeviceType common.DcmiDeviceType) (*common.ECCInfo, error) { + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + hwlog.RunLog.Errorf("get cardID and deviceID by logicID(%d) failed, error: %v", logicID, err) + return nil, err + } + return d.DcMgr.DcGetDeviceEccInfo(cardID, deviceID, dcmiDeviceType) +} + +// GetSuperPodInfo get 910A3 super pod info +func (d *DeviceManager) GetSuperPodInfo(logicID int32) (common.CgoSuperPodInfo, error) { + if !common.IsValidLogicIDOrPhyID(logicID) { + return common.CgoSuperPodInfo{}, fmt.Errorf("input invalid logicID: %d", logicID) + } + + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + return common.CgoSuperPodInfo{}, fmt.Errorf("failed to get cardID and deviceID by logicID(%d) "+ + "when get super pod info, error: %v", logicID, err) + } + cgoSuperPodInfo, err := d.DcMgr.DcGetSuperPodInfo(cardID, deviceID) + if err != nil { + return common.CgoSuperPodInfo{}, fmt.Errorf("failed to get super pod info by logicID(%d), error: %v", + logicID, err) + } + + return cgoSuperPodInfo, nil +} + +// GetSioInfo get SIO info +func (d *DeviceManager) GetSioInfo(logicID int32) (*common.SioCrcErrStatisticInfo, error) { + if !common.IsValidLogicIDOrPhyID(logicID) { + return nil, fmt.Errorf("input invalid logicID when get sio info: %d", logicID) + } + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + return nil, fmt.Errorf("failed to 
get cardID and deviceID by logicID(%d) when get sio info , error: %v", logicID, err) + } + cgoSPodSioInfo, err := d.DcMgr.DcGetSioInfo(cardID, deviceID) + if err != nil { + return nil, err + } + + return &cgoSPodSioInfo, nil +} + +// GetHccsStatisticInfo get HCCS statistic info +func (d *DeviceManager) GetHccsStatisticInfo(logicID int32) (*common.HccsStatisticInfo, error) { + if !common.IsValidLogicIDOrPhyID(logicID) { + return buildFailedHccsInfo(), fmt.Errorf("input invalid logicID when get hccs statistic info: %d", logicID) + } + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + return buildFailedHccsInfo(), fmt.Errorf("failed to get cardID and deviceID by logicID(%d) "+ + "when get hccs statistic info, error: %v", logicID, err) + } + cgoHccsStatusInfo, err := d.DcMgr.DcGetHccsStatisticInfo(cardID, deviceID) + if err != nil { + return buildFailedHccsInfo(), err + + } + + return &cgoHccsStatusInfo, nil +} + +// GetHccsStatisticInfoInU64 get hccs statistic info in u64 +func (d *DeviceManager) GetHccsStatisticInfoInU64(logicID int32) (*common.HccsStatisticInfo, error) { + if !common.IsValidLogicIDOrPhyID(logicID) { + return buildFailedHccsInfo(), fmt.Errorf("input invalid logicID when get hccs statistic info: %d", logicID) + } + cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + return buildFailedHccsInfo(), fmt.Errorf("failed to get cardID and deviceID by logicID(%d) "+ + "when get hccs statistic info, error: %v", logicID, err) + } + cgoHccsStatusInfo, err := d.DcMgr.DcGetHccsStatisticInfoU64(cardID, deviceID) + if err != nil { + return buildFailedHccsInfo(), err + } + return &cgoHccsStatusInfo, nil +} + +// GetHccsBandwidthInfo get hccs bandwidth info +func (d *DeviceManager) GetHccsBandwidthInfo(logicID int32) (*common.HccsBandwidthInfo, error) { + + if !common.IsValidLogicIDOrPhyID(logicID) { + return buildFailedHccsBWInfo(), fmt.Errorf("input invalid logicID when get hccs bandwidth info: %d", logicID) + } + 
cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) + if err != nil { + return buildFailedHccsBWInfo(), fmt.Errorf("failed to get cardID and deviceID by logicID(%d) "+ + "when get hccs bandwidth info, error: %v", logicID, err) + } + cgoHccsBandwidthInfo, err := d.DcMgr.DcGetHccsBandwidthInfo(cardID, deviceID, common.HccsBWProfilingTime) + if err != nil { + return buildFailedHccsBWInfo(), fmt.Errorf("failed to get hccs bandwidth info by cardId(%d) deviceID(%d), error: %v", + cardID, deviceID, err) + } + + return &cgoHccsBandwidthInfo, nil +} + +// buildFailedHccsInfo build failed hccs info +func buildFailedHccsInfo() *common.HccsStatisticInfo { + errorResult := &common.HccsStatisticInfo{ + TxCnt: make([]uint64, 8), + RxCnt: make([]uint64, 8), + CrcErrCnt: make([]uint64, 8), + } + for i := 0; i < 8; i++ { + errorResult.TxCnt[i] = common.FailedValue + errorResult.RxCnt[i] = common.FailedValue + errorResult.CrcErrCnt[i] = common.FailedValue + } + return errorResult +} + +// buildFailedHccsBWInfo build failed hccs bandwidth info +func buildFailedHccsBWInfo() *common.HccsBandwidthInfo { + errorResult := &common.HccsBandwidthInfo{ + ProfilingTime: uint32(common.HccsBWProfilingTime), + TotalTxbw: common.FailedValue, + TotalRxbw: common.FailedValue, + TxBandwidth: make([]float64, 8), + RxBandwidth: make([]float64, 8), + } + for i := 0; i < 8; i++ { + errorResult.TxBandwidth[i] = common.FailedValue + errorResult.RxBandwidth[i] = common.FailedValue + } + return errorResult +} + +func (d *DeviceManager) getCardIdAndDeviceId(logicID int32) (int32, int32, error) { + + if !common.IsValidLogicIDOrPhyID(logicID) { + return common.RetError, common.RetError, fmt.Errorf("input invalid logicID: %d", logicID) + } + + result, ok := idCache.Load(logicID) + if !ok { + return d.doGetCardIDAndDeviceID(logicID) + } + idMapping, ok := result.(npuIdMapping) + if !ok { + idCache.Delete(logicID) + return d.doGetCardIDAndDeviceID(logicID) + } + hwlog.RunLog.Debugf("get cardId and deviceId by 
logicID(%d) from cache, cardId:%v, deviceId:%v", + logicID, idMapping.cardId, idMapping.deviceId) + return idMapping.cardId, idMapping.deviceId, nil +} + +func (d *DeviceManager) doGetCardIDAndDeviceID(logicID int32) (int32, int32, error) { + cardId, deviceId, err := d.DcMgr.DcGetCardIDDeviceID(logicID) + if err != nil { + hwlog.RunLog.ErrorfWithLimit(common.DomainForLogicIdErr, logicID, + "failed to get cardId and deviceId by logicID(%d), error: %v", logicID, err) + return common.RetError, common.RetError, err + } + hwlog.ResetErrCnt(common.DomainForLogicIdErr, logicID) + hwlog.RunLog.Debugf("get cardId and deviceId by logicID(%d) from dcmi, cardId:%v, deviceId:%v", + logicID, cardId, deviceId) + idCache.Store(logicID, npuIdMapping{logicId: logicID, cardId: cardId, deviceId: deviceId}) + return cardId, deviceId, nil +} + +// GetChipBaseInfos get chip base info +func (d *DeviceManager) GetChipBaseInfos() ([]*common.ChipBaseInfo, error) { + _, cardList, err := d.DcMgr.DcGetCardList() + if err != nil { + return nil, fmt.Errorf("get card list failed, error: %v", err) + } + var chips = []*common.ChipBaseInfo{} + for _, cardID := range cardList { + devNumInCard, err := d.DcMgr.DcGetDeviceNumInCard(cardID) + if err != nil { + return nil, fmt.Errorf("get device num by cardID: %d failed, error: %v", + cardID, err) + } + for devID := int32(0); devID < devNumInCard; devID++ { + logicID, err := d.DcMgr.DcGetDeviceLogicID(cardID, devID) + if err != nil { + return nil, fmt.Errorf("get device (cardID: %d, deviceID: %d) logic id "+ + "failed, error: %v", cardID, devID, err) + } + physicID, err := d.DcMgr.DcGetPhysicIDFromLogicID(logicID) + if err != nil { + return nil, fmt.Errorf("get device (cardID: %d, deviceID: %d) physic id "+"failed, error: %v", + cardID, devID, err) + } + hwlog.RunLog.Infof("get chip base info, cardID: %d, deviceID: %d, logicID: %d, physicID: %d", cardID, + devID, logicID, physicID) + chips = append(chips, &common.ChipBaseInfo{ + PhysicID: physicID, + 
LogicID: logicID, + CardID: cardID, + DeviceID: devID, + }) + } + } + return chips, nil +} + +// DcStartHccsPingMesh start hccs ping mesh +func (d *DeviceManager) DcStartHccsPingMesh(cardID int32, deviceID int32, portID int, + operate common.HccspingMeshOperate) error { + return d.DcMgr.DcStartHccsPingMesh(cardID, deviceID, portID, operate) +} + +// DcStopHccsPingMesh stop hccs ping mesh +func (d *DeviceManager) DcStopHccsPingMesh(cardID int32, deviceID int32, portID int, taskID uint) error { + return d.DcMgr.DcStopHccsPingMesh(cardID, deviceID, portID, taskID) +} + +// DcGetHccsPingMeshInfo get hccs ping mesh info +func (d *DeviceManager) DcGetHccsPingMeshInfo(cardID int32, deviceID int32, portID int, + taskID uint) (*common.HccspingMeshInfo, error) { + return d.DcMgr.DcGetHccsPingMeshInfo(cardID, deviceID, portID, taskID) +} + +// DcGetHccsPingMeshState get hccs ping mesh state +func (d *DeviceManager) DcGetHccsPingMeshState(cardID int32, deviceID int32, portID int, taskID uint) (int, error) { + return d.DcMgr.DcGetHccsPingMeshState(cardID, deviceID, portID, taskID) +} + +// DcGetSuperPodStatus get super pod status +func (d *DeviceManager) DcGetSuperPodStatus(cardID int32, deviceID int32, sdid uint32) (int, error) { + var err error + var status int + for i := 0; i < maxRetries; i++ { + if status, err = d.DcMgr.DcGetSuperPodStatus(cardID, deviceID, sdid); err != nil { + hwlog.RunLog.Errorf("get super pod status failed, retry %d, cardID: %d, deviceID: %d, "+ + "sdid: %d, error: %v", i, cardID, deviceID, sdid, err) + continue + } + break + } + return status, err +} + +// DcSetSuperPodStatus set super pod status +func (d *DeviceManager) DcSetSuperPodStatus(cardID int32, deviceID int32, sdid, status uint32) error { + var err error + for i := 0; i < maxRetries; i++ { + if err = d.DcMgr.DcSetSuperPodStatus(cardID, deviceID, sdid, status); err != nil { + hwlog.RunLog.Errorf("set super pod status failed, retry %d, cardID: %d, deviceID: %d, "+ + "sdid: %d, status: %d, 
error: %v", i, cardID, deviceID, sdid, status, err) + continue + } + break + } + return err +} diff --git a/mind-cluster/component/ascend-common/devmanager/devmanager_910a3_mock.go b/mind-cluster/component/ascend-common/devmanager/devmanager_910a3_mock.go new file mode 100644 index 0000000..ca7121b --- /dev/null +++ b/mind-cluster/component/ascend-common/devmanager/devmanager_910a3_mock.go @@ -0,0 +1,30 @@ +/* Copyright(C) 2024. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package devmanager this for device driver manager mock +package devmanager + +import ( + "ascend-common/api" +) + +// DeviceManager910A3Mock common device manager mock for Ascend910A3 +type DeviceManager910A3Mock struct { + DeviceManagerMock +} + +// GetDevType return mock type +func (d *DeviceManager910A3Mock) GetDevType() string { + return api.Ascend910A3 +} diff --git a/mind-cluster/component/ascend-common/devmanager/devmanager_910a3_mock_err.go b/mind-cluster/component/ascend-common/devmanager/devmanager_910a3_mock_err.go new file mode 100644 index 0000000..817f06e --- /dev/null +++ b/mind-cluster/component/ascend-common/devmanager/devmanager_910a3_mock_err.go @@ -0,0 +1,43 @@ +/* Copyright(C) 2024. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package devmanager this for device driver manager error mock +package devmanager + +import ( + "errors" + + "ascend-common/api" + "ascend-common/devmanager/common" +) + +// DeviceManager910A3MockErr common device manager mock error for Ascend910A3 +type DeviceManager910A3MockErr struct { + DeviceManagerMockErr +} + +// GetDevType return mock type +func (d *DeviceManager910A3MockErr) GetDevType() string { + return api.Ascend910A3 +} + +// GetHccsStatisticInfo get hccs statistic info +func (d *DeviceManager910A3MockErr) GetHccsStatisticInfo(logicID int32) (*common.HccsStatisticInfo, error) { + return &common.HccsStatisticInfo{}, errors.New(errorMsg) +} + +// GetHccsBandwidthInfo get hccs statistic info +func (d *DeviceManager910A3MockErr) GetHccsBandwidthInfo(logicID int32) (*common.HccsBandwidthInfo, error) { + return &common.HccsBandwidthInfo{}, errors.New(errorMsg) +} diff --git a/mind-cluster/component/ascend-common/devmanager/devmanager_hccs_test.go b/mind-cluster/component/ascend-common/devmanager/devmanager_hccs_test.go new file mode 100644 index 0000000..3d7fff4 --- /dev/null +++ b/mind-cluster/component/ascend-common/devmanager/devmanager_hccs_test.go @@ -0,0 +1,166 @@ +/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package devmanager for device driver manager +package devmanager + +import ( + "errors" + "testing" + + "github.com/agiledragon/gomonkey/v2" + "github.com/smartystreets/goconvey/convey" + + "ascend-common/devmanager/common" + "ascend-common/devmanager/dcmi" +) + +const ( + mockLogicID int32 = 0 + mockCardID int32 = 0 + mockDeviceID int32 = 0 + invalidLogicID int32 = -1 + mockErrorMsg string = "mock error" + hccsArrayLen int = 8 +) + +type getHccsStatisticInfoInU64TestCase struct { + name string + logicID int32 + isValidID bool + getCardIDErr error + dcmiCallErr error + expectedErr bool +} + +func TestGetHccsStatisticInfoInU64(t *testing.T) { + testCases := buildGetHccsStatisticInfoInU64TestCases() + + for _, tc := range testCases { + convey.Convey(tc.name, t, func() { + patches := gomonkey.NewPatches() + defer patches.Reset() + + clearIdCache(tc.logicID) + manager := createMockDeviceManager() + setupGetHccsStatisticInfoInU64Patches(patches, manager, tc) + result, err := manager.GetHccsStatisticInfoInU64(tc.logicID) + verifyGetHccsStatisticInfoInU64Result(result, err, tc) + }) + } +} + +func clearIdCache(logicID int32) { + idCache.Delete(logicID) +} + +func buildGetHccsStatisticInfoInU64TestCases() []getHccsStatisticInfoInU64TestCase { + return []getHccsStatisticInfoInU64TestCase{ + {name: "should return failed info when logicID is invalid", + logicID: invalidLogicID, + isValidID: false, + expectedErr: true}, + {name: "should return failed info when getCardIdAndDeviceId fails", + logicID: mockLogicID, + isValidID: true, + getCardIDErr: 
errors.New(mockErrorMsg), + expectedErr: true}, + {name: "should return failed info when DcGetHccsStatisticInfoU64 fails", + logicID: mockLogicID, + isValidID: true, + dcmiCallErr: errors.New(mockErrorMsg), + expectedErr: true}, + {name: "should return success info when all operations succeed", + logicID: mockLogicID, + isValidID: true, + expectedErr: false}, + } +} + +func createMockDeviceManager() *DeviceManager { + return &DeviceManager{ + DcMgr: &dcmi.DcManager{}, + } +} + +func setupGetHccsStatisticInfoInU64Patches(patches *gomonkey.Patches, + manager *DeviceManager, tc getHccsStatisticInfoInU64TestCase) { + patches.ApplyFuncReturn(common.IsValidLogicIDOrPhyID, tc.isValidID) + if !tc.isValidID { + return + } + if tc.getCardIDErr != nil { + patches.ApplyMethodReturn(manager.DcMgr, "DcGetCardIDDeviceID", + mockCardID, mockDeviceID, tc.getCardIDErr) + } else { + patches.ApplyMethodReturn(manager.DcMgr, "DcGetCardIDDeviceID", + mockCardID, mockDeviceID, nil) + if tc.dcmiCallErr != nil { + patches.ApplyMethodReturn(manager.DcMgr, "DcGetHccsStatisticInfoU64", + common.HccsStatisticInfo{}, tc.dcmiCallErr) + } else { + mockHccsInfo := createMockHccsStatisticInfo() + patches.ApplyMethodReturn(manager.DcMgr, "DcGetHccsStatisticInfoU64", + mockHccsInfo, nil) + } + } +} + +func createMockHccsStatisticInfo() common.HccsStatisticInfo { + txCnt := make([]uint64, hccsArrayLen) + rxCnt := make([]uint64, hccsArrayLen) + crcErrCnt := make([]uint64, hccsArrayLen) + for i := 0; i < hccsArrayLen; i++ { + txCnt[i] = uint64(i + 1) + rxCnt[i] = uint64(i + 1) + crcErrCnt[i] = 0 + } + return common.HccsStatisticInfo{ + TxCnt: txCnt, + RxCnt: rxCnt, + CrcErrCnt: crcErrCnt, + } +} + +func verifyGetHccsStatisticInfoInU64Result(result *common.HccsStatisticInfo, + err error, tc getHccsStatisticInfoInU64TestCase) { + if tc.expectedErr { + convey.So(err, convey.ShouldNotBeNil) + convey.So(result, convey.ShouldNotBeNil) + verifyFailedHccsInfo(result) + } else { + convey.So(err, 
convey.ShouldBeNil) + convey.So(result, convey.ShouldNotBeNil) + verifySuccessHccsInfo(result) + } +} + +func verifyFailedHccsInfo(result *common.HccsStatisticInfo) { + convey.So(len(result.TxCnt), convey.ShouldEqual, hccsArrayLen) + convey.So(len(result.RxCnt), convey.ShouldEqual, hccsArrayLen) + convey.So(len(result.CrcErrCnt), convey.ShouldEqual, hccsArrayLen) + for i := 0; i < hccsArrayLen; i++ { + convey.So(result.TxCnt[i], convey.ShouldEqual, common.FailedValue) + convey.So(result.RxCnt[i], convey.ShouldEqual, common.FailedValue) + convey.So(result.CrcErrCnt[i], convey.ShouldEqual, common.FailedValue) + } +} + +func verifySuccessHccsInfo(result *common.HccsStatisticInfo) { + convey.So(len(result.TxCnt), convey.ShouldEqual, hccsArrayLen) + convey.So(len(result.RxCnt), convey.ShouldEqual, hccsArrayLen) + convey.So(len(result.CrcErrCnt), convey.ShouldEqual, hccsArrayLen) + convey.So(result.TxCnt[0], convey.ShouldEqual, uint64(1)) + convey.So(result.RxCnt[0], convey.ShouldEqual, uint64(1)) +} diff --git a/mind-cluster/component/ascend-common/devmanager/devmanager_mock.go b/mind-cluster/component/ascend-common/devmanager/devmanager_mock.go new file mode 100644 index 0000000..c3bde2b --- /dev/null +++ b/mind-cluster/component/ascend-common/devmanager/devmanager_mock.go @@ -0,0 +1,370 @@ +/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package devmanager this for device driver manager mock +package devmanager + +import ( + "ascend-common/api" + "ascend-common/devmanager/common" + "ascend-common/devmanager/dcmi" +) + +// DeviceManagerMock common device manager mock for Ascend910/310P/310 +type DeviceManagerMock struct { +} + +// DcStartHccsPingMesh start hccs ping mesh +func (d *DeviceManagerMock) DcStartHccsPingMesh(i int32, i2 int32, i3 int, operate common.HccspingMeshOperate) error { + return nil +} + +// DcStopHccsPingMesh stop hccs ping mesh +func (d *DeviceManagerMock) DcStopHccsPingMesh(i int32, i2 int32, i3 int, u uint) error { + return nil +} + +// DcGetHccsPingMeshInfo get hccs ping mesh info +func (d *DeviceManagerMock) DcGetHccsPingMeshInfo(i int32, i2 int32, i3 int, u uint) (*common.HccspingMeshInfo, error) { + return &common.HccspingMeshInfo{}, nil +} + +// DcGetHccsPingMeshState get hccs ping mesh state +func (d *DeviceManagerMock) DcGetHccsPingMeshState(i int32, i2 int32, i3 int, u uint) (int, error) { + return 0, nil +} + +// Init load symbol and initialize dcmi +func (d *DeviceManagerMock) Init() error { + return nil +} + +// ShutDown clean the dynamically loaded resource +func (d *DeviceManagerMock) ShutDown() error { + return nil +} + +// GetDevType return mock type +func (d *DeviceManagerMock) GetDevType() string { + return api.Ascend910A +} + +// GetDeviceCount get npu device count +func (d *DeviceManagerMock) GetDeviceCount() (int32, error) { + return 1, nil +} + +// GetCardList get all card list +func (d *DeviceManagerMock) GetCardList() (int32, []int32, error) { + return 1, []int32{0}, nil +} + +// GetDeviceNumInCard get all device list in one card +func (d *DeviceManagerMock) GetDeviceNumInCard(cardID int32) (int32, error) { + return 1, nil +} + +// GetDeviceList get all device logicID list +func (d *DeviceManagerMock) GetDeviceList() (int32, []int32, error) { + return 1, []int32{0}, nil +} + +// GetDeviceHealth query npu device health status +func (d 
*DeviceManagerMock) GetDeviceHealth(logicID int32) (uint32, error) { + return 0, nil +} + +// GetDeviceNetWorkHealth query npu device network health status +func (d *DeviceManagerMock) GetDeviceNetWorkHealth(logicID int32) (uint32, error) { + return 0, nil +} + +// GetDeviceUtilizationRate get npu device utilization +func (d *DeviceManagerMock) GetDeviceUtilizationRate(logicID int32, deviceType common.DeviceType) (uint32, error) { + return 1, nil +} + +// GetDeviceTemperature get npu device temperature +func (d *DeviceManagerMock) GetDeviceTemperature(logicID int32) (int32, error) { + return 1, nil +} + +// GetDeviceVoltage get npu device voltage +func (d *DeviceManagerMock) GetDeviceVoltage(logicID int32) (float32, error) { + return 1, nil +} + +// GetDevicePowerInfo get npu device power info +func (d *DeviceManagerMock) GetDevicePowerInfo(logicID int32) (float32, error) { + return 1, nil +} + +// GetDeviceFrequency get npu device work frequency +func (d *DeviceManagerMock) GetDeviceFrequency(logicID int32, deviceType common.DeviceType) (uint32, error) { + return 1, nil +} + +// GetDeviceMemoryInfo get npu memory information +func (d *DeviceManagerMock) GetDeviceMemoryInfo(logicID int32) (*common.MemoryInfo, error) { + return &common.MemoryInfo{ + MemorySize: 1, + MemoryAvailable: 1, + Frequency: 1, + Utilization: 1, + }, nil +} + +// GetDeviceHbmInfo get npu HBM module memory and frequency information +func (d *DeviceManagerMock) GetDeviceHbmInfo(logicID int32) (*common.HbmInfo, error) { + return &common.HbmInfo{ + MemorySize: 1, + Frequency: 1, + Usage: 1, + Temp: 1, + BandWidthUtilRate: 1, + }, nil +} + +// GetDeviceErrorCode get npu device error code +func (d *DeviceManagerMock) GetDeviceErrorCode(logicID int32) (int32, int64, error) { + return int32(0), int64(0), nil +} + +// GetChipInfo get npu chip info (type, name and version) +func (d *DeviceManagerMock) GetChipInfo(logicID int32) (*common.ChipInfo, error) { + chip := &common.ChipInfo{ + Type: "ascend", + Name: 
common.Chip910, + Version: "v1", + } + return chip, nil +} + +// GetPhysicIDFromLogicID get device physic id from logic id +func (d *DeviceManagerMock) GetPhysicIDFromLogicID(logicID int32) (int32, error) { + return 1, nil +} + +// GetLogicIDFromPhysicID get device logic id from physic id +func (d *DeviceManagerMock) GetLogicIDFromPhysicID(physicID int32) (int32, error) { + return 1, nil +} + +// GetDeviceLogicID get device logic id from card id and device id +func (d *DeviceManagerMock) GetDeviceLogicID(cardID, deviceID int32) (int32, error) { + return 1, nil +} + +// GetDeviceIPAddress get device ip address +func (d *DeviceManagerMock) GetDeviceIPAddress(logicID, ipType int32) (string, error) { + if ipType == 0 { + return "127.0.0.1", nil + } + return "::1", nil +} + +// CreateVirtualDevice create virtual device +func (d *DeviceManagerMock) CreateVirtualDevice(logicID int32, vDevInfo common.CgoCreateVDevRes) (common. + CgoCreateVDevOut, error) { + return common.CgoCreateVDevOut{}, nil +} + +// GetVirtualDeviceInfo get virtual device info +func (d *DeviceManagerMock) GetVirtualDeviceInfo(logicID int32) (common.VirtualDevInfo, error) { + return common.VirtualDevInfo{}, nil +} + +// DestroyVirtualDevice destroy virtual device +func (d *DeviceManagerMock) DestroyVirtualDevice(logicID int32, vDevID uint32) error { + return nil +} + +// GetMcuPowerInfo get mcu power info for cardID +func (d *DeviceManagerMock) GetMcuPowerInfo(cardID int32) (float32, error) { + return 1, nil +} + +// GetCardIDDeviceID get cardID and deviceID by logicID +func (d *DeviceManagerMock) GetCardIDDeviceID(logicID int32) (int32, int32, error) { + return 0, 0, nil +} + +// GetProductType get product type success +func (d *DeviceManagerMock) GetProductType(cardID, deviceID int32) (string, error) { + return "", nil +} + +// GetAllProductType get all product type success +func (d *DeviceManagerMock) GetAllProductType() ([]string, error) { + return []string{}, nil +} + +// GetNpuWorkMode get npu 
chip work mode SMP success +func (d *DeviceManagerMock) GetNpuWorkMode() string { + return common.SMPMode +} + +// SetDeviceReset set device reset success +func (d *DeviceManagerMock) SetDeviceReset(cardID, deviceID int32) error { + return nil +} + +// GetDeviceBootStatus get device boot status success +func (d *DeviceManagerMock) GetDeviceBootStatus(logicID int32) (int, error) { + return common.BootStartFinish, nil +} + +// GetDeviceAllErrorCode get device all error code success +func (d *DeviceManagerMock) GetDeviceAllErrorCode(logicID int32) (int32, []int64, error) { + return 0, []int64{}, nil +} + +// SubscribeDeviceFaultEvent subscribe device fault event success +func (d *DeviceManagerMock) SubscribeDeviceFaultEvent(logicID int32) error { + return nil +} + +// SetFaultEventCallFunc set fault event call func success +func (d *DeviceManagerMock) SetFaultEventCallFunc(businessFunc func(common.DevFaultInfo)) error { + return nil +} + +// GetDieID get die id success +func (d *DeviceManagerMock) GetDieID(logicID int32, dcmiDieType dcmi.DieType) (string, error) { + return "ABCDEFGHIGKLMNOPQRSTUVWXYZ01234567890123", nil +} + +// GetDevProcessInfo get process info +func (d *DeviceManagerMock) GetDevProcessInfo(logicID int32) (*common.DevProcessInfo, error) { + return &common.DevProcessInfo{}, nil +} + +// GetPCIeBusInfo get pcie bus info +func (d *DeviceManagerMock) GetPCIeBusInfo(logicID int32) (string, error) { + return "0000:61:00.0", nil +} + +// GetBoardInfo Get board info +func (d *DeviceManagerMock) GetBoardInfo(logicID int32) (common.BoardInfo, error) { + return common.BoardInfo{}, nil +} + +// GetCardElabelV2 get card elabel information +func (d *DeviceManagerMock) GetCardElabelV2(cardID int32) (common.ElabelInfo, error) { + return common.ElabelInfo{}, nil +} + +// GetProductTypeArray test for get product type array +func (d *DeviceManagerMock) GetProductTypeArray() []string { + return []string{common.Atlas200ISoc} +} + +// GetPCIEBandwidth get pcie bandwidth 
+func (d *DeviceManagerMock) GetPCIEBandwidth(logicID int32, _ int) (common.PCIEBwStat, error) { + return common.PCIEBwStat{}, nil +} + +// SetIsTrainingCard set IsTrainingCard +func (d *DeviceManagerMock) SetIsTrainingCard() error { + return nil +} + +// IsTrainingCard get IsTrainingCard +func (d *DeviceManagerMock) IsTrainingCard() bool { + return true +} + +// GetDcmiVersion get dcmi version +func (d *DeviceManagerMock) GetDcmiVersion() string { + return "v1" +} + +// GetValidChipInfo get valid chip info from all npu +func (d *DeviceManagerMock) GetValidChipInfo() (common.ChipInfo, error) { + return common.ChipInfo{}, nil +} + +// GetDeviceEccInfo get device ECC info +func (d *DeviceManagerMock) GetDeviceEccInfo(logicID int32, + dcmiDeviceType common.DcmiDeviceType) (*common.ECCInfo, error) { + return &common.ECCInfo{EnableFlag: 1}, nil +} + +// GetSuperPodInfo get super pod info +func (d *DeviceManagerMock) GetSuperPodInfo(logicID int32) (common.CgoSuperPodInfo, error) { + return common.CgoSuperPodInfo{}, nil +} + +// GetSioInfo get sio info +func (d *DeviceManagerMock) GetSioInfo(logicID int32) (*common.SioCrcErrStatisticInfo, error) { + return &common.SioCrcErrStatisticInfo{ + TxErrCnt: 0, + RxErrCnt: 0, + }, nil +} + +// GetHccsStatisticInfo get hccs statistic info +func (d *DeviceManagerMock) GetHccsStatisticInfo(logicID int32) (*common.HccsStatisticInfo, error) { + return &common.HccsStatisticInfo{}, nil +} + +// GetHccsStatisticInfoInU64 get hccs statistic info in u64 +func (d *DeviceManagerMock) GetHccsStatisticInfoInU64(logicID int32) (*common.HccsStatisticInfo, error) { + return &common.HccsStatisticInfo{}, nil +} + +// GetMainBoardId get main board id +func (d *DeviceManagerMock) GetMainBoardId() uint32 { + return 0 +} + +// GetHccsBandwidthInfo get hccs bandwidth info +func (d *DeviceManagerMock) GetHccsBandwidthInfo(logicID int32) (*common.HccsBandwidthInfo, error) { + return &common.HccsBandwidthInfo{}, nil +} + +// GetBrotherCardID get brother 
card id +func (d *DeviceManagerMock) GetBrotherCardID(cardID, deviceID int32) (int32, error) { + const noneBroCard = -1 + return noneBroCard, nil +} + +// GetOutBandChannelState get out band channel state +func (d *DeviceManagerMock) GetOutBandChannelState(cardID, deviceID int32) error { + return nil +} + +// PreResetSoc pre reset soc, used before reset out band +func (d *DeviceManagerMock) PreResetSoc(cardID, deviceID int32) error { + return nil +} + +// SetDeviceResetOutBand reset spec device out band +func (d *DeviceManagerMock) SetDeviceResetOutBand(cardID, deviceID int32) error { + return nil +} + +// RescanSoc trigger soc rescan, non-blocking +func (d *DeviceManagerMock) RescanSoc(cardID, deviceID int32) error { + return nil +} + +// GetChipBaseInfos get chip base info +func (d *DeviceManagerMock) GetChipBaseInfos() ([]*common.ChipBaseInfo, error) { + return nil, nil +} + +func (d *DeviceManagerMock) DcGetSuperPodStatus(int32, int32, uint32) (int, error) { return 0, nil } + +func (d *DeviceManagerMock) DcSetSuperPodStatus(int32, int32, uint32, uint32) error { return nil } diff --git a/mind-cluster/component/ascend-common/devmanager/devmanager_mock_err.go b/mind-cluster/component/ascend-common/devmanager/devmanager_mock_err.go new file mode 100644 index 0000000..8ad8d7c --- /dev/null +++ b/mind-cluster/component/ascend-common/devmanager/devmanager_mock_err.go @@ -0,0 +1,369 @@ +/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package devmanager this for device driver manager error mock +package devmanager + +import ( + "errors" + + "ascend-common/api" + "ascend-common/devmanager/common" + "ascend-common/devmanager/dcmi" +) + +var errorMsg = "mock error" + +// DeviceManagerMockErr common device manager mock error for Ascend910/310P/310 +type DeviceManagerMockErr struct { +} + +// DcStartHccsPingMesh start hccs ping mesh +func (d *DeviceManagerMockErr) DcStartHccsPingMesh(i int32, i2 int32, i3 int, + operate common.HccspingMeshOperate) error { + return errors.New(errorMsg) +} + +// DcStopHccsPingMesh stop hccs ping mesh +func (d *DeviceManagerMockErr) DcStopHccsPingMesh(i int32, i2 int32, i3 int, u uint) error { + return errors.New(errorMsg) +} + +// DcGetHccsPingMeshInfo get hccs ping mesh info +func (d *DeviceManagerMockErr) DcGetHccsPingMeshInfo(i int32, i2 int32, i3 int, u uint) (*common.HccspingMeshInfo, + error) { + return nil, errors.New(errorMsg) +} + +// DcGetHccsPingMeshState get hccs ping mesh state +func (d *DeviceManagerMockErr) DcGetHccsPingMeshState(i int32, i2 int32, i3 int, u uint) (int, error) { + return 1, errors.New(errorMsg) +} + +// Init load symbol and initialize dcmi +func (d *DeviceManagerMockErr) Init() error { + return errors.New(errorMsg) +} + +// ShutDown clean the dynamically loaded resource +func (d *DeviceManagerMockErr) ShutDown() error { + return errors.New(errorMsg) +} + +// GetDevType return mock type +func (d *DeviceManagerMockErr) GetDevType() string { + return api.Ascend910A +} + +// GetDeviceCount get npu device count +func (d *DeviceManagerMockErr) GetDeviceCount() (int32, error) { + return 1, errors.New(errorMsg) +} + +// GetCardList get all card list +func (d *DeviceManagerMockErr) GetCardList() (int32, []int32, error) { + return 1, []int32{0}, errors.New(errorMsg) +} + +// GetDeviceNumInCard get all device list in one card +func (d 
*DeviceManagerMockErr) GetDeviceNumInCard(cardID int32) (int32, error) { + return 1, errors.New(errorMsg) +} + +// GetDeviceList get all device logicID list +func (d *DeviceManagerMockErr) GetDeviceList() (int32, []int32, error) { + return 1, []int32{0}, errors.New(errorMsg) +} + +// GetDeviceHealth query npu device health status +func (d *DeviceManagerMockErr) GetDeviceHealth(logicID int32) (uint32, error) { + return 0, errors.New(errorMsg) +} + +// GetDeviceNetWorkHealth query npu device network health status +func (d *DeviceManagerMockErr) GetDeviceNetWorkHealth(logicID int32) (uint32, error) { + return 0, errors.New(errorMsg) +} + +// GetDeviceUtilizationRate get npu device utilization +func (d *DeviceManagerMockErr) GetDeviceUtilizationRate(logicID int32, deviceType common.DeviceType) (uint32, error) { + return 1, errors.New(errorMsg) +} + +// GetDeviceTemperature get npu device temperature +func (d *DeviceManagerMockErr) GetDeviceTemperature(logicID int32) (int32, error) { + return 1, errors.New(errorMsg) +} + +// GetDeviceVoltage get npu device voltage +func (d *DeviceManagerMockErr) GetDeviceVoltage(logicID int32) (float32, error) { + return 1, errors.New(errorMsg) +} + +// GetDevicePowerInfo get npu device power info +func (d *DeviceManagerMockErr) GetDevicePowerInfo(logicID int32) (float32, error) { + return 1, errors.New(errorMsg) +} + +// GetDeviceFrequency get npu device work frequency +func (d *DeviceManagerMockErr) GetDeviceFrequency(logicID int32, deviceType common.DeviceType) (uint32, error) { + return 1, errors.New(errorMsg) +} + +// GetDeviceMemoryInfo get npu memory information +func (d *DeviceManagerMockErr) GetDeviceMemoryInfo(logicID int32) (*common.MemoryInfo, error) { + return &common.MemoryInfo{ + MemorySize: 1, + MemoryAvailable: 1, + Frequency: 1, + Utilization: 1, + }, errors.New(errorMsg) +} + +// GetDeviceHbmInfo get npu HBM module memory and frequency information +func (d *DeviceManagerMockErr) GetDeviceHbmInfo(logicID int32) 
(*common.HbmInfo, error) { + return &common.HbmInfo{ + MemorySize: 1, + Frequency: 1, + Usage: 1, + Temp: 1, + BandWidthUtilRate: 1, + }, errors.New(errorMsg) +} + +// GetDeviceErrorCode get npu device error code +func (d *DeviceManagerMockErr) GetDeviceErrorCode(logicID int32) (int32, int64, error) { + return int32(0), int64(0), errors.New(errorMsg) +} + +// GetChipInfo get npu chip info (type, name and version) together with a mock error +func (d *DeviceManagerMockErr) GetChipInfo(logicID int32) (*common.ChipInfo, error) { + chip := &common.ChipInfo{ + Type: "ascend", + Name: common.Chip910, + Version: "v1", + } + return chip, errors.New(errorMsg) +} + +// GetPhysicIDFromLogicID get device physic id from logic id +func (d *DeviceManagerMockErr) GetPhysicIDFromLogicID(logicID int32) (int32, error) { + return 1, errors.New(errorMsg) +} + +// GetLogicIDFromPhysicID get device logic id from physic id +func (d *DeviceManagerMockErr) GetLogicIDFromPhysicID(physicID int32) (int32, error) { + return 1, errors.New(errorMsg) +} + +// GetDeviceLogicID get device logic id from card id and device id +func (d *DeviceManagerMockErr) GetDeviceLogicID(cardID, deviceID int32) (int32, error) { + return 1, errors.New(errorMsg) +} + +// GetDeviceIPAddress get device ip address +func (d *DeviceManagerMockErr) GetDeviceIPAddress(logicID, ipType int32) (string, error) { + return "127.0.0.1", errors.New(errorMsg) +} + +// CreateVirtualDevice create virtual device +func (d *DeviceManagerMockErr) CreateVirtualDevice(logicID int32, + vDevInfo common.CgoCreateVDevRes) (common.CgoCreateVDevOut, error) { + return common.CgoCreateVDevOut{}, errors.New(errorMsg) +} + +// GetVirtualDeviceInfo get virtual device info +func (d *DeviceManagerMockErr) GetVirtualDeviceInfo(logicID int32) (common.VirtualDevInfo, error) { + return common.VirtualDevInfo{}, errors.New(errorMsg) +} + +// DestroyVirtualDevice destroy virtual device +func (d *DeviceManagerMockErr) DestroyVirtualDevice(logicID int32, vDevID uint32) error { + return errors.New(errorMsg) 
+} + +// GetMcuPowerInfo get mcu power info for cardID +func (d *DeviceManagerMockErr) GetMcuPowerInfo(cardID int32) (float32, error) { + return 1, errors.New(errorMsg) +} + +// GetCardIDDeviceID get cardID and deviceID by logicID +func (d *DeviceManagerMockErr) GetCardIDDeviceID(logicID int32) (int32, int32, error) { + return 0, 0, errors.New(errorMsg) +} + +// GetProductType get product type failed +func (d *DeviceManagerMockErr) GetProductType(cardID, deviceID int32) (string, error) { + return "", errors.New("not found product type name") +} + +// GetAllProductType get all product type failed +func (d *DeviceManagerMockErr) GetAllProductType() ([]string, error) { + return []string{}, errors.New("not found product type name") +} + +// GetNpuWorkMode get npu work mode failed +func (d *DeviceManagerMockErr) GetNpuWorkMode() string { + return "" +} + +// SetDeviceReset set device reset failed +func (d *DeviceManagerMockErr) SetDeviceReset(cardID, deviceID int32) error { + return errors.New(errorMsg) +} + +// GetDeviceBootStatus get device boot status failed +func (d *DeviceManagerMockErr) GetDeviceBootStatus(logicID int32) (int, error) { + return common.RetError, errors.New(errorMsg) +} + +// GetDeviceAllErrorCode get device all error code failed +func (d *DeviceManagerMockErr) GetDeviceAllErrorCode(logicID int32) (int32, []int64, error) { + return common.RetError, nil, errors.New(errorMsg) +} + +// SubscribeDeviceFaultEvent subscribe device fault event failed +func (d *DeviceManagerMockErr) SubscribeDeviceFaultEvent(logicID int32) error { + return errors.New(errorMsg) +} + +// SetFaultEventCallFunc set fault event call func failed +func (d *DeviceManagerMockErr) SetFaultEventCallFunc(businessFunc func(common.DevFaultInfo)) error { + return errors.New(errorMsg) +} + +// GetDieID get die id failed +func (d *DeviceManagerMockErr) GetDieID(logicID int32, dcmiDieType dcmi.DieType) (string, error) { + return "", errors.New(errorMsg) +} + +// GetDevProcessInfo get process 
info +func (d *DeviceManagerMockErr) GetDevProcessInfo(logicID int32) (*common.DevProcessInfo, error) { + return nil, errors.New(errorMsg) +} + +// GetPCIeBusInfo get PCIe bus info +func (d *DeviceManagerMockErr) GetPCIeBusInfo(logicID int32) (string, error) { + return "", errors.New(errorMsg) +} + +// GetBoardInfo get board info +func (d *DeviceManagerMockErr) GetBoardInfo(logicID int32) (common.BoardInfo, error) { + return common.BoardInfo{}, errors.New(errorMsg) +} + +// GetProductTypeArray test for get empty product type array +func (d *DeviceManagerMockErr) GetProductTypeArray() []string { + return nil +} + +// GetPCIEBandwidth get pcie bandwidth +func (d *DeviceManagerMockErr) GetPCIEBandwidth(logicID int32, _ int) (common.PCIEBwStat, error) { + return common.PCIEBwStat{}, errors.New(errorMsg) +} + +// SetIsTrainingCard set IsTrainingCard +func (d *DeviceManagerMockErr) SetIsTrainingCard() error { + return errors.New(errorMsg) +} + +// IsTrainingCard get IsTrainingCard +func (d *DeviceManagerMockErr) IsTrainingCard() bool { + return false +} + +// GetDcmiVersion get dcmi version failed +func (d *DeviceManagerMockErr) GetDcmiVersion() string { + return "" +} + +// GetValidChipInfo get valid chip info from all npu +func (d *DeviceManagerMockErr) GetValidChipInfo() (common.ChipInfo, error) { + return common.ChipInfo{}, errors.New("failed to find chip info") +} + +// GetDeviceEccInfo get device ECC info +func (d *DeviceManagerMockErr) GetDeviceEccInfo(logicID int32, + dcmiDeviceType common.DcmiDeviceType) (*common.ECCInfo, error) { + return nil, errors.New("failed to get device ECC info") +} + +// GetSuperPodInfo get super pod info +func (d *DeviceManagerMockErr) GetSuperPodInfo(logicID int32) (common.CgoSuperPodInfo, error) { + return common.CgoSuperPodInfo{}, nil +} + +// GetSioInfo get sio info +func (d *DeviceManagerMockErr) GetSioInfo(logicID int32) (*common.SioCrcErrStatisticInfo, error) { + return nil, errors.New(errorMsg) +} + +// GetHccsStatisticInfo get 
hccs statistic info +func (d *DeviceManagerMockErr) GetHccsStatisticInfo(logicID int32) (*common.HccsStatisticInfo, error) { + return nil, errors.New(errorMsg) +} + +// GetHccsStatisticInfoInU64 get hccs statistic info in u64 +func (d *DeviceManagerMockErr) GetHccsStatisticInfoInU64(logicID int32) (*common.HccsStatisticInfo, error) { + return nil, errors.New(errorMsg) +} + +// GetMainBoardId get main board id +func (d *DeviceManagerMockErr) GetMainBoardId() uint32 { + return 0 +} + +// GetHccsBandwidthInfo get hccs bandwidth info +func (d *DeviceManagerMockErr) GetHccsBandwidthInfo(logicID int32) (*common.HccsBandwidthInfo, error) { + return nil, errors.New(errorMsg) +} + +// GetBrotherCardID get brother card id +func (d *DeviceManagerMockErr) GetBrotherCardID(cardID, deviceID int32) (int32, error) { + return -1, nil +} + +// GetOutBandChannelState get out band channel state +func (d *DeviceManagerMockErr) GetOutBandChannelState(cardID, deviceID int32) error { + return nil +} + +// PreResetSoc pre reset soc, used before reset out band +func (d *DeviceManagerMockErr) PreResetSoc(cardID, deviceID int32) error { + return nil +} + +// SetDeviceResetOutBand reset spec device out band +func (d *DeviceManagerMockErr) SetDeviceResetOutBand(cardID, deviceID int32) error { + return nil +} + +// RescanSoc trigger soc rescan, non-blocking +func (d *DeviceManagerMockErr) RescanSoc(cardID, deviceID int32) error { + return nil +} + +// GetChipBaseInfos get chip base info +func (d *DeviceManagerMockErr) GetChipBaseInfos() ([]*common.ChipBaseInfo, error) { + return nil, errors.New(errorMsg) +} + +func (d *DeviceManagerMockErr) DcGetSuperPodStatus(int32, int32, uint32) (int, error) { return 0, nil } + +func (d *DeviceManagerMockErr) DcSetSuperPodStatus(int32, int32, uint32, uint32) error { return nil } + +// GetCardElabelV2 get card elabel information +func (d *DeviceManagerMockErr) GetCardElabelV2(cardID int32) (common.ElabelInfo, error) { + return common.ElabelInfo{}, nil +} diff 
--git a/mind-cluster/component/ascend-common/devmanager/devmanager_test.go b/mind-cluster/component/ascend-common/devmanager/devmanager_test.go new file mode 100644 index 0000000..221a812 --- /dev/null +++ b/mind-cluster/component/ascend-common/devmanager/devmanager_test.go @@ -0,0 +1,78 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package devmanager for device driver manager +package devmanager + +import ( + "errors" + "testing" + + "github.com/agiledragon/gomonkey/v2" + "github.com/smartystreets/goconvey/convey" + + "ascend-common/common-utils/hwlog" + "ascend-common/devmanager/common" + "ascend-common/devmanager/dcmi" +) + +// TestGetCardIdAndDeviceId test the getCardIdAndDeviceId function +func TestGetCardIdAndDeviceId(t *testing.T) { + + var ( + cardId, deviceId = int32(0), int32(0) + err error + returnValue = int32(0) + errReturnValue = int32(-1) + ) + manager := &DeviceManager{DcMgr: &dcmi.DcManager{}} + convey.Convey("failed to get info by dcmi", t, func() { + mk2 := gomonkey.ApplyMethodReturn(manager.DcMgr, "DcGetCardIDDeviceID", + errReturnValue, errReturnValue, errors.New("mock err")) + defer mk2.Reset() + cardId, deviceId, err = manager.getCardIdAndDeviceId(0) + + convey.So(cardId, convey.ShouldEqual, common.RetError) + convey.So(deviceId, convey.ShouldEqual, common.RetError) + convey.So(err, convey.ShouldNotBeNil) + + }) + + mk := 
gomonkey.ApplyMethodReturn(manager.DcMgr, "DcGetCardIDDeviceID", returnValue, returnValue, nil) + defer mk.Reset() + + convey.Convey("get info from dcmi", t, func() { + testGetCardIdAndDeviceId(t, cardId, deviceId, err, manager) + }) + convey.Convey("get info from cache", t, func() { + testGetCardIdAndDeviceId(t, cardId, deviceId, err, manager) + }) + +} + +func testGetCardIdAndDeviceId(t *testing.T, cardId int32, deviceId int32, err error, manager *DeviceManager) { + cardId, deviceId, err = manager.getCardIdAndDeviceId(0) + + convey.So(cardId, convey.ShouldEqual, 0) + convey.So(deviceId, convey.ShouldEqual, 0) + convey.So(err, convey.ShouldBeNil) + +} +func init() { + config := hwlog.LogConfig{ + OnlyToStdout: true, + } + hwlog.InitRunLogger(&config, nil) +} diff --git a/mind-cluster/component/ascend-common/devmanager/hccn/hccn_tool.go b/mind-cluster/component/ascend-common/devmanager/hccn/hccn_tool.go new file mode 100644 index 0000000..b6388f4 --- /dev/null +++ b/mind-cluster/component/ascend-common/devmanager/hccn/hccn_tool.go @@ -0,0 +1,335 @@ +/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package hccn this for npu hccn info +package hccn + +import ( + "fmt" + "os" + "os/exec" + "strconv" + "strings" + + "ascend-common/common-utils/hwlog" + "ascend-common/common-utils/limiter" + "ascend-common/common-utils/utils" + "ascend-common/devmanager/common" +) + +const ( + space = " " + newLine = "\n" + colon = ":" + + // LinkUp npu interface up + LinkUp string = "UP" + // LinkDown npu interface down + LinkDown string = "DOWN" + + opticalPartLen = 2 + secondIndex = 2 + linkStatusPart = 3 + base64 = 64 + + cardHealthy = 0 + + normalCode = 1 + abnormalCode = 0 + + naValue = "NA" + notSupport = "not supported" + unknownStr = "Unknown!" + + limitSize = 1024 * 1024 +) + +func getInfoFromHccnTool(args ...string) (string, error) { + const hccnTool = "/usr/local/Ascend/driver/tools/hccn_tool" + if _, err := utils.CheckPath(hccnTool); err != nil { + return "", err + } + cmd := exec.Command(hccnTool, args...) + cmd.Env = []string{ + "PATH=" + os.Getenv("PATH"), + utils.LdLibPath + "=" + os.Getenv(utils.LdLibPath), + } + limitStdout := limiter.NewLimitedWriter(limitSize) + cmd.Stdout = limitStdout + cmd.Stderr = limiter.NewLimitedWriter(limitSize) + err := cmd.Run() + if err != nil { + return "", err + } + + return string(limitStdout.GetBufferBytes()), nil +} + +// GetNPULinkStatus exec "hccn_tool -i * -link -g" to get link status +func GetNPULinkStatus(phyID int32) (string, error) { + args := []string{"-i", strconv.Itoa(int(phyID)), "-link", "-g"} + // command example: hccn_tool -i 0 -link -g + // success result example is: link status: DOWN + outStr, err := getInfoFromHccnTool(args...) 
+ hwlog.RunLog.Debugf("hccn_tool command exec result: %v", outStr) + if err != nil { + return common.Abnormal, buildHccnErr(phyID, "link status", err) + } + replacedStr := strings.ReplaceAll(outStr, newLine, "") + outArr := strings.Split(replacedStr, space) + if len(outArr) != linkStatusPart { + return common.Abnormal, buildHccnErr(phyID, "link status", + fmt.Errorf("length of output %v is not equal to %v", outArr, linkStatusPart)) + } + + status := outArr[secondIndex] + hwlog.RunLog.Debugf("hccn_tool get npu link status: %s", status) + return status, nil +} + +// GetNPULinkSpeed exec "hccn_tool -i * -speed -g" to get link speed +func GetNPULinkSpeed(phyID int32) (int, error) { + args := []string{"-i", strconv.Itoa(int(phyID)), "-speed", "-g"} + // command example: hccn_tool -i 0 -speed -g + // success result example is: Speed: 100000 Mb/s + outStr, err := getInfoFromHccnTool(args...) + if err != nil { + return common.RetError, buildHccnErr(phyID, "link speed", err) + } + return getSpeedFromOutStr(outStr, phyID) +} + +func getSpeedFromOutStr(outStr string, phyID int32) (int, error) { + if strings.Contains(outStr, unknownStr) { + return common.RetError, buildHccnErr(phyID, "link speed", fmt.Errorf("npu link speed is unknown")) + } + replacedStr := strings.ReplaceAll(outStr, newLine, "") + outArr := strings.Split(replacedStr, space) + if len(outArr) != linkStatusPart { + return common.RetError, buildHccnErr(phyID, "link speed", fmt.Errorf("length of output %v is not equal to %v", + outArr, linkStatusPart)) + } + const midIndex = 1 + speed, err := strconv.Atoi(outArr[midIndex]) + if err != nil { + return common.RetError, buildHccnErr(phyID, "link speed", fmt.Errorf("covert speed from string failed: %s", err)) + } + + return speed, nil +} + +// GetNPULinkUpNum exec "hccn_tool -i * -link_stat -g" to get link up count +func GetNPULinkUpNum(phyID int32) (int, error) { + args := []string{"-i", strconv.Itoa(int(phyID)), "-link_stat", "-g"} + // command example: hccn_tool -i 
0 -link_stat -g + // success result include: [device x]link up count : y + outStr, err := getInfoFromHccnTool(args...) + if err != nil { + return common.RetError, buildHccnErr(phyID, "link stat", err) + } + + const ( + linkUpArrLen = 6 + linkUpStr = "link up count" + ) + linkUPCount := 0 + lines := strings.Split(outStr, newLine) + for _, line := range lines { + if line == "" || !strings.Contains(line, linkUpStr) { + continue + } + + linkUpArr := strings.Fields(line) + if len(linkUpArr) != linkUpArrLen { + return common.RetError, buildHccnErr(phyID, "link up num", fmt.Errorf("length of output %v is not "+ + "equal to %v", linkUpArr, linkUpArrLen)) + } + if linkUPCount, err = strconv.Atoi(linkUpArr[linkUpArrLen-1]); err != nil { + return common.RetError, buildHccnErr(phyID, "link up num", + fmt.Errorf("covert link up num from string failed: %s", err)) + } + return linkUPCount, nil + } + + return common.RetError, buildHccnErr(phyID, "link up num", fmt.Errorf("did not find link up count")) +} + +// GetNPUStatInfo exec "hccn_tool -i * -stat -g" to get stat info +func GetNPUStatInfo(phyID int32) (map[string]int, error) { + args := []string{"-i", strconv.Itoa(int(phyID)), "-stat", "-g"} + // command example: hccn_tool -i 0 -stat -g + // success result include: [device x]link up count : y + outStr, err := getInfoFromHccnTool(args...) 
+ if err != nil { + return nil, buildHccnErr(phyID, "stat", err) + } + lines := strings.Split(outStr, newLine) + statInfoMap := make(map[string]int) + const statPartLen = 2 + for _, line := range lines { + statParts := strings.Split(line, colon) + if len(statParts) != statPartLen || statParts[1] == "" { + continue + } + statNum, err := strconv.Atoi(statParts[1]) + if err != nil { + hwlog.RunLog.Errorf("covert stat num of [%s] from string failed: %s", statParts[1], err) + continue + } + statInfoMap[statParts[0]] = statNum + } + + return statInfoMap, nil +} + +// GetNPUOpticalInfo exec "hccn_tool -i * -optical -g" to get optical info +func GetNPUOpticalInfo(phyID int32) (map[string]string, error) { + args := []string{"-i", strconv.Itoa(int(phyID)), "-optical", "-g"} + // command example: hccn_tool -i 0 -optical -g + // success result include: [device x]link up count : y + outStr, err := getInfoFromHccnTool(args...) + if err != nil { + return nil, buildHccnErr(phyID, "optical", err) + } + lines := strings.Split(outStr, newLine) + opticalInfoMap := make(map[string]string) + for _, line := range lines { + opticalParts := strings.Split(line, colon) + if len(opticalParts) != opticalPartLen { + continue + } + opticalKey := strings.ReplaceAll(strings.TrimSpace(opticalParts[0]), space, "_") + opticalValue := strings.TrimSpace(opticalParts[1]) + opticalInfoMap[opticalKey] = opticalValue + } + + return opticalInfoMap, nil +} + +// GetNPUInterfaceTraffic exec "hccn_tool -i * -bandwidth -g" to get bandwidth info +func GetNPUInterfaceTraffic(phyID int32) (float64, float64, error) { + const ( + noTraffic = common.RetError + trafficPartLen = 4 + txStr = "TX:" + rxStr = "RX:" + ) + + args := []string{"-i", strconv.Itoa(int(phyID)), "-bandwidth", "-g"} + // command example: hccn_tool -i 0 -bandwidth -g + // success result has two lines: + // Bandwidth TX: 0.00 MB/sec + // Bandwidth RX: 0.00 MB/sec + outStr, err := getInfoFromHccnTool(args...) 
+ hwlog.RunLog.Debugf("hccn_tool command exec result: %v", outStr) + if err != nil { + return noTraffic, noTraffic, buildHccnErr(phyID, "interface traffic", err) + } + + var ( + tx = float64(noTraffic) + rx = float64(noTraffic) + ) + + lines := strings.Split(outStr, newLine) + for _, line := range lines { + if line == "" { + continue + } + + trafficArr := strings.Fields(line) + hwlog.RunLog.Debugf("npu bandwidth split as: %v", trafficArr) + if len(trafficArr) != trafficPartLen { + continue + } + if strings.Contains(line, txStr) { + tmpTx, err := strconv.ParseFloat(trafficArr[secondIndex], base64) + if err != nil { + hwlog.RunLog.Errorf("get float data from Bandwidth TX err: %s", err) + continue + } + tx = tmpTx + } + if strings.Contains(line, rxStr) { + tmpRx, err := strconv.ParseFloat(trafficArr[secondIndex], base64) + if err != nil { + hwlog.RunLog.Errorf("get float data from Bandwidth RX err: %s", err) + continue + } + rx = tmpRx + } + } + return tx, rx, nil +} + +// GetFloatDataFromStr get float data from string with space +func GetFloatDataFromStr(str, dataType string) float64 { + if str == "" || strings.Contains(str, naValue) || strings.Contains(str, notSupport) { + return common.RetError + } + dataParts := strings.Split(str, space) + if len(dataParts) != opticalPartLen { + errMsg := fmt.Sprintf("convert %v optical data type failed, "+ + "the length of optical data %v is %v not equal to %d. 
", dataType, dataParts, len(dataParts), opticalPartLen) + hwlog.RunLog.Error(errMsg) + return common.RetError + } + floatData, err := strconv.ParseFloat(dataParts[0], base64) + if err != nil { + hwlog.RunLog.Errorf("convert %v optical data type to a floating-point number failed, "+ + "get float data from string %v failed, err: %v", dataType, dataParts[0], err) + return common.RetError + } + return floatData +} + +// GetHealthCode return union healthy code +func GetHealthCode(healthCode uint32) int { + if healthCode == common.UnRetError { + return common.RetError + } + + if healthCode == cardHealthy { + return normalCode + } + return abnormalCode +} + +// GetLinkStatusCode return union link status code +func GetLinkStatusCode(status string) int { + if status == common.Abnormal { + return common.RetError + } + + if status == LinkUp { + return normalCode + } + return abnormalCode +} + +// GetNetworkHealthy return union network healthy code +func GetNetworkHealthy(netCode uint32) int { + if netCode == common.UnRetError { + return common.RetError + } + + if netCode == common.NetworkInit || netCode == common.NetworkSuccess { + return normalCode + } + return abnormalCode +} + +func buildHccnErr(phyID int32, msg string, err error) error { + return fmt.Errorf("phyID(%d),get npu %s info failed,error is :%v", phyID, msg, err) +} diff --git a/mind-cluster/component/ascend-common/devmanager/hccn/hccn_tool_test.go b/mind-cluster/component/ascend-common/devmanager/hccn/hccn_tool_test.go new file mode 100644 index 0000000..7d4fe17 --- /dev/null +++ b/mind-cluster/component/ascend-common/devmanager/hccn/hccn_tool_test.go @@ -0,0 +1,49 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package hccn this for npu hccn info +package hccn + +import ( + "fmt" + "strings" + "testing" +) + +func TestBuildHccnErr(t *testing.T) { + t.Run("normal error", func(t *testing.T) { + phyID := int32(1) + msg := "status" + originalErr := fmt.Errorf("permission denied") + + err := buildHccnErr(phyID, msg, originalErr) + + if !strings.Contains(err.Error(), "phyID(1)") { + t.Error("should contain phyID") + } + if !strings.Contains(err.Error(), "npu status") { + t.Error("should contain npu message") + } + if !strings.Contains(err.Error(), "permission denied") { + t.Error("should contain original error") + } + }) + + t.Run("nil error", func(t *testing.T) { + err := buildHccnErr(0, "", nil) + if !strings.Contains(err.Error(), "error is :nil") { + t.Error("should handle nil error") + } + }) +} diff --git a/mind-cluster/component/ascend-common/go.mod b/mind-cluster/component/ascend-common/go.mod new file mode 100644 index 0000000..e1e3bbb --- /dev/null +++ b/mind-cluster/component/ascend-common/go.mod @@ -0,0 +1,55 @@ +module ascend-common + +go 1.18 + +require ( + github.com/agiledragon/gomonkey/v2 v2.8.0 + github.com/fsnotify/fsnotify v1.6.0 + github.com/kubeflow/common v0.4.3 + github.com/smartystreets/goconvey v1.6.4 + k8s.io/api v0.25.3 + k8s.io/apimachinery v0.25.3 + k8s.io/client-go v0.25.3 +) + +require ( + github.com/PuerkitoBio/purell v1.1.1 // indirect + github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 // indirect + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/emicklei/go-restful/v3 v3.8.0 // indirect + 
github.com/go-logr/logr v1.2.3 // indirect + github.com/go-openapi/jsonpointer v0.19.5 // indirect + github.com/go-openapi/jsonreference v0.19.5 // indirect + github.com/go-openapi/swag v0.19.14 // indirect + github.com/gogo/protobuf v1.3.2 // indirect + github.com/golang/protobuf v1.5.2 // indirect + github.com/google/gnostic v0.5.7-v3refs // indirect + github.com/google/go-cmp v0.5.8 // indirect + github.com/google/gofuzz v1.1.0 // indirect + github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 // indirect + github.com/josharian/intern v1.0.0 // indirect + github.com/json-iterator/go v1.1.12 // indirect + github.com/jtolds/gls v4.20.0+incompatible // indirect + github.com/mailru/easyjson v0.7.6 // indirect + github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect + github.com/modern-go/reflect2 v1.0.2 // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d // indirect + golang.org/x/net v0.0.0-20220722155237-a158d28d115b // indirect + golang.org/x/oauth2 v0.0.0-20211104180415-d3ed0bb246c8 // indirect + golang.org/x/sys v0.8.0 // indirect + golang.org/x/term v0.0.0-20210927222741-03fcf44c2211 // indirect + golang.org/x/text v0.3.7 // indirect + golang.org/x/time v0.0.0-20220210224613-90d013bbcef8 // indirect + google.golang.org/appengine v1.6.7 // indirect + google.golang.org/protobuf v1.28.0 // indirect + gopkg.in/inf.v0 v0.9.1 // indirect + gopkg.in/yaml.v2 v2.4.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect + k8s.io/klog/v2 v2.70.1 // indirect + k8s.io/kube-openapi v0.0.0-20220803162953-67bda5d908f1 // indirect + k8s.io/utils v0.0.0-20220728103510-ee6ede2d64ed // indirect + sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 // indirect + sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect + sigs.k8s.io/yaml v1.3.0 // indirect +) diff --git a/mind-cluster/component/ascend-common/go.sum 
b/mind-cluster/component/ascend-common/go.sum new file mode 100644 index 0000000..000ced7 --- /dev/null +++ b/mind-cluster/component/ascend-common/go.sum @@ -0,0 +1,492 @@ +cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= +cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= +cloud.google.com/go v0.38.0/go.mod h1:990N+gfupTy94rShfmMCWGDn0LpTmnzTp2qbd1dvSRU= +cloud.google.com/go v0.44.1/go.mod h1:iSa0KzasP4Uvy3f1mN/7PiObzGgflwredwwASm/v6AU= +cloud.google.com/go v0.44.2/go.mod h1:60680Gw3Yr4ikxnPRS/oxxkBccT6SA1yMk63TGekxKY= +cloud.google.com/go v0.45.1/go.mod h1:RpBamKRgapWJb87xiFSdk4g1CME7QZg3uwTez+TSTjc= +cloud.google.com/go v0.46.3/go.mod h1:a6bKKbmY7er1mI7TEI4lsAkts/mkhTSZK8w33B4RAg0= +cloud.google.com/go v0.50.0/go.mod h1:r9sluTvynVuxRIOHXQEHMFffphuXHOMZMycpNR5e6To= +cloud.google.com/go v0.52.0/go.mod h1:pXajvRH/6o3+F9jDHZWQ5PbGhn+o8w9qiu/CffaVdO4= +cloud.google.com/go v0.53.0/go.mod h1:fp/UouUEsRkN6ryDKNW/Upv/JBKnv6WDthjR6+vze6M= +cloud.google.com/go v0.54.0/go.mod h1:1rq2OEkV3YMf6n/9ZvGWI3GWw0VoqH/1x2nd8Is/bPc= +cloud.google.com/go v0.56.0/go.mod h1:jr7tqZxxKOVYizybht9+26Z/gUq7tiRzu+ACVAMbKVk= +cloud.google.com/go v0.57.0/go.mod h1:oXiQ6Rzq3RAkkY7N6t3TcE6jE+CIBBbA36lwQ1JyzZs= +cloud.google.com/go v0.62.0/go.mod h1:jmCYTdRCQuc1PHIIJ/maLInMho30T/Y0M4hTdTShOYc= +cloud.google.com/go v0.65.0/go.mod h1:O5N8zS7uWy9vkA9vayVHs65eM1ubvY4h553ofrNHObY= +cloud.google.com/go/bigquery v1.0.1/go.mod h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o= +cloud.google.com/go/bigquery v1.3.0/go.mod h1:PjpwJnslEMmckchkHFfq+HTD2DmtT67aNFKH1/VBDHE= +cloud.google.com/go/bigquery v1.4.0/go.mod h1:S8dzgnTigyfTmLBfrtrhyYhwRxG72rYxvftPBK2Dvzc= +cloud.google.com/go/bigquery v1.5.0/go.mod h1:snEHRnqQbz117VIFhE8bmtwIDY80NLUZUMb4Nv6dBIg= +cloud.google.com/go/bigquery v1.7.0/go.mod h1://okPTzCYNXSlb24MZs83e2Do+h+VXtc4gLoIoXIAPc= +cloud.google.com/go/bigquery v1.8.0/go.mod h1:J5hqkt3O0uAFnINi6JXValWIb1v0goeZM77hZzJN/fQ= 
+cloud.google.com/go/datastore v1.0.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7TKLgZqpHYE= +cloud.google.com/go/datastore v1.1.0/go.mod h1:umbIZjpQpHh4hmRpGhH4tLFup+FVzqBi1b3c64qFpCk= +cloud.google.com/go/pubsub v1.0.1/go.mod h1:R0Gpsv3s54REJCy4fxDixWD93lHJMoZTyQ2kNxGRt3I= +cloud.google.com/go/pubsub v1.1.0/go.mod h1:EwwdRX2sKPjnvnqCa270oGRyludottCI76h+R3AArQw= +cloud.google.com/go/pubsub v1.2.0/go.mod h1:jhfEVHT8odbXTkndysNHCcx0awwzvfOlguIAii9o8iA= +cloud.google.com/go/pubsub v1.3.1/go.mod h1:i+ucay31+CNRpDW4Lu78I4xXG+O1r/MAHgjpRVR+TSU= +cloud.google.com/go/storage v1.0.0/go.mod h1:IhtSnM/ZTZV8YYJWCY8RULGVqBDmpoyjwiyrjsg+URw= +cloud.google.com/go/storage v1.5.0/go.mod h1:tpKbwo567HUNpVclU5sGELwQWBDZ8gh0ZeosJ0Rtdos= +cloud.google.com/go/storage v1.6.0/go.mod h1:N7U0C8pVQ/+NIKOBQyamJIeKQKkZ+mxpohlUTyfDhBk= +cloud.google.com/go/storage v1.8.0/go.mod h1:Wv1Oy7z6Yz3DshWRJFhqM/UCfaWIRTdp0RXyy7KQOVs= +cloud.google.com/go/storage v1.10.0/go.mod h1:FLPqc6j+Ki4BU591ie1oL6qBQGu2Bl/tZ9ullr3+Kg0= +dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= +github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= +github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= +github.com/PuerkitoBio/purell v1.1.1 h1:WEQqlqaGbrPkxLJWfBwQmfEAE1Z7ONdDLqrN38tNFfI= +github.com/PuerkitoBio/purell v1.1.1/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0= +github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 h1:d+Bc7a5rLufV/sSk/8dngufqelfh6jnri85riMAaF/M= +github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE= +github.com/agiledragon/gomonkey/v2 v2.8.0 h1:u2K2nNGyk0ippzklz1CWalllEB9ptD+DtSXeCX5O000= +github.com/agiledragon/gomonkey/v2 v2.8.0/go.mod h1:ap1AmDzcVOAz1YpeJ3TCzIgstoaWLA6jbbgxfB4w2iY= +github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod 
h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= +github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI= +github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= +github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= +github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= +github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/docopt/docopt-go v0.0.0-20180111231733-ee0de3bc6815/go.mod h1:WwZ+bS3ebgob9U8Nd0kOddGdZWjyMGR8Wziv+TBNwSE= +github.com/emicklei/go-restful/v3 v3.8.0 h1:eCZ8ulSerjdAiaNpF7GxXIE7ZCMo1moN1qX+S609eVw= +github.com/emicklei/go-restful/v3 v3.8.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= +github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= +github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= +github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= +github.com/fsnotify/fsnotify v1.6.0 h1:n+5WquG0fcWoWp6xPWfHdbskMCQaFnG6PfBrh1Ky4HY= +github.com/fsnotify/fsnotify v1.6.0/go.mod h1:sl3t1tCWJFWoRz9R8WJCbQihKKwmorjAbSClcnxKAGw= +github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU= +github.com/go-gl/glfw/v3.3/glfw 
v0.0.0-20191125211704-12ad95a8df72/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= +github.com/go-gl/glfw/v3.3/glfw v0.0.0-20200222043503-6f7a984d4dc4/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= +github.com/go-logr/logr v0.1.0/go.mod h1:ixOQHD9gLJUVQQ2ZOR7zLEifBX6tGkNJF4QyIY7sIas= +github.com/go-logr/logr v1.2.0/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/logr v1.2.3 h1:2DntVwHkVopvECVRSlL5PSo9eG+cAkDCuckLubN+rq0= +github.com/go-logr/logr v1.2.3/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-openapi/jsonpointer v0.19.3/go.mod h1:Pl9vOtqEWErmShwVjC8pYs9cog34VGT37dQOVbmoatg= +github.com/go-openapi/jsonpointer v0.19.5 h1:gZr+CIYByUqjcgeLXnQu2gHYQC9o73G2XUeOFYEICuY= +github.com/go-openapi/jsonpointer v0.19.5/go.mod h1:Pl9vOtqEWErmShwVjC8pYs9cog34VGT37dQOVbmoatg= +github.com/go-openapi/jsonreference v0.19.5 h1:1WJP/wi4OjB4iV8KVbH73rQaoialJrqv8gitZLxGLtM= +github.com/go-openapi/jsonreference v0.19.5/go.mod h1:RdybgQwPxbL4UEjuAruzK1x3nE69AqPYEJeo/TWfEeg= +github.com/go-openapi/swag v0.19.5/go.mod h1:POnQmlKehdgb5mhVOsnJFsivZCEZ/vjK9gh66Z9tfKk= +github.com/go-openapi/swag v0.19.14 h1:gm3vOOXfiuw5i9p5N9xJvfjvuofpyvLA9Wr6QfK5Fng= +github.com/go-openapi/swag v0.19.14/go.mod h1:QYRuS/SOXUCsnplDa677K7+DxSOj6IPNl/eQntq43wQ= +github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= +github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= +github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache v0.0.0-20191227052852-215e87163ea7/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/mock v1.1.1/go.mod 
h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= +github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= +github.com/golang/mock v1.3.1/go.mod h1:sBzyDLLjw3U8JLTeZvSv8jJB+tU5PVekmnlKIyFUx0Y= +github.com/golang/mock v1.4.0/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= +github.com/golang/mock v1.4.1/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= +github.com/golang/mock v1.4.3/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= +github.com/golang/mock v1.4.4/go.mod h1:l3mdAwkq5BuhzHwde/uurv3sEJeZMXNpwsxVWU71h+4= +github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= +github.com/golang/protobuf v1.3.4/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= +github.com/golang/protobuf v1.3.5/go.mod h1:6O5/vntMXwX2lRkT1hjjk0nAC1IDOTvTlVgjlRvqsdk= +github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8= +github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA= +github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= +github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= +github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= +github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8= +github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= +github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/golang/protobuf v1.5.2 h1:ROPKBNFfQgOUMifHyP+KYbvpjbdoFNs+aK7DXlji0Tw= 
+github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= +github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= +github.com/google/gnostic v0.5.7-v3refs h1:FhTMOKj2VhjpouxvWJAV1TL304uMlb9zcDqkl6cEI54= +github.com/google/gnostic v0.5.7-v3refs/go.mod h1:73MKFl6jIHelAJNaBGFzt3SPtZULs9dYrGFt8OiIsHQ= +github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= +github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.4.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.8 h1:e6P7q2lk1O+qJJb4BtCQXlK8vWEO8V1ZeuEdJNOqZyg= +github.com/google/go-cmp v0.5.8/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/gofuzz v1.1.0 h1:Hsa8mG0dQ46ij8Sl2AYJDUv1oA9/d6Vk+3LG99Oe02g= +github.com/google/gofuzz v1.1.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs= +github.com/google/martian/v3 v3.0.0/go.mod h1:y5Zk1BBys9G+gd6Jrk0W3cC1+ELVxBWuIGO+w/tUAp0= +github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= +github.com/google/pprof v0.0.0-20190515194954-54271f7e092f/go.mod 
h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= +github.com/google/pprof v0.0.0-20191218002539-d4f498aebedc/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= +github.com/google/pprof v0.0.0-20200212024743-f11f1df84d12/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= +github.com/google/pprof v0.0.0-20200229191704-1ebb73c60ed3/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= +github.com/google/pprof v0.0.0-20200430221834-fc25d7d30c6d/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= +github.com/google/pprof v0.0.0-20200708004538-1a94d8640e99/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= +github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= +github.com/google/uuid v1.1.2 h1:EVhdT+1Kseyi1/pUmXKaFxYsDNy9RQYkMWRH68J/W7Y= +github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg= +github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk= +github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 h1:EGx4pi6eqNxGaHF6qqu48+N2wcFQ5qg5FXgOdqsJ5d8= +github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= +github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= +github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= +github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= +github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= +github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= +github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= +github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod 
h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU= +github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk= +github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo= +github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU= +github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= +github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/pretty v0.2.0/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/kubeflow/common v0.4.3 h1:vVoOMNPOZK4wzZvQ4rsRLvC3SDi+J1fVKNHSXC/QRvU= +github.com/kubeflow/common v0.4.3/go.mod h1:Qb/5aON7/OWVkN8OnjRqqT0i8X/XzMekRIZ8lkLosj4= +github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= +github.com/mailru/easyjson v0.0.0-20190626092158-b2ccc519800e/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= +github.com/mailru/easyjson v0.7.6 h1:8yTIVnZgCoiM1TgqoeTl+LfU5Jg6/xL3QhGQnimLYnA= +github.com/mailru/easyjson v0.7.6/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/reflect2 v1.0.2 
h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= +github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e h1:fD57ERR4JtEqsWbfPhv4DMiApHyliiK5xCTNVSPiaAs= +github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= +github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= +github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d h1:zE9ykElWQ6/NYmHa3jpm/yHnI4xSofP+UP6SpjHcSeM= +github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc= +github.com/smartystreets/goconvey v1.6.4 h1:fv0U8FUIMPNf1L9lnHLvLhgicrIVChEkdzIKYqbNC9s= +github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA= +github.com/spf13/afero v1.2.2/go.mod h1:9ZxEEn6pIJ8Rxe320qSDBk6AsU0r9pR7Q4OcevTdifk= +github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= +github.com/stoewer/go-strcase v1.2.0/go.mod h1:IBiWB2sKIp3wVVQ3Y035++gc+knqhUQag1KpM8ahLw8= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/stretchr/testify 
v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= +github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PKk= +github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= +go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= +go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= +go.opencensus.io v0.22.3/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= +go.opencensus.io v0.22.4/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= +golang.org/x/exp v0.0.0-20190829153037-c13cbed26979/go.mod h1:86+5VVa7VpoJ4kLfm080zCjGlMRFzhUhsZKEZO7MGek= 
+golang.org/x/exp v0.0.0-20191030013958-a1ab85dbe136/go.mod h1:JXzH8nQsPlswgeRAPE3MuO9GYsAcnJvJ4vnMwN/5qkY= +golang.org/x/exp v0.0.0-20191129062945-2f5052295587/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= +golang.org/x/exp v0.0.0-20191227195350-da58074b4299/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= +golang.org/x/exp v0.0.0-20200119233911-0405dc783f0a/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= +golang.org/x/exp v0.0.0-20200207192155-f17229e696bd/go.mod h1:J/WKrq2StrnmMY6+EHIKF9dgMWnmCNThgcyBT1FY9mM= +golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6/go.mod h1:3jZMyOhIsHpP37uCMkUooju7aAi5cS1Q23tOzKc+0MU= +golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= +golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= +golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= +golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= +golang.org/x/lint v0.0.0-20190301231843-5614ed5bae6f/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= +golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/lint v0.0.0-20190409202823-959b441ac422/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/lint v0.0.0-20190909230951-414d861bb4ac/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/lint v0.0.0-20191125180803-fdd1cda4f05f/go.mod h1:5qLYkcX4OjUUV8bRuDixDT3tpyyb+LUpUlRWLxfhWrs= +golang.org/x/lint v0.0.0-20200130185559-910be7a94367/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= +golang.org/x/lint v0.0.0-20200302205851-738671d3881b/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= +golang.org/x/mobile 
v0.0.0-20190312151609-d3739f865fa6/go.mod h1:z+o9i4GpDbdi3rU15maQ/Ox0txvL9dWGYEHz965HBQE= +golang.org/x/mobile v0.0.0-20190719004257-d2bd2a29d028/go.mod h1:E/iHnbuqvinMTCcRqshq8CkpyQDoeVncDDYHnLhea+o= +golang.org/x/mod v0.0.0-20190513183733-4bf6d317e70e/go.mod h1:mXi4GBBbnImb6dmsKGUJ2LatrhH/nqhxcFungHvyanc= +golang.org/x/mod v0.1.0/go.mod h1:0QHyrYULN0/3qlju5TqG8bIK38QM8yzMo5ekMj3DlcY= +golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= +golang.org/x/mod v0.1.1-0.20191107180719-034126e5016b/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= +golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190501004415-9ce7a6920f09/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190503192946-f4e77d36d62c/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20190628185345-da137c7871d7/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net 
v0.0.0-20190724013045-ca1201d0de80/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20190827160401-ba9fcec4b297/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20191209160850-c0dbc17a3553/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200114155413-6afb5195e5aa/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200222125558-5a598a2470a0/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200301022130-244492dfa37a/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200501053045-e0ff5e5a1de5/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200506145744-7e3656a0809f/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200513185701-a91f0712d120/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200520182314-0ba52f642ac2/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200625001655-4c5254603344/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= +golang.org/x/net v0.0.0-20200707034311-ab3426394381/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= +golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= +golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b h1:PxfKdU9lEEDYjdIzOtC4qFWgkU2rGHdKlKowJSMN9h0= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod 
h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= +golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= +golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= +golang.org/x/oauth2 v0.0.0-20191202225959-858c2ad4c8b6/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= +golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= +golang.org/x/oauth2 v0.0.0-20211104180415-d3ed0bb246c8 h1:RerP+noqYHUQ8CMRcPlC2nvTa4dcBIjegkuWdcUDuqg= +golang.org/x/oauth2 v0.0.0-20211104180415-d3ed0bb246c8/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= +golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20200317015054-43a5402ce75a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20200625203802-6e8e738ad208/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod 
h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190502145724-3ef323f4f1fd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190507160741-ecd444e8653b/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190606165138-5da285871e9c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190624142023-c5567b49c5d0/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190726091711-fc99dfbffb4e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191001151750-bb3f8db39f24/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191204072324-ce4227a45e2e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191228213918-04cbcbbfeed8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200113162924-86b910548bc1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200122134326-e047566fdf82/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200212091648-12a6c2dcc1e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200302150141-5c8b2ff67527/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200331124033-c3d80250170d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 
+golang.org/x/sys v0.0.0-20200501052902-10377860bb8e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200511232937-7e40ca221e25/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200515095857-1151b9dac4a9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200523222454-059865788121/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200803210538-64077c9b5642/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.8.0 h1:EBmGv8NaZBZTWvrbjNoL6HVt+IVy3QDQpJs7VRIw3tU= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211 h1:JGgROgKl9N8DuW20oFS5gxc+lE67/N3FcwmBPMe7ArY= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/time 
v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/time v0.0.0-20220210224613-90d013bbcef8 h1:vVKdlvoWBphwdxWKrFZEuM0kGgGLxUOYcY4U/2Vjg44= +golang.org/x/time v0.0.0-20220210224613-90d013bbcef8/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= +golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190312151545-0bb0c0a6e846/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190312170243-e65039ee4138/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190425150028-36563e24a262/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/tools v0.0.0-20190506145303-2d16b83fe98c/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/tools v0.0.0-20190606124116-d0a3d012864b/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20190621195816-6e04913cbbac/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20190628153133-6cdbf07be9d0/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20190816200558-6889da9d5479/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20190911174233-4f2ddba30aff/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools 
v0.0.0-20191012152004-8de300cfc20a/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191113191852-77e3bb0ad9e7/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191115202509-3a792d9c32b2/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191125144606-a911d9008d1f/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191130070609-6e064ea0cf2d/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191216173652-a0e659d51361/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20191227053925-7b8e75db28f4/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200117161641-43d50277825c/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200122220014-bf1340f18c4a/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200204074204-1cc6d1ef6c74/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200207183749-b753a1ba74fa/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200212150539-ea181f53ac56/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200224181240-023911ca70b2/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200227222343-706bc42d1f0d/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200304193943-95d2e580d8eb/go.mod h1:o4KQGtdN14AW+yjsvvwRTJJuXz8XRtIHtEnmAXLyFUw= +golang.org/x/tools v0.0.0-20200312045724-11d5b4c81c7d/go.mod h1:o4KQGtdN14AW+yjsvvwRTJJuXz8XRtIHtEnmAXLyFUw= +golang.org/x/tools 
v0.0.0-20200331025713-a30bf2db82d4/go.mod h1:Sl4aGygMT6LrqrWclx+PTx3U+LnKx/seiNR+3G19Ar8= +golang.org/x/tools v0.0.0-20200501065659-ab2804fb9c9d/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20200512131952-2bc93b1c0c88/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20200515010526-7d3b6ebf133d/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20200618134242-20370b0cb4b2/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20200729194436-6467de6f59a7/go.mod h1:njjCfa9FT2d7l9Bc6FUM5FLjQPp3cFF28FI3qnDFljA= +golang.org/x/tools v0.0.0-20200804011535-6c149bb5ef0d/go.mod h1:njjCfa9FT2d7l9Bc6FUM5FLjQPp3cFF28FI3qnDFljA= +golang.org/x/tools v0.0.0-20200825202427-b303f430e36d/go.mod h1:njjCfa9FT2d7l9Bc6FUM5FLjQPp3cFF28FI3qnDFljA= +golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE= +google.golang.org/api v0.7.0/go.mod h1:WtwebWUNSVBH/HAw79HIFXZNqEvBhG+Ra+ax0hx3E3M= +google.golang.org/api v0.8.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg= +google.golang.org/api v0.9.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg= +google.golang.org/api v0.13.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= +google.golang.org/api v0.14.0/go.mod 
h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= +google.golang.org/api v0.15.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= +google.golang.org/api v0.17.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= +google.golang.org/api v0.18.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= +google.golang.org/api v0.19.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= +google.golang.org/api v0.20.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= +google.golang.org/api v0.22.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= +google.golang.org/api v0.24.0/go.mod h1:lIXQywCXRcnZPGlsd8NbLnOjtAoL6em04bJ9+z0MncE= +google.golang.org/api v0.28.0/go.mod h1:lIXQywCXRcnZPGlsd8NbLnOjtAoL6em04bJ9+z0MncE= +google.golang.org/api v0.29.0/go.mod h1:Lcubydp8VUV7KeIHD9z2Bys/sm/vGKnG1UHuDBSrHWM= +google.golang.org/api v0.30.0/go.mod h1:QGmEvQ87FHZNiUVJkT14jQNYJ4ZJjdRF23ZXz5138Fc= +google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= +google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= +google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= +google.golang.org/appengine v1.6.1/go.mod h1:i06prIuMbXzDqacNJfV5OdTW448YApPu5ww/cMBSeb0= +google.golang.org/appengine v1.6.5/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= +google.golang.org/appengine v1.6.6/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= +google.golang.org/appengine v1.6.7 h1:FZR1q0exgwxzPzp/aF+VccGrSfxfPpkBqjIIEq3ru6c= +google.golang.org/appengine v1.6.7/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= +google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= +google.golang.org/genproto v0.0.0-20190307195333-5fe7a883aa19/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= +google.golang.org/genproto v0.0.0-20190418145605-e7d98fc518a7/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= 
+google.golang.org/genproto v0.0.0-20190425155659-357c62f0e4bb/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= +google.golang.org/genproto v0.0.0-20190502173448-54afdca5d873/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= +google.golang.org/genproto v0.0.0-20190801165951-fa694d86fc64/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= +google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= +google.golang.org/genproto v0.0.0-20190911173649-1774047e7e51/go.mod h1:IbNlFCBrqXvoKpeg0TB2l7cyZUmoaFKYIwrEpbDKLA8= +google.golang.org/genproto v0.0.0-20191108220845-16a3f7862a1a/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20191115194625-c23dd37a84c9/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20191216164720-4f79533eabd1/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20191230161307-f3c370f40bfb/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20200115191322-ca5a22157cba/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20200122232147-0452cf42e150/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20200204135345-fa8e72b47b90/go.mod h1:GmwEX6Z4W5gMy59cAlVYjN9JhxgbQH6Gn+gFDQe2lzA= +google.golang.org/genproto v0.0.0-20200212174721-66ed5ce911ce/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200224152610-e50cd9704f63/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200228133532-8c2c7df3a383/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200305110556-506484158171/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200312145019-da6875a35672/go.mod 
h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200331122359-1ee6d9798940/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200430143042-b979b6f78d84/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200511104702-f5ebc3bea380/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200515170657-fc4c6c6a6587/go.mod h1:YsZOwe1myG/8QRHRsmBRE1LrgQY60beZKjly0O1fX9U= +google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= +google.golang.org/genproto v0.0.0-20200618031413-b414f8b61790/go.mod h1:jDfRM7FcilCzHH/e9qn6dsT145K34l5v+OpcnNgKAAA= +google.golang.org/genproto v0.0.0-20200729003335-053ba62fc06f/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= +google.golang.org/genproto v0.0.0-20200804131852-c06518451d9c/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= +google.golang.org/genproto v0.0.0-20200825200019-8632dd797987/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= +google.golang.org/genproto v0.0.0-20201019141844-1ed22bb0c154/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= +google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= +google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= +google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= +google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= +google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= +google.golang.org/grpc v1.26.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= +google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= +google.golang.org/grpc v1.27.1/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= +google.golang.org/grpc v1.28.0/go.mod 
h1:rpkK4SK4GF4Ach/+MFLZUBavHOvF2JJB5uozKKal+60= +google.golang.org/grpc v1.29.1/go.mod h1:itym6AZVZYACWQqET3MqgPpjcuV5QH3BxFS3IjizoKk= +google.golang.org/grpc v1.30.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak= +google.golang.org/grpc v1.31.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak= +google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= +google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= +google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= +google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= +google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= +google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGjtUeSXeh4= +google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c= +google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= +google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= +google.golang.org/protobuf v1.28.0 h1:w43yiav+6bVFTBQFZX0r7ipe9JQ1QsbMgHwbBziscLw= +google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 
v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f h1:BLraFXnmrev5lT+xlilqcH8XK9/i0At2xKjWk4p6zsU= +gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= +gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= +gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= +gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt0JzvZhAg= +honnef.co/go/tools v0.0.1-2020.1.3/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k= +honnef.co/go/tools v0.0.1-2020.1.4/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k= +k8s.io/api v0.25.3 h1:Q1v5UFfYe87vi5H7NU0p4RXC26PPMT8KOpr1TLQbCMQ= 
+k8s.io/api v0.25.3/go.mod h1:o42gKscFrEVjHdQnyRenACrMtbuJsVdP+WVjqejfzmI= +k8s.io/apimachinery v0.25.3 h1:7o9ium4uyUOM76t6aunP0nZuex7gDf8VGwkR5RcJnQc= +k8s.io/apimachinery v0.25.3/go.mod h1:jaF9C/iPNM1FuLl7Zuy5b9v+n35HGSh6AQ4HYRkCqwo= +k8s.io/client-go v0.25.3 h1:oB4Dyl8d6UbfDHD8Bv8evKylzs3BXzzufLiO27xuPs0= +k8s.io/client-go v0.25.3/go.mod h1:t39LPczAIMwycjcXkVc+CB+PZV69jQuNx4um5ORDjQA= +k8s.io/klog/v2 v2.0.0/go.mod h1:PBfzABfn139FHAV07az/IF9Wp1bkk3vpT2XSJ76fSDE= +k8s.io/klog/v2 v2.70.1 h1:7aaoSdahviPmR+XkS7FyxlkkXs6tHISSG03RxleQAVQ= +k8s.io/klog/v2 v2.70.1/go.mod h1:y1WjHnz7Dj687irZUWR/WLkLc5N1YHtjLdmgWjndZn0= +k8s.io/kube-openapi v0.0.0-20220803162953-67bda5d908f1 h1:MQ8BAZPZlWk3S9K4a9NCkIFQtZShWqoha7snGixVgEA= +k8s.io/kube-openapi v0.0.0-20220803162953-67bda5d908f1/go.mod h1:C/N6wCaBHeBHkHUesQOQy2/MZqGgMAFPqGsGQLdbZBU= +k8s.io/utils v0.0.0-20220728103510-ee6ede2d64ed h1:jAne/RjBTyawwAy0utX5eqigAwz/lQhTmy+Hr/Cpue4= +k8s.io/utils v0.0.0-20220728103510-ee6ede2d64ed/go.mod h1:jPW/WVKK9YHAvNhRxK0md/EJ228hCsBRufyofKtW8HA= +rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= +rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= +rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA= +sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 h1:iXTIw73aPyC+oRdyqqvVJuloN1p0AC/kzH07hu3NE+k= +sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= +sigs.k8s.io/structured-merge-diff/v4 v4.2.3 h1:PRbqxJClWWYMNV1dhaG4NsibJbArud9kFxnAMREiWFE= +sigs.k8s.io/structured-merge-diff/v4 v4.2.3/go.mod h1:qjx8mGObPmV2aSZepjQjbmb2ihdVs8cGKBraizNC69E= +sigs.k8s.io/yaml v1.3.0 h1:a2VclLzOGrwOHDiV8EfBGhvjHvP46CtW5j6POvhYGGo= +sigs.k8s.io/yaml v1.3.0/go.mod h1:GeOyir5tyXNByN85N/dRIT9es5UQNerPYEKK56eTBm8= diff --git a/mind-cluster/component/npu-exporter/.gitignore b/mind-cluster/component/npu-exporter/.gitignore new file mode 100644 index 0000000..723ef36 --- /dev/null 
+++ b/mind-cluster/component/npu-exporter/.gitignore @@ -0,0 +1 @@ +.idea \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/LICENSE b/mind-cluster/component/npu-exporter/LICENSE new file mode 100644 index 0000000..f49a4e1 --- /dev/null +++ b/mind-cluster/component/npu-exporter/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/README.md b/mind-cluster/component/npu-exporter/README.md new file mode 100644 index 0000000..4bde4a9 --- /dev/null +++ b/mind-cluster/component/npu-exporter/README.md @@ -0,0 +1,42 @@ +# NPU-Exporter + +# 组件介绍 + + +Prometheus(普罗米修斯)是一个开源的系统监测和警报工具包,Exporter就是专门为Prometheus提供数据源的组件。由于Prometheus社区的活跃和大量的使用,已经有很多厂商或者服务提供了Exporter,如Prometheus官方的Node Exporter,MySQL官方出的MySQL Server Exporter和NVIDIA的NVIDIA GPU Exporter。这些Exporter负责将特定监测对象的指标,转成Prometheus能够识别的数据格式,供Prometheus集成。NPU-Exporter是华为自研的专门收集华为NPU各种监测信息和指标,并封装成Prometheus专用数据格式的一个服务组件。 + + +# 编译NPU-Exporter + +1. 通过git拉取源码,获得npu-exporter。 + + 示例:Npu-Exporter源码放在/home/mind-cluster/component/npu-exporter目录下 + +2. 执行以下命令,进入Npu-Exporter构建目录,执行构建脚本,在“output”目录下生成二进制npu-exporter、yaml文件和Dockerfile等文件。 + + **cd** _/home/mind-cluster/component/_**npu-exporter/build/** + + **chmod +x build.sh** + + **./build.sh** + +3. 执行以下命令,查看**output**生成的软件列表。 + + **ll** _/home/mind-cluster/component/_**npu-exporter/output** + + ``` + drwxr-xr-x 2 root root 4096 Feb 23 07:10 . + drwxr-xr-x 10 root root 4096 Feb 23 07:10 ..
+ -r-------- 1 root root 623 Feb 23 07:10 Dockerfile + -r-------- 1 root root 623 Feb 23 07:10 Dockerfile-310P-1usoc + -r-------- 1 root root 623 Feb 23 07:10 metricConfiguration.json + -r-x------ 1 root root 25481072 Feb 23 07:10 npu-exporter + -r-------- 1 root root 3438 Feb 23 07:10 npu-exporter-310P-1usoc-v6.0.0.yaml + -r-------- 1 root root 3438 Feb 23 07:10 npu-exporter-v6.0.0.yaml + -r-------- 1 root root 623 Feb 23 07:10 pluginConfiguration.json + -r-x------ 1 root root 2579 Feb 23 07:10 run_for_310P_1usoc.sh + ``` + +# 说明 + +1. 当前Npu-Exporter仅支持http启动,如果需要使用https启动,请自行完成代码修改并适配Prometheus \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/build/Dockerfile b/mind-cluster/component/npu-exporter/build/Dockerfile new file mode 100644 index 0000000..24f9943 --- /dev/null +++ b/mind-cluster/component/npu-exporter/build/Dockerfile @@ -0,0 +1,21 @@ +FROM ubuntu:22.04 + +RUN useradd -d /home/HwHiAiUser -u 1000 -m -s /usr/sbin/nologin HwHiAiUser &&\ + usermod root -s /usr/sbin/nologin + +COPY ./npu-exporter /usr/local/bin/ +COPY ./metricConfiguration.json /usr/local/metricConfiguration.json +COPY ./pluginConfiguration.json /usr/local/pluginConfiguration.json + +RUN chown root:root /usr/local/bin/npu-exporter &&\ + chmod 750 -R /home/HwHiAiUser &&\ + chmod 550 /usr/local/bin/ &&\ + chmod 500 /usr/local/bin/npu-exporter &&\ + chmod 440 /usr/local/metricConfiguration.json &&\ + chmod 440 /usr/local/pluginConfiguration.json &&\ + echo 'umask 027' >> /etc/profile && \ + echo 'source /etc/profile' >> ~/.bashrc +ENV LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/add-ons:/usr/local/Ascend/driver/lib64:/usr/local/dcmi + +CMD /usr/local/bin/npu-exporter + diff --git a/mind-cluster/component/npu-exporter/build/Dockerfile-310P-1usoc b/mind-cluster/component/npu-exporter/build/Dockerfile-310P-1usoc new file mode 100644 index 0000000..5927f7d --- /dev/null +++ 
b/mind-cluster/component/npu-exporter/build/Dockerfile-310P-1usoc @@ -0,0 +1,31 @@ +FROM ubuntu:22.04 + +RUN groupadd -g 1000 HwHiAiUser && useradd -u 1000 -g HwHiAiUser -d /home/HwHiAiUser -m HwHiAiUser &&\ + groupadd -g 1101 HwDmUser && useradd -u 1101 -g HwDmUser -d /home/HwDmUser -m HwDmUser &&\ + groupadd -g 1102 HwBaseUser && useradd -u 1102 -g HwBaseUser -d /home/HwBaseUser -m HwBaseUser &&\ + usermod -a -G HwBaseUser HwHiAiUser &&\ + usermod -a -G HwDmUser HwHiAiUser &&\ + usermod -a -G HwBaseUser HwDmUser &&\ + usermod -a -G HwHiAiUser HwDmUser &&\ + usermod root -s /usr/sbin/nologin + +COPY ./npu-exporter /usr/local/bin/ +COPY ./run_for_310P_1usoc.sh / +COPY ./metricConfiguration.json /usr/local/metricConfiguration.json +COPY ./pluginConfiguration.json /usr/local/pluginConfiguration.json + +RUN chown root:root /usr/local/bin/npu-exporter &&\ + chmod 500 /run_for_310P_1usoc.sh &&\ + chmod 550 /usr/local/bin/ &&\ + chmod 500 /usr/local/bin/npu-exporter &&\ + chmod 440 /usr/local/metricConfiguration.json &&\ + chmod 440 /usr/local/pluginConfiguration.json &&\ + echo 'umask 027' >> /etc/profile && \ + echo 'source /etc/profile' >> ~/.bashrc + +RUN ln -s /lib /lib64 2>&1 >> /dev/null &&\ + mkdir -m 750 /var/driver -m 750 /var/dmp -m 750 /usr/slog -p -m 750 /home/drv/hdc_ppc &&\ + chown HwDmUser:HwDmUser /var/dmp &&\ + chown HwHiAiUser:HwHiAiUser /var/driver &&\ + chown HwHiAiUser:HwHiAiUser /home/drv/hdc_ppc &&\ + chown HwHiAiUser:HwHiAiUser /usr/slog \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/build/build.sh b/mind-cluster/component/npu-exporter/build/build.sh new file mode 100644 index 0000000..16c101d --- /dev/null +++ b/mind-cluster/component/npu-exporter/build/build.sh @@ -0,0 +1,80 @@ +#!/bin/bash +# Perform build npu-exporter +# Copyright @ Huawei Technologies CO., Ltd. 2020-2023. 
All rights reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +set -e +CUR_DIR=$(dirname $(readlink -f $0)) +TOP_DIR=$(realpath "${CUR_DIR}"/..) +export GO111MODULE="on" +VER_FILE="${TOP_DIR}"/service_config.ini +build_version="v6.0.0" +if [ -f "$VER_FILE" ]; then + line=$(sed -n '1p' "$VER_FILE" 2>&1) + # keep the text after the first '=' and prepend 'v', e.g. v3.0.0 + build_version="v"${line#*=} +fi + +arch=$(arch 2>&1) +echo "Build Architecture is" "${arch}" + +OUTPUT_NAME="npu-exporter" +DOCKER_FILE_NAME="Dockerfile" +A200ISOC_DOCKER_FILE_NAME="Dockerfile-310P-1usoc" +A200ISOC_RUN_SHELL="run_for_310P_1usoc.sh" + +function clean() { + rm -rf "${TOP_DIR}"/output + mkdir -p "${TOP_DIR}"/output +} + +function build() { + cd "${TOP_DIR}/cmd/npu-exporter" + export CGO_CFLAGS="-fstack-protector-strong -D_FORTIFY_SOURCE=2 -O2 -fPIC -ftrapv" + export CGO_CPPFLAGS="-fstack-protector-strong -D_FORTIFY_SOURCE=2 -O2 -fPIC -ftrapv" + go build -mod=mod -buildmode=pie -ldflags "-s -extldflags=-Wl,-z,now -X huawei.com/npu-exporter/v6/versions.BuildName=${OUTPUT_NAME} \ + -X huawei.com/npu-exporter/v6/versions.BuildVersion=${build_version}_linux-${arch}" \ + -o ${OUTPUT_NAME} + ls ${OUTPUT_NAME} + if [ $?
-ne 0 ]; then + echo "fail to find npu-exporter" + exit 1 + fi +} + +function mv_file() { + mv "${TOP_DIR}"/cmd/npu-exporter/${OUTPUT_NAME} "${TOP_DIR}"/output + cp "${TOP_DIR}"/build/npu-exporter.yaml "${TOP_DIR}"/output/npu-exporter-"${build_version}".yaml + cp "${TOP_DIR}"/build/npu-exporter-310P-1usoc.yaml "${TOP_DIR}"/output/npu-exporter-310P-1usoc-"${build_version}".yaml + cp "${TOP_DIR}"/build/metricConfiguration.json "${TOP_DIR}"/output/ + cp "${TOP_DIR}"/build/pluginConfiguration.json "${TOP_DIR}"/output/ + sed -i "s/npu-exporter:.*/npu-exporter:${build_version}/" "${TOP_DIR}"/output/npu-exporter-"${build_version}".yaml + sed -i "s/npu-exporter:.*/npu-exporter:${build_version}/" "${TOP_DIR}"/output/npu-exporter-310P-1usoc-"${build_version}".yaml + cp "${TOP_DIR}"/build/${DOCKER_FILE_NAME} "${TOP_DIR}"/output + cp "${TOP_DIR}"/build/${A200ISOC_DOCKER_FILE_NAME} "${TOP_DIR}"/output + cp "${TOP_DIR}"/build/${A200ISOC_RUN_SHELL} "${TOP_DIR}"/output + chmod 400 "${TOP_DIR}"/output/* + chmod 500 "${TOP_DIR}"/output/${OUTPUT_NAME} + chmod 500 "${TOP_DIR}"/output/${A200ISOC_RUN_SHELL} + +} + +function main() { + clean + build + mv_file +} + +main diff --git a/mind-cluster/component/npu-exporter/build/build_ch.sh b/mind-cluster/component/npu-exporter/build/build_ch.sh new file mode 100644 index 0000000..878fcbd --- /dev/null +++ b/mind-cluster/component/npu-exporter/build/build_ch.sh @@ -0,0 +1,74 @@ +#!/bin/bash +# Perform build npu-exporter +# Copyright @ Huawei Technologies CO., Ltd. 2025-2025. All rights reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +set -e +CUR_DIR=$(dirname $(readlink -f $0)) +TOP_DIR=$(realpath "${CUR_DIR}"/..) +export GO111MODULE="on" +VER_FILE="${TOP_DIR}"/service_config.ini +build_version="v6.0.0" +if [ -f "$VER_FILE" ]; then + line=$(sed -n '1p' "$VER_FILE" 2>&1) + # keep the text after the first '=' and prepend 'v', e.g. v3.0.0 + build_version="v"${line#*=} +fi + +arch=$(arch 2>&1) +echo "Build Architecture is" "${arch}" + +OUTPUT_NAME="npu-exporter" +DOCKER_FILE_NAME="Dockerfile" + + +function clean() { + rm -rf "${TOP_DIR}"/output + mkdir -p "${TOP_DIR}"/output +} + +function build() { + cd "${TOP_DIR}/cmd/npu-exporter" + export CGO_CFLAGS="-fstack-protector-strong -D_FORTIFY_SOURCE=2 -O2 -fPIC -ftrapv" + export CGO_CPPFLAGS="-fstack-protector-strong -D_FORTIFY_SOURCE=2 -O2 -fPIC -ftrapv" + go build -mod=mod -buildmode=pie -ldflags "-s -extldflags=-Wl,-z,now -X huawei.com/npu-exporter/v6/versions.BuildName=${OUTPUT_NAME} \ + -X huawei.com/npu-exporter/v6/versions.BuildVersion=${build_version}_linux-${arch}" \ + -o ${OUTPUT_NAME} + ls ${OUTPUT_NAME} + if [ $?
-ne 0 ]; then + echo "fail to find npu-exporter" + exit 1 + fi +} + +function mv_file() { + mv "${TOP_DIR}"/cmd/npu-exporter/${OUTPUT_NAME} "${TOP_DIR}"/output + cp "${TOP_DIR}"/build/npu-exporter.yaml "${TOP_DIR}"/output/npu-exporter-"${build_version}".yaml + sed -i "s/npu-exporter:.*/npu-exporter:${build_version}/" "${TOP_DIR}"/output/npu-exporter-"${build_version}".yaml + sed -i "s/ascend*/alan/" "${TOP_DIR}"/output/npu-exporter-"${build_version}".yaml + + cp "${TOP_DIR}"/build/${DOCKER_FILE_NAME} "${TOP_DIR}"/output + chmod 400 "${TOP_DIR}"/output/* + chmod 500 "${TOP_DIR}"/output/${OUTPUT_NAME} + +} + +function main() { + clean + build + mv_file +} + +main diff --git a/mind-cluster/component/npu-exporter/build/metricConfiguration.json b/mind-cluster/component/npu-exporter/build/metricConfiguration.json new file mode 100644 index 0000000..3dbd82b --- /dev/null +++ b/mind-cluster/component/npu-exporter/build/metricConfiguration.json @@ -0,0 +1,13 @@ +[ + {"metricsGroup": "ddr", "state": "ON"}, + {"metricsGroup": "hccs", "state": "ON"}, + {"metricsGroup": "npu", "state": "ON"}, + {"metricsGroup": "network", "state": "ON"}, + {"metricsGroup": "pcie", "state": "ON"}, + {"metricsGroup": "roce", "state": "ON"}, + {"metricsGroup": "sio", "state": "ON"}, + {"metricsGroup": "vnpu", "state": "ON"}, + {"metricsGroup": "version", "state": "ON"}, + {"metricsGroup": "optical", "state": "ON"}, + {"metricsGroup": "hbm", "state": "ON"} +] \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/build/npu-exporter-310P-1usoc.yaml b/mind-cluster/component/npu-exporter/build/npu-exporter-310P-1usoc.yaml new file mode 100644 index 0000000..3b6e22f --- /dev/null +++ b/mind-cluster/component/npu-exporter/build/npu-exporter-310P-1usoc.yaml @@ -0,0 +1,167 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: npu-exporter +--- +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: exporter-network-policy + namespace: npu-exporter +spec: + 
podSelector: + matchLabels: + app: npu-exporter + policyTypes: + - Ingress + - Egress + ingress: + - from: + - namespaceSelector: {} + podSelector: + matchLabels: + app: prometheus + egress: + - to: + - namespaceSelector: {} + podSelector: + matchLabels: + app: prometheus +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: npu-exporter-310p-1usoc + namespace: npu-exporter +spec: + selector: + matchLabels: + app: npu-exporter + template: + metadata: + ##### For Kubernetes versions lower than 1.19, seccomp is used with annotations. + annotations: + seccomp.security.alpha.kubernetes.io/pod: runtime/default + labels: + app: npu-exporter + spec: + ##### For Kubernetes version 1.19 and above, seccomp is used with securityContext:seccompProfile +# securityContext: +# seccompProfile: +# type: RuntimeDefault + automountServiceAccountToken: false + nodeSelector: + workerselector: dls-worker-node + servertype: soc + containers: + - name: npu-exporter + image: npu-exporter:v5.0.RC1 + resources: + requests: + memory: 1000Mi + cpu: 1000m + limits: + memory: 1000Mi + cpu: 1000m + imagePullPolicy: Never + command: [ "/bin/bash", "-c", "/run_for_310P_1usoc.sh"] + # pair firstly + securityContext: + privileged: true + readOnlyRootFilesystem: true + runAsUser: 0 + runAsGroup: 0 + ports: + - name: http + containerPort: 8082 + protocol: TCP + volumeMounts: + - name: log-npu-exporter + mountPath: /var/log/mindx-dl/npu-exporter + - name: localtime + mountPath: /etc/localtime + readOnly: true + - name: ascend-driver + mountPath: /usr/local/Ascend/driver + readOnly: true + - name: ascend-dcmi + mountPath: /usr/local/dcmi + readOnly: true + - name: libyaml + mountPath: /usr/lib64/libyaml-0.so.2 + readOnly: true + - name: docker-shim # delete when only use containerd + mountPath: /run/dockershim.sock + readOnly: true + - name: docker # delete when only use containerd + mountPath: /run/docker/containerd/containerd.sock + readOnly: true + - name: cri-dockerd # reserve when k8s 
version is 1.24+ and the container runtime is docker + mountPath: /var/run/cri-dockerd.sock + readOnly: true + - name: containerd + mountPath: /run/containerd + readOnly: true + - name: tmp + mountPath: /tmp + - name: dmp + mountPath: /var/dmp_daemon + readOnly: true + - name: slogd + mountPath: /var/slogd + readOnly: true + - name: hbasic + mountPath: /etc/hdcBasic.cfg + readOnly: true + - name: slogconf + mountPath: /etc/slog.conf + readOnly: true + volumes: + - name: log-npu-exporter + hostPath: + path: /var/log/mindx-dl/npu-exporter + type: Directory + - name: localtime + hostPath: + path: /etc/localtime + - name: libyaml + hostPath: + path: /usr/lib64/libyaml-0.so.2 + type: File + - name: ascend-driver + hostPath: + path: /usr/local/Ascend/driver + - name: ascend-dcmi + hostPath: + path: /usr/local/dcmi + - name: docker-shim # delete when only use containerd + hostPath: + path: /run/dockershim.sock + - name: docker # delete when only use containerd + hostPath: + path: /run/docker/containerd/containerd.sock + - name: cri-dockerd # reserve when k8s version is 1.24+ and the container runtime is docker + hostPath: + path: /var/run/cri-dockerd.sock + - name: containerd + hostPath: + path: /run/containerd + - name: tmp + hostPath: + path: /tmp + - name: dmp + hostPath: + path: /var/dmp_daemon + type: File + - name: slogd + hostPath: + path: /var/slogd + type: File + - name: hbasic + hostPath: + path: /etc/hdcBasic.cfg + type: File + - name: slogconf + hostPath: + path: /etc/slog.conf + type: File \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/build/npu-exporter.yaml b/mind-cluster/component/npu-exporter/build/npu-exporter.yaml new file mode 100644 index 0000000..970e3cf --- /dev/null +++ b/mind-cluster/component/npu-exporter/build/npu-exporter.yaml @@ -0,0 +1,140 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: npu-exporter +--- +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: exporter-network-policy + 
namespace: npu-exporter +spec: + podSelector: + matchLabels: + app: npu-exporter + policyTypes: + - Ingress + - Egress + ingress: + - from: + - namespaceSelector: {} + podSelector: + matchLabels: + app: prometheus + egress: + - to: + - namespaceSelector: {} + podSelector: + matchLabels: + app: prometheus +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: npu-exporter + namespace: npu-exporter +spec: + selector: + matchLabels: + app: npu-exporter + template: + metadata: + ##### For Kubernetes versions lower than 1.19, seccomp is used with annotations. + annotations: + seccomp.security.alpha.kubernetes.io/pod: runtime/default + labels: + app: npu-exporter + spec: + ##### For Kubernetes version 1.19 and above, seccomp is used with securityContext:seccompProfile +# securityContext: +# seccompProfile: +# type: RuntimeDefault + automountServiceAccountToken: false + nodeSelector: + workerselector: dls-worker-node + containers: + - name: npu-exporter + image: npu-exporter:v5.0.RC1 + resources: + requests: + memory: 1000Mi + cpu: 1000m + limits: + memory: 1000Mi + cpu: 1000m + imagePullPolicy: Never + command: [ "/bin/bash", "-c", "--"] + # pair firstly + args: [ "umask 027;npu-exporter -port=8082 -ip=0.0.0.0 -updateTime=5 + -logFile=/var/log/mindx-dl/npu-exporter/npu-exporter.log -logLevel=0 -containerMode=docker" ] + securityContext: + privileged: true + readOnlyRootFilesystem: true + runAsUser: 0 + runAsGroup: 0 + ports: + - name: http + containerPort: 8082 + protocol: TCP + volumeMounts: + - name: log-npu-exporter + mountPath: /var/log/mindx-dl/npu-exporter + - name: localtime + mountPath: /etc/localtime + readOnly: true + - name: ascend-driver + mountPath: /usr/local/Ascend/driver + readOnly: true + - name: ascend-dcmi + mountPath: /usr/local/dcmi + readOnly: true + - name: docker-shim # delete when only use containerd or isula + mountPath: /var/run/dockershim.sock + readOnly: true + - name: docker # delete when only use containerd or isula + mountPath: 
/var/run/docker + readOnly: true + - name: cri-dockerd # reserve when k8s version is 1.24+ and the container runtime is docker + mountPath: /var/run/cri-dockerd.sock + readOnly: true + - name: containerd # delete when only use isula + mountPath: /run/containerd + readOnly: true + - name: isulad # delete when use containerd or docker + mountPath: /run/isulad.sock + readOnly: true + - name: tmp + mountPath: /tmp + volumes: + - name: log-npu-exporter + hostPath: + path: /var/log/mindx-dl/npu-exporter + type: Directory + - name: localtime + hostPath: + path: /etc/localtime + - name: ascend-driver + hostPath: + path: /usr/local/Ascend/driver + - name: ascend-dcmi + hostPath: + path: /usr/local/dcmi + - name: docker-shim # delete when only use containerd or isula + hostPath: + path: /var/run/dockershim.sock + - name: docker # delete when only use containerd or isula + hostPath: + path: /var/run/docker + - name: cri-dockerd # reserve when k8s version is 1.24+ and the container runtime is docker + hostPath: + path: /var/run/cri-dockerd.sock + - name: containerd # delete when only use isula + hostPath: + path: /run/containerd + - name: isulad # delete when use containerd or docker + hostPath: + path: /run/isulad.sock + - name: tmp + hostPath: + path: /tmp + diff --git a/mind-cluster/component/npu-exporter/build/pluginConfiguration.json b/mind-cluster/component/npu-exporter/build/pluginConfiguration.json new file mode 100644 index 0000000..68823e0 --- /dev/null +++ b/mind-cluster/component/npu-exporter/build/pluginConfiguration.json @@ -0,0 +1,4 @@ +[ + {"metricsGroup": "MyPlugin", "state": "OFF"}, + {"metricsGroup": "text", "state": "ON"} +] \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/build/run_for_310P_1usoc.sh b/mind-cluster/component/npu-exporter/build/run_for_310P_1usoc.sh new file mode 100644 index 0000000..055ed41 --- /dev/null +++ b/mind-cluster/component/npu-exporter/build/run_for_310P_1usoc.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# 
Start slogd, dmp_daemon and npu-exporter inside the 310P 1usoc container +# Copyright @ Huawei Technologies CO., Ltd. 2022-2022. All rights reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +set -e + +# log process runs in the background +echo -e "[INFO]\t $(date +"%F %T:%N")\t start slogd server in background" +su - HwHiAiUser -c "export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/:/usr/lib64 && /var/slogd -d &" +echo -e "[INFO]\t $(date +"%F %T:%N")\t start dmp_daemon server in background" +# dcmi interface process runs in the background +su - HwDmUser -c "export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/:/usr/lib64 && /var/dmp_daemon -I -M -U 8087 &" + +export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/add-ons:/usr/local/Ascend/driver/lib64:/usr/local/dcmi +# the host is openEuler, so the parameters "endpoint" and "containerd" are set explicitly to work with the default "-containerMode=docker" +# on openEuler the paths behind "endpoint" and "containerd" are not in the default location +echo -e "[INFO]\t $(date +"%F %T:%N")\t start npu-exporter server" +/usr/local/bin/npu-exporter -port=8082 -ip=0.0.0.0 -updateTime=5 -logFile=/var/log/mindx-dl/npu-exporter/npu-exporter.log -logLevel=0 -containerMode=docker -endpoint=/run/dockershim.sock -containerd=/run/docker/containerd/containerd.sock + diff --git a/mind-cluster/component/npu-exporter/build/test.sh
b/mind-cluster/component/npu-exporter/build/test.sh new file mode 100644 index 0000000..097eb3a --- /dev/null +++ b/mind-cluster/component/npu-exporter/build/test.sh @@ -0,0 +1,75 @@ +#!/bin/bash +# Perform test for npu-exporter +# Copyright @ Huawei Technologies CO., Ltd. 2020-2020. All rights reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +set -e + + +# execute go test and echo result to report files +function execute_test() { + if ! (go test -v -race -coverprofile cov.out "${TOP_DIR}"/... >./"$file_input") + then + echo '****** go test cases error! ******' + cat $file_input + exit 1 + else + gocov convert cov.out | gocov-html >"$file_detail_output" + gotestsum --junitfile unit-tests.xml "${TOP_DIR}"/... + + total_coverage=$(go tool cover -func=cov.out | grep "total:" | awk '{print $3}'| sed 's/%//') + # round up + coverage=$(echo "$total_coverage" | awk '{if ($1 >= 0) print ($1 == int($1)) ? int($1) : int($1) + 1;\ + else print ($1 == int($1)) ? int($1) : int($1)}') + if [[ $coverage -ge 80 ]]; then + echo "coverage passed: $coverage%" + exit 0 + else + echo "coverage failed: $coverage%, it needs to be greater than 80%." 
+ exit 1 + fi + fi +} + + +export GO111MODULE="on" +export PATH=$GOPATH/bin:$PATH +export GOFLAGS="-gcflags=all=-l" +unset GOPATH +# if didn't install the following tools, please install firstly +#go get -insecure github.com/axw/gocov/gocov +#go get github.com/matm/gocov-html +CUR_DIR=$(dirname "$(readlink -f "$0")") +TOP_DIR=$(realpath "${CUR_DIR}"/..) + +file_input='testExporter.txt' +file_detail_output='api.html' + +if [ -f "${TOP_DIR}"/test ]; then + rm -rf "${TOP_DIR}"/test +fi +mkdir -p "${TOP_DIR}"/test +cd "${TOP_DIR}"/test +echo "clean old version test results" + +if [ -f "$file_input" ]; then + rm -rf "$file_input" +fi +if [ -f "$file_detail_output" ]; then + rm -rf "$file_detail_output" +fi + +echo "************************************* Start LLT Test *************************************" +execute_test +echo "************************************* End LLT Test *************************************" diff --git a/mind-cluster/component/npu-exporter/cmd/npu-exporter/main.go b/mind-cluster/component/npu-exporter/cmd/npu-exporter/main.go new file mode 100644 index 0000000..700b248 --- /dev/null +++ b/mind-cluster/component/npu-exporter/cmd/npu-exporter/main.go @@ -0,0 +1,545 @@ +/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package main +package main + +import ( + "context" + "errors" + "flag" + "fmt" + "log" + "net" + "net/http" + "os" + "regexp" + "strconv" + "strings" + "sync" + "time" + + "github.com/influxdata/telegraf/plugins/common/shim" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" + + "ascend-common/api" + "ascend-common/common-utils/hwlog" + "ascend-common/common-utils/limiter" + "ascend-common/devmanager" + "ascend-common/devmanager/common" + colcommon "huawei.com/npu-exporter/v6/collector/common" + "huawei.com/npu-exporter/v6/collector/config" + "huawei.com/npu-exporter/v6/collector/container" + _ "huawei.com/npu-exporter/v6/platforms/inputs/npu" + "huawei.com/npu-exporter/v6/platforms/prom" + "huawei.com/npu-exporter/v6/plugins" + "huawei.com/npu-exporter/v6/utils/logger" + "huawei.com/npu-exporter/v6/versions" +) + +var ( + port int + updateTime int + ip = "" + version bool + concurrency int + containerMode = "" + containerd = "" + endpoint = "" + limitIPReq = "" + platform = "" + textMetricsFilePath = "" + limitIPConn int + limitTotalConn int + cacheSize int + profilingTime int + hccsBWProfilingTime int + pollInterval time.Duration + deviceResetTimeout int +) + +const ( + portConst = 8082 + updateTimeConst = 5 + cacheTime = 100 * time.Second + portLeft = 1025 + portRight = 40000 + oneMinute = 60 + defaultConcurrency = 5 + defaultLogFile = "/var/log/mindx-dl/npu-exporter/npu-exporter.log" + containerModeDocker = "docker" + containerModeContainerd = "containerd" + containerModeIsula = "isula" + unixPre = "unix://" + timeout = 10 + maxHeaderBytes = 1024 + // tenDays ten days + tenDays = 10 + maxIPConnLimit = 128 + maxConcurrency = 512 + defaultConnection = 20 + maxProfilingTime = 2000 + minHccsBWProfilingTime = 1 + maxHccsBWProfilingTime = 1000 + defaultShutDownTimeout = 30 * time.Second +) + +const ( + prometheusPlatform = "Prometheus" + telegrafPlatform = "Telegraf" + pollIntervalStr = 
"poll_interval" + platformStr = "platform" + defaultProfilingTime = 200 + defaultHccsBwProfilingTime = 200 +) + +func main() { + flag.Parse() + if version { + fmt.Printf("NPU-exporter version: %s \n", versions.BuildVersion) + return + } + err := logger.InitLogger(platform) + if err != nil { + fmt.Fprintf(os.Stderr, "%v", err) + return + } + initPaprams() + err = paramValid(platform) + if err != nil { + return + } + dmgr, err := devmanager.AutoInit("", deviceResetTimeout) + if err != nil { + logger.Errorf("new npu collector failed, error is %v", err) + return + } + logger.Infof("npu exporter starting and the version is %s", versions.BuildVersion) + deviceParser := container.MakeDevicesParser(readCntMonitoringFlags()) + defer deviceParser.Close() + + if err := deviceParser.Init(); err != nil { + logger.Errorf("failed to init devices parser: %v", err) + } + deviceParser.Timeout = time.Duration(updateTime) * time.Second + + colcommon.Collector = colcommon.NewNpuCollector(cacheTime, time.Duration(updateTime)*time.Second, deviceParser, dmgr) + plugins.InitTextMetricsDesc(textMetricsFilePath) + plugins.RegisterPlugin() + config.Register(colcommon.Collector) + + ctx, cancel := context.WithCancel(context.Background()) + wg := &sync.WaitGroup{} + colcommon.InitCardInfo(wg, ctx, colcommon.Collector) + colcommon.StartContainerInfoCollect(ctx, cancel, wg, colcommon.Collector) + + colcommon.StartCollect(wg, ctx, colcommon.Collector) + switch platform { + case prometheusPlatform: + prometheusProcss(wg, ctx, cancel) + case telegrafPlatform: + telegrafProcess() + default: + err = fmt.Errorf("err platform input") + } + wg.Wait() +} + +func prometheusProcss(wg *sync.WaitGroup, ctx context.Context, cancel context.CancelFunc) { + c := prom.NewPrometheusCollector(colcommon.Collector) + reg := prometheus.NewRegistry() + reg.MustRegister(c) + + wg.Add(1) + go func() { + startServe(ctx, cancel, reg) + wg.Done() + }() +} + +func initPaprams() { + 
common.SetHccsBWProfilingTime(hccsBWProfilingTime) + common.SetExternalParams(profilingTime) +} + +func paramValid(platform string) error { + var err error + switch platform { + case prometheusPlatform: + err = paramValidInPrometheus() + case telegrafPlatform: + err = paramValidInTelegraf() + default: + err = fmt.Errorf("err platform input") + } + if err != nil { + logger.Error(err) + return err + } + return nil +} + +func initConfig() *limiter.HandlerConfig { + conf := &limiter.HandlerConfig{ + PrintLog: true, + Method: http.MethodGet, + LimitBytes: limiter.DefaultDataLimit, + TotalConCurrency: concurrency, + IPConCurrency: limitIPReq, + CacheSize: limiter.DefaultCacheSize, + } + return conf +} + +func newServerAndListener(conf *limiter.HandlerConfig) (*http.Server, net.Listener) { + handler, err := limiter.NewLimitHandlerV2(http.DefaultServeMux, conf) + if err != nil { + hwlog.RunLog.Error(err) + return nil, nil + } + s := &http.Server{ + Addr: ip + ":" + strconv.Itoa(port), + Handler: handler, + ReadTimeout: timeout * time.Second, + WriteTimeout: timeout * time.Second, + MaxHeaderBytes: maxHeaderBytes, + ErrorLog: log.New(&hwlog.SelfLogWriter{}, "", log.Lshortfile), + } + ln, err := net.Listen("tcp", s.Addr) + if err != nil { + logger.Errorf("listen ip and port error: %v", err) + return nil, nil + } + limitLs, err := limiter.LimitListener(ln, limitTotalConn, limitIPConn, limiter.DefaultCacheSize) + if err != nil { + hwlog.RunLog.Error(err) + return nil, nil + } + return s, limitLs +} + +func readCntMonitoringFlags() container.CntNpuMonitorOpts { + opts := container.CntNpuMonitorOpts{UseOciBackup: true, UseCriBackup: true} + switch containerMode { + case containerModeDocker: + opts.EndpointType = container.EndpointTypeDockerd + opts.OciEndpoint = container.DefaultDockerAddr + opts.CriEndpoint = container.DefaultDockerShim + case containerModeContainerd: + opts.EndpointType = container.EndpointTypeContainerd + opts.OciEndpoint = container.DefaultContainerdAddr + 
opts.CriEndpoint = container.DefaultContainerdAddr + case containerModeIsula: + opts.EndpointType = container.EndpointTypeIsula + opts.OciEndpoint = container.DefaultIsuladAddr + opts.CriEndpoint = container.DefaultIsuladAddr + default: + hwlog.RunLog.Error("invalid container mode setting,reset to docker") + opts.EndpointType = container.EndpointTypeDockerd + opts.OciEndpoint = container.DefaultDockerAddr + opts.CriEndpoint = container.DefaultDockerShim + } + if containerd != "" { + opts.OciEndpoint = containerd + opts.UseOciBackup = false + } + if endpoint != "" { + opts.CriEndpoint = endpoint + opts.UseCriBackup = false + } + return opts +} + +func checkIPAndPortInPrometheus() error { + if port < portLeft || port > portRight { + return errors.New("the port is invalid") + } + parsedIP := net.ParseIP(ip) + if parsedIP == nil { + return errors.New("the listen ip is invalid") + } + ip = parsedIP.String() + logger.Infof("listen on: %s", ip) + return nil +} + +func paramValidInPrometheus() error { + checks := []func() error{ + checkIPAndPortInPrometheus, + checkUpdateTime, + containerSockCheck, + checkLimitIPReqFormat, + checkLimitIPConn, + checkLimitTotalConn, + checkCacheSize, + checkConcurrency, + checkProfilingTime, + checkHccsBWProfilingTime, + checkDeviceResetTimeout, + checkPollIntervalInCmdLine, + } + + for _, check := range checks { + if err := check(); err != nil { + return err + } + } + return nil +} + +func checkUpdateTime() error { + if updateTime > oneMinute || updateTime < 1 { + return errors.New("the updateTime is invalid") + } + return nil +} + +func checkLimitIPReqFormat() error { + reg := regexp.MustCompile(limiter.IPReqLimitReg) + if !reg.Match([]byte(limitIPReq)) { + return errors.New("limitIPReq format error") + } + return nil +} + +func checkLimitIPConn() error { + if limitIPConn < 1 || limitIPConn > maxIPConnLimit { + return errors.New("limitIPConn is invalid") + } + return nil +} + +func checkLimitTotalConn() error { + if limitTotalConn < 1 || 
limitTotalConn > maxConcurrency { + return errors.New("limitTotalConn is invalid") + } + return nil +} + +func checkCacheSize() error { + if cacheSize < 1 || cacheSize > limiter.DefaultCacheSize*tenDays { + return errors.New("cacheSize is invalid") + } + return nil +} + +func checkConcurrency() error { + if concurrency < 1 || concurrency > maxConcurrency { + return errors.New("concurrency is invalid") + } + return nil +} + +func checkProfilingTime() error { + if profilingTime < 1 || profilingTime > maxProfilingTime { + return errors.New("profilingTime range error") + } + return nil +} + +func checkHccsBWProfilingTime() error { + if hccsBWProfilingTime < minHccsBWProfilingTime || hccsBWProfilingTime > maxHccsBWProfilingTime { + return errors.New("hccsBWProfilingTime range error") + } + return nil +} + +func checkDeviceResetTimeout() error { + if deviceResetTimeout < api.MinDeviceResetTimeout || deviceResetTimeout > api.MaxDeviceResetTimeout { + return errors.New("deviceResetTimeout range error") + } + return nil +} + +func checkPollIntervalInCmdLine() error { + cmdLine := strings.Join(os.Args[1:], "") + if strings.Contains(cmdLine, pollIntervalStr) { + return fmt.Errorf("%s is not support this scene", pollIntervalStr) + } + return nil +} + +func containerSockCheck() error { + if endpoint != "" && !strings.Contains(endpoint, ".sock") { + return errors.New("endpoint file is not sock address") + } + if containerd != "" && !strings.Contains(containerd, ".sock") { + return errors.New("containerd file is not sock address") + } + if endpoint != "" && !strings.Contains(endpoint, unixPre) { + endpoint = unixPre + endpoint + } + if containerd != "" && !strings.Contains(containerd, unixPre) { + containerd = unixPre + containerd + } + return nil +} + +func init() { + flag.IntVar(&port, "port", portConst, + "The server port of the http service,range[1025-40000]") + flag.StringVar(&ip, "ip", "", + "The listen ip of the service,0.0.0.0 is not recommended when install on Multi-NIC 
host") + flag.IntVar(&updateTime, "updateTime", updateTimeConst, + "Interval (seconds) to update the npu metrics cache,range[1-60]") + flag.BoolVar(&version, "version", false, + "If true,query the version of the program (default false)") + flag.StringVar(&containerMode, "containerMode", containerModeDocker, + "Set 'docker' for monitoring docker containers or 'containerd' for CRI & containerd") + flag.StringVar(&containerd, "containerd", "", + "The endpoint of containerd used for listening containers' events") + flag.StringVar(&endpoint, "endpoint", "", + "The endpoint of the CRI server to which will be connected") + flag.IntVar(&concurrency, "concurrency", defaultConcurrency, + "The max concurrency of the http server, range is [1-512]") + // hwlog configuration + flag.IntVar(&logger.HwLogConfig.LogLevel, "logLevel", 0, + "Log level, -1-debug, 0-info, 1-warning, 2-error, 3-critical(default 0)") + flag.IntVar(&logger.HwLogConfig.MaxAge, "maxAge", hwlog.DefaultMinSaveAge, + "Maximum number of days for backup log files, range [7, 700] days") + flag.StringVar(&logger.HwLogConfig.LogFileName, "logFile", defaultLogFile, + "Log file path. 
If the file size exceeds 20MB, will be rotated") + flag.IntVar(&logger.HwLogConfig.MaxBackups, "maxBackups", hwlog.DefaultMaxBackups, + "Maximum number of backup log files, range is (0, 30]") + flag.IntVar(&cacheSize, "cacheSize", limiter.DefaultCacheSize, "the cacheSize for ip limit,"+ + "range is [1,1024000],keep default normally") + flag.IntVar(&limitIPConn, "limitIPConn", defaultConcurrency, "the tcp connection limit for each Ip,"+ + "range is [1,128]") + flag.IntVar(&limitTotalConn, "limitTotalConn", defaultConnection, "the tcp connection limit for all"+ + " request,range is [1,512]") + flag.StringVar(&limitIPReq, "limitIPReq", "20/1", + "the http request limit counts for each Ip,20/1 means allow 20 request in 1 seconds") + flag.StringVar(&platform, "platform", "Prometheus", "the data reporting platform, "+ + "just support Prometheus and Telegraf") + flag.StringVar(&textMetricsFilePath, "textMetricsFilePath", "", + "text indicator collection path, only support specified one file path") + flag.DurationVar(&pollInterval, pollIntervalStr, 1*time.Second, + "how often to send metrics when use Telegraf plugin, "+ + "needs to be used with -platform=Telegraf, otherwise, it does not take effect") + flag.IntVar(&profilingTime, "profilingTime", defaultProfilingTime, + "config pcie bandwidth profiling time, range is [1, 2000]") + flag.IntVar(&hccsBWProfilingTime, api.HccsBWProfilingTimeStr, defaultHccsBwProfilingTime, + "config "+api.Hccs+" bandwidth profiling time, range is [1, 1000]") + flag.IntVar(&deviceResetTimeout, api.DeviceResetTimeout, api.DefaultDeviceResetTimeout, + "when npu-exporter starts, if the number of chips is insufficient, the maximum duration to wait for "+ + "the driver to report all chips, unit second, range [10, 600]") +} + +func indexHandler(w http.ResponseWriter, _ *http.Request) { + var proposal = "http" + _, err := w.Write([]byte( + ` + NPU-Exporter + +

NPU-Exporter

+

Welcome to use NPU-Exporter,the Prometheus metrics url is ` + proposal + `://ip:` + + strconv.Itoa(port) + `/metrics: Metrics

+ + `)) + if err != nil { + logger.Errorf("Write to response error: %v", err) + } +} + +func prometheusProcess() { + +} + +func startServe(ctx context.Context, cancel context.CancelFunc, reg *prometheus.Registry) { + http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{ErrorHandling: promhttp.ContinueOnError})) + http.Handle("/", http.HandlerFunc(indexHandler)) + conf := initConfig() + s, limitLs := newServerAndListener(conf) + if s == nil || limitLs == nil { + cancel() + return + } + + go func() { + logger.Warn("enable unsafe http server") + if err := s.Serve(limitLs); err != nil { + logger.Errorf("Http server error: %v and stopped", err) + cancel() + } + }() + + <-ctx.Done() + shutErr := func() error { + logger.Info("received stop signal, STOP http server") + ctxShutDown, timeOut := context.WithTimeout(context.Background(), defaultShutDownTimeout) + defer timeOut() + return s.Shutdown(ctxShutDown) + }() + if shutErr != nil { + logger.Errorf("shutdown http server error: %v", shutErr) + } +} + +func paramValidInTelegraf() error { + // cmdLine here must contain "-platform=Telegraf", otherwise, it will enter the Prometheus process + cmdLine := os.Args[1:] + + // store the preset parameter names in the map + presetParamsMap := map[string]bool{ + platformStr: true, + pollIntervalStr: true, + api.HccsBWProfilingTimeStr: true, + } + + if len(cmdLine) > len(presetParamsMap) { + return errors.New("too many parameters") + } + + var paramLen = 2 + // check every input params + for _, param := range cmdLine { + param = strings.TrimPrefix(param, "-") + split := strings.Split(param, "=") + if len(split) != paramLen { + return fmt.Errorf("the param [%s] is a wrong format", param) + } + paramName := split[0] + if !presetParamsMap[paramName] { + return fmt.Errorf("not support [%s] in Telegraf", paramName) + } + } + + if hccsBWProfilingTime < minHccsBWProfilingTime || hccsBWProfilingTime > maxHccsBWProfilingTime { + return errors.New(api.Hccs + "BWProfilingTime 
range error") + } + return nil +} + +func telegrafProcess() { + // create the shim. This is what will run your plugins. + shim := shim.New() + + // If no config is specified, all imported plugins are loaded. + // otherwise follow what the config asks for. + // Check for settings from a config toml file, + // (or just use whatever plugins were imported above) + configFile := "" + err := shim.LoadConfig(&configFile) + if err != nil { + fmt.Fprintf(os.Stderr, "Err loading input: %s\n", err) + return + } + + // run the input plugin(s) until stdin closes, or we receive a termination signal + if err := shim.Run(pollInterval); err != nil { + fmt.Fprintf(os.Stderr, "Err: %s\n", err) + return + } +} diff --git a/mind-cluster/component/npu-exporter/collector/common/collector_for_container.go b/mind-cluster/component/npu-exporter/collector/common/collector_for_container.go new file mode 100644 index 0000000..af46251 --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/common/collector_for_container.go @@ -0,0 +1,109 @@ +/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package common for general collector +package common + +import ( + "context" + "strings" + "sync" + "time" + + "ascend-common/common-utils/hwlog" + "huawei.com/npu-exporter/v6/collector/container" + "huawei.com/npu-exporter/v6/utils/logger" +) + +// StartContainerInfoCollect start collect container info +func StartContainerInfoCollect(ctx context.Context, cancelFunc context.CancelFunc, group *sync.WaitGroup, + n *NpuCollector) { + group.Add(1) + + go func() { + defer group.Done() + retryCount := 0 + collectContainerInfo := func() { + logger.Info("start to collect container info") + n.devicesParser.FetchAndParse(nil) + select { + case result := <-n.devicesParser.RecvResult(): + if err := n.cache.Set(containersDevicesCacheKey, result, n.cacheTime); err != nil { + logger.Error(err) + } + logger.Infof(UpdateCachePattern, containersDevicesCacheKey) + retryCount = 0 + case err := <-n.devicesParser.RecvErr(): + logger.Errorf("received error from device parser: %v", err) + if strings.Contains(err.Error(), "connection refused") { + retryCount++ + if retryCount == connectRefusedMaxRetry { + logger.Error("connection refused, task shutdown") + cancelFunc() + } + } + } + } + ticker := time.NewTicker(n.updateTime) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + logger.Info("received the stop signal,stop container info collect") + return + default: + collectContainerInfo() + if _, ok := <-ticker.C; !ok { + logger.Errorf(tickerFailedPattern, containersDevicesCacheKey) + return + } + } + } + }() +} + +// GetContainerNPUInfo get container npu info +func GetContainerNPUInfo(n *NpuCollector) map[int32]container.DevicesInfo { + obj, err := n.cache.Get(containersDevicesCacheKey) + // only run once to prevent wait when container info get failed + npuContainerInfoInit.Do(func() { + if err != nil { + logger.Warn("containers' devices info not found in cache, rebuilding") + resultChan := make(chan container.DevicesInfos, 1) + 
n.devicesParser.FetchAndParse(resultChan) + select { + case obj = <-resultChan: + case <-time.After(time.Second): + logger.Warn("rebuild container info cache timeout") + return + } + logger.Info("rebuild cache successfully") + } + }) + cntNpuInfos, ok := obj.(container.DevicesInfos) + if !ok { + logger.LogfWithOptions(logger.ErrorLevel, logger.LogOptions{Domain: DomainForContainerInfo, ID: 0}, + "error container npu info cache and convert failed") + return nil + } + hwlog.ResetErrCnt(DomainForContainerInfo, 0) + res := make(map[int32]container.DevicesInfo, initSize) + for _, v := range cntNpuInfos { + for _, deviceID := range v.Devices { + res[int32(deviceID)] = v + } + } + return res +} diff --git a/mind-cluster/component/npu-exporter/collector/common/collector_for_container_test.go b/mind-cluster/component/npu-exporter/collector/common/collector_for_container_test.go new file mode 100644 index 0000000..6412e12 --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/common/collector_for_container_test.go @@ -0,0 +1,137 @@ +/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package common for general collector +package common + +import ( + "sync" + "testing" + "time" + + "github.com/agiledragon/gomonkey/v2" + "github.com/smartystreets/goconvey/convey" + + "ascend-common/common-utils/cache" + "huawei.com/npu-exporter/v6/collector/container" +) + +const ( + testCacheTime = 60 * time.Second + testUpdateTime = 10 * time.Millisecond + testDeviceID0 = 0 + testDeviceID1 = 1 + testDeviceID2 = 2 + testContainerID1 = "container1" + testContainerID2 = "container2" + testContainerName1 = "test-container-1" + testContainerName2 = "test-container-2" +) + +var ( + testDevicesInfos = container.DevicesInfos{ + testContainerID1: { + ID: testContainerID1, + Name: testContainerName1, + Devices: []int{testDeviceID0, testDeviceID1}, + }, + testContainerID2: { + ID: testContainerID2, + Name: testContainerName2, + Devices: []int{testDeviceID2}, + }, + } +) + +func createTestNpuCollector() *NpuCollector { + parser := &container.DevicesParser{} + return &NpuCollector{ + cache: cache.New(cacheSize), + devicesParser: parser, + updateTime: testUpdateTime, + cacheTime: testCacheTime, + } +} + +func resetNpuContainerInfoInit() { + npuContainerInfoInit = sync.Once{} +} + +type getContainerNPUInfoTestCase struct { + name string + setupCache func(*NpuCollector) + mockParser func(*gomonkey.Patches, *container.DevicesParser) + expectedResult map[int32]container.DevicesInfo +} + +func createGetContainerNPUInfoTestCases() []getContainerNPUInfoTestCase { + return []getContainerNPUInfoTestCase{ + { + name: "should return container npu info when cache exists", + setupCache: func(n *NpuCollector) { + n.cache.Set(containersDevicesCacheKey, testDevicesInfos, testCacheTime) + }, + mockParser: func(patches *gomonkey.Patches, parser *container.DevicesParser) {}, + expectedResult: map[int32]container.DevicesInfo{ + int32(testDeviceID0): testDevicesInfos[testContainerID1], + int32(testDeviceID1): testDevicesInfos[testContainerID1], + int32(testDeviceID2): 
testDevicesInfos[testContainerID2], + }, + }, + { + name: "should rebuild cache when cache not exists", + setupCache: func(n *NpuCollector) {}, + mockParser: func(patches *gomonkey.Patches, parser *container.DevicesParser) { + patches.ApplyMethod(parser, "FetchAndParse", + func(p *container.DevicesParser, resultOut chan<- container.DevicesInfos) { + if resultOut != nil { + resultOut <- testDevicesInfos + } + }) + }, + expectedResult: map[int32]container.DevicesInfo{ + int32(testDeviceID0): testDevicesInfos[testContainerID1], + int32(testDeviceID1): testDevicesInfos[testContainerID1], + int32(testDeviceID2): testDevicesInfos[testContainerID2], + }, + }, + { + name: "should return nil when cache type conversion failed", + setupCache: func(n *NpuCollector) { + n.cache.Set(containersDevicesCacheKey, "invalid type", testCacheTime) + }, + mockParser: func(patches *gomonkey.Patches, parser *container.DevicesParser) {}, + expectedResult: nil, + }, + } +} + +func TestGetContainerNPUInfo(t *testing.T) { + testCases := createGetContainerNPUInfoTestCases() + + for _, tc := range testCases { + convey.Convey(tc.name, t, func() { + resetNpuContainerInfoInit() + n := createTestNpuCollector() + tc.setupCache(n) + + patches := gomonkey.NewPatches() + defer patches.Reset() + tc.mockParser(patches, n.devicesParser) + + result := GetContainerNPUInfo(n) + convey.So(result, convey.ShouldResemble, tc.expectedResult) + }) + } +} diff --git a/mind-cluster/component/npu-exporter/collector/common/constants.go b/mind-cluster/component/npu-exporter/collector/common/constants.go new file mode 100644 index 0000000..d7e1409 --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/common/constants.go @@ -0,0 +1,140 @@ +/* Copyright(C) 2021-2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package common for general constants +package common + +// metric label name +const ( + npuID = "id" + modelName = "model_name" + npuUUID = "vdie_id" + npuPCIEInfo = "pcie_bus_info" + namespace = "namespace" + podName = "pod_name" + cntrName = "container_name" +) + +const ( + // Healthy status of Health + Healthy = "Healthy" + // UnHealthy status of unhealth + UnHealthy = "UnHealthy" + // Abnormal status of Abnormal + Abnormal = "Abnormal" + + // LinkUp npu interface up + LinkUp = "UP" + // LinkDown npu interface down + LinkDown = "DOWN" + + // Base convert base + Base = 10 + // ContainerNameLen container name length + ContainerNameLen = 3 + // npuListCacheKey Cache key + npuListCacheKey = "npu-exporter-npu-list" + // Cache key for parsing-device result + containersDevicesCacheKey = "npu-exporter-containers-devices" + initSize = 8 + tickerFailedPattern = "%s ticker failed, task shutdown" + // UpdateCachePattern Update cache pattern + UpdateCachePattern = "update Cache,key is %s" + connectRefusedMaxRetry = 3 +) + +const ( + cacheSize = 128 + // NameSpaceIdx is the index of namespace in container name + NameSpaceIdx = 0 + // PodNameIdx is the index of pod name in container name + PodNameIdx = 1 + // ConNameIdx is the index of container name in container name + ConNameIdx = 2 + + // DecimalPlaces is the decimal places of float64 + DecimalPlaces = 2 + // BitSize is the bit size of float64 + BitSize = 64 + // GeneralDevTagKey is the default value of devTagKey in telegraf, it means the metric is not related to any device + GeneralDevTagKey = "GeneralDevTagKey" 
+) + +// log limit domains for metrics +const ( + // DomainForLogicIdErr domain for faild to get cardId and deviceId by logicID + DomainForLogicIdErr = "logicID" + + // DomainForHccs domain for hccs + DomainForHccs = "hccs" + + // DomainForDDR domain for DDR + DomainForDDR = "DDR" + + // DomainForSio domain for sio + DomainForSio = "sio" + + // DomainForHBM domain for HBM + DomainForHBM = "hbm" + + // DomainForHBMECC domain for hbmEcc + DomainForHBMECC = "hbmEcc" + + // DomainForHccsBW domain for hccs bandwidth + DomainForHccsBW = "hccsBw" + + // DomainForOptical domain for Optical + DomainForOptical = "optical" + + // DomainForLinkState domain for linkState + DomainForLinkState = "linkState" + + // DomainForBandwidth domain for bandwidth + DomainForBandwidth = "bandwidth" + + // DomainForLinkStat domain for linkStat + DomainForLinkStat = "linkStat" + + // DomainForLinkSpeed domain for linkSpeed + DomainForLinkSpeed = "linkSpeed" + + // DomainForRoce domain for roce + DomainForRoce = "roce" + + // DomainForMcuPower domain for mcu power + DomainForMcuPower = "mcuPower" + + // DomainForChipPower domain for chip power + DomainForChipPower = "chipPower" + + // DomainForAICoreUtilization domain for ai core utilization + DomainForAICoreUtilization = "AICoreUtilization" + + // DomainForVectorCoreUtilization domain for vector core utilization + DomainForVectorCoreUtilization = "vectorCoreUtilization" + + // DomainForProcess domain for process info + DomainForProcess = "processInfo" + + // DomainForHbmUtilization domain for High Bandwidth Memory Utilization + DomainForHbmUtilization = "hbmUtilization" + + // DomainForOverallUtilization domain for overall utilization + DomainForOverallUtilization = "overallUtilization" + + // DomainForPcieBandwidth domain for pcie bandwidth + DomainForPcieBandwidth = "pcieBandwidth" + // DomainForContainerInfo domain for pcie container info + DomainForContainerInfo = "containerInfo" +) diff --git 
a/mind-cluster/component/npu-exporter/collector/common/metrics_collector.go b/mind-cluster/component/npu-exporter/collector/common/metrics_collector.go new file mode 100644 index 0000000..d891649 --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/common/metrics_collector.go @@ -0,0 +1,192 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package common for general collector +package common + +import ( + "reflect" + "strings" + "sync" + + "github.com/prometheus/client_golang/prometheus" + + "ascend-common/api" + "huawei.com/npu-exporter/v6/collector/container" + "huawei.com/npu-exporter/v6/utils/logger" +) + +var ( + // CardLabel general card label + CardLabel = []string{npuID, modelName, npuUUID, npuPCIEInfo, namespace, podName, cntrName} + + noNeedToPrintUpdateLog = map[string]bool{ + "NetworkCollector": true, + "RoceCollector": true, + "OpticalCollector": true, + } +) + +// BuildDescSlice build desc slice +func BuildDescSlice(slice *[]*prometheus.Desc, name string, help string) { + *slice = append(*slice, BuildDesc(name, help)) +} + +// BuildDesc build desc +func BuildDesc(name string, help string) *prometheus.Desc { + return prometheus.NewDesc(name, help, CardLabel, nil) +} + +// BuildDescWithLabel build desc with label +func BuildDescWithLabel(name string, help string, label []string) *prometheus.Desc { + return prometheus.NewDesc(name, help, label, nil) +} + +// MetricsCollector 
metrics collector +type MetricsCollector interface { + // Describe report metrics to prometheus + Describe(ch chan<- *prometheus.Desc) + + // CollectToCache collect data to cache + CollectToCache(n *NpuCollector, chipList []HuaWeiAIChip) + + // UpdatePrometheus update prometheus + UpdatePrometheus(ch chan<- prometheus.Metric, n *NpuCollector, containerMap map[int32]container.DevicesInfo, + chips []HuaWeiAIChip) + + // UpdateTelegraf update telegraf + UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []HuaWeiAIChip) map[string]map[string]interface{} + + // PreCollect pre handle before collect + PreCollect(*NpuCollector, []HuaWeiAIChip) + + // PostCollect post handle after collect + PostCollect(*NpuCollector) + + // IsSupported Check whether the current hardware supports this metric + IsSupported(*NpuCollector) bool +} + +// MetricsCollectorAdapter base collector for metrics collector +type MetricsCollectorAdapter struct { + LocalCache sync.Map + Is910Series bool + ContainerMap map[int32]container.DevicesInfo + Chips []HuaWeiAIChip +} + +// Describe report metrics to prometheus +func (c *MetricsCollectorAdapter) Describe(ch chan<- *prometheus.Desc) { +} + +// CollectToCache collect data to cache +func (c *MetricsCollectorAdapter) CollectToCache(n *NpuCollector, chipList []HuaWeiAIChip) { +} + +// UpdatePrometheus update prometheus +func (c *MetricsCollectorAdapter) UpdatePrometheus(ch chan<- prometheus.Metric, n *NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []HuaWeiAIChip) { +} + +// UpdateTelegraf update telegraf +func (c *MetricsCollectorAdapter) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []HuaWeiAIChip) map[string]map[string]interface{} { + return fieldsMap +} + +// PreCollect pre handle before collect +func (c *MetricsCollectorAdapter) PreCollect(n *NpuCollector, chipList 
[]HuaWeiAIChip) { + if strings.Contains(n.Dmgr.GetDevType(), api.Ascend910A) { + c.Is910Series = true + } +} + +// PostCollect post handle after collect +func (c *MetricsCollectorAdapter) PostCollect(*NpuCollector) { +} + +// IsSupported Check whether the current hardware supports this metric +func (c *MetricsCollectorAdapter) IsSupported(*NpuCollector) bool { + return true +} + +// UpdateCache update cache +func UpdateCache[T any](n *NpuCollector, cacheKey string, localCache *sync.Map) { + var cacheInfo = make(map[int32]T) + obj, err := n.cache.Get(cacheKey) + if err != nil { + logger.Debugf("get info of %s failed: %v, use initial data", cacheKey, err) + } else { + if oldCacheInfo, ok := obj.(map[int32]T); ok { + cacheInfo = copyMap(oldCacheInfo) + } else { + logger.Debug("cache format invalid, reset") + } + } + + localCache.Range(func(key, value interface{}) bool { + finalKey, okKey := key.(int32) + finalValue, okValue := value.(T) + if okKey && okValue { + cacheInfo[finalKey] = finalValue + } + return true + }) + + err = n.cache.Set(cacheKey, cacheInfo, n.cacheTime) + if noNeedToPrintUpdateLog[cacheKey] { + return + } + if err != nil { + logger.Error(err) + } +} + +func copyMap[T any](oldCacheInfo map[int32]T) map[int32]T { + var cacheInfo = make(map[int32]T) + for key, value := range oldCacheInfo { + cacheInfo[key] = value + } + return cacheInfo +} + +// GetInfoFromCache get info from cache +func GetInfoFromCache[T any](n *NpuCollector, cacheKey string) map[int32]T { + res := make(map[int32]T) + obj, err := n.cache.Get(cacheKey) + if err != nil { + logger.Warn("cache not found, please wait for rebuild") + return res + } + + if data, ok := obj.(map[int32]T); ok { + return data + } + logger.Error("cache type mismatch") + return res +} + +// GetCacheKey Obtain the name of the struct pointer as the key of the cache +func GetCacheKey(ptr interface{}) string { + v := reflect.ValueOf(ptr) + if v.Kind() != reflect.Ptr { + return "" + } + v = v.Elem() + if v.Kind() != 
reflect.Struct { + return "" + } + return v.Type().Name() +} diff --git a/mind-cluster/component/npu-exporter/collector/common/metrics_collector_test.go b/mind-cluster/component/npu-exporter/collector/common/metrics_collector_test.go new file mode 100644 index 0000000..f66ceb5 --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/common/metrics_collector_test.go @@ -0,0 +1,231 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
// TestCopyMap checks that copyMap returns a non-nil map with the same entries
// for nil, empty, single-element and multi-element inputs.
func TestCopyMap(t *testing.T) {
	type testStruct struct {
		name string
		age  int
	}
	mockString := "mock"
	tests := []struct {
		name     string
		input    map[int32]testStruct
		validate func(*testing.T, interface{})
	}{
		{name: "NilInput", input: (map[int32]testStruct)(nil),
			validate: func(t *testing.T, got interface{}) {
				g, ok := got.(map[int32]testStruct)
				if !ok || g == nil || len(g) != 0 {
					t.Errorf("should return empty map for nil input")
				}
			}},
		{name: "EmptyMap", input: map[int32]testStruct{},
			validate: func(t *testing.T, got interface{}) {
				if len(got.(map[int32]testStruct)) != 0 {
					t.Errorf("expected empty map")
				}
			}},
		{name: "SingleElement", input: map[int32]testStruct{1: {name: mockString, age: 1}},
			validate: func(t *testing.T, got interface{}) {
				g, ok := got.(map[int32]testStruct)
				if !ok || g[1].name != mockString || g[1].age != 1 || len(g) != 1 {
					t.Errorf("element mismatch")
				}
			}},
		{name: "MultipleElements", input: map[int32]testStruct{1: {name: mockString, age: 1}, 2: {name: mockString, age: 1}},
			validate: func(t *testing.T, got interface{}) {
				expected := map[int32]testStruct{1: {name: mockString, age: 1}, 2: {name: mockString, age: 1}}
				if !reflect.DeepEqual(got, expected) {
					t.Errorf("deepEqual failed")
				}
			}},
	}

	for _, tt := range tests {
		convey.Convey(tt.name, t, func() {
			got := copyMap[testStruct](tt.input)
			tt.validate(t, got)
		})
	}
}

// TestPreCollect checks the Is910Series flag after PreCollect for a patched device type.
func TestPreCollect(t *testing.T) {
	tests := []struct {
		name       string
		deviceType string
		expected   bool
	}{
		{name: "TestPreCollect_" + api.Ascend910,
			deviceType: api.Ascend910,
			expected:   true,
		},
		{name: "TestPreCollect_" + api.Ascend310,
			deviceType: api.Ascend310,
			expected:   false,
		},
	}
	convey.Convey("TestPreCollect", t, func() {
		n := mockNewNpuCollector()
		// NOTE(review): the adapter is shared by all cases and PreCollect never
		// clears Is910Series, so the result depends on how convey re-runs this
		// outer block for each nested case - confirm.
		adapter := MetricsCollectorAdapter{
			Is910Series:  false,
			ContainerMap: nil,
			Chips:        nil,
		}
		for _, tt := range tests {
			convey.Convey(tt.name, func() {
				patches := gomonkey.NewPatches()
				defer patches.Reset()
				patches.ApplyMethodReturn(n.Dmgr, "GetDevType", tt.deviceType)
				adapter.PreCollect(n, nil)
				convey.So(adapter.Is910Series, convey.ShouldEqual, tt.expected)
			})
		}
	})
}

// cacheCase describes one UpdateCache scenario: the cache key to update, a
// hook run before the call and the expected number of entries afterwards.
type cacheCase struct {
	name      string
	cacheKey  string
	preHandle func()
	expected  int
}

// buildTestsForUpdateCache builds the UpdateCache scenarios; every case expects
// the same number of entries.
func buildTestsForUpdateCache(expected int) []cacheCase {
	tests := []cacheCase{
		{name: "TestUpdateCache_save info to cache",
			cacheKey:  "mockKey1",
			preHandle: func() {},
			expected:  expected,
		},
		{name: "TestUpdateCache_update old cache",
			cacheKey: "mockKey2",
			preHandle: func() {
				// mutates the package level map and is not restored afterwards
				noNeedToPrintUpdateLog["mockKey2"] = true
			},
			expected: expected,
		},
		{name: "TestUpdateCache_old cache is in incorrect type",
			cacheKey:  "mockKey3",
			preHandle: func() {},
			expected:  expected,
		},
	}
	return tests
}

// TestUpdateCache checks that UpdateCache stores a map[int32]string with one
// entry when the key is new, already cached, or cached with a different type.
func TestUpdateCache(t *testing.T) {
	const key = int32(0)
	const expected = 1
	tests := buildTestsForUpdateCache(expected)

	n := mockNewNpuCollector()
	// data init
	n.cache.Set("mockKey2", map[int32]string{key: "0"}, n.cacheTime)
	n.cache.Set("mockKey3", map[int32]int{key: 0}, n.cacheTime)

	convey.Convey("TestUpdateCache", t, func() {

		for _, tt := range tests {
			convey.Convey(tt.name, func() {
				localCache := sync.Map{}
				localCache.Store(key, "mockValue")
				tt.preHandle()
				UpdateCache[string](n, tt.cacheKey, &localCache)

				data, err := n.cache.Get(tt.cacheKey)
				convey.So(err, convey.ShouldBeNil)
				map2, ok := data.(map[int32]string)
				convey.So(ok, convey.ShouldBeTrue)
				convey.So(len(map2), convey.ShouldEqual, tt.expected)
			})
		}

	})
}

// TestGetInfoFromCache checks the size of the map returned by GetInfoFromCache
// for a missing key, a matching entry and an entry of another type.
func TestGetInfoFromCache(t *testing.T) {
	const key = int32(0)
	tests := []struct {
		name     string
		cacheKey string
		expected int
	}{
		{name: "TestGetInfoFromCache_no info in cache",
			cacheKey: "mockKey1",
			expected: 0,
		},
		{name: "TestGetInfoFromCache_correct",
			cacheKey: "mockKey2",
			expected: 1,
		},
		{name: "TestGetInfoFromCache_info in cache is in incorrect type",
			cacheKey: "mockKey3",
			expected: 0,
		},
	}
	n := mockNewNpuCollector()
	// data init
	n.cache.Set("mockKey2", map[int32]string{key: "mockValue"}, n.cacheTime)
	n.cache.Set("mockKey3", map[int32]int{key: 0}, n.cacheTime)
	for _, tt := range tests {
		convey.Convey(tt.name, t, func() {
			cache := GetInfoFromCache[string](n, tt.cacheKey)
			convey.So(len(cache), convey.ShouldEqual, tt.expected)
		})
	}
}

// TestGetCacheKey checks that GetCacheKey returns the struct name for a struct
// pointer and "" for a non-pointer or a struct value.
func TestGetCacheKey(t *testing.T) {
	tests := []struct {
		name     string
		args     interface{}
		expected string
	}{
		{name: "TestGetCacheKey_ptr",
			args:     &MetricsCollectorAdapter{},
			expected: "MetricsCollectorAdapter",
		},
		{name: "TestGetCacheKey_int",
			args:     0,
			expected: "",
		},
		{name: "TestGetCacheKey_struct",
			args:     MetricsCollectorAdapter{},
			expected: "",
		},
	}

	convey.Convey("TestGetCacheKey", t, func() {
		for _, tt := range tests {
			convey.Convey(tt.name, func() {
				convey.So(GetCacheKey(tt.args), convey.ShouldEqual, tt.expected)
			})
		}
	})
}
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package common for general collector +package common + +import ( + "context" + "sync" + "time" + + "ascend-common/api" + "ascend-common/common-utils/cache" + "ascend-common/devmanager" + "ascend-common/devmanager/common" + "ascend-common/devmanager/dcmi" + "huawei.com/npu-exporter/v6/collector/container" + "huawei.com/npu-exporter/v6/utils/logger" +) + +var ( + npuContainerInfoInit sync.Once + npuChipInfoInit sync.Once + // Collector base collector for prometheus and telegraf + Collector *NpuCollector + + // ChainForSingleGoroutine a list of collectors for single goroutine + ChainForSingleGoroutine []MetricsCollector + + // ChainForMultiGoroutine a list of collectors for multi goroutine + ChainForMultiGoroutine []MetricsCollector + + // ChainForCustomPlugin a list of collectors for plugin + ChainForCustomPlugin []MetricsCollector + + updateTimeForCardIds = time.Minute +) + +const ( + maxCollectTimeout = 10 * time.Second +) + +// NpuCollector for collect metrics +type NpuCollector struct { + cache *cache.ConcurrencyLRUCache + devicesParser *container.DevicesParser + updateTime time.Duration + cacheTime time.Duration + Dmgr *devmanager.DeviceManager +} + +// NewNpuCollector create a new collector +func NewNpuCollector(cacheTime time.Duration, updateTime time.Duration, + deviceParser *container.DevicesParser, dmgr *devmanager.DeviceManager) *NpuCollector { + CommonCollector := &NpuCollector{ + cache: cache.New(cacheSize), + cacheTime: cacheTime, + updateTime: updateTime, + devicesParser: deviceParser, + Dmgr: dmgr, + } + return CommonCollector +} + +// 
StartCollect start collect +func StartCollect(group *sync.WaitGroup, ctx context.Context, n *NpuCollector) { + npuChipInfoInitAtFirstTime(n) + startCollectSingleGoroutine(group, ctx, n) + startCollectForMultiGoroutine(group, ctx, n) + startCollectForPluginGoroutine(group, ctx, n) +} + +func startCollectForPluginGoroutine(group *sync.WaitGroup, ctx context.Context, n *NpuCollector) { + group.Add(1) + go func() { + defer group.Done() + ticker := time.NewTicker(n.updateTime) + defer ticker.Stop() + goroutinePreCollect(ChainForCustomPlugin, n) + defer goroutinePostCollect(ChainForCustomPlugin, n) + runPluginCollect(ctx, n, ticker) + }() +} + +func runPluginCollect(ctx context.Context, n *NpuCollector, ticker *time.Ticker) { + for { + select { + case <-ctx.Done(): + logger.Info("received the stop signal,stop plugin collect") + return + default: + collectPluginMetrics(n) + if _, ok := <-ticker.C; !ok { + logger.Errorf(tickerFailedPattern, "handling plugin collectors") + return + } + } + } +} + +func collectPluginMetrics(n *NpuCollector) { + chipList := getChipListCache(n) + for _, c := range ChainForCustomPlugin { + resultChan := make(chan struct{}, 1) + go func(cur MetricsCollector) { + cur.CollectToCache(n, chipList) + resultChan <- struct{}{} + }(c) + select { + case <-resultChan: + continue + case <-time.After(maxCollectTimeout): + logger.Errorf("collect timeout for %v", GetCacheKey(c)) + continue + } + + } +} + +func startCollectForMultiGoroutine(group *sync.WaitGroup, ctx context.Context, n *NpuCollector) { + chips := getChipListCache(n) + + group.Add(len(chips)) + for _, chip := range chips { + go func(chip HuaWeiAIChip) { + defer group.Done() + runChipCollector(ctx, n, chip) + }(chip) + } +} + +func runChipCollector(ctx context.Context, n *NpuCollector, chip HuaWeiAIChip) { + ticker := time.NewTicker(n.updateTime) + defer ticker.Stop() + goroutinePreCollect(ChainForMultiGoroutine, n) + defer goroutinePostCollect(ChainForMultiGoroutine, n) + for { + select { + 
case <-ctx.Done(): + logger.Infof("received the stop signal,stop collect network info of npu(%d)", chip.LogicID) + return + default: + singleChipSlice := []HuaWeiAIChip{chip} + for _, c := range ChainForMultiGoroutine { + c.CollectToCache(n, singleChipSlice) + } + if _, ok := <-ticker.C; !ok { + logger.Errorf(tickerFailedPattern, "collect for multigroutine ") + return + } + } + } +} + +func goroutinePreCollect(collectors []MetricsCollector, n *NpuCollector) { + chipList := getChipListCache(n) + for _, c := range collectors { + c.PreCollect(n, chipList) + } +} + +func goroutinePostCollect(collectors []MetricsCollector, n *NpuCollector) { + for _, c := range collectors { + c.PostCollect(n) + } +} + +func startCollectSingleGoroutine(group *sync.WaitGroup, ctx context.Context, n *NpuCollector) { + group.Add(1) + go func() { + defer group.Done() + ticker := time.NewTicker(n.updateTime) + defer ticker.Stop() + goroutinePreCollect(ChainForSingleGoroutine, n) + defer goroutinePostCollect(ChainForSingleGoroutine, n) + for { + select { + case <-ctx.Done(): + logger.Info("received the stop signal,stop npu base info collect") + return + default: + chipList := getChipListCache(n) + for _, c := range ChainForSingleGoroutine { + c.CollectToCache(n, chipList) + } + if _, ok := <-ticker.C; !ok { + logger.Errorf(tickerFailedPattern, "handling all collectors") + return + } + } + } + }() +} + +// npuChipInfoInitAtFirstTime When first enter, the cache data is empty, +// need to get the data from the device, and build the cache +func npuChipInfoInitAtFirstTime(n *NpuCollector) { + npuChipInfoInit.Do(func() { + _, err := n.cache.Get(npuListCacheKey) + if err != nil { + logger.Debug("no cache in first time, start to collect chip list and rebuild cache") + + npuInfo := getNPUChipList(n.Dmgr) + if err := n.cache.Set(npuListCacheKey, npuInfo, n.cacheTime); err != nil { + logger.Error(err) + } else { + logger.Infof(UpdateCachePattern, npuListCacheKey) + } + logger.Debug("rebuild cache 
successfully") + } + }) +} + +// InitCardInfo init card info +func InitCardInfo(group *sync.WaitGroup, ctx context.Context, n *NpuCollector) { + + group.Add(1) + go func() { + defer group.Done() + ticker := time.NewTicker(updateTimeForCardIds) + defer ticker.Stop() + for { + logger.Info("start to collect npu chip list info") + select { + case <-ctx.Done(): + logger.Info("received the stop signal,stop card info collect") + return + default: + npuInfo := getNPUChipList(n.Dmgr) + if err := n.cache.Set(npuListCacheKey, npuInfo, n.cacheTime); err != nil { + logger.Error(err) + } else { + logger.Infof(UpdateCachePattern, npuListCacheKey) + } + if _, ok := <-ticker.C; !ok { + logger.Errorf(tickerFailedPattern, npuListCacheKey) + return + } + } + } + }() +} + +func getNPUChipList(dmgr devmanager.DeviceInterface) (npuInfo []HuaWeiAIChip) { + chipList := make([]HuaWeiAIChip, 0) + + cardNum, cards, err := dmgr.GetCardList() + if err != nil || cardNum == 0 { + logger.Errorf("failed to get npu info, error is: %v", err) + return chipList + } + + chipListIDs := make([]int32, 0) + + for _, cardID := range cards { + deviceNum, _ := dmgr.GetDeviceNumInCard(cardID) + for deviceID := int32(0); deviceID < deviceNum; deviceID++ { + var chip HuaWeiAIChip + // get logicID + logicID, err := dmgr.GetDeviceLogicID(cardID, deviceID) + if err != nil { + logger.Errorf("get logic ID of card: %v device:%v failed: %v", cardID, deviceID, err) + continue + } + + chip.LogicID = logicID + chip.CardId = cardID + chip.MainBoardId = dmgr.GetMainBoardId() + + setPhyId(&chip, dmgr, cardID, deviceID) + setChipInfo(&chip, dmgr, cardID, deviceID) + setBoardInfo(&chip, dmgr, cardID, deviceID) + setVdieID(&chip, dmgr, cardID, deviceID) + assemblevNPUInfo(dmgr, logicID, &chip) + setPCIeBusInfo(logicID, dmgr, &chip) + setElabelInfo(&chip, dmgr, cardID) + + chipList = append(chipList, chip) + chipListIDs = append(chipListIDs, logicID) + } + } + + logger.Debugf("flush chip info list successed,chip num is : %v, 
chipLogicIDs: %v", + len(chipList), chipListIDs) + return chipList +} + +func setBoardInfo(chip *HuaWeiAIChip, dmgr devmanager.DeviceInterface, cardID int32, deviceID int32) { + boardInfo, err := dmgr.GetBoardInfo(chip.LogicID) + if err != nil { + logger.Errorf("get board info of card: %v device:%v failed: %v", cardID, deviceID, err) + boardInfo = common.BoardInfo{} + } + chip.BoardInfo = &boardInfo +} +func setVdieID(chip *HuaWeiAIChip, dmgr devmanager.DeviceInterface, cardID int32, deviceID int32) { + vdieID, err := dmgr.GetDieID(chip.LogicID, dcmi.VDIE) + if err != nil { + logger.Debug(err) + } + chip.VDieID = vdieID +} + +func setPhyId(chip *HuaWeiAIChip, dmgr devmanager.DeviceInterface, cardID int32, deviceID int32) { + phyID, err := dmgr.GetPhysicIDFromLogicID(chip.LogicID) + if err != nil { + logger.Errorf("get phy ID of card: %v device:%v failed: %v", cardID, deviceID, err) + } + chip.PhyId = phyID + chip.DeviceID = phyID +} +func setChipInfo(chip *HuaWeiAIChip, dmgr devmanager.DeviceInterface, cardID int32, deviceID int32) { + // get chip info + chipInfo, err := dmgr.GetChipInfo(chip.LogicID) + if err != nil { + logger.Errorf("get chip info of card: %v device:%v failed: %v", cardID, deviceID, err) + chipInfo = &common.ChipInfo{} + } + chip.ChipInfo = chipInfo +} + +func setPCIeBusInfo(logicID int32, dmgr devmanager.DeviceInterface, hwChip *HuaWeiAIChip) { + productTypes := dmgr.GetProductTypeArray() + pcieInfo, err := dmgr.GetPCIeBusInfo(logicID) + if err != nil { + if len(productTypes) == 1 && productTypes[0] == common.Atlas200ISoc { + logger.Debugf("pcie bus info is not supported on %s", common.Atlas200ISoc) + hwChip.PCIeBusInfo = "" + return + } + logger.Error(err) + pcieInfo = "" + } + hwChip.PCIeBusInfo = pcieInfo +} + +func setElabelInfo(chip *HuaWeiAIChip, dmgr devmanager.DeviceInterface, cardID int32) { + elabelInfo, err := dmgr.GetCardElabelV2(cardID) + if err != nil { + logger.Errorf("get elabel info of card: %v failed: %v", cardID, err) + 
chip.ElabelInfo = &common.ElabelInfo{SerialNumber: "NA"} + return + } + chip.ElabelInfo = &common.ElabelInfo{ + SerialNumber: elabelInfo.SerialNumber, + } +} + +func assemblevNPUInfo(dmgr devmanager.DeviceInterface, logicID int32, baseChipInfo *HuaWeiAIChip) { + if dmgr.GetDevType() != api.Ascend310P { + return + } + vDevInfos, err := dmgr.GetVirtualDeviceInfo(logicID) + if err != nil { + logger.Warnf("failed to get virtual device info,logicID(%d),err: %v", logicID, err) + baseChipInfo.VDevInfos = nil + } + if vDevInfos.TotalResource.VDevNum == 0 { + baseChipInfo.VDevInfos = &common.VirtualDevInfo{} + } + baseChipInfo.VDevInfos = &vDevInfos +} + +// GetChipListWithVNPU get chip list with vnpu +func GetChipListWithVNPU(n *NpuCollector) []HuaWeiAIChip { + result := make([]HuaWeiAIChip, 0) + chips := getChipListCache(n) + + for _, chipInfo := range chips { + isNeedHandleVnpu := n.Dmgr.GetDevType() == api.Ascend310P && chipInfo.VDevInfos != nil && + len(chipInfo.VDevInfos.VDevActivityInfo) > 0 + + if !isNeedHandleVnpu { + result = append(result, chipInfo) + continue + } + + for _, activityVDev := range chipInfo.VDevInfos.VDevActivityInfo { + vDevInfo := chipInfo + activityVDevCopy := activityVDev + vDevInfo.VDevActivityInfo = &activityVDevCopy + result = append(result, vDevInfo) + } + } + + return result + +} +func getChipListCache(n *NpuCollector) []HuaWeiAIChip { + obj, err := n.cache.Get(npuListCacheKey) + if err != nil { + logger.Errorf("get npu chip list from cache failed,err is : %v", err) + return make([]HuaWeiAIChip, 0) + } + if obj == nil { + logger.LogfWithOptions(logger.ErrorLevel, logger.LogOptions{Domain: "getChipListCache"}, + "there is no chip list info in cache,please check collect logs") + return make([]HuaWeiAIChip, 0) + } + + chipList, ok := obj.([]HuaWeiAIChip) + if !ok { + logger.Errorf("error npu chip info cache and convert failed,real type is (%T)", obj) + n.cache.Delete(npuListCacheKey) + return make([]HuaWeiAIChip, 0) + } + // if cache is empty 
or nil, return empty list + if len(chipList) == 0 { + return make([]HuaWeiAIChip, 0) + } + return chipList +} diff --git a/mind-cluster/component/npu-exporter/collector/common/npu_collector_test.go b/mind-cluster/component/npu-exporter/collector/common/npu_collector_test.go new file mode 100644 index 0000000..722079b --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/common/npu_collector_test.go @@ -0,0 +1,547 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2024. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
var (
	// mockErr generic error returned by patched device manager calls
	mockErr = errors.New("mockErr")
	// testError error returned by the patched GetCardElabelV2
	testError = errors.New(testErrorMsg)
)

const (
	cacheTime         = 60 * time.Second
	npuCount          = 8
	defaultUpdateTime = 10 * time.Millisecond
	num2              = 2
	num100            = 100
	mockKey           = "mockKey"
	mockValue         = "mockValue"

	// Test constants for setElabelInfo
	testCardID           = int32(1)
	testProductName      = "Atlas 900"
	testModel            = "Atlas-900-9000"
	testManufacturer     = "Huawei"
	testManufacturerDate = "2023-01-01"
	testSerialNumber     = "SN123456789"
	testDefaultSerial    = "NA"
	testErrorMsg         = "get elabel info failed"
)

// mockContainerRuntimeOperator is a stub runtime operator whose methods return
// empty values and no error.
type mockContainerRuntimeOperator struct{}

// Init implements ContainerRuntimeOperator
func (operator *mockContainerRuntimeOperator) Init() error {
	return nil
}

// Close implements ContainerRuntimeOperator
func (operator *mockContainerRuntimeOperator) Close() error {
	return nil
}

// GetContainers implements ContainerRuntimeOperator
func (operator *mockContainerRuntimeOperator) GetContainers(ctx context.Context) ([]*container.CommonContainer, error) {
	return []*container.CommonContainer{}, nil
}

// GetContainerInfoByID implements ContainerRuntimeOperator
func (operator *mockContainerRuntimeOperator) GetContainerInfoByID(ctx context.Context, id string) (v1.Spec, error) {
	return v1.Spec{}, nil
}

// GetIsulaContainerInfoByID implements ContainerRuntimeOperator
func (operator *mockContainerRuntimeOperator) GetIsulaContainerInfoByID(ctx context.Context,
	id string) (isula.ContainerJson, error) {
	return isula.ContainerJson{}, nil
}

// GetContainerType implements ContainerRuntimeOperator
func (operator *mockContainerRuntimeOperator) GetContainerType() string {
	return container.DefaultContainer
}

// mockScan4AscendDevices always reports device 1 with no error.
func mockScan4AscendDevices(_ string) ([]int, bool, error) {
	return []int{1}, true, nil
}

// mockGetCgroupPath always returns an empty path and no error.
func mockGetCgroupPath(controller, specCgroupsPath string) (string, error) {
	return "", nil
}

// makeMockDevicesParser builds a DevicesParser backed by mockContainerRuntimeOperator.
func makeMockDevicesParser() *container.DevicesParser {
	return &container.DevicesParser{
		RuntimeOperator: new(mockContainerRuntimeOperator),
	}
}

// newNpuCollectorTestCase bundles the arguments passed to NewNpuCollector.
type newNpuCollectorTestCase struct {
	cacheTime    time.Duration
	updateTime   time.Duration
	deviceParser *container.DevicesParser
	dmgr         *devmanager.DeviceManager
}

// TestNewNpuCollector test method of NewNpuCollector
func TestNewNpuCollector(t *testing.T) {
	tc := newNpuCollectorTestCase{
		cacheTime:    cacheTime,
		updateTime:   defaultUpdateTime,
		deviceParser: &container.DevicesParser{},
		dmgr:         &devmanager.DeviceManager{},
	}

	c := NewNpuCollector(tc.cacheTime, tc.updateTime, tc.deviceParser, tc.dmgr)

	assert.NotNil(t, c)
}

// testCase is a generic table entry; mockPart carries either a device manager
// mock or a setup function, depending on the test that uses it.
type testCase struct {
	name        string
	wantErr     bool
	mockPart    interface{}
	expectValue interface{}
	expectCount interface{}
}

// newTestCase builds a testCase with only name, wantErr and mockPart set.
func newTestCase(name string, wantErr bool, mockPart interface{}) testCase {
	return testCase{
		name:     name,
		wantErr:  wantErr,
		mockPart: mockPart,
	}
}

// TestGetChipInfo test method getNPUChipList
func TestGetChipInfo(t *testing.T) {
	tests := []testCase{
		newTestCase("should return chip info successfully when dsmi works normally", false,
			&devmanager.DeviceManagerMock{}),
		newTestCase("should return nil when dsmi works abnormally", true, &devmanager.DeviceManagerMockErr{}),
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			chipInfo := getNPUChipList(tt.mockPart.(devmanager.DeviceInterface))
			t.Logf("%#v", chipInfo)
			assert.NotNil(t, chipInfo)
			if tt.wantErr {
				assert.Len(t, chipInfo, 0)
			} else {
				// NOTE(review): repeats the NotNil assertion above, so the
				// success case does not check the returned chips.
				assert.NotNil(t, chipInfo)
			}
		})
	}
}

// init configures the logger to write to stdout only for all tests of this package.
func init() {
	logger.HwLogConfig = &hwlog.LogConfig{
		OnlyToStdout: true,
	}
	logger.InitLogger("Prometheus")
}

// mockGetNPUChipList returns npuCount chips whose ids all equal their index.
func mockGetNPUChipList() []HuaWeiAIChip {
	chips := make([]HuaWeiAIChip, 0)
	for id := int32(0); id < npuCount; id++ {
		chip := HuaWeiAIChip{
			CardId:   id,
			PhyId:    id,
			DeviceID: id,
			LogicID:  id,
		}

		chips = append(chips, chip)
	}
	return chips
}

// TestInitCardInfo test method InitCardInfo
func TestInitCardInfo(t *testing.T) {
	patches := gomonkey.ApplyFuncReturn(getNPUChipList, mockGetNPUChipList())
	defer patches.Reset()
	convey.Convey("test InitCardInfo", t, func() {

		ctx, cancelFunc := context.WithCancel(context.Background())
		defer cancelFunc()
		npuCollector := mockNewNpuCollector()

		InitCardInfo(&sync.WaitGroup{}, ctx, npuCollector)
		// give the background goroutine time for its first refresh
		time.Sleep(time.Millisecond * num100)
		cancelFunc()
		chips := getChipListCache(npuCollector)
		convey.So(len(chips), convey.ShouldEqual, npuCount)
	})
}

// TestGetChipListCache test method getChipListCache
func TestGetChipListCache(t *testing.T) {
	npuCollector := mockNewNpuCollector()
	tests := []testCase{
		{name: "should return 0 chips when cache is nil", wantErr: false, mockPart: func() {}, expectCount: 0},
		{name: "should return chips : " + strconv.Itoa(npuCount), expectCount: npuCount, wantErr: false,
			mockPart: func() { npuCollector.cache.Set(npuListCacheKey, mockGetNPUChipList(), cacheTime) }},
		{name: "should return 0 chips when cache value is nil", wantErr: false, expectCount: 0,
			mockPart: func() { npuCollector.cache.Set(npuListCacheKey, nil, cacheTime) }},
		{name: "should return 0 chips when value is a incorrect type", expectCount: 0, wantErr: false,
			mockPart: func() { npuCollector.cache.Set(npuListCacheKey, &HuaWeiAIChip{}, cacheTime) }},
		{name: "should return 0 chips when cache is empty", expectCount: 0, wantErr: false,
			mockPart: func() { npuCollector.cache.Set(npuListCacheKey, []HuaWeiAIChip{}, cacheTime) },
		},
	}

	convey.Convey("getChipListCache", t, func() {
		for _, tt := range tests {
			convey.Convey(tt.name, func() {
				// mockPart prepares the cache content for this case
				tt.mockPart.(func())()
				chips := getChipListCache(npuCollector)
				assert.Len(t, chips, tt.expectCount.(int))
				convey.So(len(chips), convey.ShouldEqual, tt.expectCount)
			})
		}
	})
}

// mockNewNpuCollector builds an NpuCollector with empty parser and device
// manager values and the test timings.
func mockNewNpuCollector() *NpuCollector {
	tc := newNpuCollectorTestCase{
		cacheTime:    cacheTime,
		updateTime:   defaultUpdateTime,
		deviceParser: &container.DevicesParser{},
		dmgr:         &devmanager.DeviceManager{},
	}
	c := NewNpuCollector(tc.cacheTime, tc.updateTime, tc.deviceParser, tc.dmgr)
	return c
}

// TestNpuChipInfoInitAtFirstTime checks that the first call stores the chip
// list returned by the patched getNPUChipList in the cache.
func TestNpuChipInfoInitAtFirstTime(t *testing.T) {
	n := mockNewNpuCollector()
	convey.Convey("TestNpuChipInfoInitAtFirstTime", t, func() {
		patches := gomonkey.NewPatches()
		defer patches.Reset()
		patches.ApplyFuncReturn(getNPUChipList, []HuaWeiAIChip{{CardId: 0}})
		// do test
		npuChipInfoInitAtFirstTime(n)
		// valid cache
		data, err := n.cache.Get(npuListCacheKey)
		convey.So(err, convey.ShouldBeNil)
		chips, ok := data.([]HuaWeiAIChip)
		convey.So(ok, convey.ShouldBeTrue)
		convey.So(len(chips), convey.ShouldEqual, 1)
	})
}

// patchCollectToCache replaces MetricsCollectorAdapter.CollectToCache with a
// stub that stores mockValue under mockKey, so tests can observe that it ran.
func patchCollectToCache() *gomonkey.Patches {
	return gomonkey.ApplyMethod(&MetricsCollectorAdapter{}, "CollectToCache",
		func(_ *MetricsCollectorAdapter, n *NpuCollector, chipList []HuaWeiAIChip) {
			n.cache.Set(mockKey, mockValue, n.cacheTime)
		})
}

// TestStartCollectForMultiGoroutine checks that the per-chip goroutine calls
// CollectToCache within one update interval.
func TestStartCollectForMultiGoroutine(t *testing.T) {
	n := mockNewNpuCollector()
	wg := sync.WaitGroup{}
	// overwrites the package level chain; it is not restored afterwards
	ChainForMultiGoroutine = []MetricsCollector{
		&MetricsCollectorAdapter{},
		&MetricsCollectorAdapter{},
	}
	patches := patchCollectToCache()
	defer patches.Reset()
	patches.ApplyFuncReturn(getChipListCache, []HuaWeiAIChip{createChip()})
	convey.Convey("TestStartCollectForMultiGoroutine", t, func() {
		ctx, cancel := context.WithCancel(context.Background())
		startCollectForMultiGoroutine(&wg, ctx, n)
		time.Sleep(n.updateTime)
		cancel()
		data, err := n.cache.Get(mockKey)
		convey.So(err, convey.ShouldBeNil)
		value, ok := data.(string)
		convey.So(ok, convey.ShouldBeTrue)
		convey.So(value, convey.ShouldEqual, mockValue)
	})
}

// TestRunChipCollector checks that runChipCollector calls CollectToCache when
// the ticker channel is already closed.
func TestRunChipCollector(t *testing.T) {
	n := mockNewNpuCollector()
	patches := patchCollectToCache()
	defer patches.Reset()
	convey.Convey("TestRunChipCollector", t, func() {
		ctx, cancel := context.WithCancel(context.Background())
		tickCh := make(chan time.Time)
		patches.ApplyFuncReturn(time.NewTicker, &time.Ticker{C: tickCh})
		// a closed channel makes the collector return after its first round
		close(tickCh)
		go runChipCollector(ctx, n, createChip())
		time.Sleep(n.updateTime)
		cancel()
		data, err := n.cache.Get(mockKey)
		convey.So(err, convey.ShouldBeNil)
		value, ok := data.(string)
		convey.So(ok, convey.ShouldBeTrue)
		convey.So(value, convey.ShouldEqual, mockValue)
	})
}

// TestStartCollectSingleGoroutine checks that the single collection goroutine
// calls CollectToCache within one update interval.
func TestStartCollectSingleGoroutine(t *testing.T) {
	n := mockNewNpuCollector()
	wg := sync.WaitGroup{}
	// overwrites the package level chain; it is not restored afterwards
	ChainForSingleGoroutine = []MetricsCollector{
		&MetricsCollectorAdapter{},
	}
	patches := patchCollectToCache()
	defer patches.Reset()
	convey.Convey("TestStartCollectSingleGoroutine", t, func() {
		ctx, cancel := context.WithCancel(context.Background())
		startCollectSingleGoroutine(&wg, ctx, n)
		time.Sleep(n.updateTime)
		cancel()
		data, err := n.cache.Get(mockKey)
		convey.So(err, convey.ShouldBeNil)
		value, ok := data.(string)
		convey.So(ok, convey.ShouldBeTrue)
		convey.So(value, convey.ShouldEqual, mockValue)
	})
}

// chipsCase describes one GetChipListWithVNPU scenario: the patched device
// type, a hook that prepares the cached chip and the expected result length.
type chipsCase struct {
	name        string
	devType     string
	buildChips  func()
	expectValue int
}

// TestGetChipListWithVNPU checks the number of returned chips for 310P with
// and without active virtual devices and for 910.
func TestGetChipListWithVNPU(t *testing.T) {
	n := mockNewNpuCollector()
	chip := HuaWeiAIChip{}
	tests := []chipsCase{
		{name: "TestGetChipListWithVNPU_310p_no_vnpu",
			devType: api.Ascend310P,
			buildChips: func() {
				chip = createChip()
			},
			expectValue: 1,
		},
		{name: "TestGetChipListWithVNPU_310p_2_vnpus",
			devType: api.Ascend310P,
			buildChips: func() {
				chip = createValidVnpuChip()
			},
			expectValue: num2,
		},
		{name: "TestGetChipListWithVNPU_910",
			devType: api.Ascend910,
			buildChips: func() {
				chip = createChip()
			},
			expectValue: 1,
		},
	}

	convey.Convey("TestGetChipListWithVNPU", t, func() {
		for _, tt := range tests {
			convey.Convey(tt.name, func() {
				tt.buildChips()
				patches := gomonkey.NewPatches()
				defer patches.Reset()
				patches.ApplyMethodReturn(n.Dmgr, "GetDevType", tt.devType)
				patches.ApplyFuncReturn(getChipListCache, []HuaWeiAIChip{chip})

				chips := GetChipListWithVNPU(n)
				convey.So(len(chips), convey.ShouldEqual, tt.expectValue)
			})
		}
	})
}

// createValidVnpuChip returns the chip of createChip with two active virtual devices.
func createValidVnpuChip() HuaWeiAIChip {
	chip := createChip()
	chip.VDevInfos = &common.VirtualDevInfo{
		VDevActivityInfo: []common.VDevActivityInfo{
			{
				VDevID:       0,
				VDevAiCore:   0,
				VDevTotalMem: 0,
				VDevUsedMem:  0,
				IsVirtualDev: true,
			},
			{
				VDevID:       1,
				VDevAiCore:   1,
				VDevTotalMem: 1,
				VDevUsedMem:  1,
				IsVirtualDev: true,
			},
		},
	}
	return chip
}

// createChip returns a chip with all ids 0 and chip info named api.Ascend910.
func createChip() HuaWeiAIChip {
	return HuaWeiAIChip{
		CardId:   0,
		PhyId:    0,
		DeviceID: 0,
		LogicID:  0,
		ChipInfo: &common.ChipInfo{
			Name:    api.Ascend910,
			Type:    "Ascend",
			Version: "V1",
		},
	}
}

// TestSetPCIeBusInfo checks that setPCIeBusInfo stores the bus info on success
// and "" on error, for a 910 product and for Atlas200ISoc.
func TestSetPCIeBusInfo(t *testing.T) {
	const mockPcieBus = "0000:01:00.0"
	tests := []struct {
		name         string
		productTypes []string
		err          error
		expectValue  string
	}{{
		name:         "TestSetPCIeBusInfo_910",
		productTypes: []string{api.Ascend910},
		err:          nil,
		expectValue:  mockPcieBus,
	}, {
		name:         "TestSetPCIeBusInfo_910_err",
		productTypes: []string{api.Ascend910},
		err:          mockErr,
		expectValue:  "",
	}, {
		name:         "TestSetPCIeBusInfo_Atlas200ISoc",
		productTypes: []string{common.Atlas200ISoc},
		err:          nil,
		expectValue:  mockPcieBus,
	}, {
		name:         "TestSetPCIeBusInfo_Atlas200ISoc_err",
		productTypes: []string{common.Atlas200ISoc},
		err:          mockErr,
		expectValue:  "",
	}}
	chip := createChip()
	convey.Convey("TestSetPCIeBusInfo", t, func() {
		for _, tt := range tests {
			convey.Convey(tt.name, func() {
				dmgr := &devmanager.DeviceManager{ProductTypes: tt.productTypes}
				patches := gomonkey.NewPatches()
				defer patches.Reset()
				patches.ApplyMethodReturn(dmgr, "GetPCIeBusInfo", mockPcieBus, tt.err)

				setPCIeBusInfo(0, dmgr, &chip)
				convey.So(chip.PCIeBusInfo, convey.ShouldEqual, tt.expectValue)
			})
		}
	})
}

// setElabelInfoTestCase describes one setElabelInfo scenario. Only
// expectSerial is verified by executeSetElabelInfoTest; the other expect*
// fields are not checked.
type setElabelInfoTestCase struct {
	name                   string
	cardID                 int32
	mockElabelInfo         common.ElabelInfo
	mockError              error
	expectSerial           string
	expectProduct          string
	expectModel            string
	expectManufacturer     string
	expectManufacturerDate string
}

// createSetElabelInfoTestCases builds the success and the error scenario for setElabelInfo.
func createSetElabelInfoTestCases() []setElabelInfoTestCase {
	return []setElabelInfoTestCase{
		{
			name:   "should set elabel info successfully when GetCardElabelV2 returns valid data",
			cardID: testCardID,
			mockElabelInfo: common.ElabelInfo{
				ProductName:      testProductName,
				Model:            testModel,
				Manufacturer:     testManufacturer,
				ManufacturerDate: testManufacturerDate,
				SerialNumber:     testSerialNumber,
			},
			mockError:              nil,
			expectSerial:           testSerialNumber,
			expectProduct:          testProductName,
			expectModel:            testModel,
			expectManufacturer:     testManufacturer,
			expectManufacturerDate: testManufacturerDate,
		},
		{
			name:                   "should set default elabel info when GetCardElabelV2 returns error",
			cardID:                 testCardID,
			mockElabelInfo:         common.ElabelInfo{},
			mockError:              testError,
			expectSerial:           testDefaultSerial,
			expectProduct:          "",
			expectModel:            "",
			expectManufacturer:     "",
			expectManufacturerDate: "",
		},
	}
}

// executeSetElabelInfoTest runs setElabelInfo against a patched GetCardElabelV2
// and verifies the resulting serial number. It must be called inside a convey context.
func executeSetElabelInfoTest(tc setElabelInfoTestCase) {
	// Create mock device manager
	mockDmgr := &devmanager.DeviceManager{}

	// Create test chip
	chip := &HuaWeiAIChip{}

	// Apply gomonkey patches
	patches := gomonkey.NewPatches()
	defer patches.Reset()

	patches.ApplyMethodReturn(mockDmgr, "GetCardElabelV2",
		tc.mockElabelInfo, tc.mockError)

	// Execute the function under test
	setElabelInfo(chip, mockDmgr, tc.cardID)

	// Verify results
	convey.So(chip.ElabelInfo, convey.ShouldNotBeNil)
	convey.So(chip.ElabelInfo.SerialNumber, convey.ShouldEqual, tc.expectSerial)
}

// TestSetElabelInfo test setElabelInfo method
func TestSetElabelInfo(t *testing.T) {
	testCases := createSetElabelInfoTestCases()

	convey.Convey("TestSetElabelInfo", t, func() {
		for _, tc := range testCases {
			convey.Convey(tc.name, func() {
				executeSetElabelInfoTest(tc)
			})
		}
	})
}
+*/ + +// Package common for collector +package common + +import ( + "ascend-common/devmanager/common" +) + +// HuaWeiAIChip chip info +type HuaWeiAIChip struct { + + // CardId npu card id + CardId int32 `json:"card_id"` + // PhyId npu chip phy id + PhyId int32 `json:"phy_id"` + // DeviceID the chip physical ID + DeviceID int32 `json:"device_id"` + // LogicID the chip logic ID + LogicID int32 `json:"logic_id"` + // VDieID the vdie id + VDieID string `json:"vdie_id"` + // MainBoardId main board id, used to distinguish between A900A3SuperPod and A9000A3SuperPod + MainBoardId uint32 + // ChipInfo the chip info + ChipInfo *common.ChipInfo `json:"chip_info"` + // BoardInfo board info of device, but not displayed + BoardInfo *common.BoardInfo + + // VDevActivityInfo the activity info of the virtual device + VDevActivityInfo *common.VDevActivityInfo `json:"v_dev_activity_info"` + // VDevInfos the virtual device info + VDevInfos *common.VirtualDevInfo `json:"v_dev_infos"` + // PCIeBusInfo bus info + PCIeBusInfo string + // ElabelInfo elabel info including SN + ElabelInfo *common.ElabelInfo `json:"elabel_info"` +} diff --git a/mind-cluster/component/npu-exporter/collector/config/metrics_config.go b/mind-cluster/component/npu-exporter/collector/config/metrics_config.go new file mode 100644 index 0000000..be32832 --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/config/metrics_config.go @@ -0,0 +1,208 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package config for general collector +package config + +import ( + "encoding/json" + "fmt" + "reflect" + + "huawei.com/npu-exporter/v6/collector/common" + "huawei.com/npu-exporter/v6/collector/metrics" + "huawei.com/npu-exporter/v6/utils/logger" + + "ascend-common/common-utils/utils" +) + +var ( + // singleGoroutineMap maps metrics group names to the collectors that Register appends to common.ChainForSingleGoroutine + singleGoroutineMap = map[string]common.MetricsCollector{ + groupHccs: &metrics.HccsCollector{}, + groupNpu: &metrics.BaseInfoCollector{}, + groupSio: &metrics.SioCollector{}, + groupVersion: &metrics.VersionCollector{}, + groupHbm: &metrics.HbmCollector{}, + groupDDR: &metrics.DdrCollector{}, + groupVnpu: &metrics.VnpuCollector{}, + groupPcie: &metrics.PcieCollector{}, + } + // multiGoroutineMap maps metrics group names to the collectors that Register appends to common.ChainForMultiGoroutine + multiGoroutineMap = map[string]common.MetricsCollector{ + groupNetwork: &metrics.NetworkCollector{}, + groupRoce: &metrics.RoceCollector{}, + groupOptical: &metrics.OpticalCollector{}, + } + // pluginCollectorMap maps plugin names to the collectors added by AddPluginCollector, which Register appends to common.ChainForCustomPlugin + pluginCollectorMap = map[string]common.MetricsCollector{} + presetConfigs = make([]map[string]string, 0) + pluginConfigs = make([]map[string]string, 0) + + defaultPresetConfigs = []map[string]string{ + {metricsGroup: groupDDR, state: stateOn}, + {metricsGroup: groupHccs, state: stateOn}, + {metricsGroup: groupNpu, state: stateOn}, + {metricsGroup: groupNetwork, state: stateOn}, + {metricsGroup: groupPcie, state: stateOn}, + {metricsGroup: groupRoce, state: stateOn}, + {metricsGroup: groupSio, state: stateOn}, + {metricsGroup: groupVnpu, state: stateOn}, + {metricsGroup: groupVersion, state: stateOn}, + {metricsGroup: groupOptical, state: stateOn}, + {metricsGroup: groupHbm, state: stateOn}, + } + defaultPluginConfigs = []map[string]string{ + {metricsGroup: 
groupText, state: stateOn}, + } +) + +const ( + metricsGroup = "metricsGroup" + state = "state" + + groupDDR = "ddr" + groupHccs = "hccs" + groupNpu = "npu" + groupNetwork = "network" + groupPcie = "pcie" + groupRoce = "roce" + groupSio = "sio" + groupVnpu = "vnpu" + groupVersion = "version" + groupOptical = "optical" + groupHbm = "hbm" + groupText = "text" + + stateOn = "ON" + stateOFF = "OFF" +) + +const ( + PresetConfigPath = "/usr/local/metricConfiguration.json" + PluginConfigPath = "/usr/local/pluginConfiguration.json" +) + +func loadConfiguration() { + if fileBytes := loadFromFile(PresetConfigPath); fileBytes == nil { + logger.Warnf("load config from file %s failed, use default config", PresetConfigPath) + presetConfigs = defaultPresetConfigs + } else { + initConfiguration(fileBytes, &presetConfigs) + } + if fileBytes := loadFromFile(PluginConfigPath); fileBytes == nil { + logger.Warnf("load config from file %s failed, use default config", PluginConfigPath) + pluginConfigs = defaultPluginConfigs + } else { + initConfiguration(fileBytes, &pluginConfigs) + } +} + +func loadFromFile(filePath string) []byte { + fileBytes, err := utils.LoadFile(filePath) + if err != nil { + return nil + } + return fileBytes +} + +func initConfiguration(fileBytes []byte, configs *[]map[string]string) { + if err := json.Unmarshal(fileBytes, configs); err != nil { + logger.Errorf("unmarshal config byte failed: %v", err) + return + } +} + +// AddPluginCollector stores collector in pluginCollectorMap under name and returns an error if name is already taken +func AddPluginCollector(name string, collector common.MetricsCollector) error { + if _, exist := pluginCollectorMap[name]; exist { + logger.Errorf("plugin collector %v already exist", name) + return fmt.Errorf("plugin collector %v already exist", name) + } + logger.Infof("add plugin collector %v ok", name) + pluginCollectorMap[name] = collector + return nil +} + +// DeletePluginCollector removes the collector stored under name from pluginCollectorMap and only logs a warning if it is absent +func DeletePluginCollector(name string) { + if _, exist := 
pluginCollectorMap[name]; !exist { + logger.Warnf("plugin collector %v does not exist", name) + return + } + logger.Infof("delete plugin collector %v ok", name) + delete(pluginCollectorMap, name) +} + +// Register loads the configuration and appends every enabled collector that supports n to its collection chain +func Register(n *common.NpuCollector) { + loadConfiguration() + + for _, config := range presetConfigs { + metricsGroupName := config[metricsGroup] + + if config[state] != stateOn { + logger.Infof("metricsGroup [%v] is off", metricsGroupName) + continue + } + logger.Infof("metricsGroup [%v] is on", metricsGroupName) + collector, exist := singleGoroutineMap[metricsGroupName] + if exist && collector.IsSupported(n) { + common.ChainForSingleGoroutine = append(common.ChainForSingleGoroutine, collector) + } + + collector, exist = multiGoroutineMap[metricsGroupName] + if exist && collector.IsSupported(n) { + common.ChainForMultiGoroutine = append(common.ChainForMultiGoroutine, collector) + } + } + + for _, config := range pluginConfigs { + metricsGroupName := config[metricsGroup] + + if config[state] != stateOn { + logger.Infof("plugin collector [%v] is off", metricsGroupName) + continue + } + logger.Infof("plugin collector [%v] is on", metricsGroupName) + collector, exist := pluginCollectorMap[metricsGroupName] + if exist && collector.IsSupported(n) { + logger.Infof("add plugin collector:%v", metricsGroupName) + common.ChainForCustomPlugin = append(common.ChainForCustomPlugin, collector) + } + + } + + logger.Infof("ChainForSingleGoroutine:%#v", common.ChainForSingleGoroutine) + logger.Infof("ChainForMultiGoroutine:%#v", common.ChainForMultiGoroutine) + logger.Infof("ChainForCustomPlugin:%#v", common.ChainForCustomPlugin) +} + +// UnRegister removes every collector whose dynamic type equals worker from all three collection chains +func UnRegister(worker reflect.Type) { + logger.Debugf("unRegister collector:%v", worker) + unRegisterChain(worker, &common.ChainForSingleGoroutine) + unRegisterChain(worker, &common.ChainForMultiGoroutine) + unRegisterChain(worker, &common.ChainForCustomPlugin) +} + 
+func unRegisterChain(worker reflect.Type, chain *[]common.MetricsCollector) { + newChain := make([]common.MetricsCollector, 0) + for _, collector := range *chain { + if reflect.TypeOf(collector) != worker { + newChain = append(newChain, collector) + } + } + *chain = newChain +} diff --git a/mind-cluster/component/npu-exporter/collector/config/metrics_config_test.go b/mind-cluster/component/npu-exporter/collector/config/metrics_config_test.go new file mode 100644 index 0000000..974ed3e --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/config/metrics_config_test.go @@ -0,0 +1,216 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package config for general collector +package config + +import ( + "ascend-common/common-utils/utils" + "reflect" + "testing" + + "github.com/agiledragon/gomonkey/v2" + "github.com/smartystreets/goconvey/convey" + + "ascend-common/common-utils/hwlog" + "huawei.com/npu-exporter/v6/collector/common" + "huawei.com/npu-exporter/v6/collector/metrics" + "huawei.com/npu-exporter/v6/utils/logger" +) + +func init() { + logger.HwLogConfig = &hwlog.LogConfig{ + OnlyToStdout: true, + } + logger.InitLogger("Prometheus") + initChain() +} + +func initChain() { + common.ChainForSingleGoroutine = []common.MetricsCollector{} + common.ChainForMultiGoroutine = []common.MetricsCollector{} +} + +func TestInitConfiguration(t *testing.T) { + convey.Convey("TestInitConfiguration", t, func() { + initConfiguration([]byte("test"), &presetConfigs) + convey.So(len(presetConfigs), convey.ShouldEqual, 0) + }) +} + +func TestLoadConfiguration(t *testing.T) { + convey.Convey("TestLoadConfiguration", t, func() { + patches := gomonkey.NewPatches() + defer patches.Reset() + convey.Convey("load config ok", func() { + patches.ApplyFunc(loadFromFile, func(filePath string) []byte { + if filePath == PresetConfigPath { + filePath = "../../build/metricConfiguration.json" + } else if filePath == PluginConfigPath { + filePath = "../../build/pluginConfiguration.json" + } + fileBytes, _ := utils.LoadFile(filePath) + return fileBytes + }) + defer func() { + presetConfigs = make([]map[string]string, 0) + pluginConfigs = make([]map[string]string, 0) + }() + loadConfiguration() + convey.So(len(presetConfigs), convey.ShouldBeGreaterThan, 0) + convey.So(len(pluginConfigs), convey.ShouldBeGreaterThan, 0) + }) + convey.Convey("load config fail", func() { + presetConfigs = make([]map[string]string, 0) + pluginConfigs = make([]map[string]string, 0) + patches.ApplyFunc(loadFromFile, func(filePath string) []byte { + return nil + }) + loadConfiguration() + convey.So(len(presetConfigs), convey.ShouldEqual, 
len(defaultPresetConfigs)) + convey.So(len(pluginConfigs), convey.ShouldEqual, len(defaultPluginConfigs)) + }) + }) +} + +func TestAddPluginCollector(t *testing.T) { + convey.Convey("TestAddPluginCollector", t, func() { + convey.Convey("add plugin ok", func() { + pluginCollectorMap = make(map[string]common.MetricsCollector) + defer func() { + pluginCollectorMap = make(map[string]common.MetricsCollector) + }() + err := AddPluginCollector("test", &metrics.HccsCollector{}) + convey.So(err, convey.ShouldBeNil) + }) + convey.Convey("add plugin fail", func() { + pluginCollectorMap["test"] = &metrics.HccsCollector{} + defer func() { + pluginCollectorMap = make(map[string]common.MetricsCollector) + }() + err := AddPluginCollector("test", &metrics.HccsCollector{}) + convey.So(err, convey.ShouldNotBeNil) + }) + }) +} + +func TestDeletePluginCollector(t *testing.T) { + convey.Convey("TestDeletePluginCollector", t, func() { + convey.Convey("delete plugin ok", func() { + pluginCollectorMap["test"] = &metrics.HccsCollector{} + DeletePluginCollector("test") + convey.So(pluginCollectorMap["test"], convey.ShouldBeNil) + }) + convey.Convey("delete plugin fail", func() { + pluginCollectorMap = make(map[string]common.MetricsCollector) + DeletePluginCollector("test") + convey.So(len(pluginCollectorMap), convey.ShouldEqual, 0) + }) + }) +} + +func TestRegister(t *testing.T) { + convey.Convey("TestRegister", t, func() { + n := &common.NpuCollector{} + patches := gomonkey.NewPatches() + defer patches.Reset() + // Mock IsSupported method to always return true + patches.ApplyMethodReturn(&metrics.HccsCollector{}, "IsSupported", true) + patches.ApplyMethodReturn(&metrics.BaseInfoCollector{}, "IsSupported", true) + patches.ApplyMethodReturn(&metrics.SioCollector{}, "IsSupported", true) + patches.ApplyMethodReturn(&metrics.VersionCollector{}, "IsSupported", true) + patches.ApplyMethodReturn(&metrics.HbmCollector{}, "IsSupported", true) + patches.ApplyMethodReturn(&metrics.DdrCollector{}, 
"IsSupported", true) + patches.ApplyMethodReturn(&metrics.VnpuCollector{}, "IsSupported", true) + patches.ApplyMethodReturn(&metrics.PcieCollector{}, "IsSupported", true) + patches.ApplyMethodReturn(&metrics.NetworkCollector{}, "IsSupported", true) + patches.ApplyMethodReturn(&metrics.RoceCollector{}, "IsSupported", true) + patches.ApplyMethodReturn(&metrics.OpticalCollector{}, "IsSupported", true) + patches.ApplyFunc(loadConfiguration, func() { + initConfiguration(loadFromFile("../../build/metricConfiguration.json"), &presetConfigs) + initConfiguration(loadFromFile("../../build/pluginConfiguration.json"), &pluginConfigs) + }) + Register(n) + convey.Convey("Should add collectors to ChainForSingleGoroutine", func() { + convey.So(len(common.ChainForSingleGoroutine), convey.ShouldBeGreaterThan, 0) + }) + convey.Convey("Should add collectors to ChainForMultiGoroutine", func() { + convey.So(len(common.ChainForMultiGoroutine), convey.ShouldBeGreaterThan, 0) + }) + }) +} + +func TestUnRegister(t *testing.T) { + convey.Convey("TestUnRegister", t, func() { + // Initialize chains with some collectors + common.ChainForSingleGoroutine = []common.MetricsCollector{ + &metrics.HccsCollector{}, + &metrics.BaseInfoCollector{}, + } + common.ChainForMultiGoroutine = []common.MetricsCollector{ + &metrics.NetworkCollector{}, + &metrics.RoceCollector{}, + } + + convey.Convey("When UnRegister is called with HccsCollector type", func() { + UnRegister(reflect.TypeOf(&metrics.HccsCollector{})) + + convey.Convey("Should remove HccsCollector from ChainForSingleGoroutine", func() { + expected := []common.MetricsCollector{ + &metrics.BaseInfoCollector{}, + } + convey.So(len(common.ChainForSingleGoroutine), convey.ShouldEqual, len(expected)) + for i, collector := range common.ChainForSingleGoroutine { + convey.So(reflect.TypeOf(collector), convey.ShouldEqual, reflect.TypeOf(expected[i])) + } + }) + + convey.Convey("Should not affect ChainForMultiGoroutine", func() { + expected := 
[]common.MetricsCollector{ + &metrics.NetworkCollector{}, + &metrics.RoceCollector{}, + } + convey.So(len(common.ChainForMultiGoroutine), convey.ShouldEqual, len(expected)) + for i, collector := range common.ChainForMultiGoroutine { + convey.So(reflect.TypeOf(collector), convey.ShouldEqual, reflect.TypeOf(expected[i])) + } + }) + }) + }) +} + +func TestUnRegisterChain(t *testing.T) { + convey.Convey("TestUnRegisterChain", t, func() { + // Initialize a chain with some collectors + chain := []common.MetricsCollector{ + &metrics.HccsCollector{}, + &metrics.BaseInfoCollector{}, + &metrics.NetworkCollector{}, + } + + convey.Convey("When unRegisterChain is called with BaseInfoCollector type", func() { + unRegisterChain(reflect.TypeOf(&metrics.BaseInfoCollector{}), &chain) + convey.Convey("Should remove BaseInfoCollector from the chain", func() { + expected := []common.MetricsCollector{ + &metrics.HccsCollector{}, + &metrics.NetworkCollector{}, + } + convey.So(len(chain), convey.ShouldEqual, len(expected)) + for i, collector := range chain { + convey.So(reflect.TypeOf(collector), convey.ShouldEqual, reflect.TypeOf(expected[i])) + } + }) + }) + }) +} diff --git a/mind-cluster/component/npu-exporter/collector/container/isula/isula_api.pb.go b/mind-cluster/component/npu-exporter/collector/container/isula/isula_api.pb.go new file mode 100644 index 0000000..5ee3c7f --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/container/isula/isula_api.pb.go @@ -0,0 +1,870 @@ +// +//Copyright 2018 The Kubernetes Authors. +//Copyright (c) Huawei Technologies Co., Ltd. 2019. All rights reserved. +//modification description: removed unused options, for example: +//remove import "github.com/gogo/protobuf/gogoproto/gogo.proto" +// +//Licensed under the Apache License, Version 2.0 (the "License"); +//you may not use this file except in compliance with the License. 
+//You may obtain a copy of the License at +// +//http://www.apache.org/licenses/LICENSE-2.0 +// +//Unless required by applicable law or agreed to in writing, software +//distributed under the License is distributed on an "AS IS" BASIS, +//WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//See the License for the specific language governing permissions and +//limitations under the License. + +// To regenerate api.pb.go run hack/update-generated-runtime.sh + +// Code generated by protoc-gen-go. DO NOT EDIT. +// versions: +// protoc-gen-go v1.28.1 +// protoc v3.13.0 +// source: isula_api.proto + +package isula + +import ( + protoreflect "google.golang.org/protobuf/reflect/protoreflect" + protoimpl "google.golang.org/protobuf/runtime/protoimpl" + reflect "reflect" + sync "sync" +) + +const ( + // Verify that this generated code is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) + // Verify that runtime/protoimpl is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) +) + +type ContainerState int32 + +const ( + ContainerState_CONTAINER_CREATED ContainerState = 0 + ContainerState_CONTAINER_RUNNING ContainerState = 1 + ContainerState_CONTAINER_EXITED ContainerState = 2 + ContainerState_CONTAINER_UNKNOWN ContainerState = 3 +) + +// Enum value maps for ContainerState. 
+var ( + ContainerState_name = map[int32]string{ + 0: "CONTAINER_CREATED", + 1: "CONTAINER_RUNNING", + 2: "CONTAINER_EXITED", + 3: "CONTAINER_UNKNOWN", + } + ContainerState_value = map[string]int32{ + "CONTAINER_CREATED": 0, + "CONTAINER_RUNNING": 1, + "CONTAINER_EXITED": 2, + "CONTAINER_UNKNOWN": 3, + } +) + +func (x ContainerState) Enum() *ContainerState { + p := new(ContainerState) + *p = x + return p +} + +func (x ContainerState) String() string { + return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) +} + +func (ContainerState) Descriptor() protoreflect.EnumDescriptor { + return file_isula_api_proto_enumTypes[0].Descriptor() +} + +func (ContainerState) Type() protoreflect.EnumType { + return &file_isula_api_proto_enumTypes[0] +} + +func (x ContainerState) Number() protoreflect.EnumNumber { + return protoreflect.EnumNumber(x) +} + +// Deprecated: Use ContainerState.Descriptor instead. +func (ContainerState) EnumDescriptor() ([]byte, []int) { + return file_isula_api_proto_rawDescGZIP(), []int{0} +} + +// ImageSpec is an internal representation of an image. Currently, it wraps the +// value of a Container's Image field (e.g. imageID or imageDigest), but in the +// future it will include more detailed information about the different image types. +type ImageSpec struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Image string `protobuf:"bytes,1,opt,name=image,proto3" json:"image,omitempty"` + // Unstructured key-value map holding arbitrary metadata. + // ImageSpec Annotations can be used to help the runtime target specific + // images in multi-arch images. 
+ Annotations map[string]string `protobuf:"bytes,2,rep,name=annotations,proto3" json:"annotations,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` +} + +func (x *ImageSpec) Reset() { + *x = ImageSpec{} + if protoimpl.UnsafeEnabled { + mi := &file_isula_api_proto_msgTypes[0] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ImageSpec) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ImageSpec) ProtoMessage() {} + +func (x *ImageSpec) ProtoReflect() protoreflect.Message { + mi := &file_isula_api_proto_msgTypes[0] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ImageSpec.ProtoReflect.Descriptor instead. +func (*ImageSpec) Descriptor() ([]byte, []int) { + return file_isula_api_proto_rawDescGZIP(), []int{0} +} + +func (x *ImageSpec) GetImage() string { + if x != nil { + return x.Image + } + return "" +} + +func (x *ImageSpec) GetAnnotations() map[string]string { + if x != nil { + return x.Annotations + } + return nil +} + +// ContainerMetadata holds all necessary information for building the container +// name. The container runtime is encouraged to expose the metadata in its user +// interface for better user experience. E.g., runtime can construct a unique +// container name based on the metadata. Note that (name, attempt) is unique +// within a sandbox for the entire lifetime of the sandbox. +type ContainerMetadata struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + // Name of the container. Same as the container name in the PodSpec. + Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"` + // Attempt number of creating the container. Default: 0. 
+ Attempt uint32 `protobuf:"varint,2,opt,name=attempt,proto3" json:"attempt,omitempty"` +} + +func (x *ContainerMetadata) Reset() { + *x = ContainerMetadata{} + if protoimpl.UnsafeEnabled { + mi := &file_isula_api_proto_msgTypes[1] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ContainerMetadata) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ContainerMetadata) ProtoMessage() {} + +func (x *ContainerMetadata) ProtoReflect() protoreflect.Message { + mi := &file_isula_api_proto_msgTypes[1] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ContainerMetadata.ProtoReflect.Descriptor instead. +func (*ContainerMetadata) Descriptor() ([]byte, []int) { + return file_isula_api_proto_rawDescGZIP(), []int{1} +} + +func (x *ContainerMetadata) GetName() string { + if x != nil { + return x.Name + } + return "" +} + +func (x *ContainerMetadata) GetAttempt() uint32 { + if x != nil { + return x.Attempt + } + return 0 +} + +// ContainerStateValue is the wrapper of ContainerState. +type ContainerStateValue struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + // State of the container. 
+ State ContainerState `protobuf:"varint,1,opt,name=state,proto3,enum=runtime.v1alpha2.ContainerState" json:"state,omitempty"` +} + +func (x *ContainerStateValue) Reset() { + *x = ContainerStateValue{} + if protoimpl.UnsafeEnabled { + mi := &file_isula_api_proto_msgTypes[2] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ContainerStateValue) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ContainerStateValue) ProtoMessage() {} + +func (x *ContainerStateValue) ProtoReflect() protoreflect.Message { + mi := &file_isula_api_proto_msgTypes[2] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ContainerStateValue.ProtoReflect.Descriptor instead. +func (*ContainerStateValue) Descriptor() ([]byte, []int) { + return file_isula_api_proto_rawDescGZIP(), []int{2} +} + +func (x *ContainerStateValue) GetState() ContainerState { + if x != nil { + return x.State + } + return ContainerState_CONTAINER_CREATED +} + +// ContainerFilter is used to filter containers. +// All those fields are combined with 'AND' +type ContainerFilter struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + // ID of the container. + Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"` + // State of the container. + State *ContainerStateValue `protobuf:"bytes,2,opt,name=state,proto3" json:"state,omitempty"` + // ID of the PodSandbox. + PodSandboxId string `protobuf:"bytes,3,opt,name=pod_sandbox_id,json=podSandboxId,proto3" json:"pod_sandbox_id,omitempty"` + // LabelSelector to select matches. + // Only api.MatchLabels is supported for now and the requirements + // are ANDed. MatchExpressions is not supported yet. 
+ LabelSelector map[string]string `protobuf:"bytes,4,rep,name=label_selector,json=labelSelector,proto3" json:"label_selector,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` +} + +func (x *ContainerFilter) Reset() { + *x = ContainerFilter{} + if protoimpl.UnsafeEnabled { + mi := &file_isula_api_proto_msgTypes[3] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ContainerFilter) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ContainerFilter) ProtoMessage() {} + +func (x *ContainerFilter) ProtoReflect() protoreflect.Message { + mi := &file_isula_api_proto_msgTypes[3] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ContainerFilter.ProtoReflect.Descriptor instead. +func (*ContainerFilter) Descriptor() ([]byte, []int) { + return file_isula_api_proto_rawDescGZIP(), []int{3} +} + +func (x *ContainerFilter) GetId() string { + if x != nil { + return x.Id + } + return "" +} + +func (x *ContainerFilter) GetState() *ContainerStateValue { + if x != nil { + return x.State + } + return nil +} + +func (x *ContainerFilter) GetPodSandboxId() string { + if x != nil { + return x.PodSandboxId + } + return "" +} + +func (x *ContainerFilter) GetLabelSelector() map[string]string { + if x != nil { + return x.LabelSelector + } + return nil +} + +type ListContainersRequest struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Filter *ContainerFilter `protobuf:"bytes,1,opt,name=filter,proto3" json:"filter,omitempty"` +} + +func (x *ListContainersRequest) Reset() { + *x = ListContainersRequest{} + if protoimpl.UnsafeEnabled { + mi := &file_isula_api_proto_msgTypes[4] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + 
ms.StoreMessageInfo(mi) + } +} + +func (x *ListContainersRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ListContainersRequest) ProtoMessage() {} + +func (x *ListContainersRequest) ProtoReflect() protoreflect.Message { + mi := &file_isula_api_proto_msgTypes[4] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ListContainersRequest.ProtoReflect.Descriptor instead. +func (*ListContainersRequest) Descriptor() ([]byte, []int) { + return file_isula_api_proto_rawDescGZIP(), []int{4} +} + +func (x *ListContainersRequest) GetFilter() *ContainerFilter { + if x != nil { + return x.Filter + } + return nil +} + +// Container provides the runtime information for a container, such as ID, hash, +// state of the container. +type Container struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + // ID of the container, used by the container runtime to identify + // a container. + Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"` + // ID of the sandbox to which this container belongs. + PodSandboxId string `protobuf:"bytes,2,opt,name=pod_sandbox_id,json=podSandboxId,proto3" json:"pod_sandbox_id,omitempty"` + // Metadata of the container. + Metadata *ContainerMetadata `protobuf:"bytes,3,opt,name=metadata,proto3" json:"metadata,omitempty"` + // Spec of the image. + Image *ImageSpec `protobuf:"bytes,4,opt,name=image,proto3" json:"image,omitempty"` + // Reference to the image in use. For most runtimes, this should be an + // image ID. + ImageRef string `protobuf:"bytes,5,opt,name=image_ref,json=imageRef,proto3" json:"image_ref,omitempty"` + // State of the container. 
+ State ContainerState `protobuf:"varint,6,opt,name=state,proto3,enum=runtime.v1alpha2.ContainerState" json:"state,omitempty"` + // Creation time of the container in nanoseconds. + CreatedAt int64 `protobuf:"varint,7,opt,name=created_at,json=createdAt,proto3" json:"created_at,omitempty"` + // Key-value pairs that may be used to scope and select individual resources. + Labels map[string]string `protobuf:"bytes,8,rep,name=labels,proto3" json:"labels,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` + // Unstructured key-value map holding arbitrary metadata. + // Annotations MUST NOT be altered by the runtime; the value of this field + // MUST be identical to that of the corresponding ContainerConfig used to + // instantiate this Container. + Annotations map[string]string `protobuf:"bytes,9,rep,name=annotations,proto3" json:"annotations,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` +} + +func (x *Container) Reset() { + *x = Container{} + if protoimpl.UnsafeEnabled { + mi := &file_isula_api_proto_msgTypes[5] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *Container) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*Container) ProtoMessage() {} + +func (x *Container) ProtoReflect() protoreflect.Message { + mi := &file_isula_api_proto_msgTypes[5] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use Container.ProtoReflect.Descriptor instead. 
+func (*Container) Descriptor() ([]byte, []int) { + return file_isula_api_proto_rawDescGZIP(), []int{5} +} + +func (x *Container) GetId() string { + if x != nil { + return x.Id + } + return "" +} + +func (x *Container) GetPodSandboxId() string { + if x != nil { + return x.PodSandboxId + } + return "" +} + +func (x *Container) GetMetadata() *ContainerMetadata { + if x != nil { + return x.Metadata + } + return nil +} + +func (x *Container) GetImage() *ImageSpec { + if x != nil { + return x.Image + } + return nil +} + +func (x *Container) GetImageRef() string { + if x != nil { + return x.ImageRef + } + return "" +} + +func (x *Container) GetState() ContainerState { + if x != nil { + return x.State + } + return ContainerState_CONTAINER_CREATED +} + +func (x *Container) GetCreatedAt() int64 { + if x != nil { + return x.CreatedAt + } + return 0 +} + +func (x *Container) GetLabels() map[string]string { + if x != nil { + return x.Labels + } + return nil +} + +func (x *Container) GetAnnotations() map[string]string { + if x != nil { + return x.Annotations + } + return nil +} + +type ListContainersResponse struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + // List of containers. 
+ Containers []*Container `protobuf:"bytes,1,rep,name=containers,proto3" json:"containers,omitempty"` +} + +func (x *ListContainersResponse) Reset() { + *x = ListContainersResponse{} + if protoimpl.UnsafeEnabled { + mi := &file_isula_api_proto_msgTypes[6] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ListContainersResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ListContainersResponse) ProtoMessage() {} + +func (x *ListContainersResponse) ProtoReflect() protoreflect.Message { + mi := &file_isula_api_proto_msgTypes[6] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ListContainersResponse.ProtoReflect.Descriptor instead. +func (*ListContainersResponse) Descriptor() ([]byte, []int) { + return file_isula_api_proto_rawDescGZIP(), []int{6} +} + +func (x *ListContainersResponse) GetContainers() []*Container { + if x != nil { + return x.Containers + } + return nil +} + +var File_isula_api_proto protoreflect.FileDescriptor + +var file_isula_api_proto_rawDesc = []byte{ + 0x0a, 0x0f, 0x69, 0x73, 0x75, 0x6c, 0x61, 0x5f, 0x61, 0x70, 0x69, 0x2e, 0x70, 0x72, 0x6f, 0x74, + 0x6f, 0x12, 0x10, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, + 0x68, 0x61, 0x32, 0x22, 0xb1, 0x01, 0x0a, 0x09, 0x49, 0x6d, 0x61, 0x67, 0x65, 0x53, 0x70, 0x65, + 0x63, 0x12, 0x14, 0x0a, 0x05, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, + 0x52, 0x05, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x12, 0x4e, 0x0a, 0x0b, 0x61, 0x6e, 0x6e, 0x6f, 0x74, + 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x2c, 0x2e, 0x72, + 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, + 0x49, 0x6d, 0x61, 0x67, 0x65, 0x53, 0x70, 0x65, 0x63, 0x2e, 0x41, 
0x6e, 0x6e, 0x6f, 0x74, 0x61, + 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x52, 0x0b, 0x61, 0x6e, 0x6e, 0x6f, + 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x1a, 0x3e, 0x0a, 0x10, 0x41, 0x6e, 0x6e, 0x6f, 0x74, + 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x12, 0x10, 0x0a, 0x03, 0x6b, + 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x6b, 0x65, 0x79, 0x12, 0x14, 0x0a, + 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x22, 0x41, 0x0a, 0x11, 0x43, 0x6f, 0x6e, 0x74, 0x61, + 0x69, 0x6e, 0x65, 0x72, 0x4d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x12, 0x12, 0x0a, 0x04, + 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x6e, 0x61, 0x6d, 0x65, + 0x12, 0x18, 0x0a, 0x07, 0x61, 0x74, 0x74, 0x65, 0x6d, 0x70, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, + 0x0d, 0x52, 0x07, 0x61, 0x74, 0x74, 0x65, 0x6d, 0x70, 0x74, 0x22, 0x4d, 0x0a, 0x13, 0x43, 0x6f, + 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x53, 0x74, 0x61, 0x74, 0x65, 0x56, 0x61, 0x6c, 0x75, + 0x65, 0x12, 0x36, 0x0a, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0e, + 0x32, 0x20, 0x2e, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, + 0x68, 0x61, 0x32, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x53, 0x74, 0x61, + 0x74, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x22, 0xa3, 0x02, 0x0a, 0x0f, 0x43, 0x6f, + 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x46, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x12, 0x0e, 0x0a, + 0x02, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x3b, 0x0a, + 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x25, 0x2e, 0x72, + 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, + 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x53, 0x74, 0x61, 0x74, 0x65, 0x56, 0x61, + 0x6c, 
0x75, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x12, 0x24, 0x0a, 0x0e, 0x70, 0x6f, + 0x64, 0x5f, 0x73, 0x61, 0x6e, 0x64, 0x62, 0x6f, 0x78, 0x5f, 0x69, 0x64, 0x18, 0x03, 0x20, 0x01, + 0x28, 0x09, 0x52, 0x0c, 0x70, 0x6f, 0x64, 0x53, 0x61, 0x6e, 0x64, 0x62, 0x6f, 0x78, 0x49, 0x64, + 0x12, 0x5b, 0x0a, 0x0e, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x5f, 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, + 0x6f, 0x72, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x34, 0x2e, 0x72, 0x75, 0x6e, 0x74, 0x69, + 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, 0x43, 0x6f, 0x6e, 0x74, + 0x61, 0x69, 0x6e, 0x65, 0x72, 0x46, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x2e, 0x4c, 0x61, 0x62, 0x65, + 0x6c, 0x53, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x52, 0x0d, + 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x53, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x1a, 0x40, 0x0a, + 0x12, 0x4c, 0x61, 0x62, 0x65, 0x6c, 0x53, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x45, 0x6e, + 0x74, 0x72, 0x79, 0x12, 0x10, 0x0a, 0x03, 0x6b, 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, + 0x52, 0x03, 0x6b, 0x65, 0x79, 0x12, 0x14, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, + 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x22, + 0x52, 0x0a, 0x15, 0x4c, 0x69, 0x73, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, + 0x73, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x39, 0x0a, 0x06, 0x66, 0x69, 0x6c, 0x74, + 0x65, 0x72, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x21, 0x2e, 0x72, 0x75, 0x6e, 0x74, 0x69, + 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, 0x43, 0x6f, 0x6e, 0x74, + 0x61, 0x69, 0x6e, 0x65, 0x72, 0x46, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x52, 0x06, 0x66, 0x69, 0x6c, + 0x74, 0x65, 0x72, 0x22, 0xb5, 0x04, 0x0a, 0x09, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, + 0x72, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, + 0x64, 0x12, 0x24, 0x0a, 0x0e, 0x70, 0x6f, 
0x64, 0x5f, 0x73, 0x61, 0x6e, 0x64, 0x62, 0x6f, 0x78, + 0x5f, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0c, 0x70, 0x6f, 0x64, 0x53, 0x61, + 0x6e, 0x64, 0x62, 0x6f, 0x78, 0x49, 0x64, 0x12, 0x3f, 0x0a, 0x08, 0x6d, 0x65, 0x74, 0x61, 0x64, + 0x61, 0x74, 0x61, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x23, 0x2e, 0x72, 0x75, 0x6e, 0x74, + 0x69, 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, 0x43, 0x6f, 0x6e, + 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x4d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x52, 0x08, + 0x6d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x12, 0x31, 0x0a, 0x05, 0x69, 0x6d, 0x61, 0x67, + 0x65, 0x18, 0x04, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1b, 0x2e, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, + 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, 0x49, 0x6d, 0x61, 0x67, 0x65, + 0x53, 0x70, 0x65, 0x63, 0x52, 0x05, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x12, 0x1b, 0x0a, 0x09, 0x69, + 0x6d, 0x61, 0x67, 0x65, 0x5f, 0x72, 0x65, 0x66, 0x18, 0x05, 0x20, 0x01, 0x28, 0x09, 0x52, 0x08, + 0x69, 0x6d, 0x61, 0x67, 0x65, 0x52, 0x65, 0x66, 0x12, 0x36, 0x0a, 0x05, 0x73, 0x74, 0x61, 0x74, + 0x65, 0x18, 0x06, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x20, 0x2e, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, + 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x61, + 0x69, 0x6e, 0x65, 0x72, 0x53, 0x74, 0x61, 0x74, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, + 0x12, 0x1d, 0x0a, 0x0a, 0x63, 0x72, 0x65, 0x61, 0x74, 0x65, 0x64, 0x5f, 0x61, 0x74, 0x18, 0x07, + 0x20, 0x01, 0x28, 0x03, 0x52, 0x09, 0x63, 0x72, 0x65, 0x61, 0x74, 0x65, 0x64, 0x41, 0x74, 0x12, + 0x3f, 0x0a, 0x06, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x73, 0x18, 0x08, 0x20, 0x03, 0x28, 0x0b, 0x32, + 0x27, 0x2e, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, + 0x61, 0x32, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x2e, 0x4c, 0x61, 0x62, + 0x65, 0x6c, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x52, 0x06, 0x6c, 0x61, 0x62, 
0x65, 0x6c, 0x73, + 0x12, 0x4e, 0x0a, 0x0b, 0x61, 0x6e, 0x6e, 0x6f, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x18, + 0x09, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x2c, 0x2e, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x2e, + 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, + 0x65, 0x72, 0x2e, 0x41, 0x6e, 0x6e, 0x6f, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x45, 0x6e, + 0x74, 0x72, 0x79, 0x52, 0x0b, 0x61, 0x6e, 0x6e, 0x6f, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, + 0x1a, 0x39, 0x0a, 0x0b, 0x4c, 0x61, 0x62, 0x65, 0x6c, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x12, + 0x10, 0x0a, 0x03, 0x6b, 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x6b, 0x65, + 0x79, 0x12, 0x14, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, + 0x52, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x1a, 0x3e, 0x0a, 0x10, 0x41, + 0x6e, 0x6e, 0x6f, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x12, + 0x10, 0x0a, 0x03, 0x6b, 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x6b, 0x65, + 0x79, 0x12, 0x14, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, + 0x52, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x22, 0x55, 0x0a, 0x16, 0x4c, + 0x69, 0x73, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x73, 0x52, 0x65, 0x73, + 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x3b, 0x0a, 0x0a, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, + 0x65, 0x72, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x1b, 0x2e, 0x72, 0x75, 0x6e, 0x74, + 0x69, 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, 0x43, 0x6f, 0x6e, + 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x52, 0x0a, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, + 0x72, 0x73, 0x2a, 0x6b, 0x0a, 0x0e, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x53, + 0x74, 0x61, 0x74, 0x65, 0x12, 0x15, 0x0a, 0x11, 0x43, 0x4f, 0x4e, 0x54, 0x41, 0x49, 0x4e, 0x45, + 0x52, 0x5f, 0x43, 
0x52, 0x45, 0x41, 0x54, 0x45, 0x44, 0x10, 0x00, 0x12, 0x15, 0x0a, 0x11, 0x43, + 0x4f, 0x4e, 0x54, 0x41, 0x49, 0x4e, 0x45, 0x52, 0x5f, 0x52, 0x55, 0x4e, 0x4e, 0x49, 0x4e, 0x47, + 0x10, 0x01, 0x12, 0x14, 0x0a, 0x10, 0x43, 0x4f, 0x4e, 0x54, 0x41, 0x49, 0x4e, 0x45, 0x52, 0x5f, + 0x45, 0x58, 0x49, 0x54, 0x45, 0x44, 0x10, 0x02, 0x12, 0x15, 0x0a, 0x11, 0x43, 0x4f, 0x4e, 0x54, + 0x41, 0x49, 0x4e, 0x45, 0x52, 0x5f, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x10, 0x03, 0x32, + 0x77, 0x0a, 0x0e, 0x52, 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x53, 0x65, 0x72, 0x76, 0x69, 0x63, + 0x65, 0x12, 0x65, 0x0a, 0x0e, 0x4c, 0x69, 0x73, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, + 0x65, 0x72, 0x73, 0x12, 0x27, 0x2e, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x2e, 0x76, 0x31, + 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, 0x4c, 0x69, 0x73, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x61, + 0x69, 0x6e, 0x65, 0x72, 0x73, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x28, 0x2e, 0x72, + 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, + 0x4c, 0x69, 0x73, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x73, 0x52, 0x65, + 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x42, 0x0a, 0x5a, 0x08, 0x2e, 0x2f, 0x3b, 0x69, + 0x73, 0x75, 0x6c, 0x61, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, +} + +var ( + file_isula_api_proto_rawDescOnce sync.Once + file_isula_api_proto_rawDescData = file_isula_api_proto_rawDesc +) + +func file_isula_api_proto_rawDescGZIP() []byte { + file_isula_api_proto_rawDescOnce.Do(func() { + file_isula_api_proto_rawDescData = protoimpl.X.CompressGZIP(file_isula_api_proto_rawDescData) + }) + return file_isula_api_proto_rawDescData +} + +var file_isula_api_proto_enumTypes = make([]protoimpl.EnumInfo, 1) +var file_isula_api_proto_msgTypes = make([]protoimpl.MessageInfo, 11) +var file_isula_api_proto_goTypes = []interface{}{ + (ContainerState)(0), // 0: runtime.v1alpha2.ContainerState + (*ImageSpec)(nil), // 1: 
runtime.v1alpha2.ImageSpec + (*ContainerMetadata)(nil), // 2: runtime.v1alpha2.ContainerMetadata + (*ContainerStateValue)(nil), // 3: runtime.v1alpha2.ContainerStateValue + (*ContainerFilter)(nil), // 4: runtime.v1alpha2.ContainerFilter + (*ListContainersRequest)(nil), // 5: runtime.v1alpha2.ListContainersRequest + (*Container)(nil), // 6: runtime.v1alpha2.Container + (*ListContainersResponse)(nil), // 7: runtime.v1alpha2.ListContainersResponse + nil, // 8: runtime.v1alpha2.ImageSpec.AnnotationsEntry + nil, // 9: runtime.v1alpha2.ContainerFilter.LabelSelectorEntry + nil, // 10: runtime.v1alpha2.Container.LabelsEntry + nil, // 11: runtime.v1alpha2.Container.AnnotationsEntry +} +var file_isula_api_proto_depIdxs = []int32{ + 8, // 0: runtime.v1alpha2.ImageSpec.annotations:type_name -> runtime.v1alpha2.ImageSpec.AnnotationsEntry + 0, // 1: runtime.v1alpha2.ContainerStateValue.state:type_name -> runtime.v1alpha2.ContainerState + 3, // 2: runtime.v1alpha2.ContainerFilter.state:type_name -> runtime.v1alpha2.ContainerStateValue + 9, // 3: runtime.v1alpha2.ContainerFilter.label_selector:type_name -> runtime.v1alpha2.ContainerFilter.LabelSelectorEntry + 4, // 4: runtime.v1alpha2.ListContainersRequest.filter:type_name -> runtime.v1alpha2.ContainerFilter + 2, // 5: runtime.v1alpha2.Container.metadata:type_name -> runtime.v1alpha2.ContainerMetadata + 1, // 6: runtime.v1alpha2.Container.image:type_name -> runtime.v1alpha2.ImageSpec + 0, // 7: runtime.v1alpha2.Container.state:type_name -> runtime.v1alpha2.ContainerState + 10, // 8: runtime.v1alpha2.Container.labels:type_name -> runtime.v1alpha2.Container.LabelsEntry + 11, // 9: runtime.v1alpha2.Container.annotations:type_name -> runtime.v1alpha2.Container.AnnotationsEntry + 6, // 10: runtime.v1alpha2.ListContainersResponse.containers:type_name -> runtime.v1alpha2.Container + 5, // 11: runtime.v1alpha2.RuntimeService.ListContainers:input_type -> runtime.v1alpha2.ListContainersRequest + 7, // 12: 
runtime.v1alpha2.RuntimeService.ListContainers:output_type -> runtime.v1alpha2.ListContainersResponse + 12, // [12:13] is the sub-list for method output_type + 11, // [11:12] is the sub-list for method input_type + 11, // [11:11] is the sub-list for extension type_name + 11, // [11:11] is the sub-list for extension extendee + 0, // [0:11] is the sub-list for field type_name +} + +func init() { file_isula_api_proto_init() } +func file_isula_api_proto_init() { + if File_isula_api_proto != nil { + return + } + if !protoimpl.UnsafeEnabled { + file_isula_api_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { + value, ok := v.(*ImageSpec) + if !ok { + return nil + } + + switch v := value; i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_isula_api_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} { + value, ok := v.(*ContainerMetadata) + if !ok { + return nil + } + + switch v := value; i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_isula_api_proto_msgTypes[2].Exporter = func(v interface{}, i int) interface{} { + value, ok := v.(*ContainerStateValue) + if !ok { + return nil + } + + switch v := value; i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_isula_api_proto_msgTypes[3].Exporter = func(v interface{}, i int) interface{} { + value, ok := v.(*ContainerFilter) + if !ok { + return nil + } + + switch v := value; i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_isula_api_proto_msgTypes[4].Exporter = func(v interface{}, i int) interface{} { + value, ok := v.(*ListContainersRequest) + if !ok { + return nil + } + + switch v := value; i { + case 0: + return &v.state + case 1: + return 
&v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_isula_api_proto_msgTypes[5].Exporter = func(v interface{}, i int) interface{} { + value, ok := v.(*Container) + if !ok { + return nil + } + + switch v := value; i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_isula_api_proto_msgTypes[6].Exporter = func(v interface{}, i int) interface{} { + value, ok := v.(*ListContainersResponse) + if !ok { + return nil + } + + switch v := value; i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + } + type x struct{} + out := protoimpl.TypeBuilder{ + File: protoimpl.DescBuilder{ + GoPackagePath: reflect.TypeOf(x{}).PkgPath(), + RawDescriptor: file_isula_api_proto_rawDesc, + NumEnums: 1, + NumMessages: 11, + NumExtensions: 0, + NumServices: 1, + }, + GoTypes: file_isula_api_proto_goTypes, + DependencyIndexes: file_isula_api_proto_depIdxs, + EnumInfos: file_isula_api_proto_enumTypes, + MessageInfos: file_isula_api_proto_msgTypes, + }.Build() + File_isula_api_proto = out.File + file_isula_api_proto_rawDesc = nil + file_isula_api_proto_goTypes = nil + file_isula_api_proto_depIdxs = nil +} diff --git a/mind-cluster/component/npu-exporter/collector/container/isula/isula_api.proto b/mind-cluster/component/npu-exporter/collector/container/isula/isula_api.proto new file mode 100644 index 0000000..3f1f9f9 --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/container/isula/isula_api.proto @@ -0,0 +1,118 @@ +/* +Copyright 2018 The Kubernetes Authors. +Copyright (c) Huawei Technologies Co., Ltd. 2019. All rights reserved. + modification description: removed unused options, for example: + remove import "github.com/gogo/protobuf/gogoproto/gogo.proto" + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// To regenerate api.pb.go run hack/update-generated-runtime.sh +syntax = 'proto3'; + +package runtime.v1alpha2; +option go_package = "./;isula"; + +// Runtime service defines the public APIs for remote container runtimes +service RuntimeService { + // ListContainers lists all containers by filters. + rpc ListContainers(ListContainersRequest) returns (ListContainersResponse) {} +} + +// ImageSpec is an internal representation of an image. Currently, it wraps the +// value of a Container's Image field (e.g. imageID or imageDigest), but in the +// future it will include more detailed information about the different image types. +message ImageSpec { + string image = 1; + // Unstructured key-value map holding arbitrary metadata. + // ImageSpec Annotations can be used to help the runtime target specific + // images in multi-arch images. + map<string, string> annotations = 2; +} + +// ContainerMetadata holds all necessary information for building the container +// name. The container runtime is encouraged to expose the metadata in its user +// interface for better user experience. E.g., runtime can construct a unique +// container name based on the metadata. Note that (name, attempt) is unique +// within a sandbox for the entire lifetime of the sandbox. +message ContainerMetadata { + // Name of the container. Same as the container name in the PodSpec. + string name = 1; + // Attempt number of creating the container. Default: 0.
+ uint32 attempt = 2; +} + +enum ContainerState { + CONTAINER_CREATED = 0; + CONTAINER_RUNNING = 1; + CONTAINER_EXITED = 2; + CONTAINER_UNKNOWN = 3; +} + +// ContainerStateValue is the wrapper of ContainerState. +message ContainerStateValue { + // State of the container. + ContainerState state = 1; +} + +// ContainerFilter is used to filter containers. +// All those fields are combined with 'AND' +message ContainerFilter { + // ID of the container. + string id = 1; + // State of the container. + ContainerStateValue state = 2; + // ID of the PodSandbox. + string pod_sandbox_id = 3; + // LabelSelector to select matches. + // Only api.MatchLabels is supported for now and the requirements + // are ANDed. MatchExpressions is not supported yet. + map<string, string> label_selector = 4; +} + +message ListContainersRequest { + ContainerFilter filter = 1; +} + +// Container provides the runtime information for a container, such as ID, hash, +// state of the container. +message Container { + // ID of the container, used by the container runtime to identify + // a container. + string id = 1; + // ID of the sandbox to which this container belongs. + string pod_sandbox_id = 2; + // Metadata of the container. + ContainerMetadata metadata = 3; + // Spec of the image. + ImageSpec image = 4; + // Reference to the image in use. For most runtimes, this should be an + // image ID. + string image_ref = 5; + // State of the container. + ContainerState state = 6; + // Creation time of the container in nanoseconds. + int64 created_at = 7; + // Key-value pairs that may be used to scope and select individual resources. + map<string, string> labels = 8; + // Unstructured key-value map holding arbitrary metadata. + // Annotations MUST NOT be altered by the runtime; the value of this field + // MUST be identical to that of the corresponding ContainerConfig used to + // instantiate this Container. + map<string, string> annotations = 9; +} + +message ListContainersResponse { + // List of containers.
+ repeated Container containers = 1; +} diff --git a/mind-cluster/component/npu-exporter/collector/container/isula/isula_api_grpc.pb.go b/mind-cluster/component/npu-exporter/collector/container/isula/isula_api_grpc.pb.go new file mode 100644 index 0000000..a503e15 --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/container/isula/isula_api_grpc.pb.go @@ -0,0 +1,107 @@ +// Code generated by protoc-gen-go-grpc. DO NOT EDIT. +// versions: +// - protoc-gen-go-grpc v1.2.0 +// - protoc v3.13.0 +// source: isula_api.proto + +package isula + +import ( + context "context" + grpc "google.golang.org/grpc" + codes "google.golang.org/grpc/codes" + status "google.golang.org/grpc/status" +) + +// This is a compile-time assertion to ensure that this generated file +// is compatible with the grpc package it is being compiled against. +// Requires gRPC-Go v1.32.0 or later. +const _ = grpc.SupportPackageIsVersion7 + +// RuntimeServiceClient is the client API for RuntimeService service. +// +// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream. +type RuntimeServiceClient interface { + // ListContainers lists all containers by filters. + ListContainers(ctx context.Context, in *ListContainersRequest, opts ...grpc.CallOption) (*ListContainersResponse, error) +} + +type runtimeServiceClient struct { + cc grpc.ClientConnInterface +} + +func NewRuntimeServiceClient(cc grpc.ClientConnInterface) RuntimeServiceClient { + return &runtimeServiceClient{cc} +} + +func (c *runtimeServiceClient) ListContainers(ctx context.Context, in *ListContainersRequest, opts ...grpc.CallOption) (*ListContainersResponse, error) { + out := new(ListContainersResponse) + err := c.cc.Invoke(ctx, "/runtime.v1alpha2.RuntimeService/ListContainers", in, out, opts...) + if err != nil { + return nil, err + } + return out, nil +} + +// RuntimeServiceServer is the server API for RuntimeService service. 
+// All implementations must embed UnimplementedRuntimeServiceServer +// for forward compatibility +type RuntimeServiceServer interface { + // ListContainers lists all containers by filters. + ListContainers(context.Context, *ListContainersRequest) (*ListContainersResponse, error) + mustEmbedUnimplementedRuntimeServiceServer() +} + +// UnimplementedRuntimeServiceServer must be embedded to have forward compatible implementations. +type UnimplementedRuntimeServiceServer struct { +} + +func (UnimplementedRuntimeServiceServer) ListContainers(context.Context, *ListContainersRequest) (*ListContainersResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method ListContainers not implemented") +} +func (UnimplementedRuntimeServiceServer) mustEmbedUnimplementedRuntimeServiceServer() {} + +// UnsafeRuntimeServiceServer may be embedded to opt out of forward compatibility for this service. +// Use of this interface is not recommended, as added methods to RuntimeServiceServer will +// result in compilation errors. 
+type UnsafeRuntimeServiceServer interface { + mustEmbedUnimplementedRuntimeServiceServer() +} + +func RegisterRuntimeServiceServer(s grpc.ServiceRegistrar, srv RuntimeServiceServer) { + s.RegisterService(&RuntimeService_ServiceDesc, srv) +} + +func _RuntimeService_ListContainers_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(ListContainersRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(RuntimeServiceServer).ListContainers(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/runtime.v1alpha2.RuntimeService/ListContainers", + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(RuntimeServiceServer).ListContainers(ctx, req.(*ListContainersRequest)) + } + return interceptor(ctx, in, info, handler) +} + +// RuntimeService_ServiceDesc is the grpc.ServiceDesc for RuntimeService service. +// It's only intended for direct use with grpc.RegisterService, +// and not to be introspected or modified (even as a copy) +var RuntimeService_ServiceDesc = grpc.ServiceDesc{ + ServiceName: "runtime.v1alpha2.RuntimeService", + HandlerType: (*RuntimeServiceServer)(nil), + Methods: []grpc.MethodDesc{ + { + MethodName: "ListContainers", + Handler: _RuntimeService_ListContainers_Handler, + }, + }, + Streams: []grpc.StreamDesc{}, + Metadata: "isula_api.proto", +} diff --git a/mind-cluster/component/npu-exporter/collector/container/isula/isula_container.go b/mind-cluster/component/npu-exporter/collector/container/isula/isula_container.go new file mode 100644 index 0000000..e31fea9 --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/container/isula/isula_container.go @@ -0,0 +1,39 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2024. All rights reserved. 
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package isula for monitoring isula's npu allocation +package isula + +// Config represents env +type Config struct { + Env []string `json:"Env,omitempty" platform:"linux"` +} + +// DeviceInfo represents device info +type DeviceInfo struct { + PathInContainer string `json:"PathInContainer,omitempty" platform:"linux"` +} + +// HostConfig represents host config content +type HostConfig struct { + Devices []DeviceInfo `json:"Devices,omitempty" platform:"linux"` + Privileged bool `json:"Privileged,omitempty" platform:"linux"` +} + +// ContainerJson represents container json content +type ContainerJson struct { + Config *Config `json:"Config,omitempty" platform:"linux"` + HostConfig *HostConfig `json:"HostConfig,omitempty" platform:"linux"` +} diff --git a/mind-cluster/component/npu-exporter/collector/container/isula/isulad.pb.go b/mind-cluster/component/npu-exporter/collector/container/isula/isulad.pb.go new file mode 100644 index 0000000..5e4f83f --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/container/isula/isulad.pb.go @@ -0,0 +1,278 @@ +// ####################################################################### +// ##- Copyright (c) Huawei Technologies Co., Ltd. 2019. All rights reserved. +// # - iSulad licensed under the Mulan PSL v2. +// # - You can use this software according to the terms and conditions of the Mulan PSL v2.
+// # - You may obtain a copy of Mulan PSL v2 at: +// # - http://license.coscl.org.cn/MulanPSL2 +// # - THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR +// # - IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR +// # - PURPOSE. +// # - See the Mulan PSL v2 for more details. +// ##- @Description: generate grpc +// ##- @Author: wujing +// ##- @Create: 2019-04-25 +// ####################################################################### + +// Code generated by protoc-gen-go. DO NOT EDIT. +// versions: +// protoc-gen-go v1.28.1 +// protoc v3.13.0 +// source: isulad.proto + +package isula + +import ( + protoreflect "google.golang.org/protobuf/reflect/protoreflect" + protoimpl "google.golang.org/protobuf/runtime/protoimpl" + reflect "reflect" + sync "sync" +) + +const ( + // Verify that this generated code is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) + // Verify that runtime/protoimpl is sufficiently up-to-date. 
+ _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) +) + +type InspectContainerRequest struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"` + Bformat bool `protobuf:"varint,2,opt,name=bformat,proto3" json:"bformat,omitempty"` + Timeout int32 `protobuf:"varint,3,opt,name=timeout,proto3" json:"timeout,omitempty"` +} + +func (x *InspectContainerRequest) Reset() { + *x = InspectContainerRequest{} + if protoimpl.UnsafeEnabled { + mi := &file_isulad_proto_msgTypes[0] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *InspectContainerRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*InspectContainerRequest) ProtoMessage() {} + +func (x *InspectContainerRequest) ProtoReflect() protoreflect.Message { + mi := &file_isulad_proto_msgTypes[0] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use InspectContainerRequest.ProtoReflect.Descriptor instead. 
+func (*InspectContainerRequest) Descriptor() ([]byte, []int) { + return file_isulad_proto_rawDescGZIP(), []int{0} +} + +func (x *InspectContainerRequest) GetId() string { + if x != nil { + return x.Id + } + return "" +} + +func (x *InspectContainerRequest) GetBformat() bool { + if x != nil { + return x.Bformat + } + return false +} + +func (x *InspectContainerRequest) GetTimeout() int32 { + if x != nil { + return x.Timeout + } + return 0 +} + +type InspectContainerResponse struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + ContainerJSON string `protobuf:"bytes,1,opt,name=ContainerJSON,proto3" json:"ContainerJSON,omitempty"` + Cc uint32 `protobuf:"varint,2,opt,name=cc,proto3" json:"cc,omitempty"` + Errmsg string `protobuf:"bytes,3,opt,name=errmsg,proto3" json:"errmsg,omitempty"` +} + +func (x *InspectContainerResponse) Reset() { + *x = InspectContainerResponse{} + if protoimpl.UnsafeEnabled { + mi := &file_isulad_proto_msgTypes[1] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *InspectContainerResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*InspectContainerResponse) ProtoMessage() {} + +func (x *InspectContainerResponse) ProtoReflect() protoreflect.Message { + mi := &file_isulad_proto_msgTypes[1] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use InspectContainerResponse.ProtoReflect.Descriptor instead. 
+func (*InspectContainerResponse) Descriptor() ([]byte, []int) { + return file_isulad_proto_rawDescGZIP(), []int{1} +} + +func (x *InspectContainerResponse) GetContainerJSON() string { + if x != nil { + return x.ContainerJSON + } + return "" +} + +func (x *InspectContainerResponse) GetCc() uint32 { + if x != nil { + return x.Cc + } + return 0 +} + +func (x *InspectContainerResponse) GetErrmsg() string { + if x != nil { + return x.Errmsg + } + return "" +} + +var File_isulad_proto protoreflect.FileDescriptor + +var file_isulad_proto_rawDesc = []byte{ + 0x0a, 0x0c, 0x69, 0x73, 0x75, 0x6c, 0x61, 0x64, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x0a, + 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x73, 0x22, 0x5d, 0x0a, 0x17, 0x49, 0x6e, + 0x73, 0x70, 0x65, 0x63, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x52, 0x65, + 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, + 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x18, 0x0a, 0x07, 0x62, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, + 0x18, 0x02, 0x20, 0x01, 0x28, 0x08, 0x52, 0x07, 0x62, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x12, + 0x18, 0x0a, 0x07, 0x74, 0x69, 0x6d, 0x65, 0x6f, 0x75, 0x74, 0x18, 0x03, 0x20, 0x01, 0x28, 0x05, + 0x52, 0x07, 0x74, 0x69, 0x6d, 0x65, 0x6f, 0x75, 0x74, 0x22, 0x68, 0x0a, 0x18, 0x49, 0x6e, 0x73, + 0x70, 0x65, 0x63, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x52, 0x65, 0x73, + 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x24, 0x0a, 0x0d, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, + 0x65, 0x72, 0x4a, 0x53, 0x4f, 0x4e, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0d, 0x43, 0x6f, + 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x4a, 0x53, 0x4f, 0x4e, 0x12, 0x0e, 0x0a, 0x02, 0x63, + 0x63, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x02, 0x63, 0x63, 0x12, 0x16, 0x0a, 0x06, 0x65, + 0x72, 0x72, 0x6d, 0x73, 0x67, 0x18, 0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x65, 0x72, 0x72, + 0x6d, 0x73, 0x67, 0x32, 0x68, 0x0a, 0x10, 0x43, 0x6f, 0x6e, 0x74, 
0x61, 0x69, 0x6e, 0x65, 0x72, + 0x53, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x12, 0x54, 0x0a, 0x07, 0x49, 0x6e, 0x73, 0x70, 0x65, + 0x63, 0x74, 0x12, 0x23, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x73, 0x2e, + 0x49, 0x6e, 0x73, 0x70, 0x65, 0x63, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, + 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x24, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, + 0x6e, 0x65, 0x72, 0x73, 0x2e, 0x49, 0x6e, 0x73, 0x70, 0x65, 0x63, 0x74, 0x43, 0x6f, 0x6e, 0x74, + 0x61, 0x69, 0x6e, 0x65, 0x72, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x42, 0x0c, 0x48, + 0x02, 0x5a, 0x08, 0x2e, 0x2f, 0x3b, 0x69, 0x73, 0x75, 0x6c, 0x61, 0x62, 0x06, 0x70, 0x72, 0x6f, + 0x74, 0x6f, 0x33, +} + +var ( + file_isulad_proto_rawDescOnce sync.Once + file_isulad_proto_rawDescData = file_isulad_proto_rawDesc +) + +func file_isulad_proto_rawDescGZIP() []byte { + file_isulad_proto_rawDescOnce.Do(func() { + file_isulad_proto_rawDescData = protoimpl.X.CompressGZIP(file_isulad_proto_rawDescData) + }) + return file_isulad_proto_rawDescData +} + +var file_isulad_proto_msgTypes = make([]protoimpl.MessageInfo, 2) +var file_isulad_proto_goTypes = []interface{}{ + (*InspectContainerRequest)(nil), // 0: containers.InspectContainerRequest + (*InspectContainerResponse)(nil), // 1: containers.InspectContainerResponse +} +var file_isulad_proto_depIdxs = []int32{ + 0, // 0: containers.ContainerService.Inspect:input_type -> containers.InspectContainerRequest + 1, // 1: containers.ContainerService.Inspect:output_type -> containers.InspectContainerResponse + 1, // [1:2] is the sub-list for method output_type + 0, // [0:1] is the sub-list for method input_type + 0, // [0:0] is the sub-list for extension type_name + 0, // [0:0] is the sub-list for extension extendee + 0, // [0:0] is the sub-list for field type_name +} + +func init() { file_isulad_proto_init() } +func file_isulad_proto_init() { + if File_isulad_proto != nil { + return + } + if 
!protoimpl.UnsafeEnabled { + file_isulad_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { + value, ok := v.(*InspectContainerRequest) + if !ok { + return nil + } + + switch v := value; i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_isulad_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} { + value, ok := v.(*InspectContainerResponse) + if !ok { + return nil + } + + switch v := value; i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + } + type x struct{} + out := protoimpl.TypeBuilder{ + File: protoimpl.DescBuilder{ + GoPackagePath: reflect.TypeOf(x{}).PkgPath(), + RawDescriptor: file_isulad_proto_rawDesc, + NumEnums: 0, + NumMessages: 2, + NumExtensions: 0, + NumServices: 1, + }, + GoTypes: file_isulad_proto_goTypes, + DependencyIndexes: file_isulad_proto_depIdxs, + MessageInfos: file_isulad_proto_msgTypes, + }.Build() + File_isulad_proto = out.File + file_isulad_proto_rawDesc = nil + file_isulad_proto_goTypes = nil + file_isulad_proto_depIdxs = nil +} diff --git a/mind-cluster/component/npu-exporter/collector/container/isula/isulad.proto b/mind-cluster/component/npu-exporter/collector/container/isula/isulad.proto new file mode 100644 index 0000000..af5f85c --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/container/isula/isulad.proto @@ -0,0 +1,35 @@ +// ####################################################################### +// ##- Copyright (c) Huawei Technologies Co., Ltd. 2019. All rights reserved. +// # - iSulad licensed under the Mulan PSL v2. +// # - You can use this software according to the terms and conditions of the Mulan PSL v2. 
+// # - You may obtain a copy of Mulan PSL v2 at: +// # - http://license.coscl.org.cn/MulanPSL2 +// # - THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR +// # - IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR +// # - PURPOSE. +// # - See the Mulan PSL v2 for more details. +// ##- @Description: generate grpc +// ##- @Author: wujing +// ##- @Create: 2019-04-25 +// ####################################################################### +syntax = "proto3"; +option optimize_for = CODE_SIZE; + +package containers; +option go_package = "./;isula"; + +service ContainerService { + rpc Inspect(InspectContainerRequest) returns (InspectContainerResponse); +} + +message InspectContainerRequest { + string id = 1; + bool bformat = 2; + int32 timeout = 3; +} + +message InspectContainerResponse { + string ContainerJSON = 1; + uint32 cc = 2; + string errmsg = 3; +} \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/collector/container/isula/isulad_grpc.pb.go b/mind-cluster/component/npu-exporter/collector/container/isula/isulad_grpc.pb.go new file mode 100644 index 0000000..c563e0a --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/container/isula/isulad_grpc.pb.go @@ -0,0 +1,105 @@ +// Code generated by protoc-gen-go-grpc. DO NOT EDIT. +// versions: +// - protoc-gen-go-grpc v1.2.0 +// - protoc v3.13.0 +// source: isulad.proto + +package isula + +import ( + context "context" + grpc "google.golang.org/grpc" + codes "google.golang.org/grpc/codes" + status "google.golang.org/grpc/status" +) + +// This is a compile-time assertion to ensure that this generated file +// is compatible with the grpc package it is being compiled against. +// Requires gRPC-Go v1.32.0 or later. +const _ = grpc.SupportPackageIsVersion7 + +// ContainerServiceClient is the client API for ContainerService service. 
+// +// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream. +type ContainerServiceClient interface { + Inspect(ctx context.Context, in *InspectContainerRequest, opts ...grpc.CallOption) (*InspectContainerResponse, error) +} + +type containerServiceClient struct { + cc grpc.ClientConnInterface +} + +func NewContainerServiceClient(cc grpc.ClientConnInterface) ContainerServiceClient { + return &containerServiceClient{cc} +} + +func (c *containerServiceClient) Inspect(ctx context.Context, in *InspectContainerRequest, opts ...grpc.CallOption) (*InspectContainerResponse, error) { + out := new(InspectContainerResponse) + err := c.cc.Invoke(ctx, "/containers.ContainerService/Inspect", in, out, opts...) + if err != nil { + return nil, err + } + return out, nil +} + +// ContainerServiceServer is the server API for ContainerService service. +// All implementations must embed UnimplementedContainerServiceServer +// for forward compatibility +type ContainerServiceServer interface { + Inspect(context.Context, *InspectContainerRequest) (*InspectContainerResponse, error) + mustEmbedUnimplementedContainerServiceServer() +} + +// UnimplementedContainerServiceServer must be embedded to have forward compatible implementations. +type UnimplementedContainerServiceServer struct { +} + +func (UnimplementedContainerServiceServer) Inspect(context.Context, *InspectContainerRequest) (*InspectContainerResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method Inspect not implemented") +} +func (UnimplementedContainerServiceServer) mustEmbedUnimplementedContainerServiceServer() {} + +// UnsafeContainerServiceServer may be embedded to opt out of forward compatibility for this service. +// Use of this interface is not recommended, as added methods to ContainerServiceServer will +// result in compilation errors. 
+type UnsafeContainerServiceServer interface { + mustEmbedUnimplementedContainerServiceServer() +} + +func RegisterContainerServiceServer(s grpc.ServiceRegistrar, srv ContainerServiceServer) { + s.RegisterService(&ContainerService_ServiceDesc, srv) +} + +func _ContainerService_Inspect_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(InspectContainerRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(ContainerServiceServer).Inspect(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/containers.ContainerService/Inspect", + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(ContainerServiceServer).Inspect(ctx, req.(*InspectContainerRequest)) + } + return interceptor(ctx, in, info, handler) +} + +// ContainerService_ServiceDesc is the grpc.ServiceDesc for ContainerService service. +// It's only intended for direct use with grpc.RegisterService, +// and not to be introspected or modified (even as a copy) +var ContainerService_ServiceDesc = grpc.ServiceDesc{ + ServiceName: "containers.ContainerService", + HandlerType: (*ContainerServiceServer)(nil), + Methods: []grpc.MethodDesc{ + { + MethodName: "Inspect", + Handler: _ContainerService_Inspect_Handler, + }, + }, + Streams: []grpc.StreamDesc{}, + Metadata: "isulad.proto", +} diff --git a/mind-cluster/component/npu-exporter/collector/container/parser.go b/mind-cluster/component/npu-exporter/collector/container/parser.go new file mode 100644 index 0000000..4531374 --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/container/parser.go @@ -0,0 +1,630 @@ +/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+// Package container for monitoring containers' npu allocation
+package container
+
+import (
+	"bufio"
+	"context"
+	"errors"
+	"fmt"
+	"math"
+	"os"
+	"regexp"
+	"strconv"
+	"strings"
+	"sync"
+	"time"
+
+	"ascend-common/api"
+	"ascend-common/common-utils/hwlog"
+	"ascend-common/common-utils/utils"
+	"huawei.com/npu-exporter/v6/collector/container/isula"
+	"huawei.com/npu-exporter/v6/collector/container/v1"
+	"huawei.com/npu-exporter/v6/utils/logger"
+)
+
+const (
+	namespaceMoby = "moby"   // Docker
+	namespaceK8s  = "k8s.io" // CRI + Containerd
+	sliceLen8     = 8        // initial capacity of the device id slices
+	ascendEnvPart = 2        // expected number of parts of "KEY=VALUE", "<name>-<id>" and "<min>-<max>"
+	charDevice    = "c"      // device type of a character device in the container spec
+
+	minus  = "-"
+	comma  = ","
+	ascend = "Ascend"
+	// maxEnvLength is the longest device list value (in bytes) that parseDiffEnvFmt accepts
+	maxEnvLength = 1024
+	// parsingNpuDefaultTimeoutDuration is the default parsing timeout in seconds
+	parsingNpuDefaultTimeoutDuration = 3
+)
+
+const (
+	// EndpointTypeContainerd K8S + Containerd
+	EndpointTypeContainerd = iota
+	// EndpointTypeDockerd Docker with or without K8S
+	EndpointTypeDockerd
+	// EndpointTypeIsula K8S + isula
+	EndpointTypeIsula = 2
+)
+
+var (
+	// ErrFromContext error is from the context
+	ErrFromContext = errors.New("error from context")
+
+	// npuMajorID caches the result of getNPUMajorID; it is filled once, guarded by npuMajorFetchCtrl
+	npuMajorID               []string
+	npuMajorFetchCtrl        sync.Once
+	parsingNpuDefaultTimeout = parsingNpuDefaultTimeoutDuration * time.Second
+)
+
+var (
+	// envErrDescribe builds the message used when a device id found in env cannot be accepted
+	envErrDescribe = func(ctrID, devID, env string, err error) string {
+		return fmt.Sprintf("container (%s) has an invalid device ID (%s) in %s, err is %v", ctrID, devID, env, err)
+	}
+	// minusStyle reports whether s contains a range separator, e.g. "0-3"
+	minusStyle = func(s string) bool {
+		return strings.Contains(s, minus)
+	}
+	// commaMinusStyle reports whether s mixes list and range separators, e.g. "0-1,3"
+	commaMinusStyle = func(s string) bool {
+		return strings.Contains(s, minus) && strings.Contains(s, comma)
+	}
+	// ascendStyle reports whether s uses device names, e.g. "Ascend-0,Ascend-1"
+	ascendStyle = func(s string) bool {
+		return strings.Contains(s, ascend)
+	}
+)
+
+// logInvalidEnvDevice logs one rejected device token of the api.AscendDeviceInfo env value.
+// The description is passed as an argument instead of being used as the format string, so a '%'
+// inside the container-controlled env value is never interpreted as a formatting verb.
+func logInvalidEnvDevice(containerID, devID string, err error) {
+	logger.Errorf("%s", envErrDescribe(containerID, devID, api.AscendDeviceInfo, err))
+}
+
+// CntNpuMonitorOpts contains setting options for monitoring containers
+type CntNpuMonitorOpts struct {
+	EndpointType int    // one of EndpointTypeContainerd, EndpointTypeDockerd, EndpointTypeIsula
+	CriEndpoint  string // CRI server address
+	UseCriBackup bool   // whether try to use cri backup address
+	OciEndpoint  string // OCI server, now is containerd address
+	UseOciBackup bool   // whether try to use oci backup address
+}
+
+// MakeDevicesParser evaluates option settings and make an instance according to it.
+// An unknown EndpointType is only logged: the parser is still returned, with an empty runtime namespace.
+func MakeDevicesParser(opts CntNpuMonitorOpts) *DevicesParser {
+	runtimeOperator := &RuntimeOperatorTool{
+		UseCriBackup: opts.UseCriBackup,
+		UseOciBackup: opts.UseOciBackup,
+		CriEndpoint:  opts.CriEndpoint,
+		OciEndpoint:  opts.OciEndpoint,
+	}
+
+	switch opts.EndpointType {
+	case EndpointTypeContainerd, EndpointTypeIsula:
+		runtimeOperator.Namespace = namespaceK8s
+	case EndpointTypeDockerd:
+		runtimeOperator.Namespace = namespaceMoby
+	default:
+		logger.Errorf("invalid type value %d", opts.EndpointType)
+	}
+
+	return &DevicesParser{RuntimeOperator: runtimeOperator}
+}
+
+// DevicesInfo the container device information struct
+type DevicesInfo struct {
+	// container id
+	ID string
+	// container name, the format is: PodNameSpace_PodName_ContainerName
+	Name string
+	// ids of the npu devices found for the container
+	Devices []int
+}
+
+// DevicesInfos the device information storage map, keyed by container id
+type DevicesInfos = map[string]DevicesInfo
+
+// DevicesParser the parser which parse device info
+type DevicesParser struct {
+	// instances, created by Init (both channels have a buffer of one element)
+	result chan DevicesInfos
+	err    chan error
+	// configuration
+	RuntimeOperator RuntimeOperator
+	// Timeout bounds the collecting phase of one parse round; zero selects parsingNpuDefaultTimeout
+	Timeout time.Duration
+}
+
+// Init initializes connection to containerd daemon and to CRI server or dockerd daemon based on name fetcher setting
+func (dp *DevicesParser) Init() error {
+	if err := dp.RuntimeOperator.Init(); err != nil {
+		return contactError(err, "connecting to container runtime failed")
+	}
+	dp.result = make(chan DevicesInfos, 1)
+	dp.err = make(chan error, 1)
+	return nil
+}
+
+// RecvResult exposes the channel used for receiving devices info analyzing result
+func (dp *DevicesParser) RecvResult() <-chan DevicesInfos {
+	return dp.result
+}
+
+// RecvErr exposes the channel used for receiving errors occurred during analyzing
+func (dp *DevicesParser) RecvErr() <-chan error {
+	return dp.err
+}
+
+// Close closes the connections established by the runtime operator during initializing.
+// Closing is best effort: a failure is logged instead of being returned. The result and error
+// channels are left open.
+func (dp *DevicesParser) Close() {
+	if err := dp.RuntimeOperator.Close(); err != nil {
+		logger.Errorf("close container runtime connection failed: %v", err)
+	}
+}
+
+// parseDevices dispatches to the isula or the containerd flavoured parser according to the runtime type
+func (dp *DevicesParser) parseDevices(ctx context.Context, c *CommonContainer, rs chan<- DevicesInfo) error {
+	if dp.RuntimeOperator.GetContainerType() == IsulaContainer {
+		return dp.parseDeviceInIsula(ctx, c, rs)
+	}
+
+	return dp.parseDevicesInContainerd(ctx, c, rs)
+}
+
+// parseDevicesInContainerd resolves the npu devices of container c from its runtime spec.
+// Exactly one DevicesInfo is sent to rs on every path after the nil check (a zero value when
+// nothing was found or an error occurred), because the collector counts one message per container.
+func (dp *DevicesParser) parseDevicesInContainerd(ctx context.Context, c *CommonContainer,
+	rs chan<- DevicesInfo) error {
+	if rs == nil {
+		return errors.New("empty result channel")
+	}
+	deviceInfo := DevicesInfo{}
+	// the pointer makes the deferred send observe the final value of deviceInfo
+	defer func(di *DevicesInfo) {
+		rs <- *di
+	}(&deviceInfo)
+
+	spec, err := dp.RuntimeOperator.GetContainerInfoByID(ctx, c.Id)
+	if err != nil {
+		return contactError(err, fmt.Sprintf("cannot get container devices by container id (%s)", c.Id))
+	}
+	if spec.Linux == nil || spec.Linux.Resources == nil || len(spec.Linux.Resources.Devices) > maxDevicesNum {
+		return contactError(errors.New("device error"),
+			fmt.Sprintf("devices in container is too much (%v) or empty", maxDevicesNum))
+	}
+	if spec.Process == nil || len(spec.Process.Env) > maxEnvNum {
+		return contactError(errors.New("env error"), fmt.Sprintf("env in container is too much (%v) or empty",
+			maxEnvNum))
+	}
+
+	// walk backwards so that the last matching env entry is the one that is used
+	envs := spec.Process.Env
+	for i := len(envs) - 1; i >= 0; i-- {
+		if e := envs[i]; strings.Contains(e, api.AscendDeviceInfo) {
+			deviceInfo, err = dp.getDevicesWithAscendRuntime(e, c)
+			return err
+		}
+	}
+
+	deviceInfo, err = dp.getDevicesWithoutAscendRuntime(spec, c)
+	return err
+}
+
+// getDevicesWithoutAscendRuntime derives the devices from the device rules of the spec.
+// A filtering failure is only logged at debug level and reported as "no devices" (zero value, nil error).
+func (dp *DevicesParser) getDevicesWithoutAscendRuntime(spec v1.Spec, c *CommonContainer) (DevicesInfo, error) {
+	devicesIDs, err := filterNPUDevices(spec)
+	if err != nil {
+		logger.Debugf("filter npu devices failed by container id (%s), err is %v", c.Id, err)
+		return DevicesInfo{}, nil
+	}
+	logger.Debugf("filter npu devices %v in container (%s)", devicesIDs, c.Id)
+
+	if len(devicesIDs) == 0 {
+		return DevicesInfo{}, nil
+	}
+
+	deviceInfo, err := makeUpDeviceInfo(c)
+	if err != nil {
+		logger.Errorf("makeUpDeviceInfo failed: %s", err)
+		return DevicesInfo{}, err
+	}
+	deviceInfo.Devices = devicesIDs
+	return deviceInfo, nil
+}
+
+// getDevicesWithAscendRuntime derives the devices from one "KEY=VALUE" env entry.
+// An entry that does not split into exactly two parts on "=" is an error; a value that yields no
+// valid id is reported as "no devices" (zero value, nil error).
+func (dp *DevicesParser) getDevicesWithAscendRuntime(ascendDevEnv string, c *CommonContainer) (DevicesInfo, error) {
+	logger.Debugf("get device info by env (%s) in %s", ascendDevEnv, c.Id)
+	devInfo := strings.Split(ascendDevEnv, "=")
+	if len(devInfo) != ascendEnvPart {
+		return DevicesInfo{}, fmt.Errorf("an invalid %s env(%s)", api.AscendDeviceInfo, ascendDevEnv)
+	}
+	devicesIDs := dp.parseDiffEnvFmt(devInfo[1], c.Id)
+	if len(devicesIDs) == 0 {
+		return DevicesInfo{}, nil
+	}
+
+	deviceInfo, err := makeUpDeviceInfo(c)
+	if err != nil {
+		hwlog.RunLog.Error(err)
+		return DevicesInfo{}, err
+	}
+	deviceInfo.Devices = devicesIDs
+	return deviceInfo, nil
+}
+
+// parseDiffEnvFmt picks the parser matching the notation of the device list value.
+// The order matters: names ("Ascend-0") contain the range separator and mixed lists ("0-1,3")
+// contain both separators, so the more specific notations are tested first.
+// A value longer than maxEnvLength yields an empty list.
+func (dp *DevicesParser) parseDiffEnvFmt(devices, containerID string) []int {
+	if len(devices) > maxEnvLength {
+		return []int{}
+	}
+	switch {
+	case ascendStyle(devices):
+		return dp.getDeviceIDsByAscendStyle(devices, containerID)
+	case commaMinusStyle(devices):
+		return dp.getDeviceIDsByCommaMinusStyle(devices, containerID)
+	case minusStyle(devices):
+		return dp.getDeviceIDsByMinusStyle(devices, containerID)
+	default:
+		return dp.getDeviceIDsByCommaStyle(devices, containerID)
+	}
+}
+
+// getDeviceIDsByCommaStyle parses "0,1,2"; tokens that are not integers are logged and skipped
+func (dp *DevicesParser) getDeviceIDsByCommaStyle(devices, containerID string) []int {
+	devList := strings.Split(devices, comma)
+	devicesIDs := make([]int, 0, len(devList))
+	for _, devID := range devList {
+		id, err := strconv.Atoi(devID)
+		if err != nil {
+			logger.Errorf("container (%s) has an invalid device ID (%v) in %s, error is %s", containerID,
+				devID, api.AscendDeviceInfo, err)
+			continue
+		}
+		devicesIDs = append(devicesIDs, id)
+	}
+	return devicesIDs
+}
+
+// getDeviceIDsByAscendStyle parses "Ascend-0,Ascend-1"; each token must split into exactly two parts
+// on "-" and the second part must be an integer, otherwise the token is logged and skipped
+func (dp *DevicesParser) getDeviceIDsByAscendStyle(devices, containerID string) []int {
+	devList := strings.Split(devices, comma)
+	deviceIDs := make([]int, 0, len(devList))
+	for _, subDevice := range devList {
+		deviceName := strings.Split(subDevice, minus)
+		if len(deviceName) != ascendEnvPart {
+			logInvalidEnvDevice(containerID, subDevice, errors.New("device name is not in <name>-<id> form"))
+			continue
+		}
+		id, err := strconv.Atoi(deviceName[1])
+		if err != nil {
+			logInvalidEnvDevice(containerID, deviceName[1], err)
+			continue
+		}
+		deviceIDs = append(deviceIDs, id)
+	}
+	return deviceIDs
+}
+
+// getDeviceIDsByMinusStyle expands the inclusive range "<min>-<max>".
+// An empty (non-nil) list is returned when the range is malformed, reversed, or its upper bound
+// exceeds math.MaxInt16; the bound check keeps the expansion small for hostile env values.
+func (dp *DevicesParser) getDeviceIDsByMinusStyle(devices, containerID string) []int {
+	devIDRange := strings.Split(devices, minus)
+	if len(devIDRange) != ascendEnvPart {
+		logInvalidEnvDevice(containerID, devices, errors.New("range is not in <min>-<max> form"))
+		return []int{}
+	}
+	minDevID, err := strconv.Atoi(devIDRange[0])
+	if err != nil {
+		logInvalidEnvDevice(containerID, devIDRange[0], err)
+		return []int{}
+	}
+	maxDevID, err := strconv.Atoi(devIDRange[1])
+	if err != nil {
+		logInvalidEnvDevice(containerID, devIDRange[1], err)
+		return []int{}
+	}
+	if minDevID > maxDevID {
+		logInvalidEnvDevice(containerID, devices, errors.New("min id bigger than max id"))
+		return []int{}
+	}
+	if maxDevID > math.MaxInt16 {
+		logInvalidEnvDevice(containerID, devices, errors.New("max id invalid"))
+		return []int{}
+	}
+
+	deviceIDs := make([]int, 0, maxDevID-minDevID+1)
+	for deviceID := minDevID; deviceID <= maxDevID; deviceID++ {
+		deviceIDs = append(deviceIDs, deviceID)
+	}
+	return deviceIDs
+}
+
+// getDeviceIDsByCommaMinusStyle parses a list whose items are single ids or ranges, e.g. "0-1,3"
+func (dp *DevicesParser) getDeviceIDsByCommaMinusStyle(devices, containerID string) []int {
+	var deviceIDs []int
+	for _, subDevices := range strings.Split(devices, comma) {
+		if minusStyle(subDevices) {
+			deviceIDs = append(deviceIDs, dp.getDeviceIDsByMinusStyle(subDevices, containerID)...)
+			continue
+		}
+		deviceIDs = append(deviceIDs, dp.getDeviceIDsByCommaStyle(subDevices, containerID)...)
+	}
+	return deviceIDs
+}
+
+// getDevWithoutAscendRuntimeInIsula derives the devices from the host config of an isula container.
+// A filtering failure is only logged at debug level and reported as "no devices" (zero value, nil error).
+func (dp *DevicesParser) getDevWithoutAscendRuntimeInIsula(containerInfo isula.ContainerJson,
+	c *CommonContainer) (DevicesInfo, error) {
+	devicesIDs, err := filterNPUDevicesInIsula(containerInfo)
+	if err != nil {
+		logger.Debugf("filter npu devices failed by container id (%s), err is %v", c.Id, err)
+		return DevicesInfo{}, nil
+	}
+	logger.Debugf("filter npu devices %v in container (%s)", devicesIDs, c.Id)
+
+	if len(devicesIDs) == 0 {
+		return DevicesInfo{}, nil
+	}
+
+	deviceInfo, err := makeUpDeviceInfo(c)
+	if err != nil {
+		hwlog.RunLog.Error(err)
+		return DevicesInfo{}, err
+	}
+	deviceInfo.Devices = devicesIDs
+	return deviceInfo, nil
+}
+
+// parseDeviceInIsula resolves the npu devices of container c from the isula container json.
+// Exactly one DevicesInfo is sent to rs on every path after the nil check (a zero value when
+// nothing was found or an error occurred), because the collector counts one message per container.
+func (dp *DevicesParser) parseDeviceInIsula(ctx context.Context, c *CommonContainer, rs chan<- DevicesInfo) error {
+	if rs == nil {
+		return errors.New("empty result channel")
+	}
+
+	deviceInfo := DevicesInfo{}
+	// the pointer makes the deferred send observe the final value of deviceInfo
+	defer func(di *DevicesInfo) {
+		rs <- *di
+	}(&deviceInfo)
+
+	if len(c.Id) > maxCgroupPath {
+		return fmt.Errorf("the containerId (%s) is too long", c.Id)
+	}
+	containerInfo, err := dp.RuntimeOperator.GetIsulaContainerInfoByID(ctx, c.Id)
+	if err != nil {
+		return contactError(err, fmt.Sprintf("getting config of container(%s) fail", c.Id))
+	}
+	if containerInfo.HostConfig == nil || containerInfo.Config == nil {
+		return errors.New("empty container info")
+	}
+
+	// walk backwards so that the last matching env entry is the one that is used
+	envs := containerInfo.Config.Env
+	for i := len(envs) - 1; i >= 0; i-- {
+		if e := envs[i]; strings.Contains(e, api.AscendDeviceInfo) {
+			deviceInfo, err = dp.getDevicesWithAscendRuntime(e, c)
+			return err
+		}
+	}
+
+	deviceInfo, err = dp.getDevWithoutAscendRuntimeInIsula(containerInfo, c)
+	return err
+}
+
+// collect receives ct messages from r and returns those with a non-empty ID, keyed by container id.
+// It returns (nil, nil) when ct is negative, when r is closed early, or when ctx is done first;
+// in the last case ErrFromContext is additionally sent to the parser's error channel.
+func (dp *DevicesParser) collect(ctx context.Context, r <-chan DevicesInfo, ct int32) (DevicesInfos, error) {
+	if r == nil {
+		return nil, errors.New("receiving channel is empty")
+	}
+	if ct < 0 {
+		return nil, nil
+	}
+
+	results := make(map[string]DevicesInfo, ct)
+	for {
+		select {
+		case info, ok := <-r:
+			if !ok {
+				return nil, nil
+			}
+			// workers send a zero value for containers without npu devices; those are only counted
+			if info.ID != "" {
+				results[info.ID] = info
+			}
+			ct--
+			if ct <= 0 {
+				return results, nil
+			}
+		case <-ctx.Done():
+			hwlog.RunLog.Error("ctx is timeout")
+			dp.err <- ErrFromContext
+			return nil, nil
+		}
+	}
+}
+
+// doParse runs one parse round: it lists the containers, parses each one in its own goroutine and
+// publishes the merged result on the parser's result channel (errors go to the error channel).
+// When resultOut is not nil it receives the same result (nil when the round failed or timed out)
+// and is closed afterwards.
+func (dp *DevicesParser) doParse(resultOut chan<- DevicesInfos) {
+	var result DevicesInfos
+	// The closure reads result when it runs. Passing result as an argument of the deferred call
+	// would evaluate it right here, while it is still nil, and resultOut would never see the data.
+	defer func() {
+		if resultOut != nil {
+			resultOut <- result
+			close(resultOut)
+		}
+	}()
+
+	ctx := context.Background()
+	containers, err := dp.RuntimeOperator.GetContainers(ctx)
+	if err != nil {
+		dp.err <- err
+		return
+	}
+
+	l := len(containers)
+	if l == 0 || l > maxContainers {
+		logger.Debugf("get %d containers from cri interface, return empty data", l)
+		result = make(DevicesInfos)
+		dp.result <- result
+		return
+	}
+
+	// One slot per worker: every worker sends exactly one message, so with this buffer no worker
+	// can block on r after collect gave up on a timeout, and wg.Wait below always returns.
+	r := make(chan DevicesInfo, l)
+	defer close(r)
+	wg := sync.WaitGroup{}
+	wg.Add(l)
+
+	// NOTE(review): workers get the context without deadline, as before; only collect is bounded
+	// by the timeout. Confirm whether the runtime calls should be cancelled on timeout as well.
+	for _, container := range containers {
+		go func(container *CommonContainer, c context.Context) {
+			defer wg.Done()
+			if err := dp.parseDevices(c, container, r); err != nil {
+				dp.err <- err
+			}
+		}(container, ctx)
+	}
+	ctx, cancelFn := context.WithTimeout(ctx, withDefault(dp.Timeout, parsingNpuDefaultTimeout))
+	defer cancelFn()
+	result, err = dp.collect(ctx, r, int32(l))
+	if err != nil {
+		logger.Errorf("collect info error: %v", err)
+	}
+
+	if result != nil {
+		dp.result <- result
+	}
+	// r is closed by the deferred call only after all senders are done
+	wg.Wait()
+}
+
+// FetchAndParse triggers the asynchronous process of querying
+// and analyzing all containers
+// resultOut channel is for fetching the current result
+// The call only logs a debug message and returns when the parser's error channel does not exist
+// yet, i.e. before Init has been called successfully.
+func (dp *DevicesParser) FetchAndParse(resultOut chan<- DevicesInfos) {
+	if dp.err == nil {
+		logger.Debug("device parser is not initialized")
+		return
+	}
+	go dp.doParse(resultOut)
+}
+
+// withDefault returns v, or d when v is not a positive duration.
+// A negative timeout would create an already expired context, so it is replaced like zero.
+func withDefault(v time.Duration, d time.Duration) time.Duration {
+	if v <= 0 {
+		return d
+	}
+
+	return v
+}
+
+var (
+	// npuCharDevLineRegexp matches a "<major> devdrv-cdev" or "<major> vdevdrv-cdev" line of /proc/devices
+	npuCharDevLineRegexp = regexp.MustCompile("^[0-9]{1,3}\\s[v]?devdrv-cdev$")
+	// firstDigitsRegexp finds the first run of decimal digits in a device path
+	firstDigitsRegexp = regexp.MustCompile(`\d+`)
+)
+
+// getNPUMajorID queries the MajorID of NPU devices from /proc/devices.
+// Only the leading lines of the file are inspected (bounded by maxSearchLine). The ids collected
+// so far are returned together with the error when reading fails.
+func getNPUMajorID() ([]string, error) {
+	const (
+		deviceCount   = 2
+		maxSearchLine = 512
+	)
+
+	path, err := utils.CheckPath("/proc/devices")
+	if err != nil {
+		return nil, err
+	}
+	majorID := make([]string, 0, deviceCount)
+	f, err := os.Open(path)
+	if err != nil {
+		return majorID, err
+	}
+	defer func() {
+		// a dedicated variable keeps the close error from leaking into the function's err
+		if closeErr := f.Close(); closeErr != nil {
+			hwlog.RunLog.Error(closeErr)
+		}
+	}()
+
+	s := bufio.NewScanner(f)
+	// prevent from searching too many lines
+	for count := 0; count <= maxSearchLine && s.Scan(); count++ {
+		text := s.Text()
+		if !npuCharDevLineRegexp.MatchString(text) {
+			continue
+		}
+		// the pattern guarantees that the line starts with the major number
+		majorID = append(majorID, strings.Fields(text)[0])
+	}
+	// Scan also stops on a read error or an over-long line; report that instead of hiding it
+	if err := s.Err(); err != nil {
+		return majorID, err
+	}
+	return majorID, nil
+}
+
+// npuMajor returns the cached NPU major ids, reading them on the first call only.
+// The read is not retried: when it fails, the failure is logged and later calls keep returning
+// whatever the first attempt produced.
+func npuMajor() []string {
+	npuMajorFetchCtrl.Do(func() {
+		var err error
+		npuMajorID, err = getNPUMajorID()
+		if err != nil {
+			logger.Errorf("get npu major id failed: %v", err)
+		}
+	})
+	return npuMajorID
+}
+
+// contains reports whether target is an element of slice
+func contains(slice []string, target string) bool {
+	for _, v := range slice {
+		if v == target {
+			return true
+		}
+	}
+	return false
+}
+
+// contactError appends msg to err in the form "<err>-><msg>".
+// The returned error wraps err, so errors.Is and errors.As still see the cause. A nil err yields
+// an error that carries msg only.
+func contactError(err error, msg string) error {
+	if err == nil {
+		return errors.New(msg)
+	}
+	return fmt.Errorf("%w->%s", err, msg)
+}
+
+// filterNPUDevices returns the minor numbers of the character devices in the spec's device rules
+// whose major number is one of the NPU majors. A minor above math.MaxInt32 is rejected with an error.
+func filterNPUDevices(spec v1.Spec) ([]int, error) {
+	if spec.Linux == nil || spec.Linux.Resources == nil {
+		return nil, errors.New("empty spec info")
+	}
+
+	const base = 10
+	devIDs := make([]int, 0, sliceLen8)
+	majorIDs := npuMajor()
+	for _, dev := range spec.Linux.Resources.Devices {
+		if dev.Minor == nil || dev.Major == nil {
+			// do not monitor privileged container
+			continue
+		}
+		if *dev.Minor > math.MaxInt32 {
+			// print the value, not the pointer
+			return nil, fmt.Errorf("get wrong device ID (%v)", *dev.Minor)
+		}
+		major := strconv.FormatInt(*dev.Major, base)
+		if dev.Type == charDevice && contains(majorIDs, major) {
+			devIDs = append(devIDs, int(*dev.Minor))
+		}
+	}
+
+	return devIDs, nil
+}
+
+// filterNPUDevicesInIsula get id of device from containerJson(containerInfo).
+// Privileged containers are skipped with an error; device paths that do not match
+// api.DevicePathPattern are logged and ignored.
+func filterNPUDevicesInIsula(containerInfo isula.ContainerJson) ([]int, error) {
+	if containerInfo.HostConfig == nil {
+		return nil, errors.New("empty host config")
+	}
+	if containerInfo.HostConfig.Privileged {
+		return nil, errors.New("it's a privileged container and skip it")
+	}
+
+	devIDs := make([]int, 0, sliceLen8)
+	for _, dev := range containerInfo.HostConfig.Devices {
+		id, err := getDevIdFromPath(api.DevicePathPattern, dev.PathInContainer)
+		if err != nil {
+			logger.Warn(err)
+			continue
+		}
+		devIDs = append(devIDs, id)
+	}
+
+	return devIDs, nil
+}
+
+// getDevIdFromPath returns the first decimal number contained in path, provided that path matches
+// pattern. It returns -1 and an error when the pattern is invalid or does not match, or when the
+// number cannot be converted or exceeds math.MaxInt32.
+func getDevIdFromPath(pattern, path string) (int, error) {
+	if match, err := regexp.MatchString(pattern, path); err != nil || !match {
+		return -1, fmt.Errorf("unexpected path of device: %s or match error: %v", path, err)
+	}
+	idStr := firstDigitsRegexp.FindString(path)
+	id, err := strconv.Atoi(idStr)
+	if err != nil {
+		return -1, fmt.Errorf("unexpected device ID (%v)", idStr)
+	}
+	if id > math.MaxInt32 {
+		return -1, fmt.Errorf("get wrong device ID (%v)", id)
+	}
+	return id, nil
+}
diff --git a/mind-cluster/component/npu-exporter/collector/container/parser_test.go b/mind-cluster/component/npu-exporter/collector/container/parser_test.go
new file mode 100644
index 0000000..f2975b9
--- /dev/null
+++ b/mind-cluster/component/npu-exporter/collector/container/parser_test.go
@@ -0,0 +1,1027 @@
+/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package container provides utilities for container monitoring and testing. +package container + +import ( + "context" + "errors" + "os" + "testing" + "time" + + "github.com/agiledragon/gomonkey/v2" + "github.com/smartystreets/goconvey/convey" + + "ascend-common/common-utils/utils" + "huawei.com/npu-exporter/v6/collector/container/isula" + "huawei.com/npu-exporter/v6/collector/container/v1" + "huawei.com/npu-exporter/v6/utils/logger" +) + +const ( + // Test endpoint constants + testContainerdEndpoint = "unix:///run/containerd.sock" + testDockerEndpoint = "unix:///run/docker.sock" + + device0 = 0 + device1 = 1 + device2 = 2 + device3 = 3 + testDeviceRange = "0-2" + testDeviceComma = "0,1,2" + testDeviceCommaRange = "0-1,2-3" + testAscendDevice0 = "Ascend-0" + testAscendDevices = "Ascend-0,Ascend-1" + testMixedDevices = "0-1,3" + + // Test error constants + testOriginalError = "original error" + testErrorMessage = "test message" + testContactedError = "original error->test message" + + // Test path constants + testDevicePattern = "/dev/npu([0-9]+)" + + // Test duration constants + testZeroDuration = 0 +) + +func TestMakeDevicesParser(t *testing.T) { + testCases := []struct { + name string + opts CntNpuMonitorOpts + expected *DevicesParser + }{ + {name: "should create parser when options are valid for containerd", + opts: CntNpuMonitorOpts{CriEndpoint: testContainerdEndpoint, EndpointType: EndpointTypeContainerd, + OciEndpoint: testContainerdEndpoint, UseOciBackup: false, UseCriBackup: false}, + expected: &DevicesParser{RuntimeOperator: 
&RuntimeOperatorTool{UseOciBackup: false, UseCriBackup: false, + CriEndpoint: testContainerdEndpoint, OciEndpoint: testContainerdEndpoint}, Timeout: testZeroDuration}}, + {name: "should create parser when options are valid for docker", + opts: CntNpuMonitorOpts{CriEndpoint: testDockerEndpoint, EndpointType: EndpointTypeDockerd, + OciEndpoint: testDockerEndpoint, UseOciBackup: true, UseCriBackup: false}, + expected: &DevicesParser{RuntimeOperator: &RuntimeOperatorTool{UseOciBackup: true, UseCriBackup: true, + CriEndpoint: testDockerEndpoint, OciEndpoint: testDockerEndpoint}, Timeout: testZeroDuration}}, + {name: "should create parser when options are valid for isula", + opts: CntNpuMonitorOpts{CriEndpoint: testContainerdEndpoint, EndpointType: EndpointTypeIsula, + OciEndpoint: testContainerdEndpoint, UseOciBackup: true, UseCriBackup: true}, + expected: &DevicesParser{RuntimeOperator: &RuntimeOperatorTool{UseOciBackup: true, UseCriBackup: true, + CriEndpoint: testContainerdEndpoint, OciEndpoint: testContainerdEndpoint}, Timeout: testZeroDuration}}, + } + + for _, tc := range testCases { + convey.Convey(tc.name, t, func() { + result := MakeDevicesParser(tc.opts) + convey.So(result, convey.ShouldNotBeNil) + convey.So(result.RuntimeOperator, convey.ShouldNotBeNil) + convey.So(result.Timeout, convey.ShouldEqual, tc.expected.Timeout) + }) + } +} + +func TestDevicesParserInit(t *testing.T) { + convey.Convey("TestDevicesParserInit", t, func() { + convey.Convey("should initialize successfully when runtime operator init succeeds", func() { + dp := &DevicesParser{ + RuntimeOperator: &RuntimeOperatorTool{}, + } + + patches := gomonkey.ApplyMethodReturn(dp.RuntimeOperator, "Init", nil) + defer patches.Reset() + + err := dp.Init() + convey.So(err, convey.ShouldBeNil) + }) + + convey.Convey("should return error when initialization fails", func() { + dp := &DevicesParser{ + RuntimeOperator: &RuntimeOperatorTool{}, + } + patches := gomonkey.ApplyMethodReturn(dp.RuntimeOperator, 
"Init", errors.New("init failed")) + defer patches.Reset() + err := dp.Init() + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldContainSubstring, "init failed") + }) + }) +} + +func TestDevicesParserRecvResult(t *testing.T) { + convey.Convey("TestDevicesParserRecvResult", t, func() { + convey.Convey("should return result channel when initialized", func() { + dp := &DevicesParser{ + result: make(chan DevicesInfos, 1), + } + resultChan := dp.RecvResult() + convey.So(resultChan, convey.ShouldNotBeNil) + }) + }) +} + +func TestDevicesParserRecvErr(t *testing.T) { + convey.Convey("TestDevicesParserRecvErr", t, func() { + convey.Convey("should return error channel when initialized", func() { + dp := &DevicesParser{ + err: make(chan error, 1), + } + errChan := dp.RecvErr() + convey.So(errChan, convey.ShouldNotBeNil) + }) + }) +} + +func TestDevicesParserClose(t *testing.T) { + convey.Convey("TestDevicesParserClose", t, func() { + convey.Convey("should close runtime operator when called", func() { + mockOperator := &RuntimeOperatorTool{} + dp := &DevicesParser{ + RuntimeOperator: mockOperator, + } + + visited := false + patches := gomonkey.ApplyMethod(mockOperator, "Close", func(*RuntimeOperatorTool) error { + visited = true + return nil + }) + defer patches.Reset() + + dp.Close() + convey.So(visited, convey.ShouldBeTrue) + }) + }) +} + +func TestDevicesParserParseDevices(t *testing.T) { + convey.Convey("TestDevicesParserParseDevices", t, func() { + convey.Convey("should parse isula devices when container type is isula", func() { + dp := &DevicesParser{} + mockOperator := &RuntimeOperatorTool{} + dp.RuntimeOperator = mockOperator + + patches := gomonkey.ApplyMethodReturn(mockOperator, "GetContainerType", IsulaContainer). 
+ ApplyFuncReturn((*DevicesParser).parseDeviceInIsula, nil) + defer patches.Reset() + + ctx := context.Background() + container := &CommonContainer{Id: "test-container"} + resultChan := make(chan DevicesInfo, 1) + err := dp.parseDevices(ctx, container, resultChan) + convey.So(err, convey.ShouldBeNil) + }) + + convey.Convey("should parse containerd devices when container type is not isula", func() { + dp := &DevicesParser{} + mockOperator := &RuntimeOperatorTool{} + dp.RuntimeOperator = mockOperator + + patches := gomonkey.ApplyMethodReturn(mockOperator, "GetContainerType", DefaultContainer). + ApplyFuncReturn((*DevicesParser).parseDevicesInContainerd, nil) + defer patches.Reset() + + ctx := context.Background() + container := &CommonContainer{Id: "test-container"} + resultChan := make(chan DevicesInfo, 1) + err := dp.parseDevices(ctx, container, resultChan) + convey.So(err, convey.ShouldBeNil) + }) + }) +} + +func TestDevicesParserParseDevicesInContainerd(t *testing.T) { + convey.Convey("TestDevicesParserParseDevicesInContainerd", t, func() { + convey.Convey("should return error when result channel is nil", func() { + dp := &DevicesParser{} + ctx := context.Background() + container := &CommonContainer{Id: "test-container"} + + err := dp.parseDevicesInContainerd(ctx, container, nil) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldContainSubstring, "empty result channel") + }) + + convey.Convey("should return error when get container info fails", func() { + dp := &DevicesParser{} + mockOperator := &RuntimeOperatorTool{} + dp.RuntimeOperator = mockOperator + + patches := gomonkey.ApplyMethod(mockOperator, "GetContainerInfoByID", + func(*RuntimeOperatorTool, context.Context, string) (v1.Spec, error) { + return v1.Spec{}, errors.New("get container info failed") + }) + defer patches.Reset() + + ctx := context.Background() + container := &CommonContainer{Id: "test-container"} + resultChan := make(chan DevicesInfo, 1) + + err := 
dp.parseDevicesInContainerd(ctx, container, resultChan) + convey.So(err, convey.ShouldNotBeNil) + }) + }) +} + +func TestDevicesParserGetDevicesWithoutAscendRuntime(t *testing.T) { + convey.Convey("TestDevicesParserGetDevicesWithoutAscendRuntime", t, func() { + convey.Convey("should return devices when filter succeeds", func() { + dp := &DevicesParser{} + + patches := gomonkey.ApplyFuncReturn(filterNPUDevices, []int{device0, device1, device2}, nil) + defer patches.Reset() + + patches.ApplyFuncReturn(makeUpDeviceInfo, DevicesInfo{ID: "test", Name: "test-name"}, nil) + + spec := v1.Spec{} + container := &CommonContainer{Id: "test-container"} + + result, err := dp.getDevicesWithoutAscendRuntime(spec, container) + convey.So(err, convey.ShouldBeNil) + convey.So(result.Devices, convey.ShouldResemble, []int{device0, device1, device2}) + }) + + convey.Convey("should return empty when filter fails", func() { + dp := &DevicesParser{} + + patches := gomonkey.ApplyFuncReturn(filterNPUDevices, nil, errors.New("filter failed")) + defer patches.Reset() + + spec := v1.Spec{} + container := &CommonContainer{Id: "test-container"} + + result, err := dp.getDevicesWithoutAscendRuntime(spec, container) + convey.So(err, convey.ShouldBeNil) + convey.So(result, convey.ShouldResemble, DevicesInfo{}) + }) + }) +} + +func TestDevicesParserGetDevicesWithAscendRuntime(t *testing.T) { + convey.Convey("TestDevicesParserGetDevicesWithAscendRuntime", t, func() { + convey.Convey("should return error when env format is invalid", func() { + dp := &DevicesParser{} + ascendDevEnv := "invalid-env" + container := &CommonContainer{Id: "test-container"} + + result, err := dp.getDevicesWithAscendRuntime(ascendDevEnv, container) + convey.So(err, convey.ShouldNotBeNil) + convey.So(result, convey.ShouldResemble, DevicesInfo{}) + }) + + convey.Convey("should return devices when env format is valid", func() { + dp := &DevicesParser{} + ascendDevEnv := "ASCEND_VISIBLE_DEVICES=0,1,2" + container := 
&CommonContainer{Id: "test-container"} + + patches := gomonkey.ApplyFunc(makeUpDeviceInfo, func(*CommonContainer) (DevicesInfo, error) { + return DevicesInfo{ID: "test", Name: "test-name"}, nil + }) + defer patches.Reset() + + result, err := dp.getDevicesWithAscendRuntime(ascendDevEnv, container) + convey.So(err, convey.ShouldBeNil) + convey.So(result.Devices, convey.ShouldResemble, []int{device0, device1, device2}) + }) + }) +} + +func TestDevicesParserGetDevWithoutAscendRuntimeInIsula(t *testing.T) { + convey.Convey("TestDevicesParserGetDevWithoutAscendRuntimeInIsula", t, func() { + convey.Convey("should return devices when filter succeeds", func() { + dp := &DevicesParser{} + containerInfo := isula.ContainerJson{} + container := &CommonContainer{Id: "test-container"} + + patches := gomonkey.ApplyFuncReturn(filterNPUDevicesInIsula, []int{device0, device1, device2}, nil) + defer patches.Reset() + + patches.ApplyFuncReturn(makeUpDeviceInfo, DevicesInfo{ID: "test", Name: "test-name"}, nil) + + result, err := dp.getDevWithoutAscendRuntimeInIsula(containerInfo, container) + convey.So(err, convey.ShouldBeNil) + convey.So(result.Devices, convey.ShouldResemble, []int{device0, device1, device2}) + }) + + convey.Convey("should return empty when filter fails", func() { + dp := &DevicesParser{} + containerInfo := isula.ContainerJson{} + container := &CommonContainer{Id: "test-container"} + + patches := gomonkey.ApplyFuncReturn(filterNPUDevicesInIsula, nil, errors.New("filter failed")) + defer patches.Reset() + + result, err := dp.getDevWithoutAscendRuntimeInIsula(containerInfo, container) + convey.So(err, convey.ShouldBeNil) + convey.So(result, convey.ShouldResemble, DevicesInfo{}) + }) + }) +} + +func TestDevicesParserParseDeviceInIsula(t *testing.T) { + convey.Convey("TestDevicesParserParseDeviceInIsula", t, func() { + convey.Convey("should return error when result channel is nil", func() { + dp := &DevicesParser{} + ctx := context.Background() + container := 
&CommonContainer{Id: "test-container"} + + err := dp.parseDeviceInIsula(ctx, container, nil) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldContainSubstring, "empty result channel") + }) + + convey.Convey("should return error when container id is too long", func() { + dp := &DevicesParser{} + ctx := context.Background() + longId := string(make([]byte, maxCgroupPath+1)) + container := &CommonContainer{Id: longId} + resultChan := make(chan DevicesInfo, 1) + + err := dp.parseDeviceInIsula(ctx, container, resultChan) + convey.So(err, convey.ShouldNotBeNil) + }) + }) +} + +func TestDevicesParserCollect(t *testing.T) { + convey.Convey("TestDevicesParserCollect", t, func() { + convey.Convey("should return error when receiving channel is nil", func() { + dp := &DevicesParser{} + ctx := context.Background() + + result, err := dp.collect(ctx, nil, 1) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldContainSubstring, "receiving channel is empty") + convey.So(result, convey.ShouldBeNil) + }) + + convey.Convey("should return nil when count is negative", func() { + dp := &DevicesParser{} + ctx := context.Background() + resultChan := make(chan DevicesInfo) + + result, err := dp.collect(ctx, resultChan, -1) + convey.So(err, convey.ShouldBeNil) + convey.So(result, convey.ShouldBeNil) + }) + }) +} + +func TestDevicesParserDoParse(t *testing.T) { + convey.Convey("TestDevicesParserDoParse", t, func() { + const time100ms = 100 * time.Millisecond + convey.Convey("should handle error when get containers fails", func() { + dp := &DevicesParser{ + err: make(chan error, 1), + } + mockOperator := &RuntimeOperatorTool{} + dp.RuntimeOperator = mockOperator + + patches := gomonkey.ApplyMethod(mockOperator, "GetContainers", + func(*RuntimeOperatorTool, context.Context) ([]*CommonContainer, error) { + return nil, errors.New("get containers failed") + }) + defer patches.Reset() + + resultChan := make(chan DevicesInfos, 1) + 
dp.doParse(resultChan) + + select { + case err := <-dp.err: + convey.So(err, convey.ShouldNotBeNil) + case <-time.After(time100ms): + convey.So("timeout", convey.ShouldEqual, "should receive error") + } + }) + }) +} + +func TestDevicesParserFetchAndParse(t *testing.T) { + const time10ms = 10 * time.Millisecond + convey.Convey("TestDevicesParserFetchAndParse", t, func() { + convey.Convey("should return early when err channel is nil", func() { + dp := &DevicesParser{ + err: nil, + } + visited := make(chan bool, 1) + patches := gomonkey.ApplyPrivateMethod(dp, "doParse", + func(*DevicesParser, chan<- DevicesInfos) error { + visited <- true + return nil + }) + defer patches.Reset() + + dp.FetchAndParse(nil) + time.Sleep(time10ms) + convey.So(len(visited), convey.ShouldEqual, 0) + }) + + convey.Convey("should start parsing when initialized", func() { + dp := &DevicesParser{ + err: make(chan error, 1), + RuntimeOperator: &RuntimeOperatorTool{}, + } + visited := make(chan bool, 1) + patches := gomonkey.ApplyPrivateMethod(dp, "doParse", + func(*DevicesParser, chan<- DevicesInfos) error { + visited <- true + return nil + }) + defer patches.Reset() + + dp.FetchAndParse(nil) + time.Sleep(time10ms) + convey.So(len(visited), convey.ShouldEqual, 1) + }) + }) +} + +func TestDevicesParserGetDeviceIDsByMinusStyle(t *testing.T) { + convey.Convey("TestDevicesParserGetDeviceIDsByMinusStyle", t, func() { + testCases := []struct { + name string + devices string + expected []int + }{ + {name: "should return empty slice when devices string is invalid", devices: "invalid-devices", expected: []int{}}, + {name: "should return empty slice when min device ID is invalid", devices: "invalid-5", expected: []int{}}, + {name: "should return empty slice when max device ID is invalid", devices: "0-invalid", expected: []int{}}, + {name: "should return empty slice when min ID is bigger than max ID", devices: "5-3", expected: []int{}}, + {name: "should return empty slice when max ID is too large", 
devices: "0-99999", expected: []int{}}, + {name: "should return device IDs when range is valid", devices: "0-2", expected: []int{0, 1, 2}}, + {name: "should return single device ID when min equals max", devices: "1-1", expected: []int{1}}, + } + for _, tc := range testCases { + convey.Convey(tc.name, func() { + dp := &DevicesParser{} + result := dp.getDeviceIDsByMinusStyle(tc.devices, "test-container") + convey.So(result, convey.ShouldResemble, tc.expected) + }) + } + }) +} + +func TestGetNPUMajorID(t *testing.T) { + testCases := builderTestGetNPUMajorIDCases() + for _, tc := range testCases { + convey.Convey(tc.name, t, func() { + _, cleanup := tc.setup(t) + defer cleanup() + result, err := getNPUMajorID() + if tc.hasError { + convey.So(err, convey.ShouldNotBeNil) + } else { + convey.So(err, convey.ShouldBeNil) + } + convey.So(result, convey.ShouldResemble, tc.expected) + }) + } +} + +type TestGetNPUMajorIDCase struct { + name string + setup func(*testing.T) (*gomonkey.Patches, func()) + expected []string + hasError bool +} + +func builderTestGetNPUMajorIDCases() []TestGetNPUMajorIDCase { + testCases := []TestGetNPUMajorIDCase{{name: "should return error when path check fails", + setup: func(*testing.T) (*gomonkey.Patches, func()) { + patches := gomonkey.ApplyFuncReturn(utils.CheckPath, "", errors.New("path check failed")) + return patches, func() { patches.Reset() } + }, expected: nil, hasError: true}, + {name: "should return error when file open fails", + setup: func(*testing.T) (*gomonkey.Patches, func()) { + p1 := gomonkey.ApplyFuncReturn(utils.CheckPath, "/proc/devices", nil) + p1.ApplyFuncReturn(os.Open, nil, errors.New("file open failed")) + return p1, func() { p1.Reset() } + }, expected: []string{}, hasError: true}, + {name: "should return empty slice when no NPU devices found", + setup: func(t *testing.T) (*gomonkey.Patches, func()) { + tmpFile, clean, err := mkTemp("1 mem\n2 pty\n") + if err != nil { + t.Fatalf("failed to create temp file: %v", err) + } 
+ p1 := gomonkey.ApplyFuncReturn(utils.CheckPath, tmpFile, nil) + return p1, func() { clean(); p1.Reset() } + }, expected: []string{}, hasError: false}, + {name: "should return major IDs when NPU devices found", + setup: func(t *testing.T) (*gomonkey.Patches, func()) { + tmpFile, clean, err := mkTemp("195 devdrv-cdev\n196 devdrv-cdev\n") + if err != nil { + t.Fatalf("failed to create temp file: %v", err) + } + p1 := gomonkey.ApplyFuncReturn(utils.CheckPath, tmpFile, nil) + return p1, func() { clean(); p1.Reset() } + }, expected: []string{"195", "196"}, hasError: false}, + {name: "should return major IDs when mixed devices found", + setup: func(t *testing.T) (*gomonkey.Patches, func()) { + tmpFile, clean, err := mkTemp("1 mem\n195 devdrv-cdev\n2 pty\n196 devdrv-cdev\n") + if err != nil { + t.Fatalf("failed to create temp file: %v", err) + } + p1 := gomonkey.ApplyFuncReturn(utils.CheckPath, tmpFile, nil) + return p1, func() { clean(); p1.Reset() } + }, expected: []string{"195", "196"}, hasError: false}, + } + return testCases +} + +func TestNpuMajor(t *testing.T) { + convey.Convey("TestNpuMajor", t, func() { + convey.Convey("should return cached major IDs", func() { + patches := gomonkey.ApplyFuncReturn(getNPUMajorID, []string{"123", "456"}, nil) + defer patches.Reset() + + result := npuMajor() + convey.So(result, convey.ShouldNotBeNil) + }) + }) +} + +func TestFilterNPUDevices(t *testing.T) { + convey.Convey("TestFilterNPUDevices", t, func() { + const mockMajorID = 236 + convey.Convey("should return error when spec is empty", func() { + spec := v1.Spec{} + result, err := filterNPUDevices(spec) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldContainSubstring, "empty spec info") + convey.So(result, convey.ShouldBeNil) + }) + + convey.Convey("should return devices when spec is valid", func() { + spec := v1.Spec{ + Linux: &v1.Linux{ + Resources: &v1.LinuxResources{ + Devices: []v1.LinuxDeviceCgroup{{Type: "c", Major: int64Ptr(mockMajorID), 
Minor: int64Ptr(0)}}, + }, + }, + } + patches := gomonkey.ApplyFuncReturn(npuMajor, []string{"236"}) + defer patches.Reset() + + result, err := filterNPUDevices(spec) + convey.So(err, convey.ShouldBeNil) + convey.So(result, convey.ShouldNotBeNil) + }) + }) +} + +// mkTemp creates a temporary file with the given content and returns the file name, +// a cleanup function, and an error. The file stays open (rewound to offset 0) until the cleanup function closes and removes it. +func mkTemp(content string) (string, func(), error) { + f, err := os.CreateTemp("", "test_*") + if err != nil { + return "", func() {}, err + } + if _, err = f.WriteString(content); err != nil { + clean(f) + return "", func() {}, err + } + if _, err = f.Seek(0, 0); err != nil { + clean(f) + return "", func() {}, err + } + name := f.Name() + return name, func() { clean(f) }, nil +} + +func clean(f *os.File) { + if f == nil { + return + } + if err := f.Close(); err != nil { + logger.Errorf("an error occurred where close file [%v],err :%v", f.Name(), err) + } + if err := os.Remove(f.Name()); err != nil { + logger.Errorf("an error occurred where remove file [%v],err :%v", f.Name(), err) + } +} + +func TestFilterNPUDevicesInIsula(t *testing.T) { + convey.Convey("TestFilterNPUDevicesInIsula", t, func() { + convey.Convey("should return error when container is privileged", func() { + containerInfo := isula.ContainerJson{ + HostConfig: &isula.HostConfig{ + Privileged: true, + }, + } + + result, err := filterNPUDevicesInIsula(containerInfo) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldContainSubstring, "privileged container") + convey.So(result, convey.ShouldBeNil) + }) + + convey.Convey("should return devices when container is not privileged", func() { + containerInfo := isula.ContainerJson{ + HostConfig: &isula.HostConfig{ + Privileged: false, + Devices: []isula.DeviceInfo{ + { + PathInContainer: "/dev/npu0", + }, + }, + }, + } + + patches := gomonkey.ApplyFuncReturn(getDevIdFromPath, 0, nil) + defer patches.Reset() + + 
result, err := filterNPUDevicesInIsula(containerInfo) + convey.So(err, convey.ShouldBeNil) + convey.So(result, convey.ShouldNotBeNil) + }) + }) +} + +// Helper function for creating int64 pointers +func int64Ptr(v int64) *int64 { + return &v +} + +func TestParseDiffEnvFmt(t *testing.T) { + convey.Convey("TestParseDiffEnvFmt", t, func() { + dp := &DevicesParser{} + testCases := []struct { + name string + devices string + containerID string + expected []int + }{ + {name: "should parse comma style devices when valid", + devices: testDeviceComma, + containerID: "test-container", + expected: []int{device0, device1, device2}, + }, + {name: "should parse minus style devices when valid", + devices: testDeviceRange, + containerID: "test-container", + expected: []int{device0, device1, device2}, + }, + {name: "should parse ascend style devices when valid", + devices: testAscendDevices, + containerID: "test-container", + expected: []int{device0, device1}, + }, + {name: "should parse comma minus style devices when valid", + devices: testDeviceCommaRange, + containerID: "test-container", + expected: []int{device0, device1, device2, device3}, + }, + {name: "should return empty slice when devices are empty", + devices: "", + containerID: "test-container", + expected: []int{}, + }, + } + + for _, tc := range testCases { + convey.Convey(tc.name, func() { + result := dp.parseDiffEnvFmt(tc.devices, tc.containerID) + convey.So(result, convey.ShouldResemble, tc.expected) + }) + } + }) +} + +func TestGetDeviceIDsByCommaStyle(t *testing.T) { + convey.Convey("TestGetDeviceIDsByCommaStyle", t, func() { + dp := &DevicesParser{} + testCases := []struct { + name string + devices string + containerID string + expected []int + }{ + {name: "should parse comma separated devices when valid", + devices: "0,1,2,3", + containerID: "test-container", + expected: []int{device0, device1, device2, device3}, + }, + {name: "should parse single device when valid", + devices: "0", + containerID: 
"test-container", + expected: []int{device0}, + }, + {name: "should return empty slice when devices are empty", + devices: "", + containerID: "test-container", + expected: []int{}, + }, + {name: "should parse devices with spaces when valid", + devices: testDeviceComma, + containerID: "test-container", + expected: []int{device0, device1, device2}, + }, + } + + for _, tc := range testCases { + convey.Convey(tc.name, func() { + result := dp.getDeviceIDsByCommaStyle(tc.devices, tc.containerID) + convey.So(result, convey.ShouldResemble, tc.expected) + }) + } + }) +} + +func TestGetDeviceIDsByAscendStyle(t *testing.T) { + convey.Convey("TestGetDeviceIDsByAscendStyle", t, func() { + dp := &DevicesParser{} + + testCases := []struct { + name string + devices string + containerID string + expected []int + }{ + { + name: "should parse ascend devices when valid", + devices: "Ascend-0,Ascend-1,Ascend-2", + containerID: "test-container", + expected: []int{device0, device1, device2}, + }, + { + name: "should parse single ascend device when valid", + devices: testAscendDevice0, + containerID: "test-container", + expected: []int{0}, + }, + { + name: "should return empty slice when devices are empty", + devices: "", + containerID: "test-container", + expected: []int{}, + }, + { + name: "should parse mixed case ascend devices when valid", + devices: "ascend-0,ASCEND-1", + containerID: "test-container", + expected: []int{device0, device1}, + }, + } + + for _, tc := range testCases { + convey.Convey(tc.name, func() { + result := dp.getDeviceIDsByAscendStyle(tc.devices, tc.containerID) + convey.So(result, convey.ShouldResemble, tc.expected) + }) + } + }) +} + +func TestGetDeviceIDsByMinusStyle(t *testing.T) { + convey.Convey("TestGetDeviceIDsByMinusStyle", t, func() { + dp := &DevicesParser{} + + testCases := []struct { + name string + devices string + containerID string + expected []int + }{ + { + name: "should parse range devices when valid", + devices: "0-3", + containerID: 
"test-container", + expected: []int{device0, device1, device2, device3}, + }, + { + name: "should parse single device range when valid", + devices: "0-0", + containerID: "test-container", + expected: []int{device0}, + }, + { + name: "should return empty slice when devices are empty", + devices: "", + containerID: "test-container", + expected: []int{}, + }, + } + + for _, tc := range testCases { + convey.Convey(tc.name, func() { + result := dp.getDeviceIDsByMinusStyle(tc.devices, tc.containerID) + convey.So(result, convey.ShouldResemble, tc.expected) + }) + } + }) +} + +func TestGetDeviceIDsByCommaMinusStyle(t *testing.T) { + convey.Convey("TestGetDeviceIDsByCommaMinusStyle", t, func() { + dp := &DevicesParser{} + + testCases := []struct { + name string + devices string + containerID string + expected []int + }{ + { + name: "should parse comma minus devices when valid", + devices: testDeviceCommaRange, + containerID: "test-container", + expected: []int{device0, device1, device2, device3}, + }, + { + name: "should parse single range when valid", + devices: testDeviceRange, + containerID: "test-container", + expected: []int{device0, device1, device2}, + }, + { + name: "should return nil when devices are empty", + devices: "", + containerID: "test-container", + expected: nil, + }, + { + name: "should parse mixed ranges when valid", + devices: testMixedDevices, + containerID: "test-container", + expected: []int{device0, device1, device3}, + }, + } + + for _, tc := range testCases { + convey.Convey(tc.name, func() { + result := dp.getDeviceIDsByCommaMinusStyle(tc.devices, tc.containerID) + convey.So(result, convey.ShouldResemble, tc.expected) + }) + } + }) +} + +func TestContains(t *testing.T) { + convey.Convey("TestContains", t, func() { + testCases := []struct { + name string + slice []string + target string + expected bool + }{ + { + name: "should return true when target exists in slice", + slice: []string{"a", "b", "c"}, + target: "b", + expected: true, + }, + { + 
name: "should return false when target does not exist in slice", + slice: []string{"a", "b", "c"}, + target: "d", + expected: false, + }, + { + name: "should return false when slice is empty", + slice: []string{}, + target: "a", + expected: false, + }, + { + name: "should return false when slice is nil", + slice: nil, + target: "a", + expected: false, + }, + { + name: "should return false when target is empty string", + slice: []string{"a", "b", "c"}, + target: "", + expected: false, + }, + } + + for _, tc := range testCases { + convey.Convey(tc.name, func() { + result := contains(tc.slice, tc.target) + convey.So(result, convey.ShouldEqual, tc.expected) + }) + } + }) +} + +func TestContactError(t *testing.T) { + convey.Convey("TestContactError", t, func() { + testCases := []struct { + name string + err error + msg string + expected string + }{ + { + name: "should concatenate error with message when both provided", + err: errors.New(testOriginalError), + msg: testErrorMessage, + expected: testContactedError, + }, + } + + for _, tc := range testCases { + convey.Convey(tc.name, func() { + result := contactError(tc.err, tc.msg) + convey.So(result.Error(), convey.ShouldEqual, tc.expected) + }) + } + }) +} + +func TestGetDevIdFromPath(t *testing.T) { + convey.Convey("TestGetDevIdFromPath", t, func() { + testCases := []struct { + name string + pattern string + path string + expected int + hasError bool + }{ + {name: "should extract device id when path is valid", + pattern: testDevicePattern, + path: "/dev/npu0", + expected: 0, + hasError: false, + }, + {name: "should extract device id when path has multiple digits", + pattern: testDevicePattern, + path: "/dev/npu123", + expected: 123, + hasError: false, + }, + {name: "should return error when device path is invalid", + pattern: testDevicePattern, + path: "/dev/cpu0", + expected: 0, + hasError: true, + }, + {name: "should return error when path is empty", + pattern: testDevicePattern, + path: "", + expected: 0, + hasError: 
true, + }, + } + + for _, tc := range testCases { + convey.Convey(tc.name, func() { + result, err := getDevIdFromPath(tc.pattern, tc.path) + if tc.hasError { + convey.So(err, convey.ShouldNotBeNil) + } else { + convey.So(err, convey.ShouldBeNil) + convey.So(result, convey.ShouldEqual, tc.expected) + } + }) + } + }) +} + +func TestWithDefault(t *testing.T) { + convey.Convey("TestWithDefault", t, func() { + const time0s = 0 + const time3s = 3 * time.Second + const time5s = 5 * time.Second + testCases := []struct { + name string + v time.Duration + d time.Duration + expected time.Duration + }{ + {name: "should return default when duration is zero", + v: time0s, + d: time5s, + expected: time5s, + }, + {name: "should return value when duration is non-zero", + v: time3s, + d: time5s, + expected: time3s, + }, + {name: "should return value when duration is negative", + v: -1 * time.Second, + d: time5s, + expected: -1 * time.Second, + }, + } + + for _, tc := range testCases { + convey.Convey(tc.name, func() { + result := withDefault(tc.v, tc.d) + convey.So(result, convey.ShouldEqual, tc.expected) + }) + } + }) +} diff --git a/mind-cluster/component/npu-exporter/collector/container/runtime_ops.go b/mind-cluster/component/npu-exporter/collector/container/runtime_ops.go new file mode 100644 index 0000000..daab834 --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/container/runtime_ops.go @@ -0,0 +1,413 @@ +/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+// Package container for monitoring containers' npu allocation
+package container
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"strings"
+	"syscall"
+
+	"ascend-common/common-utils/hwlog"
+	"ascend-common/common-utils/utils"
+	"google.golang.org/grpc"
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/metadata"
+	"google.golang.org/grpc/status"
+	criv1 "k8s.io/cri-api/pkg/apis/runtime/v1"
+	"k8s.io/cri-api/pkg/apis/runtime/v1alpha2"
+
+	"huawei.com/npu-exporter/v6/collector/container/isula"
+	"huawei.com/npu-exporter/v6/collector/container/v1"
+	"huawei.com/npu-exporter/v6/utils/logger"
+)
+
+const (
+	labelK8sPodNamespace = "io.kubernetes.pod.namespace"
+	labelK8sPodName      = "io.kubernetes.pod.name"
+	labelContainerName   = "io.kubernetes.container.name"
+
+	// DefaultIsuladAddr is the default isulad socket address
+	DefaultIsuladAddr = "unix:///run/isulad.sock"
+	// DefaultDockerShim is the default docker shim socket address
+	DefaultDockerShim = "unix:///run/dockershim.sock"
+	// DefaultCRIDockerd is the default cri-dockerd socket address
+	DefaultCRIDockerd = "unix:///run/cri-dockerd.sock"
+	// DefaultContainerdAddr is the default containerd socket address
+	DefaultContainerdAddr = "unix:///run/containerd/containerd.sock"
+	// DefaultDockerAddr is the default docker containerd socket address
+	DefaultDockerAddr    = "unix:///run/docker/containerd/docker-containerd.sock"
+	defaultDockerOnEuler = "unix:///run/docker/containerd/containerd.sock"
+	grpcHeader           = "containerd-namespace"
+	unixPre              = "unix://"
+
+	// IsulaContainer is the container type reported when the OCI endpoint is isulad
+	IsulaContainer = "isula"
+	// DefaultContainer is the container type reported for every other OCI endpoint
+	DefaultContainer   = "docker-containerd"
+	excludePermissions = 0002
+
+	criV1alpha2 = "runtime.v1alpha2.RuntimeService"
+)
+
+// CommonContainer holds the container attributes shared by the isulad and containerd listings
+type CommonContainer struct {
+	Id
string
+	Labels map[string]string
+}
+
+// RuntimeOperator wraps operations against container runtime
+type RuntimeOperator interface {
+	Init() error
+	Close() error
+	GetContainers(ctx context.Context) ([]*CommonContainer, error)
+	GetContainerInfoByID(ctx context.Context, id string) (v1.Spec, error)
+	GetIsulaContainerInfoByID(ctx context.Context, id string) (isula.ContainerJson, error)
+	GetContainerType() string
+}
+
+// RuntimeOperatorTool implements RuntimeOperator interface
+type RuntimeOperatorTool struct {
+	criConn   *grpc.ClientConn
+	conn      *grpc.ClientConn
+	criClient interface{}
+	client    interface{}
+	// CriEndpoint CRI server endpoint
+	CriEndpoint string
+	// OciEndpoint containerd Server endpoint
+	OciEndpoint string
+	// Namespace the namespace of containerd
+	Namespace string
+	// UseCriBackup use cri back up address or not
+	UseCriBackup bool
+	// UseOciBackup use oci back up address or not
+	UseOciBackup bool
+}
+
+// Init raises the uid to root when needed, checks both sockets and connects the CRI and OCI clients
+func (operator *RuntimeOperatorTool) Init() error {
+	start := syscall.Getuid()
+	logger.Debugf("the init uid is:%d", start)
+	if start != 0 {
+		err := syscall.Setuid(0)
+		if err != nil {
+			return fmt.Errorf("raise uid failed: %w", err)
+		}
+		logger.Debugf("raise uid to:%d", 0)
+		defer func() {
+			if err = syscall.Setuid(start); err != nil {
+				logger.Errorf("recover uid failed: %v", err)
+				return // do not report a recovery that did not happen
+			}
+			logger.Debugf("recover uid to:%d", start)
+		}()
+	}
+	if err := sockCheck(operator); err != nil {
+		hwlog.RunLog.Error("check socket path failed")
+		return err
+	}
+
+	if err := operator.initCriClient(); err != nil {
+		return fmt.Errorf("init CRI client failed, %w", err) // %w keeps the cause reachable via errors.Is/As
+	}
+
+	if err := operator.initOciClient(); err != nil {
+		return fmt.Errorf("init OCI client failed, %w", err) // %w keeps the cause reachable via errors.Is/As
+	}
+	return nil
+}
+
+func (operator *RuntimeOperatorTool) initCriClient() error {
+	criConn, err := GetConnection(operator.CriEndpoint)
+	if err != nil || criConn == nil {
+		msg := fmt.Sprintf("connecting to CRI server
failed: %v", err)
+		if operator.UseCriBackup {
+			logger.Warnf("%v, will use cri-dockerd address to try again", msg)
+			if utils.IsExist(strings.TrimPrefix(DefaultCRIDockerd, unixPre)) {
+				criConn, err = GetConnection(DefaultCRIDockerd)
+			}
+		} else {
+			logger.Warn(msg)
+		}
+	}
+	if err != nil || criConn == nil { // same failure condition as above: never build a client on a nil connection
+		return fmt.Errorf("connecting to CRI server failed: %v", err)
+	}
+	if operator.CriEndpoint == DefaultIsuladAddr {
+		operator.criClient = isula.NewRuntimeServiceClient(criConn)
+	} else {
+		operator.criClient = v1alpha2.NewRuntimeServiceClient(criConn)
+	}
+	operator.criConn = criConn
+	return nil
+}
+
+func (operator *RuntimeOperatorTool) initOciClient() error {
+	conn, err := GetConnection(operator.OciEndpoint)
+	if err != nil || conn == nil {
+		msg := fmt.Sprintf("failed to get OCI connection: %v", err)
+		if operator.UseOciBackup {
+			logger.Warnf("%v, will use backup address to try again", msg)
+			if utils.IsExist(strings.TrimPrefix(DefaultContainerdAddr, unixPre)) {
+				conn, err = GetConnection(DefaultContainerdAddr)
+
+			} else if utils.IsExist(strings.TrimPrefix(defaultDockerOnEuler, unixPre)) {
+				conn, err = GetConnection(defaultDockerOnEuler)
+			}
+		} else {
+			logger.Warn(msg)
+		}
+	}
+	if err != nil || conn == nil { // same failure condition as above: never build a client on a nil connection
+		return fmt.Errorf("connecting to OCI server failed: %v", err)
+	}
+	if operator.OciEndpoint == DefaultIsuladAddr {
+		operator.client = isula.NewContainerServiceClient(conn)
+	} else {
+		operator.client = v1.NewContainersClient(conn)
+	}
+	operator.conn = conn
+	return nil
+}
+
+func sockCheck(operator *RuntimeOperatorTool) error {
+	absPath, err := utils.CheckPath(strings.TrimPrefix(operator.CriEndpoint, unixPre))
+	if err != nil {
+		return err
+	}
+	if err := utils.DoCheckOwnerAndPermission(absPath, excludePermissions, 0); err != nil {
+		return err
+	}
+
+	absPath, err = utils.CheckPath(strings.TrimPrefix(operator.OciEndpoint, unixPre))
+	if err != nil {
+		return err
+	}
+	if err := utils.DoCheckOwnerAndPermission(absPath, excludePermissions, 0); err != nil {
+		return
err
+	}
+	return nil
+}
+
+// Close closes both gRPC connections, skipping unset ones, and returns the first close error
+func (operator *RuntimeOperatorTool) Close() (err error) {
+	for _, c := range []*grpc.ClientConn{operator.conn, operator.criConn} {
+		if c == nil {
+			continue // Init failed before this connection was made, nothing to release
+		}
+		if cerr := c.Close(); cerr != nil && err == nil {
+			err = cerr // remember it, but keep going so the other connection is still closed
+		}
+	}
+	return err
+}
+
+// GetContainers lists running containers over CRI, retrying with the v1 API when v1alpha2 is unimplemented
+func (operator *RuntimeOperatorTool) GetContainers(ctx context.Context) ([]*CommonContainer, error) {
+	if utils.IsNil(operator.criClient) || operator.criConn == nil {
+		return nil, errors.New("criClient is empty")
+	}
+	if client, ok := operator.criClient.(v1alpha2.RuntimeServiceClient); ok {
+		containers, err := getContainersByContainerdV1alpha2(ctx, client)
+		if isUnimplementedError(err, criV1alpha2) {
+			v1Client := criv1.NewRuntimeServiceClient(operator.criConn)
+			return getContainersByContainerdV1(ctx, v1Client)
+		}
+		return containers, err
+	}
+	if client, ok := operator.criClient.(isula.RuntimeServiceClient); ok {
+		return getContainersByIsulad(ctx, client)
+	}
+
+	logger.Errorf("client %v is unexpected", operator.criClient)
+	return nil, errors.New("unexpected client type")
+}
+
+func isUnimplementedError(err error, serviceName string) bool {
+	if err == nil {
+		return false
+	}
+	st, ok := status.FromError(err)
+	if ok {
+		return st.Code() == codes.Unimplemented && strings.Contains(st.Message(), serviceName)
+	}
+	errStr := err.Error()
+	if strings.Contains(errStr, "code = Unimplemented") &&
+		strings.Contains(errStr, "desc = ") && strings.Contains(errStr, serviceName) {
+		return true
+	}
+	return false
+}
+
+// GetContainerInfoByID use oci interface to get container
+func (operator *RuntimeOperatorTool) GetContainerInfoByID(ctx context.Context, id string) (v1.Spec, error) {
+	if utils.IsNil(operator.client) || operator.conn == nil {
+		return v1.Spec{}, errors.New("oci client is empty")
+	}
+
+	s := v1.Spec{}
+	if client, ok := operator.client.(v1.ContainersClient); ok {
+		resp, err := client.Get(setGrpcNamespaceHeader(ctx,
operator.Namespace), &v1.GetContainerRequest{
+			Id: id,
+		})
+		if err != nil {
+			hwlog.RunLog.Error("get call OCI get method failed")
+			return v1.Spec{}, err
+		}
+		if err = json.Unmarshal(resp.Container.Spec.Value, &s); err != nil {
+			hwlog.RunLog.Error("unmarshal OCI response failed")
+			return v1.Spec{}, err
+		}
+		return s, nil
+	}
+
+	return s, errors.New("unexpected containerd client")
+}
+
+// GetIsulaContainerInfoByID inspects the container through the isula client and decodes the returned JSON
+func (operator *RuntimeOperatorTool) GetIsulaContainerInfoByID(ctx context.Context,
+	id string) (isula.ContainerJson, error) {
+	containerJsonInfo := isula.ContainerJson{}
+	if utils.IsNil(operator.client) || operator.conn == nil {
+		return containerJsonInfo, errors.New("oci client is empty")
+	}
+
+	if client, ok := operator.client.(isula.ContainerServiceClient); ok {
+		resp, err := client.Inspect(setGrpcNamespaceHeader(ctx, operator.Namespace), &isula.InspectContainerRequest{
+			Id: id,
+		})
+		if err != nil {
+			hwlog.RunLog.Error("call isula OCI Inspect method failed")
+			return containerJsonInfo, err
+		}
+		if err = json.Unmarshal([]byte(resp.ContainerJSON), &containerJsonInfo); err != nil {
+			logger.Errorf("unmarshal err: %v", err)
+			return containerJsonInfo, err
+		}
+		return containerJsonInfo, nil
+	}
+
+	return containerJsonInfo, errors.New("unexpected isula client")
+}
+
+// GetContainerType returns IsulaContainer for the isulad OCI endpoint and DefaultContainer otherwise
+func (operator *RuntimeOperatorTool) GetContainerType() string {
+	if operator.OciEndpoint == DefaultIsuladAddr {
+		return IsulaContainer
+	}
+	return DefaultContainer
+}
+
+type nsKey struct{} // context key under which the namespace value is stored
+
+func setGrpcNamespaceHeader(ctx context.Context, namespace string) context.Context {
+	ctx = context.WithValue(ctx, nsKey{}, namespace) // WithValue returns a new context; keep it instead of dropping it
+	ns := metadata.Pairs(grpcHeader, namespace)
+	md, ok := metadata.FromOutgoingContext(ctx)
+	if !ok {
+		md = ns
+	} else {
+		md = metadata.Join(ns, md) // ns first, then whatever metadata the caller already attached
+	}
+	return metadata.NewOutgoingContext(ctx, md)
+}
+
+func getContainersByContainerdV1alpha2(ctx context.Context,
+	client
v1alpha2.RuntimeServiceClient) ([]*CommonContainer, error) {
+	// the request built by genContainerRequestV1alpha2 filters on the running state
+	resp, err := client.ListContainers(ctx, genContainerRequestV1alpha2())
+	if err != nil {
+		hwlog.RunLog.Warn(err)
+		return nil, err
+	}
+	var found []*CommonContainer
+	for _, item := range resp.Containers {
+		found = append(found, &CommonContainer{
+			Id:     item.Id,
+			Labels: item.Labels,
+		})
+	}
+	return found, nil
+}
+
+func getContainersByContainerdV1(ctx context.Context, client criv1.RuntimeServiceClient) ([]*CommonContainer, error) {
+	// the request built by genContainerRequestV1 filters on the running state
+	resp, err := client.ListContainers(ctx, genContainerRequestV1())
+	if err != nil {
+		hwlog.RunLog.Error(err)
+		return nil, err
+	}
+	var found []*CommonContainer
+	for _, item := range resp.Containers {
+		found = append(found, &CommonContainer{
+			Id:     item.Id,
+			Labels: item.Labels,
+		})
+	}
+	return found, nil
+}
+
+func getContainersByIsulad(ctx context.Context, client isula.RuntimeServiceClient) ([]*CommonContainer, error) {
+	// the request built by genIsulaRequest filters on the running state
+	resp, err := client.ListContainers(ctx, genIsulaRequest())
+	if err != nil {
+		hwlog.RunLog.Error(err)
+		return nil, err
+	}
+	var found []*CommonContainer
+	for _, item := range resp.Containers {
+		found = append(found, &CommonContainer{
+			Id:     item.Id,
+			Labels: item.Labels,
+		})
+	}
+	return found, nil
+}
+
+func genContainerRequestV1alpha2() *v1alpha2.ListContainersRequest {
+	running := &v1alpha2.ContainerStateValue{
+		State: v1alpha2.ContainerState_CONTAINER_RUNNING,
+	}
+	return &v1alpha2.ListContainersRequest{
+		Filter: &v1alpha2.ContainerFilter{
+			State: running,
+		},
+	}
+}
+
+func genContainerRequestV1() *criv1.ListContainersRequest {
+	filter := &criv1.ContainerFilter{}
+	st := &criv1.ContainerStateValue{}
+	st.State = criv1.ContainerState_CONTAINER_RUNNING
+	filter.State = st
+	request :=
&criv1.ListContainersRequest{
+		Filter: filter,
+	}
+	return request
+}
+
+func genIsulaRequest() *isula.ListContainersRequest {
+	running := &isula.ContainerStateValue{
+		State: isula.ContainerState_CONTAINER_RUNNING,
+	}
+	return &isula.ListContainersRequest{
+		Filter: &isula.ContainerFilter{
+			State: running,
+		},
+	}
+}
diff --git a/mind-cluster/component/npu-exporter/collector/container/runtime_ops_test.go b/mind-cluster/component/npu-exporter/collector/container/runtime_ops_test.go
new file mode 100644
index 0000000..2bc135c
--- /dev/null
+++ b/mind-cluster/component/npu-exporter/collector/container/runtime_ops_test.go
@@ -0,0 +1,568 @@
+/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+// Package container provides utilities for container monitoring and testing.
+package container + +import ( + "context" + "errors" + "fmt" + "testing" + + "github.com/agiledragon/gomonkey/v2" + "github.com/smartystreets/goconvey/convey" + "google.golang.org/grpc" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" + criv1 "k8s.io/cri-api/pkg/apis/runtime/v1" + "k8s.io/cri-api/pkg/apis/runtime/v1alpha2" + + "ascend-common/common-utils/utils" + "huawei.com/npu-exporter/v6/collector/container/isula" + "huawei.com/npu-exporter/v6/collector/container/v1" +) + +const ( + // Test constants for runtime operations + testNamespace = "test-namespace" + + // Test error messages + testInitCriError = "init CRI client failed" + testInitOciError = "init OCI client failed" + testSockCheckError = "socket check failed" + testCriClientEmptyError = "criClient is empty" + testOciClientEmptyError = "oci client is empty" + testUnexpectedClientError = "unexpected client type" + testUnexpectedContainerdClientError = "unexpected containerd client" + testUnexpectedIsulaClientError = "unexpected isula client" + testCriV1alpha2 = "runtime.v1alpha2.RuntimeService" + testCriV1 = "runtime.v1.RuntimeService" +) + +func TestRuntimeOperatorToolInit(t *testing.T) { + r := &RuntimeOperatorTool{ + CriEndpoint: testContainerdEndpoint, + OciEndpoint: testContainerdEndpoint, + } + convey.Convey("should initialize successfully when all components succeed", t, func() { + operator := r + patches := gomonkey.ApplyFuncReturn(sockCheck, nil) + defer patches.Reset() + patches.ApplyFuncReturn((*RuntimeOperatorTool).initCriClient, nil) + patches.ApplyFuncReturn((*RuntimeOperatorTool).initOciClient, nil) + err := operator.Init() + convey.So(err, convey.ShouldBeNil) + }) + convey.Convey("should return error when socket check fails", t, func() { + operator := r + patches := gomonkey.ApplyFuncReturn(sockCheck, errors.New(testSockCheckError)) + defer patches.Reset() + err := operator.Init() + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), 
convey.ShouldContainSubstring, testSockCheckError) + }) + convey.Convey("should return error when CRI client init fails", t, func() { + operator := r + patches := gomonkey.ApplyFuncReturn(sockCheck, nil) + defer patches.Reset() + patches.ApplyFuncReturn((*RuntimeOperatorTool).initCriClient, errors.New(testInitCriError)) + patches.ApplyFuncReturn((*RuntimeOperatorTool).initOciClient, nil) + err := operator.Init() + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldContainSubstring, testInitCriError) + }) + convey.Convey("should return error when OCI client init fails", t, func() { + operator := r + patches := gomonkey.ApplyFuncReturn(sockCheck, nil) + defer patches.Reset() + patches.ApplyFuncReturn((*RuntimeOperatorTool).initCriClient, nil) + patches.ApplyFuncReturn((*RuntimeOperatorTool).initOciClient, errors.New(testInitOciError)) + err := operator.Init() + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldContainSubstring, testInitOciError) + }) +} + +func TestRuntimeOperatorToolInitCriClient(t *testing.T) { + convey.Convey("TestRuntimeOperatorToolInitCriClient", t, func() { + convey.Convey("should initialize CRI client successfully for containerd", func() { + operator := &RuntimeOperatorTool{ + CriEndpoint: testContainerdEndpoint, + UseOciBackup: false, + UseCriBackup: false, + } + + patches := gomonkey.ApplyFuncReturn(GetConnection, &grpc.ClientConn{}, nil) + defer patches.Reset() + + err := operator.initCriClient() + convey.So(err, convey.ShouldBeNil) + }) + + convey.Convey("should initialize CRI client successfully for isulad", func() { + operator := &RuntimeOperatorTool{ + CriEndpoint: DefaultIsuladAddr, + UseOciBackup: false, + UseCriBackup: false, + } + + patches := gomonkey.ApplyFuncReturn(GetConnection, &grpc.ClientConn{}, nil) + defer patches.Reset() + + err := operator.initCriClient() + convey.So(err, convey.ShouldBeNil) + }) + + convey.Convey("should return error when connection fails and no backup", 
func() { + operator := &RuntimeOperatorTool{ + CriEndpoint: testContainerdEndpoint, + UseOciBackup: false, + UseCriBackup: false, + } + + patches := gomonkey.ApplyFuncReturn(GetConnection, nil, errors.New("connection failed")) + defer patches.Reset() + + err := operator.initCriClient() + convey.So(err, convey.ShouldNotBeNil) + }) + }) +} + +func TestRuntimeOperatorToolInitOciClient(t *testing.T) { + testCases := buildInitOciClientTestCases() + for _, tc := range testCases { + convey.Convey(tc.name, t, func() { + operator, patches := tc.setup() + if patches != nil { + defer patches.Reset() + } + err := operator.initOciClient() + if tc.hasError { + convey.So(err, convey.ShouldNotBeNil) + } else { + convey.So(err, convey.ShouldBeNil) + } + }) + } +} + +type initOciClientTestCase struct { + name string + setup func() (*RuntimeOperatorTool, *gomonkey.Patches) + hasError bool +} + +func buildInitOciClientTestCases() []initOciClientTestCase { + return []initOciClientTestCase{ + {name: "should initialize OCI client successfully for containerd", + setup: func() (*RuntimeOperatorTool, *gomonkey.Patches) { + op := &RuntimeOperatorTool{OciEndpoint: testContainerdEndpoint, UseOciBackup: false} + p := gomonkey.ApplyFuncReturn(GetConnection, &grpc.ClientConn{}, nil) + return op, p + }, + hasError: false}, + {name: "should initialize OCI client successfully for isulad", + setup: func() (*RuntimeOperatorTool, *gomonkey.Patches) { + op := &RuntimeOperatorTool{OciEndpoint: DefaultIsuladAddr, UseOciBackup: false} + p := gomonkey.ApplyFuncReturn(GetConnection, &grpc.ClientConn{}, nil) + return op, p + }, + hasError: false}, + {name: "should return error when connection fails and no backup", + setup: func() (*RuntimeOperatorTool, *gomonkey.Patches) { + op := &RuntimeOperatorTool{OciEndpoint: testContainerdEndpoint, UseOciBackup: false} + p := gomonkey.ApplyFuncReturn(GetConnection, nil, errors.New("connection failed")) + return op, p + }, + hasError: true}, + {name: "should return error 
when OCI endpoint is empty", + setup: func() (*RuntimeOperatorTool, *gomonkey.Patches) { + op := &RuntimeOperatorTool{OciEndpoint: "", UseOciBackup: false} + return op, nil + }, + hasError: true}, + {name: "should try backup when primary connection fails", + setup: func() (*RuntimeOperatorTool, *gomonkey.Patches) { + op := &RuntimeOperatorTool{OciEndpoint: testContainerdEndpoint, UseOciBackup: true} + p := gomonkey.ApplyFunc(GetConnection, func(endpoint string) (*grpc.ClientConn, error) { + if endpoint == testContainerdEndpoint { + return nil, errors.New("primary failed") + } + return nil, errors.New("backup failed") + }) + return op, p + }, + hasError: true}, + {name: "should return error when all connections fail", + setup: func() (*RuntimeOperatorTool, *gomonkey.Patches) { + op := &RuntimeOperatorTool{OciEndpoint: testContainerdEndpoint, UseOciBackup: true} + p := gomonkey.ApplyFuncReturn(GetConnection, nil, errors.New("all failed")) + return op, p + }, + hasError: true}, + } +} + +func TestSockCheck(t *testing.T) { + convey.Convey("TestSockCheck", t, func() { + convey.Convey("should pass when socket paths are valid", func() { + operator := &RuntimeOperatorTool{ + CriEndpoint: testContainerdEndpoint, + OciEndpoint: testContainerdEndpoint, + } + + patches := gomonkey.ApplyFuncReturn(utils.CheckPath, "/run/containerd.sock", nil) + defer patches.Reset() + patches.ApplyFuncReturn(utils.DoCheckOwnerAndPermission, nil) + + err := sockCheck(operator) + convey.So(err, convey.ShouldBeNil) + }) + + convey.Convey("should return error when CRI endpoint check fails", func() { + operator := &RuntimeOperatorTool{ + CriEndpoint: testContainerdEndpoint, + OciEndpoint: testContainerdEndpoint, + } + + patches := gomonkey.ApplyFuncReturn(utils.CheckPath, "", errors.New("path check failed")) + defer patches.Reset() + + err := sockCheck(operator) + convey.So(err, convey.ShouldNotBeNil) + }) + + convey.Convey("should return error when CRI endpoint permission check fails", func() { + 
operator := &RuntimeOperatorTool{ + CriEndpoint: testContainerdEndpoint, + OciEndpoint: testContainerdEndpoint, + } + + patches := gomonkey.ApplyFuncReturn(utils.CheckPath, "/run/containerd.sock", nil) + defer patches.Reset() + patches.ApplyFuncReturn(utils.DoCheckOwnerAndPermission, errors.New("permission check failed")) + + err := sockCheck(operator) + convey.So(err, convey.ShouldNotBeNil) + }) + }) +} + +func TestRuntimeOperatorToolClose(t *testing.T) { + convey.Convey("TestRuntimeOperatorToolClose", t, func() { + convey.Convey("should close connections successfully", func() { + operator := &RuntimeOperatorTool{ + conn: &grpc.ClientConn{}, + criConn: &grpc.ClientConn{}, + } + + patches := gomonkey.ApplyFunc((*grpc.ClientConn).Close, func(*grpc.ClientConn) error { + return nil + }) + defer patches.Reset() + + err := operator.Close() + convey.So(err, convey.ShouldBeNil) + }) + + convey.Convey("should return error when OCI connection close fails", func() { + operator := &RuntimeOperatorTool{ + conn: &grpc.ClientConn{}, + criConn: &grpc.ClientConn{}, + } + + patches := gomonkey.ApplyFunc((*grpc.ClientConn).Close, func(*grpc.ClientConn) error { + return errors.New("close failed") + }) + defer patches.Reset() + + err := operator.Close() + convey.So(err, convey.ShouldNotBeNil) + }) + }) +} + +func TestRuntimeOperatorToolGetContainers(t *testing.T) { + convey.Convey("TestRuntimeOperatorToolGetContainers", t, func() { + convey.Convey("should return error when CRI client is empty", func() { + operator := &RuntimeOperatorTool{} + + patches := gomonkey.ApplyFuncReturn(utils.IsNil, true) + defer patches.Reset() + + containers, err := operator.GetContainers(context.Background()) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldEqual, testCriClientEmptyError) + convey.So(containers, convey.ShouldBeNil) + }) + + convey.Convey("should return error when CRI connection is nil", func() { + operator := &RuntimeOperatorTool{ + criClient: "mock-client", + 
} + + patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) + defer patches.Reset() + + containers, err := operator.GetContainers(context.Background()) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldEqual, testCriClientEmptyError) + convey.So(containers, convey.ShouldBeNil) + }) + + convey.Convey("should return error when client type is unexpected", func() { + operator := &RuntimeOperatorTool{ + criClient: "unexpected", + criConn: &grpc.ClientConn{}, + } + + patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) + defer patches.Reset() + + containers, err := operator.GetContainers(context.Background()) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldEqual, testUnexpectedClientError) + convey.So(containers, convey.ShouldBeNil) + }) + }) +} + +func TestIsUnimplementedError(t *testing.T) { + tests := []struct { + name string + err error + serviceName string + want bool + }{ + { + name: "nil error returns false", + err: nil, + serviceName: testCriV1alpha2, + want: false, + }, + { + name: "non-grpc error returns false", + err: errors.New("unknown service " + testCriV1alpha2), + serviceName: testCriV1alpha2, + want: false, + }, + { + name: "mismatched code returns false", + err: status.Error(codes.NotFound, "unknown service "+testCriV1alpha2), + serviceName: testCriV1alpha2, + want: false, + }, + { + name: "mismatched message returns false", + err: status.Error(codes.Unimplemented, "unknown service "+testCriV1), + serviceName: testCriV1alpha2, + want: false, + }, + { + name: "matched unimplemented error returns true", + err: status.Error(codes.Unimplemented, "unknown service "+testCriV1alpha2), + serviceName: testCriV1alpha2, + want: true, + }, + { + name: "real grpc error format returns true", + err: fmt.Errorf("rpc error: code = Unimplemented desc = unknown service " + testCriV1alpha2), + serviceName: testCriV1alpha2, + want: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) 
{ + if got := isUnimplementedError(tt.err, tt.serviceName); got != tt.want { + t.Errorf("isUnimplementedError() = %v, want %v (err: %v)", got, tt.want, tt.err) + } + }) + } +} + +func TestRuntimeOperatorToolGetContainerInfoByID(t *testing.T) { + convey.Convey("TestRuntimeOperatorToolGetContainerInfoByID", t, func() { + convey.Convey("should return error when OCI client is empty", func() { + operator := &RuntimeOperatorTool{} + patches := gomonkey.ApplyFuncReturn(utils.IsNil, true) + defer patches.Reset() + spec, err := operator.GetContainerInfoByID(context.Background(), testContainerID) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldEqual, testOciClientEmptyError) + convey.So(spec, convey.ShouldResemble, v1.Spec{}) + }) + convey.Convey("should return error when OCI connection is nil", func() { + operator := &RuntimeOperatorTool{client: "mock-client"} + patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) + defer patches.Reset() + spec, err := operator.GetContainerInfoByID(context.Background(), testContainerID) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldEqual, testOciClientEmptyError) + convey.So(spec, convey.ShouldResemble, v1.Spec{}) + }) + convey.Convey("should return error when client type is unexpected", func() { + operator := &RuntimeOperatorTool{client: "unexpected", conn: &grpc.ClientConn{}} + patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) + defer patches.Reset() + spec, err := operator.GetContainerInfoByID(context.Background(), testContainerID) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldEqual, testUnexpectedContainerdClientError) + convey.So(spec, convey.ShouldResemble, v1.Spec{}) + }) + convey.Convey("should return error when GetContainer call fails", func() { + operator := &RuntimeOperatorTool{client: "mock-containers-client", conn: &grpc.ClientConn{}} + patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) + defer patches.Reset() + spec, err 
:= operator.GetContainerInfoByID(context.Background(), testContainerID) + convey.So(err, convey.ShouldNotBeNil) + convey.So(spec, convey.ShouldResemble, v1.Spec{}) + }) + convey.Convey("should return error when JSON unmarshal fails", func() { + operator := &RuntimeOperatorTool{client: "mock-containers-client", conn: &grpc.ClientConn{}} + patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) + defer patches.Reset() + spec, err := operator.GetContainerInfoByID(context.Background(), testContainerID) + convey.So(err, convey.ShouldNotBeNil) + convey.So(spec, convey.ShouldResemble, v1.Spec{}) + }) + + }) +} + +func TestRuntimeOperatorToolGetIsulaContainerInfoByID(t *testing.T) { + convey.Convey("TestRuntimeOperatorToolGetIsulaContainerInfoByID", t, func() { + convey.Convey("should return error when OCI client is empty", func() { + operator := &RuntimeOperatorTool{} + patches := gomonkey.ApplyFuncReturn(utils.IsNil, true) + defer patches.Reset() + containerInfo, err := operator.GetIsulaContainerInfoByID(context.Background(), testContainerID) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldEqual, testOciClientEmptyError) + convey.So(containerInfo, convey.ShouldResemble, isula.ContainerJson{}) + }) + convey.Convey("should return error when OCI connection is nil", func() { + operator := &RuntimeOperatorTool{client: "mock-client"} + patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) + defer patches.Reset() + containerInfo, err := operator.GetIsulaContainerInfoByID(context.Background(), testContainerID) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldEqual, testOciClientEmptyError) + convey.So(containerInfo, convey.ShouldResemble, isula.ContainerJson{}) + }) + convey.Convey("should return error when client type is unexpected", func() { + operator := &RuntimeOperatorTool{client: "unexpected", conn: &grpc.ClientConn{}} + patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) + defer patches.Reset() + 
containerInfo, err := operator.GetIsulaContainerInfoByID(context.Background(), testContainerID) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldEqual, testUnexpectedIsulaClientError) + convey.So(containerInfo, convey.ShouldResemble, isula.ContainerJson{}) + }) + convey.Convey("should return error when Inspect call fails", func() { + operator := &RuntimeOperatorTool{client: "mock-isula-client", conn: &grpc.ClientConn{}} + patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) + defer patches.Reset() + containerInfo, err := operator.GetIsulaContainerInfoByID(context.Background(), testContainerID) + convey.So(err, convey.ShouldNotBeNil) + convey.So(containerInfo, convey.ShouldResemble, isula.ContainerJson{}) + }) + convey.Convey("should return error when JSON unmarshal fails", func() { + operator := &RuntimeOperatorTool{client: "mock-isula-client", conn: &grpc.ClientConn{}} + patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) + defer patches.Reset() + containerInfo, err := operator.GetIsulaContainerInfoByID(context.Background(), testContainerID) + convey.So(err, convey.ShouldNotBeNil) + convey.So(containerInfo, convey.ShouldResemble, isula.ContainerJson{}) + }) + + }) +} + +func TestRuntimeOperatorToolGetContainerType(t *testing.T) { + convey.Convey("TestRuntimeOperatorToolGetContainerType", t, func() { + convey.Convey("should return isula when endpoint is isulad", func() { + operator := &RuntimeOperatorTool{ + OciEndpoint: DefaultIsuladAddr, + } + + containerType := operator.GetContainerType() + convey.So(containerType, convey.ShouldEqual, IsulaContainer) + }) + + convey.Convey("should return default when endpoint is not isulad", func() { + operator := &RuntimeOperatorTool{ + OciEndpoint: testContainerdEndpoint, + } + + containerType := operator.GetContainerType() + convey.So(containerType, convey.ShouldEqual, DefaultContainer) + }) + }) +} + +func TestSetGrpcNamespaceHeader(t *testing.T) { + 
convey.Convey("TestSetGrpcNamespaceHeader", t, func() { + convey.Convey("should set namespace header when context has no metadata", func() { + ctx := context.Background() + result := setGrpcNamespaceHeader(ctx, testNamespace) + convey.So(result, convey.ShouldNotBeNil) + }) + + convey.Convey("should set namespace header when context has existing metadata", func() { + ctx := context.Background() + ctx = context.WithValue(ctx, "test", "value") + result := setGrpcNamespaceHeader(ctx, testNamespace) + convey.So(result, convey.ShouldNotBeNil) + }) + }) +} + +func TestGenContainerRequestV1alpha2(t *testing.T) { + convey.Convey("TestGenContainerRequestV1alpha2", t, func() { + convey.Convey("should generate valid container request", func() { + request := genContainerRequestV1alpha2() + convey.So(request, convey.ShouldNotBeNil) + convey.So(request.Filter, convey.ShouldNotBeNil) + convey.So(request.Filter.State, convey.ShouldNotBeNil) + convey.So(request.Filter.State.State, convey.ShouldEqual, v1alpha2.ContainerState_CONTAINER_RUNNING) + }) + }) +} + +func TestGenContainerRequestV1(t *testing.T) { + convey.Convey("TestGenContainerRequestV1", t, func() { + convey.Convey("should generate valid container request", func() { + request := genContainerRequestV1() + convey.So(request, convey.ShouldNotBeNil) + convey.So(request.Filter, convey.ShouldNotBeNil) + convey.So(request.Filter.State, convey.ShouldNotBeNil) + convey.So(request.Filter.State.State, convey.ShouldEqual, criv1.ContainerState_CONTAINER_RUNNING) + }) + }) +} + +func TestGenIsulaRequest(t *testing.T) { + convey.Convey("TestGenIsulaRequest", t, func() { + convey.Convey("should generate valid isula request", func() { + request := genIsulaRequest() + convey.So(request, convey.ShouldNotBeNil) + convey.So(request.Filter, convey.ShouldNotBeNil) + convey.So(request.Filter.State, convey.ShouldNotBeNil) + convey.So(request.Filter.State.State, convey.ShouldEqual, isula.ContainerState_CONTAINER_RUNNING) + }) + }) +} diff --git 
const (
	// defaultTimeout bounds the blocking gRPC dial performed by GetConnection
	defaultTimeout = 5 * time.Second
	// unixPrefix is the only endpoint scheme accepted by getAddressAndDialer and
	// the network name handed to the dialer in dial
	unixPrefix = "unix"
	// MaxLenDNS is the largest byte length validDNSRe accepts for a name
	MaxLenDNS = 512
	// MinLenDNS is the smallest byte length validDNSRe accepts for a name
	MinLenDNS = 1
	// NOTE(review): the four limits below are not referenced in this file; presumably
	// they cap container count, cgroup path length, device count and env entries
	// elsewhere in the package - confirm against their users.
	maxContainers = 1024
	maxCgroupPath = 2048

	maxDevicesNum = 100000
	maxEnvNum     = 10000
)
// parseSocketEndpoint splits a runtime endpoint URL into its protocol and dial address.
//
//	unix:///run/x.sock -> ("unix", "/run/x.sock", nil)
//	tcp://host:port    -> ("tcp", "host:port", nil)
//
// A URL that cannot be parsed returns empty strings and the parse error. Any other
// scheme returns the scheme, an empty address and an error.
//
// An endpoint whose address part is empty is rejected as well. This covers inputs such
// as "unix://run.sock", where "run.sock" is parsed as the URL host and the path stays
// empty; accepting it would hand an empty address to the dialer.
func parseSocketEndpoint(endpoint string) (string, string, error) {
	u, err := url.Parse(endpoint)
	if err != nil {
		return "", "", err
	}

	var addr string
	switch u.Scheme {
	case "unix":
		addr = u.Path
	case "tcp":
		addr = u.Host
	default:
		return u.Scheme, "", fmt.Errorf("protocol %q not supported", u.Scheme)
	}

	// the raw endpoint is deliberately left out of the message; callers mask it when logging
	if addr == "" {
		return u.Scheme, "", fmt.Errorf("%s endpoint has an empty address", u.Scheme)
	}
	return u.Scheme, addr, nil
}
err + } + } + + deviceInfo.ID = c.Id + deviceInfo.Name = ns + "_" + podName + "_" + containerName + return deviceInfo, nil +} diff --git a/mind-cluster/component/npu-exporter/collector/container/utils_test.go b/mind-cluster/component/npu-exporter/collector/container/utils_test.go new file mode 100644 index 0000000..32e6716 --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/container/utils_test.go @@ -0,0 +1,329 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// package container test methods in utils +package container + +import ( + "context" + "errors" + "net" + "testing" + + "github.com/agiledragon/gomonkey/v2" + "github.com/smartystreets/goconvey/convey" + "google.golang.org/grpc" + + "ascend-common/common-utils/hwlog" + "huawei.com/npu-exporter/v6/utils/logger" +) + +const ( + testContainerID = "container123" + testPodNamespace = "default" + testPodName = "test-pod" + testContainerName = "test-container" + testUnixSocket = "unix:///test.sock" + testInvalidEndpoint = "invalid://endpoint" + testDialError = "dial error" + testGrpcDialError = "grpc dial error" + testInvalidEndpointError = "invalid endpoint" + testEndpointNotSetError = "endpoint is not set" + testDNSContent = "test-dns" + testMinDNSContent = "a" + testEmptyDNSContent = "" + testTarget = "test" + testUnixScheme = "unix" + testTcpScheme = "tcp" + testUnixAddr = "/tmp/test.sock" + testTcpAddr = "localhost:8080" + testInvalidURL = "://invalid" + testEmptyNamespace = "" + testEmptyPodName = "" + testEmptyContainerName = "" +) + +func init() { + logger.HwLogConfig = &hwlog.LogConfig{ + OnlyToStdout: true, + } + logger.InitLogger("Prometheus") +} + +func TestGetConnection(t *testing.T) { + convey.Convey("TestGetConnection", t, func() { + convey.Convey("should return error when endpoint is empty", func() { + testEmptyEndpoint() + }) + convey.Convey("should return error when endpoint is invalid", func() { + testInvalidEndpointFunc() + }) + convey.Convey("should return error when grpc dial context fails", func() { + testGrpcDialErrorFunc() + }) + convey.Convey("should return connection when successful", func() { + testSuccessfulConnection() + }) + }) +} + +func testEmptyEndpoint() { + conn, err := GetConnection("") + convey.So(conn, convey.ShouldBeNil) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldContainSubstring, testEndpointNotSetError) +} + +func testInvalidEndpointFunc() { + patches := 
gomonkey.ApplyFuncReturn(getAddressAndDialer, "", nil, errors.New(testInvalidEndpointError)) + defer patches.Reset() + conn, err := GetConnection(testInvalidEndpoint) + convey.So(conn, convey.ShouldBeNil) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldContainSubstring, testInvalidEndpointError) +} + +func testGrpcDialErrorFunc() { + patches := gomonkey.ApplyFunc(getAddressAndDialer, + func(endpoint string) (string, func(ctx context.Context, addr string) (net.Conn, error), error) { + return testTarget, func(ctx context.Context, addr string) (net.Conn, error) { + return nil, errors.New(testDialError) + }, nil + }) + defer patches.Reset() + patches.ApplyFuncReturn(grpc.DialContext, nil, errors.New(testGrpcDialError)) + conn, err := GetConnection(testUnixSocket) + convey.So(conn, convey.ShouldBeNil) + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldContainSubstring, testGrpcDialError) +} + +func testSuccessfulConnection() { + mockConn := &grpc.ClientConn{} + patches := gomonkey.ApplyFunc(getAddressAndDialer, + func(endpoint string) (string, func(ctx context.Context, addr string) (net.Conn, error), error) { + return testTarget, func(ctx context.Context, addr string) (net.Conn, error) { + return nil, nil + }, nil + }) + defer patches.Reset() + patches.ApplyFuncReturn(grpc.DialContext, mockConn, nil) + conn, err := GetConnection(testUnixSocket) + convey.So(conn, convey.ShouldEqual, mockConn) + convey.So(err, convey.ShouldBeNil) +} + +func TestParseSocketEndpoint(t *testing.T) { + testCases := []struct { + name string + endpoint string + expectedScheme string + expectedAddr string + expectedError bool + }{ + {name: "should parse unix endpoint when valid", endpoint: "unix:///tmp/test.sock", + expectedScheme: testUnixScheme, expectedAddr: testUnixAddr, expectedError: false}, + {name: "should parse tcp endpoint when valid", endpoint: "tcp://localhost:8080", + expectedScheme: testTcpScheme, expectedAddr: testTcpAddr, 
expectedError: false}, + {name: "should return error when scheme is invalid", endpoint: "http://localhost:8080", + expectedScheme: "http", expectedAddr: "", expectedError: true}, + {name: "should return error when url is invalid", endpoint: testInvalidURL, + expectedScheme: "", expectedAddr: "", expectedError: true}, + } + + for _, tc := range testCases { + convey.Convey(tc.name, t, func() { + scheme, addr, err := parseSocketEndpoint(tc.endpoint) + convey.So(scheme, convey.ShouldEqual, tc.expectedScheme) + convey.So(addr, convey.ShouldEqual, tc.expectedAddr) + if tc.expectedError { + convey.So(err, convey.ShouldNotBeNil) + } else { + convey.So(err, convey.ShouldBeNil) + } + }) + } +} + +func TestGetAddressAndDialer(t *testing.T) { + convey.Convey("TestGetAddressAndDialer", t, func() { + testCases := []struct { + name string + endpoint string + expectedAddr string + expectedError bool + }{ + { + name: "should return address when unix endpoint is valid", + endpoint: "unix:///tmp/test.sock", + expectedAddr: "/tmp/test.sock", + expectedError: false, + }, + { + name: "should return error when scheme is invalid", + endpoint: "tcp://localhost:8080", + expectedAddr: "", + expectedError: true, + }, + { + name: "should return error when parse fails", + endpoint: "://invalid", + expectedAddr: "", + expectedError: true, + }, + } + + for _, tc := range testCases { + convey.Convey(tc.name, func() { + addr, dialer, err := getAddressAndDialer(tc.endpoint) + convey.So(addr, convey.ShouldEqual, tc.expectedAddr) + if tc.expectedError { + convey.So(dialer, convey.ShouldBeNil) + convey.So(err, convey.ShouldNotBeNil) + } else { + convey.So(dialer, convey.ShouldNotBeNil) + convey.So(err, convey.ShouldBeNil) + } + }) + } + }) +} + +func TestDial(t *testing.T) { + convey.Convey("should call net.Dialer.DialContext when dialing", t, func() { + var dialerCalled bool + patches := gomonkey.ApplyMethod(&net.Dialer{}, "DialContext", + func(d *net.Dialer, ctx context.Context, network, address 
string) (net.Conn, error) { + dialerCalled = true + return nil, errors.New("mock dial error") + }) + defer patches.Reset() + ctx := context.Background() + conn, err := dial(ctx, "/tmp/test.sock") + convey.So(conn, convey.ShouldBeNil) + convey.So(err, convey.ShouldNotBeNil) + convey.So(dialerCalled, convey.ShouldBeTrue) + }) +} + +func TestValidDNSRe(t *testing.T) { + convey.Convey("TestValidDNSRe", t, func() { + testCases := []struct { + name string + dnsContent string + expectedError bool + }{ + {name: "should pass validation when dns content has valid length", + dnsContent: testDNSContent, expectedError: false}, + {name: "should return error when dns content is empty", + dnsContent: testEmptyDNSContent, expectedError: true}, + {name: "should return error when dns content is too long", + dnsContent: string(make([]byte, MaxLenDNS+1)), expectedError: true}, + {name: "should pass validation when dns content has minimum valid length", + dnsContent: testMinDNSContent, expectedError: false}, + {name: "should pass validation when dns content has maximum valid length", + dnsContent: string(make([]byte, MaxLenDNS)), expectedError: false}, + } + + for _, tc := range testCases { + convey.Convey(tc.name, func() { + err := validDNSRe(tc.dnsContent) + if tc.expectedError { + convey.So(err, convey.ShouldNotBeNil) + convey.So(err.Error(), convey.ShouldContainSubstring, "param len invalid") + } else { + convey.So(err, convey.ShouldBeNil) + } + }) + } + }) +} + +func TestMakeUpDeviceInfo(t *testing.T) { + testCases := getMakeUpDeviceInfoTestCases() + for _, tc := range testCases { + convey.Convey(tc.name, t, func() { + deviceInfo, err := makeUpDeviceInfo(tc.container) + validateMakeUpDeviceInfoResult(deviceInfo, err, tc) + }) + } +} + +func getMakeUpDeviceInfoTestCases() []struct { + name string + container *CommonContainer + expectedError bool + expectedName string +} { + return []struct { + name string + container *CommonContainer + expectedError bool + expectedName string + }{ + 
{name: "should return valid device info when container has all labels", + container: createValidContainer(), expectedError: false, expectedName: "default_test-pod_test-container"}, + {name: "should return error when container has invalid namespace length", + container: createContainerWithEmptyNamespace(), expectedError: true, expectedName: ""}, + {name: "should return error when container has invalid pod name length", + container: createContainerWithEmptyPodName(), expectedError: true, expectedName: ""}, + {name: "should return error when container has invalid container name length", + container: createContainerWithEmptyContainerName(), expectedError: true, expectedName: ""}, + {name: "should return error when container has too long namespace", + container: createContainerWithLongNamespace(), expectedError: true, expectedName: ""}, + } +} + +func createValidContainer() *CommonContainer { + return &CommonContainer{Id: testContainerID, Labels: map[string]string{ + labelK8sPodNamespace: testPodNamespace, labelK8sPodName: testPodName, + labelContainerName: testContainerName}} +} +func createContainerWithEmptyNamespace() *CommonContainer { + return &CommonContainer{Id: testContainerID, Labels: map[string]string{ + labelK8sPodNamespace: testEmptyNamespace, labelK8sPodName: testPodName, + labelContainerName: testContainerName}} +} +func createContainerWithEmptyPodName() *CommonContainer { + return &CommonContainer{Id: testContainerID, Labels: map[string]string{ + labelK8sPodNamespace: testPodNamespace, labelK8sPodName: testEmptyPodName, + labelContainerName: testContainerName}} +} +func createContainerWithEmptyContainerName() *CommonContainer { + return &CommonContainer{Id: testContainerID, Labels: map[string]string{ + labelK8sPodNamespace: testPodNamespace, labelK8sPodName: testPodName, + labelContainerName: testEmptyContainerName}} +} + +func createContainerWithLongNamespace() *CommonContainer { + return &CommonContainer{Id: testContainerID, Labels: map[string]string{ + 
labelK8sPodNamespace: string(make([]byte, MaxLenDNS+1)), + labelK8sPodName: testPodName, labelContainerName: testContainerName}} +} + +func validateMakeUpDeviceInfoResult(deviceInfo DevicesInfo, err error, tc struct { + name string + container *CommonContainer + expectedError bool + expectedName string +}) { + if tc.expectedError { + convey.So(err, convey.ShouldNotBeNil) + convey.So(deviceInfo, convey.ShouldResemble, DevicesInfo{}) + } else { + convey.So(err, convey.ShouldBeNil) + convey.So(deviceInfo.ID, convey.ShouldEqual, tc.container.Id) + convey.So(deviceInfo.Name, convey.ShouldEqual, tc.expectedName) + } +} diff --git a/mind-cluster/component/npu-exporter/collector/container/v1/containerd.pb.go b/mind-cluster/component/npu-exporter/collector/container/v1/containerd.pb.go new file mode 100644 index 0000000..46762f3 --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/container/v1/containerd.pb.go @@ -0,0 +1,310 @@ +// Code generated by protoc-gen-go. DO NOT EDIT. +// source: containerd.proto +// protoc:3.13.0 +// protoc-gen-go 1.3.5 + +package v1 + +import ( + "context" + "fmt" + "math" + + "github.com/golang/protobuf/proto" + "github.com/golang/protobuf/ptypes/any" + "google.golang.org/grpc" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" +) + +// Reference imports to suppress errors if they are not otherwise used. +var _ = fmt.Errorf +var _ = math.Inf +var _ = proto.Marshal + +// This is a compile-time assertion to ensure that this generated file +// is compatible with the proto package it is being compiled against. +// A compilation error at this line likely means your copy of the +// proto package needs to be updated. 
+const _ = proto.ProtoPackageIsVersion3 // please upgrade the proto package + +type Container struct { + // ID the container id + Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"` + // Labels the container labels + Labels map[string]string `protobuf:"bytes,2,rep,name=labels,proto3" json:"labels,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` + // Image the container image + Image string `protobuf:"bytes,3,opt,name=image,proto3" json:"image,omitempty"` + // Spec runtime specific. + Spec *any.Any `protobuf:"bytes,5,opt,name=spec,proto3" json:"spec,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` +} + +// Reset reset the object +func (m *Container) Reset() { *m = Container{} } + +// String +func (m *Container) String() string { return proto.CompactTextString(m) } + +// ProtoMessage +func (*Container) ProtoMessage() {} + +// Descriptor +func (*Container) Descriptor() ([]byte, []int) { + return fileDescriptor_29bcc067d8d1b7d0, []int{0} +} + +// XXX_Unmarshal +func (m *Container) XXX_Unmarshal(b []byte) error { + return xxx_messageInfo_Container.Unmarshal(m, b) +} + +// XXX_Marshal +func (m *Container) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + return xxx_messageInfo_Container.Marshal(b, m, deterministic) +} + +// XXX_Merge +func (m *Container) XXX_Merge(src proto.Message) { + xxx_messageInfo_Container.Merge(m, src) +} + +// XXX_Size +func (m *Container) XXX_Size() int { + return xxx_messageInfo_Container.Size(m) +} + +// XXX_DiscardUnknown +func (m *Container) XXX_DiscardUnknown() { + xxx_messageInfo_Container.DiscardUnknown(m) +} + +var xxx_messageInfo_Container proto.InternalMessageInfo + +// GetId +func (m *Container) GetId() string { + if m != nil { + return m.Id + } + return "" +} + +// GetLabels +func (m *Container) GetLabels() map[string]string { + if m != nil { + return m.Labels + } + return nil +} 
+ +// GetImage +func (m *Container) GetImage() string { + if m != nil { + return m.Image + } + return "" +} + +// GetSpec +func (m *Container) GetSpec() *any.Any { + if m != nil { + return m.Spec + } + return nil +} + +type GetContainerRequest struct { + Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` +} + +func (m *GetContainerRequest) Reset() { *m = GetContainerRequest{} } +func (m *GetContainerRequest) String() string { return proto.CompactTextString(m) } +func (*GetContainerRequest) ProtoMessage() {} +func (*GetContainerRequest) Descriptor() ([]byte, []int) { + return fileDescriptor_29bcc067d8d1b7d0, []int{1} +} + +func (m *GetContainerRequest) XXX_Unmarshal(b []byte) error { + return xxx_messageInfo_GetContainerRequest.Unmarshal(m, b) +} +func (m *GetContainerRequest) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + return xxx_messageInfo_GetContainerRequest.Marshal(b, m, deterministic) +} +func (m *GetContainerRequest) XXX_Merge(src proto.Message) { + xxx_messageInfo_GetContainerRequest.Merge(m, src) +} +func (m *GetContainerRequest) XXX_Size() int { + return xxx_messageInfo_GetContainerRequest.Size(m) +} +func (m *GetContainerRequest) XXX_DiscardUnknown() { + xxx_messageInfo_GetContainerRequest.DiscardUnknown(m) +} + +var xxx_messageInfo_GetContainerRequest proto.InternalMessageInfo + +func (m *GetContainerRequest) GetId() string { + if m != nil { + return m.Id + } + return "" +} + +type GetContainerResponse struct { + Container *Container `protobuf:"bytes,1,opt,name=container,proto3" json:"container,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` +} + +func (m *GetContainerResponse) Reset() { *m = GetContainerResponse{} } +func (m *GetContainerResponse) String() string { return proto.CompactTextString(m) } +func (*GetContainerResponse) 
ProtoMessage() {} +func (*GetContainerResponse) Descriptor() ([]byte, []int) { + return fileDescriptor_29bcc067d8d1b7d0, []int{2} +} + +func (m *GetContainerResponse) XXX_Unmarshal(b []byte) error { + return xxx_messageInfo_GetContainerResponse.Unmarshal(m, b) +} +func (m *GetContainerResponse) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + return xxx_messageInfo_GetContainerResponse.Marshal(b, m, deterministic) +} +func (m *GetContainerResponse) XXX_Merge(src proto.Message) { + xxx_messageInfo_GetContainerResponse.Merge(m, src) +} +func (m *GetContainerResponse) XXX_Size() int { + return xxx_messageInfo_GetContainerResponse.Size(m) +} +func (m *GetContainerResponse) XXX_DiscardUnknown() { + xxx_messageInfo_GetContainerResponse.DiscardUnknown(m) +} + +var xxx_messageInfo_GetContainerResponse proto.InternalMessageInfo + +func (m *GetContainerResponse) GetContainer() *Container { + if m != nil { + return m.Container + } + return nil +} + +func init() { + proto.RegisterType((*Container)(nil), "containerd.services.containers.v1.Container") + proto.RegisterMapType((map[string]string)(nil), "containerd.services.containers.v1.Container.LabelsEntry") + proto.RegisterType((*GetContainerRequest)(nil), "containerd.services.containers.v1.GetContainerRequest") + proto.RegisterType((*GetContainerResponse)(nil), "containerd.services.containers.v1.GetContainerResponse") +} + +func init() { + proto.RegisterFile("containerd.proto", fileDescriptor_29bcc067d8d1b7d0) +} + +var fileDescriptor_29bcc067d8d1b7d0 = []byte{ + // 327 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x94, 0x51, 0x4f, 0x4b, 0xc3, 0x30, + 0x14, 0xa7, 0xad, 0x1b, 0xec, 0x15, 0x64, 0xc4, 0x1d, 0xea, 0x4e, 0x73, 0x20, 0xf4, 0xa0, 0xa9, + 0xab, 0xa0, 0x53, 0x4f, 0x2a, 0x32, 0x10, 0x0f, 0xd2, 0xa3, 0xb7, 0xb6, 0x7b, 0xce, 0x62, 0x96, + 0xd4, 0x24, 0xad, 0xf6, 0xee, 0x87, 0xf5, 0x63, 0xc8, 0xd2, 0xad, 0x4e, 0x11, 0x74, 0xb7, 0xf7, + 0x5e, 0x7f, 0x7f, 
0x1b, 0xe8, 0xa6, 0x82, 0xeb, 0x38, 0xe3, 0x28, 0xa7, 0x34, 0x97, 0x42, 0x0b, + 0xb2, 0xb7, 0x76, 0x51, 0x28, 0xcb, 0x2c, 0x45, 0x45, 0x9b, 0x9b, 0xa2, 0xe5, 0xa8, 0xbf, 0x3b, + 0x13, 0x62, 0xc6, 0x30, 0x30, 0x84, 0xa4, 0x78, 0x0c, 0x62, 0x5e, 0xd5, 0xec, 0xe1, 0x87, 0x05, + 0x9d, 0xeb, 0x15, 0x98, 0x6c, 0x83, 0x9d, 0x4d, 0x3d, 0x6b, 0x60, 0xf9, 0x9d, 0xc8, 0xce, 0xa6, + 0xe4, 0x1e, 0xda, 0x2c, 0x4e, 0x90, 0x29, 0xcf, 0x1e, 0x38, 0xbe, 0x1b, 0x8e, 0xe9, 0x9f, 0x66, + 0xb4, 0x51, 0xa3, 0x77, 0x86, 0x7a, 0xc3, 0xb5, 0xac, 0xa2, 0xa5, 0x0e, 0xe9, 0x41, 0x2b, 0x9b, + 0xc7, 0x33, 0xf4, 0x1c, 0x63, 0x52, 0x2f, 0xc4, 0x87, 0x2d, 0x95, 0x63, 0xea, 0xb5, 0x06, 0x96, + 0xef, 0x86, 0x3d, 0x5a, 0xe7, 0xa5, 0xab, 0xbc, 0xf4, 0x92, 0x57, 0x91, 0x41, 0xf4, 0xcf, 0xc0, + 0x5d, 0x93, 0x25, 0x5d, 0x70, 0x9e, 0xb1, 0x5a, 0x26, 0x5e, 0x8c, 0x0b, 0x83, 0x32, 0x66, 0x05, + 0x7a, 0x76, 0x6d, 0x60, 0x96, 0x73, 0x7b, 0x6c, 0x0d, 0xf7, 0x61, 0x67, 0x82, 0xba, 0x89, 0x17, + 0xe1, 0x4b, 0x81, 0x4a, 0xff, 0xec, 0x3c, 0x4c, 0xa0, 0xf7, 0x1d, 0xa6, 0x72, 0xc1, 0x15, 0x92, + 0x5b, 0xe8, 0x34, 0x45, 0x0d, 0xdc, 0x0d, 0x0f, 0x36, 0xf9, 0x1d, 0xd1, 0x17, 0x3d, 0x7c, 0xb7, + 0x00, 0x9a, 0x0f, 0x8a, 0x94, 0xe0, 0x4c, 0x50, 0x93, 0x93, 0x7f, 0xc8, 0xfd, 0xd2, 0xa0, 0x7f, + 0xba, 0x31, 0xaf, 0xae, 0x74, 0x75, 0xf4, 0x40, 0x9f, 0x8a, 0xf8, 0x15, 0x33, 0x9a, 0x8a, 0x79, + 0xc0, 0xf3, 0xe2, 0x10, 0xdf, 0x72, 0x21, 0x35, 0xca, 0x20, 0x15, 0x8c, 0x61, 0xaa, 0xc5, 0x62, + 0x5a, 0xd2, 0x2e, 0xca, 0x51, 0xd2, 0x36, 0x4f, 0x72, 0xfc, 0x19, 0x00, 0x00, 0xff, 0xff, 0x30, + 0xcc, 0x1c, 0x74, 0x87, 0x02, 0x00, 0x00, +} + +// Reference imports to suppress errors if they are not otherwise used. +var _ context.Context +var _ grpc.ClientConnInterface + +// This is a compile-time assertion to ensure that this generated file +// is compatible with the grpc package it is being compiled against. +const _ = grpc.SupportPackageIsVersion6 + +// ContainersClient is the client API for Containers service. 
+// +// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://godoc.org/google.golang.org/grpc#ClientConn.NewStream. +type ContainersClient interface { + Get(ctx context.Context, in *GetContainerRequest, opts ...grpc.CallOption) (*GetContainerResponse, error) +} + +type containersClient struct { + cc grpc.ClientConnInterface +} + +func NewContainersClient(cc grpc.ClientConnInterface) ContainersClient { + return &containersClient{cc} +} + +func (c *containersClient) Get(ctx context.Context, in *GetContainerRequest, opts ...grpc.CallOption) (*GetContainerResponse, error) { + out := new(GetContainerResponse) + err := c.cc.Invoke(ctx, "/containerd.services.containers.v1.Containers/Get", in, out, opts...) + if err != nil { + return nil, err + } + return out, nil +} + +// ContainersServer is the server API for Containers service. +type ContainersServer interface { + Get(context.Context, *GetContainerRequest) (*GetContainerResponse, error) +} + +// UnimplementedContainersServer can be embedded to have forward compatible implementations. 
+type UnimplementedContainersServer struct { +} + +func (*UnimplementedContainersServer) Get(context.Context, *GetContainerRequest) (*GetContainerResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method Get not implemented") +} + +func RegisterContainersServer(s *grpc.Server, srv ContainersServer) { + s.RegisterService(&_Containers_desc, srv) +} + +func _Containers_Get_Method(srv interface{}, ctx context.Context, desc func(interface{}) error, itcpt grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(GetContainerRequest) + if err := desc(in); err != nil { + return nil, err + } + if itcpt == nil { + return srv.(ContainersServer).Get(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/containerd.services.containers.v1.Containers/Get", + } + handler := func(ctx context.Context, request interface{}) (interface{}, error) { + return srv.(ContainersServer).Get(ctx, request.(*GetContainerRequest)) + } + return itcpt(ctx, in, info, handler) +} + +var _Containers_desc = grpc.ServiceDesc{ + ServiceName: "containerd.services.containers.v1.Containers", + HandlerType: (*ContainersServer)(nil), + Methods: []grpc.MethodDesc{ + { + MethodName: "Get", + Handler: _Containers_Get_Method, + }, + }, + Streams: []grpc.StreamDesc{}, + Metadata: "containerd.proto", +} diff --git a/mind-cluster/component/npu-exporter/collector/container/v1/containerd.proto b/mind-cluster/component/npu-exporter/collector/container/v1/containerd.proto new file mode 100644 index 0000000..48a4a4b --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/container/v1/containerd.proto @@ -0,0 +1,62 @@ +syntax = "proto3"; + +package containerd.services.containers.v1; + + +import "google/protobuf/any.proto"; +import "google/protobuf/timestamp.proto"; + +option go_package = "huawei.com/npu-exporter/v6/collector/container;v1"; + +// Containers provides metadata storage for containers used in the execution +// service. 
message Container {
    // ID is the user-specified identifier.
    string id = 1;

    // Labels provides an area to include arbitrary data on containers.
    // Both keys and values are strings, matching the generated Go field
    // `Labels map[string]string`.
    map<string, string> labels = 2;

    // Image contains the reference of the image used to build the
    // specification and snapshots for running this container.
    string image = 3;

    message Runtime {
        // Name is the name of the runtime.
        string name = 1;
        // Options specify additional runtime initialization options.
        google.protobuf.Any options = 2;
    }
    // Runtime specifies which runtime to use for executing this container.
    Runtime runtime = 4;

    // Spec is the OpenContainers runtime spec of the container.
    google.protobuf.Any spec = 5;

    // Snapshotter is the snapshotter name used for rootfs.
    string snapshotter = 6;

    // SnapshotKey is the snapshot key to use for the container's root
    // filesystem.
    string snapshot_key = 7;

    // CreatedAt is the time the container was first created.
    google.protobuf.Timestamp created_at = 8;

    // UpdatedAt is the last time the container was mutated.
    google.protobuf.Timestamp updated_at = 9;

    // Extensions allow clients to provide zero or more blobs that are directly
    // associated with the container.
    // NOTE(review): the value type follows the upstream containerd definition;
    // the checked-in generated Go code does not include this field - confirm.
    map<string, google.protobuf.Any> extensions = 10;
}
// Linux contains platform-specific configuration for Linux based containers.
// Only the resources section is modelled by this struct.
type Linux struct {
	// Resources contain cgroup information for handling resource constraints
	// for the container
	Resources *LinuxResources `json:"resources,omitempty"`
}
// ddrCache is the per-chip entry the DDR collector stores in its local cache.
type ddrCache struct {
	// chip is the chip the memory info was collected for
	chip colcommon.HuaWeiAIChip
	// timestamp is the time at which the entry was stored
	timestamp time.Time
	// extInfo the memoryInfo of the chip
	extInfo *common.MemoryInfo
}
*DdrCollector) IsSupported(n *colcommon.NpuCollector) bool { + isSupport := !notSupportedDdrDevices[n.Dmgr.GetDevType()] + logForUnSupportDevice(isSupport, n.Dmgr.GetDevType(), colcommon.GetCacheKey(c), + "there is no DDR module. DDR information cannot be queried.") + return isSupport +} + +// Describe description of the metric +func (c *DdrCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- descTotalMemory + ch <- descUsedMemory +} + +// CollectToCache collect the metric to cache +func (c *DdrCollector) CollectToCache(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) { + + for _, chip := range chipList { + logicID := chip.LogicID + mem, err := n.Dmgr.GetDeviceMemoryInfo(logicID) + if err != nil { + logErrMetricsWithLimit(colcommon.DomainForDDR, logicID, err) + continue + } + hwlog.ResetErrCnt(colcommon.DomainForDDR, logicID) + + c.LocalCache.Store(chip.PhyId, ddrCache{chip: chip, timestamp: time.Now(), extInfo: mem}) + } + colcommon.UpdateCache[ddrCache](n, colcommon.GetCacheKey(c), &c.LocalCache) + +} + +// UpdatePrometheus update prometheus metrics +func (c *DdrCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) { + + updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache ddrCache, cardLabel []string) { + extInfo := cache.extInfo + if extInfo == nil { + return + } + memorySize := extInfo.MemorySize + memoryAvailable := extInfo.MemoryAvailable + + doUpdateMetric(ch, cache.timestamp, memorySize, cardLabel, descTotalMemory) + doUpdateMetric(ch, cache.timestamp, memorySize-memoryAvailable, cardLabel, descUsedMemory) + + // vnpu not support this metrics + vDevActivityInfo := chipWithVnpu.VDevActivityInfo + if vDevActivityInfo != nil && common.IsValidVDevID(vDevActivityInfo.VDevID) { + return + } + + containerNameArray := getContainerNameArray(geenContainerInfo(&chipWithVnpu, containerMap)) + if !c.Is910Series && 
len(containerNameArray) == colcommon.ContainerNameLen { + doUpdateMetric(ch, cache.timestamp, memorySize, cardLabel, npuCtrTotalMemory) + doUpdateMetric(ch, cache.timestamp, memorySize-memoryAvailable, cardLabel, npuCtrUsedMemory) + } + } + + updateFrame[ddrCache](colcommon.GetCacheKey(c), n, containerMap, chips, updateSingleChip) +} + +// UpdateTelegraf update telegraf metrics +func (c *DdrCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *colcommon.NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} { + + caches := colcommon.GetInfoFromCache[ddrCache](n, colcommon.GetCacheKey(c)) + for _, chip := range chips { + cache, ok := caches[chip.PhyId] + if !ok { + logger.Debugf("cacheKey(%v) not found", chip.PhyId) + continue + } + fieldMap := getFieldMap(fieldsMap, cache.chip.LogicID) + + memoryInfo := cache.extInfo + if memoryInfo == nil { + logger.Debugf("info in cache is nil,cacheKey(%v)", chip.PhyId) + continue + } + memorySize := memoryInfo.MemorySize + memoryAvailable := memoryInfo.MemoryAvailable + + doUpdateTelegraf(fieldMap, descTotalMemory, memorySize, "") + doUpdateTelegraf(fieldMap, descUsedMemory, memorySize-memoryAvailable, "") + + } + return fieldsMap +} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hbm.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hbm.go new file mode 100644 index 0000000..d9f5601 --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hbm.go @@ -0,0 +1,228 @@ +/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package metrics for general collector +package metrics + +import ( + "time" + + "github.com/prometheus/client_golang/prometheus" + + "ascend-common/api" + "ascend-common/devmanager" + "ascend-common/devmanager/common" + colcommon "huawei.com/npu-exporter/v6/collector/common" + "huawei.com/npu-exporter/v6/collector/container" +) + +var ( + descHbmUsedMemory = colcommon.BuildDesc("npu_chip_info_hbm_used_memory", "the npu hbm used memory") + descHbmTotalMemory = colcommon.BuildDesc("npu_chip_info_hbm_total_memory", "the npu hbm total memory") + descHbmUtilization = colcommon.BuildDesc("npu_chip_info_hbm_utilization", "the npu hbm utilization") + descHbmTemperature = colcommon.BuildDesc("npu_chip_info_hbm_temperature", "the npu hbm temperature") + descHbmBWUtil = colcommon.BuildDesc("npu_chip_info_hbm_bandwidth_utilization", "the npu hbm bandwidth util rate") + + descEccEnableFlag = colcommon.BuildDesc("npu_chip_info_hbm_ecc_enable_flag", + "whether HBM ecc detection is enabled") + descEccSingleBitErrorCnt = colcommon.BuildDesc("npu_chip_info_hbm_ecc_single_bit_error_cnt", + "HBM Single Bit Error Count") + descEccDoubleBitErrorCnt = colcommon.BuildDesc("npu_chip_info_hbm_ecc_double_bit_error_cnt", + "HBM Double Bit Error Count") + + descEccTotalSingleBitErrorCnt = colcommon.BuildDesc("npu_chip_info_hbm_ecc_total_single_bit_error_cnt", + "HBM Single Bit Aggregate Total Err Cnt") + descEccTotalDoubleBitErrorCnt = colcommon.BuildDesc("npu_chip_info_hbm_ecc_total_double_bit_error_cnt", + "HBM Double Bit Aggregate Total Err Cnt") + descEccSingleBitIoslatedPagesCnt 
= colcommon.BuildDesc("npu_chip_info_hbm_ecc_single_bit_isolated_pages_cnt", + "HBM Single Bit Isolated Pages Count") + descEccDoubleBitIoslatedPagesCnt = colcommon.BuildDesc("npu_chip_info_hbm_ecc_double_bit_isolated_pages_cnt", + "HBM Double Bit Isolated Pages Count") +) + +var ( + supportedHbmDevices = map[string]bool{ + api.Ascend910A: true, + api.Ascend910B: true, + api.Ascend910A3: true, + } +) + +type hbmCache struct { + chip colcommon.HuaWeiAIChip + timestamp time.Time + // extInfo the hbm info + extInfo *common.HbmAggregateInfo + // hbmUtilization the hbm utilization + hbmUtilization uint32 +} + +// HbmCollector collects hbm info +type HbmCollector struct { + colcommon.MetricsCollectorAdapter +} + +// IsSupported check whether the collector is supported +func (c *HbmCollector) IsSupported(n *colcommon.NpuCollector) bool { + isSupport := supportedHbmDevices[n.Dmgr.GetDevType()] + logForUnSupportDevice(isSupport, n.Dmgr.GetDevType(), colcommon.GetCacheKey(c), "") + return isSupport +} + +// Describe describes all the metrics that will be exposed. +func (c *HbmCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- descHbmUsedMemory + ch <- descHbmTotalMemory + ch <- descHbmUtilization + ch <- descHbmTemperature + ch <- descHbmBWUtil + + ch <- descEccEnableFlag + ch <- descEccSingleBitErrorCnt + ch <- descEccDoubleBitErrorCnt + ch <- descEccTotalSingleBitErrorCnt + ch <- descEccTotalDoubleBitErrorCnt + ch <- descEccSingleBitIoslatedPagesCnt + ch <- descEccDoubleBitIoslatedPagesCnt +} + +// CollectToCache collects hbm info +func (c *HbmCollector) CollectToCache(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) { + for _, chip := range chipList { + getAllHBMEccInfo(c, chip.LogicID, n.Dmgr, &chip) + } + colcommon.UpdateCache[hbmCache](n, colcommon.GetCacheKey(c), &c.LocalCache) +} + +// UpdatePrometheus updates the prometheus metrics. 
+func (c *HbmCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) { + + updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache hbmCache, cardLabel []string) { + extInfo := cache.extInfo + if extInfo == nil { + return + } + timestamp := cache.timestamp + doUpdateMetricWithValidateNum(ch, timestamp, float64(cache.hbmUtilization), cardLabel, descHbmUtilization) + + c.updateHbmInfo(ch, cache, cardLabel, containerMap, chipWithVnpu) + + eccInfo := extInfo.ECCInfo + updateHbmEccInfo(ch, eccInfo, timestamp, cardLabel) + } + + updateFrame[hbmCache](colcommon.GetCacheKey(c), n, containerMap, chips, updateSingleChip) +} + +// UpdateTelegraf updates the telegraf metrics. +func (c *HbmCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *colcommon.NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} { + caches := colcommon.GetInfoFromCache[hbmCache](n, colcommon.GetCacheKey(c)) + for _, chip := range chips { + cache, ok := caches[chip.PhyId] + if !ok { + continue + } + fieldMap := getFieldMap(fieldsMap, cache.chip.LogicID) + + extInfo := cache.extInfo + if extInfo == nil { + continue + } + + doUpdateTelegrafWithValidateNum(fieldMap, descHbmUtilization, float64(cache.hbmUtilization), "") + + hbmInfo := extInfo.HbmInfo + if hbmInfo != nil { + doUpdateTelegraf(fieldMap, descHbmUsedMemory, hbmInfo.Usage, "") + doUpdateTelegraf(fieldMap, descHbmTotalMemory, hbmInfo.MemorySize, "") + doUpdateTelegraf(fieldMap, descHbmTemperature, hbmInfo.Temp, "") + doUpdateTelegraf(fieldMap, descHbmBWUtil, hbmInfo.BandWidthUtilRate, "") + } + + eccInfo := extInfo.ECCInfo + if eccInfo != nil { + doUpdateTelegraf(fieldMap, descEccEnableFlag, eccInfo.EnableFlag, "") + doUpdateTelegraf(fieldMap, descEccSingleBitErrorCnt, eccInfo.SingleBitErrorCnt, "") + doUpdateTelegraf(fieldMap, 
descEccDoubleBitErrorCnt, eccInfo.DoubleBitErrorCnt, "") + doUpdateTelegraf(fieldMap, descEccTotalSingleBitErrorCnt, eccInfo.TotalSingleBitErrorCnt, "") + doUpdateTelegraf(fieldMap, descEccTotalDoubleBitErrorCnt, eccInfo.TotalDoubleBitErrorCnt, "") + doUpdateTelegraf(fieldMap, descEccSingleBitIoslatedPagesCnt, eccInfo.SingleBitIsolatedPagesCnt, "") + doUpdateTelegraf(fieldMap, descEccDoubleBitIoslatedPagesCnt, eccInfo.DoubleBitIsolatedPagesCnt, "") + + } + } + return fieldsMap + +} + +func getAllHBMEccInfo(c *HbmCollector, logicID int32, dmgr devmanager.DeviceInterface, chip *colcommon.HuaWeiAIChip) { + + hbmInfo := &common.HbmAggregateInfo{} + var utilizationRate uint32 + var err error + hbmInfo.HbmInfo, err = dmgr.GetDeviceHbmInfo(logicID) + handleErr(err, colcommon.DomainForHBM, logicID) + + utilizationRate, err = dmgr.GetDeviceUtilizationRate(logicID, common.HbmUtilization) + handleErr(err, colcommon.DomainForHbmUtilization, logicID) + + hbmInfo.ECCInfo, err = dmgr.GetDeviceEccInfo(logicID, common.DcmiDeviceTypeHBM) + handleErr(err, colcommon.DomainForHBMECC, logicID) + c.LocalCache.Store(chip.PhyId, hbmCache{ + chip: *chip, + timestamp: time.Now(), + extInfo: hbmInfo, + hbmUtilization: utilizationRate}, + ) +} + +func updateHbmEccInfo(ch chan<- prometheus.Metric, eccInfo *common.ECCInfo, timestamp time.Time, cardLabel []string) { + if eccInfo == nil { + return + } + doUpdateMetric(ch, timestamp, eccInfo.EnableFlag, cardLabel, descEccEnableFlag) + doUpdateMetric(ch, timestamp, eccInfo.SingleBitErrorCnt, cardLabel, descEccSingleBitErrorCnt) + doUpdateMetric(ch, timestamp, eccInfo.DoubleBitErrorCnt, cardLabel, descEccDoubleBitErrorCnt) + doUpdateMetric(ch, timestamp, eccInfo.TotalSingleBitErrorCnt, cardLabel, descEccTotalSingleBitErrorCnt) + doUpdateMetric(ch, timestamp, eccInfo.TotalDoubleBitErrorCnt, cardLabel, descEccTotalDoubleBitErrorCnt) + doUpdateMetric(ch, timestamp, eccInfo.SingleBitIsolatedPagesCnt, cardLabel, descEccSingleBitIoslatedPagesCnt) + 
doUpdateMetric(ch, timestamp, eccInfo.DoubleBitIsolatedPagesCnt, cardLabel, descEccDoubleBitIoslatedPagesCnt) +} + +func (c *HbmCollector) updateHbmInfo(ch chan<- prometheus.Metric, cache hbmCache, cardLabel []string, + containerMap map[int32]container.DevicesInfo, chipWithVnpu colcommon.HuaWeiAIChip) { + hbmInfo := cache.extInfo + if hbmInfo == nil || hbmInfo.HbmInfo == nil { + return + } + timestamp := cache.timestamp + doUpdateMetric(ch, timestamp, hbmInfo.Usage, cardLabel, descHbmUsedMemory) + doUpdateMetric(ch, timestamp, hbmInfo.MemorySize, cardLabel, descHbmTotalMemory) + doUpdateMetric(ch, timestamp, hbmInfo.Temp, cardLabel, descHbmTemperature) + doUpdateMetric(ch, timestamp, hbmInfo.BandWidthUtilRate, cardLabel, descHbmBWUtil) + + // vnpu not support this metrics + vDevActivityInfo := chipWithVnpu.VDevActivityInfo + if vDevActivityInfo != nil && common.IsValidVDevID(vDevActivityInfo.VDevID) { + return + } + + containerNameArray := getContainerNameArray(geenContainerInfo(&chipWithVnpu, containerMap)) + if c.Is910Series && len(containerNameArray) == colcommon.ContainerNameLen { + doUpdateMetric(ch, timestamp, hbmInfo.MemorySize, cardLabel, npuCtrTotalMemory) + doUpdateMetric(ch, timestamp, hbmInfo.Usage, cardLabel, npuCtrUsedMemory) + } +} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hbm_test.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hbm_test.go new file mode 100644 index 0000000..4bf59cd --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hbm_test.go @@ -0,0 +1,115 @@ +/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package metrics for general collector +package metrics + +import ( + "testing" + "time" + + "github.com/agiledragon/gomonkey/v2" + "github.com/prometheus/client_golang/prometheus" + "github.com/smartystreets/goconvey/convey" + + "ascend-common/devmanager/common" + colcommon "huawei.com/npu-exporter/v6/collector/common" +) + +type TestCase struct { + name string + initFunc func() + expectMetricLen int +} + +const ( + expectMetricLen4 = 4 + expectMetricLen6 = 6 + vdevId = 132 + maxMetrics = 10 + mockNs = "mockNs" + mockPodName = "mockPodName" +) + +func TestUpdateHbmInfo(t *testing.T) { + collector := HbmCollector{} + ch := make(chan int, maxMetrics) + defer close(ch) + cache := buildHbmCache() + chipWithVnpu := &colcommon.HuaWeiAIChip{} + cases := buildTestCases(&collector, chipWithVnpu, &cache) + patch := gomonkey.NewPatches() + patch.ApplyFunc(doUpdateMetric, func(_ chan<- prometheus.Metric, _ time.Time, _ interface{}, _ []string, + desc *prometheus.Desc) { + ch <- 0 + }) + patch.ApplyFuncReturn(geenContainerInfo, nil) + patch.ApplyFuncReturn(getContainerNameArray, []string{mockNs, mockPodName, mockContainerName}) + defer patch.Reset() + + for _, c := range cases { + convey.Convey(c.name, t, func() { + ch = make(chan int, maxMetrics) + c.initFunc() + collector.updateHbmInfo(nil, cache, nil, nil, *chipWithVnpu) + convey.So(len(ch), convey.ShouldEqual, c.expectMetricLen) + }) + } +} + +func buildTestCases(collector *HbmCollector, chipWithVnpu *colcommon.HuaWeiAIChip, cache *hbmCache) []TestCase { + cases := []TestCase{ + {name: "when npu is not 910 series 
", initFunc: func() {}, expectMetricLen: expectMetricLen4}, + {name: "when vnpu is nil and with container info", initFunc: func() { + collector.Is910Series = true + }, expectMetricLen: expectMetricLen6}, + {name: "when chip is vnpu", initFunc: func() { + chipWithVnpu.VDevActivityInfo = &common.VDevActivityInfo{ + VDevID: vdevId, + } + }, expectMetricLen: expectMetricLen4}, + {name: "when extInfo.HbmInfo is nil", initFunc: func() { cache.extInfo.HbmInfo = nil }, expectMetricLen: 0}, + {name: "when extInfo is nil", initFunc: func() { cache.extInfo = nil }, expectMetricLen: 0}, + } + return cases +} + +func buildHbmCache() hbmCache { + cache := hbmCache{ + chip: colcommon.HuaWeiAIChip{ + PhyId: 0, + }, + hbmUtilization: 0, + timestamp: time.Now(), + extInfo: &common.HbmAggregateInfo{ + HbmInfo: &common.HbmInfo{ + BandWidthUtilRate: 0, + Frequency: 0, + MemorySize: 0, + Temp: 0, + Usage: 0, + }, + ECCInfo: &common.ECCInfo{ + EnableFlag: 0, + SingleBitErrorCnt: 0, + DoubleBitErrorCnt: 0, + TotalSingleBitErrorCnt: 0, + TotalDoubleBitErrorCnt: 0, + SingleBitIsolatedPagesCnt: 0, + DoubleBitIsolatedPagesCnt: 0, + }, + }, + } + return cache +} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hccs.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hccs.go new file mode 100644 index 0000000..1ecc3a9 --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hccs.go @@ -0,0 +1,312 @@ +/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package metrics for general collector +package metrics + +import ( + "fmt" + "strconv" + "time" + + "github.com/prometheus/client_golang/prometheus" + + "ascend-common/api" + "ascend-common/devmanager/common" + colcommon "huawei.com/npu-exporter/v6/collector/common" + "huawei.com/npu-exporter/v6/collector/container" + "huawei.com/npu-exporter/v6/utils/logger" +) + +var ( + hccsTxDescs []*prometheus.Desc + hccsRxDescs []*prometheus.Desc + hccsErrDescs []*prometheus.Desc + hccsBWTxDescs []*prometheus.Desc + hccsBWRxDescs []*prometheus.Desc + hccsBWProfilingTime *prometheus.Desc = nil + hccsBWTotalTx *prometheus.Desc = nil + hccsBWTotalRx *prometheus.Desc = nil + + supportedHccsDevices = map[string]bool{ + api.Ascend910B: true, + api.Ascend910A3: true, + } +) + +const ( + // MaxHccsNum max hccs num + MaxHccsNum int = 8 + // hccs info begin index, 1 or 2 + num1 = 1 + num2 = 2 +) + +// init add descs in init method +func init() { + for i := 0; i < MaxHccsNum; i++ { + index := strconv.Itoa(i) + colcommon.BuildDescSlice(&hccsTxDescs, api.Prefix+"tx_cnt_"+index, + "transmitted message count for "+api.Hccs+" "+index) + colcommon.BuildDescSlice(&hccsRxDescs, api.Prefix+"rx_cnt_"+index, + "received message count for "+api.Hccs+" "+index) + colcommon.BuildDescSlice(&hccsErrDescs, api.Prefix+"crc_err_cnt_"+index, + "crc error count for "+api.Hccs+" "+index) + colcommon.BuildDescSlice(&hccsBWTxDescs, api.BwPrefix+"tx_"+index, + "single-link transmission data bandwidth for "+api.Hccs+" "+index) + colcommon.BuildDescSlice(&hccsBWRxDescs, api.BwPrefix+"rx_"+index, + "single-link receive data bandwidth for "+api.Hccs+" "+index) + } + hccsBWProfilingTime = colcommon.BuildDesc(api.BwPrefix+"profiling_time", + "sampling interval for "+api.Hccs+" bandwidth") + hccsBWTotalTx = colcommon.BuildDesc(api.BwPrefix+"total_tx", "total sent data bandwidth") + hccsBWTotalRx = 
colcommon.BuildDesc(api.BwPrefix+"total_rx", "total received data bandwidth") +} + +type hccsCache struct { + chip colcommon.HuaWeiAIChip + timestamp time.Time + // hccsStat hccs info of npu chip + hccsStat *common.HccsStatisticInfo + + // hccsBW hccs bandwidth info of npu chip + hccsBW *common.HccsBandwidthInfo +} + +// HccsCollector collect hccs info +type HccsCollector struct { + colcommon.MetricsCollectorAdapter + hccsBeginIndex int + + // Automatically adapt according to the interface call + realGetStatisticInfoFunc func(logicID int32) (*common.HccsStatisticInfo, error) +} + +// IsSupported judge whether the collector is supported +func (c *HccsCollector) IsSupported(n *colcommon.NpuCollector) bool { + isSupport := supportedHccsDevices[n.Dmgr.GetDevType()] + logForUnSupportDevice(isSupport, n.Dmgr.GetDevType(), colcommon.GetCacheKey(c), "") + return isSupport +} + +// Describe description of the metric +func (c *HccsCollector) Describe(ch chan<- *prometheus.Desc) { + for _, desc := range hccsTxDescs { + ch <- desc + } + for _, desc := range hccsRxDescs { + ch <- desc + } + for _, desc := range hccsErrDescs { + ch <- desc + } + for _, desc := range hccsBWTxDescs { + ch <- desc + } + for _, desc := range hccsBWRxDescs { + ch <- desc + } + ch <- hccsBWProfilingTime + ch <- hccsBWTotalTx + ch <- hccsBWTotalRx +} + +// CollectToCache collect the metric to cache +func (c *HccsCollector) CollectToCache(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) { + for _, chip := range chipList { + logicID := chip.LogicID + var hccsStatisticInfo *common.HccsStatisticInfo + var err error + if c.realGetStatisticInfoFunc != nil { + hccsStatisticInfo, err = c.realGetStatisticInfoFunc(logicID) + } else { + hccsStatisticInfo = buildFailedHccsInfo() + err = fmt.Errorf("realGetStatisticInfoFunc is nil when get hccs info, " + + "maybe both GetHccsStatisticInfoInU64 and GetHccsStatisticInfo can't be unreached") + } + handleErr(err, colcommon.DomainForHccs, logicID) + + 
hccsBandwidthInfo, err := n.Dmgr.GetHccsBandwidthInfo(logicID) + handleErr(err, colcommon.DomainForHccsBW, logicID) + c.LocalCache.Store(chip.PhyId, hccsCache{ + chip: chip, + timestamp: time.Now(), + hccsStat: hccsStatisticInfo, + hccsBW: hccsBandwidthInfo}, + ) + } + + colcommon.UpdateCache[hccsCache](n, colcommon.GetCacheKey(c), &c.LocalCache) +} + +// PreCollect pre collect hccs info +func (c *HccsCollector) PreCollect(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) { + if len(chipList) == 0 { + return + } + chipOne := chipList[0] + devType := n.Dmgr.GetDevType() + if devType == api.Ascend910B || common.IsA900A3SuperPod(chipOne.MainBoardId) || + common.Is800IA3Chip(chipOne.MainBoardId) { + // A2 or A900A3 SuperPod or 800IA3 begin at 1st bit + c.hccsBeginIndex = num1 + } else if common.IsA9000A3SuperPod(chipOne.MainBoardId) { + // A9000A3SuperPod begin at 2nd bit + c.hccsBeginIndex = num2 + } else { + logger.LogfWithOptions(logger.ErrorLevel, logger.LogOptions{Domain: api.Hccs, ID: "0"}, + "not support main board id:%d", chipOne.MainBoardId) + } + + // Both failed, retry 3 times with 2s interval + const retryTimes = 3 + const retryInterval = 2 * time.Second + var success bool + var err1, err2 error + for i := 0; i < retryTimes; i++ { + _, err1 = n.Dmgr.GetHccsStatisticInfoInU64(chipOne.LogicID) + if err1 == nil { + logger.Infof("get hccs statistic info by subCmd(5) succeeded, will use subCmd(5) to get hccs info") + c.realGetStatisticInfoFunc = n.Dmgr.GetHccsStatisticInfoInU64 + success = true + break + } + _, err2 = n.Dmgr.GetHccsStatisticInfo(chipOne.LogicID) + if err2 == nil { + logger.Infof("get hccs statistic info by subCmd(3) succeeded, will use subCmd(3) to get hccs info") + c.realGetStatisticInfoFunc = n.Dmgr.GetHccsStatisticInfo + success = true + break + } + time.Sleep(retryInterval) + } + // If still failed after retries, set to nil and log error + if !success { + logger.Errorf("get hccs statistic info failed after trying both subCmd(5) 
and subCmd(3) with 3 retries, "+ + "err1: %v, err2: %v", err1, err2) + c.realGetStatisticInfoFunc = nil + } + +} + +// UpdatePrometheus update prometheus +func (c *HccsCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) { + + updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache hccsCache, cardLabel []string) { + timestamp := cache.timestamp + promUpdateHccsStatisticInfo(ch, cache, c, timestamp, cardLabel) + promUpdateHccsBwInfo(ch, cache, c, timestamp, cardLabel) + } + updateFrame[hccsCache](colcommon.GetCacheKey(c), n, containerMap, chips, updateSingleChip) +} + +func promUpdateHccsBwInfo(ch chan<- prometheus.Metric, cache hccsCache, c *HccsCollector, + timestamp time.Time, cardLabel []string) { + bandwidthInfo := cache.hccsBW + if bandwidthInfo == nil { + return + } + if c.hccsBeginIndex < 0 { + logger.Errorf("invalid %sBeginIndex %v", api.Hccs, c.hccsBeginIndex) + return + } + for i := c.hccsBeginIndex; i < MaxHccsNum; i++ { + doUpdateMetric(ch, timestamp, bandwidthInfo.TxBandwidth[i], cardLabel, hccsBWTxDescs[i]) + doUpdateMetric(ch, timestamp, bandwidthInfo.RxBandwidth[i], cardLabel, hccsBWRxDescs[i]) + } + doUpdateMetric(ch, timestamp, bandwidthInfo.ProfilingTime, cardLabel, hccsBWProfilingTime) + doUpdateMetric(ch, timestamp, bandwidthInfo.TotalTxbw, cardLabel, hccsBWTotalTx) + doUpdateMetric(ch, timestamp, bandwidthInfo.TotalRxbw, cardLabel, hccsBWTotalRx) +} + +func promUpdateHccsStatisticInfo(ch chan<- prometheus.Metric, cache hccsCache, c *HccsCollector, + timestamp time.Time, cardLabel []string) { + statisticInfo := cache.hccsStat + + if statisticInfo == nil { + return + } + if c.hccsBeginIndex < 0 { + logger.Errorf("invalid %sBeginIndex %v", api.Hccs, c.hccsBeginIndex) + return + } + for i := c.hccsBeginIndex; i < MaxHccsNum; i++ { + doUpdateMetric(ch, timestamp, statisticInfo.TxCnt[i], cardLabel, hccsTxDescs[i]) + 
doUpdateMetric(ch, timestamp, statisticInfo.RxCnt[i], cardLabel, hccsRxDescs[i]) + doUpdateMetric(ch, timestamp, statisticInfo.CrcErrCnt[i], cardLabel, hccsErrDescs[i]) + } +} + +// UpdateTelegraf update telegraf +func (c *HccsCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *colcommon.NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} { + + caches := colcommon.GetInfoFromCache[hccsCache](n, colcommon.GetCacheKey(c)) + for _, chip := range chips { + cache, ok := caches[chip.PhyId] + if !ok { + continue + } + fieldMap := getFieldMap(fieldsMap, cache.chip.LogicID) + + telegrafUpdateHccsStatisticInfo(cache, c, fieldMap) + telegrafUpdateHccsBwInfo(cache, c, fieldMap) + } + + return fieldsMap + +} + +func telegrafUpdateHccsBwInfo(cache hccsCache, c *HccsCollector, fieldMap map[string]interface{}) { + bandwidthInfo := cache.hccsBW + if bandwidthInfo == nil || c.hccsBeginIndex < 0 { + return + } + for i := c.hccsBeginIndex; i < MaxHccsNum; i++ { + doUpdateTelegraf(fieldMap, hccsBWTxDescs[i], bandwidthInfo.TxBandwidth[i], "") + doUpdateTelegraf(fieldMap, hccsBWRxDescs[i], bandwidthInfo.RxBandwidth[i], "") + } + doUpdateTelegraf(fieldMap, hccsBWProfilingTime, bandwidthInfo.ProfilingTime, "") + doUpdateTelegraf(fieldMap, hccsBWTotalTx, bandwidthInfo.TotalTxbw, "") + doUpdateTelegraf(fieldMap, hccsBWTotalRx, bandwidthInfo.TotalRxbw, "") +} + +func telegrafUpdateHccsStatisticInfo(cache hccsCache, c *HccsCollector, fieldMap map[string]interface{}) { + statisticInfo := cache.hccsStat + + if statisticInfo == nil || c.hccsBeginIndex < 0 { + return + } + for i := c.hccsBeginIndex; i < MaxHccsNum; i++ { + doUpdateTelegraf(fieldMap, hccsTxDescs[i], statisticInfo.TxCnt[i], "") + doUpdateTelegraf(fieldMap, hccsRxDescs[i], statisticInfo.RxCnt[i], "") + doUpdateTelegraf(fieldMap, hccsErrDescs[i], statisticInfo.CrcErrCnt[i], "") + } +} + +// buildFailedHccsInfo build failed hccs info +func 
buildFailedHccsInfo() *common.HccsStatisticInfo { + errorResult := &common.HccsStatisticInfo{ + TxCnt: make([]uint64, 8), + RxCnt: make([]uint64, 8), + CrcErrCnt: make([]uint64, 8), + } + for i := 0; i < 8; i++ { + errorResult.TxCnt[i] = common.FailedValue + errorResult.RxCnt[i] = common.FailedValue + errorResult.CrcErrCnt[i] = common.FailedValue + } + return errorResult +} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hccs_test.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hccs_test.go new file mode 100644 index 0000000..4b596df --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hccs_test.go @@ -0,0 +1,150 @@ +/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/
+
+// Package metrics for general collector
+package metrics
+
+import (
+	"testing"
+
+	"github.com/agiledragon/gomonkey/v2"
+	"github.com/smartystreets/goconvey/convey"
+
+	"ascend-common/api"
+	"ascend-common/devmanager/common"
+	colcommon "huawei.com/npu-exporter/v6/collector/common"
+)
+
+// Shared fixture values for the PreCollect tests in this file.
+const (
+	mockLogicID         int32  = 0
+	mockMainBoardId     uint32 = 100
+	errorMsgWith8001    string = "error code 8001 occurred"
+	errorMsgWithout8001 string = "error code 8002 occurred"
+	singleChipList      int    = 1
+	unsupportedBoardId  uint32 = 999
+)
+
+// preCollectTestCase describes one table-driven scenario for HccsCollector.PreCollect.
+type preCollectTestCase struct {
+	// name is the convey description of the scenario
+	name string
+	// chipList is the chip slice handed to PreCollect
+	chipList []colcommon.HuaWeiAIChip
+	// devType is the value the patched GetDevType returns
+	devType string
+	// mainBoardId is the main board ID the chips in chipList were created with
+	mainBoardId uint32
+	// isA900A3SuperPod is the value the patched common.IsA900A3SuperPod returns
+	isA900A3SuperPod bool
+	// isA9000A3SuperPod is the value the patched common.IsA9000A3SuperPod returns
+	isA9000A3SuperPod bool
+	// is800IA3Chip is the value the patched common.Is800IA3Chip returns
+	is800IA3Chip bool
+	// getStatInfoErr is the error the patched GetHccsStatisticInfoInU64 returns
+	getStatInfoErr error
+	// expectedBeginIndex is the hccsBeginIndex expected after PreCollect
+	expectedBeginIndex int
+	// expectedFuncSet tells whether realGetStatisticInfoFunc is expected to be non-nil
+	expectedFuncSet bool
+}
+
+// TestPreCollect runs every case from buildPreCollectTestCases against a fresh HccsCollector.
+// Each case installs its own gomonkey patches and resets them when the convey block ends.
+func TestPreCollect(t *testing.T) {
+	n := mockNewNpuCollector()
+	testCases := buildPreCollectTestCases()
+
+	for _, tc := range testCases {
+		convey.Convey(tc.name, t, func() {
+			patches := gomonkey.NewPatches()
+			defer patches.Reset()
+
+			setupPatches(patches, n, tc)
+			collector := &HccsCollector{}
+			collector.PreCollect(n, tc.chipList)
+			verifyPreCollectResult(collector, tc)
+		})
+	}
+}
+
+// buildPreCollectTestCases returns the early-return and unsupported-board cases
+// followed by the cases from buildBeginIndexCases.
+func buildPreCollectTestCases() []preCollectTestCase {
+	cases := []preCollectTestCase{
+		{name: "should return early when chipList is empty",
+			chipList:           []colcommon.HuaWeiAIChip{},
+			expectedBeginIndex: 0,
+			expectedFuncSet:    false},
+		{name: "should not set beginIndex when mainBoardId is not supported",
+			chipList:           createMockChipList(singleChipList, unsupportedBoardId),
+			devType:            api.Ascend910A3,
+			mainBoardId:        unsupportedBoardId,
+			getStatInfoErr:     nil,
+			expectedBeginIndex: 0,
+			expectedFuncSet:    true},
+	}
+	cases = append(cases, buildBeginIndexCases()...)
+	return cases
+}
+
+// buildBeginIndexCases returns the cases that expect hccsBeginIndex to be set to num1 or num2.
+func buildBeginIndexCases() []preCollectTestCase {
+	return []preCollectTestCase{
+		{name: "should set beginIndex to num1 when devType is Ascend910B",
+			chipList:           createMockChipList(singleChipList, mockMainBoardId),
+			devType:            api.Ascend910B,
+			mainBoardId:        mockMainBoardId,
+			getStatInfoErr:     nil,
+			expectedBeginIndex: num1,
+			expectedFuncSet:    true},
+		{name: "should set beginIndex to num1 when IsA900A3SuperPod returns true",
+			chipList:           createMockChipList(singleChipList, mockMainBoardId),
+			devType:            api.Ascend910A3,
+			mainBoardId:        mockMainBoardId,
+			isA900A3SuperPod:   true,
+			getStatInfoErr:     nil,
+			expectedBeginIndex: num1,
+			expectedFuncSet:    true},
+		{name: "should set beginIndex to num1 when Is800IA3Chip returns true",
+			chipList:           createMockChipList(singleChipList, mockMainBoardId),
+			devType:            api.Ascend910A3,
+			mainBoardId:        mockMainBoardId,
+			is800IA3Chip:       true,
+			getStatInfoErr:     nil,
+			expectedBeginIndex: num1,
+			expectedFuncSet:    true},
+		{name: "should set beginIndex to num2 when IsA9000A3SuperPod returns true",
+			chipList:           createMockChipList(singleChipList, mockMainBoardId),
+			devType:            api.Ascend910A3,
+			mainBoardId:        mockMainBoardId,
+			isA9000A3SuperPod:  true,
+			getStatInfoErr:     nil,
+			expectedBeginIndex: num2,
+			expectedFuncSet:    true},
+	}
+}
+
+// createMockChipList builds count mock chips that all carry mainBoardId.
+// Logic IDs start at mockLogicID and increase by one per chip, so a single-chip
+// list is identical to the previous fixture. A non-positive count yields an empty list.
+func createMockChipList(count int, mainBoardId uint32) []colcommon.HuaWeiAIChip {
+	if count <= 0 {
+		return []colcommon.HuaWeiAIChip{}
+	}
+	chips := make([]colcommon.HuaWeiAIChip, 0, count)
+	for i := 0; i < count; i++ {
+		chips = append(chips, colcommon.HuaWeiAIChip{
+			LogicID:     mockLogicID + int32(i),
+			MainBoardId: mainBoardId,
+		})
+	}
+	return chips
+}
+
+// setupPatches stubs the device-manager methods and board-detection helpers
+// with the return values configured in tc.
+func setupPatches(patches *gomonkey.Patches, n *colcommon.NpuCollector, tc preCollectTestCase) {
+	patches.ApplyMethodReturn(n.Dmgr, "GetDevType", tc.devType)
+	patches.ApplyFuncReturn(common.IsA900A3SuperPod, tc.isA900A3SuperPod)
+	patches.ApplyFuncReturn(common.IsA9000A3SuperPod, tc.isA9000A3SuperPod)
+	patches.ApplyFuncReturn(common.Is800IA3Chip, tc.is800IA3Chip)
+	patches.ApplyMethodReturn(n.Dmgr, "GetHccsStatisticInfoInU64",
+		&common.HccsStatisticInfo{}, tc.getStatInfoErr)
+}
+
+// verifyPreCollectResult asserts the begin index and whether the statistic function was set.
+func verifyPreCollectResult(collector *HccsCollector, tc preCollectTestCase) {
+	convey.So(collector.hccsBeginIndex, convey.ShouldEqual, tc.expectedBeginIndex)
+	if tc.expectedFuncSet {
+		convey.So(collector.realGetStatisticInfoFunc, convey.ShouldNotBeNil)
+	} else {
+		convey.So(collector.realGetStatisticInfoFunc, convey.ShouldBeNil)
+	}
+}
diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_network.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_network.go
new file mode 100644
index 0000000..018a370
--- /dev/null
+++ b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_network.go
@@ -0,0 +1,190 @@
+/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+// Package metrics for general collector
+package metrics
+
+import (
+	"time"
+
+	"github.com/prometheus/client_golang/prometheus"
+
+	"ascend-common/common-utils/hwlog"
+	"ascend-common/devmanager/common"
+	"ascend-common/devmanager/hccn"
+	colcommon "huawei.com/npu-exporter/v6/collector/common"
+	"huawei.com/npu-exporter/v6/collector/container"
+)
+
+var (
+	// bandwidth
+	descBandwidthTx = colcommon.BuildDesc("npu_chip_info_bandwidth_tx",
+		"the npu interface transport speed, unit is 'MB/s'")
+	descBandwidthRx = colcommon.BuildDesc("npu_chip_info_bandwidth_rx",
+		"the npu interface receive speed, unit is 'MB/s'")
+
+	// linkspeed
+	npuChipLinkSpeed = colcommon.BuildDesc("npu_chip_link_speed",
+		"the npu interface receive link speed, unit is 'Mb/s'")
+
+	// linkupNum
+	npuChipLinkUpNum = colcommon.BuildDesc("npu_chip_link_up_num", "the npu interface receive link-up num")
+
+	// linkstatus
+	descLinkStatus = colcommon.BuildDesc("npu_chip_info_link_status", "the npu link status")
+)
+
+// netInfoCache is the per-chip cache entry written by NetworkCollector.CollectToCache.
+type netInfoCache struct {
+	// chip is the chip the sample belongs to
+	chip colcommon.HuaWeiAIChip
+	// timestamp is the time the sample was stored
+	timestamp time.Time
+	// extInfo holds the collected network values; sub-structs are nil when their query failed
+	extInfo *common.NpuNetInfo
+}
+
+// NetworkCollector collects the network info
+type NetworkCollector struct {
+	colcommon.MetricsCollectorAdapter
+}
+
+// IsSupported check if the collector is supported.
+// It returns the result of Dmgr.IsTrainingCard and passes it to logForUnSupportDevice.
+func (c *NetworkCollector) IsSupported(n *colcommon.NpuCollector) bool {
+	isSupport := n.Dmgr.IsTrainingCard()
+	logForUnSupportDevice(isSupport, n.Dmgr.GetDevType(), colcommon.GetCacheKey(c),
+		"only training card supports network related info")
+	return isSupport
+}
+
+// Describe description of the metric
+func (c *NetworkCollector) Describe(ch chan<- *prometheus.Desc) {
+	// bandwidth
+	ch <- descBandwidthTx
+	ch <- descBandwidthRx
+	// linkspeed
+	ch <- npuChipLinkSpeed
+	// linkupNum
+	ch <- npuChipLinkUpNum
+	// linkstatus
+	ch <- descLinkStatus
+}
+
+// CollectToCache collect the metric to cache.
+// Every chip in chipList is queried by its physical ID and stored in the local cache
+// under that ID; the local cache is then published through colcommon.UpdateCache.
+func (c *NetworkCollector) CollectToCache(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) {
+	for _, chip := range chipList {
+		netInfo := collectNetworkInfo(chip.PhyId)
+		c.LocalCache.Store(chip.PhyId, netInfoCache{chip: chip, timestamp: time.Now(), extInfo: &netInfo})
+	}
+	colcommon.UpdateCache[netInfoCache](n, colcommon.GetCacheKey(c), &c.LocalCache)
+}
+
+// UpdatePrometheus update prometheus metrics.
+// Only the sub-structs accepted by validateNotNilForEveryElement are reported.
+func (c *NetworkCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector,
+	containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) {
+
+	updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache netInfoCache, cardLabel []string) {
+		netInfo := cache.extInfo
+		if netInfo == nil {
+			return
+		}
+		// named timestamp (not time) so the imported time package is not shadowed
+		timestamp := cache.timestamp
+		if validateNotNilForEveryElement(netInfo.BandwidthInfo) {
+			doUpdateMetricWithValidateNum(ch, timestamp, netInfo.BandwidthInfo.TxValue, cardLabel, descBandwidthTx)
+			doUpdateMetricWithValidateNum(ch, timestamp, netInfo.BandwidthInfo.RxValue, cardLabel, descBandwidthRx)
+		}
+		if validateNotNilForEveryElement(netInfo.LinkSpeedInfo) {
+			doUpdateMetricWithValidateNum(ch, timestamp, netInfo.LinkSpeedInfo.Speed, cardLabel, npuChipLinkSpeed)
+		}
+		if validateNotNilForEveryElement(netInfo.LinkStatInfo) {
+			doUpdateMetricWithValidateNum(ch, timestamp, netInfo.LinkStatInfo.LinkUPNum, cardLabel, npuChipLinkUpNum)
+		}
+		if validateNotNilForEveryElement(netInfo.LinkStatusInfo) {
+			doUpdateMetricWithValidateNum(ch, timestamp,
+				float64(hccn.GetLinkStatusCode(netInfo.LinkStatusInfo.LinkState)), cardLabel, descLinkStatus)
+		}
+	}
+	updateFrame[netInfoCache](colcommon.GetCacheKey(c), n, containerMap, chips, updateSingleChip)
+}
+
+// UpdateTelegraf update telegraf metrics.
+// Chips without a cache entry, or whose entry has no extInfo, are skipped.
+// It returns the fieldsMap it was given.
+func (c *NetworkCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *colcommon.NpuCollector,
+	containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} {
+
+	caches := colcommon.GetInfoFromCache[netInfoCache](n, colcommon.GetCacheKey(c))
+	for _, chip := range chips {
+		cache, ok := caches[chip.PhyId]
+		if !ok {
+			continue
+		}
+		fieldMap := getFieldMap(fieldsMap, cache.chip.LogicID)
+		extInfo := cache.extInfo
+		if extInfo == nil {
+			continue
+		}
+		if validateNotNilForEveryElement(extInfo.BandwidthInfo) {
+			doUpdateTelegrafWithValidateNum(fieldMap, descBandwidthTx, extInfo.BandwidthInfo.TxValue, "")
+			doUpdateTelegrafWithValidateNum(fieldMap, descBandwidthRx, extInfo.BandwidthInfo.RxValue, "")
+		}
+		if validateNotNilForEveryElement(extInfo.LinkSpeedInfo) {
+			doUpdateTelegrafWithValidateNum(fieldMap, npuChipLinkSpeed, extInfo.LinkSpeedInfo.Speed, "")
+		}
+		if validateNotNilForEveryElement(extInfo.LinkStatInfo) {
+			doUpdateTelegrafWithValidateNum(fieldMap, npuChipLinkUpNum, extInfo.LinkStatInfo.LinkUPNum, "")
+		}
+		if validateNotNilForEveryElement(extInfo.LinkStatusInfo) {
+			doUpdateTelegrafWithValidateNum(fieldMap, descLinkStatus,
+				float64(hccn.GetLinkStatusCode(extInfo.LinkStatusInfo.LinkState)), "")
+		}
+	}
+	return fieldsMap
+}
+
+// collectNetworkInfo queries link status, traffic, link-up count and link speed for phyID.
+// A failed link-status query is recorded as colcommon.Abnormal; every other failed query
+// leaves its sub-struct nil. Each success resets the error counter of its domain and each
+// failure is logged through logErrMetricsWithLimit.
+func collectNetworkInfo(phyID int32) common.NpuNetInfo {
+	newNetInfo := common.NpuNetInfo{}
+
+	newNetInfo.LinkStatusInfo = &common.LinkStatusInfo{}
+	if linkState, err := hccn.GetNPULinkStatus(phyID); err == nil {
+		newNetInfo.LinkStatusInfo.LinkState = linkState
+		hwlog.ResetErrCnt(colcommon.DomainForLinkState, phyID)
+	} else {
+		logErrMetricsWithLimit(colcommon.DomainForLinkState, phyID, err)
+		newNetInfo.LinkStatusInfo.LinkState = colcommon.Abnormal
+	}
+
+	if tx, rx, err := hccn.GetNPUInterfaceTraffic(phyID); err == nil {
+		newNetInfo.BandwidthInfo = &common.BandwidthInfo{}
+		newNetInfo.BandwidthInfo.RxValue = rx
+		newNetInfo.BandwidthInfo.TxValue = tx
+		hwlog.ResetErrCnt(colcommon.DomainForBandwidth, phyID)
+	} else {
+		newNetInfo.BandwidthInfo = nil
+		logErrMetricsWithLimit(colcommon.DomainForBandwidth, phyID, err)
+	}
+	if linkUpNum, err := hccn.GetNPULinkUpNum(phyID); err == nil {
+		newNetInfo.LinkStatInfo = &common.LinkStatInfo{}
+		newNetInfo.LinkStatInfo.LinkUPNum = float64(linkUpNum)
+		hwlog.ResetErrCnt(colcommon.DomainForLinkStat, phyID)
+	} else {
+		newNetInfo.LinkStatInfo = nil
+		logErrMetricsWithLimit(colcommon.DomainForLinkStat, phyID, err)
+	}
+
+	if speed, err := hccn.GetNPULinkSpeed(phyID); err == nil {
+		newNetInfo.LinkSpeedInfo = &common.LinkSpeedInfo{}
+		newNetInfo.LinkSpeedInfo.Speed = float64(speed)
+		hwlog.ResetErrCnt(colcommon.DomainForLinkSpeed, phyID)
+	} else {
+		newNetInfo.LinkSpeedInfo = nil
+		logErrMetricsWithLimit(colcommon.DomainForLinkSpeed, phyID, err)
+	}
+
+	return newNetInfo
+}
diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_npu.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_npu.go
new file mode 100644
index 0000000..975ffcf
--- /dev/null
+++ b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_npu.go
@@ -0,0 +1,453 @@
+/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+// Package metrics for general collector
+package metrics
+
+import (
+	"math"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/prometheus/client_golang/prometheus"
+
+	"ascend-common/api"
+	"ascend-common/devmanager"
+	"ascend-common/devmanager/common"
+	colcommon "huawei.com/npu-exporter/v6/collector/common"
+	"huawei.com/npu-exporter/v6/collector/container"
+	"huawei.com/npu-exporter/v6/utils/logger"
+)
+
+var (
+	errorCodeDescs []*prometheus.Desc
+	// every derived label slice starts from a private copy of colcommon.CardLabel, so
+	// appending to or rewriting one of them can never touch the shared slice or a sibling
+	cardLabelForProcess   = append(append([]string(nil), colcommon.CardLabel...), "process_id", "container_id")
+	cardLabelForContainer []string
+	cardLabelForSN        []string
+	cardLabelForNpuName   = make([]string, len(colcommon.CardLabel))
+)
+
+var (
+	machineInfoNPUDesc = colcommon.BuildDescWithLabel("machine_npu_nums", "Amount of npu installed on the machine.", nil)
+
+	descUtil       = colcommon.BuildDesc("npu_chip_info_utilization", "the ai core utilization")
+	descOverUtil   = colcommon.BuildDesc("npu_chip_info_overall_utilization", "the overall utilization of npu")
+	descVectorUtil = colcommon.BuildDesc("npu_chip_info_vector_utilization", "the vector ai core utilization")
+	descTemp       = colcommon.BuildDesc("npu_chip_info_temperature", "the npu temperature")
+	descPower      = colcommon.BuildDesc("npu_chip_info_power", "the npu power")
+	descVoltage    = colcommon.BuildDesc("npu_chip_info_voltage", "the npu voltage")
+
+	descAICoreFreq = colcommon.BuildDesc("npu_chip_info_aicore_current_freq",
+		"the npu ai core current frequency, unit is 'MHz'")
+	descHealthStatus  = colcommon.BuildDesc("npu_chip_info_health_status", "the npu health status")
+	descDevProcessNum = colcommon.BuildDesc("npu_chip_info_process_info_num",
+		"the npu process num")
+
+	descDevProcessInfo = colcommon.BuildDescWithLabel("npu_chip_info_process_info",
+		"the npu process info, unit is 'MB'. if process run on host, container_id and container_name will be empty",
+		cardLabelForProcess)
+
+	// net status
+	descNetworkStatus = colcommon.BuildDesc("npu_chip_info_network_status", "the npu network health status")
+
+	// container metrics (not supported for vnpu), only reported to prometheus
+	npuCtrUtilization = colcommon.BuildDesc("container_npu_utilization",
+		"npu ai core utilization in container, unit is '%'")
+	npuCtrTotalMemory = colcommon.BuildDesc("container_npu_total_memory",
+		"npu total memory in container, unit is 'MB'")
+	npuCtrUsedMemory = colcommon.BuildDesc("container_npu_used_memory",
+		"the npu used memory in container, unit is 'MB'")
+
+	npuCtrInfo          *prometheus.Desc = nil
+	descNpuName         *prometheus.Desc = nil
+	descNPUSerialNumber *prometheus.Desc = nil
+)
+
+// init builds the error-code descriptors and the descriptors that need a customised label set.
+func init() {
+
+	colcommon.BuildDescSlice(&errorCodeDescs, "npu_chip_info_error_code", "the npu error code")
+	for i := 1; i < common.MaxErrorCodeLen; i++ {
+		colcommon.BuildDescSlice(&errorCodeDescs, "npu_chip_info_error_code_"+strconv.Itoa(i), "the npu error code")
+	}
+
+	// clone before appending: the first label is rewritten below and that write must not
+	// reach colcommon.CardLabel through a shared backing array
+	cardLabelForContainer = append(append([]string(nil), colcommon.CardLabel...), "containerID", "containerName")
+	cardLabelForContainer[0] = "npuID"
+	npuCtrInfo = colcommon.BuildDescWithLabel("npu_container_info", "the container name and deviceID relationship",
+		cardLabelForContainer)
+
+	cardLabelForSN = append(append([]string(nil), colcommon.CardLabel...), "serial_number")
+	// NPU SN related metrics
+	descNPUSerialNumber = colcommon.BuildDescWithLabel("npu_chip_info_serial_number",
+		"the npu serial number information", cardLabelForSN)
+
+	copy(cardLabelForNpuName, colcommon.CardLabel)
+	cardLabelForNpuName[1] = "name"
+	descNpuName = colcommon.BuildDescWithLabel("npu_chip_info_name", "the Ascend npu name with value '1'",
+		cardLabelForNpuName)
+}
+
+// chipCache is the per-chip cache entry written by BaseInfoCollector.CollectToCache.
+type chipCache struct {
+	chip      colcommon.HuaWeiAIChip
+	timestamp time.Time
+
+	// the healthy status of the AI chip
+	HealthStatus string `json:"health_status"`
+	// the all error codes of the chip
+	ErrorCodes []int64 `json:"error_codes"`
+	// the utilization of the chip
+	Utilization int `json:"utilization"`
+	// the overall utilization of the chip
+	OverallUtilization int `json:"overall_utilization"`
+	// the vector utilization of the chip
+	VectorUtilization int `json:"vector_utilization"`
+	// the temperature of the chip
+	Temperature int `json:"temperature"`
+	// the work power of the chip
+	Power float32 `json:"power"`
+	// the work voltage of the chip
+	Voltage float32 `json:"voltage"`
+	// the AI core current frequency of the chip
+	AICoreCurrentFreq uint32 `json:"aicore_current_freq"`
+	// NetHealthStatus chip network health status
+	NetHealthStatus string `json:"net_health_status"`
+	// DevProcessInfo chip process info
+	DevProcessInfo *common.DevProcessInfo
+}
+
+// BaseInfoCollector collects the base info of the chip
+type BaseInfoCollector struct {
+	colcommon.MetricsCollectorAdapter
+}
+
+// Describe sends every descriptor this collector may report.
+func (c *BaseInfoCollector) Describe(ch chan<- *prometheus.Desc) {
+	// base info
+	ch <- machineInfoNPUDesc
+	ch <- descUtil
+	ch <- descVectorUtil
+	ch <- descOverUtil
+	ch <- descTemp
+	ch <- descPower
+	ch <- descVoltage
+	ch <- descHealthStatus
+	ch <- descNpuName
+	ch <- descAICoreFreq
+	ch <- descNPUSerialNumber
+	// process num is emitted by updateProcessInfoForPrometheus, so it is described as well
+	ch <- descDevProcessNum
+	ch <- descDevProcessInfo
+	// status
+	ch <- descNetworkStatus
+	// container
+	ch <- npuCtrInfo
+	ch <- npuCtrUtilization
+	ch <- npuCtrTotalMemory
+	ch <- npuCtrUsedMemory
+
+	// error code
+	for _, desc := range errorCodeDescs {
+		ch <- desc
+	}
+}
+
+// CollectToCache collects the base info of the chip.
+// A failed frequency, temperature or voltage query is stored as the matching error sentinel,
+// and a failed error-code query as an empty slice. Entries are keyed by the chip's physical ID.
+func (c *BaseInfoCollector) CollectToCache(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) {
+	for _, chip := range chipList {
+		logicID := chip.LogicID
+
+		dmgr := n.Dmgr
+
+		freq, err := dmgr.GetDeviceFrequency(logicID, common.AICoreCurrentFreq)
+		if err != nil {
+			freq = common.UnRetError
+		}
+		temp, err := dmgr.GetDeviceTemperature(logicID)
+		if err != nil {
+			temp = common.RetError
+		}
+		vol, err := dmgr.GetDeviceVoltage(logicID)
+		if err != nil {
+			vol = common.UnRetError
+		}
+
+		_, errCodes, err := dmgr.GetDeviceAllErrorCode(logicID)
+		if err != nil {
+			errCodes = make([]int64, 0)
+		}
+
+		cache := &chipCache{
+			chip:              chip,
+			AICoreCurrentFreq: freq,
+			Temperature:       int(temp),
+			Voltage:           vol,
+			HealthStatus:      getHealth(logicID, dmgr),
+			ErrorCodes:        errCodes,
+		}
+		collectPower(logicID, dmgr, cache)
+		collectUtil(logicID, dmgr, cache)
+		setNetHealthStatus(logicID, dmgr, cache)
+		setProcessInfo(logicID, dmgr, cache)
+
+		cache.timestamp = time.Now()
+		c.LocalCache.Store(chip.PhyId, *cache)
+	}
+	colcommon.UpdateCache[chipCache](n, colcommon.GetCacheKey(c), &c.LocalCache)
+}
+
+// collectPower stores the power reading in chip.Power.
+// Ascend310P reads the card-level MCU power; every other device type reads the chip power.
+func collectPower(logicID int32, dmgr devmanager.DeviceInterface, chip *chipCache) {
+	if dmgr.GetDevType() == api.Ascend310P {
+		cardPower, err := dmgr.GetMcuPowerInfo(chip.chip.CardId)
+		handleErr(err, colcommon.DomainForMcuPower, chip.chip.CardId)
+		// Ascend310P use cardPower to replace chipPower
+		chip.Power = cardPower
+	} else {
+		power, err := dmgr.GetDevicePowerInfo(logicID)
+		handleErr(err, colcommon.DomainForChipPower, logicID)
+		chip.Power = power
+	}
+}
+
+// UpdatePrometheus updates the base info of the chip.
+// After the per-chip metrics it reports machine_npu_nums as len(chips).
+func (c *BaseInfoCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector,
+	containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) {
+
+	updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache chipCache, cardLabel []string) {
+		containerInfo := geenContainerInfo(&chipWithVnpu, containerMap)
+		timestamp := cache.timestamp
+		doUpdateMetricWithValidateNum(ch, timestamp, float64(cache.Power), cardLabel, descPower)
+		doUpdateMetricWithValidateNum(ch, timestamp, float64(cache.Voltage), cardLabel, descVoltage)
+		doUpdateMetricWithValidateNum(ch, timestamp, float64(cache.AICoreCurrentFreq), cardLabel, descAICoreFreq)
+		doUpdateMetricWithValidateNum(ch, timestamp, float64(cache.Temperature), cardLabel, descTemp)
+		doUpdateMetricWithValidateNum(ch, timestamp, float64(cache.Utilization), cardLabel, descUtil)
+		doUpdateMetricWithValidateNum(ch, timestamp, float64(cache.OverallUtilization), cardLabel, descOverUtil)
+		doUpdateMetricWithValidateNum(ch, timestamp, float64(cache.VectorUtilization), cardLabel, descVectorUtil)
+		doUpdateMetricWithValidateNum(ch, timestamp, 1, cardLabel, descNpuName)
+		doUpdateMetricWithValidateNum(ch, timestamp, float64(getHealthCode(cache.HealthStatus)), cardLabel, descHealthStatus)
+		doUpdateMetricWithValidateNum(ch, timestamp, float64(getHealthCode(cache.NetHealthStatus)),
+			cardLabel, descNetworkStatus)
+
+		updateContainerInfo(ch, containerInfo, cardLabel, &cache, chipWithVnpu)
+
+		updateProcessInfoForPrometheus(ch, &cache, containerInfo, timestamp, cardLabel)
+		updateErrorCodesInfo(ch, &cache, timestamp, cardLabel)
+		// Update NPU serial number info
+		if cache.chip.ElabelInfo != nil {
+			snLabel := append(cardLabel, cache.chip.ElabelInfo.SerialNumber)
+			doUpdateMetricWithValidateNum(ch, timestamp, 1, snLabel, descNPUSerialNumber)
+		}
+	}
+	updateFrame[chipCache](colcommon.GetCacheKey(c), n, containerMap, chips, updateSingleChip)
+
+	ch <- prometheus.MustNewConstMetric(machineInfoNPUDesc, prometheus.GaugeValue, float64(len(chips)))
+}
+
+// updateContainerInfo reports the container/device relation and the in-container utilization.
+// Nothing is reported unless the container name splits into colcommon.ContainerNameLen parts,
+// and the utilization is skipped when the chip carries a valid vnpu ID.
+func updateContainerInfo(ch chan<- prometheus.Metric, containerInfo container.DevicesInfo,
+	cardLabel []string, chip *chipCache, chipWithVnpu colcommon.HuaWeiAIChip) {
+	containerName := getContainerNameArray(containerInfo)
+	if len(containerName) != colcommon.ContainerNameLen {
+		return
+	}
+	// container_npu_total_memory and container_npu_used_memory are reported in the hbm or ddr
+	// group, depending on chipType
+	doUpdateMetric(ch, chip.timestamp, 1, append(cardLabel, containerInfo.ID, strings.Join(containerName, "_")),
+		npuCtrInfo)
+
+	// vnpu not support this metrics
+	vDevActivityInfo := chipWithVnpu.VDevActivityInfo
+	if vDevActivityInfo != nil && common.IsValidVDevID(vDevActivityInfo.VDevID) {
+		return
+	}
+
+	doUpdateMetricWithValidateNum(ch, chip.timestamp, float64(chip.Utilization), cardLabel, npuCtrUtilization)
+}
+
+// updateErrorCodesInfo reports one metric per error code, at most len(errorCodeDescs) of them.
+// A warning is logged when the chip carries more than common.MaxErrorCodeLen codes.
+func updateErrorCodesInfo(ch chan<- prometheus.Metric, chip *chipCache, timestamp time.Time, cardLabel []string) {
+	if len(chip.ErrorCodes) > common.MaxErrorCodeLen {
+		logger.Warnf("Error code number is larger than %v, only the first %v will be reported, "+
+			"all errorCode is: %v", common.MaxErrorCodeLen, common.MaxErrorCodeLen, chip.ErrorCodes)
+	}
+	for i := 0; i < len(chip.ErrorCodes) && i < len(errorCodeDescs); i++ {
+		doUpdateMetricWithValidateNum(ch, timestamp, float64(chip.ErrorCodes[i]), cardLabel, errorCodeDescs[i])
+	}
+}
+
+// updateProcessInfoForPrometheus reports the process count and one memory-usage metric per process.
+// When the chip has no process, a single zero-valued process metric with an empty process_id is sent.
+func updateProcessInfoForPrometheus(ch chan<- prometheus.Metric, chip *chipCache,
+	containerInfo container.DevicesInfo, timestamp time.Time, cardLabel []string) {
+	devProcessInfo := chip.DevProcessInfo
+	if devProcessInfo == nil {
+		return
+	}
+	doUpdateMetric(ch, timestamp, devProcessInfo.ProcNum, cardLabel, descDevProcessNum)
+
+	containerID := ""
+	containerName := ""
+	cNameArray := getContainerNameArray(containerInfo)
+	if len(cNameArray) == colcommon.ContainerNameLen {
+		containerID = containerInfo.ID
+		containerName = strings.Join(cNameArray, "_")
+	}
+
+	// work on a copy so that the caller's label slice is left untouched
+	newCardLabel := make([]string, len(cardLabel))
+	copy(newCardLabel, cardLabel)
+	// containerName in process info is namespace_podName_containerName
+	newCardLabel[len(newCardLabel)-1] = containerName
+
+	if devProcessInfo.ProcNum == 0 {
+		doUpdateMetric(ch, timestamp, 0, append(newCardLabel, "", containerID), descDevProcessInfo)
+		return
+	}
+
+	for i := int32(0); i < devProcessInfo.ProcNum; i++ {
+		procInfo := devProcessInfo.DevProcArray[i]
+		doUpdateMetric(ch, timestamp, procInfo.MemUsage,
+			append(newCardLabel, strconv.FormatInt(int64(procInfo.Pid), colcommon.Base), containerID), descDevProcessInfo)
+	}
+}
+
+// UpdateTelegraf updates the base info of the chip.
+// Chips without a cache entry are skipped. The NPU count is written under
+// colcommon.GeneralDevTagKey and the fieldsMap that was passed in is returned.
+func (c *BaseInfoCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *colcommon.NpuCollector,
+	containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} {
+	caches := colcommon.GetInfoFromCache[chipCache](n, colcommon.GetCacheKey(c))
+	for _, chip := range chips {
+		cache, ok := caches[chip.PhyId]
+		if !ok {
+			continue
+		}
+		fieldMap := getFieldMap(fieldsMap, cache.chip.LogicID)
+
+		doUpdateTelegrafWithValidateNum(fieldMap, descTemp, float64(cache.Temperature), "")
+		doUpdateTelegrafWithValidateNum(fieldMap, descPower, float64(cache.Power), "")
+		doUpdateTelegrafWithValidateNum(fieldMap, descVoltage, float64(cache.Voltage), "")
+		doUpdateTelegrafWithValidateNum(fieldMap, descAICoreFreq, float64(cache.AICoreCurrentFreq), "")
+		doUpdateTelegrafWithValidateNum(fieldMap, descUtil, float64(cache.Utilization), "")
+		doUpdateTelegrafWithValidateNum(fieldMap, descVectorUtil, float64(cache.VectorUtilization), "")
+		doUpdateTelegrafWithValidateNum(fieldMap, descOverUtil, float64(cache.OverallUtilization), "")
+		doUpdateTelegrafWithValidateNum(fieldMap, descHealthStatus, float64(getHealthCode(cache.HealthStatus)), "")
+		doUpdateTelegrafWithValidateNum(fieldMap, descNetworkStatus, float64(getHealthCode(cache.NetHealthStatus)), "")
+		doUpdateTelegraf(fieldMap, descNpuName, chip.ChipInfo.Name, "")
+
+		updateProcessInfoForTelegraf(&cache, fieldMap)
+		updateErrorCode(&cache, fieldMap)
+		// Update NPU serial number info
+		if cache.chip.ElabelInfo != nil {
+			doUpdateTelegraf(fieldMap, descNPUSerialNumber, cache.chip.ElabelInfo.SerialNumber, "")
+		}
+
+	}
+
+	if fieldsMap[colcommon.GeneralDevTagKey] == nil {
+		fieldsMap[colcommon.GeneralDevTagKey] = make(map[string]interface{})
+	}
+	doUpdateTelegraf(fieldsMap[colcommon.GeneralDevTagKey], machineInfoNPUDesc, len(chips), "")
+	return fieldsMap
+}
+
+// updateErrorCode writes every error code of the chip with the first error-code descriptor;
+// codes after the first get the suffix "_<index>".
+func updateErrorCode(chip *chipCache, fieldMap map[string]interface{}) {
+	if len(errorCodeDescs) == 0 {
+		return
+	}
+	descErrorCode := errorCodeDescs[0]
+	for i := 0; i < len(chip.ErrorCodes); i++ {
+		extInfo := ""
+		if i != 0 {
+			extInfo = "_" + strconv.Itoa(i)
+		}
+		doUpdateTelegrafWithValidateNum(fieldMap, descErrorCode, float64(chip.ErrorCodes[i]), extInfo)
+	}
+}
+
+// updateProcessInfoForTelegraf writes the process count and the per-process memory usage,
+// each process field suffixed with "_<pid>". It does nothing when the chip has no process info.
+func updateProcessInfoForTelegraf(chip *chipCache, fieldMap map[string]interface{}) {
+	devProcessInfo := chip.DevProcessInfo
+	// same guard as updateProcessInfoForPrometheus: a missing process info must not panic
+	if devProcessInfo == nil {
+		return
+	}
+	doUpdateTelegraf(fieldMap, descDevProcessNum, devProcessInfo.ProcNum, "")
+	if devProcessInfo.ProcNum == 0 {
+		doUpdateTelegraf(fieldMap, descDevProcessInfo, 0, "")
+		return
+	}
+	for i := int32(0); i < devProcessInfo.ProcNum; i++ {
+		procInfo := devProcessInfo.DevProcArray[i]
+		doUpdateTelegraf(fieldMap, descDevProcessInfo, procInfo.MemUsage, "_"+strconv.Itoa(int(procInfo.Pid)))
+	}
+}
+
+// collectUtil stores the AI core, overall and vector core utilization in chip.
+// Query errors are passed to handleErr and the returned value is stored either way.
+func collectUtil(logicID int32, dmgr devmanager.DeviceInterface, chip *chipCache) {
+	util, err := dmgr.GetDeviceUtilizationRate(logicID, common.AICore)
+	handleErr(err, colcommon.DomainForAICoreUtilization, logicID)
+	chip.Utilization = int(util)
+
+	overAllUtil, err := dmgr.GetDeviceUtilizationRate(logicID, common.Overall)
+	handleErr(err, colcommon.DomainForOverallUtilization, logicID)
+	chip.OverallUtilization = int(overAllUtil)
+
+	vecUtil, err := dmgr.GetDeviceUtilizationRate(logicID, common.VectorCore)
+	handleErr(err, colcommon.DomainForVectorCoreUtilization, logicID)
+	chip.VectorUtilization = int(vecUtil)
+}
+
+// setNetHealthStatus stores the network health in chip.NetHealthStatus.
+// The status stays colcommon.Abnormal for non-training cards and when the query fails.
+func setNetHealthStatus(logicID int32, dmgr devmanager.DeviceInterface, chip *chipCache) {
+	chip.NetHealthStatus = colcommon.Abnormal
+	if !dmgr.IsTrainingCard() {
+		return
+	}
+
+	netCode, err := dmgr.GetDeviceNetWorkHealth(logicID)
+	logger.Debugf("chip %d network healthy code is %d", logicID, netCode)
+	if err != nil {
+		// math.MaxUint32 is the marker getNetworkHealthy maps to Abnormal
+		netCode = math.MaxUint32
+	}
+	chip.NetHealthStatus = getNetworkHealthy(netCode)
+}
+
+// getNetworkHealthy maps a network health code to a status string: math.MaxUint32 is Abnormal,
+// NetworkInit and NetworkSuccess are Healthy, every other code is UnHealthy.
+func getNetworkHealthy(netCode uint32) string {
+	if netCode == math.MaxUint32 {
+		return colcommon.Abnormal
+	}
+
+	if netCode == common.NetworkInit || netCode == common.NetworkSuccess {
+		return colcommon.Healthy
+	}
+
+	return colcommon.UnHealthy
+}
+
+// getHealth returns Healthy only when the health query succeeds and reports 0.
+func getHealth(logicID int32, dmgr devmanager.DeviceInterface) string {
+	health, err := dmgr.GetDeviceHealth(logicID)
+	if err != nil || health != 0 {
+		return colcommon.UnHealthy
+	}
+	return colcommon.Healthy
+}
+
+// getHealthCode converts a status string to its metric value:
+// Abnormal is common.RetError, Healthy is 1 and anything else is 0.
+func getHealthCode(health string) int {
+	if health == colcommon.Abnormal {
+		return common.RetError
+	}
+
+	if colcommon.Healthy == health {
+		return 1
+	}
+	return 0
+}
+
+// setProcessInfo stores the process info of the chip in hwChip.DevProcessInfo.
+// On a query error an empty info is stored; the error is only logged at debug level
+// when the single product type is Atlas200ISoc and goes through handleErr otherwise.
+func setProcessInfo(logicID int32, dmgr devmanager.DeviceInterface, hwChip *chipCache) {
+	productTypes := dmgr.GetProductTypeArray()
+	info, err := dmgr.GetDevProcessInfo(logicID)
+	if err != nil {
+		if len(productTypes) == 1 && productTypes[0] == common.Atlas200ISoc {
+			logger.Debugf("process info is not supported on %s", common.Atlas200ISoc)
+			hwChip.DevProcessInfo = &common.DevProcessInfo{}
+			return
+		}
+		handleErr(err, colcommon.DomainForProcess, logicID)
+		info = &common.DevProcessInfo{}
+	}
+	hwChip.DevProcessInfo = info
+}
diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_optical.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_optical.go
new file mode 100644
index 0000000..ca49804
--- /dev/null
+++ b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_optical.go
@@ -0,0 +1,200 @@
+/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+// Package metrics for general collector
+package metrics
+
+import (
+	"time"
+
+	"github.com/prometheus/client_golang/prometheus"
+
+	"ascend-common/common-utils/hwlog"
+	"ascend-common/devmanager/common"
+	"ascend-common/devmanager/hccn"
+	colcommon "huawei.com/npu-exporter/v6/collector/common"
+	"huawei.com/npu-exporter/v6/collector/container"
+)
+
+// Keys and values looked up in the optical info map returned by hccn.GetNPUOpticalInfo.
+const (
+	txPower0 = "Tx_Power0"
+	txPower1 = "Tx_Power1"
+	txPower2 = "Tx_Power2"
+	txPower3 = "Tx_Power3"
+
+	rxPower0 = "Rx_Power0"
+	rxPower1 = "Rx_Power1"
+	rxPower2 = "Rx_Power2"
+	rxPower3 = "Rx_Power3"
+
+	notPresent  = "not present"
+	present     = "present"
+	temperature = "temperature"
+	voltage     = "Vcc"
+)
+
+var (
+
+	// optical
+	descOpticalState    = colcommon.BuildDesc("npu_chip_optical_state", "the npu interface receive optical-state")
+	descOpticalVcc      = colcommon.BuildDesc("npu_chip_optical_vcc", "the npu interface receive optical-vcc")
+	descOpticalTemp     = colcommon.BuildDesc("npu_chip_optical_temp", "the npu interface receive optical-temperature")
+	descOpticalTxPower0 = colcommon.BuildDesc("npu_chip_optical_tx_power_0", "npu interface receive optical-tx-power-0")
+	descOpticalTxPower1 = colcommon.BuildDesc("npu_chip_optical_tx_power_1", "npu interface receive optical-tx-power-1")
+	descOpticalTxPower2 = colcommon.BuildDesc("npu_chip_optical_tx_power_2", "npu interface receive optical-tx-power-2")
+	descOpticalTxPower3 = colcommon.BuildDesc("npu_chip_optical_tx_power_3", "npu interface receive optical-tx-power-3")
+
+	descOpticalRxPower0 = colcommon.BuildDesc("npu_chip_optical_rx_power_0", "npu interface receive optical-rx-power-0")
+	descOpticalRxPower1 = colcommon.BuildDesc("npu_chip_optical_rx_power_1", "npu interface receive optical-rx-power-1")
+	descOpticalRxPower2 = colcommon.BuildDesc("npu_chip_optical_rx_power_2", "npu interface receive optical-rx-power-2")
+	descOpticalRxPower3 = colcommon.BuildDesc("npu_chip_optical_rx_power_3", "npu interface receive optical-rx-power-3")
+)
+
+// opticalCache is the per-chip cache entry written by OpticalCollector.CollectToCache.
+type opticalCache struct {
+	chip      colcommon.HuaWeiAIChip
+	timestamp time.Time
+	// extInfo indicates the optical module information
+	extInfo *common.OpticalInfo
+}
+
+// OpticalCollector collect the optical metrics
+type OpticalCollector struct {
+	colcommon.MetricsCollectorAdapter
+}
+
+// IsSupported reports whether the device manager says the card is a training card;
+// the decision is also handed to logForUnSupportDevice.
+func (c *OpticalCollector) IsSupported(n *colcommon.NpuCollector) bool {
+	supported := n.Dmgr.IsTrainingCard()
+	logForUnSupportDevice(supported, n.Dmgr.GetDevType(), colcommon.GetCacheKey(c),
+		"only training card supports network related info")
+	return supported
+}
+
+// Describe sends the optical descriptors: state first, then tx power 0-3,
+// rx power 0-3, vcc and temperature.
+func (c *OpticalCollector) Describe(ch chan<- *prometheus.Desc) {
+	// optical
+	opticalDescs := []*prometheus.Desc{
+		descOpticalState,
+		descOpticalTxPower0,
+		descOpticalTxPower1,
+		descOpticalTxPower2,
+		descOpticalTxPower3,
+		descOpticalRxPower0,
+		descOpticalRxPower1,
+		descOpticalRxPower2,
+		descOpticalRxPower3,
+		descOpticalVcc,
+		descOpticalTemp,
+	}
+	for _, desc := range opticalDescs {
+		ch <- desc
+	}
+}
+
+// CollectToCache queries the optical module of every chip by its physical ID.
+// A chip whose query fails is logged and left without a new entry; otherwise the error
+// counter is reset and the parsed info is stored under the physical ID.
+func (c *OpticalCollector) CollectToCache(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) {
+	for _, chip := range chipList {
+		phyID := chip.PhyId
+		rawInfo, err := hccn.GetNPUOpticalInfo(phyID)
+		if err != nil {
+			logErrMetricsWithLimit(colcommon.DomainForOptical, phyID, err)
+			continue
+		}
+		hwlog.ResetErrCnt(colcommon.DomainForOptical, phyID)
+		parsed := getMainOptInfo(rawInfo)
+		c.LocalCache.Store(phyID, opticalCache{chip: chip, timestamp: time.Now(), extInfo: parsed})
+	}
+	colcommon.UpdateCache[opticalCache](n, colcommon.GetCacheKey(c), &c.LocalCache)
+}
+
+// UpdatePrometheus reports the cached optical values of every chip through updateFrame.
+// A cache entry without optical info reports nothing.
+func (c *OpticalCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector,
+	containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) {
+
+	reportChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache opticalCache, cardLabel []string) {
+		info := cache.extInfo
+		if info == nil {
+			return
+		}
+		sampledAt := cache.timestamp
+		doUpdateMetricWithValidateNum(ch, sampledAt, info.OpticalState, cardLabel, descOpticalState)
+		doUpdateMetricWithValidateNum(ch, sampledAt, info.OpticalVcc, cardLabel, descOpticalVcc)
+		doUpdateMetricWithValidateNum(ch, sampledAt, info.OpticalTemp, cardLabel, descOpticalTemp)
+
+		doUpdateMetricWithValidateNum(ch, sampledAt, info.OpticalTxPower0, cardLabel, descOpticalTxPower0)
+		doUpdateMetricWithValidateNum(ch, sampledAt, info.OpticalTxPower1, cardLabel, descOpticalTxPower1)
+		doUpdateMetricWithValidateNum(ch, sampledAt, info.OpticalTxPower2, cardLabel, descOpticalTxPower2)
+		doUpdateMetricWithValidateNum(ch, sampledAt, info.OpticalTxPower3, cardLabel, descOpticalTxPower3)
+
+		doUpdateMetricWithValidateNum(ch, sampledAt, info.OpticalRxPower0, cardLabel, descOpticalRxPower0)
+		doUpdateMetricWithValidateNum(ch, sampledAt, info.OpticalRxPower1, cardLabel, descOpticalRxPower1)
+		doUpdateMetricWithValidateNum(ch, sampledAt, info.OpticalRxPower2, cardLabel, descOpticalRxPower2)
+		doUpdateMetricWithValidateNum(ch, sampledAt, info.OpticalRxPower3, cardLabel, descOpticalRxPower3)
+	}
+
+	updateFrame[opticalCache](colcommon.GetCacheKey(c), n, containerMap, chips, reportChip)
+
+}
+
+// UpdateTelegraf writes the cached optical values of every chip into fieldsMap and returns it.
+// Chips without a cache entry, or whose entry has no optical info, are skipped.
+func (c *OpticalCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *colcommon.NpuCollector,
+	containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} {
+
+	cached := colcommon.GetInfoFromCache[opticalCache](n, colcommon.GetCacheKey(c))
+	for _, chip := range chips {
+		entry, found := cached[chip.PhyId]
+		if !found {
+			continue
+		}
+		// the field map is fetched before the nil check, exactly as the entry lookup order requires
+		fields := getFieldMap(fieldsMap, entry.chip.LogicID)
+
+		info := entry.extInfo
+		if info == nil {
+			continue
+		}
+		doUpdateTelegrafWithValidateNum(fields, descOpticalState, info.OpticalState, "")
+		doUpdateTelegrafWithValidateNum(fields, descOpticalVcc, info.OpticalVcc, "")
+		doUpdateTelegrafWithValidateNum(fields, descOpticalTemp, info.OpticalTemp, "")
+
+		doUpdateTelegrafWithValidateNum(fields, descOpticalTxPower0, info.OpticalTxPower0, "")
+		doUpdateTelegrafWithValidateNum(fields, descOpticalTxPower1, info.OpticalTxPower1, "")
+		doUpdateTelegrafWithValidateNum(fields, descOpticalTxPower2, info.OpticalTxPower2, "")
+		doUpdateTelegrafWithValidateNum(fields, descOpticalTxPower3, info.OpticalTxPower3, "")
+
+		doUpdateTelegrafWithValidateNum(fields, descOpticalRxPower0, info.OpticalRxPower0, "")
+		doUpdateTelegrafWithValidateNum(fields, descOpticalRxPower1, info.OpticalRxPower1, "")
+		doUpdateTelegrafWithValidateNum(fields, descOpticalRxPower2, info.OpticalRxPower2, "")
+		doUpdateTelegrafWithValidateNum(fields, descOpticalRxPower3, info.OpticalRxPower3, "")
+	}
+	return fieldsMap
+}
+
+// getMainOptInfo converts the raw optical key/value map into a common.OpticalInfo.
+// Numeric fields are parsed with hccn.GetFloatDataFromStr. The state is 1 when the
+// "present" entry equals "present", 0 when it equals "not present" and common.RetError otherwise.
+func getMainOptInfo(opticalInfo map[string]string) *common.OpticalInfo {
+	// fields are listed in the order the values are parsed: tx 0-3, rx 0-3, vcc, temperature
+	parsed := &common.OpticalInfo{
+		OpticalTxPower0: hccn.GetFloatDataFromStr(opticalInfo[txPower0], txPower0),
+		OpticalTxPower1: hccn.GetFloatDataFromStr(opticalInfo[txPower1], txPower1),
+		OpticalTxPower2: hccn.GetFloatDataFromStr(opticalInfo[txPower2], txPower2),
+		OpticalTxPower3: hccn.GetFloatDataFromStr(opticalInfo[txPower3], txPower3),
+		OpticalRxPower0: hccn.GetFloatDataFromStr(opticalInfo[rxPower0], rxPower0),
+		OpticalRxPower1: hccn.GetFloatDataFromStr(opticalInfo[rxPower1], rxPower1),
+		OpticalRxPower2: hccn.GetFloatDataFromStr(opticalInfo[rxPower2], rxPower2),
+		OpticalRxPower3: hccn.GetFloatDataFromStr(opticalInfo[rxPower3], rxPower3),
+		OpticalVcc:      hccn.GetFloatDataFromStr(opticalInfo[voltage], voltage),
+		OpticalTemp:     hccn.GetFloatDataFromStr(opticalInfo[temperature], temperature),
+	}
+
+	var state float64
+	switch opticalInfo[present] {
+	case present:
+		state = 1.0
+	case notPresent:
+		state = 0.0
+	default:
+		state = common.RetError
+	}
+	parsed.OpticalState = state
+
+	return parsed
+}
diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_pcie.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_pcie.go
new file mode 100644
index 0000000..f68f95b
--- /dev/null
+++ b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_pcie.go
@@ -0,0 +1,234 @@
+/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+// Package metrics for general collector
+package metrics
+
+import (
+	"time"
+
+	"github.com/prometheus/client_golang/prometheus"
+
+	"ascend-common/api"
+	"ascend-common/common-utils/hwlog"
+	"ascend-common/devmanager/common"
+	colcommon "huawei.com/npu-exporter/v6/collector/common"
+	"huawei.com/npu-exporter/v6/collector/container"
+	"huawei.com/npu-exporter/v6/utils/logger"
+)
+
+const (
+	pcieBwType = "pcie_bw_type"
+	avgPcieBw  = "avgPcieBw"
+	minPcieBw  = "minPcieBw"
+	maxPcieBw  = "maxPcieBw"
+
+	avgPostfix = "_avgPcieBw"
+	minPostfix = "_minPcieBw"
+	maxPostfix = "_maxPcieBw"
+)
+
+var (
+	pcieBwLabel = append(colcommon.CardLabel[:len(colcommon.CardLabel):len(colcommon.CardLabel)], pcieBwType)
+
+	descRxPBW = colcommon.BuildDescWithLabel("npu_chip_info_pcie_rx_p_bw",
+		"the npu write bw to remote's speed, unit is 'MB/ms'", pcieBwLabel)
+
+	descRxNpBW = colcommon.BuildDescWithLabel("npu_chip_info_pcie_rx_np_bw",
+		"the npu read bw's speed from remote, unit is 'MB/ms'", pcieBwLabel)
+
+	descRxCplBW = colcommon.BuildDescWithLabel("npu_chip_info_pcie_rx_cpl_bw",
+		"the npu reply remote read operate cpl's speed, unit is 'MB/ms'", pcieBwLabel)
+
+	descTxPBW = colcommon.BuildDescWithLabel("npu_chip_info_pcie_tx_p_bw",
+		"the npu receive remote write operate's speed, unit is 'MB/ms'", pcieBwLabel)
+
+	descTxNpBW = colcommon.BuildDescWithLabel("npu_chip_info_pcie_tx_np_bw",
+		"the npu receive remote read operate's speed, unit is 'MB/ms'", pcieBwLabel)
+
+	descTxCplBW = colcommon.BuildDescWithLabel("npu_chip_info_pcie_tx_cpl_bw",
+		"the npu read cpl's response bw speed from remote, unit is 'MB/ms'", pcieBwLabel)
+)
+var (
+	supportedPcieDevices = map[string]bool{
+		api.Ascend910B: true,
+	}
+)
+
+type pcieCache struct {
+	chip      colcommon.HuaWeiAIChip
+	timestamp time.Time
+	// extInfo holds the six pcie bandwidth statistics (tx/rx of P, NP and CPL), each with avg, min and max values
+	extInfo *common.PCIEBwStat
+}
+
+// PcieCollector collects the pcie bandwidth metrics
+type PcieCollector struct {
+	colcommon.MetricsCollectorAdapter
+}
+
+// IsSupported reports whether the device type is listed in supportedPcieDevices
+func (c *PcieCollector) IsSupported(n *colcommon.NpuCollector) bool {
+	// pcie info is only queried for the device types listed in supportedPcieDevices
+	isSupport := supportedPcieDevices[n.Dmgr.GetDevType()]
+	logForUnSupportDevice(isSupport, n.Dmgr.GetDevType(), colcommon.GetCacheKey(c), "")
+	return isSupport
+}
+
+// Describe sends the descriptor of every pcie bandwidth metric to ch
+func (c *PcieCollector) Describe(ch chan<- *prometheus.Desc) {
+	ch <- descRxPBW
+	ch <- descTxPBW
+	ch <- descRxNpBW
+	ch <- descTxNpBW
+	ch <- descRxCplBW
+	ch <- descTxCplBW
+}
+
+// CollectToCache queries the pcie bandwidth of each chip by LogicID and stores it in the cache; failed chips are skipped
+func (c *PcieCollector) CollectToCache(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) {
+	for _, chip := range chipList {
+		pcieBwInfo, err := n.Dmgr.GetPCIEBandwidth(chip.LogicID, common.ProfilingTime)
+		if err != nil {
+			logErrMetricsWithLimit(colcommon.DomainForPcieBandwidth, chip.LogicID, err)
+			continue
+		}
+		hwlog.ResetErrCnt(colcommon.DomainForPcieBandwidth, chip.LogicID)
+		c.LocalCache.Store(chip.PhyId, pcieCache{chip: chip, timestamp: time.Now(), extInfo: &pcieBwInfo})
+	}
+	colcommon.UpdateCache[pcieCache](n, colcommon.GetCacheKey(c), &c.LocalCache)
+}
+
+// UpdatePrometheus emits the cached avg/min/max pcie bandwidth of each physical chip; vnpu entries are skipped
+func (c *PcieCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector,
+	containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) {
+
+	updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache pcieCache, cardLabel []string) {
+		pcieBwInfo := cache.extInfo
+		if pcieBwInfo == nil {
+			return
+		}
+
+		if cache.chip.VDevActivityInfo != nil && common.IsValidVDevID(cache.chip.VDevActivityInfo.VDevID) {
+			logger.Debug("vnpu does not supports pcie info query")
+			return
+		}
+
+		timestamp := cache.timestamp
+
+		updateAvgPcieBwInfo(ch, timestamp, pcieBwInfo, cardLabel)
+		updateMinPcieBwInfo(ch, timestamp, pcieBwInfo, cardLabel)
+		updateMaxPcieBwInfo(ch, timestamp, pcieBwInfo, cardLabel)
+	}
+
+	updateFrame[pcieCache](colcommon.GetCacheKey(c), n, containerMap, chips, updateSingleChip)
+
+}
+
+// UpdateTelegraf writes the cached avg/min/max pcie bandwidth of each chip into its field map, one postfix per kind
+func (c *PcieCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *colcommon.NpuCollector,
+	containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} {
+
+	caches := colcommon.GetInfoFromCache[pcieCache](n, colcommon.GetCacheKey(c))
+	for _, chip := range chips {
+		cache, ok := caches[chip.PhyId]
+		if !ok {
+			continue
+		}
+		fieldMap := getFieldMap(fieldsMap, cache.chip.LogicID)
+
+		extInfo := cache.extInfo
+		if extInfo == nil {
+			continue
+		}
+		doUpdateTelegraf(fieldMap, descTxPBW, extInfo.PcieTxPBw.PcieAvgBw, avgPostfix)
+		doUpdateTelegraf(fieldMap, descTxNpBW, extInfo.PcieTxNPBw.PcieAvgBw, avgPostfix)
+		doUpdateTelegraf(fieldMap, descTxCplBW, extInfo.PcieTxCPLBw.PcieAvgBw, avgPostfix)
+		doUpdateTelegraf(fieldMap, descRxPBW, extInfo.PcieRxPBw.PcieAvgBw, avgPostfix)
+		doUpdateTelegraf(fieldMap, descRxNpBW, extInfo.PcieRxNPBw.PcieAvgBw, avgPostfix)
+		doUpdateTelegraf(fieldMap, descRxCplBW, extInfo.PcieRxCPLBw.PcieAvgBw, avgPostfix)
+
+		doUpdateTelegraf(fieldMap, descTxPBW, extInfo.PcieTxPBw.PcieMinBw, minPostfix)
+		doUpdateTelegraf(fieldMap, descTxNpBW, extInfo.PcieTxNPBw.PcieMinBw, minPostfix)
+		doUpdateTelegraf(fieldMap, descTxCplBW, extInfo.PcieTxCPLBw.PcieMinBw, minPostfix)
+		doUpdateTelegraf(fieldMap, descRxPBW, extInfo.PcieRxPBw.PcieMinBw, minPostfix)
+		doUpdateTelegraf(fieldMap, descRxNpBW, extInfo.PcieRxNPBw.PcieMinBw, minPostfix)
+		doUpdateTelegraf(fieldMap, descRxCplBW, extInfo.PcieRxCPLBw.PcieMinBw, minPostfix)
+
+		doUpdateTelegraf(fieldMap, descTxPBW, extInfo.PcieTxPBw.PcieMaxBw, maxPostfix)
+		doUpdateTelegraf(fieldMap, descTxNpBW, extInfo.PcieTxNPBw.PcieMaxBw, maxPostfix)
+		doUpdateTelegraf(fieldMap, descTxCplBW, extInfo.PcieTxCPLBw.PcieMaxBw, maxPostfix)
+		doUpdateTelegraf(fieldMap, descRxPBW, extInfo.PcieRxPBw.PcieMaxBw, maxPostfix)
+		doUpdateTelegraf(fieldMap, descRxNpBW, extInfo.PcieRxNPBw.PcieMaxBw, maxPostfix)
+		doUpdateTelegraf(fieldMap, descRxCplBW, extInfo.PcieRxCPLBw.PcieMaxBw, maxPostfix)
+
+	}
+	return fieldsMap
+}
+
+func pcieBwLabelVal(cardLabels []string, pcieBwType string) []string {
+	return append(cardLabels[:len(cardLabels):len(cardLabels)], pcieBwType) // capped capacity: never writes into the caller's array
+}
+
+func metricWithPcieBw(labelsVal []string, metrics *prometheus.Desc, val float64, valType string) prometheus.Metric {
+	return prometheus.MustNewConstMetric(metrics, prometheus.GaugeValue, val, pcieBwLabelVal(labelsVal, valType)...)
+}
+
+func updateAvgPcieBwInfo(ch chan<- prometheus.Metric, timestamp time.Time, pcieBwInfo *common.PCIEBwStat,
+	cardLabel []string) {
+	ch <- prometheus.NewMetricWithTimestamp(timestamp,
+		metricWithPcieBw(cardLabel, descTxPBW, float64(pcieBwInfo.PcieTxPBw.PcieAvgBw), avgPcieBw))
+	ch <- prometheus.NewMetricWithTimestamp(timestamp,
+		metricWithPcieBw(cardLabel, descTxNpBW, float64(pcieBwInfo.PcieTxNPBw.PcieAvgBw), avgPcieBw))
+	ch <- prometheus.NewMetricWithTimestamp(timestamp,
+		metricWithPcieBw(cardLabel, descTxCplBW, float64(pcieBwInfo.PcieTxCPLBw.PcieAvgBw), avgPcieBw))
+	ch <- prometheus.NewMetricWithTimestamp(timestamp,
+		metricWithPcieBw(cardLabel, descRxPBW, float64(pcieBwInfo.PcieRxPBw.PcieAvgBw), avgPcieBw))
+	ch <- prometheus.NewMetricWithTimestamp(timestamp,
+		metricWithPcieBw(cardLabel, descRxNpBW, float64(pcieBwInfo.PcieRxNPBw.PcieAvgBw), avgPcieBw))
+	ch <- prometheus.NewMetricWithTimestamp(timestamp,
+		metricWithPcieBw(cardLabel, descRxCplBW, float64(pcieBwInfo.PcieRxCPLBw.PcieAvgBw), avgPcieBw))
+}
+
+func updateMinPcieBwInfo(ch chan<- prometheus.Metric, timestamp time.Time, pcieBwInfo *common.PCIEBwStat,
+	cardLabel []string) {
+	ch <- prometheus.NewMetricWithTimestamp(timestamp,
+		metricWithPcieBw(cardLabel, descTxPBW, float64(pcieBwInfo.PcieTxPBw.PcieMinBw), minPcieBw))
+	ch <- prometheus.NewMetricWithTimestamp(timestamp,
+		metricWithPcieBw(cardLabel, descTxNpBW, float64(pcieBwInfo.PcieTxNPBw.PcieMinBw), minPcieBw))
+	ch <- prometheus.NewMetricWithTimestamp(timestamp,
+		metricWithPcieBw(cardLabel, descTxCplBW, float64(pcieBwInfo.PcieTxCPLBw.PcieMinBw), minPcieBw))
+	ch <- prometheus.NewMetricWithTimestamp(timestamp,
+		metricWithPcieBw(cardLabel, descRxPBW, float64(pcieBwInfo.PcieRxPBw.PcieMinBw), minPcieBw))
+	ch <- prometheus.NewMetricWithTimestamp(timestamp,
+		metricWithPcieBw(cardLabel, descRxNpBW, float64(pcieBwInfo.PcieRxNPBw.PcieMinBw), minPcieBw))
+	ch <- prometheus.NewMetricWithTimestamp(timestamp,
+		metricWithPcieBw(cardLabel, descRxCplBW, float64(pcieBwInfo.PcieRxCPLBw.PcieMinBw), minPcieBw))
+}
+
+func updateMaxPcieBwInfo(ch chan<- prometheus.Metric, timestamp time.Time, pcieBwInfo *common.PCIEBwStat,
+	cardLabel []string) {
+	ch <- prometheus.NewMetricWithTimestamp(timestamp,
+		metricWithPcieBw(cardLabel, descTxPBW, float64(pcieBwInfo.PcieTxPBw.PcieMaxBw), maxPcieBw))
+	ch <- prometheus.NewMetricWithTimestamp(timestamp,
+		metricWithPcieBw(cardLabel, descTxNpBW, float64(pcieBwInfo.PcieTxNPBw.PcieMaxBw), maxPcieBw))
+	ch <- prometheus.NewMetricWithTimestamp(timestamp,
+		metricWithPcieBw(cardLabel, descTxCplBW, float64(pcieBwInfo.PcieTxCPLBw.PcieMaxBw), maxPcieBw))
+	ch <- prometheus.NewMetricWithTimestamp(timestamp,
+		metricWithPcieBw(cardLabel, descRxPBW, float64(pcieBwInfo.PcieRxPBw.PcieMaxBw), maxPcieBw))
+	ch <- prometheus.NewMetricWithTimestamp(timestamp,
+		metricWithPcieBw(cardLabel, descRxNpBW, float64(pcieBwInfo.PcieRxNPBw.PcieMaxBw), maxPcieBw))
+	ch <- prometheus.NewMetricWithTimestamp(timestamp,
+		metricWithPcieBw(cardLabel, descRxCplBW, float64(pcieBwInfo.PcieRxCPLBw.PcieMaxBw), maxPcieBw))
+}
diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_roce.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_roce.go
new file mode 100644
index 0000000..b1d307c
--- /dev/null
+++ b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_roce.go
@@ -0,0 +1,263 @@
+/* Copyright(C) 2025-2025.
Huawei Technologies Co.,Ltd. All rights reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+// Package metrics for general collector
+package metrics
+
+import (
+	"time"
+
+	"github.com/prometheus/client_golang/prometheus"
+
+	"ascend-common/common-utils/hwlog"
+	"ascend-common/devmanager/common"
+	"ascend-common/devmanager/hccn"
+	colcommon "huawei.com/npu-exporter/v6/collector/common"
+	"huawei.com/npu-exporter/v6/collector/container"
+)
+
+const (
+	macRxMacPauseNum       = "mac_rx_mac_pause_num"
+	macTxMacPauseNum       = "mac_tx_mac_pause_num"
+	macRxPfcPktNum         = "mac_rx_pfc_pkt_num"
+	macTxPfcPktNum         = "mac_tx_pfc_pkt_num"
+	macRxBadPktNum         = "mac_rx_bad_pkt_num"
+	macTxBadPktNum         = "mac_tx_bad_pkt_num"
+	roCERxAllPktNum        = "roce_rx_all_pkt_num"
+	roCETxAllPktNum        = "roce_tx_all_pkt_num"
+	roCERxErrPktNum        = "roce_rx_err_pkt_num"
+	roCETxErrPktNum        = "roce_tx_err_pkt_num"
+	roCERxCnpPktNum        = "roce_rx_cnp_pkt_num"
+	roCETxCnpPktNum        = "roce_tx_cnp_pkt_num"
+	macRxBadOctNum         = "mac_rx_bad_oct_num"
+	macTxBadOctNum         = "mac_tx_bad_oct_num"
+	roCEUnexpectedAckNum   = "roce_unexpected_ack_num"
+	roCEOutOfOrderNum      = "roce_out_of_order_num"
+	roCEVerificationErrNum = "roce_verification_err_num"
+	roCEQpStatusErrNum     = "roce_qp_status_err_num"
+	roCENewPktRtyNum       = "roce_new_pkt_rty_num"
+	roCEEcnDBNum           = "roce_ecn_db_num"
+	macRXFcsErrPktNum      = "mac_rx_fcs_err_pkt_num"
+)
+
+var (
+	// mac
+	descMacRxPauseNum  = colcommon.BuildDesc("npu_chip_mac_rx_pause_num", "npu interface receive mac-rx-pause-num")
+	descMacTxPauseNum  = colcommon.BuildDesc("npu_chip_mac_tx_pause_num", "npu interface receive mac-tx-pause-num")
+	descMacRxPfcPktNum = colcommon.BuildDesc("npu_chip_mac_rx_pfc_pkt_num", "npu interface receive mac-rx-pfc-pkt-num")
+	descMacTxPfcPktNum = colcommon.BuildDesc("npu_chip_mac_tx_pfc_pkt_num", "npu interface receive mac-tx-pfc-pkt-num")
+	descMacRxBadPktNum = colcommon.BuildDesc("npu_chip_mac_rx_bad_pkt_num", "npu interface receive mac-rx-bad-pkt-num")
+	descMacTxBadPktNum = colcommon.BuildDesc("npu_chip_mac_tx_bad_pkt_num", "npu interface receive mac-tx-bad-pkt-num")
+	descMacTxBadOctNum = colcommon.BuildDesc("npu_chip_mac_tx_bad_oct_num", "npu interface receive mac-tx-bad-oct-num")
+	descMacRxBadOctNum = colcommon.BuildDesc("npu_chip_mac_rx_bad_oct_num", "npu interface receive mac-rx-bad-oct-num")
+
+	descRxFCSNum = colcommon.BuildDesc("npu_chip_info_rx_fcs_num", "the npu network fcs receive number")
+	descRxECNNum = colcommon.BuildDesc("npu_chip_info_rx_ecn_num", "the npu network ecn receive number")
+
+	// roce
+	descRoceRxAllPktNum = colcommon.BuildDesc("npu_chip_roce_rx_all_pkt_num", "npu interface receive roce-rx-all-pkt-num")
+	descRoceTxAllPktNum = colcommon.BuildDesc("npu_chip_roce_tx_all_pkt_num", "npu interface receive roce-tx-all-pkt-num")
+	descRoceRxErrPktNum = colcommon.BuildDesc("npu_chip_roce_rx_err_pkt_num", "npu interface receive roce-rx-err-pkt-num")
+	descRoceTxErrPktNum = colcommon.BuildDesc("npu_chip_roce_tx_err_pkt_num", "npu interface receive roce-tx-err-pkt-num")
+	descRoceRxCnpPktNum = colcommon.BuildDesc("npu_chip_roce_rx_cnp_pkt_num", "npu interface receive roce-rx-cnp-pkt-num")
+	descRoceTxCnpPktNum = colcommon.BuildDesc("npu_chip_roce_tx_cnp_pkt_num", "npu interface receive roce-tx-cnp-pkt-num")
+
+	descRoceNewPktRtyNum = colcommon.BuildDesc("npu_chip_roce_new_pkt_rty_num",
+		"npu interface receive roce-new-pkt-rty-num")
+	descRoceOutOfOrderNum = colcommon.BuildDesc("npu_chip_roce_out_of_order_num",
+		"the npu interface receive roce-out-of-order-num")
+	descRoceQpStatusErrNum = colcommon.BuildDesc("npu_chip_roce_qp_status_err_num",
+		"the npu interface receive roce-qp-status-err-num")
+	descRoceUnexpectedAcktNum = colcommon.BuildDesc("npu_chip_roce_unexpected_ack_num",
+		"the npu interface receive roce-unexpected-ack-num")
+	descRoceVerificationErrNum = colcommon.BuildDesc("npu_chip_roce_verification_err_num",
+		"the npu interface receive roce-verification-err-num")
+)
+
+type roceCache struct {
+	chip      colcommon.HuaWeiAIChip
+	timestamp time.Time
+	// extInfo holds the mac and roce packet counters built by getMainStatInfo
+	extInfo *common.StatInfo
+}
+
+// RoceCollector collects the mac and roce packet statistics
+type RoceCollector struct {
+	colcommon.MetricsCollectorAdapter
+}
+
+// IsSupported reports whether the device manager identifies the device as a training card
+func (c *RoceCollector) IsSupported(n *colcommon.NpuCollector) bool {
+	isSupport := n.Dmgr.IsTrainingCard()
+	logForUnSupportDevice(isSupport, n.Dmgr.GetDevType(), colcommon.GetCacheKey(c),
+		"only training card supports network related info")
+	return isSupport
+}
+
+// Describe sends the descriptor of every mac and roce metric to ch
+func (c *RoceCollector) Describe(ch chan<- *prometheus.Desc) {
+
+	// mac
+	ch <- descMacRxPauseNum
+	ch <- descMacTxPauseNum
+	ch <- descMacRxPfcPktNum
+	ch <- descMacTxPfcPktNum
+	ch <- descMacRxBadPktNum
+	ch <- descMacTxBadPktNum
+	ch <- descMacTxBadOctNum
+	ch <- descMacRxBadOctNum
+	ch <- descRxFCSNum
+
+	// roce
+	ch <- descRoceRxAllPktNum
+	ch <- descRoceTxAllPktNum
+	ch <- descRoceRxErrPktNum
+	ch <- descRoceTxErrPktNum
+	ch <- descRoceRxCnpPktNum
+	ch <- descRoceTxCnpPktNum
+	ch <- descRoceNewPktRtyNum
+	ch <- descRoceUnexpectedAcktNum
+	ch <- descRoceOutOfOrderNum
+	ch <- descRoceVerificationErrNum
+	ch <- descRoceQpStatusErrNum
+	ch <- descRxECNNum
+
+}
+
+// CollectToCache queries the packet statistics of each chip and stores them in the cache; failed chips are skipped
+func (c *RoceCollector) CollectToCache(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) {
+	for _, chip := range chipList {
+		statInfo, err := hccn.GetNPUStatInfo(chip.DeviceID) // NOTE(review): the optical collector passes chip.PhyId to hccn - confirm
+		if err != nil {
+			logErrMetricsWithLimit(colcommon.DomainForRoce, chip.LogicID, err)
+			continue // skip only this chip: the remaining chips are still collected and the cache is still published
+		}
+		hwlog.ResetErrCnt(colcommon.DomainForRoce, chip.LogicID)
+		c.LocalCache.Store(chip.PhyId, roceCache{chip: chip, timestamp: time.Now(), extInfo: getMainStatInfo(statInfo)})
+	}
+	colcommon.UpdateCache[roceCache](n, colcommon.GetCacheKey(c), &c.LocalCache)
+
+}
+
+// UpdatePrometheus emits the cached mac and roce counters of each chip, stamped with their collection time
+func (c *RoceCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector,
+	containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) {
+
+	updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache roceCache, cardLabel []string) {
+		statInfo := cache.extInfo
+		if statInfo == nil {
+			return
+		}
+		updateStatInfoOfMac(ch, cache.timestamp, statInfo, cardLabel)
+		updateStatInfoOfRoCE(ch, cache.timestamp, statInfo, cardLabel)
+	}
+	updateFrame[roceCache](colcommon.GetCacheKey(c), n, containerMap, chips, updateSingleChip)
+
+}
+
+// UpdateTelegraf writes the cached mac and roce counters of each chip into the field map selected by the cached LogicID
+func (c *RoceCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *colcommon.NpuCollector,
+	containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} {
+
+	caches := colcommon.GetInfoFromCache[roceCache](n, colcommon.GetCacheKey(c))
+	for _, chip := range chips {
+		cache, ok := caches[chip.PhyId]
+		if !ok {
+			continue
+		}
+		fieldMap := getFieldMap(fieldsMap, cache.chip.LogicID)
+
+		extInfo := cache.extInfo
+		if extInfo == nil {
+			continue
+		}
+		doUpdateTelegraf(fieldMap, descMacRxPauseNum, extInfo.MacRxPauseNum, "")
+		doUpdateTelegraf(fieldMap, descMacTxPauseNum, extInfo.MacTxPauseNum, "")
+		doUpdateTelegraf(fieldMap, descMacRxPfcPktNum, extInfo.MacRxPfcPktNum, "")
+		doUpdateTelegraf(fieldMap, descMacTxPfcPktNum, extInfo.MacTxPfcPktNum, "")
+		doUpdateTelegraf(fieldMap, descMacRxBadPktNum, extInfo.MacRxBadPktNum, "")
+		doUpdateTelegraf(fieldMap, descMacTxBadPktNum, extInfo.MacTxBadPktNum, "")
+		doUpdateTelegraf(fieldMap, descMacTxBadOctNum, extInfo.MacTxBadOctNum, "")
+		doUpdateTelegraf(fieldMap, descMacRxBadOctNum, extInfo.MacRxBadOctNum, "")
+		doUpdateTelegraf(fieldMap, descRxFCSNum, extInfo.MacRXFcsErrPktNum, "")
+
+		doUpdateTelegraf(fieldMap, descRoceRxAllPktNum, extInfo.RoceRxAllPktNum, "")
+		doUpdateTelegraf(fieldMap, descRoceTxAllPktNum, extInfo.RoceTxAllPktNum, "")
+		doUpdateTelegraf(fieldMap, descRoceRxErrPktNum, extInfo.RoceRxErrPktNum, "")
+		doUpdateTelegraf(fieldMap, descRoceTxErrPktNum, extInfo.RoceTxErrPktNum, "")
+		doUpdateTelegraf(fieldMap, descRoceRxCnpPktNum, extInfo.RoceRxCnpPktNum, "")
+		doUpdateTelegraf(fieldMap, descRoceTxCnpPktNum, extInfo.RoceTxCnpPktNum, "")
+		doUpdateTelegraf(fieldMap, descRoceNewPktRtyNum, extInfo.RoceNewPktRtyNum, "")
+		doUpdateTelegraf(fieldMap, descRoceUnexpectedAcktNum, extInfo.RoceUnexpectedAckNum, "")
+		doUpdateTelegraf(fieldMap, descRoceOutOfOrderNum, extInfo.RoceOutOfOrderNum, "")
+		doUpdateTelegraf(fieldMap, descRoceVerificationErrNum, extInfo.RoceVerificationErrNum, "")
+		doUpdateTelegraf(fieldMap, descRoceQpStatusErrNum, extInfo.RoceQpStatusErrNum, "")
+		doUpdateTelegraf(fieldMap, descRxECNNum, extInfo.RoceEcnDBNum, "")
+	}
+	return fieldsMap
+}
+func getMainStatInfo(statInfo map[string]int) *common.StatInfo {
+	mainStatInfo := common.StatInfo{} // counters missing from statInfo read as 0 through the map zero value
+	mainStatInfo.MacRxPauseNum = float64(statInfo[macRxMacPauseNum])
+	mainStatInfo.MacTxPauseNum = float64(statInfo[macTxMacPauseNum])
+	mainStatInfo.MacRxPfcPktNum = float64(statInfo[macRxPfcPktNum])
+	mainStatInfo.MacTxPfcPktNum = float64(statInfo[macTxPfcPktNum])
+	mainStatInfo.MacRxBadPktNum = float64(statInfo[macRxBadPktNum])
+	mainStatInfo.MacTxBadPktNum = float64(statInfo[macTxBadPktNum])
+	mainStatInfo.RoceRxAllPktNum = float64(statInfo[roCERxAllPktNum])
+	mainStatInfo.RoceTxAllPktNum = float64(statInfo[roCETxAllPktNum])
+	mainStatInfo.RoceRxErrPktNum = float64(statInfo[roCERxErrPktNum])
+	mainStatInfo.RoceTxErrPktNum = float64(statInfo[roCETxErrPktNum])
+	mainStatInfo.RoceRxCnpPktNum = float64(statInfo[roCERxCnpPktNum])
+	mainStatInfo.RoceTxCnpPktNum = float64(statInfo[roCETxCnpPktNum])
+	mainStatInfo.MacRxBadOctNum = float64(statInfo[macRxBadOctNum])
+	mainStatInfo.MacTxBadOctNum = float64(statInfo[macTxBadOctNum])
+	mainStatInfo.RoceUnexpectedAckNum = float64(statInfo[roCEUnexpectedAckNum])
+	mainStatInfo.RoceOutOfOrderNum = float64(statInfo[roCEOutOfOrderNum])
+	mainStatInfo.RoceVerificationErrNum = float64(statInfo[roCEVerificationErrNum])
+	mainStatInfo.RoceQpStatusErrNum = float64(statInfo[roCEQpStatusErrNum])
+	mainStatInfo.RoceNewPktRtyNum = float64(statInfo[roCENewPktRtyNum])
+	mainStatInfo.RoceEcnDBNum = float64(statInfo[roCEEcnDBNum])
+	mainStatInfo.MacRXFcsErrPktNum = float64(statInfo[macRXFcsErrPktNum])
+
+	return &mainStatInfo
+}
+
+func updateStatInfoOfMac(ch chan<- prometheus.Metric, ts time.Time, statInfo *common.StatInfo, cardLabel []string) {
+	doUpdateMetric(ch, ts, statInfo.MacRxPauseNum, cardLabel, descMacRxPauseNum)
+	doUpdateMetric(ch, ts, statInfo.MacTxPauseNum, cardLabel, descMacTxPauseNum)
+	doUpdateMetric(ch, ts, statInfo.MacRxPfcPktNum, cardLabel, descMacRxPfcPktNum)
+	doUpdateMetric(ch, ts, statInfo.MacTxPfcPktNum, cardLabel, descMacTxPfcPktNum)
+	doUpdateMetric(ch, ts, statInfo.MacRxBadPktNum, cardLabel, descMacRxBadPktNum)
+	doUpdateMetric(ch, ts, statInfo.MacTxBadPktNum, cardLabel, descMacTxBadPktNum)
+	doUpdateMetric(ch, ts, statInfo.MacTxBadOctNum, cardLabel, descMacTxBadOctNum)
+	doUpdateMetric(ch, ts, statInfo.MacRxBadOctNum, cardLabel, descMacRxBadOctNum)
+	doUpdateMetric(ch, ts, statInfo.MacRXFcsErrPktNum, cardLabel, descRxFCSNum)
+}
+
+func updateStatInfoOfRoCE(ch chan<- prometheus.Metric, ts time.Time, statInfo *common.StatInfo, cardLabel []string) {
+	doUpdateMetric(ch, ts, statInfo.RoceRxAllPktNum, cardLabel, descRoceRxAllPktNum)
+	doUpdateMetric(ch, ts, statInfo.RoceTxAllPktNum, cardLabel, descRoceTxAllPktNum)
+	doUpdateMetric(ch, ts, statInfo.RoceRxErrPktNum, cardLabel, descRoceRxErrPktNum)
+	doUpdateMetric(ch, ts, statInfo.RoceTxErrPktNum, cardLabel, descRoceTxErrPktNum)
+	doUpdateMetric(ch, ts, statInfo.RoceRxCnpPktNum, cardLabel, descRoceRxCnpPktNum)
+	doUpdateMetric(ch, ts, statInfo.RoceTxCnpPktNum, cardLabel, descRoceTxCnpPktNum)
+	doUpdateMetric(ch, ts, statInfo.RoceNewPktRtyNum, cardLabel, descRoceNewPktRtyNum)
+	doUpdateMetric(ch, ts, statInfo.RoceUnexpectedAckNum, cardLabel, descRoceUnexpectedAcktNum)
+	doUpdateMetric(ch, ts, statInfo.RoceOutOfOrderNum, cardLabel, descRoceOutOfOrderNum)
+	doUpdateMetric(ch, ts, statInfo.RoceVerificationErrNum, cardLabel, descRoceVerificationErrNum)
+	doUpdateMetric(ch, ts, statInfo.RoceQpStatusErrNum, cardLabel, descRoceQpStatusErrNum)
+	doUpdateMetric(ch, ts, statInfo.RoceEcnDBNum, cardLabel, descRxECNNum)
+	// descRxECNNum is sent exactly once per chip: the same series must not be emitted twice in one collection
+}
diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_sio.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_sio.go
new file mode 100644
index 0000000..918469c
--- /dev/null
+++ b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_sio.go
@@ -0,0 +1,120 @@
+/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+// Package metrics for general collector
+package metrics
+
+import (
+	"time"
+
+	"github.com/prometheus/client_golang/prometheus"
+
+	"ascend-common/api"
+	"ascend-common/common-utils/hwlog"
+	"ascend-common/devmanager/common"
+	colcommon "huawei.com/npu-exporter/v6/collector/common"
+	"huawei.com/npu-exporter/v6/collector/container"
+)
+
+var (
+	descSioCrcTxErrCnt = colcommon.BuildDesc("npu_chip_info_sio_crc_tx_err_cnt",
+		"sio transmitted error count between die")
+	descSioCrcRxErrCnt = colcommon.BuildDesc("npu_chip_info_sio_crc_rx_err_cnt",
+		"sio received error count between die")
+)
+var (
+	supportedSioDevices = map[string]bool{
+		api.Ascend910A3: true,
+	}
+)
+
+type sioCache struct {
+	chip      colcommon.HuaWeiAIChip
+	timestamp time.Time
+	// extInfo holds the sio crc error counters between dies; only supported on super pod devices
+	extInfo *common.SioCrcErrStatisticInfo
+}
+
+// SioCollector collects the sio crc error counters
+type SioCollector struct {
+	colcommon.MetricsCollectorAdapter
+}
+
+// IsSupported reports whether the device type is listed in supportedSioDevices
+func (c *SioCollector) IsSupported(n *colcommon.NpuCollector) bool {
+	isSupport := supportedSioDevices[n.Dmgr.GetDevType()]
+	logForUnSupportDevice(isSupport, n.Dmgr.GetDevType(), colcommon.GetCacheKey(c),
+		"sio information cannot be queried.")
+	return isSupport
+}
+
+// Describe sends the descriptors of the two sio crc error metrics to ch
+func (c *SioCollector) Describe(ch chan<- *prometheus.Desc) {
+	ch <- descSioCrcTxErrCnt
+	ch <- descSioCrcRxErrCnt
+}
+
+// CollectToCache queries the sio info of each chip by LogicID and stores it in the cache; failed chips are skipped
+func (c *SioCollector) CollectToCache(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) {
+	for _, chip := range chipList {
+		logicID := chip.LogicID
+		sioInfo, err := n.Dmgr.GetSioInfo(logicID)
+		if err != nil {
+			logErrMetricsWithLimit(colcommon.DomainForSio, logicID, err)
+			continue
+		}
+		hwlog.ResetErrCnt(colcommon.DomainForSio, logicID)
+
+		c.LocalCache.Store(chip.PhyId, sioCache{chip: chip, timestamp: time.Now(), extInfo: sioInfo})
+	}
+	colcommon.UpdateCache[sioCache](n, colcommon.GetCacheKey(c), &c.LocalCache)
+}
+
+// UpdatePrometheus emits the cached sio tx/rx error counters of each chip, stamped with their collection time
+func (c *SioCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector,
+	containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) {
+
+	updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache sioCache, cardLabel []string) {
+		extInfo := cache.extInfo
+		if extInfo == nil {
+			return
+		}
+		doUpdateMetric(ch, cache.timestamp, extInfo.TxErrCnt, cardLabel, descSioCrcTxErrCnt)
+		doUpdateMetric(ch, cache.timestamp, extInfo.RxErrCnt, cardLabel, descSioCrcRxErrCnt)
+	}
+	updateFrame[sioCache](colcommon.GetCacheKey(c), n, containerMap, chips, updateSingleChip)
+}
+
+// UpdateTelegraf writes the cached sio tx/rx error counters of each chip into the field map of the cached LogicID
+func (c *SioCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *colcommon.NpuCollector,
+	containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} {
+
+	caches := colcommon.GetInfoFromCache[sioCache](n, colcommon.GetCacheKey(c))
+	for _, chip := range chips {
+		cache, ok := caches[chip.PhyId]
+		if !ok {
+			continue
+		}
+		fieldMap := getFieldMap(fieldsMap, cache.chip.LogicID)
+
+		extInfo := cache.extInfo
+		if extInfo == nil {
+			continue
+		}
+
+		doUpdateTelegraf(fieldMap, descSioCrcTxErrCnt, extInfo.TxErrCnt, "")
+		doUpdateTelegraf(fieldMap, descSioCrcRxErrCnt, extInfo.RxErrCnt, "")
+	}
+	return fieldsMap
+}
diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_version.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_version.go
new file mode 100644
index 0000000..8cb32bd
--- /dev/null
+++ b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_version.go
@@ -0,0 +1,56 @@
+/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+// Package metrics for general collector
+package metrics
+
+import (
+	"github.com/prometheus/client_golang/prometheus"
+
+	"huawei.com/npu-exporter/v6/collector/common"
+	"huawei.com/npu-exporter/v6/collector/container"
+	"huawei.com/npu-exporter/v6/versions"
+)
+
+var (
+	versionInfoDesc = common.BuildDescWithLabel("npu_exporter_version_info", "exporter version with value '1'",
+		[]string{"exporterVersion"})
+)
+
+// VersionCollector reports the exporter build version (versions.BuildVersion)
+type VersionCollector struct {
+	common.MetricsCollectorAdapter
+}
+
+// Describe sends the descriptor of the version metric to ch
+func (c *VersionCollector) Describe(ch chan<- *prometheus.Desc) {
+	ch <- versionInfoDesc
+}
+
+// UpdatePrometheus emits one gauge with value 1 whose only label value is versions.BuildVersion
+func (c *VersionCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *common.NpuCollector,
+	containerMap map[int32]container.DevicesInfo, chips []common.HuaWeiAIChip) {
+	ch <- prometheus.MustNewConstMetric(versionInfoDesc, prometheus.GaugeValue, 1, []string{versions.BuildVersion}...)
+}
+
+// UpdateTelegraf stores versions.BuildVersion in the common.GeneralDevTagKey field map, creating that map if missing
+func (c *VersionCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *common.NpuCollector,
+	containerMap map[int32]container.DevicesInfo, chips []common.HuaWeiAIChip) map[string]map[string]interface{} {
+
+	if fieldsMap[common.GeneralDevTagKey] == nil {
+		fieldsMap[common.GeneralDevTagKey] = make(map[string]interface{})
+	}
+	doUpdateTelegraf(fieldsMap[common.GeneralDevTagKey], versionInfoDesc, versions.BuildVersion, "")
+	return fieldsMap
+}
diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_vnpu.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_vnpu.go
new file mode 100644
index 0000000..5117ec9
--- /dev/null
+++ b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_vnpu.go
@@ -0,0 +1,169 @@
+/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/ + +// Package metrics for general collector +package metrics + +import ( + "strconv" + "time" + + "github.com/prometheus/client_golang/prometheus" + + "ascend-common/api" + "ascend-common/devmanager/common" + colcommon "huawei.com/npu-exporter/v6/collector/common" + "huawei.com/npu-exporter/v6/collector/container" + "huawei.com/npu-exporter/v6/utils/logger" +) + +var ( + cardLabelForVNpuName = make([]string, len(colcommon.CardLabel)) + podAiCoreUtilizationRate *prometheus.Desc = nil + podTotalMemory *prometheus.Desc = nil + podUsedMemory *prometheus.Desc = nil +) + +var ( + supportedVnpuDevices = map[string]bool{ + api.Ascend310P: true, + } +) + +const ( + vNpuUUID = "v_dev_id" + aiCoreCnt = "aicore_count" + isVirtual = "is_virtual" +) + +func init() { + cardLabelForVNpuName = append(colcommon.CardLabel, isVirtual) + cardLabelForVNpuName[2] = vNpuUUID + cardLabelForVNpuName[3] = aiCoreCnt + + podAiCoreUtilizationRate = colcommon.BuildDescWithLabel("vnpu_pod_aicore_utilization", + "the vnpu aicore utilization rate, unit is '%'", cardLabelForVNpuName) + podTotalMemory = colcommon.BuildDescWithLabel("vnpu_pod_total_memory", + "the vnpu total memory on pod, unit is 'KB'", cardLabelForVNpuName) + podUsedMemory = colcommon.BuildDescWithLabel("vnpu_pod_used_memory", + "the vnpu used memory on pod, unit is 'KB'", cardLabelForVNpuName) + +} + +// VnpuCollector collect vnpu info +type VnpuCollector struct { + colcommon.MetricsCollectorAdapter +} + +// IsSupported check whether the collector is supported +func (c *VnpuCollector) IsSupported(n *colcommon.NpuCollector) bool { + isSupport := supportedVnpuDevices[n.Dmgr.GetDevType()] + logForUnSupportDevice(isSupport, n.Dmgr.GetDevType(), colcommon.GetCacheKey(c), "") + return isSupport +} + +// Describe description of the metric +func (c *VnpuCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- podAiCoreUtilizationRate + ch <- podTotalMemory + ch <- podUsedMemory +} + +// CollectToCache collect the metric to cache +func 
(c *VnpuCollector) CollectToCache(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) { + for _, chip := range chipList { + cache := &chipCache{ + chip: chip, + } + cache.timestamp = time.Now() + c.LocalCache.Store(chip.PhyId, *cache) + } + colcommon.UpdateCache[chipCache](n, colcommon.GetCacheKey(c), &c.LocalCache) +} + +// UpdatePrometheus update prometheus metrics +func (c *VnpuCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) { + + updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache chipCache, cardLabel []string) { + if chipWithVnpu.VDevActivityInfo == nil { + return + } + vDevActivityInfo := chipWithVnpu.VDevActivityInfo + if !common.IsValidVDevID(vDevActivityInfo.VDevID) { + return + } + containerName := getContainerNameArray(containerMap[int32(vDevActivityInfo.VDevID)]) + if len(containerName) != colcommon.ContainerNameLen { + return + } + cardLabel = getPodDisplayInfo(&chipWithVnpu, containerName) + doUpdateMetric(ch, cache.timestamp, vDevActivityInfo.VDevAiCoreRate, cardLabel, podAiCoreUtilizationRate) + doUpdateMetric(ch, cache.timestamp, vDevActivityInfo.VDevTotalMem, cardLabel, podTotalMemory) + doUpdateMetric(ch, cache.timestamp, vDevActivityInfo.VDevUsedMem, cardLabel, podUsedMemory) + } + + updateFrame[chipCache](colcommon.GetCacheKey(c), n, containerMap, chips, updateSingleChip) + +} + +// UpdateTelegraf update telegraf metrics +func (c *VnpuCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *colcommon.NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} { + + caches := colcommon.GetInfoFromCache[chipCache](n, colcommon.GetCacheKey(c)) + for _, chip := range chips { + cache, ok := caches[chip.PhyId] + if !ok { + continue + } + + vDevActivityInfo := chip.VDevActivityInfo + if vDevActivityInfo == nil || 
!common.IsValidVDevID(vDevActivityInfo.VDevID) { + continue + } + + devTagKey := strconv.Itoa(int(cache.chip.LogicID)) + "_" + strconv.Itoa(int(vDevActivityInfo.VDevID)) + + if fieldsMap[devTagKey] == nil { + fieldsMap[devTagKey] = make(map[string]interface{}) + } + + doUpdateTelegraf(fieldsMap[devTagKey], podAiCoreUtilizationRate, vDevActivityInfo.VDevAiCoreRate, "") + doUpdateTelegraf(fieldsMap[devTagKey], podTotalMemory, vDevActivityInfo.VDevTotalMem, "") + doUpdateTelegraf(fieldsMap[devTagKey], podUsedMemory, vDevActivityInfo.VDevUsedMem, "") + } + return fieldsMap +} + +func getPodDisplayInfo(chip *colcommon.HuaWeiAIChip, containerName []string) []string { + if len(containerName) != colcommon.ContainerNameLen { + logger.Errorf("container name length %v is not %v", len(containerName), colcommon.ContainerNameLen) + return nil + } + + chipInfo := common.DeepCopyChipInfo(chip.ChipInfo) + vDevActivityInfo := common.DeepCopyVDevActivityInfo(chip.VDevActivityInfo) + + return []string{ + strconv.Itoa(int(chip.DeviceID)), + common.GetNpuName(chipInfo), + strconv.Itoa(int(vDevActivityInfo.VDevID)), + strconv.FormatFloat(vDevActivityInfo.VDevAiCore, 'f', colcommon.DecimalPlaces, colcommon.BitSize), + containerName[colcommon.NameSpaceIdx], + containerName[colcommon.PodNameIdx], + containerName[colcommon.ConNameIdx], + strconv.FormatBool(vDevActivityInfo.IsVirtualDev), + } +} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_vnpu_test.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_vnpu_test.go new file mode 100644 index 0000000..d57ade0 --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_vnpu_test.go @@ -0,0 +1,202 @@ +/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package metrics for general collector +package metrics + +import ( + "strconv" + "testing" + + "github.com/agiledragon/gomonkey/v2" + "github.com/prometheus/client_golang/prometheus" + "github.com/smartystreets/goconvey/convey" + + "ascend-common/api" + "ascend-common/devmanager/common" + colcommon "huawei.com/npu-exporter/v6/collector/common" + "huawei.com/npu-exporter/v6/collector/container" +) + +const ( + vnpuMetricNum = 3 + validVnpuID = 100 + invalidVnpuID = 1 +) + +// TestVnpuCollectorIsSupported test VnpuCollector IsSupported +func TestVnpuCollectorIsSupported(t *testing.T) { + n := mockNewNpuCollector() + cases := []testCase{ + buildTestCase("VnpuCollector: testIsSupported on Ascend310P", &VnpuCollector{}, api.Ascend310P, true), + buildTestCase("VnpuCollector: testIsSupported on other type", &VnpuCollector{}, "OTHER", false), + } + + for _, c := range cases { + patches := gomonkey.NewPatches() + convey.Convey(c.name, t, func() { + defer patches.Reset() + patches.ApplyMethodReturn(n.Dmgr, "GetDevType", c.deviceType) + isSupported := c.collectorType.IsSupported(n) + convey.So(isSupported, convey.ShouldEqual, c.expectValue) + }) + } +} + +func TestVnpuCollectorDescribe(t *testing.T) { + collector := &VnpuCollector{} + convey.Convey("TestVnpuCollectorDescribe", t, func() { + ch := make(chan *prometheus.Desc, vnpuMetricNum) + collector.Describe(ch) + convey.So(len(ch), convey.ShouldEqual, vnpuMetricNum) + close(ch) + }) +} + +func TestVnpuCollectorCollectToCache(t *testing.T) { + collector := &VnpuCollector{} + n := mockNewNpuCollector() + testChips := 
[]colcommon.HuaWeiAIChip{{PhyId: 0}} + + convey.Convey("TestVnpuCollectorCollectToCache", t, func() { + collector.CollectToCache(n, testChips) + cacheInfo := colcommon.GetInfoFromCache[chipCache](n, colcommon.GetCacheKey(collector)) + convey.So(cacheInfo, convey.ShouldNotBeNil) + }) +} + +func TestVnpuCollectorUpdatePrometheus(t *testing.T) { + collector := &VnpuCollector{} + n := mockNewNpuCollector() + containerMap := mockContainerInfo() + + testChips := []colcommon.HuaWeiAIChip{{PhyId: 0}} + collector.CollectToCache(n, testChips) + chip := createValidVnpuChip() + testCases := []struct { + name string + preHandleFunc func() + expectValue int + }{ + {name: "TestVnpuCollectorUpdatePrometheus_effective virtual device scenarios", + preHandleFunc: func() {}, + expectValue: vnpuMetricNum, + }, + {name: "TestVnpuCollectorUpdatePrometheus_there is no container info", + preHandleFunc: func() { + containerMap = map[int32]container.DevicesInfo{} + }, + expectValue: 0, + }, + {name: "TestVnpuCollectorUpdatePrometheus_the vdevid is invalid", + preHandleFunc: func() { + chip.VDevActivityInfo.VDevID = invalidVnpuID + }, + expectValue: 0, + }, + {name: "TestVnpuCollectorUpdatePrometheus_there is no vdev info", + preHandleFunc: func() { + chip.VDevActivityInfo = nil + }, + expectValue: 0, + }, + } + ch := make(chan prometheus.Metric, vnpuMetricNum) + defer close(ch) + for _, tt := range testCases { + convey.Convey(tt.name, t, func() { + tt.preHandleFunc() + collector.UpdatePrometheus(ch, n, containerMap, []colcommon.HuaWeiAIChip{chip}) + convey.So(len(ch), convey.ShouldEqual, tt.expectValue) + //clean ch + for { + if len(ch) == 0 { + break + } + <-ch + } + }) + } +} + +func mockContainerInfo() map[int32]container.DevicesInfo { + containerMap := map[int32]container.DevicesInfo{ + validVnpuID: { + Devices: []int{0}, + ID: strconv.Itoa(validVnpuID), + Name: "nsName_podName_ctrName", + }, + } + return containerMap +} + +func TestVnpuCollectorUpdateTelegraf(t *testing.T) { + collector 
:= &VnpuCollector{} + n := mockNewNpuCollector() + containerMap := mockContainerInfo() + testChips := []colcommon.HuaWeiAIChip{{PhyId: 0}} + collector.CollectToCache(n, testChips) + chip := createValidVnpuChip() + convey.Convey("TestVnpuCollectorUpdateTelegraf", t, func() { + convey.Convey("effective virtual device scenarios", func() { + chipsWithVnpu := []colcommon.HuaWeiAIChip{chip} + newFieldMaps := collector.UpdateTelegraf(make(map[string]map[string]interface{}), n, containerMap, chipsWithVnpu) + convey.So(len(newFieldMaps), convey.ShouldEqual, 1) + convey.So(len(newFieldMaps["0_100"]), convey.ShouldEqual, vnpuMetricNum) + }) + convey.Convey("there is no container info", func() { + chip.VDevActivityInfo = nil + chipsWithVnpu := []colcommon.HuaWeiAIChip{chip} + containerMap = map[int32]container.DevicesInfo{} + newFieldMaps := collector.UpdateTelegraf(make(map[string]map[string]interface{}), n, containerMap, chipsWithVnpu) + convey.So(len(newFieldMaps), convey.ShouldEqual, 0) + }) + + }) +} + +func TestGetPodDisplayInfo(t *testing.T) { + const num8 = 8 + convey.Convey("TestGetPodDisplayInfo", t, func() { + chip := createValidVnpuChip() + convey.Convey("valid container information", func() { + containerNames := []string{"namespace", "pod-name", "container-name"} + labels := getPodDisplayInfo(&chip, containerNames) + convey.Convey("should return 8 metrics", func() { + convey.So(len(labels), convey.ShouldEqual, num8) + convey.So(labels[len(labels)-1], convey.ShouldEqual, "true") + }) + }) + + convey.Convey("invalid container information", func() { + containerNames := []string{"short"} + labels := getPodDisplayInfo(&chip, containerNames) + convey.Convey("should return nil", func() { + convey.So(labels, convey.ShouldBeNil) + }) + }) + }) +} + +func createValidVnpuChip() colcommon.HuaWeiAIChip { + chip := createChip() + chip.VDevActivityInfo = &common.VDevActivityInfo{ + VDevID: validVnpuID, + VDevAiCore: 1, + VDevTotalMem: 1, + VDevUsedMem: 1, + IsVirtualDev: true, + 
} + return chip +} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_test.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_test.go new file mode 100644 index 0000000..7524c68 --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/metrics/collector_test.go @@ -0,0 +1,548 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package metrics for general collector +package metrics + +import ( + "strconv" + "sync" + "testing" + "time" + + "github.com/agiledragon/gomonkey/v2" + "github.com/prometheus/client_golang/prometheus" + "github.com/smartystreets/goconvey/convey" + + "ascend-common/api" + "ascend-common/common-utils/hwlog" + "ascend-common/devmanager" + "ascend-common/devmanager/common" + "ascend-common/devmanager/hccn" + colcommon "huawei.com/npu-exporter/v6/collector/common" + "huawei.com/npu-exporter/v6/collector/container" + "huawei.com/npu-exporter/v6/utils/logger" +) + +const ( + maxMetricsCount = 2000 + num5 = 5 + mockContainerName = "mockContainerName" + maxChipNum int32 = 8 +) + +var ( + collectorChain []colcommon.MetricsCollector +) + +// TestDescribe test Describe +func TestDescribe(t *testing.T) { + + convey.Convey("test prometheus desc ", t, func() { + ch := make(chan *prometheus.Desc, maxMetricsCount) + for _, c := range collectorChain { + c.Describe(ch) + } + t.Logf("Describe len(ch):%v", len(ch)) + convey.So(ch, 
convey.ShouldNotBeEmpty) + }) +} + +type testCase struct { + name string + collectorType colcommon.MetricsCollector + deviceType string + expectValue bool +} + +func buildTestCase(name string, collectorType colcommon.MetricsCollector, deviceType string, + expectValue bool) testCase { + return testCase{ + name: name, + collectorType: collectorType, + deviceType: deviceType, + expectValue: expectValue, + } +} + +// TestIsSupported test IsSupported +func TestIsSupported(t *testing.T) { + n := mockNewNpuCollector() + cases := []testCase{ + buildTestCase("DdrCollector: testIsSupported on Ascend310", &DdrCollector{}, api.Ascend310, true), + buildTestCase("DdrCollector: testIsSupported on Ascend310P", &DdrCollector{}, api.Ascend310P, true), + buildTestCase("DdrCollector: testIsSupported on Ascend910", &DdrCollector{}, api.Ascend910, true), + buildTestCase("DdrCollector: testIsSupported on Ascend910B", &DdrCollector{}, api.Ascend910B, false), + buildTestCase("DdrCollector: testIsSupported on Ascend910A3", &DdrCollector{}, api.Ascend910A3, false), + + buildTestCase("HccsCollector: testIsSupported on Ascend310", &HccsCollector{}, api.Ascend310, false), + buildTestCase("HccsCollector: testIsSupported on Ascend310P", &HccsCollector{}, api.Ascend310P, false), + buildTestCase("HccsCollector: testIsSupported on Ascend910", &HccsCollector{}, api.Ascend910, false), + buildTestCase("HccsCollector: testIsSupported on Ascend910B", &HccsCollector{}, api.Ascend910B, true), + buildTestCase("HccsCollector: testIsSupported on Ascend910A3", &HccsCollector{}, api.Ascend910A3, true), + + buildTestCase("SioCollector: testIsSupported on Ascend310", &SioCollector{}, api.Ascend310, false), + buildTestCase("SioCollector: testIsSupported on Ascend310P", &SioCollector{}, api.Ascend310P, false), + buildTestCase("SioCollector: testIsSupported on Ascend910", &SioCollector{}, api.Ascend910, false), + buildTestCase("SioCollector: testIsSupported on Ascend910B", &SioCollector{}, api.Ascend910B, false), 
buildTestCase("SioCollector: testIsSupported on Ascend910A3", &SioCollector{}, api.Ascend910A3, true), + + buildTestCase("VnpuCollector: testIsSupported on Ascend310", &VnpuCollector{}, api.Ascend310, false), + buildTestCase("VnpuCollector: testIsSupported on Ascend310P", &VnpuCollector{}, api.Ascend310P, true), + buildTestCase("VnpuCollector: testIsSupported on Ascend910", &VnpuCollector{}, api.Ascend910, false), + buildTestCase("VnpuCollector: testIsSupported on Ascend910B", &VnpuCollector{}, api.Ascend910B, false), + buildTestCase("VnpuCollector: testIsSupported on Ascend910A3", &VnpuCollector{}, api.Ascend910A3, false), + } + + for _, c := range cases { + patches := gomonkey.NewPatches() + convey.Convey(c.name, t, func() { + defer patches.Reset() + patches.ApplyMethodReturn(n.Dmgr, "GetDevType", c.deviceType) + isSupported := c.collectorType.IsSupported(n) + convey.So(isSupported, convey.ShouldEqual, c.expectValue) + }) + } +} + +// TestIsSupported2 test IsSupported +func TestIsSupported2(t *testing.T) { + n := mockNewNpuCollector() + convey.Convey("TestIsSupported ", t, func() { + for _, c := range collectorChain { + c.IsSupported(n) + } + }) + +} + +// TestCollectToCache test CollectToCache +func TestCollectToCache(t *testing.T) { + n := mockNewNpuCollector() + + convey.Convey("TestCollectToCache", t, func() { + + patches := gomonkey.NewPatches() + defer patches.Reset() + patches.ApplyMethodReturn(n.Dmgr, "GetDeviceMemoryInfo", mockMemoryInfo(), nil) + patches.ApplyMethodReturn(n.Dmgr, "GetDeviceHbmInfo", mockHbmAggregateInfo().HbmInfo, nil) + patches.ApplyMethodReturn(n.Dmgr, "GetDeviceEccInfo", mockHbmAggregateInfo().ECCInfo, nil) + patches.ApplyMethodReturn(n.Dmgr, "GetHccsStatisticInfo", mockHccsStaticsInfo(), nil) + patches.ApplyMethodReturn(n.Dmgr, "GetHccsStatisticInfoInU64", mockHccsStaticsInfo(), nil) + patches.ApplyMethodReturn(n.Dmgr, "GetHccsBandwidthInfo", mockHccsBWInfo(), nil) + patches.ApplyMethodReturn(n.Dmgr, "GetPCIEBandwidth", 
mockPcieInfo(), nil) + patches.ApplyMethodReturn(n.Dmgr, "GetSioInfo", mockSioInfo(), nil) + patches.ApplyFuncReturn(hccn.GetNPULinkStatus, "UP", nil) + patches.ApplyFuncReturn(hccn.GetNPUInterfaceTraffic, float64(0), float64(0), nil) + patches.ApplyFuncReturn(hccn.GetNPULinkUpNum, 0, nil) + patches.ApplyFuncReturn(hccn.GetNPULinkSpeed, 0, nil) + patches.ApplyFuncReturn(hccn.GetNPUOpticalInfo, mockOpticalInfo(), nil) + patches.ApplyFuncReturn(hccn.GetNPUStatInfo, mockRoceInfoMap(), nil) + patches.ApplyMethodReturn(n.Dmgr, "GetDeviceFrequency", uint32(0), nil) + patches.ApplyMethodReturn(n.Dmgr, "GetDeviceTemperature", int32(0), nil) + patches.ApplyMethodReturn(n.Dmgr, "GetDeviceVoltage", float32(0), nil) + patches.ApplyMethodReturn(n.Dmgr, "GetDeviceAllErrorCode", int32(1), []int64{0}, nil) + patches.ApplyMethodReturn(n.Dmgr, "GetDeviceHealth", uint32(0), nil) + patches.ApplyMethodReturn(n.Dmgr, "GetDevicePowerInfo", float32(0), nil) + patches.ApplyMethodReturn(n.Dmgr, "GetDeviceUtilizationRate", uint32(0), nil) + patches.ApplyMethodReturn(n.Dmgr, "GetDevProcessInfo", mockProcessInfo(), nil) + + chips := mockGetNPUChipList() + for _, c := range collectorChain { + c.PreCollect(n, chips) + c.CollectToCache(n, chips) + } + + convey.So(colcommon.GetInfoFromCache[ddrCache](n, colcommon.GetCacheKey(&DdrCollector{})), + convey.ShouldNotBeEmpty) + convey.So(colcommon.GetInfoFromCache[hbmCache](n, colcommon.GetCacheKey(&HbmCollector{})), + convey.ShouldNotBeEmpty) + convey.So(colcommon.GetInfoFromCache[hccsCache](n, colcommon.GetCacheKey(&HccsCollector{})), + convey.ShouldNotBeEmpty) + convey.So(colcommon.GetInfoFromCache[netInfoCache](n, colcommon.GetCacheKey(&NetworkCollector{})), + convey.ShouldNotBeEmpty) + convey.So(colcommon.GetInfoFromCache[chipCache](n, colcommon.GetCacheKey(&BaseInfoCollector{})), + convey.ShouldNotBeEmpty) + convey.So(colcommon.GetInfoFromCache[opticalCache](n, colcommon.GetCacheKey(&OpticalCollector{})), + convey.ShouldNotBeEmpty) + 
convey.So(colcommon.GetInfoFromCache[pcieCache](n, colcommon.GetCacheKey(&PcieCollector{})), + convey.ShouldNotBeEmpty) + convey.So(colcommon.GetInfoFromCache[roceCache](n, colcommon.GetCacheKey(&RoceCollector{})), + convey.ShouldNotBeEmpty) + convey.So(colcommon.GetInfoFromCache[sioCache](n, colcommon.GetCacheKey(&SioCollector{})), + convey.ShouldNotBeEmpty) + + }) +} + +// TestUpdatePrometheus test UpdatePrometheus +func TestUpdatePrometheus(t *testing.T) { + n := mockNewNpuCollector() + + convey.Convey("TestUpdatePrometheus", t, func() { + + ch := make(chan prometheus.Metric, maxMetricsCount) + + patches := gomonkey.NewPatches() + defer patches.Reset() + containerInfos := mockGetContainerNPUInfo() + chips := mockGetNPUChipList() + + mockDdrCache(n, chips, colcommon.GetCacheKey(&DdrCollector{})) + mockHbmCache(n, chips, colcommon.GetCacheKey(&HbmCollector{})) + mockHccsCache(n, chips, colcommon.GetCacheKey(&HccsCollector{})) + mockNetInfoCache(n, chips, colcommon.GetCacheKey(&NetworkCollector{})) + mockChipCache(n, chips, colcommon.GetCacheKey(&BaseInfoCollector{})) + mockOpticalCache(n, chips, colcommon.GetCacheKey(&OpticalCollector{})) + mockPcieCache(n, chips, colcommon.GetCacheKey(&PcieCollector{})) + mockRoceCache(n, chips, colcommon.GetCacheKey(&RoceCollector{})) + mockSioCache(n, chips, colcommon.GetCacheKey(&SioCollector{})) + + for _, c := range collectorChain { + c.UpdatePrometheus(ch, n, containerInfos, chips) + } + + t.Logf("TestUpdatePrometheus len(ch):%v", len(ch)) + convey.So(ch, convey.ShouldNotBeEmpty) + }) +} + +// TestUpdateTelegraf test UpdateTelegraf +func TestUpdateTelegraf(t *testing.T) { + n := mockNewNpuCollector() + + convey.Convey("TestUpdatePrometheus", t, func() { + + patches := gomonkey.NewPatches() + defer patches.Reset() + containerInfos := mockGetContainerNPUInfo() + chips := mockGetNPUChipList() + + mockDdrCache(n, chips, colcommon.GetCacheKey(&DdrCollector{})) + mockHbmCache(n, chips, colcommon.GetCacheKey(&HbmCollector{})) + 
mockHccsCache(n, chips, colcommon.GetCacheKey(&HccsCollector{})) + mockNetInfoCache(n, chips, colcommon.GetCacheKey(&NetworkCollector{})) + mockChipCache(n, chips, colcommon.GetCacheKey(&BaseInfoCollector{})) + mockOpticalCache(n, chips, colcommon.GetCacheKey(&OpticalCollector{})) + mockPcieCache(n, chips, colcommon.GetCacheKey(&PcieCollector{})) + mockRoceCache(n, chips, colcommon.GetCacheKey(&RoceCollector{})) + mockSioCache(n, chips, colcommon.GetCacheKey(&SioCollector{})) + fieldsMap := make(map[string]map[string]interface{}) + + for _, c := range collectorChain { + c.UpdateTelegraf(fieldsMap, n, containerInfos, chips) + } + + t.Logf("fieldsMap len(ch):%v", len(fieldsMap)) + convey.So(fieldsMap, convey.ShouldNotBeEmpty) + }) +} + +func mockRoceCache(n *colcommon.NpuCollector, chips []colcommon.HuaWeiAIChip, cacheKey string) { + localCache := sync.Map{} + for _, chip := range chips { + localCache.Store(chip.PhyId, roceCache{chip: chip, timestamp: time.Now(), + extInfo: getMainStatInfo(mockRoceInfoMap())}) + } + colcommon.UpdateCache[roceCache](n, cacheKey, &localCache) +} + +func mockRoceInfoMap() map[string]int { + return map[string]int{ + macRxMacPauseNum: 0, + macTxMacPauseNum: 0, + macRxPfcPktNum: 0, + macTxPfcPktNum: 0, + macRxBadPktNum: 0, + macTxBadPktNum: 0, + roCERxAllPktNum: 0, + roCETxAllPktNum: 0, + roCERxErrPktNum: 0, + roCETxErrPktNum: 0, + roCERxCnpPktNum: 0, + roCETxCnpPktNum: 0, + macRxBadOctNum: 0, + macTxBadOctNum: 0, + roCEUnexpectedAckNum: 0, + roCEOutOfOrderNum: 0, + roCEVerificationErrNum: 0, + roCEQpStatusErrNum: 0, + roCENewPktRtyNum: 0, + roCEEcnDBNum: 0, + macRXFcsErrPktNum: 0, + } +} + +func mockDdrCache(n *colcommon.NpuCollector, chips []colcommon.HuaWeiAIChip, cacheKey string) { + localCache := sync.Map{} + for _, chip := range chips { + localCache.Store(chip.PhyId, ddrCache{chip: chip, timestamp: time.Now(), extInfo: mockMemoryInfo()}) + } + colcommon.UpdateCache[ddrCache](n, cacheKey, &localCache) +} + +func mockHccsCache(n 
*colcommon.NpuCollector, chips []colcommon.HuaWeiAIChip, cacheKey string) { + localCache := sync.Map{} + for _, chip := range chips { + localCache.Store(chip.PhyId, hccsCache{chip: chip, timestamp: time.Now(), + hccsStat: mockHccsStaticsInfo(), hccsBW: mockHccsBWInfo()}) + } + colcommon.UpdateCache[hccsCache](n, cacheKey, &localCache) +} + +func mockHccsBWInfo() *common.HccsBandwidthInfo { + return &common.HccsBandwidthInfo{ + ProfilingTime: 0, + RxBandwidth: []float64{0, 0, 0, 0, 0, 0, 0, 0}, + TxBandwidth: []float64{0, 0, 0, 0, 0, 0, 0, 0}, + TotalRxbw: 0, + TotalTxbw: 0, + } +} + +func mockHccsStaticsInfo() *common.HccsStatisticInfo { + return &common.HccsStatisticInfo{ + TxCnt: []uint64{0, 0, 0, 0, 0, 0, 0, 0}, + RxCnt: []uint64{0, 0, 0, 0, 0, 0, 0, 0}, + CrcErrCnt: []uint64{0, 0, 0, 0, 0, 0, 0, 0}, + } +} + +func mockSioCache(n *colcommon.NpuCollector, chips []colcommon.HuaWeiAIChip, cacheKey string) { + localCache := sync.Map{} + for _, chip := range chips { + localCache.Store(chip.PhyId, sioCache{chip: chip, timestamp: time.Now(), extInfo: mockSioInfo()}) + } + colcommon.UpdateCache[sioCache](n, cacheKey, &localCache) +} + +func mockSioInfo() *common.SioCrcErrStatisticInfo { + return &common.SioCrcErrStatisticInfo{ + TxErrCnt: 0, + RxErrCnt: 0, + } +} +func mockPcieCache(n *colcommon.NpuCollector, chips []colcommon.HuaWeiAIChip, cacheKey string) { + localCache := sync.Map{} + for _, chip := range chips { + pcieInfo := mockPcieInfo() + localCache.Store(chip.PhyId, pcieCache{chip: chip, timestamp: time.Now(), extInfo: &pcieInfo}) + } + colcommon.UpdateCache[pcieCache](n, cacheKey, &localCache) +} + +func mockPcieInfo() common.PCIEBwStat { + return common.PCIEBwStat{ + PcieRxPBw: common.PcieStatValue{PcieMinBw: int32(0), PcieMaxBw: int32(0), PcieAvgBw: int32(0)}, + PcieRxNPBw: common.PcieStatValue{PcieMinBw: int32(0), PcieMaxBw: int32(0), PcieAvgBw: int32(0)}, + PcieRxCPLBw: common.PcieStatValue{PcieMinBw: int32(0), PcieMaxBw: int32(0), PcieAvgBw: int32(0)}, + 
PcieTxPBw: common.PcieStatValue{PcieMinBw: int32(0), PcieMaxBw: int32(0), PcieAvgBw: int32(0)}, + PcieTxNPBw: common.PcieStatValue{PcieMinBw: int32(0), PcieMaxBw: int32(0), PcieAvgBw: int32(0)}, + PcieTxCPLBw: common.PcieStatValue{PcieMinBw: int32(0), PcieMaxBw: int32(0), PcieAvgBw: int32(0)}, + } +} + +func mockOpticalCache(n *colcommon.NpuCollector, chips []colcommon.HuaWeiAIChip, cacheKey string) { + localCache := sync.Map{} + for _, chip := range chips { + localCache.Store(chip.PhyId, opticalCache{chip: chip, timestamp: time.Now(), + extInfo: getMainOptInfo(mockOpticalInfo())}) + } + colcommon.UpdateCache[opticalCache](n, cacheKey, &localCache) +} + +func mockOpticalInfo() map[string]string { + return map[string]string{ + txPower0: "1 mW", + txPower1: "1 mW", + txPower2: "1 mW", + txPower3: "1 mW", + rxPower0: "1 mW", + rxPower1: "1 mW", + rxPower2: "1 mW", + rxPower3: "1 mW", + voltage: "1 mV", + temperature: "50 C", + present: "1.0", + } +} + +func mockHbmCache(n *colcommon.NpuCollector, chips []colcommon.HuaWeiAIChip, cacheKey string) { + localCache := sync.Map{} + for _, chip := range chips { + localCache.Store(chip.PhyId, hbmCache{chip: chip, timestamp: time.Now(), extInfo: mockHbmAggregateInfo(), + hbmUtilization: 0}, + ) + } + colcommon.UpdateCache[hbmCache](n, cacheKey, &localCache) +} + +func mockNetInfoCache(n *colcommon.NpuCollector, chips []colcommon.HuaWeiAIChip, cacheKey string) { + localCache := sync.Map{} + for _, chip := range chips { + localCache.Store(chip.PhyId, netInfoCache{chip: chip, timestamp: time.Now(), extInfo: mockNetInfo()}) + } + colcommon.UpdateCache[netInfoCache](n, cacheKey, &localCache) +} + +func mockNetInfo() *common.NpuNetInfo { + return &common.NpuNetInfo{ + LinkStatusInfo: &common.LinkStatusInfo{LinkState: "0"}, + BandwidthInfo: &common.BandwidthInfo{RxValue: 0, TxValue: 0}, + LinkStatInfo: &common.LinkStatInfo{LinkUPNum: 0}, + LinkSpeedInfo: &common.LinkSpeedInfo{Speed: 0}, + } +} + +func mockChipCache(n 
*colcommon.NpuCollector, chips []colcommon.HuaWeiAIChip, cacheKey string) { + localCache := sync.Map{} + for _, chip := range chips { + localCache.Store(chip.PhyId, chipCache{chip: chip, timestamp: time.Now(), + HealthStatus: "Healthy", + ErrorCodes: []int64{0}, + Utilization: 0, + OverallUtilization: 0, + VectorUtilization: 0, + Temperature: 0, + Power: 0, + Voltage: 0, + AICoreCurrentFreq: 0, + NetHealthStatus: "Healthy", + DevProcessInfo: mockProcessInfo(), + }) + } + colcommon.UpdateCache[chipCache](n, cacheKey, &localCache) +} + +func mockProcessInfo() *common.DevProcessInfo { + return &common.DevProcessInfo{ + ProcNum: 1, + DevProcArray: []common.DevProcInfo{{Pid: 0, MemUsage: 0}}, + } +} + +func mockMemoryInfo() *common.MemoryInfo { + return &common.MemoryInfo{ + MemorySize: 0, + MemoryAvailable: 0, + Frequency: 0, + Utilization: 0, + } +} + +func mockHbmAggregateInfo() *common.HbmAggregateInfo { + return &common.HbmAggregateInfo{ + HbmInfo: &common.HbmInfo{ + MemorySize: 1, + Frequency: 1, + Usage: 1, + Temp: 1, + BandWidthUtilRate: 1, + }, + ECCInfo: &common.ECCInfo{ + EnableFlag: 1, + }, + } +} + +func mockNewNpuCollector() *colcommon.NpuCollector { + tc := newNpuCollectorTestCase{ + cacheTime: time.Duration(num5) * time.Second, + updateTime: time.Duration(num5) * time.Second, + deviceParser: &container.DevicesParser{}, + dmgr: &devmanager.DeviceManager{}, + } + c := colcommon.NewNpuCollector(tc.cacheTime, tc.updateTime, tc.deviceParser, tc.dmgr) + return c +} + +type newNpuCollectorTestCase struct { + cacheTime time.Duration + updateTime time.Duration + deviceParser *container.DevicesParser + dmgr *devmanager.DeviceManager +} + +func mockGetNPUChipList() []colcommon.HuaWeiAIChip { + chips := make([]colcommon.HuaWeiAIChip, 0) + for id := int32(0); id < maxChipNum; id++ { + chip := colcommon.HuaWeiAIChip{ + CardId: id, + PhyId: id, + DeviceID: id, + LogicID: id, + ChipInfo: &common.ChipInfo{ + Name: api.Ascend910, + Type: "Ascend", + Version: "V1", + }, + 
} + + chips = append(chips, chip) + } + return chips +} + +func mockGetContainerNPUInfo() map[int32]container.DevicesInfo { + containsInfo := make(map[int32]container.DevicesInfo) + for id := int32(0); id < maxChipNum; id++ { + + containerInfo := container.DevicesInfo{ + ID: strconv.Itoa(int(id)), + Name: mockContainerName, + Devices: []int{int(id)}, + } + containsInfo[id] = containerInfo + } + return containsInfo +} + +func init() { + logger.HwLogConfig = &hwlog.LogConfig{ + OnlyToStdout: true, + } + logger.InitLogger("Prometheus") + + initChain() +} + +func initChain() { + collectorChain = []colcommon.MetricsCollector{ + &HccsCollector{}, + &BaseInfoCollector{}, + &SioCollector{}, + &VersionCollector{}, + &HbmCollector{}, + &DdrCollector{}, + &VnpuCollector{}, + &PcieCollector{}, + &NetworkCollector{}, + &RoceCollector{}, + &OpticalCollector{}, + } +} + +func createChip() colcommon.HuaWeiAIChip { + return colcommon.HuaWeiAIChip{ + CardId: 0, + PhyId: 0, + DeviceID: 0, + LogicID: 0, + ChipInfo: &common.ChipInfo{ + Name: api.Ascend910, + Type: "Ascend", + Version: "V1", + }, + } +} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/common_utils.go b/mind-cluster/component/npu-exporter/collector/metrics/common_utils.go new file mode 100644 index 0000000..7a0697d --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/metrics/common_utils.go @@ -0,0 +1,193 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package metrics offer common utils for collector +package metrics + +import ( + "math" + "reflect" + "strconv" + "strings" + "time" + + "github.com/prometheus/client_golang/prometheus" + + "ascend-common/common-utils/hwlog" + "ascend-common/devmanager/common" + colcommon "huawei.com/npu-exporter/v6/collector/common" + "huawei.com/npu-exporter/v6/collector/container" + "huawei.com/npu-exporter/v6/utils" + "huawei.com/npu-exporter/v6/utils/logger" +) + +func validateNum(num float64) bool { + if num == -1 || num == math.MaxUint32 || float32(num) == math.MaxUint32 { + return false + } + + return true +} + +func doUpdateTelegrafWithValidateNum(fieldMap map[string]interface{}, desc *prometheus.Desc, + value float64, extInfo string) { + if validateNum(value) { + doUpdateTelegraf(fieldMap, desc, value, extInfo) + } +} + +func doUpdateTelegraf(fieldMap map[string]interface{}, desc *prometheus.Desc, value interface{}, extInfo string) { + fieldMap[utils.GetDescName(desc)+extInfo] = value +} + +func doUpdateMetricWithValidateNum(ch chan<- prometheus.Metric, timestamp time.Time, value float64, + cardLabel []string, desc *prometheus.Desc) { + if validateNum(value) { + doUpdateMetric(ch, timestamp, value, cardLabel, desc) + } +} +func doUpdateMetric(ch chan<- prometheus.Metric, timestamp time.Time, value interface{}, + cardLabel []string, desc *prometheus.Desc) { + var finalValue float64 + + switch value.(type) { + case int: + finalValue = float64(value.(int)) + case int32: + finalValue = float64(value.(int32)) + case int64: + finalValue = float64(value.(int64)) + case uint32: + finalValue = float64(value.(uint32)) + case uint64: + finalValue = float64(value.(uint64)) + case float32: + finalValue = float64(value.(float32)) + case float64: + finalValue = value.(float64) + default: + logger.Errorf("invalid param in function doUpdateMetric,"+ + "metrics name is (%v), 
value type is (%T),value is (%v)", utils.GetDescName(desc), value, value) + } + // collect failed, set value to -1 + if finalValue == common.FailedValue { + finalValue = common.FailedMetricValue + } + ch <- prometheus.NewMetricWithTimestamp(timestamp, + prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, finalValue, cardLabel...)) +} + +func getContainerInfoWithDefault(cNameArray []string) (containerName, namespaceValue, podNameValue string) { + if len(cNameArray) == colcommon.ContainerNameLen { + namespaceValue = cNameArray[colcommon.NameSpaceIdx] + podNameValue = cNameArray[colcommon.PodNameIdx] + containerName = cNameArray[colcommon.ConNameIdx] + } + return containerName, namespaceValue, podNameValue +} + +func geenGeneralCardLabel(chip *colcommon.HuaWeiAIChip, containerMap map[int32]container.DevicesInfo) []string { + + containerInfo := geenContainerInfo(chip, containerMap) + + containerName, namespaceValue, podNameValue := getContainerInfoWithDefault(getContainerNameArray(containerInfo)) + cardLabel := collectCardLabelValue(chip, namespaceValue, podNameValue, containerName) + return cardLabel +} + +func geenContainerInfo(chip *colcommon.HuaWeiAIChip, containerMap map[int32]container.DevicesInfo) container.DevicesInfo { + deviceID := chip.DeviceID + if chip.VDevActivityInfo != nil && chip.VDevActivityInfo.IsVirtualDev { + deviceID = int32(chip.VDevActivityInfo.VDevID) + } + containerInfo, ok := containerMap[deviceID] + if !ok { + containerInfo = container.DevicesInfo{} + } + return containerInfo +} +func collectCardLabelValue(chip *colcommon.HuaWeiAIChip, namespaceValue, podNameValue, containerName string) []string { + + return []string{strconv.FormatInt(int64(chip.DeviceID), colcommon.Base), common.GetNpuName(chip.ChipInfo), chip.VDieID, + chip.PCIeBusInfo, namespaceValue, podNameValue, containerName} +} + +func getContainerNameArray(devInfo container.DevicesInfo) []string { + if devInfo.Name == "" { + return nil + } + + return 
strings.Split(devInfo.Name, "_") +} + +func getFieldMap(fieldsMap map[string]map[string]interface{}, devTagKey int32) map[string]interface{} { + devTagKeyStr := strconv.Itoa(int(devTagKey)) + if fieldsMap[devTagKeyStr] == nil { + fieldsMap[devTagKeyStr] = make(map[string]interface{}) + } + return fieldsMap[devTagKeyStr] +} + +func handleErr(err error, domain string, logicID int32) { + if err != nil { + logErrMetricsWithLimit(domain, logicID, err) + } else { + hwlog.ResetErrCnt(domain, logicID) + } +} + +func logErrMetricsWithLimit(metric string, logicID int32, err error) { + logger.LogfWithOptions(logger.ErrorLevel, logger.LogOptions{ + Domain: metric, + ID: logicID}, + "logicID(%d),%v", logicID, err) +} + +func validateNotNilForEveryElement(objs ...interface{}) bool { + for _, v := range objs { + val := reflect.ValueOf(v) + if val.Kind() != reflect.Ptr { + return false + } + if val.IsNil() { + return false + } + } + return true +} +func logForUnSupportDevice(isSupport bool, devType string, group string, extInfo string) { + if !isSupport { + logger.Infof("devType %v does not support [%v], %v", devType, group, extInfo) + } +} + +func updateFrame[T any](cacheKey string, n *colcommon.NpuCollector, containerMap map[int32]container.DevicesInfo, + chips []colcommon.HuaWeiAIChip, callBack func(chipWithVnpu colcommon.HuaWeiAIChip, cache T, cardLabel []string)) { + + caches := colcommon.GetInfoFromCache[T](n, cacheKey) + if len(caches) == 0 { + logger.Debugf("cacheKey(%v) not found", cacheKey) + return + } + for _, chip := range chips { + cardLabel := geenGeneralCardLabel(&chip, containerMap) + cache, ok := caches[chip.PhyId] + if !ok { + logger.Warnf("cacheKey(%v) not found, chip.PhyId(%v)", cacheKey, chip.PhyId) + continue + } + + callBack(chip, cache, cardLabel) + } +} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/common_utils_test.go b/mind-cluster/component/npu-exporter/collector/metrics/common_utils_test.go new file mode 100644 index 
0000000..9cb88bd --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/metrics/common_utils_test.go @@ -0,0 +1,165 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package metrics offer common utils for collector +package metrics + +import ( + "math" + "testing" + "time" + + "github.com/prometheus/client_golang/prometheus" + "github.com/smartystreets/goconvey/convey" + + "ascend-common/devmanager/common" + colcommon "huawei.com/npu-exporter/v6/collector/common" +) + +const ( + invalidNum = -1 + num100 = 100 +) + +// TestValidateNum test numerical verification +func TestValidateNum(t *testing.T) { + convey.Convey("TestValidateNum", t, func() { + convey.Convey("return true when the num is valid", func() { + convey.So(validateNum(0), convey.ShouldBeTrue) + convey.So(validateNum(num100), convey.ShouldBeTrue) + }) + + convey.Convey("return false when the num is invalid", func() { + convey.So(validateNum(invalidNum), convey.ShouldBeFalse) + convey.So(validateNum(math.MaxUint32), convey.ShouldBeFalse) + }) + }) +} + +// TestDoUpdateTelegraf test update telegraf +func TestDoUpdateTelegraf(t *testing.T) { + convey.Convey("TestDoUpdateTelegraf", t, func() { + fieldMap := make(map[string]interface{}) + desc := prometheus.NewDesc("test_metric", "", nil, nil) + + convey.Convey("update when num is valid", func() { + doUpdateTelegrafWithValidateNum(fieldMap, desc, num100, "_suffix") + 
convey.So(fieldMap["test_metric_suffix"], convey.ShouldEqual, num100) + }) + + convey.Convey("don't update when num is invalid", func() { + doUpdateTelegrafWithValidateNum(fieldMap, desc, -1, "_suffix") + convey.So(fieldMap, convey.ShouldBeEmpty) + }) + }) +} + +// TestDoUpdateMetric test update prometheus +func TestDoUpdateMetric(t *testing.T) { + const ( + num10 = 10 + num100 = 100 + negaNum = -5 + floatNum = 3.14 + ) + convey.Convey("TestDoUpdateMetric", t, func() { + ch := make(chan prometheus.Metric, 1) + desc := prometheus.NewDesc("test_metric", "", []string{"label"}, nil) + + convey.Convey("convert the various numeric types correctly", func() { + testCases := []struct { + input interface{} + expected float64 + }{ + {int(num10), num10}, + {int32(negaNum), negaNum}, + {uint64(num100), num100}, + {float32(floatNum), floatNum}, + } + + for _, tc := range testCases { + doUpdateMetric(ch, time.Now(), tc.input, []string{"label"}, desc) + m := <-ch + convey.So(m, convey.ShouldNotBeEmpty) + } + }) + }) +} + +// TestContainerInfo test container information processing +func TestContainerInfo(t *testing.T) { + convey.Convey("TestContainerInfo", t, func() { + convey.Convey("correctly split the array of container names", func() { + testCases := []struct { + input []string + expected []string + }{ + {[]string{"ns", "pod", "container"}, []string{"container", "ns", "pod"}}, + {[]string{"short"}, []string{"", "", ""}}, + } + + for _, tc := range testCases { + c, ns, pod := getContainerInfoWithDefault(tc.input) + convey.So([]string{c, ns, pod}, convey.ShouldResemble, tc.expected) + } + }) + }) +} + +// TestCardLabel test card label generation +func TestCardLabel(t *testing.T) { + convey.Convey("TestCardLabel", t, func() { + chip := &colcommon.HuaWeiAIChip{ + DeviceID: 0, + ChipInfo: &common.ChipInfo{Name: "1", Type: "1", Version: "1"}, + VDieID: "die1", + PCIeBusInfo: "0000:00:01.0", + } + + expected := []string{ + "0", + "1-1-1", + "die1", + "0000:00:01.0", + "test-ns", + 
"test-pod", + "test-container", + } + + convey.Convey("correctly generate an array of tags", func() { + labels := collectCardLabelValue(chip, "test-ns", "test-pod", "test-container") + convey.So(labels, convey.ShouldResemble, expected) + }) + }) +} + +// TestNilValidation test null pointer validation +func TestNilValidation(t *testing.T) { + convey.Convey("TestNilValidation", t, func() { + var nilPtr *int + val := 10 + + convey.Convey("all non null pointers should return true", func() { + convey.So(validateNotNilForEveryElement(&val), convey.ShouldBeTrue) + }) + + convey.Convey("a null pointer should return false", func() { + convey.So(validateNotNilForEveryElement(nilPtr), convey.ShouldBeFalse) + }) + + convey.Convey("non pointer types should return false", func() { + convey.So(validateNotNilForEveryElement(val), convey.ShouldBeFalse) + }) + }) +} diff --git a/mind-cluster/component/npu-exporter/collector/testdata/prometheus_metrics b/mind-cluster/component/npu-exporter/collector/testdata/prometheus_metrics new file mode 100644 index 0000000..8f51362 --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/testdata/prometheus_metrics @@ -0,0 +1,166 @@ +# HELP machine_npu_nums Amount of npu installed on the machine. 
+# TYPE machine_npu_nums gauge +machine_npu_nums 8 +# HELP npu_chip_info_aicore_current_freq the npu ai core current frequency, unit is 'MHz' +# TYPE npu_chip_info_aicore_current_freq gauge +npu_chip_info_aicore_current_freq{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_aicore_current_freq{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_aicore_current_freq{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_aicore_current_freq{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_aicore_current_freq{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_aicore_current_freq{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_aicore_current_freq{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_aicore_current_freq{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +# HELP npu_chip_info_bandwidth_rx the npu interface receive speed, unit is 'MB/s' +# TYPE npu_chip_info_bandwidth_rx gauge +npu_chip_info_bandwidth_rx{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_bandwidth_rx{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 
+npu_chip_info_bandwidth_rx{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_bandwidth_rx{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_bandwidth_rx{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_bandwidth_rx{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_bandwidth_rx{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_bandwidth_rx{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +# HELP npu_chip_info_bandwidth_tx the npu interface transport speed, unit is 'MB/s' +# TYPE npu_chip_info_bandwidth_tx gauge +npu_chip_info_bandwidth_tx{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_bandwidth_tx{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_bandwidth_tx{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_bandwidth_tx{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_bandwidth_tx{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_bandwidth_tx{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 
+npu_chip_info_bandwidth_tx{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_bandwidth_tx{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +# HELP npu_chip_info_error_code the npu error code +# TYPE npu_chip_info_error_code gauge +npu_chip_info_error_code{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_error_code{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_error_code{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_error_code{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_error_code{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_error_code{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_error_code{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_error_code{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +# HELP npu_chip_info_hbm_total_memory the npu hbm total memory +# TYPE npu_chip_info_hbm_total_memory gauge +npu_chip_info_hbm_total_memory{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_hbm_total_memory{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 
+npu_chip_info_hbm_total_memory{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_hbm_total_memory{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_hbm_total_memory{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_hbm_total_memory{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_hbm_total_memory{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_hbm_total_memory{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +# HELP npu_chip_info_hbm_used_memory the npu hbm used memory +# TYPE npu_chip_info_hbm_used_memory gauge +npu_chip_info_hbm_used_memory{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_hbm_used_memory{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_hbm_used_memory{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_hbm_used_memory{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_hbm_used_memory{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_hbm_used_memory{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 
+npu_chip_info_hbm_used_memory{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_hbm_used_memory{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +# HELP npu_chip_info_health_status the npu health status +# TYPE npu_chip_info_health_status gauge +npu_chip_info_health_status{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 +npu_chip_info_health_status{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 +npu_chip_info_health_status{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 +npu_chip_info_health_status{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 +npu_chip_info_health_status{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 +npu_chip_info_health_status{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 +npu_chip_info_health_status{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 +npu_chip_info_health_status{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 +# HELP npu_chip_info_link_status the npu link status +# TYPE npu_chip_info_link_status gauge +npu_chip_info_link_status{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_link_status{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 
1606402000 +npu_chip_info_link_status{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_link_status{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_link_status{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_link_status{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_link_status{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_link_status{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +# HELP npu_chip_info_name the Ascend npu name with value '1' +# TYPE npu_chip_info_name gauge +npu_chip_info_name{container_name="",id="0",name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 +npu_chip_info_name{container_name="",id="1",name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 +npu_chip_info_name{container_name="",id="2",name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 +npu_chip_info_name{container_name="",id="3",name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 +npu_chip_info_name{container_name="",id="4",name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 +npu_chip_info_name{container_name="",id="5",name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 +npu_chip_info_name{container_name="",id="6",name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 
+npu_chip_info_name{container_name="",id="7",name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 +# HELP npu_chip_info_network_status the npu network health status +# TYPE npu_chip_info_network_status gauge +npu_chip_info_network_status{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_network_status{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_network_status{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_network_status{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_network_status{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_network_status{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_network_status{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_network_status{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +# HELP npu_chip_info_power the npu power +# TYPE npu_chip_info_power gauge +npu_chip_info_power{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_power{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_power{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 
+npu_chip_info_power{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_power{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_power{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_power{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_power{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +# HELP npu_chip_info_temperature the npu temperature +# TYPE npu_chip_info_temperature gauge +npu_chip_info_temperature{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_temperature{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_temperature{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_temperature{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_temperature{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_temperature{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_temperature{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_temperature{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 
1606402000 +# HELP npu_chip_info_total_memory the npu total memory +# TYPE npu_chip_info_total_memory gauge +npu_chip_info_total_memory{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_total_memory{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_total_memory{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_total_memory{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_total_memory{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_total_memory{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_total_memory{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_total_memory{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +# HELP npu_chip_info_used_memory the npu used memory +# TYPE npu_chip_info_used_memory gauge +npu_chip_info_used_memory{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_used_memory{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_used_memory{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_used_memory{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 
+npu_chip_info_used_memory{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_used_memory{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_used_memory{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_used_memory{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +# HELP npu_chip_info_utilization the ai core utilization +# TYPE npu_chip_info_utilization gauge +npu_chip_info_utilization{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_utilization{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_utilization{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_utilization{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_utilization{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_utilization{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_utilization{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_utilization{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +# HELP npu_chip_info_voltage the npu voltage +# TYPE npu_chip_info_voltage gauge 
+npu_chip_info_voltage{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_voltage{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_voltage{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_voltage{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_voltage{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_voltage{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_voltage{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +npu_chip_info_voltage{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 +# HELP npu_exporter_version_info exporter version with value '1' +# TYPE npu_exporter_version_info gauge +npu_exporter_version_info{exporterVersion=""} 1 diff --git a/mind-cluster/component/npu-exporter/collector/testdata/prometheus_metrics2 b/mind-cluster/component/npu-exporter/collector/testdata/prometheus_metrics2 new file mode 100644 index 0000000..bd501ee --- /dev/null +++ b/mind-cluster/component/npu-exporter/collector/testdata/prometheus_metrics2 @@ -0,0 +1,6 @@ +# HELP machine_npu_nums Amount of npu installed on the machine. 
+# TYPE machine_npu_nums gauge +machine_npu_nums 0 +# HELP npu_exporter_version_info exporter version with value '1' +# TYPE npu_exporter_version_info gauge +npu_exporter_version_info{exporterVersion=""} 1 diff --git a/mind-cluster/component/npu-exporter/go.mod b/mind-cluster/component/npu-exporter/go.mod new file mode 100644 index 0000000..0d84960 --- /dev/null +++ b/mind-cluster/component/npu-exporter/go.mod @@ -0,0 +1,63 @@ +module huawei.com/npu-exporter/v6 + +go 1.18 + +require ( + ascend-common v0.0.0 + github.com/agiledragon/gomonkey/v2 v2.8.0 + github.com/golang/protobuf v1.5.3 + github.com/influxdata/telegraf v1.26.3 + github.com/prometheus/client_golang v1.15.0 + github.com/smartystreets/goconvey v1.6.4 + github.com/stretchr/testify v1.8.2 + google.golang.org/grpc v1.57.2 + google.golang.org/protobuf v1.30.0 + k8s.io/cri-api v0.25.13 +) + +require ( + github.com/BurntSushi/toml v1.2.1 // indirect + github.com/alecthomas/participle v0.4.1 // indirect + github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137 // indirect + github.com/awnumar/memcall v0.1.2 // indirect + github.com/awnumar/memguard v0.22.3 // indirect + github.com/benbjohnson/clock v1.3.3 // indirect + github.com/beorn7/perks v1.0.1 // indirect + github.com/blues/jsonata-go v1.5.4 // indirect + github.com/cespare/xxhash/v2 v2.2.0 // indirect + github.com/coreos/go-semver v0.3.1 // indirect + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/fatih/color v1.15.0 // indirect + github.com/fsnotify/fsnotify v1.6.0 // indirect + github.com/gobwas/glob v0.2.3 // indirect + github.com/gogo/protobuf v1.3.2 // indirect + github.com/golang/snappy v0.0.4 // indirect + github.com/google/go-cmp v0.6.0 // indirect + github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 // indirect + github.com/gosnmp/gosnmp v1.35.0 // indirect + github.com/influxdata/toml v0.0.0-20190415235208-270119a8ce65 // indirect + github.com/jtolds/gls v4.20.0+incompatible // indirect + 
github.com/mattn/go-colorable v0.1.13 // indirect + github.com/mattn/go-isatty v0.0.17 // indirect + github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect + github.com/naoina/go-stringutil v0.1.0 // indirect + github.com/philhofer/fwd v1.1.2 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/prometheus/client_model v0.3.0 // indirect + github.com/prometheus/common v0.42.0 // indirect + github.com/prometheus/procfs v0.9.0 // indirect + github.com/prometheus/prometheus v0.42.0 // indirect + github.com/rogpeppe/go-internal v1.11.0 // indirect + github.com/sleepinggenius2/gosmi v0.4.4 // indirect + github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d // indirect + github.com/tinylib/msgp v1.1.8 // indirect + golang.org/x/crypto v0.31.0 // indirect + golang.org/x/net v0.25.0 // indirect + golang.org/x/sys v0.28.0 // indirect + golang.org/x/text v0.21.0 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20230525234030-28d5490b6b19 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect + k8s.io/apimachinery v0.26.2 // indirect +) + +replace ascend-common => ../ascend-common diff --git a/mind-cluster/component/npu-exporter/go.sum b/mind-cluster/component/npu-exporter/go.sum new file mode 100644 index 0000000..d638dd1 --- /dev/null +++ b/mind-cluster/component/npu-exporter/go.sum @@ -0,0 +1,561 @@ +cloud.google.com/go v0.110.1 h1:oDJ19Fu9TX9Xs06iyCw4yifSqZ7JQ8BeuVHcTmWQlOA= +cloud.google.com/go/bigquery v1.51.1 h1:qI/8vkBbzLkv0BJmzE7ajA6uZqQC+C31MAwgb+vJe2U= +cloud.google.com/go/compute v1.19.1 h1:am86mquDUgjGNWxiGn+5PGLbmgiWXlE/yNWpIpNvuXY= +cloud.google.com/go/compute/metadata v0.2.3 h1:mg4jlk7mCAj6xXp9UJ4fjI9VUI5rubuGBW5aJ7UnBMY= +cloud.google.com/go/iam v1.0.0 h1:hlQJMovyJJwYjZcTohUH4o1L8Z8kYz+E+W/zktiLCBc= +cloud.google.com/go/monitoring v1.13.0 h1:2qsrgXGVoRXpP7otZ14eE1I568zAa92sJSDPyOJvwjM= +cloud.google.com/go/pubsub v1.30.1 h1:RdzTlwhswvROjPIoTfnSJ9tEp0LY2S5ATX90anOw7E8= +cloud.google.com/go/storage 
v1.29.0 h1:6weCgzRvMg7lzuUurI4697AqIRPU1SvzHhynwpW31jI= +code.cloudfoundry.org/clock v1.0.0 h1:kFXWQM4bxYvdBw2X8BbBeXwQNgfoWv1vqAk2ZZyBN2o= +collectd.org v0.5.0 h1:y4uFSAuOmeVhG3GCRa3/oH+ysePfO/+eGJNfd0Qa3d8= +github.com/Azure/azure-amqp-common-go/v4 v4.0.0 h1:mV5O74KYmonn0ZXtwfMjGUtZ9Z+Hv7AIFVS1s03sRvo= +github.com/Azure/azure-event-hubs-go/v3 v3.4.0 h1:LtH0nHkXivyV/GajOu5ZFC5sb/5KZ8j+9U8UsfHVTOo= +github.com/Azure/azure-kusto-go v0.8.0 h1:AeO6VBRGzB1BhmWeheSyN+WSrx+1wmhHm47vzptitdw= +github.com/Azure/azure-pipeline-go v0.2.3 h1:7U9HBg1JFK3jHl5qmo4CTZKFTVgMwdFHMVtCdfBE21U= +github.com/Azure/azure-sdk-for-go v65.0.0+incompatible h1:HzKLt3kIwMm4KeJYTdx9EbjRYTySD/t8i1Ee/W5EGXw= +github.com/Azure/azure-sdk-for-go/sdk/azcore v0.21.1 h1:qoVeMsc9/fh/yhxVaA0obYjVH/oI/ihrOoMwsLS9KSA= +github.com/Azure/azure-sdk-for-go/sdk/azidentity v0.13.2 h1:mM/yraAumqMMIYev6zX0oxHqX6hreUs5wXf76W47r38= +github.com/Azure/azure-sdk-for-go/sdk/internal v0.9.1 h1:sLZ/Y+P/5RRtsXWylBjB5lkgixYfm0MQPiwrSX//JSo= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/monitor/armmonitor v0.4.1 h1:P6UDRqlbywdpvhpVZeiB5p+DuhMTrVD4xfvPW55bs8M= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources v0.3.1 h1:EXTDtCSTfPauGawsG+Ae/W46B1PkrgzuKNrcFqy4ljM= +github.com/Azure/azure-sdk-for-go/sdk/storage/azblob v0.3.0 h1:Px2UA+2RvSSvv+RvJNuUB6n7rs5Wsel4dXLe90Um2n4= +github.com/Azure/azure-storage-blob-go v0.15.0 h1:rXtgp8tN1p29GvpGgfJetavIG0V7OgcSXPpwp3tx6qk= +github.com/Azure/azure-storage-queue-go v0.0.0-20191125232315-636801874cdd h1:b3wyxBl3vvr15tUAziPBPK354y+LSdfPCpex5oBttHo= +github.com/Azure/go-amqp v0.18.0 h1:95bTiJq0oxjK1RUlt5T3HF/THj6jWTRZpSXMPSOJLz8= +github.com/Azure/go-ansiterm v0.0.0-20210617225240-d185dfc1b5a1 h1:UQHMgLO+TxOElx5B5HZ4hJQsoJ/PvUvKRhJHDQXO8P8= +github.com/Azure/go-autorest v14.2.0+incompatible h1:V5VMDjClD3GiElqLWO7mz2MxNAK/vTfRHdAubSIPRgs= +github.com/Azure/go-autorest/autorest v0.11.28 h1:ndAExarwr5Y+GaHE6VCaY1kyS/HwwGGyuimVhWsHOEM= 
+github.com/Azure/go-autorest/autorest/adal v0.9.23 h1:Yepx8CvFxwNKpH6ja7RZ+sKX+DWYNldbLiALMC3BTz8= +github.com/Azure/go-autorest/autorest/azure/auth v0.5.12 h1:wkAZRgT/pn8HhFyzfe9UnqOjJYqlembgCTi72Bm/xKk= +github.com/Azure/go-autorest/autorest/azure/cli v0.4.5 h1:0W/yGmFdTIT77fvdlGZ0LMISoLHFJ7Tx4U0yeB+uFs4= +github.com/Azure/go-autorest/autorest/date v0.3.0 h1:7gUk1U5M/CQbp9WoqinNzJar+8KY+LPI6wiWrP/myHw= +github.com/Azure/go-autorest/autorest/to v0.4.0 h1:oXVqrxakqqV1UZdSazDOPOLvOIz+XA683u8EctwboHk= +github.com/Azure/go-autorest/autorest/validation v0.3.1 h1:AgyqjAd94fwNAoTjl/WQXg4VvFeRFpO+UhNyRXqF1ac= +github.com/Azure/go-autorest/logger v0.2.1 h1:IG7i4p/mDa2Ce4TRyAO8IHnVhAVF3RFU+ZtXWSmf4Tg= +github.com/Azure/go-autorest/tracing v0.6.0 h1:TYi4+3m5t6K48TGI9AUdb+IzbnSxvnvUMfuitfgcfuo= +github.com/Azure/go-ntlmssp v0.0.0-20220621081337-cb9428e4ac1e h1:NeAW1fUYUEWhft7pkxDf6WoUvEZJ/uOKsvtpjLnn8MU= +github.com/AzureAD/microsoft-authentication-library-for-go v0.4.0 h1:WVsrXCnHlDDX8ls+tootqRE87/hL9S/g4ewig9RsD/c= +github.com/BurntSushi/toml v1.2.1 h1:9F2/+DoOYIOksmaJFPw1tGFy1eDnIJXg+UHjuD8lTak= +github.com/BurntSushi/toml v1.2.1/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ= +github.com/ClickHouse/clickhouse-go v1.5.4 h1:cKjXeYLNWVJIx2J1K6H2CqyRmfwVJVY1OV1coaaFcI0= +github.com/Masterminds/goutils v1.1.1 h1:5nUrii3FMTL5diU80unEVvNevw1nH4+ZV4DSLVJLSYI= +github.com/Masterminds/semver v1.5.0 h1:H65muMkzWKEuNDnfl9d70GUjFniHKHRbFPGBuZ3QEww= +github.com/Masterminds/sprig v2.22.0+incompatible h1:z4yfnGrZ7netVz+0EDJ0Wi+5VZCSYp4Z0m2dk6cEM60= +github.com/Mellanox/rdmamap v0.0.0-20191106181932-7c3c4763a6ee h1:atI/FFjXh6hIVlPE1Jup9m8N4B9q/OSbMUe2EBahs+w= +github.com/Microsoft/go-winio v0.6.0 h1:slsWYD/zyx7lCXoZVlvQrj0hPTM1HI4+v1sIda2yDvg= +github.com/Shopify/sarama v1.38.1 h1:lqqPUPQZ7zPqYlWpTh+LQ9bhYNu2xJL6k1SJN4WVe2A= +github.com/aerospike/aerospike-client-go/v5 v5.11.0 h1:z3ZmDSm3I10VMXXIIrsFCFq3IenwFqTCnLNyvnFVzrk= +github.com/agiledragon/gomonkey/v2 v2.8.0 
h1:u2K2nNGyk0ippzklz1CWalllEB9ptD+DtSXeCX5O000= +github.com/agiledragon/gomonkey/v2 v2.8.0/go.mod h1:ap1AmDzcVOAz1YpeJ3TCzIgstoaWLA6jbbgxfB4w2iY= +github.com/alecthomas/go-thrift v0.0.0-20170109061633-7914173639b2/go.mod h1:CxCgO+NdpMdi9SsTlGbc0W+/UNxO3I0AabOEJZ3w61w= +github.com/alecthomas/kong v0.2.1/go.mod h1:+inYUSluD+p4L8KdviBSgzcqEjUQOfC5fQDRFuc36lI= +github.com/alecthomas/participle v0.4.1 h1:P2PJWzwrSpuCWXKnzqvw0b0phSfH1kJo4p2HvLynVsI= +github.com/alecthomas/participle v0.4.1/go.mod h1:T8u4bQOSMwrkTWOSyt8/jSFPEnRtd0FKFMjVfYBlqPs= +github.com/alecthomas/repr v0.0.0-20181024024818-d37bc2a10ba1/go.mod h1:xTS7Pm1pD1mvyM075QCDSRqH6qRLXylzS24ZTpRiSzQ= +github.com/alecthomas/repr v0.0.0-20210301060118-828286944d6a/go.mod h1:2kn6fqh/zIyPLmm3ugklbEi5hg5wS435eygvNfaDQL8= +github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= +github.com/alecthomas/units v0.0.0-20190924025748-f65c72e2690d/go.mod h1:rBZYJk541a8SKzHPHnH3zbiI+7dagKZ0cgpgrD7Fyho= +github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137 h1:s6gZFSlWYmbqAuRjVTiNNhvNRfY2Wxp9nhfyel4rklc= +github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137/go.mod h1:OMCwj8VM1Kc9e19TLln2VL61YJF0x1XFtfdL4JdbSyE= +github.com/aliyun/alibaba-cloud-sdk-go v1.62.193 h1:Cwd5cNwrQqtOzOJ1vqswYe3amU3vOz3v0wQF8WizmXI= +github.com/amir/raidman v0.0.0-20170415203553-1ccc43bfb9c9 h1:FXrPTd8Rdlc94dKccl7KPmdmIbVh/OjelJ8/vgMRzcQ= +github.com/andybalholm/brotli v1.0.5 h1:8uQZIdzKmjc/iuPu7O2ioW48L81FgatrcpfFmiq/cCs= +github.com/antchfx/jsonquery v1.3.1 h1:kh3599hMLpygvcxoENcj99eCvnS++JjRX10LjNYhK58= +github.com/antchfx/xmlquery v1.3.15 h1:aJConNMi1sMha5G8YJoAIF5P+H+qG1L73bSItWHo8Tw= +github.com/antchfx/xpath v1.2.5-0.20230505064641-588960cceeac h1:Et7H7mEPWuivbFEXi3dWa8hobnvF380TS2mq7JmgjEI= +github.com/apache/arrow/go/arrow v0.0.0-20211112161151-bc219186db40 h1:q4dksr6ICHXqG5hm0ZW5IHyeEJXoIJSOZeBLmWPNeIQ= +github.com/apache/arrow/go/v12 v12.0.0 
h1:xtZE63VWl7qLdB0JObIXvvhGjoVNrQ9ciIHG2OK5cmc= +github.com/apache/iotdb-client-go v0.12.2-0.20220722111104-cd17da295b46 h1:28HyUQcr8ZCyCAatR0gkf9PuLr52U2T+66tx5Th0nxI= +github.com/apache/thrift v0.18.1 h1:lNhK/1nqjbwbiOPDBPFJVKxgDEGSepKuTh6OLiXW8kg= +github.com/aristanetworks/glog v0.0.0-20191112221043-67e8567f59f3 h1:Bmjk+DjIi3tTAU0wxGaFbfjGUqlxxSXARq9A96Kgoos= +github.com/aristanetworks/goarista v0.0.0-20190325233358-a123909ec740 h1:FD4/ikKOFxwP8muWDypbmBWc634+YcAs3eBrYAmRdZY= +github.com/armon/go-metrics v0.4.1 h1:hR91U9KYmb6bLBYLQjyM+3j+rcd/UhE+G78SFnF8gJA= +github.com/awnumar/memcall v0.1.2 h1:7gOfDTL+BJ6nnbtAp9+HQzUFjtP1hEseRQq8eP055QY= +github.com/awnumar/memcall v0.1.2/go.mod h1:S911igBPR9CThzd/hYQQmTc9SWNu3ZHIlCGaWsWsoJo= +github.com/awnumar/memguard v0.22.3 h1:b4sgUXtbUjhrGELPbuC62wU+BsPQy+8lkWed9Z+pj0Y= +github.com/awnumar/memguard v0.22.3/go.mod h1:mmGunnffnLHlxE5rRgQc3j+uwPZ27eYb61ccr8Clz2Y= +github.com/aws/aws-sdk-go-v2 v1.18.0 h1:882kkTpSFhdgYRKVZ/VCgf7sd0ru57p2JCxz4/oN5RY= +github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.4.10 h1:dK82zF6kkPeCo8J1e+tGx4JdvDIQzj7ygIoLg8WMuGs= +github.com/aws/aws-sdk-go-v2/config v1.18.8 h1:lDpy0WM8AHsywOnVrOHaSMfpaiV2igOw8D7svkFkXVA= +github.com/aws/aws-sdk-go-v2/credentials v1.13.20 h1:oZCEFcrMppP/CNiS8myzv9JgOzq2s0d3v3MXYil/mxQ= +github.com/aws/aws-sdk-go-v2/feature/dynamodb/attributevalue v1.2.0 h1:8kvinmbIDObqsWegKP0JjeanYPiA4GUVpAtciNWE+jw= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.13.2 h1:jOzQAesnBFDmz93feqKnsTHsXrlwWORNZMFHMV+WLFU= +github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.7.1 h1:p9Dys1g2YdaqMalnp6AwCA+tpMMdJNGw5YYKP/u3sUk= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.1.32 h1:dpbVNUjczQ8Ae3QKHbpHBpfvaVkRdesxpTOe9pTouhU= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.4.26 h1:QH2kOS3Ht7x+u0gHCh06CXL/h6G8LQJFpZfFBYBNboo= +github.com/aws/aws-sdk-go-v2/internal/ini v1.3.28 h1:KeTxcGdNnQudb46oOl4d90f2I33DF/c6q3RnZAmvQdQ= +github.com/aws/aws-sdk-go-v2/service/cloudwatch 
v1.25.9 h1:7jgW378oM948BxuOBarXeeaKSrRaCj7didsdeSwYGGo= +github.com/aws/aws-sdk-go-v2/service/cloudwatchlogs v1.20.9 h1:sXs+JjIwgKA27t+5O8YgXl0cmZpEmctyDVO5y6cMdqA= +github.com/aws/aws-sdk-go-v2/service/dynamodb v1.17.3 h1:2oB4ikNEMLaPtu6lbNFJyTSayBILvrOfa2VfOffcuvU= +github.com/aws/aws-sdk-go-v2/service/dynamodbstreams v1.4.0 h1:QbFWJr2SAyVYvyoOHvJU6sCGLnqNT94ZbWElJMEI1JY= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.9.10 h1:dpiPHgmFstgkLG07KaYAewvuptq5kvo52xn7tVSrtrQ= +github.com/aws/aws-sdk-go-v2/service/internal/endpoint-discovery v1.7.23 h1:5AwQnYQT3ZX/N7hPTAx4ClWyucaiqr2esQRMNbJIby0= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.9.26 h1:uUt4XctZLhl9wBE1L8lobU3bVN8SNUP7T+olb0bWBO4= +github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.9.0 h1:0BOlTqnNnrEO04oYKzDxMMe68t107pmIotn18HtVonY= +github.com/aws/aws-sdk-go-v2/service/kinesis v1.17.8 h1:9Kk24woetm1Tm4cAZNoJStJW1VQAeh92lLD9XZ4176g= +github.com/aws/aws-sdk-go-v2/service/s3 v1.19.0 h1:5mRAms4TjSTOGYsqKYte5kHr1PzpMJSyLThjF3J+hw0= +github.com/aws/aws-sdk-go-v2/service/sso v1.12.8 h1:5cb3D6xb006bPTqEfCNaEA6PPEfBXxxy4NNeX/44kGk= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.14.8 h1:NZaj0ngZMzsubWZbrEFSB4rgSQRbFq38Sd6KBxHuOIU= +github.com/aws/aws-sdk-go-v2/service/sts v1.18.9 h1:Qf1aWwnsNkyAoqDqmdM3nHwN78XQjec27LjM6b9vyfI= +github.com/aws/aws-sdk-go-v2/service/timestreamwrite v1.16.0 h1:HHVOprdnZxhM6F5JgljW8nCklfwUyOlbd/wuca6vORA= +github.com/aws/smithy-go v1.13.5 h1:hgz0X/DX0dGqTYpGALqXJoRKRj5oQ7150i5FdTePzO8= +github.com/awslabs/kinesis-aggregation/go v0.0.0-20210630091500-54e17340d32f h1:Pf0BjJDga7C98f0vhw+Ip5EaiE07S3lTKpIYPNS0nMo= +github.com/benbjohnson/clock v1.3.3 h1:g+rSsSaAzhHJYcIQE78hJ3AhyjjtQvleKDjlhdBnIhc= +github.com/benbjohnson/clock v1.3.3/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod 
h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/blues/jsonata-go v1.5.4 h1:XCsXaVVMrt4lcpKeJw6mNJHqQpWU751cnHdCFUq3xd8= +github.com/blues/jsonata-go v1.5.4/go.mod h1:uns2jymDrnI7y+UFYCqsRTEiAH22GyHnNXrkupAVFWI= +github.com/bmatcuk/doublestar/v3 v3.0.0 h1:TQtVPlDnAYwcrVNB2JiGuMc++H5qzWZd9PhkNo5WyHI= +github.com/bufbuild/protocompile v0.4.0 h1:LbFKd2XowZvQ/kajzguUp2DC9UEIQhIq77fZZlaQsNA= +github.com/caio/go-tdigest v3.1.0+incompatible h1:uoVMJ3Q5lXmVLCCqaMGHLBWnbGoN6Lpu7OAUPR60cds= +github.com/cenkalti/backoff v2.2.1+incompatible h1:tNowT99t7UNflLxfYYSlKYsBpXdEet03Pg2g16Swow4= +github.com/cenkalti/backoff/v4 v4.2.0 h1:HN5dHm3WBOgndBH6E8V0q2jIYIR3s9yglV8k/+MN3u4= +github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= +github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/cisco-ie/nx-telemetry-proto v0.0.0-20230117155933-f64c045c77df h1:GmrltUp5Qf5XhT+LmqMDizsgm/6VHTSxPWRdrq21yRo= +github.com/cloudflare/golz4 v0.0.0-20150217214814-ef862a3cdc58 h1:F1EaeKL/ta07PY/k9Os/UFtwERei2/XzGemhpGnBKNg= +github.com/containerd/containerd v1.6.18 h1:qZbsLvmyu+Vlty0/Ex5xc0z2YtKpIsb5n45mAMI+2Ns= +github.com/coocood/freecache v1.2.3 h1:lcBwpZrwBZRZyLk/8EMyQVXRiFl663cCuMOrjCALeto= +github.com/coreos/go-semver v0.3.1 h1:yi21YpKnrx1gt5R+la8n5WgS0kCrsPp33dmEyHReZr4= +github.com/coreos/go-semver v0.3.1/go.mod h1:irMmmIw/7yzSRPWryHsK7EYSg09caPQL03VsM8rvUec= +github.com/couchbase/go-couchbase v0.1.1 h1:ClFXELcKj/ojyoTYbsY34QUrrYCBi/1G749sXSCkdhk= +github.com/couchbase/gomemcached v0.1.3 h1:HIc5qMYNbuhB7zNaiEtj61DCYkquAwrQlf64q7JzdEY= +github.com/couchbase/goutils v0.1.0 h1:0WLlKJilu7IBm98T8nS9+J36lBFVLRUSIUtyD/uWpAE= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 
+github.com/denisenkom/go-mssqldb v0.12.3 h1:pBSGx9Tq67pBOTLmxNuirNTeB8Vjmf886Kx+8Y+8shw= +github.com/devigned/tab v0.1.1 h1:3mD6Kb1mUOYeLpJvTVSDwSg5ZsfSxfvxGRTxRsJsITA= +github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78= +github.com/digitalocean/go-libvirt v0.0.0-20220811165305-15feff002086 h1:FTREXo+EVmU9nOCaQ46PvH0hs1Rt2/diCoTAtxzDxrA= +github.com/dimchansky/utfbom v1.1.1 h1:vV6w1AhK4VMnhBno/TPVCoK9U/LP0PkLCS9tbxHdi/U= +github.com/djherbis/times v1.5.0 h1:79myA211VwPhFTqUk8xehWrsEO+zcIZj0zT8mXPVARU= +github.com/docker/distribution v2.8.2+incompatible h1:T3de5rq0dB1j30rp0sA2rER+m322EBzniBPB6ZIzuh8= +github.com/docker/docker v23.0.4+incompatible h1:Kd3Bh9V/rO+XpTP/BLqM+gx8z7+Yb0AA2Ibj+nNo4ek= +github.com/docker/go-connections v0.4.0 h1:El9xVISelRB7BuFusrZozjnkIM5YnzCViNKohAFqRJQ= +github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= +github.com/doclambda/protobufquery v0.0.0-20220727165953-0da287796ee9 h1:677nbAF3nq56BEZ2R/VMl0wROQqJo4vJ/ZWuzm+vsUU= +github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= +github.com/dynatrace-oss/dynatrace-metric-utils-go v0.5.0 h1:wHGPJSXvwKQVf/XfhjUPyrhpcPKWNy8F3ikH+eiwoBg= +github.com/eapache/go-resiliency v1.3.0 h1:RRL0nge+cWGlxXbUzJ7yMcq6w2XBEr19dCN6HECGaT0= +github.com/eapache/go-xerial-snappy v0.0.0-20230111030713-bf00bc1b83b6 h1:8yY/I9ndfrgrXUbOGObLHKBR4Fl3nZXwM2c7OYTT8hM= +github.com/eapache/queue v1.1.0 h1:YOEu7KNc61ntiQlcEeUIoDTJ2o8mQznoNvUhiigpIqc= +github.com/eclipse/paho.golang v0.10.0 h1:oUGPjRwWcZQRgDD9wVDV7y7i7yBSxts3vcvcNJo8B4Q= +github.com/eclipse/paho.mqtt.golang v1.4.2 h1:66wOzfUHSSI1zamx7jR6yMEI5EuHnT1G6rNA5PM12m4= +github.com/emicklei/go-restful/v3 v3.10.1 h1:rc42Y5YTp7Am7CS630D7JmhRjq4UlEUuEKfrDac4bSQ= +github.com/fatih/color v1.15.0 h1:kOqh6YHBtK8aywxGerMG2Eq3H6Qgoqeo13Bk2Mv/nBs= +github.com/fatih/color v1.15.0/go.mod h1:0h5ZqXfHYED7Bhv2ZJamyIOUej9KtShiJESRwBDUSsw= 
+github.com/form3tech-oss/jwt-go v3.2.5+incompatible h1:/l4kBbb4/vGSsdtB5nUe8L7B9mImVMaBPw9L/0TBHU8= +github.com/fsnotify/fsnotify v1.6.0 h1:n+5WquG0fcWoWp6xPWfHdbskMCQaFnG6PfBrh1Ky4HY= +github.com/fsnotify/fsnotify v1.6.0/go.mod h1:sl3t1tCWJFWoRz9R8WJCbQihKKwmorjAbSClcnxKAGw= +github.com/gabriel-vasile/mimetype v1.4.0 h1:Cn9dkdYsMIu56tGho+fqzh7XmvY2YyGU0FnbhiOsEro= +github.com/go-asn1-ber/asn1-ber v1.5.4 h1:vXT6d/FNDiELJnLb6hGNa309LMsrCoYFvpwHDF0+Y1A= +github.com/go-ldap/ldap/v3 v3.4.4 h1:qPjipEpt+qDa6SI/h1fzuGWoRUY+qqQ9sOZq67/PYUs= +github.com/go-logfmt/logfmt v0.6.0 h1:wGYYu3uicYdqXVgoYbvnkrPVXkuLM1p1ifugDMEdRi4= +github.com/go-logr/logr v1.2.3 h1:2DntVwHkVopvECVRSlL5PSo9eG+cAkDCuckLubN+rq0= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= +github.com/go-openapi/jsonpointer v0.19.6 h1:eCs3fxoIi3Wh6vtgmLTOjdhSpiqphQ+DaPn38N2ZdrE= +github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE= +github.com/go-openapi/swag v0.22.3 h1:yMBqmnQ0gyZvEb/+KzuWZOXgllrXT4SADYbvDaXHv/g= +github.com/go-redis/redis/v7 v7.4.1 h1:PASvf36gyUpr2zdOUS/9Zqc80GbM+9BDyiJSJDDOrTI= +github.com/go-redis/redis/v8 v8.11.5 h1:AcZZR7igkdvfVmQTPnu9WE37LRrO/YrBH5zWyjDC0oI= +github.com/go-sql-driver/mysql v1.6.0 h1:BCTh4TKNUYmOmMUcQ3IipzF5prigylS7XXjEkfCHuOE= +github.com/go-stack/stack v1.8.1 h1:ntEHSVwIt7PNXNpgPmVfMrNhLtgjlmnZha2kOpuRiDw= +github.com/go-stomp/stomp v2.1.4+incompatible h1:D3SheUVDOz9RsjVWkoh/1iCOwD0qWjyeTZMUZ0EXg2Y= +github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y= +github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8= +github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU= +github.com/gofrs/uuid v4.2.0+incompatible h1:yyYWMnhkhrKwwr8gAOcOCYxOOscHgDS9yZgBrnJfGa0= +github.com/gofrs/uuid/v5 v5.0.0 h1:p544++a97kEL+svbcFbCQVM9KFu0Yo25UoISXGNNH9M= 
+github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= +github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/golang-jwt/jwt v3.2.1+incompatible h1:73Z+4BJcrTC+KczS6WvTPvRGOp1WmfEP4Q1lOd9Z/+c= +github.com/golang-jwt/jwt/v4 v4.5.0 h1:7cYmW1XlMY7h7ii7UhUyChSgS5wUJEnm9uZVTGqOWzg= +github.com/golang-sql/civil v0.0.0-20190719163853-cb61b32ac6fe h1:lXe2qZdvpiX5WZkZR4hgp4KJVfY3nMkvmwbVkpv1rVY= +github.com/golang-sql/sqlexp v0.1.0 h1:ZCD6MBpcuOVfGVqsEmY5/4FtYiKz6tSyUv9LPEDei6A= +github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= +github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.5/go.mod h1:6O5/vntMXwX2lRkT1hjjk0nAC1IDOTvTlVgjlRvqsdk= +github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= +github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= +github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/google/flatbuffers v23.3.3+incompatible h1:5PJI/WbJkaMTvpGxsHVKG/LurN/KnWXNyGpwSCDgen0= +github.com/google/gnostic v0.6.9 h1:ZK/5VhkoX835RikCHpSUJV9a+S3e1zLh59YnyWeBW+0= +github.com/google/gnxi v0.0.0-20221016143401-2aeceb5a2901 h1:xlsMG0I0F6Ou3a4zRWu3cThivTt2N2V1cZafIloTBTU= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/go-github/v32 v32.1.0 h1:GWkQOdXqviCPx7Q7Fj+KyPoGm4SwHRh8rheoPhd27II= +github.com/google/go-querystring v1.1.0 h1:AnCroh3fv4ZBgVIf1Iwtovgjaw/GiKJo8M8yD/fhyJ8= 
+github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= +github.com/google/s2a-go v0.1.3 h1:FAgZmpLl/SXurPEZyCMPBIiiYeTbqfjlbdnCNTAkbGE= +github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= +github.com/googleapis/enterprise-certificate-proxy v0.2.3 h1:yk9/cqRKtT9wXZSsRH9aurXEpJX+U6FLtpYTdC3R06k= +github.com/googleapis/gax-go/v2 v2.8.0 h1:UBtEZqx1bjXtOQ5BVTkuYghXrr3N4V123VKJK67vJZc= +github.com/gopcua/opcua v0.3.7 h1:iGjLW3D+ztnjtZQPKsJ0nwibHyDw1m11NfqOU8KSFQ8= +github.com/gophercloud/gophercloud v1.2.0 h1:1oXyj4g54KBg/kFtCdMM6jtxSzeIyg8wv4z1HoGPp1E= +github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 h1:EGx4pi6eqNxGaHF6qqu48+N2wcFQ5qg5FXgOdqsJ5d8= +github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= +github.com/gorilla/mux v1.8.0 h1:i40aqfkR1h2SlN9hojwV5ZA91wcXFOvkdNIeFDP5koI= +github.com/gorilla/websocket v1.5.0 h1:PPwGk2jz7EePpoHN/+ClbZu8SPxiqlu12wZP/3sWmnc= +github.com/gosnmp/gosnmp v1.35.0 h1:EuWWNPxTCdAUx2/NbQcSa3WdNxjzpy4Phv57b4MWpJM= +github.com/gosnmp/gosnmp v1.35.0/go.mod h1:2AvKZ3n9aEl5TJEo/fFmf/FGO4Nj4cVeEc5yuk88CYc= +github.com/grid-x/modbus v0.0.0-20211113184042-7f2251c342c9 h1:Q7e9kXS3sRbTjsNDKazbcbDSGAKjFdk096M3qYbwNpE= +github.com/grid-x/serial v0.0.0-20211107191517-583c7356b3aa h1:Rsn6ARgNkXrsXJIzhkE4vQr5Gbx2LvtEMv4BJOK4LyU= +github.com/gwos/tcg/sdk v0.0.0-20220621192633-df0eac0a1a4c h1:pVr0TkSFnMP4BWSsEak/4bxD8/K+foJ9V8DGyZ6PIDE= +github.com/hailocab/go-hostpool v0.0.0-20160125115350-e80d13ce29ed h1:5upAirOpQc1Q53c0bnx2ufif5kANL7bfZWcc6VJWJd8= +github.com/harlow/kinesis-consumer v0.3.6-0.20211204214318-c2b9f79d7ab6 h1:38nI+nE+oUmLmlNjuByhvnmuBrcQVLNkOJhSSM4eJv0= +github.com/hashicorp/consul/api v1.20.0 h1:9IHTjNVSZ7MIwjlW3N3a7iGiykCMDpxZu8jsxFJh0yc= +github.com/hashicorp/errwrap v1.1.0 h1:OxrOeh75EUXMY8TBjag2fzXGZ40LB6IKw45YeGUDY2I= +github.com/hashicorp/go-cleanhttp v0.5.2 
h1:035FKYIWjmULyFRBKPs8TBQoi0x6d9G4xc9neXJWAZQ= +github.com/hashicorp/go-hclog v1.4.0 h1:ctuWFGrhFha8BnnzxqeRGidlEcQkDyL5u8J8t5eA11I= +github.com/hashicorp/go-immutable-radix v1.3.1 h1:DKHmCUm2hRBK510BaiZlwvpD40f8bJFeZnpfm2KLowc= +github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo= +github.com/hashicorp/go-rootcerts v1.0.2 h1:jzhAVGtqPKbwpyCPELlgNWhE1znq+qwJtW5Oi2viEzc= +github.com/hashicorp/go-uuid v1.0.3 h1:2gKiV6YVmrJ1i2CKKa9obLvRieoRGviZFL26PcT/Co8= +github.com/hashicorp/golang-lru v0.6.0 h1:uL2shRDx7RTrOrTCUZEGP/wJUFiUI8QT6E7z5o8jga4= +github.com/hashicorp/packer-plugin-sdk v0.3.1 h1:Gr/mnihsdUcPfGiruFL93BQkiFh3EFPwyxxTWkwvRsQ= +github.com/hashicorp/serf v0.10.1 h1:Z1H2J60yRKvfDYAOZLd2MU0ND4AH/WDz7xYHDWQsIPY= +github.com/huandu/xstrings v1.3.2 h1:L18LIDzqlW6xN2rEkpdV8+oL/IXWJ1APd+vsdYy4Wdw= +github.com/imdario/mergo v0.3.13 h1:lFzP57bqS/wsqKssCGmtLAb8A0wKjLGrve2q3PPVcBk= +github.com/influxdata/go-syslog/v3 v3.0.0 h1:jichmjSZlYK0VMmlz+k4WeOQd7z745YLsvGMqwtYt4I= +github.com/influxdata/influxdb-observability/common v0.3.3 h1:fzsgJKiV/bucNPRYggLE1F6UgpePQaYh72Lqj1rPEmI= +github.com/influxdata/influxdb-observability/influx2otel v0.3.3 h1:KWesgMC0sqRLfvPZXnCzJauCZ82XoHtKTFJVKmEk63M= +github.com/influxdata/influxdb-observability/otel2influx v0.3.3 h1:zdesvjHJYXccZ4vd6hP6vXwbd6YbAj7AGMhOjk9pt0k= +github.com/influxdata/line-protocol/v2 v2.2.1 h1:EAPkqJ9Km4uAxtMRgUubJyqAr6zgWM0dznKMLRauQRE= +github.com/influxdata/tail v1.0.1-0.20210707231403-b283181d1fa7 h1:0rQOs1VHLVFpAAOIR0mJEvVOIaMYFgYdreeVbgI9sII= +github.com/influxdata/telegraf v1.26.3 h1:wawD3VTdnPDbHnJ1RBGgCf0YB7vlxREZ70rvEepHdGs= +github.com/influxdata/telegraf v1.26.3/go.mod h1:w+VUZ4NRDzfhRmhEdBbbNZBNT7E8qRkLiL73j/pD0ug= +github.com/influxdata/toml v0.0.0-20190415235208-270119a8ce65 h1:vvyMtD5LTJc1W9sQKjDkAWdcg0478CszSdzlHtiAXCY= +github.com/influxdata/toml v0.0.0-20190415235208-270119a8ce65/go.mod h1:zApaNFpP/bTpQItGZNNUMISDMDAnTXu9UqJ4yT3ocz8= +github.com/influxdata/wlog 
v0.0.0-20160411224016-7c63b0a71ef8 h1:W2IgzRCb0L9VzMujq/QuTaZUKcH8096jWwP519mHN6Q= +github.com/intel/iaevents v1.1.0 h1:FzxMBfXk/apG2EUXUCfaq3gUQ+q+TgZ1HNMjjUILUGE= +github.com/jackc/chunkreader/v2 v2.0.1 h1:i+RDz65UE+mmpjTfyz0MoVTnzeYxroil2G82ki7MGG8= +github.com/jackc/pgconn v1.13.0 h1:3L1XMNV2Zvca/8BYhzcRFS70Lr0WlDg16Di6SFGAbys= +github.com/jackc/pgio v1.0.0 h1:g12B9UwVnzGhueNavwioyEEpAmqMe1E/BN9ES+8ovkE= +github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM= +github.com/jackc/pgproto3/v2 v2.3.1 h1:nwj7qwf0S+Q7ISFfBndqeLwSwxs+4DPsbRFjECT1Y4Y= +github.com/jackc/pgservicefile v0.0.0-20200714003250-2b9c44734f2b h1:C8S2+VttkHFdOOCXJe+YGfa4vHYwlt4Zx+IVXQ97jYg= +github.com/jackc/pgtype v1.12.0 h1:Dlq8Qvcch7kiehm8wPGIW0W3KsCCHJnRacKW0UM8n5w= +github.com/jackc/pgx/v4 v4.17.1 h1:tASdE79tX9LOQu3MMvioWT6YaZkf58ZhmLHhV4sv5WM= +github.com/jackc/puddle v1.3.0 h1:eHK/5clGOatcjX3oWGBO/MpxpbHzSwud5EWTSCI+MX0= +github.com/jaegertracing/jaeger v1.38.0 h1:rDQ36TnSxUX4gTskMQzEdpieS0BGYdfXXnUJmGnNMGw= +github.com/james4k/rcon v0.0.0-20120923215419-8fbb8268b60a h1:JxcWget6X/VfBMKxPIc28Jel37LGREut2fpV+ObkwJ0= +github.com/jcmturner/aescts/v2 v2.0.0 h1:9YKLH6ey7H4eDBXW8khjYslgyqG2xZikXP0EQFKrle8= +github.com/jcmturner/dnsutils/v2 v2.0.0 h1:lltnkeZGL0wILNvrNiVCR6Ro5PGU/SeBvVO/8c/iPbo= +github.com/jcmturner/gofork v1.7.6 h1:QH0l3hzAU1tfT3rZCnW5zXl+orbkNMMRGJfdJjHVETg= +github.com/jcmturner/gokrb5/v8 v8.4.3 h1:iTonLeSJOn7MVUtyMT+arAn5AKAPrkilzhGw8wE/Tq8= +github.com/jcmturner/rpc/v2 v2.0.3 h1:7FXXj8Ti1IaVFpSAziCZWNzbNuZmnvw/i6CqLNdWfZY= +github.com/jeremywohl/flatten/v2 v2.0.0-20211013061545-07e4a09fb8e4 h1:eA9wi6ZzpIRobvXkn/S2Lyw1hr2pc71zxzOPl7Xjs4w= +github.com/jhump/protoreflect v1.15.1 h1:HUMERORf3I3ZdX05WaQ6MIpd/NJ434hTp5YiKgfCL6c= +github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg= +github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= +github.com/josharian/native v1.0.0 
h1:Ts/E8zCSEsG17dUqv7joXJFybuMLjQfWE04tsBODTxk= +github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA= +github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= +github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo= +github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU= +github.com/karrick/godirwalk v1.16.2 h1:eY2INUWoB2ZfpF/kXasyjWJ3Ncuof6qZuNWYZFN3kAI= +github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 h1:Z9n2FFNUXsshfwJMBgNA0RU6/i7WVaAegv3PtuIHPMs= +github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= +github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/klauspost/asmfmt v1.3.2 h1:4Ri7ox3EwapiOjCki+hw14RyKk201CN4rzyCJRFLpK4= +github.com/klauspost/compress v1.16.5 h1:IFV2oUNUzZaz+XyusxpLzpzS8Pt5rh0Z16For/djlyI= +github.com/klauspost/cpuid/v2 v2.2.4 h1:acbojRNwl3o09bUq+yDCtZFc1aiwaAAxtcn8YkZXnvk= +github.com/knadh/koanf v1.5.0 h1:q2TSd/3Pyc/5yP9ldIrSdIz26MCcyNQzW0pEAugLPNs= +github.com/kolo/xmlrpc v0.0.0-20220921171641-a4b6fa1dd06b h1:udzkj9S/zlT5X367kqJis0QP7YMxobob6zhzq6Yre00= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/leodido/ragel-machinery v0.0.0-20181214104525-299bdde78165 h1:bCiVCRCs1Heq84lurVinUPy19keqGEe4jh5vtK37jcg= +github.com/linkedin/goavro/v2 v2.12.0 h1:rIQQSj8jdAUlKQh6DttK8wCRv4t4QO09g1C4aBWXslg= +github.com/logzio/azure-monitor-metrics-receiver v1.0.0 h1:TAzhIZL2ueyyc81qIw8FGg4nUbts4Hvc3oOxSobY1IA= +github.com/lufia/plan9stats v0.0.0-20220913051719-115f729f3c8c h1:VtwQ41oftZwlMnOEbMWQtSEUgU64U4s+GHk7hZK+jtY= +github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0VQdvPDY= 
+github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= +github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= +github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= +github.com/mattn/go-ieproxy v0.0.1 h1:qiyop7gCflfhwCzGyeT0gro3sF9AIg9HU98JORTkqfI= +github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= +github.com/mattn/go-isatty v0.0.17 h1:BTarxUcIeDqL27Mc+vyvdWYSL28zpIhv3RoTdsLMPng= +github.com/mattn/go-isatty v0.0.17/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= +github.com/matttproud/golang_protobuf_extensions v1.0.4 h1:mmDVorXM7PCGKw94cs5zkfA9PSy5pEvNWRP0ET0TIVo= +github.com/matttproud/golang_protobuf_extensions v1.0.4/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4= +github.com/mdlayher/apcupsd v0.0.0-20220319200143-473c7b5f3c6a h1:JOlLsLUQnokTyWWwEvOVoKH3XUl6oDMP8jisO54l6J8= +github.com/mdlayher/genetlink v1.2.0 h1:4yrIkRV5Wfk1WfpWTcoOlGmsWgQj3OtQN9ZsbrE+XtU= +github.com/mdlayher/netlink v1.6.0 h1:rOHX5yl7qnlpiVkFWoqccueppMtXzeziFjWAjLg6sz0= +github.com/mdlayher/socket v0.2.3 h1:XZA2X2TjdOwNoNPVPclRCURoX/hokBY8nkTmRZFEheM= +github.com/microsoft/ApplicationInsights-Go v0.4.4 h1:G4+H9WNs6ygSCe6sUyxRc2U81TI5Es90b2t/MwX5KqY= +github.com/miekg/dns v1.1.51 h1:0+Xg7vObnhrz/4ZCZcZh7zPXlmU0aveS2HDBd0m0qSo= +github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 h1:AMFGa4R4MiIpspGNG7Z948v4n35fFGB3RR3G/ry4FWs= +github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3 h1:+n/aFZefKZp7spd8DFdX7uMikMLXX4oubIzJF4kv/wI= +github.com/minio/highwayhash v1.0.2 h1:Aak5U0nElisjDCfPSG79Tgzkn2gl66NxOMspRrKnA/g= +github.com/mitchellh/copystructure v1.2.0 h1:vpKXTN4ewci03Vljg/q9QvCGUDttBOGBIa15WveJJGw= +github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y= +github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= +github.com/mitchellh/mapstructure v1.5.0 
h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= +github.com/mitchellh/reflectwalk v1.0.2 h1:G2LzWKi524PWgd3mLHV8Y5k7s6XUvT0Gef6zxSIeXaQ= +github.com/moby/ipvs v1.1.0 h1:ONN4pGaZQgAx+1Scz5RvWV4Q7Gb+mvfRh3NsPS+1XQQ= +github.com/moby/patternmatcher v0.5.0 h1:YCZgJOeULcxLw1Q+sVR636pmS7sPEn1Qo2iAN6M7DBo= +github.com/moby/sys/sequential v0.5.0 h1:OPvI35Lzn9K04PBbCLW0g4LcFAJgHsvXsRyewg5lXtc= +github.com/moby/term v0.0.0-20221128092401-c43b287e0e0f h1:J/7hjLaHLD7epG0m6TBMGmp4NQ+ibBYLfeyJWdAIFLA= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= +github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= +github.com/montanaflynn/stats v0.6.6 h1:Duep6KMIDpY4Yo11iFsvyqJDyfzLF9+sndUKT+v64GQ= +github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A= +github.com/multiplay/go-ts3 v1.1.0 h1:OWOjRxBCRds+FbpyM1JKSscRbbmYr/IIrh6V78CM5Xw= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/naoina/go-stringutil v0.1.0 h1:rCUeRUHjBjGTSHl0VC00jUPLz8/F9dDzYI70Hzifhks= +github.com/naoina/go-stringutil v0.1.0/go.mod h1:XJ2SJL9jCtBh+P9q5btrd/Ylo8XwT/h1USek5+NqSA0= +github.com/nats-io/jwt/v2 v2.3.0 h1:z2mA1a7tIf5ShggOFlR1oBPgd6hGqcDYsISxZByUzdI= +github.com/nats-io/nats-server/v2 v2.9.9 h1:bmj0RhvHOc8+z5/RuhI38GqPwtkFAHQuU3e99FVA/TI= +github.com/nats-io/nats.go v1.24.0 h1:CRiD8L5GOQu/DcfkmgBcTTIQORMwizF+rPk6T0RaHVQ= +github.com/nats-io/nkeys v0.3.0 h1:cgM5tL53EvYRU+2YLXIK0G2mJtK12Ft9oeooSZMA2G8= +github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw= +github.com/netsampler/goflow2 v1.3.3 h1:uheCMgWwbaHnVdsvc2bqbdQe93E73pVF77WGu/kPE7U= +github.com/newrelic/newrelic-telemetry-sdk-go v0.8.1 h1:6OX5VXMuj2salqNBc41eXKz6K+nV6OB/hhlGnAKCbwU= +github.com/nsqio/go-nsq v1.1.0 h1:PQg+xxiUjA7V+TLdXw7nVrJ5Jbl3sN86EhGCQj4+FYE= +github.com/olivere/elastic v6.2.37+incompatible 
h1:UfSGJem5czY+x/LqxgeCBgjDn6St+z8OnsCuxwD3L0U= +github.com/open-telemetry/opentelemetry-collector-contrib/pkg/pdatautil v0.73.0 h1:b62Oq3dniQm3eg8OcnBnlZCyZ4O85iyKPFuCIeYNCKk= +github.com/openconfig/gnmi v0.9.1 h1:hVOdLTaRjdy68oCGJbkf2vrmnUoQ5xbINqBOAMix4xM= +github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= +github.com/opencontainers/image-spec v1.1.0-rc2 h1:2zx/Stx4Wc5pIPDvIxHXvXtQFW/7XWJGmnM7r3wg034= +github.com/opencontainers/runc v1.1.5 h1:L44KXEpKmfWDcS02aeGm8QNTFXTo2D+8MYGDIJ/GDEs= +github.com/opensearch-project/opensearch-go/v2 v2.2.0 h1:6RicCBiqboSVtLMjSiKgVQIsND4I3sxELg9uwWe/TKM= +github.com/opentracing/opentracing-go v1.2.1-0.20220228012449-10b1cf09e00b h1:FfH+VrHHk6Lxt9HdVS0PXzSXFyS2NbZKXv33FYPol0A= +github.com/p4lang/p4runtime v1.3.0 h1:3fUhHj0JtsGcL2Bh0uxpACdBJBDqpZyLgj93tqKzoJY= +github.com/pborman/ansi v1.0.0 h1:OqjHMhvlSuCCV5JT07yqPuJPQzQl+WXsiZ14gZsqOrQ= +github.com/philhofer/fwd v1.1.2 h1:bnDivRJ1EWPjUIRXV5KfORO897HTbpFAQddBdE8t7Gw= +github.com/philhofer/fwd v1.1.2/go.mod h1:qkPdfjR2SIEbspLqpe1tO4n5yICnr2DY7mqEx2tUTP0= +github.com/pierrec/lz4/v4 v4.1.17 h1:kV4Ip+/hUBC+8T6+2EgburRtkE9ef4nbY3f4dFhGjMc= +github.com/pion/dtls/v2 v2.2.6 h1:yXMxKr0Skd+Ub6A8UqXTRLSywskx93ooMRHsQUtd+Z4= +github.com/pion/logging v0.2.2 h1:M9+AIj/+pxNsDfAT64+MAVgJO0rsyLnoJKCqf//DoeY= +github.com/pion/transport/v2 v2.0.2 h1:St+8o+1PEzPT51O9bv+tH/KYYLMNR5Vwm5Z3Qkjsywg= +github.com/pion/udp/v2 v2.0.1 h1:xP0z6WNux1zWEjhC7onRA3EwwSliXqu1ElUZAQhUP54= +github.com/pkg/browser v0.0.0-20210911075715-681adbf594b8 h1:KoWmjvw+nsYOo29YJK9vDA65RGE3NrOnUtO7a+RF9HU= +github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/power-devops/perfstat 
v0.0.0-20220216144756-c35f1ee13d7c h1:NRoLoZvkBTKvR5gQLgA3e0hqjkY9u1wm+iOL45VN/qI= +github.com/prometheus-community/pro-bing v0.1.0 h1:zjzLGhfNPP0bP1OlzGB+SJcguOViw7df12LPg2vUJh8= +github.com/prometheus/client_golang v1.15.0 h1:5fCgGYogn0hFdhyhLbw7hEsWxufKtY9klyvdNfFlFhM= +github.com/prometheus/client_golang v1.15.0/go.mod h1:e9yaBhRPU2pPNsZwE+JdQl0KEt1N9XgF6zxWmaC0xOk= +github.com/prometheus/client_model v0.3.0 h1:UBgGFHqYdG/TPFD1B1ogZywDqEkwp3fBMvqdiQ7Xew4= +github.com/prometheus/client_model v0.3.0/go.mod h1:LDGWKZIo7rky3hgvBe+caln+Dr3dPggB5dvjtD7w9+w= +github.com/prometheus/common v0.42.0 h1:EKsfXEYo4JpWMHH5cg+KOUWeuJSov1Id8zGR8eeI1YM= +github.com/prometheus/common v0.42.0/go.mod h1:xBwqVerjNdUDjgODMpudtOMwlOwf2SaTr1yjz4b7Zbc= +github.com/prometheus/procfs v0.9.0 h1:wzCHvIvM5SxWqYvwgVL7yJY8Lz3PKn49KQtpgMYJfhI= +github.com/prometheus/procfs v0.9.0/go.mod h1:+pB4zwohETzFnmlpe6yd2lSc+0/46IYZRB/chUwxUZY= +github.com/prometheus/prometheus v0.42.0 h1:G769v8covTkOiNckXFIwLx01XE04OE6Fr0JPA0oR2nI= +github.com/prometheus/prometheus v0.42.0/go.mod h1:Pfqb/MLnnR2KK+0vchiaH39jXxvLMBk+3lnIGP4N7Vk= +github.com/rabbitmq/amqp091-go v1.8.0 h1:GBFy5PpLQ5jSVVSYv8ecHGqeX7UTLYR4ItQbDCss9MM= +github.com/rcrowley/go-metrics v0.0.0-20201227073835-cf1acfcdf475 h1:N/ElC8H3+5XpJzTSTfLsJV/mx9Q9g7kxmchpfZyxgzM= +github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE= +github.com/riemann/riemann-go-client v0.5.1-0.20211206220514-f58f10cdce16 h1:bGXoxRwUpPTCaQ86DRE+3wqE9vh3aH8W0HH5L/ygOFM= +github.com/robbiet480/go.nut v0.0.0-20220219091450-bd8f121e1fa1 h1:YmFqprZILGlF/X3tvMA4Rwn3ySxyE3hGUajBHkkaZbM= +github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= +github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA= +github.com/safchain/ethtool v0.3.0 h1:gimQJpsI6sc1yIqP/y8GYgiXn/NjgvpM0RNoWLVVmP0= +github.com/samuel/go-zookeeper v0.0.0-20200724154423-2164a8ac840e 
h1:CGjiMQ0wMH4wtNWrlj6kiTbkPt2F3rbYnhGX6TWLfco= +github.com/shirou/gopsutil/v3 v3.23.3 h1:Syt5vVZXUDXPEXpIBt5ziWsJ4LdSAAxF4l/xZeQgSEE= +github.com/shoenig/go-m1cpu v0.1.4 h1:SZPIgRM2sEF9NJy50mRHu9PKGwxyyTTJIWvCtgVbozs= +github.com/showwin/speedtest-go v1.4.2 h1:3YjBajURQTJCv/rVwJsd5UtCYlaiqCihg5NhPxJapk8= +github.com/signalfx/com_signalfx_metrics_protobuf v0.0.3 h1:32k2QLgsKhcEs55q4REPKyIadvid5FPy2+VMgvbmKJ0= +github.com/signalfx/gohistogram v0.0.0-20160107210732-1ccfd2ff5083 h1:WsShHmu12ZztYPfh9b+I+VjYD1o8iOHhB67WZCMEEE8= +github.com/signalfx/golib/v3 v3.3.50 h1:TTBpfzsO00F8ep6rhLgBmRIPUpRqBenacezjE4xCweI= +github.com/signalfx/sapm-proto v0.12.0 h1:OtOe+Jm8L61Ml8K6X8a89zc8/RlaaMRElCImeGKR/Ew= +github.com/sirupsen/logrus v1.9.0 h1:trlNQbNUG3OdDrDil03MCb1H2o9nJ1x4/5LYw7byDE0= +github.com/sleepinggenius2/gosmi v0.4.4 h1:xgu+Mt7CptuB10IPt3SVXBAA9tARToT4B9xGzjjxQX8= +github.com/sleepinggenius2/gosmi v0.4.4/go.mod h1:l8OniPmd3bJzw0MXP2/qh7AhP/e+bTY2CNivIhsnDT0= +github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d h1:zE9ykElWQ6/NYmHa3jpm/yHnI4xSofP+UP6SpjHcSeM= +github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc= +github.com/smartystreets/goconvey v1.6.4 h1:fv0U8FUIMPNf1L9lnHLvLhgicrIVChEkdzIKYqbNC9s= +github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA= +github.com/snowflakedb/gosnowflake v1.6.13 h1:r8iozak/p3P2jYfjF3EbeteqMMzPWjwmVrdENJDW6EI= +github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0 h1:1zr/of2m5FGMsad5YfcqgdqdWrIhu+EBEJRhR1U7z/c= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/testify v1.2.2/go.mod 
h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ8= +github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +github.com/testcontainers/testcontainers-go v0.18.0 h1:8RXrcIQv5xX/uBOSmZd297gzvA7F0yuRA37/918o7Yg= +github.com/thomasklein94/packer-plugin-libvirt v0.3.4 h1:K+NkHFcZuiUTp4ZiDdBhWRMZiSMdsXwGuzyg4THKDAU= +github.com/tidwall/gjson v1.14.4 h1:uo0p8EbA09J7RQaflQ1aBRffTR7xedD2bcIVSYxLnkM= +github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= +github.com/tidwall/pretty v1.2.0 h1:RWIZEg2iJ8/g6fDDYzMpobmaoGh5OLl4AXtGUGPcqCs= +github.com/tinylib/msgp v1.1.8 h1:FCXC1xanKO4I8plpHGH2P7koL/RzZs12l/+r7vakfm0= +github.com/tinylib/msgp v1.1.8/go.mod h1:qkpG+2ldGg4xRFmx+jfTvZPxfGFhi64BcnL9vkCm/Tw= +github.com/tklauser/go-sysconf v0.3.11 h1:89WgdJhk5SNwJfu+GKyYveZ4IaJ7xAkecBo+KdJV0CM= +github.com/tklauser/numcpus v0.6.0 h1:kebhY2Qt+3U6RNK7UqpYNA+tJ23IBEGKkB7JQBfDYms= +github.com/uber/jaeger-client-go v2.30.0+incompatible h1:D6wyKGCecFaSRUpo8lCVbaOOb6ThwMmTEbhRwtKR97o= +github.com/uber/jaeger-lib v2.4.1+incompatible h1:td4jdvLcExb4cBISKIpHuGoVXh+dVKhn2Um6rjCsSsg= +github.com/vapourismo/knx-go v0.0.0-20220829185957-fb5458a5389d h1:BJMc7MNW/p80cCkC46JimNuowOWCnSSW5IHjtUrXzNk= +github.com/vishvananda/netlink v1.2.1-beta.2 h1:Llsql0lnQEbHj0I1OuKyp8otXp0r3q0mPkuhwHfStVs= +github.com/vishvananda/netns v0.0.4 h1:Oeaw1EM2JMxD51g9uhtC0D7erkIjgmj8+JZc26m1YX8= +github.com/vjeantet/grok v1.0.1 h1:2rhIR7J4gThTgcZ1m2JY4TrJZNgjn985U28kT2wQrJ4= +github.com/vmware/govmomi v0.28.1-0.20220921224932-b4b508abf208 h1:IDVzGQ2aczmTEfTos4hzmFw20tGQ4zZsVnel9C6VEpA= 
+github.com/wavefronthq/wavefront-sdk-go v0.13.0 h1:3s9maJmzI4orW+hiVBfCNp/SIu8ISXi6rtewmDGzheE= +github.com/wvanbergen/kafka v0.0.0-20171203153745-e2edea948ddf h1:TOV5PC6fIWwFOFra9xJfRXZcL2pLhMI8oNuDugNxg9Q= +github.com/wvanbergen/kazoo-go v0.0.0-20180202103751-f72d8611297a h1:ILoU84rj4AQ3q6cjQvtb9jBjx4xzR/Riq/zYhmDQiOk= +github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= +github.com/xdg-go/pbkdf2 v1.0.0 h1:Su7DPu48wXMwC3bs7MCNG+z4FhcyEuz5dlvchbq0B0c= +github.com/xdg-go/scram v1.1.2 h1:FHX5I5B4i4hKRVRBCFRxq1iQRej7WO3hhBuJf+UUySY= +github.com/xdg-go/stringprep v1.0.4 h1:XLI/Ng3O1Atzq0oBs3TWm+5ZVgkq2aqdlvP9JtoZ6c8= +github.com/xdg/scram v1.0.5 h1:TuS0RFmt5Is5qm9Tm2SoD89OPqe4IRiFtyFY4iwWXsw= +github.com/xdg/stringprep v1.0.3 h1:cmL5Enob4W83ti/ZHuZLuKD/xqJfus4fVPwE+/BDm+4= +github.com/youmark/pkcs8 v0.0.0-20201027041543-1326539a0a0a h1:fZHgsYlfvtyqToslyjUt3VOPF4J7aK/3MPcK7xp3PDk= +github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +github.com/yuin/gopher-lua v0.0.0-20200816102855-ee81675732da h1:NimzV1aGyq29m5ukMK0AMWEhFaL/lrEOaephfuoiARg= +github.com/yusufpapurcu/wmi v1.2.2 h1:KBNDSne4vP5mbSWnJbO+51IMOXJB67QiYCSBrubbPRg= +github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0= +go.mongodb.org/mongo-driver v1.11.2 h1:+1v2rDQUWNcGW7/7E0Jvdz51V38XXxJfhzbV17aNHCw= +go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0= +go.opentelemetry.io/collector v0.73.0 h1:oEBFtf5WcXiIPGXcjOM5gSQ3GNh/3d6pHf0IThhGmfw= +go.opentelemetry.io/collector/component v0.73.0 h1:ka24yVJoVETCru+l5Fm85xGc2y0HwvGfYwyRe7qmjq0= +go.opentelemetry.io/collector/confmap v0.73.0 h1:tC8x8sDk7JQ3QcbosqrxLe756sYcg4iUdTXsx7Ie4CM= +go.opentelemetry.io/collector/consumer v0.73.0 h1:gy89oaG198A7KGbXIsMIdN4lWVQqqSdx6dsBCfzLujU= 
+go.opentelemetry.io/collector/featuregate v0.73.0 h1:hpHKXmRiJqMLefIzXwIuqDo9df2HcI/66IAKLo+g7nc= +go.opentelemetry.io/collector/pdata v1.0.0-rcv0011 h1:7lT0vseP89mHtUpvgmWYRvQZ0eY+SHbVsnXY20xkoMg= +go.opentelemetry.io/collector/semconv v0.73.0 h1:gF4f6z1q8YfWzzo/gPKysjFmmM4Pv4nC2bWrTPxTPaE= +go.opentelemetry.io/otel v1.14.0 h1:/79Huy8wbf5DnIPhemGB+zEPVwnN6fuQybr/SRXa6hM= +go.opentelemetry.io/otel/metric v0.37.0 h1:pHDQuLQOZwYD+Km0eb657A25NaRzy0a+eLyKfDXedEs= +go.opentelemetry.io/otel/sdk v1.14.0 h1:PDCppFRDq8A1jL9v6KMI6dYesaq+DFcDZvjsoGvxGzY= +go.opentelemetry.io/otel/sdk/metric v0.37.0 h1:haYBBtZZxiI3ROwSmkZnI+d0+AVzBWeviuYQDeBWosU= +go.opentelemetry.io/otel/trace v1.14.0 h1:wp2Mmvj41tDsyAJXiWDWpfNsOiIyd38fy85pyKcFq/M= +go.uber.org/atomic v1.10.0 h1:9qC72Qh0+3MqyJbAn8YU5xVq1frD8bn3JtD2oXtafVQ= +go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= +go.uber.org/zap v1.24.0 h1:FiJd5l1UOLj0wCgbSE0rwwXHzEdAZS6hiiSnxJN/D60= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.31.0 h1:ihbySMvVjLAeSH1IbfcRTkD/iNscyz8rGzjF/E5hV6U= +golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= +golang.org/x/exp v0.0.0-20230307190834-24139beb5833 h1:SChBja7BCQewoTAU7IgvucQKMIXrEpFxNMs0spT3/5s= +golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.7.0/go.mod 
h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.17.0 h1:zY54UmvipHiNd+pm+m0x9KhZ9hl1/7QNMyxXbc6ICqA= +golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.3.0/go.mod h1:MBQ8lrhLObU/6UmLb4fmbmk5OcyYmqtbGd/9yIeKjEE= +golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac= +golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= +golang.org/x/oauth2 v0.7.0 h1:qe6s0zUXlPX80/dITx3440hWZ7GwMwgDDyrSGTPJG/g= +golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 
+golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191001151750-bb3f8db39f24/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.3.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA= +golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.3.0/go.mod h1:q750SLmJuPmVoN1blW3UFBPREJfb1KmY3vwxfr+nFDA= +golang.org/x/term v0.27.0 h1:WP60Sv1nlK1T6SupCHbXzSaN0b9wUmsPoRS9b61A23Q= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.5.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo= +golang.org/x/text 
v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= +golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.4.0/go.mod h1:UE5sM2OK9E/d67R0ANs2xJizIymRP5gJU295PvKXxjQ= +golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d h1:vU5i/LfpvrRCpgM/VPfJLg5KjxD3E+hfT1SH+d9zLwg= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 h1:H2TDz8ibqkAF6YGhCdN3jS9O0/s90v0rJh3X/OLHEUk= +golang.zx2c4.com/wireguard v0.0.0-20211209221555-9c9e7e272434 h1:3zl8RkJNQ8wfPRomwv/6DBbH2Ut6dgMaWTxM0ZunWnE= +golang.zx2c4.com/wireguard/wgctrl v0.0.0-20211230205640-daad0b7ba671 h1:tJAYx7pB6b5bNqi7XatStqFT2zFAxhXcGDq1R6FqqjU= +google.golang.org/api v0.121.0 h1:8Oopoo8Vavxx6gt+sgs8s8/X60WBAtKQq6JqnkF+xow= +google.golang.org/appengine v1.6.7 h1:FZR1q0exgwxzPzp/aF+VccGrSfxfPpkBqjIIEq3ru6c= +google.golang.org/genproto v0.0.0-20230526161137-0005af68ea54 
h1:9NWlQfY2ePejTmfwUH1OWwmznFa+0kKcHGPDvcPza9M= +google.golang.org/genproto/googleapis/api v0.0.0-20230525234035-dd9d682886f9 h1:m8v1xLLLzMe1m5P+gCTF8nJB9epwZQUBERm20Oy1poQ= +google.golang.org/genproto/googleapis/rpc v0.0.0-20230525234030-28d5490b6b19 h1:0nDDozoAU19Qb2HwhXadU8OcsiO/09cnTqhUtq2MEOM= +google.golang.org/genproto/googleapis/rpc v0.0.0-20230525234030-28d5490b6b19/go.mod h1:66JfowdXAEgad5O9NnYcsNPLCPZJD++2L9X0PCMODrA= +google.golang.org/grpc v1.57.2 h1:uw37EN34aMFFXB2QPW7Tq6tdTbind1GpRxw5aOX3a5k= +google.golang.org/grpc v1.57.2/go.mod h1:Sd+9RMTACXwmub0zcNY2c4arhtrbBYD1AUHI/dt16Mo= +google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= +google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= +google.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cng= +google.golang.org/protobuf v1.30.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/fatih/pool.v2 v2.0.0 h1:xIFeWtxifuQJGk/IEPKsTduEKcKvPmhoiVDGpC40nKg= +gopkg.in/fsnotify.v1 v1.4.7 h1:xOHLXZwVvI9hhs+cLKq5+I5onOuwQLhQwiu63xxlHs4= +gopkg.in/gorethink/gorethink.v3 v3.0.5 h1:e2Uc/Xe+hpcVQFsj6MuHlYog3r0JYpnTzwDj/y2O4MU= +gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= +gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA= +gopkg.in/olivere/elastic.v5 v5.0.86 h1:xFy6qRCGAmo5Wjx96srho9BitLhZl2fcnpuidPwduXM= +gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= +gopkg.in/tomb.v2 v2.0.0-20161208151619-d5d1b5820637 h1:yiW+nvdHb9LVqSHQBXfZCieqV4fzYhNBql77zY0ykqs= +gopkg.in/yaml.v2 v2.2.2/go.mod 
h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +honnef.co/go/tools v0.2.2 h1:MNh1AVMyVX23VUHE2O27jm6lNj3vjO5DexS4A1xvnzk= +k8s.io/api v0.26.2 h1:dM3cinp3PGB6asOySalOZxEG4CZ0IAdJsrYZXE/ovGQ= +k8s.io/apimachinery v0.26.2 h1:da1u3D5wfR5u2RpLhE/ZtZS2P7QvDgLZTi9wrNZl/tQ= +k8s.io/apimachinery v0.26.2/go.mod h1:ats7nN1LExKHvJ9TmwootT00Yz05MuYqPXEXaVeOy5I= +k8s.io/client-go v0.26.2 h1:s1WkVujHX3kTp4Zn4yGNFK+dlDXy1bAAkIl+cFAiuYI= +k8s.io/cri-api v0.25.13 h1:FaVci3+y5COQuyAFWUckdfOxRpD+m0cnaW2q0OPVm1Q= +k8s.io/cri-api v0.25.13/go.mod h1:yKsLus3raCZ+WbR2m5hS+3hUs5BgSldj2CFJTWyx48M= +k8s.io/klog v1.0.0 h1:Pt+yjF5aB1xDSVbau4VsWe+dQNzA0qv1LlXdC2dF6Q8= +k8s.io/klog/v2 v2.90.1 h1:m4bYOKall2MmOiRaR1J+We67Do7vm9KiQVlT96lnHUw= +k8s.io/kube-openapi v0.0.0-20230303024457-afdc3dddf62d h1:VcFq5n7wCJB2FQMCIHfC+f+jNcGgNMar1uKd6rVlifU= +k8s.io/utils v0.0.0-20230308161112-d77c459e9343 h1:m7tbIjXGcGIAtpmQr7/NAi7RsWoW3E7Zcm4jI1HicTc= +layeh.com/radius v0.0.0-20221205141417-e7fbddd11d68 h1:2NDro2Jzkrqfngy/sA5GVnChs7fx8EzcQKFi/lI2cfg= +lukechampine.com/uint128 v1.2.0 h1:mBi/5l91vocEN8otkC5bDLhi2KdCticRiwbdB0O+rjI= +modernc.org/cc/v3 v3.40.0 h1:P3g79IUS/93SYhtoeaHW+kRCIrYaxJ27MFPv+7kaTOw= +modernc.org/ccgo/v3 v3.16.13 h1:Mkgdzl46i5F/CNR/Kj80Ri59hC8TKAhZrYSaqvkwzUw= +modernc.org/libc v1.22.3 h1:D/g6O5ftAfavceqlLOFwaZuA5KYafKwmr30A6iSqoyY= +modernc.org/mathutil v1.5.0 h1:rV0Ko/6SfM+8G+yKiyI830l3Wuz1zRutdslNoQ0kfiQ= +modernc.org/memory v1.5.0 h1:N+/8c5rE6EqugZwHii4IFsaJ7MUhoWX07J5tC/iI5Ds= +modernc.org/opt v0.1.3 h1:3XOZf2yznlhC+ibLltsDGzABUGVx8J6pnFMS3E4dcq4= +modernc.org/sqlite v1.21.0 h1:4aP4MdUf15i3R3M2mx6Q90WHKz3nZLoz96zlB6tNdow= +modernc.org/strutil v1.1.3 
h1:fNMm+oJklMGYfU9Ylcywl0CO5O6nTfaowNsh2wpPjzY= +modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y= +sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= +sigs.k8s.io/structured-merge-diff/v4 v4.2.3 h1:PRbqxJClWWYMNV1dhaG4NsibJbArud9kFxnAMREiWFE= +sigs.k8s.io/yaml v1.3.0 h1:a2VclLzOGrwOHDiV8EfBGhvjHvP46CtW5j6POvhYGGo= diff --git a/mind-cluster/component/npu-exporter/platforms/inputs/all/npu.go b/mind-cluster/component/npu-exporter/platforms/inputs/all/npu.go new file mode 100644 index 0000000..1318957 --- /dev/null +++ b/mind-cluster/component/npu-exporter/platforms/inputs/all/npu.go @@ -0,0 +1,20 @@ +//go:build !custom || inputs || inputs.npu + +/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package all this for register +package all + +import _ "github.com/influxdata/telegraf/plugins/inputs/npu" // register plugin diff --git a/mind-cluster/component/npu-exporter/platforms/inputs/npu/README.md b/mind-cluster/component/npu-exporter/platforms/inputs/npu/README.md new file mode 100644 index 0000000..72fc73e --- /dev/null +++ b/mind-cluster/component/npu-exporter/platforms/inputs/npu/README.md @@ -0,0 +1,107 @@ +# npu plugin of telegraf +## 使用介绍 +该插件代码可根据以下两种方法来使用(选择其一即可): + +### 1、源码集成使用(适合未安装Telegraf的情况) +对应官方文档:https://docs.influxdata.com/telegraf/v1.26/configure_plugins/external_plugins/write_external_plugin/ +#### **编译步骤:** +拉取telegraf v1.26.0分支源码 +```shell +git clone -b v1.26.0 https://github.com/influxdata/telegraf.git +``` +拉取插件源码 +```shell +git clone -b [latest_tag] https://gitcode.com/Ascend/mind-cluster.git +# [latest_tag]此tag请自行修改,建议采用仓库的最新标签,否则可能导致引用函数失效 +``` +将插件代码集成到telegraf源码中(其中路径按实际修改) +```shell +cp -r mind-cluster/component/npu-exporter/platforms/inputs/npu telegraf/plugins/inputs +``` +将插件注册到telegraf(其中路径按实际修改) +```shell +cp -r mind-cluster/component/npu-exporter/platforms/inputs/all/npu.go telegraf/plugins/inputs/all +``` +将telegraf源码中的Makefile里的“CGO_ENABLED=0”改为“CGO_ENABLED=1” +```shell +cd telegraf +sed -i s"/CGO_ENABLED=0/CGO_ENABLED=1/" Makefile +``` + +将如下内容加入到telegraf源码的go.mod的文件里 +注意:[latest_tag]请自行修改为commitID/分支名称/tag名称中的一种,建议采用仓库的最新标签,否则可能导致引用函数失效 +```go.mod +require huawei.com/npu-exporter/v6 v6.0.0-RC1 + +replace huawei.com/npu-exporter/v6 => gitcode.com/Ascend/mind-cluster.git/component/npu-exporter/v6 [latest_tag] +replace ascend-common => gitcode.com/Ascend/mind-cluster.git/component/ascend-common [latest_tag] +``` + +然后执行 +```shell +go mod tidy +``` +接着编译telegraf +```shell +make all +``` +运行前请先创建日志目录:(该日志是插件调用底层api将记录的日志) +```shell +mkdir -m 750 /var/log/mindx-dl/npu-exporter +``` +源码集成时,该日志可通过hwlog.LogConfig{}结构体来配置,该结构体的详细信息如下 +```go +type LogConfig struct { + // log file path, default 
"/var/log/mindx-dl/npu-exporter/npu-plugin.log" in npu plugin + LogFileName string + // only write to std out, default value: false + OnlyToStdout bool + // only write to file, default value: false + OnlyToFile bool + // log level, -1-debug, 0-info, 1-warning, 2-error 3-critical default value: 0 + LogLevel int + // size of a single log file (MB), default value: 2MB in npu plugin + FileMaxSize int + // MaxLineLength Max length of each log line, default value: 256 + MaxLineLength int + // maximum number of backup log files, set as 2 in npu plugin + MaxBackups int + // maximum number of days for backup log files, default value: 2 + MaxAge int + // whether backup files need to be compressed, default value: false + IsCompress bool + // expiration time for log cache, default value: 1s + ExpiredTime int + // Size of log cache space, default: 2048 + CacheSize int +} +``` +#### **使用示例:** +使用插件中提供的配置文件运行telegraf +```shell +./telegraf --config path_to_plugins/inputs/npu/sample.conf +``` + +### 2、二进制集成,使用telegraf的execd机制(适合已安装Telegraf的情况) +对应官方文档:https://docs.influxdata.com/telegraf/v1.26/configure_plugins/external_plugins/shim/ + +从[MindCluster社区](https://www.hiascend.com/developer/download/community/result?module=cluster)获取npu-exporter软件包,并从中解压出npu-exporter二进制文件 + +### 使用 +运行前请先创建日志目录:(该日志是插件调用底层api将记录的日志) +```shell +mkdir -m 750 /var/log/mindx-dl/npu-exporter +``` +先编写配置文件,如test.conf +``` +[[inputs.execd]] + command = ["path_to_npu_plugin/npu-exporter", "-platform=Telegraf"] + signal = "none" + +[[outputs.file]] + files=["stdout"] +``` +然后运行telegraf +```shell +./telegraf --config path_to_config_file/test.conf +``` \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/platforms/inputs/npu/npu.go b/mind-cluster/component/npu-exporter/platforms/inputs/npu/npu.go new file mode 100644 index 0000000..4c200e0 --- /dev/null +++ b/mind-cluster/component/npu-exporter/platforms/inputs/npu/npu.go @@ -0,0 +1,104 @@ +/* Copyright(C) 2021-2023. 
Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package npu this for parse and pack +package npu + +import ( + _ "embed" + "strings" + + "github.com/influxdata/telegraf" + "github.com/influxdata/telegraf/plugins/inputs" + + "ascend-common/api" + "huawei.com/npu-exporter/v6/collector/common" + "huawei.com/npu-exporter/v6/collector/container" + "huawei.com/npu-exporter/v6/utils/logger" +) + +//go:embed sample.conf +var sampleConfig string + +const ( + num2 = 2 +) + +// WatchNPU npu watch struct +type WatchNPU struct { + collector *common.NpuCollector +} + +// SampleConfig used to return sampleConfig +func (*WatchNPU) SampleConfig() string { + return sampleConfig +} + +// Gather used to gather information from dcmi info and hccn tool info +func (npu *WatchNPU) Gather(acc telegraf.Accumulator) error { + + fieldsMap := make(map[string]map[string]interface{}) + const devName = "ascend" + + devTagValue := "" + if cardType := npu.collector.Dmgr.GetDevType(); cardType == api.Ascend910A3 || cardType == api.Ascend910B || + cardType == api.Ascend910A { + devTagValue = strings.ToLower(api.Ascend910) + } else { + devTagValue = strings.ToLower(cardType) + } + logger.DynamicConfigure(logger.Config{Acc: acc}) + + containerMap := common.GetContainerNPUInfo(npu.collector) + chips := common.GetChipListWithVNPU(npu.collector) + + fieldsMap = npu.gatherChain(fieldsMap, common.ChainForSingleGoroutine, containerMap, chips) + fieldsMap = 
npu.gatherChain(fieldsMap, common.ChainForMultiGoroutine, containerMap, chips) + fieldsMap = npu.gatherChain(fieldsMap, common.ChainForCustomPlugin, containerMap, chips) + + generalFields := fieldsMap[common.GeneralDevTagKey] + acc.AddFields(devName, generalFields, map[string]string{"device": devTagValue}) + + // after the report is completed, deleted to avoid repeated reporting in the for loop + delete(fieldsMap, common.GeneralDevTagKey) + for key, fields := range fieldsMap { + + ids := strings.Split(key, "_") + devTag := map[string]string{"device": devTagValue + "-" + ids[0]} + if len(ids) >= num2 { + devTag["vdev_id"] = ids[1] + } + + acc.AddFields(devName, fields, devTag) + } + + return nil +} + +func (npu *WatchNPU) gatherChain(fieldsMap map[string]map[string]interface{}, chain []common.MetricsCollector, + containerMap map[int32]container.DevicesInfo, chips []common.HuaWeiAIChip) map[string]map[string]interface{} { + + for _, collector := range chain { + fieldsMap = collector.UpdateTelegraf(fieldsMap, npu.collector, containerMap, chips) + } + return fieldsMap +} + +func init() { + inputs.Add("npu", func() telegraf.Input { + return &WatchNPU{ + collector: common.Collector, + } + }) +} diff --git a/mind-cluster/component/npu-exporter/platforms/inputs/npu/npu_test.go b/mind-cluster/component/npu-exporter/platforms/inputs/npu/npu_test.go new file mode 100644 index 0000000..c8adef4 --- /dev/null +++ b/mind-cluster/component/npu-exporter/platforms/inputs/npu/npu_test.go @@ -0,0 +1,174 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package npu this for parse and pack +package npu + +import ( + "fmt" + "strings" + "testing" + "time" + + "github.com/agiledragon/gomonkey/v2" + "github.com/influxdata/telegraf" + "github.com/smartystreets/goconvey/convey" + + "ascend-common/api" + "ascend-common/common-utils/hwlog" + "ascend-common/devmanager" + "huawei.com/npu-exporter/v6/collector/common" + "huawei.com/npu-exporter/v6/collector/container" + "huawei.com/npu-exporter/v6/collector/metrics" + "huawei.com/npu-exporter/v6/utils/logger" +) + +const ( + num5 = 5 +) + +func init() { + logger.HwLogConfig = &hwlog.LogConfig{ + OnlyToStdout: true, + } + logger.InitLogger("Prometheus") + initChain() +} + +func initChain() { + common.ChainForSingleGoroutine = []common.MetricsCollector{ + &metrics.VersionCollector{}, + } +} + +func mockNewNpuCollector() *common.NpuCollector { + tc := newNpuCollectorTestCase{ + cacheTime: time.Duration(num5), + updateTime: time.Duration(num5), + deviceParser: &container.DevicesParser{}, + dmgr: &devmanager.DeviceManager{}, + } + c := common.NewNpuCollector(tc.cacheTime, tc.updateTime, tc.deviceParser, tc.dmgr) + return c +} + +// TestGather verifies different device type scenarios +func TestGather(t *testing.T) { + tests := []struct { + name string + deviceType string + expectedTag string + }{ + {name: api.Ascend910A3, + deviceType: api.Ascend910A3, + expectedTag: api.Ascend910, + }, + {name: api.Ascend310P, + deviceType: api.Ascend310P, + expectedTag: api.Ascend310P, + }, + } + npu := &WatchNPU{ + collector: mockNewNpuCollector(), + } + acc := &MockAccumulator{} + + 
for _, tt := range tests { + convey.Convey(tt.name, t, func() { + patches := gomonkey.NewPatches() + defer patches.Reset() + + patches.ApplyMethodReturn(npu.collector.Dmgr, "GetDevType", tt.deviceType) + patches.ApplyFuncReturn(common.GetContainerNPUInfo, nil) + patches.ApplyFuncReturn(common.GetChipListWithVNPU, nil) + patches.ApplyMethodReturn(common.ChainForSingleGoroutine[0], "UpdateTelegraf", + map[string]map[string]interface{}{ + common.GeneralDevTagKey: {"npu_exporter_version_info": "7.0.0"}, + "0": {"npu_chip_info_power": "1"}, + "1_100": {"npu_chip_info_voltage": "1"}, + }) + + err := npu.Gather(acc) + convey.So(err, convey.ShouldBeNil) + convey.So(acc.fields["ascend,device="+strings.ToLower(tt.expectedTag)], convey.ShouldNotBeEmpty) + }) + } +} + +// TestGatherChain tests the gatherChain method of WatchNPU +func TestGatherChain(t *testing.T) { + npu := &WatchNPU{} + fieldsMap := make(map[string]map[string]interface{}) + chain := []common.MetricsCollector{&metrics.VersionCollector{}} + + convey.Convey("TestGatherChain", t, func() { + result := npu.gatherChain(fieldsMap, chain, nil, nil) + logger.Infof("result:%v", result) + convey.So(len(result), convey.ShouldEqual, 1) + }) +} + +type newNpuCollectorTestCase struct { + cacheTime time.Duration + updateTime time.Duration + deviceParser *container.DevicesParser + dmgr *devmanager.DeviceManager +} + +// MockAccumulator is a mock implementation of telegraf.Accumulator +type MockAccumulator struct { + fields map[string]map[string]interface{} +} + +func (m *MockAccumulator) AddFields(measurement string, fields map[string]interface{}, tags map[string]string, + t ...time.Time) { + if m.fields == nil { + m.fields = make(map[string]map[string]interface{}) + } + pairs := make([]string, 0, len(tags)) + for k, v := range tags { + pairs = append(pairs, fmt.Sprintf("%s=%v", k, v)) + } + metricKey := measurement + "," + strings.Join(pairs, ",") + m.fields[metricKey] = fields +} + +func (m *MockAccumulator) 
AddGauge(measurement string, fields map[string]interface{}, tags map[string]string, + t ...time.Time) { +} + +func (m *MockAccumulator) AddCounter(measurement string, fields map[string]interface{}, tags map[string]string, + t ...time.Time) { +} + +func (m *MockAccumulator) AddSummary(measurement string, fields map[string]interface{}, tags map[string]string, + t ...time.Time) { +} + +func (m *MockAccumulator) AddHistogram(measurement string, fields map[string]interface{}, tags map[string]string, + t ...time.Time) { +} + +func (m *MockAccumulator) AddMetric(metric telegraf.Metric) { +} + +func (m *MockAccumulator) SetPrecision(precision time.Duration) { +} + +func (m *MockAccumulator) AddError(err error) { +} + +func (m *MockAccumulator) WithTracking(maxTracked int) telegraf.TrackingAccumulator { + return nil +} diff --git a/mind-cluster/component/npu-exporter/platforms/inputs/npu/sample.conf b/mind-cluster/component/npu-exporter/platforms/inputs/npu/sample.conf new file mode 100644 index 0000000..11fe998 --- /dev/null +++ b/mind-cluster/component/npu-exporter/platforms/inputs/npu/sample.conf @@ -0,0 +1,9 @@ +[agent] + interval="20s" + flush_interval="20s" + +[[inputs.npu]] + npu_log_level = 1 + +[[outputs.file]] + files=["stdout"] \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/platforms/prom/prometheus_collector.go b/mind-cluster/component/npu-exporter/platforms/prom/prometheus_collector.go new file mode 100644 index 0000000..088eeb9 --- /dev/null +++ b/mind-cluster/component/npu-exporter/platforms/prom/prometheus_collector.go @@ -0,0 +1,103 @@ +/* Copyright(C) 2021-2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package prometheus for prometheus collector +package prom + +import ( + "github.com/prometheus/client_golang/prometheus" + + "huawei.com/npu-exporter/v6/collector/common" + "huawei.com/npu-exporter/v6/collector/container" + "huawei.com/npu-exporter/v6/utils" + "huawei.com/npu-exporter/v6/utils/logger" +) + +// CollectorForPrometheus Entry point for collecting and converting +type CollectorForPrometheus struct { + collector *common.NpuCollector +} + +// NewPrometheusCollector create an instance of prometheus Collector +func NewPrometheusCollector(collector *common.NpuCollector) *CollectorForPrometheus { + promCollector := &CollectorForPrometheus{ + collector: collector, + } + return promCollector +} + +// Describe desc metrics of prometheus +func (*CollectorForPrometheus) Describe(ch chan<- *prometheus.Desc) { + if ch == nil { + logger.Error("ch is nil ") + return + } + const cacheSize = 100 + tempCh := make(chan *prometheus.Desc, cacheSize) + done := make(chan bool) + + go func() { + seenMetrics := make(map[string]struct{}) + for desc := range tempCh { + if desc == nil { + continue + } + descKey := utils.GetDescName(desc) + if _, exists := seenMetrics[descKey]; exists { + logger.Warnf("duplicate metric description detected, keeping first declaration, ignoring duplicate: %s", desc) + continue + } + seenMetrics[descKey] = struct{}{} + ch <- desc + } + // tempCh closed + done <- true + }() + + describeChain(tempCh, common.ChainForSingleGoroutine) + describeChain(tempCh, common.ChainForMultiGoroutine) + describeChain(tempCh, common.ChainForCustomPlugin) + + 
close(tempCh) + + <-done +} + +// describeChain calls Describe on every non-nil collector in chain +func describeChain(ch chan<- *prometheus.Desc, chain []common.MetricsCollector) { + for _, collector := range chain { + if collector != nil { + collector.Describe(ch) + } + } +} + +// Collect update metrics of prometheus +func (n *CollectorForPrometheus) Collect(ch chan<- prometheus.Metric) { + containerMap := common.GetContainerNPUInfo(n.collector) + chips := common.GetChipListWithVNPU(n.collector) + collectChain(ch, n, containerMap, chips, common.ChainForSingleGoroutine) + collectChain(ch, n, containerMap, chips, common.ChainForMultiGoroutine) + collectChain(ch, n, containerMap, chips, common.ChainForCustomPlugin) +} + +// collectChain calls UpdatePrometheus on every non-nil collector in chain; it logs an error and returns when ch is nil +func collectChain(ch chan<- prometheus.Metric, n *CollectorForPrometheus, containerMap map[int32]container.DevicesInfo, + chips []common.HuaWeiAIChip, chain []common.MetricsCollector) { + if ch == nil { + logger.Error("ch is nil") + return + } + for _, collector := range chain { + // skip empty chain entries, the same way describeChain does + if collector == nil { + continue + } + collector.UpdatePrometheus(ch, n.collector, containerMap, chips) + } +} diff --git a/mind-cluster/component/npu-exporter/platforms/prom/prometheus_collector_test.go b/mind-cluster/component/npu-exporter/platforms/prom/prometheus_collector_test.go new file mode 100644 index 0000000..331ca66 --- /dev/null +++ b/mind-cluster/component/npu-exporter/platforms/prom/prometheus_collector_test.go @@ -0,0 +1,159 @@ +/* +Copyright(C) 2021-2025. Huawei Technologies Co.,Ltd. All rights reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License.
+*/ + +// Package prometheus for prometheus collector +package prom + +import ( + "strconv" + "testing" + "time" + + "github.com/agiledragon/gomonkey/v2" + "github.com/prometheus/client_golang/prometheus" + "github.com/smartystreets/goconvey/convey" + + "ascend-common/common-utils/hwlog" + "ascend-common/devmanager" + "huawei.com/npu-exporter/v6/collector/common" + "huawei.com/npu-exporter/v6/collector/container" + "huawei.com/npu-exporter/v6/collector/metrics" + "huawei.com/npu-exporter/v6/utils/logger" +) + +const ( + maxMetricsCount = 2000 + num5 = 5 + mockContainerName = "mockContainerName" + maxChipNum int32 = 8 +) + +func TestDescribe(t *testing.T) { + + convey.Convey("test prometheus desc ", t, func() { + collector := NewPrometheusCollector(nil) + + convey.Convey("test prometheus desc when ch is nil", func() { + collector.Describe(nil) + }) + convey.Convey("test prometheus desc when ch is not nil", func() { + ch := make(chan *prometheus.Desc, maxMetricsCount) + collector.Describe(ch) + t.Logf("Describe len(ch):%v", len(ch)) + + convey.So(ch, convey.ShouldNotBeEmpty) + }) + + }) +} + +// TestCollect checks Collect with a nil channel and with a buffered channel fed by mocked chip/container data +func TestCollect(t *testing.T) { + convey.Convey("test prometheus collect ", t, func() { + npuCollector := mockNewNpuCollector() + collector := NewPrometheusCollector(npuCollector) + + convey.Convey("test prometheus collect when ch is nil", func() { + collector.Collect(nil) + }) + convey.Convey("test prometheus collect when ch is not nil", func() { + + ch := make(chan prometheus.Metric, maxMetricsCount) + + // the patches must be in place before Collect runs, and are reset so they do not leak into other tests + patches := gomonkey.NewPatches() + defer patches.Reset() + patches.ApplyFuncReturn(common.GetChipListWithVNPU, mockGetNPUChipList()) + patches.ApplyFuncReturn(common.GetContainerNPUInfo, mockGetContainerNPUInfo()) + + collector.Collect(ch) + + t.Logf("Collect len(ch):%v", len(ch)) + convey.So(ch, convey.ShouldNotBeEmpty) + }) + }) +} + +func mockNewNpuCollector() *common.NpuCollector { + tc := newNpuCollectorTestCase{ + cacheTime: time.Duration(num5), + updateTime: time.Duration(num5), +
deviceParser: &container.DevicesParser{}, + dmgr: &devmanager.DeviceManager{}, + } + c := common.NewNpuCollector(tc.cacheTime, tc.updateTime, tc.deviceParser, tc.dmgr) + return c +} + +type newNpuCollectorTestCase struct { + cacheTime time.Duration + updateTime time.Duration + deviceParser *container.DevicesParser + dmgr *devmanager.DeviceManager +} + +func mockGetNPUChipList() []common.HuaWeiAIChip { + chips := make([]common.HuaWeiAIChip, 0) + for id := int32(0); id < maxChipNum; id++ { + chip := common.HuaWeiAIChip{ + CardId: id, + PhyId: id, + DeviceID: id, + LogicID: id, + } + + chips = append(chips, chip) + } + return chips +} + +func mockGetContainerNPUInfo() map[int32]container.DevicesInfo { + containsInfo := make(map[int32]container.DevicesInfo) + for id := int32(0); id < maxChipNum; id++ { + + containerInfo := container.DevicesInfo{ + ID: strconv.Itoa(int(id)), + Name: mockContainerName, + Devices: []int{int(id)}, + } + containsInfo[id] = containerInfo + } + return containsInfo +} + +func init() { + logger.HwLogConfig = &hwlog.LogConfig{ + OnlyToStdout: true, + } + logger.InitLogger("Prometheus") + + initChain() +} + +func initChain() { + common.ChainForSingleGoroutine = []common.MetricsCollector{ + &metrics.HccsCollector{}, + &metrics.BaseInfoCollector{}, + &metrics.SioCollector{}, + &metrics.VersionCollector{}, + &metrics.HbmCollector{}, + &metrics.DdrCollector{}, + &metrics.VnpuCollector{}, + &metrics.PcieCollector{}, + } + common.ChainForMultiGoroutine = []common.MetricsCollector{ + &metrics.NetworkCollector{}, + &metrics.RoceCollector{}, + &metrics.OpticalCollector{}, + } +} diff --git a/mind-cluster/component/npu-exporter/plugins/README.md b/mind-cluster/component/npu-exporter/plugins/README.md new file mode 100644 index 0000000..5690dac --- /dev/null +++ b/mind-cluster/component/npu-exporter/plugins/README.md @@ -0,0 +1,388 @@ +## 自定义插件开发说明 + +用户可参考提供的demo,或将代码拷贝到plugins目录下,重新编译部署,下面对demo中各文件进行说明 + +- `dcmi.go` 
、`dcmi_interface_api.h`:用户自定义NPU指标的接口声明与cgo实现,用于对接驱动dcmi接口,具体可参考demo实现,全部dcmi接口需参考驱动的dcmi接口文档。 +- `custom_metrics.go` 实现`MetricsCollector`的接口,用于指标采集与上报,需要实现下面的接口,具体可参考demo实现: + - Describe:prometheus上报指标前,需要先定义指标,该接口用于prometheus的指标定义 + - CollectToCache: 指标采集方法,每个采集周期都会执行,从外部获取数据,并传入到内部缓存中 + - UpdatePrometheus: 按照prometheus的格式,将缓存中的数据返回 + - UpdateTelegraf:按照telegraf的格式,将缓存中的数据返回。 + - IsSupported:检测当前环境,判断是否支持当前设备的检测。 + - PreCollect:正式开始采集前执行一次,可用于设备初始化。可以为空。 + - PostCollect:采集结束后执行一次,可用于数据的回收。可以为空。 +- `register.go`,提供插件注册函数,在npu-exporter启动时完成插件注册并完成dcmi接口初始化,**RegisterPlugin函数签名不要修改**,自定义插件通过`AddPluginCollector`接口注册,指标名称需要与`pluginConfiguration.json`中的指标组名称保持一致 + +对于插件指标组内定义的指标名称,不要与现有代码中已定义的插件指标(当前NPU指标、插件指标)重名 + +自定义插件采集时间超过10s后,npu-exporter会打印日志,提示插件采集时间过长,执行下一个插件采集。 + +### 编译部署 + +插件开发完后,执行npu-exporter代码目录下的`build/build.sh`完成编译,需要提前准备go开发环境。 + +编译完成后,会在output目录下生成新的二进制文件与相关配置文件,根据需要打开或关闭相应开关,根据安装部署章节的安装指导,重新制作镜像并部署即可 + + + +`dcmi.go` + +```go +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package plugins this for dcmi interface +package plugins + +// #cgo LDFLAGS: -ldl +/* + #include <stddef.h> + #include <dlfcn.h> + #include <stdlib.h> + #include <stdio.h> + + #include "dcmi_interface_api.h" + + static void *dcmiHandle; + #define SO_NOT_FOUND -99999 + #define FUNCTION_NOT_FOUND -99998 + #define SUCCESS 0 + #define ERROR_UNKNOWN -99997 + #define CALL_FUNC(name,...)
if(name##_func==NULL){return FUNCTION_NOT_FOUND;}return name##_func(__VA_ARGS__); + + static int (*dcmi_get_device_health_func)(int card_id, int device_id, unsigned int *health); + int dcmi_get_device_health(int card_id, int device_id, unsigned int *health){ + CALL_FUNC(dcmi_get_device_health,card_id,device_id,health) + } + + // load .so files and functions + static int dcmiLoad_dl(const char* dcmiLibPath){ + if (dcmiLibPath == NULL) { + fprintf (stderr,"lib path is null\n"); + return SO_NOT_FOUND; + } + dcmiHandle = dlopen(dcmiLibPath,RTLD_LAZY | RTLD_GLOBAL); + if (dcmiHandle == NULL){ + fprintf (stderr,"%s\n",dlerror()); + return SO_NOT_FOUND; + } + + dcmi_get_device_health_func = dlsym(dcmiHandle,"dcmi_get_device_health"); + + return SUCCESS; + } + + static int dcmiShutDown(void){ + if (dcmiHandle == NULL) { + return SUCCESS; + } + return (dlclose(dcmiHandle) ? ERROR_UNKNOWN : SUCCESS); + } +*/ +import "C" +import ( + "fmt" + + "unsafe" + + "ascend-common/common-utils/utils" + "ascend-common/devmanager/common" +) + +const ( + dcmiLibraryName = "libdcmi.so" +) + +// DcLoad load dcmi symbol +func DcLoad() error { + dcmiLibPath, err := utils.GetDriverLibPath(dcmiLibraryName) + if err != nil { + return err + } + cDcmiTemplateName := C.CString(dcmiLibPath) + defer C.free(unsafe.Pointer(cDcmiTemplateName)) + if retCode := C.dcmiLoad_dl(cDcmiTemplateName); retCode != C.SUCCESS { + return fmt.Errorf("dcmi lib load failed, error code: %d", int32(retCode)) + } + return nil +} + +// DcShutDown clean the dynamically loaded resource +func DcShutDown() error { + if retCode := C.dcmiShutDown(); retCode != C.SUCCESS { + return fmt.Errorf("dcmi shut down failed, error code: %d", int32(retCode)) + } + + return nil +} + +// DcGetDeviceHealth get device health +func DcGetDeviceHealth(cardID, deviceID int32) (int32, error) { + if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { + return common.RetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) + } + 
var health C.uint + if retCode := C.dcmi_get_device_health(C.int(cardID), C.int(deviceID), + &health); int32(retCode) != common.Success { + return common.RetError, fmt.Errorf("get device (cardID: %d, deviceID: %d) health state failed, ret "+ + "code: %d, health code: %d", cardID, deviceID, int32(retCode), int64(health)) + } + if common.IsGreaterThanOrEqualInt32(int64(health)) { + return common.RetError, fmt.Errorf("get wrong health state , device (cardID: %d, deviceID: %d) "+ + "health: %d", cardID, deviceID, int64(health)) + } + return int32(health), nil +} + +``` + + + +`dcmi_interface_api.h` + +```c++ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __DCMI_INTERFACE_API_H__ +#define __DCMI_INTERFACE_API_H__ + +#ifdef __cplusplus +#if __cplusplus +extern "C" { +#endif +#endif /* __cplusplus */ + +#define DCMIDLLEXPORT static + +DCMIDLLEXPORT int dcmi_get_device_health(int card_id, int device_id, unsigned int *health); + +#ifdef __cplusplus +#if __cplusplus +} +#endif +#endif /* __cplusplus */ + +#endif /* __DCMI_INTERFACE_API_H__ */ +``` + + + +`custom_metrics.go` + +```go +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package plugins for custom metrics +package plugins + +import ( + "strings" + "sync" + "time" + + "github.com/prometheus/client_golang/prometheus" + + "huawei.com/npu-exporter/v6/collector/common" + "huawei.com/npu-exporter/v6/collector/container" + "huawei.com/npu-exporter/v6/utils/logger" +) + +var ( + PluginInfoDesc = prometheus.NewDesc("plugin_info", "exporter custom plugin info", + []string{"plugin_label"}, nil) + + PluginNpuInfoDesc = prometheus.NewDesc("npu_plugin_info", "exporter custom npu plugin info", + []string{"npu_plugin_label"}, nil) +) + +const ( + pluginInfoKey = "pluginInfoKey" + pluginInfoValue = 1.11111 + pluginLabel = "pluginLabel" + npuPluginLabel = "npuPluginInfoKey" + npuPluginInfoKey = "npuPluginInfoKey" + pluginName = "MyPlugin" +) + +// PluginInfoCollector collect custom plugin info +type PluginInfoCollector struct { + common.MetricsCollectorAdapter + Cache sync.Map +} + +// Describe description of the metric +func (c *PluginInfoCollector) Describe(ch chan<- *prometheus.Desc) { + // add desc + logger.Debug("PluginInfoCollector Describe") + ch <- PluginInfoDesc + ch <- PluginNpuInfoDesc +} + +// CollectToCache collect the metric to cache +func (c *PluginInfoCollector) CollectToCache(n *common.NpuCollector, chipList []common.HuaWeiAIChip) { + // collect metric to cache + logger.Debug("PluginInfoCollector CollectToCache") + c.Cache.Store(pluginInfoKey, pluginInfoValue) + health, err := DcGetDeviceHealth(0, 0) + if err != nil { + logger.Error(err) + return + } + c.Cache.Store(npuPluginInfoKey, health) +} + +// UpdatePrometheus update 
prometheus metric, entries that are not in the cache are skipped +func (c *PluginInfoCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *common.NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []common.HuaWeiAIChip) { + logger.Debug("PluginInfoCollector UpdatePrometheus") + // get metric from cache, an entry is absent when CollectToCache has not stored it (e.g. DcGetDeviceHealth failed) + pluginCache, _ := c.Cache.Load(pluginInfoKey) + npuPluginCache, _ := c.Cache.Load(npuPluginInfoKey) + // update plugin info, the checked assertion is false for a missing entry instead of panicking + if pluginValue, ok := pluginCache.(float64); ok { + ch <- prometheus.NewMetricWithTimestamp(time.Now(), + prometheus.MustNewConstMetric(PluginInfoDesc, prometheus.GaugeValue, pluginValue, pluginLabel)) + } + // update npu plugin info + if health, ok := npuPluginCache.(int32); ok { + ch <- prometheus.NewMetricWithTimestamp(time.Now(), + prometheus.MustNewConstMetric(PluginNpuInfoDesc, prometheus.GaugeValue, float64(health), npuPluginLabel)) + } + +} + +// UpdateTelegraf update telegraf metric, entries that are not in the cache are skipped +func (c *PluginInfoCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *common.NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []common.HuaWeiAIChip) map[string]map[string]interface{} { + logger.Debug("PluginInfoCollector UpdateTelegraf") + // get metric from cache, an entry is absent when CollectToCache has not stored it (e.g. DcGetDeviceHealth failed) + pluginCache, _ := c.Cache.Load(pluginInfoKey) + npuPluginCache, _ := c.Cache.Load(npuPluginInfoKey) + // update plugin info + if fieldsMap[common.GeneralDevTagKey] == nil { + fieldsMap[common.GeneralDevTagKey] = make(map[string]interface{}) + } + if pluginValue, ok := pluginCache.(float64); ok { + doUpdateTelegraf(fieldsMap[common.GeneralDevTagKey], PluginInfoDesc, pluginValue, "") + } + // update npu plugin info + const NpuLogicID = "1" + if health, ok := npuPluginCache.(int32); ok { + if fieldsMap[NpuLogicID] == nil { + fieldsMap[NpuLogicID] = make(map[string]interface{}) + } + doUpdateTelegraf(fieldsMap[NpuLogicID], PluginNpuInfoDesc, float64(health), "") + } + return fieldsMap +} + +// PreCollect pre handle before collect +func (c *PluginInfoCollector) PreCollect(n *common.NpuCollector, chipList []common.HuaWeiAIChip) { + logger.Debug("PluginInfoCollector PreCollect") +} + +// PostCollect post
handle after collect +func (c *PluginInfoCollector) PostCollect(n *common.NpuCollector) { + logger.Debug("PluginInfoCollector PostCollect") +} + +// IsSupported Check whether the current hardware supports this metric +func (c *PluginInfoCollector) IsSupported(n *common.NpuCollector) bool { + logger.Debug("PluginInfoCollector IsSupported") + return true +} + +// getDescName parse metrics name from prometheus.Desc object +func getDescName(desc *prometheus.Desc) string { + str := desc.String() + startIndex := strings.Index(str, "fqName: ") + len("fqName: ") + readfqName := str[startIndex:] + + endIndex := strings.Index(readfqName, ",") + if endIndex != -1 { + readfqName = readfqName[:endIndex] + } + + readfqName = strings.Trim(readfqName, "\"") + return readfqName +} + +func doUpdateTelegraf(fieldMap map[string]interface{}, desc *prometheus.Desc, value interface{}, extInfo string) { + fieldMap[getDescName(desc)+extInfo] = value +} + + +``` + + + +`register.go` + +```go +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package plugins for custom metrics +package plugins + +import ( + "huawei.com/npu-exporter/v6/collector/config" + "huawei.com/npu-exporter/v6/utils/logger" +) + +// RegisterPlugin register plugin collector +func RegisterPlugin() { + if err := config.AddPluginCollector(pluginName, &PluginInfoCollector{}); err != nil { + logger.Errorf("add plugin failed: %v\n", err) + // the collector was not registered, so skip the success log and the dcmi initialization + return + } + logger.Infof("add plugin ok: %v\n", pluginName) + // load the dcmi library that DcGetDeviceHealth calls into + if err := DcLoad(); err != nil { + logger.Errorf("dcmi init failed: %v\n", err) + } +} + +``` + diff --git a/mind-cluster/component/npu-exporter/plugins/collector_for_text_file.go b/mind-cluster/component/npu-exporter/plugins/collector_for_text_file.go new file mode 100644 index 0000000..db462a4 --- /dev/null +++ b/mind-cluster/component/npu-exporter/plugins/collector_for_text_file.go @@ -0,0 +1,358 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License.
+*/ + +// Package plugins for custom metrics +package plugins + +import ( + "encoding/json" + "fmt" + "os" + "sort" + "strings" + "sync" + "time" + + "github.com/prometheus/client_golang/prometheus" + + "ascend-common/common-utils/hwlog" + "ascend-common/common-utils/utils" + "huawei.com/npu-exporter/v6/collector/common" + "huawei.com/npu-exporter/v6/collector/config" + "huawei.com/npu-exporter/v6/collector/container" + npuutils "huawei.com/npu-exporter/v6/utils" + "huawei.com/npu-exporter/v6/utils/logger" +) + +var ( + metricDesc *prometheus.Desc + labelKeys []string // a list of tag keys extracted from the datalist + jsonFilePath string + isSupported bool + currentVersion versionInfo +) + +const ( + size100k = 100 * 1024 + maxLabelSize = 10 + num1000 = 1000 + maxDataListSize = 128 + maxMetricNameSize = 128 + maxDescSize = 1024 + fileMetricsDisabledMsg = "file metrics collection will be disabled" + skipCurrentCollectionMsg = "will skip current collection and report cached metrics" + excludedPermission = 0111 // file should not have any execute permission +) + +type versionInfo struct { + name string + desc string + version string +} + +// TextMetricData represents the JSON structure +type TextMetricData struct { + Version string `json:"version"` + Desc string `json:"desc"` + Name string `json:"name"` + Timestamp int64 `json:"timestamp"` + DataList []DataItem `json:"data_list"` +} + +// DataItem represents each item in data_list +type DataItem struct { + Label map[string]string `json:"label"` + Value float64 `json:"value"` +} + +// InitTextMetricsDesc init text metric +func InitTextMetricsDesc(filePath string) { + if filePath == "" { + return + } + paths := strings.Split(filePath, ",") + if len(paths) > 1 { + logger.Warnf("multiple file paths detected in filePath: %s, only the first file will be used", filePath) + jsonFilePath = strings.TrimSpace(paths[0]) + } else { + jsonFilePath = filePath + } + if utils.IsDir(jsonFilePath) { + logger.Errorf("file path %s is a 
directory, only support specify file path", filePath) + return + } + fileData, err := waitForFile(jsonFilePath, time.Minute) + if err != nil { + logger.Warnf("read json file %s failed, %s: %v", jsonFilePath, fileMetricsDisabledMsg, err) + return + } + var metricsData TextMetricData + if err := json.Unmarshal(fileData, &metricsData); err != nil { + logger.Warnf("unmarshal json file %s failed, %s: %v, "+ + "Possible causes:\n1. The file is not in JSON format\n2. File size is more than 100KB ", jsonFilePath, fileMetricsDisabledMsg, err) + return + } + + if err := isDataOk(&metricsData); err != nil { + logger.Warnf("%v, %s", err, fileMetricsDisabledMsg) + return + } + + desc := metricsData.Desc + labelKeys = make([]string, 0, len(metricsData.DataList[0].Label)) + for key := range metricsData.DataList[0].Label { + labelKeys = append(labelKeys, key) + } + sort.Strings(labelKeys) + logger.Infof("init text metric succeeded, metricName: %v, version: %v, desc: %v, labels: %v", + metricsData.Name, metricsData.Version, desc, labelKeys) + + metricDesc = prometheus.NewDesc(metricsData.Name, desc, labelKeys, nil) + isSupported = true + currentVersion = versionInfo{ + name: metricsData.Name, + desc: desc, + version: metricsData.Version, + } + err = config.AddPluginCollector("text", &TextMetricsInfoCollector{}) + if err != nil { + logger.Errorf("%v", err) + } +} + +func isDataOk(metricsData *TextMetricData) error { + if len(metricsData.DataList) == 0 { + return fmt.Errorf("dataList is empty in json file %s", jsonFilePath) + } + if len(metricsData.DataList) > maxDataListSize { + return fmt.Errorf("size of dataList(%d) is more than max allowed dataList size(%d) in json file %s", + len(metricsData.DataList), maxDataListSize, jsonFilePath) + } + if len(metricsData.DataList[0].Label) > maxLabelSize { + return fmt.Errorf("size of first item's Label(%d) is more than max allowed label size(%d) in json file %s", + len(metricsData.DataList[0].Label), maxLabelSize, jsonFilePath) + } + if 
metricsData.Name == "" { + return fmt.Errorf("name field is empty in json file %s", jsonFilePath) + } + if len(metricsData.Name) > maxMetricNameSize { + return fmt.Errorf("length of metric name should not larger than %d, but current is %d", + maxMetricNameSize, len(metricsData.Name)) + } + if metricsData.Desc == "" { + return fmt.Errorf("desc field is empty in json file %s", jsonFilePath) + } + if len(metricsData.Desc) > maxDescSize { + return fmt.Errorf("length of metric desc should not larger than %d, but current is %d", + maxDescSize, len(metricsData.Desc)) + } + if metricsData.Version == "" { + return fmt.Errorf("version field is empty in json file %s", jsonFilePath) + } + // only support 1.0 version currently + if metricsData.Version != "1.0" { + return fmt.Errorf("version should be 1.0, but current is %s", metricsData.Version) + } + if metricsData.Timestamp <= 0 { + return fmt.Errorf("timestamp field is empty or not correct in json file %s", jsonFilePath) + } + return nil +} + +// waitForFile wait for file to exist +func waitForFile(filePath string, timeout time.Duration) ([]byte, error) { + const tickerDuration = 100 + deadline := time.Now().Add(timeout) + ticker := time.NewTicker(tickerDuration * time.Millisecond) + defer ticker.Stop() + once := sync.Once{} + + for { + fileData, err := utils.ReadLimitBytes(filePath, size100k) + err2 := checkFile(filePath) + if err2 != nil { + hwlog.RunLog.Errorf("check file err, %s: %v", filePath, err2) + } + if err2 != nil && !os.IsNotExist(err2) { + return nil, err2 + } + + if err == nil && err2 == nil && len(fileData) > 0 { + logger.Infof("successfully read json file %s", filePath) + return fileData, nil + } + if os.IsNotExist(err) || len(fileData) == 0 { + once.Do(func() { + logger.Warnf("file [%v] is not exist or file is empty, will wait 1 minute", filePath) + }) + if time.Now().After(deadline) { + return nil, fmt.Errorf("file %s does not exist or file is empty after waiting %v", filePath, timeout) + } + select { + 
case <-ticker.C: + continue + } + } + return nil, err + } +} + +func checkFile(filePath string) error { + absFilePath, err := utils.CheckPath(filePath) + if err != nil { + return err + } + if err = utils.DoCheckOwnerAndPermission(absFilePath, excludedPermission, 0); err != nil { + logger.Errorf("file permission should not included %04o: %v", excludedPermission, err) + return err + } + return nil +} + +// TextMetricsInfoCollector collect custom plugin info +type TextMetricsInfoCollector struct { + common.MetricsCollectorAdapter + Cache sync.Map +} + +// Describe description of the metric +func (c *TextMetricsInfoCollector) Describe(ch chan<- *prometheus.Desc) { + // add desc + if metricDesc != nil { + ch <- metricDesc + } +} + +// CollectToCache collect the metric to cache +func (c *TextMetricsInfoCollector) CollectToCache(n *common.NpuCollector, chipList []common.HuaWeiAIChip) { + // collect metric to cache + logger.Debugf("TextMetricsInfoCollector CollectToCache") + + fileData, err := utils.ReadLimitBytes(jsonFilePath, size100k) + if err != nil { + logger.LogfWithOptions(logger.WarnLevel, logger.LogOptions{Domain: "textMetrics", ID: "readFileErr"}, + "read json file %s failed: %v", jsonFilePath, err) + return + } + hwlog.ResetErrCnt("textMetrics", "readFileErr") + + var metricsData TextMetricData + if err := json.Unmarshal(fileData, &metricsData); err != nil { + logger.LogfWithOptions(logger.WarnLevel, logger.LogOptions{Domain: "textMetrics", ID: "unmarshalFileErr"}, + "unmarshal json file %s failed: %v", jsonFilePath, err) + return + } + hwlog.ResetErrCnt("textMetrics", "unmarshalFileErr") + + if err := isDataOk(&metricsData); err != nil { + logger.LogfWithOptions(logger.WarnLevel, logger.LogOptions{Domain: "textMetrics", ID: "dataNotOk"}, + "%v, %s", err, skipCurrentCollectionMsg) + return + } + hwlog.ResetErrCnt("textMetrics", "dataNotOk") + + if versionChanged(metricsData) { + logger.LogfWithOptions(logger.ErrorLevel, logger.LogOptions{Domain: "textMetrics", 
ID: "versionChanged"}, + "json file base info changed, old: %v, new: %v", currentVersion, + versionInfo{name: metricsData.Name, desc: metricsData.Desc, version: metricsData.Version}) + return + } + hwlog.ResetErrCnt("textMetrics", "versionChanged") + + c.Cache.Store(common.GetCacheKey(c), metricsData) +} + +func versionChanged(data TextMetricData) bool { + if currentVersion.name != data.Name || currentVersion.desc != data.Desc || + currentVersion.version != data.Version { + return true + } + return false +} + +// UpdatePrometheus update prometheus metric +func (c *TextMetricsInfoCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *common.NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []common.HuaWeiAIChip) { + logger.Debug("TextMetricsInfoCollector UpdatePrometheus") + if metricDesc == nil { + logger.Warnf("metricDesc is not initialized, skip UpdatePrometheus") + return + } + cacheKey := common.GetCacheKey(c) + data, ok := c.Cache.Load(cacheKey) + if !ok { + logger.Debugf("cache key %s not found", cacheKey) + return + } + + textMetricsData, ok := data.(TextMetricData) + if !ok { + logger.Warnf("cache data type mismatch for key %s", cacheKey) + return + } + + timestamp := time.Unix(0, textMetricsData.Timestamp*num1000) + + for _, item := range textMetricsData.DataList { + labelValues := make([]string, len(labelKeys)) + for i, key := range labelKeys { + if value, ok := item.Label[key]; ok { + labelValues[i] = value + } else { + labelValues[i] = "" + } + } + + ch <- prometheus.NewMetricWithTimestamp(timestamp, + prometheus.MustNewConstMetric(metricDesc, prometheus.GaugeValue, item.Value, labelValues...)) + } +} + +// UpdateTelegraf update telegraf metric +func (c *TextMetricsInfoCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *common.NpuCollector, + containerMap map[int32]container.DevicesInfo, chips []common.HuaWeiAIChip) map[string]map[string]interface{} { + logger.Debug("TextMetricsInfoCollector UpdateTelegraf") 
// IsSupported reports whether this collector should be used by returning the
// package-level isSupported flag. The parameter n is not consulted.
func (c *TextMetricsInfoCollector) IsSupported(n *common.NpuCollector) bool {
	return isSupported
}
// RegisterPlugin is the registration hook for custom plugin collectors.
// Its body is currently empty, so calling it registers nothing.
func RegisterPlugin() {

}
+} + +// logf logs with specified level and format +func (c *generalLogger) logf(ctx context.Context, level Level, format string, args ...interface{}) { + fn, ok := logfFuncs[level] + if !ok { + hwlog.RunLog.Warnf("unknown log level: %v", level) + return + } + + fn(hwlog.DeepIncrease(ctx), format, args...) +} + +func (c *generalLogger) logfWithOptions(ctx context.Context, level Level, opts LogOptions, format string, + args ...interface{}) { + + if opts.MaxCounts == 0 { + opts.MaxCounts = hwlog.ProblemOccurMaxNumbers + } + + if needPrint, extraErrLog := hwlog.IsNeedPrintWithSpecifiedCounts(opts.Domain, opts.ID, opts.MaxCounts); needPrint { + format = fmt.Sprintf("%s %s", format, extraErrLog) + fn, ok := logfFuncs[level] + if !ok { + hwlog.RunLog.Warnf("unknown log level: %v", level) + return + } + + fn(hwlog.DeepIncrease(ctx), format, args...) + } +} diff --git a/mind-cluster/component/npu-exporter/utils/logger/logger.go b/mind-cluster/component/npu-exporter/utils/logger/logger.go new file mode 100644 index 0000000..723e070 --- /dev/null +++ b/mind-cluster/component/npu-exporter/utils/logger/logger.go @@ -0,0 +1,174 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package logger for general collector +package logger + +import ( + "context" + "errors" + "fmt" + + "github.com/influxdata/telegraf" + + "ascend-common/common-utils/hwlog" +) + +// the method mapping table (avoid rebuilding with every call) +var ( + logFuncs = map[Level]logFunc{} + logfFuncs = map[Level]logfFunc{} +) + +const ( + // DebugLevel Debug level + DebugLevel Level = iota - 1 + // InfoLevel Info level + InfoLevel + // WarnLevel Warn level + WarnLevel + // ErrorLevel Error level + ErrorLevel + + // PrometheusPlatform Prometheus platform + PrometheusPlatform = "Prometheus" + // TelegrafPlatform Telegraf platform + TelegrafPlatform = "Telegraf" +) + +// HwLogConfig default log file +var HwLogConfig = &hwlog.LogConfig{ + LogFileName: defaultLogFile, + ExpiredTime: hwlog.DefaultExpiredTime, + CacheSize: hwlog.DefaultCacheSize, + MaxLineLength: maxLogLineLength, +} + +// Level log level +type Level int + +// logFunc log function +type logFunc func(ctx context.Context, args ...interface{}) + +// logfFunc logf function +type logfFunc func(ctx context.Context, format string, args ...interface{}) + +var ( + // logger Unified log printer + logger UnifiedLogger +) + +// InitLogger initialize the log manager +func InitLogger(platform string) error { + + if platform == TelegrafPlatform { + logger = &telegrafLogger{} + HwLogConfig.LogFileName = defaultTelegrafLogPath + HwLogConfig.OnlyToFile = true + } else if platform == PrometheusPlatform { + logger = &generalLogger{} + } else { + return errors.New("platform is not supported:" + platform) + } + + if err := hwlog.InitRunLogger(HwLogConfig, context.Background()); err != nil { + fmt.Printf("hwlog init failed, error is %v\n", err) + return err + } + + logFuncs = map[Level]logFunc{ + DebugLevel: hwlog.RunLog.DebugWithCtx, + InfoLevel: hwlog.RunLog.InfoWithCtx, + WarnLevel: hwlog.RunLog.WarnWithCtx, + ErrorLevel: hwlog.RunLog.ErrorWithCtx, + } + + logfFuncs = map[Level]logfFunc{ + DebugLevel: 
hwlog.RunLog.DebugfWithCtx, + InfoLevel: hwlog.RunLog.InfofWithCtx, + WarnLevel: hwlog.RunLog.WarnfWithCtx, + ErrorLevel: hwlog.RunLog.ErrorfWithCtx, + } + return nil +} + +// LogOptions options for log +type LogOptions struct { + Domain string + ID interface{} + MaxCounts int +} + +// Config config for telegraf +type Config struct { + Acc telegraf.Accumulator +} + +// UnifiedLogger unified logger interface +type UnifiedLogger interface { + dynamicConfigure(Config) + log(ctx context.Context, level Level, args ...interface{}) + logf(ctx context.Context, level Level, format string, args ...interface{}) + logfWithOptions(ctx context.Context, level Level, opts LogOptions, format string, args ...interface{}) +} + +// Debug print log info with debug level +func Debug(args ...interface{}) { + logger.log(hwlog.DeepIncrease(context.Background()), DebugLevel, args...) +} + +// Info print log info with info level +func Info(args ...interface{}) { + logger.log(hwlog.DeepIncrease(context.Background()), InfoLevel, args...) +} + +// Warn print log info with warn level +func Warn(args ...interface{}) { + logger.log(hwlog.DeepIncrease(context.Background()), WarnLevel, args...) +} + +// Error print log info with error level +func Error(args ...interface{}) { + logger.log(hwlog.DeepIncrease(context.Background()), ErrorLevel, args...) +} + +// Debugf print log info with debug level +func Debugf(format string, args ...interface{}) { + logger.logf(hwlog.DeepIncrease(context.Background()), DebugLevel, format, args...) +} + +// Infof print log info with info level +func Infof(format string, args ...interface{}) { + logger.logf(hwlog.DeepIncrease(context.Background()), InfoLevel, format, args...) +} + +// Warnf print log info with warn level +func Warnf(format string, args ...interface{}) { + logger.logf(hwlog.DeepIncrease(context.Background()), WarnLevel, format, args...) 
// LogfWithOptions logs a formatted message at the caller-supplied level through
// the active logger, forwarding opts (Domain, ID, MaxCounts) to that logger's
// logfWithOptions implementation.
func LogfWithOptions(level Level, opts LogOptions, format string, args ...interface{}) {
	logger.logfWithOptions(hwlog.DeepIncrease(context.Background()), level, opts, format, args...)
}
+*/ + +// Package logger for general collector +package logger + +import ( + "errors" + "testing" + + "ascend-common/common-utils/hwlog" +) + +// TestInitLogger tests the InitLogger function +func TestInitLogger(t *testing.T) { + tests := []struct { + name string + platform string + expected error + }{ + { + name: "Telegraf Platform", + platform: TelegrafPlatform, + expected: nil, + }, + { + name: "Prometheus Platform", + platform: PrometheusPlatform, + expected: nil, + }, + { + name: "Unsupported Platform", + platform: "Unsupported", + expected: errors.New("platform is not supported:Unsupported"), + }, + } + + HwLogConfig.LogLevel = 0 + HwLogConfig.MaxBackups = hwlog.DefaultMaxBackups + HwLogConfig.LogFileName = defaultLogFile + HwLogConfig.MaxAge = hwlog.DefaultMinSaveAge + + var noExistLevel Level = 5 + var args = "mock" + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := InitLogger(tt.platform) + if tt.expected == nil && err != nil { + t.Errorf("InitLogger(%s) = %v, want %v", tt.platform, err, tt.expected) + } else if tt.expected != nil && err.Error() != tt.expected.Error() { + t.Errorf("InitLogger(%s) = %v, want %v", tt.platform, err, tt.expected) + } + + logger.log(nil, DebugLevel, args) + logger.log(nil, InfoLevel, args) + logger.log(nil, WarnLevel, args) + logger.log(nil, noExistLevel, args) + logger.logfWithOptions(nil, DebugLevel, LogOptions{}, "test logf with options %s", "arg") + + logger.logf(nil, DebugLevel, args) + logger.logf(nil, InfoLevel, args) + logger.logf(nil, WarnLevel, args) + logger.logf(nil, noExistLevel, args) + logger.logfWithOptions(nil, DebugLevel, LogOptions{}, "test logf with options %s", "arg") + + }) + } +} + +func TestLoggerMethods(t *testing.T) { + + tests := []struct { + name string + method func(...interface{}) + level Level + args []interface{} + }{ + {"test Debug", Debug, DebugLevel, []interface{}{"debug message"}}, + {"test Info", Info, InfoLevel, []interface{}{"info message"}}, + {"test Warn", Warn, 
WarnLevel, []interface{}{"warn message"}}, + {"test Error", Error, ErrorLevel, []interface{}{"error message"}}, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + test.method(test.args...) + }) + } + + testsF := []struct { + name string + method func(string, ...interface{}) + level Level + format string + args []interface{} + }{ + {"test Debugf", Debugf, DebugLevel, "debug message %d", []interface{}{1}}, + {"test Infof", Infof, InfoLevel, "info message %d", []interface{}{1}}, + {"test Warnf", Warnf, WarnLevel, "warn message %d", []interface{}{1}}, + {"test Errorf", Errorf, ErrorLevel, "error message %d", []interface{}{1}}, + } + + for _, test := range testsF { + t.Run(test.name, func(t *testing.T) { + test.method(test.format, test.args...) + }) + } +} diff --git a/mind-cluster/component/npu-exporter/utils/logger/telegraf_logger.go b/mind-cluster/component/npu-exporter/utils/logger/telegraf_logger.go new file mode 100644 index 0000000..56c2ac5 --- /dev/null +++ b/mind-cluster/component/npu-exporter/utils/logger/telegraf_logger.go @@ -0,0 +1,82 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package logger for general collector +package logger + +import ( + "context" + "errors" + "fmt" + "strings" + + "github.com/influxdata/telegraf" + + "ascend-common/common-utils/hwlog" +) + +var defaultTelegrafLogPath = "/var/log/mindx-dl/npu-exporter/npu-plugin.log" +var dangerousChars = map[string]string{ + "\n": "\\n", + "\r": "\\r", + "\t": "\\t", +} + +type telegrafLogger struct { + acc telegraf.Accumulator +} + +// dynamicConfigure configures the logger +func (c *telegrafLogger) dynamicConfigure(config Config) { + c.acc = config.Acc +} + +// log logs with specified level +func (c *telegrafLogger) log(ctx context.Context, level Level, args ...interface{}) { + c.logf(hwlog.DeepIncrease(ctx), level, "%s", args...) +} + +// logf logs with specified level and format +func (c *telegrafLogger) logf(ctx context.Context, level Level, format string, args ...interface{}) { + sanitized := format + for char, replacement := range dangerousChars { + sanitized = strings.ReplaceAll(sanitized, char, replacement) + } + if level < InfoLevel || c.acc == nil { + fn, ok := logfFuncs[level] + if !ok { + hwlog.RunLog.Warnf("unknown log level: %v", level) + return + } + + fn(hwlog.DeepIncrease(ctx), sanitized, args...) + return + } + + c.acc.AddError(errors.New(fmt.Sprintf(sanitized, args...))) +} + +// LogfWithOptions print log info with options +func (c *telegrafLogger) logfWithOptions(ctx context.Context, level Level, opts LogOptions, format string, + args ...interface{}) { + + if opts.MaxCounts == 0 { + opts.MaxCounts = hwlog.ProblemOccurMaxNumbers + } + + if needPrint, extraErrLog := hwlog.IsNeedPrintWithSpecifiedCounts(opts.Domain, opts.ID, opts.MaxCounts); needPrint { + format = fmt.Sprintf("%s %s", format, extraErrLog) + c.logf(hwlog.DeepIncrease(ctx), level, format, args...) 
+ } +} diff --git a/mind-cluster/component/npu-exporter/utils/utils.go b/mind-cluster/component/npu-exporter/utils/utils.go new file mode 100644 index 0000000..b5da97c --- /dev/null +++ b/mind-cluster/component/npu-exporter/utils/utils.go @@ -0,0 +1,52 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package utils for common utils +package utils + +import ( + "strings" + + "github.com/prometheus/client_golang/prometheus" +) + +// GetDescName parse metrics name from prometheus.Desc object +func GetDescName(desc *prometheus.Desc) string { + if desc == nil { + return "" + } + str := desc.String() + startIndex := strings.Index(str, "fqName: ") + if startIndex == -1 { + return "" + } + readfqName := str[startIndex+len("fqName: "):] + + endIndex := strings.Index(readfqName, ",") + if endIndex == -1 { + return "" + } + readfqName = readfqName[:endIndex] + + readfqName = strings.Trim(readfqName, "\"") + return readfqName +} + +// DoUpdateTelegraf update telegraf +func DoUpdateTelegraf(fieldMap map[string]interface{}, desc *prometheus.Desc, value interface{}, extInfo string) { + if fieldMap == nil { + return + } + fieldMap[GetDescName(desc)+extInfo] = value +} diff --git a/mind-cluster/component/npu-exporter/utils/utils_test.go b/mind-cluster/component/npu-exporter/utils/utils_test.go new file mode 100644 index 0000000..1a91d29 --- /dev/null +++ b/mind-cluster/component/npu-exporter/utils/utils_test.go @@ 
-0,0 +1,103 @@ +/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package utils for common utils +package utils + +import ( + "testing" + + "github.com/agiledragon/gomonkey/v2" + "github.com/prometheus/client_golang/prometheus" + "github.com/smartystreets/goconvey/convey" +) + +const ( + emptyString = "" + testMetricName = "test_metric" + testMetricName2 = "another_metric" + invalidDescStr = "invalid description" + noCommaDescStr = "fqName: test_metric" + normalDescStr = `fqName: "test_metric", help: "test help"` + normalDescStr2 = `fqName: another_metric, help: "another help"` + noQuoteDescStr = `fqName: test_metric, help: "test help"` + testHelp = "test help" +) + +func TestGetDescName(t *testing.T) { + convey.Convey("should return empty string when desc is nil", t, testGetDescNameNil) + convey.Convey("should return empty string when desc.String does not contain fqName prefix", t, + testGetDescNameNoFqName) + convey.Convey("should return empty string when desc.String does not contain comma", t, + testGetDescNameNoComma) + convey.Convey("should return metric name when desc.String contains valid format", t, + testGetDescNameValidFormat) +} + +func testGetDescNameNil() { + result := GetDescName(nil) + convey.So(result, convey.ShouldEqual, emptyString) +} + +func testGetDescNameNoFqName() { + desc := prometheus.NewDesc(testMetricName, testHelp, nil, nil) + patch := gomonkey.ApplyMethodReturn(desc, 
"String", invalidDescStr) + defer patch.Reset() + + result := GetDescName(desc) + convey.So(result, convey.ShouldEqual, emptyString) +} + +func testGetDescNameNoComma() { + desc := prometheus.NewDesc(testMetricName, testHelp, nil, nil) + patch := gomonkey.ApplyMethodReturn(desc, "String", noCommaDescStr) + defer patch.Reset() + + result := GetDescName(desc) + convey.So(result, convey.ShouldEqual, emptyString) +} + +func testGetDescNameValidFormat() { + testCases := []struct { + name string + descStr string + expected string + }{ + { + name: "should return metric name when desc.String contains normal format with quotes", + descStr: normalDescStr, + expected: testMetricName, + }, + { + name: "should return metric name when desc.String contains normal format without quotes", + descStr: noQuoteDescStr, + expected: testMetricName, + }, + { + name: "should return correct metric name when desc.String contains another metric", + descStr: normalDescStr2, + expected: testMetricName2, + }, + } + + for _, tc := range testCases { + desc := prometheus.NewDesc(testMetricName, testHelp, nil, nil) + patch := gomonkey.ApplyMethodReturn(desc, "String", tc.descStr) + + result := GetDescName(desc) + convey.So(result, convey.ShouldEqual, tc.expected) + + patch.Reset() + } +} diff --git a/mind-cluster/component/npu-exporter/versions/version.go b/mind-cluster/component/npu-exporter/versions/version.go new file mode 100644 index 0000000..63dba00 --- /dev/null +++ b/mind-cluster/component/npu-exporter/versions/version.go @@ -0,0 +1,23 @@ +/* Copyright(C) 2021. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package versions record the program version +package versions + +var ( + // BuildVersion record the program build version + BuildVersion string + // BuildName record the program build name + BuildName string +) From b5aae0806b42aec8690e6a7f44f1eec5101aa6a3 Mon Sep 17 00:00:00 2001 From: daniel1210 <8622091+daniel1210@user.noreply.gitee.com> Date: Tue, 20 Jan 2026 18:51:34 +0800 Subject: [PATCH 06/10] fix go.mod Signed-off-by: daniel1210 <8622091+daniel1210@user.noreply.gitee.com> Signed-off-by: ashergaga <1214443299@qq.com> --- go.mod | 1 - 1 file changed, 1 deletion(-) diff --git a/go.mod b/go.mod index 6ea6f74..d04534c 100644 --- a/go.mod +++ b/go.mod @@ -7,7 +7,6 @@ require ( github.com/Project-HAMi/HAMi v0.0.0 github.com/fsnotify/fsnotify v1.7.0 google.golang.org/grpc v1.63.2 - huawei.com/npu-exporter/v6 v6.0.0-RC3.b001 k8s.io/api v0.29.3 k8s.io/apimachinery v0.29.3 k8s.io/klog/v2 v2.120.1 From 6a9c33f9a6554cd54b05f68e05aca3df9c34976b Mon Sep 17 00:00:00 2001 From: daniel1210 <8622091+daniel1210@user.noreply.gitee.com> Date: Wed, 21 Jan 2026 15:06:10 +0800 Subject: [PATCH 07/10] remove mind-cluster Signed-off-by: daniel1210 <8622091+daniel1210@user.noreply.gitee.com> Signed-off-by: ashergaga <1214443299@qq.com> --- .../component/ascend-common/README.md | 8 - .../ascend-common/api/ascend-operator/LICENSE | 201 -- .../api/ascend-operator/README.md | 164 -- .../apis/batch/v1/ascendjob_types.go | 85 - .../apis/batch/v1/constants.go | 53 - .../ascend-operator/apis/batch/v1/defaults.go | 137 - .../ascend-operator/apis/batch/v1/register.go | 52 - 
.../apis/batch/v1/zz_generated.deepcopy.go | 137 - .../apis/batch/v1/zz_generated.defaults.go | 53 - .../client/clientset/versioned/clientset.go | 114 - .../clientset/versioned/scheme/register.go | 39 - .../versioned/typed/batch/v1/client.go | 110 - .../clientset/versioned/typed/batch/v1/job.go | 221 -- .../externalversions/batch/interface.go | 49 - .../externalversions/batch/v1/interface.go | 48 - .../externalversions/batch/v1/job.go | 99 - .../informers/externalversions/factory.go | 207 -- .../informers/externalversions/generic.go | 71 - .../internalinterfaces/factory_interfaces.go | 40 - .../listers/batch/v1/expansion_generated.go | 26 - .../client/listers/batch/v1/job.go | 108 - .../component/ascend-common/api/consts.go | 222 -- .../ascend-common/api/default_name.go | 188 -- .../ascend-common/api/publicfault.go | 32 - .../ascend-common/api/slownet/fault_net.go | 77 - .../ascend-common/api/superpoddevice.go | 36 - .../component/ascend-common/api/type.go | 30 - .../common-utils/cache/lrucache.go | 394 --- .../common-utils/cache/lrucache_test.go | 304 --- .../ascend-common/common-utils/hwlog/api.go | 310 --- .../common-utils/hwlog/api_test.go | 165 -- .../common-utils/hwlog/hwlog_adaptor.go | 174 -- .../common-utils/hwlog/hwlog_adaptor_test.go | 126 - .../common-utils/hwlog/log_limiter.go | 156 -- .../common-utils/hwlog/logger.go | 242 -- .../common-utils/hwlog/logger_test.go | 217 -- .../ascend-common/common-utils/hwlog/rolog.go | 447 ---- .../common-utils/hwlog/rolog_test.go | 687 ----- .../ascend-common/common-utils/hwlog/types.go | 49 - .../ascend-common/common-utils/hwlog/utils.go | 98 - .../common-utils/hwlog/utils_test.go | 38 - .../common-utils/limiter/limit_handler.go | 226 -- .../limiter/limit_handler_test.go | 119 - .../common-utils/limiter/limit_listener.go | 161 -- .../limiter/limit_listener_test.go | 125 - .../common-utils/limiter/limit_writer.go | 64 - .../common-utils/limiter/limit_writer_test.go | 37 - .../common-utils/rand/rand_linux.go | 71 - 
.../common-utils/rand/rand_linux_test.go | 54 - .../ascend-common/common-utils/rand/random.go | 28 - .../common-utils/rand/random_test.go | 32 - .../ascend-common/common-utils/utils/env.go | 35 - .../common-utils/utils/env_test.go | 51 - .../ascend-common/common-utils/utils/file.go | 176 -- .../common-utils/utils/file_check.go | 240 -- .../common-utils/utils/file_check_test.go | 194 -- .../common-utils/utils/file_test.go | 169 -- .../common-utils/utils/file_watcher.go | 85 - .../common-utils/utils/file_watcher_test.go | 81 - .../common-utils/utils/interface.go | 29 - .../common-utils/utils/interface_test.go | 36 - .../common-utils/utils/ip_utils.go | 98 - .../common-utils/utils/ip_utils_test.go | 182 -- .../ascend-common/common-utils/utils/path.go | 382 --- .../common-utils/utils/path_test.go | 232 -- .../common-utils/utils/pwd_util.go | 75 - .../common-utils/utils/pwd_util_test.go | 59 - .../ascend-common/common-utils/utils/slice.go | 129 - .../common-utils/utils/slice_test.go | 536 ---- .../common-utils/utils/strings.go | 75 - .../common-utils/utils/strings_test.go | 84 - .../ascend-common/devmanager/a310mgr.go | 25 - .../ascend-common/devmanager/a310pmgr.go | 35 - .../ascend-common/devmanager/a910mgr.go | 31 - .../devmanager/common/constants.go | 272 -- .../ascend-common/devmanager/common/types.go | 435 ---- .../ascend-common/devmanager/common/utils.go | 305 --- .../devmanager/common/utils_test.go | 163 -- .../devmanager/dcmi/constants.go | 78 - .../ascend-common/devmanager/dcmi/dcmi.go | 2213 ----------------- .../devmanager/dcmi/dcmi_interface_api.h | 596 ----- .../ascend-common/devmanager/devmanager.go | 1197 --------- .../devmanager/devmanager_910a3_mock.go | 30 - .../devmanager/devmanager_910a3_mock_err.go | 43 - .../devmanager/devmanager_hccs_test.go | 166 -- .../devmanager/devmanager_mock.go | 370 --- .../devmanager/devmanager_mock_err.go | 369 --- .../devmanager/devmanager_test.go | 78 - .../devmanager/hccn/hccn_tool.go | 335 --- 
.../devmanager/hccn/hccn_tool_test.go | 49 - mind-cluster/component/ascend-common/go.mod | 55 - mind-cluster/component/ascend-common/go.sum | 492 ---- .../component/npu-exporter/.gitignore | 1 - mind-cluster/component/npu-exporter/LICENSE | 201 -- mind-cluster/component/npu-exporter/README.md | 42 - .../component/npu-exporter/build/Dockerfile | 21 - .../npu-exporter/build/Dockerfile-310P-1usoc | 31 - .../component/npu-exporter/build/build.sh | 80 - .../component/npu-exporter/build/build_ch.sh | 74 - .../build/metricConfiguration.json | 13 - .../build/npu-exporter-310P-1usoc.yaml | 167 -- .../npu-exporter/build/npu-exporter.yaml | 140 -- .../build/pluginConfiguration.json | 4 - .../npu-exporter/build/run_for_310P_1usoc.sh | 32 - .../component/npu-exporter/build/test.sh | 75 - .../npu-exporter/cmd/npu-exporter/main.go | 545 ---- .../common/collector_for_container.go | 109 - .../common/collector_for_container_test.go | 137 - .../collector/common/constants.go | 140 -- .../collector/common/metrics_collector.go | 192 -- .../common/metrics_collector_test.go | 231 -- .../collector/common/npu_collector.go | 423 ---- .../collector/common/npu_collector_test.go | 547 ---- .../npu-exporter/collector/common/types.go | 50 - .../collector/config/metrics_config.go | 208 -- .../collector/config/metrics_config_test.go | 216 -- .../collector/container/isula/isula_api.pb.go | 870 ------- .../collector/container/isula/isula_api.proto | 118 - .../container/isula/isula_api_grpc.pb.go | 107 - .../container/isula/isula_container.go | 39 - .../collector/container/isula/isulad.pb.go | 278 --- .../collector/container/isula/isulad.proto | 35 - .../container/isula/isulad_grpc.pb.go | 105 - .../collector/container/parser.go | 630 ----- .../collector/container/parser_test.go | 1027 -------- .../collector/container/runtime_ops.go | 413 --- .../collector/container/runtime_ops_test.go | 568 ----- .../npu-exporter/collector/container/utils.go | 133 - .../collector/container/utils_test.go | 329 --- 
.../collector/container/v1/containerd.pb.go | 310 --- .../collector/container/v1/containerd.proto | 62 - .../collector/container/v1/spec.go | 59 - .../collector/metrics/collector_for_ddr.go | 142 -- .../collector/metrics/collector_for_hbm.go | 228 -- .../metrics/collector_for_hbm_test.go | 115 - .../collector/metrics/collector_for_hccs.go | 312 --- .../metrics/collector_for_hccs_test.go | 150 -- .../metrics/collector_for_network.go | 190 -- .../collector/metrics/collector_for_npu.go | 453 ---- .../metrics/collector_for_optical.go | 200 -- .../collector/metrics/collector_for_pcie.go | 234 -- .../collector/metrics/collector_for_roce.go | 263 -- .../collector/metrics/collector_for_sio.go | 120 - .../metrics/collector_for_version.go | 56 - .../collector/metrics/collector_for_vnpu.go | 169 -- .../metrics/collector_for_vnpu_test.go | 202 -- .../collector/metrics/collector_test.go | 548 ---- .../collector/metrics/common_utils.go | 193 -- .../collector/metrics/common_utils_test.go | 165 -- .../collector/testdata/prometheus_metrics | 166 -- .../collector/testdata/prometheus_metrics2 | 6 - mind-cluster/component/npu-exporter/go.mod | 63 - mind-cluster/component/npu-exporter/go.sum | 561 ----- .../npu-exporter/platforms/inputs/all/npu.go | 20 - .../platforms/inputs/npu/README.md | 107 - .../npu-exporter/platforms/inputs/npu/npu.go | 104 - .../platforms/inputs/npu/npu_test.go | 174 -- .../platforms/inputs/npu/sample.conf | 9 - .../platforms/prom/prometheus_collector.go | 103 - .../prom/prometheus_collector_test.go | 159 -- .../component/npu-exporter/plugins/README.md | 388 --- .../plugins/collector_for_text_file.go | 358 --- .../npu-exporter/plugins/register.go | 21 - .../utils/logger/general_logger.go | 76 - .../npu-exporter/utils/logger/logger.go | 174 -- .../npu-exporter/utils/logger/logger_test.go | 119 - .../utils/logger/telegraf_logger.go | 82 - .../component/npu-exporter/utils/utils.go | 52 - .../npu-exporter/utils/utils_test.go | 103 - 
.../npu-exporter/versions/version.go | 23 - 170 files changed, 32586 deletions(-) delete mode 100644 mind-cluster/component/ascend-common/README.md delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/LICENSE delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/README.md delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/ascendjob_types.go delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/constants.go delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/defaults.go delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/register.go delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/zz_generated.deepcopy.go delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/zz_generated.defaults.go delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/clientset.go delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/scheme/register.go delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/typed/batch/v1/client.go delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/typed/batch/v1/job.go delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/interface.go delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/v1/interface.go delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/v1/job.go delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/factory.go delete mode 100644 
mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/generic.go delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/internalinterfaces/factory_interfaces.go delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/listers/batch/v1/expansion_generated.go delete mode 100644 mind-cluster/component/ascend-common/api/ascend-operator/client/listers/batch/v1/job.go delete mode 100644 mind-cluster/component/ascend-common/api/consts.go delete mode 100644 mind-cluster/component/ascend-common/api/default_name.go delete mode 100644 mind-cluster/component/ascend-common/api/publicfault.go delete mode 100644 mind-cluster/component/ascend-common/api/slownet/fault_net.go delete mode 100644 mind-cluster/component/ascend-common/api/superpoddevice.go delete mode 100644 mind-cluster/component/ascend-common/api/type.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/cache/lrucache.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/cache/lrucache_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/api.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/api_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/hwlog_adaptor.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/hwlog_adaptor_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/log_limiter.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/logger.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/logger_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/rolog.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/rolog_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/types.go delete 
mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/utils.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/hwlog/utils_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/limiter/limit_handler.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/limiter/limit_handler_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/limiter/limit_listener.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/limiter/limit_listener_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/limiter/limit_writer.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/limiter/limit_writer_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/rand/rand_linux.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/rand/rand_linux_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/rand/random.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/rand/random_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/env.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/env_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/file.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/file_check.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/file_check_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/file_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/file_watcher.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/file_watcher_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/interface.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/interface_test.go delete 
mode 100644 mind-cluster/component/ascend-common/common-utils/utils/ip_utils.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/ip_utils_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/path.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/path_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/pwd_util.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/pwd_util_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/slice.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/slice_test.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/strings.go delete mode 100644 mind-cluster/component/ascend-common/common-utils/utils/strings_test.go delete mode 100644 mind-cluster/component/ascend-common/devmanager/a310mgr.go delete mode 100644 mind-cluster/component/ascend-common/devmanager/a310pmgr.go delete mode 100644 mind-cluster/component/ascend-common/devmanager/a910mgr.go delete mode 100644 mind-cluster/component/ascend-common/devmanager/common/constants.go delete mode 100644 mind-cluster/component/ascend-common/devmanager/common/types.go delete mode 100644 mind-cluster/component/ascend-common/devmanager/common/utils.go delete mode 100644 mind-cluster/component/ascend-common/devmanager/common/utils_test.go delete mode 100644 mind-cluster/component/ascend-common/devmanager/dcmi/constants.go delete mode 100644 mind-cluster/component/ascend-common/devmanager/dcmi/dcmi.go delete mode 100644 mind-cluster/component/ascend-common/devmanager/dcmi/dcmi_interface_api.h delete mode 100644 mind-cluster/component/ascend-common/devmanager/devmanager.go delete mode 100644 mind-cluster/component/ascend-common/devmanager/devmanager_910a3_mock.go delete mode 100644 mind-cluster/component/ascend-common/devmanager/devmanager_910a3_mock_err.go delete mode 100644 
mind-cluster/component/ascend-common/devmanager/devmanager_hccs_test.go delete mode 100644 mind-cluster/component/ascend-common/devmanager/devmanager_mock.go delete mode 100644 mind-cluster/component/ascend-common/devmanager/devmanager_mock_err.go delete mode 100644 mind-cluster/component/ascend-common/devmanager/devmanager_test.go delete mode 100644 mind-cluster/component/ascend-common/devmanager/hccn/hccn_tool.go delete mode 100644 mind-cluster/component/ascend-common/devmanager/hccn/hccn_tool_test.go delete mode 100644 mind-cluster/component/ascend-common/go.mod delete mode 100644 mind-cluster/component/ascend-common/go.sum delete mode 100644 mind-cluster/component/npu-exporter/.gitignore delete mode 100644 mind-cluster/component/npu-exporter/LICENSE delete mode 100644 mind-cluster/component/npu-exporter/README.md delete mode 100644 mind-cluster/component/npu-exporter/build/Dockerfile delete mode 100644 mind-cluster/component/npu-exporter/build/Dockerfile-310P-1usoc delete mode 100644 mind-cluster/component/npu-exporter/build/build.sh delete mode 100644 mind-cluster/component/npu-exporter/build/build_ch.sh delete mode 100644 mind-cluster/component/npu-exporter/build/metricConfiguration.json delete mode 100644 mind-cluster/component/npu-exporter/build/npu-exporter-310P-1usoc.yaml delete mode 100644 mind-cluster/component/npu-exporter/build/npu-exporter.yaml delete mode 100644 mind-cluster/component/npu-exporter/build/pluginConfiguration.json delete mode 100644 mind-cluster/component/npu-exporter/build/run_for_310P_1usoc.sh delete mode 100644 mind-cluster/component/npu-exporter/build/test.sh delete mode 100644 mind-cluster/component/npu-exporter/cmd/npu-exporter/main.go delete mode 100644 mind-cluster/component/npu-exporter/collector/common/collector_for_container.go delete mode 100644 mind-cluster/component/npu-exporter/collector/common/collector_for_container_test.go delete mode 100644 mind-cluster/component/npu-exporter/collector/common/constants.go delete mode 
100644 mind-cluster/component/npu-exporter/collector/common/metrics_collector.go delete mode 100644 mind-cluster/component/npu-exporter/collector/common/metrics_collector_test.go delete mode 100644 mind-cluster/component/npu-exporter/collector/common/npu_collector.go delete mode 100644 mind-cluster/component/npu-exporter/collector/common/npu_collector_test.go delete mode 100644 mind-cluster/component/npu-exporter/collector/common/types.go delete mode 100644 mind-cluster/component/npu-exporter/collector/config/metrics_config.go delete mode 100644 mind-cluster/component/npu-exporter/collector/config/metrics_config_test.go delete mode 100644 mind-cluster/component/npu-exporter/collector/container/isula/isula_api.pb.go delete mode 100644 mind-cluster/component/npu-exporter/collector/container/isula/isula_api.proto delete mode 100644 mind-cluster/component/npu-exporter/collector/container/isula/isula_api_grpc.pb.go delete mode 100644 mind-cluster/component/npu-exporter/collector/container/isula/isula_container.go delete mode 100644 mind-cluster/component/npu-exporter/collector/container/isula/isulad.pb.go delete mode 100644 mind-cluster/component/npu-exporter/collector/container/isula/isulad.proto delete mode 100644 mind-cluster/component/npu-exporter/collector/container/isula/isulad_grpc.pb.go delete mode 100644 mind-cluster/component/npu-exporter/collector/container/parser.go delete mode 100644 mind-cluster/component/npu-exporter/collector/container/parser_test.go delete mode 100644 mind-cluster/component/npu-exporter/collector/container/runtime_ops.go delete mode 100644 mind-cluster/component/npu-exporter/collector/container/runtime_ops_test.go delete mode 100644 mind-cluster/component/npu-exporter/collector/container/utils.go delete mode 100644 mind-cluster/component/npu-exporter/collector/container/utils_test.go delete mode 100644 mind-cluster/component/npu-exporter/collector/container/v1/containerd.pb.go delete mode 100644 
mind-cluster/component/npu-exporter/collector/container/v1/containerd.proto delete mode 100644 mind-cluster/component/npu-exporter/collector/container/v1/spec.go delete mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_ddr.go delete mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_hbm.go delete mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_hbm_test.go delete mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_hccs.go delete mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_hccs_test.go delete mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_network.go delete mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_npu.go delete mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_optical.go delete mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_pcie.go delete mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_roce.go delete mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_sio.go delete mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_version.go delete mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_vnpu.go delete mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_for_vnpu_test.go delete mode 100644 mind-cluster/component/npu-exporter/collector/metrics/collector_test.go delete mode 100644 mind-cluster/component/npu-exporter/collector/metrics/common_utils.go delete mode 100644 mind-cluster/component/npu-exporter/collector/metrics/common_utils_test.go delete mode 100644 mind-cluster/component/npu-exporter/collector/testdata/prometheus_metrics delete mode 100644 mind-cluster/component/npu-exporter/collector/testdata/prometheus_metrics2 delete mode 100644 
mind-cluster/component/npu-exporter/go.mod delete mode 100644 mind-cluster/component/npu-exporter/go.sum delete mode 100644 mind-cluster/component/npu-exporter/platforms/inputs/all/npu.go delete mode 100644 mind-cluster/component/npu-exporter/platforms/inputs/npu/README.md delete mode 100644 mind-cluster/component/npu-exporter/platforms/inputs/npu/npu.go delete mode 100644 mind-cluster/component/npu-exporter/platforms/inputs/npu/npu_test.go delete mode 100644 mind-cluster/component/npu-exporter/platforms/inputs/npu/sample.conf delete mode 100644 mind-cluster/component/npu-exporter/platforms/prom/prometheus_collector.go delete mode 100644 mind-cluster/component/npu-exporter/platforms/prom/prometheus_collector_test.go delete mode 100644 mind-cluster/component/npu-exporter/plugins/README.md delete mode 100644 mind-cluster/component/npu-exporter/plugins/collector_for_text_file.go delete mode 100644 mind-cluster/component/npu-exporter/plugins/register.go delete mode 100644 mind-cluster/component/npu-exporter/utils/logger/general_logger.go delete mode 100644 mind-cluster/component/npu-exporter/utils/logger/logger.go delete mode 100644 mind-cluster/component/npu-exporter/utils/logger/logger_test.go delete mode 100644 mind-cluster/component/npu-exporter/utils/logger/telegraf_logger.go delete mode 100644 mind-cluster/component/npu-exporter/utils/utils.go delete mode 100644 mind-cluster/component/npu-exporter/utils/utils_test.go delete mode 100644 mind-cluster/component/npu-exporter/versions/version.go diff --git a/mind-cluster/component/ascend-common/README.md b/mind-cluster/component/ascend-common/README.md deleted file mode 100644 index fa7f1b8..0000000 --- a/mind-cluster/component/ascend-common/README.md +++ /dev/null @@ -1,8 +0,0 @@ -# AscendCommon - -# 组件介绍 -提供公共代码给其他组件使用,组件包括NPU-Exporter等。 - -# 说明 - -1. 
编译NPU-Exporter等组件时,AscendCommon要放在同一目录下 \ No newline at end of file diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/LICENSE b/mind-cluster/component/ascend-common/api/ascend-operator/LICENSE deleted file mode 100644 index 261eeb9..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. 
- - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. 
- - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. 
- - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/README.md b/mind-cluster/component/ascend-common/api/ascend-operator/README.md deleted file mode 100644 index 20c2f61..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/README.md +++ /dev/null @@ -1,164 +0,0 @@ -# ascend-aperator-apis - -## 介绍 - -ascend-aperator-apis旨在为用户提供AscendJob API,及其Clientsets, Listers、Informers。使用户能轻松对AscendJob进行CRUD操作。 - -## 接口说明 - -1. 创建clientsets - - ```go - NewForConfig(c *rest.Config)(*Clientset, error) - ``` - - | Parameters | Input/Output | Parameter Type | Description | - | ---------- | ------------ | -------------- | ------------------------------------------------------------ | - | c | Input | *rest.Config | 客户端配置文件,由k8s提供的接口生成。包括cluster host、证书等信息 | - | - | Output | *clientsets | Client集合,包括AscendJob client和discovery client | - | - | Output | error | 错误信息 | - -2. 创建AscendJob - - ```go - Create(ctx context.Context, job *v1.AscendJob, opts metav1.CreateOptions)(*v1.AscendJob, error) - ``` - - | Parameters | Input/Output | Parameter Type | Description | - | ---------- | ------------ | -------------------- | ----------------- | - | ctx | Input | context.Context | 上下文,协程控制 | - | job | Input | *v1.AscendJob | AscendJob对象指针 | - | opts | Input | metav1.CreateOptions | 创建选项 | - | - | Output | *v1.AscendJob | AscendJob对象指针 | - | - | Output | error | 错误信息 | - -3. 
获取AscendJob - - ```go - Get(ctx context.Context, name string, opts metav1.GetOptions)(*v1.AscendJob, error) - ``` - - | Parameters | Input/Output | Parameter Type | Description | - | ---------- | ------------ | -------------------- | ----------------- | - | ctx | Input | context.Context | 上下文,协程控制 | - | name | Input | string | AscendJob名称 | - | opts | Input | metav1.GetOptions | 获取选项 | - | - | Output | *v1.AscendJob | AscendJob对象指针 | - | - | Output | error | 错误信息 | - -4. 列举AscendJob - - ```go - List(ctx context.Context, opts metav1.ListOptions)(*v1.AscendJobList, error) - ``` - - | Parameters | Input/Output | Parameter Type | Description | - | ---------- | ------------ | -------------------- | --------------------- | - | ctx | Input | context.Context | 上下文,协程控制 | - | opts | Input | metav1.ListOptions | 列举选项 | - | - | Output | *v1.AscendJob | AscendJobList对象指针 | - | - | Output | error | 错误信息 | - -5. 观察AscendJob - - ```go - Watch((ctx context.Context, opts metav1.ListOptions)(watch.Interface, error) - ``` - - | Parameters | Input/Output | Parameter Type | Description | - | ---------- | ------------ | ------------------ | ---------------- | - | ctx | Input | context.Context | 上下文,协程控制 | - | opts | Input | metav1.ListOptions | 列举选项 | - | - | Output | watch.Interface | watch类接口 | - | - | Output | error | 错误信息 | - -6. 更新AscendJob - - ```go - Update(ctx context.Context, job *v1.AscendJob, opts metav1.UpdateOptions)(*v1.AscendJob, error) - ``` - - | Parameters | Input/Output | Parameter Type | Description | - | ---------- | ------------ | -------------------- | ----------------- | - | ctx | Input | context.Context | 上下文,协程控制 | - | job | Input | *v1.AscendJob | AscendJob对象指针 | - | opts | Input | metav1.UpdateOptions | 更新选项 | - | - | Output | *v1.AscendJob | AscendJob对象指针 | - | - | Output | error | 错误信息 | - -7. 
更新AscendJob状态 - - ```go - UpdateStatus(ctx context.Context, job *v1.AscendJob, opts metav1.UpdateOptions)(*v1.AscendJob, error) - ``` - - | Parameters | Input/Output | Parameter Type | Description | - | ---------- | ------------ | -------------------- | ----------------- | - | ctx | Input | context.Context | 上下文,协程控制 | - | job | Input | *v1.AscendJob | AscendJob对象指针 | - | opts | Input | metav1.UpdateOptions | 更新选项 | - | - | Output | *v1.AscendJob | AscendJob对象指针 | - | - | Output | error | 错误信息 | - -8. 补丁AscendJob - - ```go - Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions, subresources ...string) (*v1.AscendJob, error) - ``` - - | Parameters | Input/Output | Parameter Type | Description | - | ------------ | ------------ | --------------- | ----------------- | - | ctx | Input | context.Context | 上下文,协程控制 | - | name | Input | string | AscendJob名称 | - | pt | Input | types.PatchType | patch类型 | - | data | Input | []byte | patch信息 | - | subresources | Input | ...string | 子信息 | - | - | Output | *v1.AscendJob | AscendJob对象指针 | - | - | Output | error | 错误信息 | - -9. 删除AscendJob - - ```go - Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error - ``` - - | Parameters | Input/Output | Parameter Type | Description | - | ---------- | ------------ | -------------------- | ---------------- | - | ctx | Input | context.Context | 上下文,协程控制 | - | name | Input | string | AscendJob名称 | - | opts | Input | metav1.DeleteOptions | 删除选项 | - | - | Output | error | 错误信息 | - -10. 批量删除AscendJob - - ```go - DeleteCollection(ctx context.Context,opts metav1.DeleteOptions, listOpts metav1.ListOptions) error - ``` - - | Parameters | Input/Output | Parameter Type | Description | - | ---------- | ------------ | -------------------- | ---------------- | - | ctx | Input | context.Context | 上下文,协程控制 | - | opts | Input | metav1.DeleteOptions | 删除选项 | - | listOpts | Input | metav1.ListOptions | 列举选项 | - | - | Output | error | 错误信息 | - -11. 
创建informerFactory - - ```go - NewSharedInformerFactory(client versioned.Interface, defaultResync time.Duration) sharedInformerFactory - ``` - - | Parameters | Input/Output | Parameter Type | Description | - | ------------- | ------------ | --------------------- | ------------------ | - | client | Input | versioned.Interface | client类接口 | - | defaultResync | Input | time.Duration | 默认的重新同步时间 | - | - | Output | sharedInformerFactory | informer类接口 | - -12. 创建informer - - ```go - sharedInformerFactory.Batch().V1().Jobs().Informer() - ``` - - - diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/ascendjob_types.go b/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/ascendjob_types.go deleted file mode 100644 index 7bd1d65..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/ascendjob_types.go +++ /dev/null @@ -1,85 +0,0 @@ -/* -Copyright 2023 Huawei Technologies Co., Ltd. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// Package v1 is used to define AscendJob object and its initialization. -package v1 - -import ( - commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -// AscendJob is the Schema for the AscendJob API -type AscendJob struct { - // Standard Kubernetes type metadata. - metav1.TypeMeta `json:",inline"` - - // +optional - metav1.ObjectMeta `json:"metadata,omitempty"` - - // Specification of the desired state of the AscendJob. 
- // +optional - Spec AscendJobSpec `json:"spec,omitempty"` - - // Most recently observed status of the AscendJob. - // Populated by the system. - // Read-only. - // +optional - Status commonv1.JobStatus `json:"status,omitempty"` -} - -// AscendJobSpec defines the desired state of AscendJob -type AscendJobSpec struct { - // RunPolicy encapsulates various runtime policies of the distributed training - // job, for example how to clean up resources and how long the job can stay - // active. - // +kubebuilder:validation:Optional - RunPolicy commonv1.RunPolicy `json:"runPolicy"` - - // SuccessPolicy defines the policy to mark the AscendJob as succeeded. - // Default to "", using the default rules. - // +optional - SuccessPolicy *SuccessPolicy `json:"successPolicy,omitempty"` - - // SchedulerName defines the job scheduler with gang-scheduling enabled - SchedulerName string `json:"schedulerName,omitempty"` - - /* A map of ReplicaType (type) to ReplicaSpec (value). Specifies the ML cluster configuration. - For example, - { - "Scheduler": ReplacaSpec, - "Worker": ReplicaSpec, - } - */ - ReplicaSpecs map[commonv1.ReplicaType]*commonv1.ReplicaSpec `json:"replicaSpecs"` -} - -// AscendJobList contains a list of AscendJob -type AscendJobList struct { - metav1.TypeMeta `json:",inline"` - metav1.ListMeta `json:"metadata,omitempty"` - Items []AscendJob `json:"items"` -} - -// SuccessPolicy is the success policy. 
-type SuccessPolicy string - -const ( - // SuccessPolicyDefault is the default policy of success - SuccessPolicyDefault SuccessPolicy = "" - // SuccessPolicyAllWorkers is the 'ALLWorkers' policy of success - SuccessPolicyAllWorkers SuccessPolicy = "AllWorkers" -) diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/constants.go b/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/constants.go deleted file mode 100644 index 9341682..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/constants.go +++ /dev/null @@ -1,53 +0,0 @@ -/* -Copyright 2023 Huawei Technologies Co., Ltd. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package v1 - -import ( - "github.com/kubeflow/common/pkg/apis/common/v1" -) - -const ( - // GroupName is the group name used in this package. - GroupName = "mindxdl.gitee.com" - - // FrameworkKey the key of the laebl - FrameworkKey = "framework" - - // DefaultPort is default value of the port. 
- DefaultPort = 2222 - - // MindSporeFrameworkName is the name of ML Framework - MindSporeFrameworkName = "mindspore" - // MindSporeReplicaTypeScheduler is the type for Scheduler of distribute ML - MindSporeReplicaTypeScheduler v1.ReplicaType = "Scheduler" - - // PytorchFrameworkName is the name of ML Framework - PytorchFrameworkName = "pytorch" - // PytorchReplicaTypeMaster is the type for Scheduler of distribute ML - PytorchReplicaTypeMaster v1.ReplicaType = "Master" - - // TensorflowFrameworkName is the name of ML Framework - TensorflowFrameworkName = "tensorflow" - // TensorflowReplicaTypeChief is the type for Scheduler of distribute ML - TensorflowReplicaTypeChief v1.ReplicaType = "Chief" - - // ReplicaTypeWorker this is also used for non-distributed AscendJob - ReplicaTypeWorker v1.ReplicaType = "Worker" - - // DefaultRestartPolicy is default RestartPolicy for MSReplicaSpec. - DefaultRestartPolicy = v1.RestartPolicyNever -) diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/defaults.go b/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/defaults.go deleted file mode 100644 index 4d5c124..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/defaults.go +++ /dev/null @@ -1,137 +0,0 @@ -/* -Copyright 2023 Huawei Technologies Co., Ltd. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package v1 - -import ( - "errors" - "fmt" - "strings" - - commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" - "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/runtime" - - "ascend-common/api" -) - -// Int32 is a helper routine that allocates a new int32 value -// to store v and returns a pointer to it. -func Int32(v int32) *int32 { - return &v -} - -// addDefaultingFuncs is used to register default funcs -func addDefaultingFuncs(scheme *runtime.Scheme) error { - return RegisterDefaults(scheme) -} - -// setDefaultPort sets the default ports for mindxdl container. -func setDefaultPort(spec *v1.PodSpec) { - index := 0 - for i, container := range spec.Containers { - if container.Name == api.DefaultContainerName { - index = i - break - - } - } - hasASJobPort := false - for _, port := range spec.Containers[index].Ports { - if port.Name == api.DefaultPortName { - hasASJobPort = true - break - } - } - if !hasASJobPort { - spec.Containers[index].Ports = append(spec.Containers[index].Ports, v1.ContainerPort{ - Name: api.DefaultPortName, - ContainerPort: DefaultPort, - }) - } -} - -func setDefaultReplicas(spec *commonv1.ReplicaSpec) { - if spec.Replicas == nil { - spec.Replicas = Int32(1) - } - if spec.RestartPolicy == "" { - spec.RestartPolicy = DefaultRestartPolicy - } -} - -// setTypeNamesToCamelCase sets the name of all replica types from any case to correct case. -func setTypeNamesToCamelCase(job *AscendJob) { - setTypeNameToCamelCase(job, MindSporeReplicaTypeScheduler) - setTypeNameToCamelCase(job, ReplicaTypeWorker) - setTypeNameToCamelCase(job, PytorchReplicaTypeMaster) - setTypeNameToCamelCase(job, TensorflowReplicaTypeChief) -} - -// setTypeNameToCamelCase sets the name of the replica type from any case to correct case. -// E.g. from ps to PS; from WORKER to Worker. 
-func setTypeNameToCamelCase(job *AscendJob, typ commonv1.ReplicaType) { - for t := range job.Spec.ReplicaSpecs { - if strings.EqualFold(string(t), string(typ)) && t != typ { - spec := job.Spec.ReplicaSpecs[t] - delete(job.Spec.ReplicaSpecs, t) - job.Spec.ReplicaSpecs[typ] = spec - return - } - } -} - -// SetDefaultsAscendJob sets any unspecified values to defaults. -func SetDefaultsAscendJob(job *AscendJob) { - // Set default cleanpod policy to Running. - if job == nil { - return - } - - if job.Spec.RunPolicy.CleanPodPolicy == nil { - running := commonv1.CleanPodPolicyNone - job.Spec.RunPolicy.CleanPodPolicy = &running - } - // Set default success policy to "". - if job.Spec.SuccessPolicy == nil { - defaultPolicy := SuccessPolicyDefault - job.Spec.SuccessPolicy = &defaultPolicy - } - - // Update the key of replicaSpecs to camel case. - setTypeNamesToCamelCase(job) - - for rt, spec := range job.Spec.ReplicaSpecs { - // Set default replicas to 1. - setDefaultReplicas(spec) - // Set default port to ml container. - if rt == MindSporeReplicaTypeScheduler || rt == PytorchReplicaTypeMaster || rt == TensorflowReplicaTypeChief { - setDefaultPort(&spec.Template.Spec) - } - } -} - -// GetJobFramework get framework name of ascendjob -func GetJobFramework(job *AscendJob) (string, error) { - if job == nil || job.Labels == nil { - return "", errors.New("job or job labels is nil") - } - frame, ok := job.Labels[FrameworkKey] - if !ok { - return "", fmt.Errorf("job<%s-%s> label framework is not set", job.Namespace, job.Name) - } - return frame, nil -} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/register.go b/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/register.go deleted file mode 100644 index 5813e39..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/register.go +++ /dev/null @@ -1,52 +0,0 @@ -/* -Copyright 2023 Huawei Technologies Co., Ltd. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package v1 - -import ( - "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/schema" -) - -var ( - // SchemeGroupVersion is the group version used to register these objects. - SchemeGroupVersion = schema.GroupVersion{Group: GroupName, Version: "v1"} - // SchemeBuilder points to a list of functions added to Scheme. - SchemeBuilder = runtime.NewSchemeBuilder(addKnownTypes) - // AddToScheme adds the types in this group-version to the given scheme. - AddToScheme = SchemeBuilder.AddToScheme -) - -// Resource takes an unqualified resource and returns a Group-qualified GroupResource. -func Resource(resource string) schema.GroupResource { - return SchemeGroupVersion.WithResource(resource).GroupResource() -} - -// addKnownTypes adds the set of types defined in this package to the supplied scheme. 
-func addKnownTypes(scheme *runtime.Scheme) error { - scheme.AddKnownTypes(SchemeGroupVersion, - &AscendJob{}, - &AscendJobList{}, - ) - - v1.AddToGroupVersion(scheme, SchemeGroupVersion) - return nil -} - -func init() { - SchemeBuilder.Register(addDefaultingFuncs) -} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/zz_generated.deepcopy.go b/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/zz_generated.deepcopy.go deleted file mode 100644 index 695038b..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/zz_generated.deepcopy.go +++ /dev/null @@ -1,137 +0,0 @@ -//go:build !ignore_autogenerated -// +build !ignore_autogenerated - -/* -Copyright 2023 Huawei Technologies Co., Ltd. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// Code generated by controller-gen. DO NOT EDIT. - -package v1 - -import ( - commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" - "k8s.io/apimachinery/pkg/runtime" -) - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *AscendJob) DeepCopyInto(out *AscendJob) { - if in == nil { - return - } - *out = *in - out.TypeMeta = in.TypeMeta - in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) - in.Spec.DeepCopyInto(&out.Spec) - in.Status.DeepCopyInto(&out.Status) -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MSJob. 
-func (in *AscendJob) DeepCopy() *AscendJob { - if in == nil { - return nil - } - out := new(AscendJob) - in.DeepCopyInto(out) - return out -} - -// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. -func (in *AscendJob) DeepCopyObject() runtime.Object { - if in == nil { - return nil - } - if c := in.DeepCopy(); c != nil { - return c - } - return nil -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *AscendJobList) DeepCopyInto(out *AscendJobList) { - if in == nil { - return - } - *out = *in - out.TypeMeta = in.TypeMeta - in.ListMeta.DeepCopyInto(&out.ListMeta) - if in.Items != nil { - in, out := &in.Items, &out.Items - *out = make([]AscendJob, len(*in)) - for i := range *in { - (*in)[i].DeepCopyInto(&(*out)[i]) - } - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MSJobList. -func (in *AscendJobList) DeepCopy() *AscendJobList { - if in == nil { - return nil - } - out := new(AscendJobList) - in.DeepCopyInto(out) - return out -} - -// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. -func (in *AscendJobList) DeepCopyObject() runtime.Object { - if in == nil { - return nil - } - if c := in.DeepCopy(); c != nil { - return c - } - return nil -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
-func (in *AscendJobSpec) DeepCopyInto(out *AscendJobSpec) { - if in == nil { - return - } - *out = *in - in.RunPolicy.DeepCopyInto(&out.RunPolicy) - if in.SuccessPolicy != nil { - in, out := &in.SuccessPolicy, &out.SuccessPolicy - *out = new(SuccessPolicy) - **out = **in - } - if in.ReplicaSpecs != nil { - in, out := &in.ReplicaSpecs, &out.ReplicaSpecs - *out = make(map[commonv1.ReplicaType]*commonv1.ReplicaSpec, len(*in)) - for key, val := range *in { - var outVal *commonv1.ReplicaSpec - if val == nil { - (*out)[key] = nil - } else { - in, out := &val, &outVal - *out = new(commonv1.ReplicaSpec) - (*in).DeepCopyInto(*out) - } - (*out)[key] = outVal - } - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MSJobSpec. -func (in *AscendJobSpec) DeepCopy() *AscendJobSpec { - if in == nil { - return nil - } - out := new(AscendJobSpec) - in.DeepCopyInto(out) - return out -} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/zz_generated.defaults.go b/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/zz_generated.defaults.go deleted file mode 100644 index e9b774a..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/apis/batch/v1/zz_generated.defaults.go +++ /dev/null @@ -1,53 +0,0 @@ -//go:build !ignore_autogenerated -// +build !ignore_autogenerated - -/* -Copyright 2023 Huawei Technologies Co., Ltd. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -// Code generated by defaulter-gen. DO NOT EDIT. - -package v1 - -import ( - "errors" - - runtime "k8s.io/apimachinery/pkg/runtime" -) - -// RegisterDefaults adds defaulters functions to the given scheme. -// Public to allow building arbitrary schemes. -// All generated defaulters are covering - they call all nested defaulters. -func RegisterDefaults(scheme *runtime.Scheme) error { - if scheme == nil { - return errors.New("scheme is nil") - } - scheme.AddTypeDefaultingFunc(&AscendJob{}, func(obj interface{}) { SetObjectDefaults_AscendJob(obj.(*AscendJob)) }) - scheme.AddTypeDefaultingFunc(&AscendJobList{}, func(obj interface{}) { SetObjectDefaults_AscendJobList(obj.(*AscendJobList)) }) - return nil -} - -func SetObjectDefaults_AscendJob(in *AscendJob) { - SetDefaultsAscendJob(in) -} - -func SetObjectDefaults_AscendJobList(in *AscendJobList) { - if in == nil { - return - } - for i := range in.Items { - a := &in.Items[i] - SetObjectDefaults_AscendJob(a) - } -} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/clientset.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/clientset.go deleted file mode 100644 index 0d4add4..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/clientset.go +++ /dev/null @@ -1,114 +0,0 @@ -/* -Copyright 2023 Huawei Technologies Co., Ltd. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -// Package versioned is used to define the ClientSet interface and struct, and its initialization. -package versioned - -import ( - "fmt" - "net/http" - - "k8s.io/client-go/discovery" - "k8s.io/client-go/rest" - "k8s.io/client-go/util/flowcontrol" - - "ascend-common/api/ascend-operator/client/clientset/versioned/typed/batch/v1" -) - -// Interface is the interface definition for Clientset. -type Interface interface { - Discovery() discovery.DiscoveryInterface - BatchV1() v1.BatchV1Interface -} - -// Clientset contains the clients for groups. Each group has exactly one -// version included in a Clientset. -type Clientset struct { - *discovery.DiscoveryClient - batchV1 *v1.BatchV1Client -} - -// BatchV1 retrieves the BatchV1alpha1Client -func (c *Clientset) BatchV1() v1.BatchV1Interface { - if c == nil { - return nil - } - return c.batchV1 -} - -// Discovery retrieves the DiscoveryClient -func (c *Clientset) Discovery() discovery.DiscoveryInterface { - if c == nil { - return nil - } - return c.DiscoveryClient -} - -// NewForConfig creates a new Clientset for the given config. -// If config's RateLimiter is not set and QPS and Burst are acceptable, -// NewForConfig will generate a rate-limiter in configShallowCopy. -// NewForConfig is equivalent to NewForConfigAndClient(c, httpClient), -// where httpClient was generated with rest.HTTPClientFor(c). -func NewForConfig(c *rest.Config) (*Clientset, error) { - configShallowCopy := *c - - // share the transport between all clients - httpClient, err := rest.HTTPClientFor(&configShallowCopy) - if err != nil { - return nil, err - } - - return NewForConfigAndClient(&configShallowCopy, httpClient) -} - -// NewForConfigAndClient creates a new Clientset for the given config and http client. -// Note the http client provided takes precedence over the configured transport values. -// If config's RateLimiter is not set and QPS and Burst are acceptable, -// NewForConfigAndClient will generate a rate-limiter in configShallowCopy. 
-func NewForConfigAndClient(c *rest.Config, httpClient *http.Client) (*Clientset, error) { - if c == nil || httpClient == nil { - return nil, fmt.Errorf("nil pointer") - } - configShallowCopy := *c - if configShallowCopy.RateLimiter == nil && configShallowCopy.QPS > 0 { - if configShallowCopy.Burst <= 0 { - return nil, fmt.Errorf("burst is required to be greater than 0 " + - "when RateLimiter is not set and QPS is set to greater than 0") - } - configShallowCopy.RateLimiter = flowcontrol.NewTokenBucketRateLimiter(configShallowCopy.QPS, configShallowCopy.Burst) - } - - var cs Clientset - var err error - cs.batchV1, err = v1.NewForConfigAndClient(&configShallowCopy, httpClient) - if err != nil { - return nil, err - } - cs.DiscoveryClient, err = discovery.NewDiscoveryClientForConfigAndClient(&configShallowCopy, httpClient) - if err != nil { - return nil, err - } - return &cs, nil -} - -// New creates a new Clientset for the given RESTClient. -func New(c rest.Interface) *Clientset { - var cs Clientset - cs.batchV1 = v1.New(c) - - cs.DiscoveryClient = discovery.NewDiscoveryClient(c) - return &cs -} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/scheme/register.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/scheme/register.go deleted file mode 100644 index 58a99b0..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/scheme/register.go +++ /dev/null @@ -1,39 +0,0 @@ -/* -Copyright 2023 Huawei Technologies Co., Ltd. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// Package scheme is used to add runtime.Scheme -package scheme - -import ( - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/serializer" - utilruntime "k8s.io/apimachinery/pkg/util/runtime" - - "ascend-common/api/ascend-operator/apis/batch/v1" -) - -// RuntimeScheme is a Scheme object instance. -var RuntimeScheme = runtime.NewScheme() - -// Codecs is a CodecFactory object instance. -var Codecs = serializer.NewCodecFactory(RuntimeScheme) - -// ParameterCodec is a parameterCodec object instance. -var ParameterCodec = runtime.NewParameterCodec(RuntimeScheme) - -func init() { - utilruntime.Must(v1.AddToScheme(RuntimeScheme)) -} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/typed/batch/v1/client.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/typed/batch/v1/client.go deleted file mode 100644 index 7dd8264..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/typed/batch/v1/client.go +++ /dev/null @@ -1,110 +0,0 @@ -/* -Copyright 2023 Huawei Technologies Co., Ltd. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -*/ - -// Package v1 is used to define some client- and job-related interfaces, initialization operations, -// and method implementations. -package v1 - -import ( - "errors" - "net/http" - - "k8s.io/client-go/rest" - - "ascend-common/api/ascend-operator/apis/batch/v1" - "ascend-common/api/ascend-operator/client/clientset/versioned/scheme" -) - -// BatchV1Interface is a batch client interface. -type BatchV1Interface interface { - RESTClient() rest.Interface - JobsGetter -} - -// BatchV1Client is a client structure. -type BatchV1Client struct { - restClient rest.Interface -} - -// Jobs returns a JobInterface object instance. -func (c *BatchV1Client) Jobs(namespace string) JobInterface { - if c == nil { - return nil - } - return newJobs(c, namespace) -} - -// RESTClient returns a RESTClient that is used to communicate -// with API server by this client implementation. -func (c *BatchV1Client) RESTClient() rest.Interface { - if c == nil { - return nil - } - return c.restClient -} - -// NewForConfig creates a new BatchV1alpha1Client for the given config. -// NewForConfig is equivalent to NewForConfigAndClient(c, httpClient), -// where httpClient was generated with rest.HTTPClientFor(c). 
-func NewForConfig(c *rest.Config) (*BatchV1Client, error) { - if c == nil { - return nil, errors.New(nilPointError) - } - config := *c - if err := setConfigDefaults(&config); err != nil { - return nil, err - } - httpClient, err := rest.HTTPClientFor(&config) - if err != nil { - return nil, err - } - return NewForConfigAndClient(&config, httpClient) -} - -func setConfigDefaults(config *rest.Config) error { - gv := v1.SchemeGroupVersion - config.GroupVersion = &gv - config.APIPath = "/apis" - config.NegotiatedSerializer = scheme.Codecs.WithoutConversion() - - if config.UserAgent == "" { - config.UserAgent = rest.DefaultKubernetesUserAgent() - } - - return nil -} - -// NewForConfigAndClient creates a new BatchV1alpha1Client for the given config and http client. -// Note the http client provided takes precedence over the configured transport values. -func NewForConfigAndClient(c *rest.Config, h *http.Client) (*BatchV1Client, error) { - if c == nil || h == nil { - return nil, errors.New(nilPointError) - } - config := *c - if err := setConfigDefaults(&config); err != nil { - return nil, err - } - client, err := rest.RESTClientForConfigAndClient(&config, h) - if err != nil { - return nil, err - } - return &BatchV1Client{restClient: client}, nil -} - -// New creates a new BatchV1alpha1Client for the given RESTClient. -func New(c rest.Interface) *BatchV1Client { - return &BatchV1Client{restClient: c} -} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/typed/batch/v1/job.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/typed/batch/v1/job.go deleted file mode 100644 index a6527ad..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/client/clientset/versioned/typed/batch/v1/job.go +++ /dev/null @@ -1,221 +0,0 @@ -/* -Copyright 2023 Huawei Technologies Co., Ltd. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package v1 - -import ( - "context" - "errors" - "time" - - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" - "k8s.io/apimachinery/pkg/watch" - "k8s.io/client-go/rest" - - "ascend-common/api" - "ascend-common/api/ascend-operator/apis/batch/v1" - "ascend-common/api/ascend-operator/client/clientset/versioned/scheme" -) - -const ( - nilPointError = "nil pointer" -) - -// JobsGetter has a method to return a JobInterface. -// A group's client should implement this interface. -type JobsGetter interface { - Jobs(namespace string) JobInterface -} - -// JobInterface has methods to work with Job resources. 
-type JobInterface interface { - Create(ctx context.Context, job *v1.AscendJob, opts metav1.CreateOptions) (*v1.AscendJob, error) - Update(ctx context.Context, job *v1.AscendJob, opts metav1.UpdateOptions) (*v1.AscendJob, error) - UpdateStatus(ctx context.Context, job *v1.AscendJob, opts metav1.UpdateOptions) (*v1.AscendJob, error) - Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error - DeleteCollection(ctx context.Context, opts metav1.DeleteOptions, listOpts metav1.ListOptions) error - Get(ctx context.Context, name string, opts metav1.GetOptions) (*v1.AscendJob, error) - List(ctx context.Context, opts metav1.ListOptions) (*v1.AscendJobList, error) - Watch(ctx context.Context, opts metav1.ListOptions) (watch.Interface, error) - Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions, - subresources ...string) (result *v1.AscendJob, err error) - // JobExpansion -} - -// jobs implements JobInterface -type jobs struct { - client rest.Interface - ns string -} - -func (j *jobs) Create(ctx context.Context, job *v1.AscendJob, opts metav1.CreateOptions) (*v1.AscendJob, error) { - if j == nil { - return nil, errors.New(nilPointError) - } - result := &v1.AscendJob{} - err := j.client.Post(). - Namespace(j.ns). - Resource(api.AscendJobsLowerCase). - VersionedParams(&opts, scheme.ParameterCodec). - Body(job). - Do(ctx). - Into(result) - return result, err -} - -func (j *jobs) Update(ctx context.Context, job *v1.AscendJob, opts metav1.UpdateOptions) (*v1.AscendJob, - error) { - if j == nil || job == nil { - return nil, errors.New(nilPointError) - } - result := &v1.AscendJob{} - err := j.client.Put(). - Namespace(j.ns). - Resource(api.AscendJobsLowerCase). - Name(job.Name). - VersionedParams(&opts, scheme.ParameterCodec). - Body(job). - Do(ctx). 
- Into(result) - return result, err -} - -func (j *jobs) UpdateStatus(ctx context.Context, job *v1.AscendJob, opts metav1.UpdateOptions) (*v1.AscendJob, - error) { - if j == nil || job == nil { - return nil, errors.New(nilPointError) - } - result := &v1.AscendJob{} - err := j.client.Put(). - Namespace(j.ns). - Resource(api.AscendJobsLowerCase). - Name(job.Name). - SubResource("status"). - VersionedParams(&opts, scheme.ParameterCodec). - Body(job). - Do(ctx). - Into(result) - return result, err -} - -func (j *jobs) Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error { - if j == nil { - return errors.New(nilPointError) - } - return j.client.Delete(). - Namespace(j.ns). - Resource(api.AscendJobsLowerCase). - Name(name). - Body(&opts). - Do(ctx). - Error() -} - -func (j *jobs) DeleteCollection(ctx context.Context, opts metav1.DeleteOptions, listOpts metav1.ListOptions) error { - if j == nil { - return errors.New(nilPointError) - } - var timeout time.Duration - if listOpts.TimeoutSeconds != nil { - timeout = time.Duration(*listOpts.TimeoutSeconds) * time.Second - } - return j.client.Delete(). - Namespace(j.ns). - Resource(api.AscendJobsLowerCase). - VersionedParams(&listOpts, scheme.ParameterCodec). - Timeout(timeout). - Body(&opts). - Do(ctx). - Error() -} - -func (j *jobs) Get(ctx context.Context, name string, opts metav1.GetOptions) (*v1.AscendJob, error) { - if j == nil { - return nil, errors.New(nilPointError) - } - result := &v1.AscendJob{} - err := j.client.Get(). - Namespace(j.ns). - Resource(api.AscendJobsLowerCase). - Name(name). - VersionedParams(&opts, scheme.ParameterCodec). - Do(ctx). 
- Into(result) - return result, err -} - -func (j *jobs) List(ctx context.Context, opts metav1.ListOptions) (*v1.AscendJobList, error) { - if j == nil { - return nil, errors.New(nilPointError) - } - var timeout time.Duration - if opts.TimeoutSeconds != nil { - timeout = time.Duration(*opts.TimeoutSeconds) * time.Second - } - result := &v1.AscendJobList{} - err := j.client.Get(). - Namespace(j.ns). - Resource(api.AscendJobsLowerCase). - VersionedParams(&opts, scheme.ParameterCodec). - Timeout(timeout). - Do(ctx). - Into(result) - return result, err -} - -func (j *jobs) Watch(ctx context.Context, opts metav1.ListOptions) (watch.Interface, error) { - if j == nil { - return nil, errors.New(nilPointError) - } - var timeout time.Duration - if opts.TimeoutSeconds != nil { - timeout = time.Duration(*opts.TimeoutSeconds) * time.Second - } - opts.Watch = true - return j.client.Get(). - Namespace(j.ns). - Resource(api.AscendJobsLowerCase). - VersionedParams(&opts, scheme.ParameterCodec). - Timeout(timeout). - Watch(ctx) -} - -func (j *jobs) Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions, - subresources ...string) (*v1.AscendJob, error) { - if j == nil { - return nil, errors.New(nilPointError) - } - result := &v1.AscendJob{} - err := j.client.Patch(pt). - Namespace(j.ns). - Resource(api.AscendJobsLowerCase). - Name(name). - SubResource(subresources...). - VersionedParams(&opts, scheme.ParameterCodec). - Body(data). - Do(ctx). 
- Into(result) - return result, err -} - -// newJobs returns a Jobs -func newJobs(c *BatchV1Client, namespace string) *jobs { - return &jobs{ - client: c.RESTClient(), - ns: namespace, - } -} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/interface.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/interface.go deleted file mode 100644 index 78b5d12..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/interface.go +++ /dev/null @@ -1,49 +0,0 @@ -/* -Copyright 2023 Huawei Technologies Co., Ltd. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// Package batch is used to define interfaces. -package batch - -import ( - "ascend-common/api/ascend-operator/client/informers/externalversions/batch/v1" - "ascend-common/api/ascend-operator/client/informers/externalversions/internalinterfaces" -) - -// Interface provides access to each of this group's versions. -type Interface interface { - // V1 provides access to shared informers for resources in V1alpha1. - V1() v1.Interface -} - -type group struct { - factory internalinterfaces.SharedInformerFactory - namespace string - tweakListOptions internalinterfaces.TweakListOptionsFunc -} - -// New returns a new Interface. 
-func New(f internalinterfaces.SharedInformerFactory, namespace string, - tweakListOptions internalinterfaces.TweakListOptionsFunc) Interface { - return &group{factory: f, namespace: namespace, tweakListOptions: tweakListOptions} -} - -// V1 returns a new v1alpha1.Interface. -func (g *group) V1() v1.Interface { - if g == nil { - return nil - } - return v1.New(g.factory, g.namespace, g.tweakListOptions) -} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/v1/interface.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/v1/interface.go deleted file mode 100644 index a4f0466..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/v1/interface.go +++ /dev/null @@ -1,48 +0,0 @@ -/* -Copyright 2023 Huawei Technologies Co., Ltd. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// Package v1 is used to implement job informer-related methods. -package v1 - -import ( - "ascend-common/api/ascend-operator/client/informers/externalversions/internalinterfaces" -) - -// Interface provides access to all the informers in this group version. -type Interface interface { - // Jobs returns a JobInformer. - Jobs() JobInformer -} - -type version struct { - factory internalinterfaces.SharedInformerFactory - namespace string - tweakListOptions internalinterfaces.TweakListOptionsFunc -} - -// New returns a new Interface. 
-func New(f internalinterfaces.SharedInformerFactory, namespace string, - tweakListOptions internalinterfaces.TweakListOptionsFunc) Interface { - return &version{factory: f, namespace: namespace, tweakListOptions: tweakListOptions} -} - -// Jobs returns a JobInformer. -func (v *version) Jobs() JobInformer { - if v == nil { - return nil - } - return &jobInformer{factory: v.factory, namespace: v.namespace, tweakListOptions: v.tweakListOptions} -} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/v1/job.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/v1/job.go deleted file mode 100644 index e5f0b1c..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/batch/v1/job.go +++ /dev/null @@ -1,99 +0,0 @@ -/* -Copyright 2023 Huawei Technologies Co., Ltd. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package v1 - -import ( - "context" - "time" - - "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/watch" - "k8s.io/client-go/tools/cache" - - batchv1 "ascend-common/api/ascend-operator/apis/batch/v1" - "ascend-common/api/ascend-operator/client/clientset/versioned" - "ascend-common/api/ascend-operator/client/informers/externalversions/internalinterfaces" - batchlister "ascend-common/api/ascend-operator/client/listers/batch/v1" -) - -// JobInformer provides access to a shared informer and lister for -// Jobs. -type JobInformer interface { - Informer() cache.SharedIndexInformer - Lister() batchlister.JobLister -} - -type jobInformer struct { - factory internalinterfaces.SharedInformerFactory - tweakListOptions internalinterfaces.TweakListOptionsFunc - namespace string -} - -// NewJobInformer constructs a new informer for Job type. -// Always prefer using an informer factory to get a shared informer instead of getting an independent -// one. This reduces memory footprint and number of connections to the server. -func NewJobInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, - indexers cache.Indexers) cache.SharedIndexInformer { - return NewFilteredJobInformer(client, namespace, resyncPeriod, indexers, nil) -} - -// NewFilteredJobInformer constructs a new informer for Job type. -// Always prefer using an informer factory to get a shared informer instead of getting an independent -// one. This reduces memory footprint and number of connections to the server. 
-func NewFilteredJobInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, - indexers cache.Indexers, tweakListOptions internalinterfaces.TweakListOptionsFunc) cache.SharedIndexInformer { - return cache.NewSharedIndexInformer( - &cache.ListWatch{ - ListFunc: func(options v1.ListOptions) (runtime.Object, error) { - if tweakListOptions != nil { - tweakListOptions(&options) - } - return client.BatchV1().Jobs(namespace).List(context.TODO(), options) - }, - WatchFunc: func(options v1.ListOptions) (watch.Interface, error) { - if tweakListOptions != nil { - tweakListOptions(&options) - } - return client.BatchV1().Jobs(namespace).Watch(context.TODO(), options) - }, - }, - &batchv1.AscendJob{}, - resyncPeriod, - indexers, - ) -} - -func (f *jobInformer) defaultInformer(client versioned.Interface, - resyncPeriod time.Duration) cache.SharedIndexInformer { - return NewFilteredJobInformer(client, f.namespace, resyncPeriod, cache.Indexers{ - cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}, f.tweakListOptions) -} - -func (f *jobInformer) Informer() cache.SharedIndexInformer { - if f == nil || f.factory == nil { - return nil - } - return f.factory.InformerFor(&batchv1.AscendJob{}, f.defaultInformer) -} - -func (f *jobInformer) Lister() batchlister.JobLister { - if f == nil { - return nil - } - return batchlister.NewJobLister(f.Informer().GetIndexer()) -} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/factory.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/factory.go deleted file mode 100644 index 5fec15f..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/factory.go +++ /dev/null @@ -1,207 +0,0 @@ -/* -Copyright 2023 Huawei Technologies Co., Ltd. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package externalversions - -import ( - "reflect" - "sync" - "time" - - "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/schema" - "k8s.io/client-go/tools/cache" - - "ascend-common/api/ascend-operator/client/clientset/versioned" - "ascend-common/api/ascend-operator/client/informers/externalversions/batch" - "ascend-common/api/ascend-operator/client/informers/externalversions/internalinterfaces" -) - -// SharedInformerFactory provides shared informers for resources in all known -// API group versions. -type SharedInformerFactory interface { - internalinterfaces.SharedInformerFactory - ForResource(resource schema.GroupVersionResource) (GenericInformer, error) - WaitForCacheSync(stopCh <-chan struct{}) map[reflect.Type]bool - - Batch() batch.Interface -} - -// SharedInformerOption defines the functional option type for SharedInformerFactory. -type SharedInformerOption func(*sharedInformerFactory) *sharedInformerFactory - -type sharedInformerFactory struct { - client versioned.Interface - namespace string - tweakListOptions internalinterfaces.TweakListOptionsFunc - lock sync.Mutex - defaultResync time.Duration - customResync map[reflect.Type]time.Duration - - informers map[reflect.Type]cache.SharedIndexInformer - // startedInformers is used for tracking which informers have been started. - // This allows Start() to be called multiple times safely. - startedInformers map[reflect.Type]bool -} - -// WithCustomResyncConfig sets a custom resync period for the specified informer types. 
-func WithCustomResyncConfig(resyncConfig map[v1.Object]time.Duration) SharedInformerOption { - return func(factory *sharedInformerFactory) *sharedInformerFactory { - if factory == nil { - return factory - } - - if factory.customResync == nil { - factory.customResync = make(map[reflect.Type]time.Duration) - } - - for k, v := range resyncConfig { - factory.customResync[reflect.TypeOf(k)] = v - } - return factory - } -} - -// WithTweakListOptions sets a custom filter on all listers of the configured SharedInformerFactory. -func WithTweakListOptions(tweakListOptions internalinterfaces.TweakListOptionsFunc) SharedInformerOption { - return func(factory *sharedInformerFactory) *sharedInformerFactory { - if factory == nil { - return nil - } - factory.tweakListOptions = tweakListOptions - return factory - } -} - -// WithNamespace limits the SharedInformerFactory to the specified namespace. -func WithNamespace(namespace string) SharedInformerOption { - return func(factory *sharedInformerFactory) *sharedInformerFactory { - if factory == nil { - return nil - } - factory.namespace = namespace - return factory - } -} - -// NewSharedInformerFactory constructs a new instance of sharedInformerFactory for all namespaces. -func NewSharedInformerFactory(client versioned.Interface, defaultResync time.Duration) SharedInformerFactory { - return NewSharedInformerFactoryWithOptions(client, defaultResync) -} - -// NewSharedInformerFactoryWithOptions constructs a new instance of a SharedInformerFactory with additional options. 
-func NewSharedInformerFactoryWithOptions(client versioned.Interface, defaultResync time.Duration, - options ...SharedInformerOption) SharedInformerFactory { - factory := &sharedInformerFactory{ - client: client, - namespace: v1.NamespaceAll, - defaultResync: defaultResync, - informers: make(map[reflect.Type]cache.SharedIndexInformer), - startedInformers: make(map[reflect.Type]bool), - customResync: make(map[reflect.Type]time.Duration), - } - - // Apply all options - for _, opt := range options { - factory = opt(factory) - } - - return factory -} - -// Start initializes all requested informers. -func (f *sharedInformerFactory) Start(stopCh <-chan struct{}) { - if f == nil { - return - } - f.lock.Lock() - defer f.lock.Unlock() - - if f.startedInformers == nil { - f.startedInformers = make(map[reflect.Type]bool) - } - - for informerType, informer := range f.informers { - if !f.startedInformers[informerType] { - go informer.Run(stopCh) - f.startedInformers[informerType] = true - } - } -} - -// WaitForCacheSync waits for all started informers' cache were synced. -func (f *sharedInformerFactory) WaitForCacheSync(stopCh <-chan struct{}) map[reflect.Type]bool { - informers := func() map[reflect.Type]cache.SharedIndexInformer { - if f == nil { - return nil - } - f.lock.Lock() - defer f.lock.Unlock() - - informers := map[reflect.Type]cache.SharedIndexInformer{} - for informerType, informer := range f.informers { - if f.startedInformers[informerType] { - informers[informerType] = informer - } - } - return informers - }() - - res := map[reflect.Type]bool{} - for informType, informer := range informers { - res[informType] = cache.WaitForCacheSync(stopCh, informer.HasSynced) - } - return res -} - -// InternalInformerFor returns the SharedIndexInformer for obj using an internal -// client. 
-func (f *sharedInformerFactory) InformerFor(obj runtime.Object, - newFunc internalinterfaces.NewInformerFunc) cache.SharedIndexInformer { - if f == nil { - return nil - } - - f.lock.Lock() - defer f.lock.Unlock() - - informerType := reflect.TypeOf(obj) - informer, exists := f.informers[informerType] - if exists { - return informer - } - - resyncPeriod, exists := f.customResync[informerType] - if !exists { - resyncPeriod = f.defaultResync - } - - informer = newFunc(f.client, resyncPeriod) - if f.informers == nil { - f.informers = make(map[reflect.Type]cache.SharedIndexInformer) - } - f.informers[informerType] = informer - - return informer -} - -func (f *sharedInformerFactory) Batch() batch.Interface { - if f == nil { - return nil - } - return batch.New(f, f.namespace, f.tweakListOptions) -} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/generic.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/generic.go deleted file mode 100644 index 95db6d0..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/generic.go +++ /dev/null @@ -1,71 +0,0 @@ -/* -Copyright 2023 Huawei Technologies Co., Ltd. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package externalversions - -import ( - "errors" - "fmt" - - "k8s.io/apimachinery/pkg/runtime/schema" - "k8s.io/client-go/tools/cache" - - "ascend-common/api" - "ascend-common/api/ascend-operator/apis/batch/v1" -) - -// GenericInformer is type of SharedIndexInformer which will locate and delegate to other -// sharedInformers based on type -type GenericInformer interface { - Informer() cache.SharedIndexInformer - Lister() cache.GenericLister -} - -type genericInformer struct { - informer cache.SharedIndexInformer - resource schema.GroupResource -} - -// Informer returns the SharedIndexInformer. -func (f *genericInformer) Informer() cache.SharedIndexInformer { - if f == nil { - return nil - } - return f.informer -} - -// Lister returns the GenericLister. -func (f *genericInformer) Lister() cache.GenericLister { - if f == nil { - return nil - } - return cache.NewGenericLister(f.Informer().GetIndexer(), f.resource) -} - -// ForResource gives generic access to a shared informer of the matching type -// extend this to unknown resources with a client pool -func (f *sharedInformerFactory) ForResource(resource schema.GroupVersionResource) (GenericInformer, error) { - if f == nil { - return nil, errors.New("nil pointer") - } - switch resource { - case v1.SchemeGroupVersion.WithResource(api.AscendJobsLowerCase): - return &genericInformer{resource: resource.GroupResource(), informer: f.Batch().V1().Jobs().Informer()}, nil - default: - } - - return nil, fmt.Errorf("no informer found for %v", resource) -} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/internalinterfaces/factory_interfaces.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/internalinterfaces/factory_interfaces.go deleted file mode 100644 index 5602b78..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/client/informers/externalversions/internalinterfaces/factory_interfaces.go +++ 
/dev/null @@ -1,40 +0,0 @@ -/* -Copyright 2019 Bloomberg Finance LP. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// Package internalinterfaces is used to define informer-related interfaces. -package internalinterfaces - -import ( - "time" - - "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/client-go/tools/cache" - - "ascend-common/api/ascend-operator/client/clientset/versioned" -) - -// NewInformerFunc takes versioned.Interface and time.Duration to return a SharedIndexInformer. -type NewInformerFunc func(versioned.Interface, time.Duration) cache.SharedIndexInformer - -// SharedInformerFactory a small interface to allow for adding an informer without an import cycle -type SharedInformerFactory interface { - Start(stopCh <-chan struct{}) - InformerFor(obj runtime.Object, newFunc NewInformerFunc) cache.SharedIndexInformer -} - -// TweakListOptionsFunc is a function that transforms a v1.ListOptions. -type TweakListOptionsFunc func(*v1.ListOptions) diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/listers/batch/v1/expansion_generated.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/listers/batch/v1/expansion_generated.go deleted file mode 100644 index 9ed431c..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/client/listers/batch/v1/expansion_generated.go +++ /dev/null @@ -1,26 +0,0 @@ -/* -Copyright 2024 Huawei Technologies Co., Ltd. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// Package v1 is used to define job-related interfaces. -package v1 - -// JobListerExpansion allows custom methods to be added to -// JobLister. -type JobListerExpansion interface{} - -// JobNamespaceListerExpansion allows custom methods to be added to -// JobNamespaceLister. -type JobNamespaceListerExpansion interface{} diff --git a/mind-cluster/component/ascend-common/api/ascend-operator/client/listers/batch/v1/job.go b/mind-cluster/component/ascend-common/api/ascend-operator/client/listers/batch/v1/job.go deleted file mode 100644 index 084a913..0000000 --- a/mind-cluster/component/ascend-common/api/ascend-operator/client/listers/batch/v1/job.go +++ /dev/null @@ -1,108 +0,0 @@ -/* -Copyright 2024 Huawei Technologies Co., Ltd. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package v1 - -import ( - "errors" - - k8serr "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/labels" - "k8s.io/client-go/tools/cache" - - "ascend-common/api/ascend-operator/apis/batch/v1" -) - -// JobLister helps list Jobs. -// All objects returned here must be treated as read-only. -type JobLister interface { - // List lists all Jobs in the indexer. - // Objects returned here must be treated as read-only. - List(selector labels.Selector) (ret []*v1.AscendJob, err error) - // Jobs returns an object that can list and get Jobs. - Jobs(namespace string) JobNamespaceLister - JobListerExpansion -} - -// jobLister implements the JobLister interface. -type jobLister struct { - indexer cache.Indexer -} - -// NewJobLister returns a new JobLister. -func NewJobLister(indexer cache.Indexer) JobLister { - return &jobLister{indexer: indexer} -} - -// List lists all Jobs in the indexer. -func (s *jobLister) List(selector labels.Selector) ([]*v1.AscendJob, error) { - if s == nil { - return nil, errors.New("nil pointer") - } - var ret []*v1.AscendJob - err := cache.ListAll(s.indexer, selector, func(m interface{}) { - ret = append(ret, m.(*v1.AscendJob)) - }) - return ret, err -} - -// Jobs returns an object that can list and get Jobs. -func (s *jobLister) Jobs(namespace string) JobNamespaceLister { - if s == nil { - return nil - } - return jobNamespaceLister{indexer: s.indexer, namespace: namespace} -} - -// JobNamespaceLister helps list and get Jobs. -// All objects returned here must be treated as read-only. -type JobNamespaceLister interface { - // List lists all Jobs in the indexer for a given namespace. - // Objects returned here must be treated as read-only. - List(selector labels.Selector) (ret []*v1.AscendJob, err error) - // Get retrieves the Job from the indexer for a given namespace and name. - // Objects returned here must be treated as read-only. 
- Get(name string) (*v1.AscendJob, error) - JobNamespaceListerExpansion -} - -// jobNamespaceLister implements the JobNamespaceLister -// interface. -type jobNamespaceLister struct { - indexer cache.Indexer - namespace string -} - -// List lists all Jobs in the indexer for a given namespace. -func (s jobNamespaceLister) List(selector labels.Selector) ([]*v1.AscendJob, error) { - var ret []*v1.AscendJob - err := cache.ListAllByNamespace(s.indexer, s.namespace, selector, func(m interface{}) { - ret = append(ret, m.(*v1.AscendJob)) - }) - return ret, err -} - -// Get retrieves the Job from the indexer for a given namespace and name. -func (s jobNamespaceLister) Get(name string) (*v1.AscendJob, error) { - obj, exists, err := s.indexer.GetByKey(s.namespace + "/" + name) - if err != nil { - return nil, err - } - if !exists { - return nil, k8serr.NewNotFound(v1.Resource("job"), name) - } - return obj.(*v1.AscendJob), nil -} diff --git a/mind-cluster/component/ascend-common/api/consts.go b/mind-cluster/component/ascend-common/api/consts.go deleted file mode 100644 index 01881ce..0000000 --- a/mind-cluster/component/ascend-common/api/consts.go +++ /dev/null @@ -1,222 +0,0 @@ -// Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. 
- -// Package api common const -package api - -// Env -const ( - NodeNameEnv = "NODE_NAME" - - // PtWorldSizeEnv the total number of npu used for the task for PyTorch - PtWorldSizeEnv = "WORLD_SIZE" - // PtLocalWorldSizeEnv number of npu used per pod for PyTorch - PtLocalWorldSizeEnv = "LOCAL_WORLD_SIZE" - // PtLocalRankEnv logic id List of npu used by pod for PyTorch - PtLocalRankEnv = "LOCAL_RANK" - - // TfWorkerSizeEnv the total number of npu used for the task for TensorFlow - TfWorkerSizeEnv = "CM_WORKER_SIZE" - // TfLocalWorkerEnv number of npu used per pod for TensorFlow - TfLocalWorkerEnv = "CM_LOCAL_WORKER" - - // MsWorkerNumEnv the total number of npu used for the task for MindSpore - MsWorkerNumEnv = "MS_WORKER_NUM" - // MsLocalWorkerEnv number of npu used per pod for MindSpore - MsLocalWorkerEnv = "MS_LOCAL_WORKER" -) - -// NameSpace -const ( - DLNamespace = "mindx-dl" - ClusterNS = "cluster-system" - KubeNS = "kube-system" -) - -// Node -const ( - // NPUChipMemoryLabel label value is npu chip memory - NPUChipMemoryLabel = "mind-cluster/npu-chip-memory" - - // NodeSNAnnotation annotation value is node sn - NodeSNAnnotation = "product-serial-number" - // BaseDevInfoAnno annotation value is device base info - BaseDevInfoAnno = "baseDeviceInfos" - - // AcceleratorTypeKey the node label key of accelerator type - AcceleratorTypeKey = "accelerator-type" - // AcceleratorTypeModule910A3SuperPod for 910A3-SuperPod hardware - AcceleratorTypeModule910A3SuperPod = "module-a3-16-super-pod" -) - -// Pod -const ( - // PodUsedHardwareTypeAnno annotation value is the hardware type that real used in pod - PodUsedHardwareTypeAnno = "mind-cluster/hardware-type" - // PodRankIndexAnno annotation value is rank index of the pod - PodRankIndexAnno = "hccl/rankIndex" - // SuperPodIDAnno annotation key of the super pod id - SuperPodIDAnno = "super-pod-id" - - // Hotswitch Annotations - - // InHotSwitchFlowKey in hot switch flow key - InHotSwitchFlowKey = "inHotSwitchFlow" - // 
InHotSwitchFlowValue in hot switch flow true - InHotSwitchFlowValue = "true" - // BackupNewPodNameKey backup new pod name key - BackupNewPodNameKey = "backupNewPodName" - // BackupSourcePodNameKey backup source pod name key - BackupSourcePodNameKey = "backupSourcePodName" - // NeedOperatorOpeKey need operator ope key - NeedOperatorOpeKey = "needOperatorOpe" - // NeedVolcanoOpeKey need volcano ope key - NeedVolcanoOpeKey = "needVolcanoOpe" - // OpeTypeDelete ope type delete - OpeTypeDelete = "delete" - // OpeTypeCreate ope type create - OpeTypeCreate = "create" - // PodTypeKey pod type key - PodTypeKey = "podType" - // PodTypeBackup pod type backup - PodTypeBackup = "backup" - // DefaultRetryTimes default retry times - DefaultRetryTimes = 3 - // MasterPodRank master pod rank - MasterPodRank = "0" -) - -// PodGroup -const ( - // AtlasTaskLabel label value task kind, eg. ascend-910, ascend-{xxx}b - AtlasTaskLabel = "ring-controller.atlas" -) - -// ConfigMap -const ( - // DeviceInfoCMDataKey device-info-cm data key, record device info - DeviceInfoCMDataKey = "DeviceInfoCfg" - // SwitchInfoCMDataKey device-info-cm data key, record switch info - SwitchInfoCMDataKey = "SwitchInfoCfg" - // NodeInfoCMDataKey node-info-cm data key, record node info - NodeInfoCMDataKey = "NodeInfo" - // PubFaultCMDataKey public fault cm data key, record public fault info - PubFaultCMDataKey = "PublicFault" - - // CIMCMLabelKey cm label key, who uses these cms - CIMCMLabelKey = "mx-consumer-cim" - // PubFaultCMLabelKey public fault cm label key - PubFaultCMLabelKey = "mc-consumer-publicfault" -) - -const ( - // FaultJobCmName fault job cm name - FaultJobCmName = "fault-job-info" -) - -const ( - // PodScheduleLabel pod schedule label - PodScheduleLabel = "pod-rescheduling" - // ProcessScheduleLabel process schedule label - ProcessScheduleLabel = "process-recover-enable" - // RecoverStrategyKey recover strategy key in job annotation - RecoverStrategyKey = "recover-strategy" -) - -// process 
schedule strategy -const ( - // RecoverStrategy recover strategy - RecoverStrategy = "recover" - // RetryStrategy retry strategy - RetryStrategy = "retry" - // InPlaceStrategy recover in place strategy - InPlaceStrategy = "recover-in-place" - // DumpStrategy dump strategy - DumpStrategy = "dump" - // ExitStrategy exit strategy - ExitStrategy = "exit" - // ElasticTraining elastic-training strategy - ElasticTraining = "elastic-training" -) - -// process schedule common env -const ( - // ProcessRecoverEnv process recover env - ProcessRecoverEnv = "PROCESS_RECOVER" - // ElasticRecoverEnv elastic process recover env - ElasticRecoverEnv = "ELASTIC_PROCESS_RECOVER_ENABLE" - // EnableRestartEnv enable restart env - EnableRestartEnv = "ENABLE_RESTART_FAULT_PROCESS" -) - -// process schedule pytorch env -const ( - // HighAvailableEnv high available env - HighAvailableEnv = "HIGH_AVAILABILITY" - // PtCloseWatchDogKey pt close watch dog key - PtCloseWatchDogKey = "HCCL_ASYNC_ERROR_HANDLING" - // PtCloseWatchDogValue pt close watch dog value - PtCloseWatchDogValue = "0" -) - -// process schedule ms env -const ( - // MsRecoverEnv ms recover env - MsRecoverEnv = "MS_ENABLE_TFT" - // EnableMS enable ms - EnableMS = "MINDIO_FOR_MINDSPORE" - // MsDumpStrategy ms dump strategy - MsDumpStrategy = "TTP:1" - // MsUceStrategy ms uce strategy - MsUceStrategy = "UCE:1" - // MsArfStrategy ms arf strategy - MsArfStrategy = "ARF:1" - // MsHcceStrategy ms hcce strategy - MsHcceStrategy = "HCCE:1" - // MsRscStrategy ms rsc strategy - MsRscStrategy = "RSC:1" - // MsCloseWatchDogKey ms close watch dog key - MsCloseWatchDogKey = "MS_ENABLE_THM" - // MsCloseWatchDogValue ms close watch dog value - MsCloseWatchDogValue = `{HCCL_WATCHDOG:0}` -) - -const ( - //EnableFunc Enable Func - EnableFunc = "on" - // EnableFlag enable flag - EnableFlag = "1" - // PytorchFramework framework - PytorchFramework = "pytorch" - // MindSporeFramework framework - MindSporeFramework = "mindspore" -) - -const ( - // 
RescheduleInPlaceKey reschedule in place key - RescheduleInPlaceKey = "reschedule-in-place" - // RescheduleInPlaceValue reschedule in place value - RescheduleInPlaceValue = "true" -) - -const ( - // DeviceResetTimeout device reset timeout - DeviceResetTimeout = "deviceResetTimeout" - // DefaultDeviceResetTimeout default device reset timeout is 60 seconds - DefaultDeviceResetTimeout = 60 - // MinDeviceResetTimeout min device reset timeout is 10 seconds - MinDeviceResetTimeout = 10 - // MaxDeviceResetTimeout max device reset timeout is 600 seconds - MaxDeviceResetTimeout = 600 -) - -const ( - // SubHealthyStrategy config in pod group label for subHealthy fault strategy - SubHealthyStrategy = "subHealthyStrategy" - // SubHealthyHotSwitch strategy name of hot switch - SubHealthyHotSwitch = "hotSwitch" -) - -const ( - // MinAvailableKey decide minAvailable of task - MinAvailableKey = "huawei.com/schedule_minAvailable" -) diff --git a/mind-cluster/component/ascend-common/api/default_name.go b/mind-cluster/component/ascend-common/api/default_name.go deleted file mode 100644 index 7f0ae6c..0000000 --- a/mind-cluster/component/ascend-common/api/default_name.go +++ /dev/null @@ -1,188 +0,0 @@ -// Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. 
- -// Package api common brand moniker -package api - -// common -const ( - // Pod910DeviceAnno annotation value is for generating 910 hccl rank table - Pod910DeviceAnno = "ascend.kubectl.kubernetes.io/ascend-910-configuration" - - // ResourceNamePrefix pre resource name - ResourceNamePrefix = "huawei.com/" - // PodRealAlloc pod annotation key, means pod real mount device - PodRealAlloc = "AscendReal" - - // PodAnnotationAscendReal pod annotation ascend real - PodAnnotationAscendReal = "huawei.com/AscendReal" - - // Ascend brand name - Ascend = "Ascend" - // AscendJob job kind is AscendJob - AscendJob = "AscendJob" - // AscendJobsLowerCase for ascend jobs lowercase - AscendJobsLowerCase = "ascendjobs" - - // AscendOperator ascend-Operator - AscendOperator = "ascend-Operator" -) - -// common 910 -const ( - // Ascend910 for 910 chip - Ascend910 = "Ascend910" - // Ascend910Lowercase for 910 chip lowercase - Ascend910Lowercase = "ascend910" - // HuaweiAscend910 ascend 910 chip with prefix - HuaweiAscend910 = "huawei.com/Ascend910" - // Ascend910MinuxPrefix name prefix of ascend 910 chip - Ascend910MinuxPrefix = "Ascend910-" - // Ascend910MinuxCase minus type of ascend 910 chip - Ascend910MinuxCase = "ascend-910" - // Ascend910No 910 chip number - Ascend910No = "910" -) - -// common 910 A1 -const ( - // Ascend910A ascend 910A chip - Ascend910A = "Ascend910" - // Ascend910APattern regular expression for 910A - Ascend910APattern = `^910` -) - -// common 910 A2 -const ( - // Ascend910B ascend 910B chip - Ascend910B = "Ascend910B" - // Ascend910BPattern regular expression for 910B - Ascend910BPattern = `^(910B\d{1}|A2G\d{1})` -) - -// common 910 A3 -const ( - // Ascend910A3 ascend Ascend910A3 chip - Ascend910A3 = "Ascend910A3" -) - -// common 310 -const ( - // Ascend310 ascend 310 chip - Ascend310 = "Ascend310" - // Ascend310Lowercase ascend 310 chip lowercase - Ascend310Lowercase = "ascend310" - // Ascend310No 310 chip number - Ascend310No = "310" - // HuaweiAscend310 
ascend 310 chip with prefix - HuaweiAscend310 = "huawei.com/Ascend310" - // Ascend310MinuxPrefix name prefix of ascend 310 chip - Ascend310MinuxPrefix = "Ascend310-" -) - -// common 310B -const ( - // Ascend310B ascend 310B chip - Ascend310B = "Ascend310B" - // Ascend310BNo 310B chip number - Ascend310BNo = "310B" -) - -// common 310P -const ( - // Ascend310P ascend 310P chip - Ascend310P = "Ascend310P" - // Ascend310PLowercase ascend 310P chip lowercase - Ascend310PLowercase = "ascend310P" - // Ascend310PNo 310P chip number - Ascend310PNo = "310P" - // Ascend310PPattern regular expression for 310P - Ascend310PPattern = `^(310P\d{0,1}|I2\d{0,1})` - // HuaweiAscend310P ascend 310P chip with prefix - HuaweiAscend310P = "huawei.com/Ascend310P" - // Ascend310PMinuxPrefix name prefix of ascend 310P chip - Ascend310PMinuxPrefix = "Ascend310P-" -) - -// device plugin -const ( - // Use310PMixedInsert use 310P Mixed insert - Use310PMixedInsert = "use310PMixedInsert" - // Ascend310PMix dp use310PMixedInsert parameter usage - Ascend310PMix = "ascend310P-V, ascend310P-VPro, ascend310P-IPro" - // A300IA2Label the value of the A300I A2 node label - A300IA2Label = "card-910b-infer" - // A300IDuoLabel the value of the A300I Duo node label - A300IDuoLabel = "card-300i-duo" - //UseAscendDocker UseAscendDocker parameter - UseAscendDocker = "useAscendDocker" -) - -// docker runtime -const ( - // AscendDockerRuntime ascend-docker-runtime - AscendDockerRuntime = "ascend-docker-runtime" - // AscendDockerHook ascend-docker-hook - AscendDockerHook = "ascend-docker-hook" - // AscendDockerDestroy ascend-docker-destroy - AscendDockerDestroy = "ascend-docker-destroy" - // AscendDockerCli ascend-docker-cli - AscendDockerCli = "ascend-docker-cli" - - // AscendDockerRuntimeEnv env variable - AscendDockerRuntimeEnv = "ASCEND_DOCKER_RUNTIME" - // AscendVisibleDevicesEnv env variable - AscendVisibleDevicesEnv = "ASCEND_VISIBLE_DEVICES" - // AscendRuntimeOptionsEnv env variable - 
AscendRuntimeOptionsEnv = "ASCEND_RUNTIME_OPTIONS" - // AscendRuntimeMountsEnv env variable - AscendRuntimeMountsEnv = "ASCEND_RUNTIME_MOUNTS" - // AscendAllowLinkEnv env variable - AscendAllowLinkEnv = "ASCEND_ALLOW_LINK" - // AscendVnpuSpescEnv env variable - AscendVnpuSpescEnv = "ASCEND_VNPU_SPECS" - - // RunTimeLogDir dir path of runtime - RunTimeLogDir = "/var/log/ascend-docker-runtime/" - // HookRunLogPath run log path of hook - HookRunLogPath = "/var/log/ascend-docker-runtime/hook-run.log" - // InstallHelperRunLogPath run log path of install helper - InstallHelperRunLogPath = "/var/log/ascend-docker-runtime/install-helper-run.log" - // RunTimeRunLogPath run log path of runtime - RunTimeRunLogPath = "/var/log/ascend-docker-runtime/runtime-run.log" - - // RunTimeDConfigPath config path - RunTimeDConfigPath = "/etc/ascend-docker-runtime.d" -) - -// npu exporter -const ( - // DevicePathPattern device path pattern - DevicePathPattern = `^/dev/davinci\d+$` - // HccsBWProfilingTimeStr preset parameter name - HccsBWProfilingTimeStr = "hccsBWProfilingTime" - // Hccs log options domain value - Hccs = "hccs" - // Prefix pre statistic info - Prefix = "npu_chip_info_hccs_statistic_info_" - // BwPrefix pre bandwidth info - BwPrefix = "npu_chip_info_hccs_bandwidth_info_" - // AscendDeviceInfo - AscendDeviceInfo = "ASCEND_VISIBLE_DEVICES" -) - -const ( - // AscendJobKind is the kind name - AscendJobKind = "AscendJob" - // DefaultContainerName the default container name for AscendJob. - DefaultContainerName = "ascend" - // DefaultPortName is name of the port used to communicate between other process. - DefaultPortName = "ascendjob-port" - // ControllerName is the name of controller,used in log. 
- ControllerName = "ascendjob-controller" - // OperatorName name of operator - OperatorName = "ascend-operator" - // LogModuleName name of log module - LogModuleName = "hwlog" - // OperatorLogFilePath Operator log file name - OperatorLogFilePath = "/var/log/mindx-dl/ascend-operator/ascend-operator.log" -) diff --git a/mind-cluster/component/ascend-common/api/publicfault.go b/mind-cluster/component/ascend-common/api/publicfault.go deleted file mode 100644 index 8561145..0000000 --- a/mind-cluster/component/ascend-common/api/publicfault.go +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. - -// Package api structs for public fault -package api - -// PubFaultInfo struct for public fault input -type PubFaultInfo struct { - Id string `json:"id"` - TimeStamp int64 `json:"timestamp"` - Version string `json:"version"` - Resource string `json:"resource"` - Faults []Fault `json:"faults"` -} - -// Fault public fault cm item Fault -type Fault struct { - FaultId string `json:"faultId"` - FaultType string `json:"faultType"` - FaultCode string `json:"faultCode"` - FaultTime int64 `json:"faultTime"` - Assertion string `json:"assertion"` - FaultLocation map[string]string `json:"faultLocation"` - Influence []Influence `json:"influence"` - Description string `json:"description"` -} - -// Influence public fault cm item Influence -type Influence struct { - NodeName string `json:"nodeName"` - NodeSN string `json:"nodeSN"` - DeviceIds []int32 `json:"deviceIds"` -} diff --git a/mind-cluster/component/ascend-common/api/slownet/fault_net.go b/mind-cluster/component/ascend-common/api/slownet/fault_net.go deleted file mode 100644 index eacde6a..0000000 --- a/mind-cluster/component/ascend-common/api/slownet/fault_net.go +++ /dev/null @@ -1,77 +0,0 @@ -// Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. 
- -// Package slownet for net fault detect common -package slownet - -import ( - "fmt" - "os" - "path/filepath" - - "ascend-common/common-utils/hwlog" - "ascend-common/common-utils/utils" -) - -const ( - rasNetRootPathKey = "RAS_NET_ROOT_PATH" - netFaultSubPath = "cluster" - detectConf = "cathelper.conf" -) - -// GetRasNetRootPath get ras net fault detect root path from env -func GetRasNetRootPath() (string, error) { - rootPath := os.Getenv(rasNetRootPathKey) - if len(rootPath) == 0 { - return "", fmt.Errorf("env %s not exists, please config it before starting", rasNetRootPathKey) - } - if !utils.IsDir(rootPath) { - return "", fmt.Errorf("env %s=%s, which is not dir", rasNetRootPathKey, rootPath) - } - safeRootPath, err := utils.CheckPath(rootPath) - if err != nil { - return "", fmt.Errorf("env %s=%s, which is invalid, err: %v", rasNetRootPathKey, rootPath, err) - } - return safeRootPath, nil -} - -// GetPingListFilePath get ping list task info file for ping mesh -func GetPingListFilePath(superPodId, serverIndex string) (string, error) { - rootPath, err := GetRasNetRootPath() - if err != nil { - return "", err - } - return filepath.Join(rootPath, netFaultSubPath, fmt.Sprintf("super-pod-%s", superPodId), - fmt.Sprintf("ping_list_%s.json", serverIndex)), nil -} - -// GetSuperPodInfoFilePath get super pod info file path -func GetSuperPodInfoFilePath(superPodID, superPodPrefix string) (string, error) { - rootPath, err := GetRasNetRootPath() - if err != nil { - hwlog.RunLog.Errorf("get ras net root path failed, err : %v", err) - return "", err - } - superPodPathName := fmt.Sprintf("%s-%s", superPodPrefix, superPodID) - fileName := fmt.Sprintf("%s.json", superPodPathName) - filePath := filepath.Join(rootPath, netFaultSubPath, superPodPathName, fileName) - if _, errInfo := utils.CheckPath(filePath); errInfo != nil { - hwlog.RunLog.Errorf("file path is invalid, err: %v", errInfo) - return "", errInfo - } - return filePath, nil -} - -// GetConfigPathForDetect the config 
path for network fault detect so -func GetConfigPathForDetect(superPodId string) (string, error) { - rasNetRootPath, err := GetRasNetRootPath() - if err != nil { - hwlog.RunLog.Errorf("get ras net root path failed, err: %v", err) - return "", err - } - confPath := filepath.Join(rasNetRootPath, netFaultSubPath, fmt.Sprintf("super-pod-%s", superPodId), detectConf) - if _, errInfo := utils.CheckPath(confPath); errInfo != nil { - hwlog.RunLog.Errorf("file path is invalid, err: %v", errInfo) - return "", errInfo - } - return confPath, nil -} diff --git a/mind-cluster/component/ascend-common/api/superpoddevice.go b/mind-cluster/component/ascend-common/api/superpoddevice.go deleted file mode 100644 index 4039dcb..0000000 --- a/mind-cluster/component/ascend-common/api/superpoddevice.go +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. - -// Package api structs for SuperPodDevice -package api - -import "k8s.io/apimachinery/pkg/util/sets" - -// NpuBaseInfo is the base info of npu -type NpuBaseInfo struct { - IP string - SuperDeviceID uint32 -} - -// NodeDevice node device info -type NodeDevice struct { - NodeName string - ServerID string - ServerType string `json:"-"` - DeviceMap map[string]string // key: dev phyID, value: superPod device id -} - -// SuperPodDevice super node device info, key is superPodID, value is NodeDevice -type SuperPodDevice struct { - Version string - SuperPodID string - NodeDeviceMap map[string]*NodeDevice -} - -// SuperPodFaultInfos super pod fault info -type SuperPodFaultInfos struct { - SdIds []string - FaultNodes sets.String - NodeNames []string - FaultTimes int64 - JobId string `json:"JobId,omitempty"` -} diff --git a/mind-cluster/component/ascend-common/api/type.go b/mind-cluster/component/ascend-common/api/type.go deleted file mode 100644 index 9a2cde1..0000000 --- a/mind-cluster/component/ascend-common/api/type.go +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright (c) Huawei Technologies 
Co., Ltd. 2025-2025. All rights reserved. - -// Package api common const -package api - -// ResetCmInfo is the reset config info of a task -type ResetCmInfo struct { - RankList []*DevFaultnfo - UpdateTime int64 - RetryTime int - FaultFlushing bool - GracefulExit int - RestartFaultProcess bool -} - -// DevFaultnfo is the device info of a task -type DevFaultnfo struct { - RankId int - FaultInfo -} - -// FaultInfo is the fault info of device -type FaultInfo struct { - LogicId int32 - Status string - Policy string - InitialPolicy string - ErrorCode []int64 - ErrorCodeHex string -} diff --git a/mind-cluster/component/ascend-common/common-utils/cache/lrucache.go b/mind-cluster/component/ascend-common/common-utils/cache/lrucache.go deleted file mode 100644 index 0c0d420..0000000 --- a/mind-cluster/component/ascend-common/common-utils/cache/lrucache.go +++ /dev/null @@ -1,394 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package cache implement a memory-based LRU local cache -package cache - -import ( - "container/list" - "errors" - "fmt" - "math" - "sync" - "time" -) - -const ( - segmentCount = 16 - int64One int64 = 1 - int64Zero int64 = 0 - negInt64One int64 = -1 - intTwo = 2 - hashInit uint32 = 2166136261 - prime32 uint32 = 16777619 - twentyYears time.Duration = 20 * 365 * 24 * time.Hour -) - -var ( - notInitErr = errors.New("not initializes") - paraErr = errors.New("parameter error") -) - -type cacheEle struct { - key string - data interface{} - expireTime int64 -} - -type lruCache struct { - maxSize int - elemIndex map[string]*list.Element - *list.List - mu sync.Mutex -} - -// ConcurrencyLRUCache is a memory-based LRU local cache, default total 16 segment to improve concurrent performance -// LRU is not real least recently used for the total cache,but just for each buket -// we just need a proper method to clear cache -type ConcurrencyLRUCache struct { - segment int - cacheBuket [segmentCount]*lruCache -} - -// Set create or update an element using key -// key: The identity of an element -// value: new value of the element -// expireTime: expire time, positive int64 or -1 which means never overdue -func (cl *ConcurrencyLRUCache) Set(key string, value interface{}, expireTime time.Duration) error { - if cl == nil || cl.cacheBuket[0] == nil { - return notInitErr - } - if expireTime < time.Duration(negInt64One) || expireTime > twentyYears { - return paraErr - } - cacheIndex := cl.index(key) - if cacheIndex < 0 || cacheIndex >= segmentCount { - return errors.New("index out of valid value") - } - return cl.cacheBuket[cacheIndex].setValue(key, value, expireTime) -} - -// Get get the value of a cached element by key. 
If key do not exist, this function will return nil and an error msg -// key: The identity of an element -// return: -// value: the cached value, nil if key do not exist -// err: error info, nil if value is not nil -func (cl *ConcurrencyLRUCache) Get(key string) (interface{}, error) { - if cl == nil || cl.cacheBuket[0] == nil { - return nil, notInitErr - } - cacheIndex := cl.index(key) - if cacheIndex < 0 || cacheIndex >= segmentCount { - return nil, errors.New("index out of valid value") - } - return cl.cacheBuket[cacheIndex].getValue(key) -} - -// Delete delete the value by key, no error returned -func (cl *ConcurrencyLRUCache) Delete(key string) { - if cl == nil || cl.cacheBuket[0] == nil { - return - } - cacheIndex := cl.index(key) - if cacheIndex < 0 || cacheIndex >= segmentCount { - return - } - cl.cacheBuket[cacheIndex].delValue(key) -} - -// SetIfNX if the key not exist or expired, will set the new value to cache and return true ,otherwise return false -func (cl *ConcurrencyLRUCache) SetIfNX(key string, value interface{}, expireTime time.Duration) bool { - if cl == nil || cl.cacheBuket[0] == nil { - return false - } - if expireTime < time.Duration(negInt64One) || expireTime > twentyYears { - return false - } - cacheIndex := cl.index(key) - if cacheIndex < 0 || cacheIndex >= segmentCount { - return false - } - return cl.cacheBuket[cacheIndex].setIfNotExist(key, value, expireTime) -} - -// INCR add one to the value(must int64) of the key , if the key not exist, initialize with 0 and then add one -func (cl *ConcurrencyLRUCache) INCR(key string, expireTime time.Duration) (int64, error) { - if err := validate(cl, expireTime); err != nil { - return 0, err - } - cacheIndex := cl.index(key) - if cacheIndex < 0 || cacheIndex >= segmentCount { - return 0, errors.New("index out of valid value") - } - return cl.cacheBuket[cacheIndex].increment(key, expireTime) -} - -// DECR minus one to the value(must int64) of the key,if the key not exist, initialize with 0 and then 
minus one -func (cl *ConcurrencyLRUCache) DECR(key string, expireTime time.Duration) (int64, error) { - if err := validate(cl, expireTime); err != nil { - return 0, err - } - cacheIndex := cl.index(key) - if cacheIndex < 0 || cacheIndex >= segmentCount { - return 0, errors.New("index out of valid value") - } - return cl.cacheBuket[cacheIndex].decrement(key, expireTime) -} - -func validate(cl *ConcurrencyLRUCache, expireTime time.Duration) error { - if cl == nil || cl.cacheBuket[0] == nil { - return paraErr - } - if expireTime <= 0 && expireTime != time.Duration(negInt64One) { - return paraErr - } - return nil -} - -// index calculate the key hashcode and index the right buket -func (cl *ConcurrencyLRUCache) index(key string) int { - var hash = hashInit - for i := 0; i < len(key); i++ { - hash *= prime32 - hash ^= uint32(key[i]) - } - return int(hash & (uint32(cl.segment) - 1)) -} - -// New create an instance of ConcurrencyLRUCache -// maxEntries the cache size, will to convert to (n/16+n%16>0?1:0)*16 -func New(maxEntries int) *ConcurrencyLRUCache { - if maxEntries <= 0 { - return nil - } - size := maxEntries / segmentCount - remain := maxEntries % segmentCount - if remain > 0 { - size += 1 - } - var cache [segmentCount]*lruCache - for i := 0; i < segmentCount; i++ { - cache[i] = &lruCache{ - maxSize: size, - elemIndex: make(map[string]*list.Element, segmentCount), - List: list.New(), - mu: sync.Mutex{}, - } - } - return &ConcurrencyLRUCache{ - segment: segmentCount, - cacheBuket: cache, - } -} - -func (c *lruCache) setValue(key string, value interface{}, expireTime time.Duration) error { - if c == nil || c.elemIndex == nil { - return errors.New("not initializes") - } - c.mu.Lock() - defer c.mu.Unlock() - v, ok := c.elemIndex[key] - if !ok { - // if the cache not exist - c.setInner(key, value, expireTime) - return nil - } - ele, ok := v.Value.(*cacheEle) - if !ok { - c.safeDeleteByKey(key, v) - return errors.New("cacheElement convert failed") - } - c.MoveToFront(v) 
- pkgElement(ele, value, expireTime) - return nil -} - -func pkgElement(ele *cacheEle, value interface{}, expireTime time.Duration) { - ele.data = value - if expireTime == time.Duration(negInt64One) { - ele.expireTime = negInt64One - return - } - ele.expireTime = time.Now().UnixNano() + int64(expireTime) -} - -func (c *lruCache) getValue(key string) (interface{}, error) { - if c == nil || c.elemIndex == nil { - return nil, errors.New("not initializes") - } - c.mu.Lock() - defer c.mu.Unlock() - v, ok := c.elemIndex[key] - if !ok { - return nil, errors.New("no value found") - } - c.MoveToFront(v) - ele, ok := v.Value.(*cacheEle) - if !ok { - c.safeDeleteByKey(key, v) - return nil, errors.New("cacheElement convert failed") - } - if ele.expireTime != negInt64One && time.Now().UnixNano() > ele.expireTime { - // if cache expired - c.safeDeleteByKey(key, v) - return nil, errors.New("the key was expired") - } - return ele.data, nil -} - -// Delete delete an element -func (c *lruCache) delValue(key string) { - if c == nil || c.elemIndex == nil { - return - } - c.mu.Lock() - defer c.mu.Unlock() - if v, ok := c.elemIndex[key]; ok { - c.safeDeleteByKey(key, v) - } -} - -func (c *lruCache) setIfNotExist(key string, value interface{}, expireTime time.Duration) bool { - if c == nil || c.elemIndex == nil { - return false - } - c.mu.Lock() - defer c.mu.Unlock() - v, ok := c.elemIndex[key] - if !ok { - // if the cache not exist - c.setInner(key, value, expireTime) - return true - } - ele, ok := v.Value.(*cacheEle) - if !ok { - c.safeDeleteByKey(key, v) - return false - } - c.MoveToFront(v) - if ele.expireTime == negInt64One || time.Now().UnixNano() < ele.expireTime { - return false - } - // if cache expired - pkgElement(ele, value, expireTime) - return true -} - -func (c *lruCache) increment(key string, expireTime time.Duration) (int64, error) { - if c == nil || c.elemIndex == nil { - return 0, notInitErr - } - c.mu.Lock() - defer c.mu.Unlock() - v, ok := c.elemIndex[key] - if !ok { 
- c.setInner(key, int64One, expireTime) - return int64One, nil - } - ele, ok := v.Value.(*cacheEle) - if !ok { - c.safeDeleteByKey(key, v) - c.setInner(key, int64One, expireTime) - return int64One, nil - } - c.MoveToFront(v) - if ele.expireTime == negInt64One || time.Now().UnixNano() < ele.expireTime { - newValue, ok := ele.data.(int64) - if !ok || newValue == math.MaxInt64 { - return 0, fmt.Errorf("the cache value is not valid, ok:%v", ok) - } - newValue++ - pkgElement(ele, newValue, expireTime) - return newValue, nil - } - // if cache expired - pkgElement(ele, int64One, expireTime) - return int64One, nil -} - -func (c *lruCache) decrement(key string, expireTime time.Duration) (int64, error) { - if c == nil || c.elemIndex == nil { - return 0, notInitErr - } - c.mu.Lock() - defer c.mu.Unlock() - v, ok := c.elemIndex[key] - if !ok { - // if the cache not exist - c.setInner(key, negInt64One, expireTime) - return negInt64One, nil - } - ele, ok := v.Value.(*cacheEle) - if !ok { - c.safeDeleteByKey(key, v) - c.setInner(key, negInt64One, expireTime) - return negInt64One, nil - } - c.MoveToFront(v) - if ele.expireTime == negInt64One || time.Now().UnixNano() < ele.expireTime { - newValue, ok := ele.data.(int64) - if !ok || newValue == math.MinInt64 { - return 0, fmt.Errorf("the cache value is not valid, ok:%v", ok) - } - newValue-- - pkgElement(ele, newValue, expireTime) - return newValue, nil - } - // if cache expired - pkgElement(ele, negInt64One, expireTime) - return negInt64One, nil -} - -func (c *lruCache) setInner(key string, value interface{}, expireTime time.Duration) { - if c == nil { - return - } - if c.Len()+1 > c.maxSize { - c.safeRemoveOldest() - } - newElem := &cacheEle{ - key: key, - data: value, - expireTime: negInt64One, - } - if expireTime != time.Duration(negInt64One) { - newElem.expireTime = time.Now().UnixNano() + int64(expireTime) - } - e := c.PushFront(newElem) - c.elemIndex[key] = e -} - -func (c *lruCache) safeDeleteByKey(key string, v 
*list.Element) { - if c == nil { - return - } - c.List.Remove(v) - delete(c.elemIndex, key) -} - -func (c *lruCache) safeRemoveOldest() { - if c == nil { - return - } - v := c.List.Back() - if v == nil { - return - } - c.List.Remove(v) - ele, ok := v.Value.(*cacheEle) - if !ok { - return - } - delete(c.elemIndex, ele.key) -} diff --git a/mind-cluster/component/ascend-common/common-utils/cache/lrucache_test.go b/mind-cluster/component/ascend-common/common-utils/cache/lrucache_test.go deleted file mode 100644 index a8b5ea0..0000000 --- a/mind-cluster/component/ascend-common/common-utils/cache/lrucache_test.go +++ /dev/null @@ -1,304 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package cache implement a memory-based LRU local cache -package cache - -import ( - "container/list" - "fmt" - "math" - "sync" - "testing" - "time" - - "github.com/smartystreets/goconvey/convey" -) - -const ( - cacheTime = 500 - goRoutineCount = 10 -) - -func TestSet(t *testing.T) { - cache := New(1) - convey.Convey("test lru cacheTime", t, func() { - cache.Set("testkey1", "1", cacheTime*time.Millisecond) - v, err := cache.Get("testkey1") - convey.So(err, convey.ShouldEqual, nil) - convey.So(v, convey.ShouldEqual, "1") - time.Sleep(cacheTime * time.Millisecond) - v, err = cache.Get("testkey1") - convey.So(v, convey.ShouldEqual, nil) - convey.So(err, convey.ShouldNotEqual, nil) - }) - convey.Convey("test set twice", t, func() { - cache.Set("testkey1", "1", time.Minute) - v, err := cache.Get("testkey1") - convey.So(err, convey.ShouldEqual, nil) - convey.So(v, convey.ShouldEqual, "1") - cache.Set("testkey1", "2", time.Minute) - v, err = cache.Get("testkey1") - convey.So(err, convey.ShouldEqual, nil) - convey.So(v, convey.ShouldEqual, "2") - }) - convey.Convey("SET failed", t, func() { - c := &lruCache{} - err := c.setValue("test", "1", time.Minute) - convey.So(err.Error(), convey.ShouldEqual, "not initializes") - _, err = c.getValue("test") - convey.So(err.Error(), convey.ShouldEqual, "not initializes") - }) - convey.Convey("SET not expired", t, func() { - cache.Set("testkey2", "1", time.Second) - err := cache.Set("testkey2", "1", time.Duration(negInt64One)) - convey.So(err, convey.ShouldEqual, nil) - v, err := cache.Get("testkey2") - convey.So(err, convey.ShouldEqual, nil) - convey.So(v, convey.ShouldEqual, "1") - }) - convey.Convey("SET parameter error", t, func() { - err := cache.Set("testkey2", "1", -time.Second) - convey.So(err.Error(), convey.ShouldEqual, "parameter error") - }) -} - -func TestDelete(t *testing.T) { - cache := New(1) - convey.Convey("test lru delete", t, func() { - cache.Set("testkey1", "1", time.Minute) - v, err := 
cache.Get("testkey1") - convey.So(err, convey.ShouldEqual, nil) - convey.So(v, convey.ShouldEqual, "1") - cache.Delete("testkey1") - v, err = cache.Get("testkey1") - convey.So(v, convey.ShouldEqual, nil) - convey.So(err, convey.ShouldNotEqual, nil) - }) - convey.Convey("Delete no thing", t, func() { - c := &lruCache{} - c.delValue("test") - }) -} - -func TestSetIfNX(t *testing.T) { - cache := New(1) - convey.Convey("SetIfNX set parameter error", t, func() { - r := cache.SetIfNX("testkey1", "1", -time.Millisecond) - convey.So(r, convey.ShouldEqual, false) - }) - convey.Convey("SetIfNX set success", t, func() { - r := cache.SetIfNX("testkey1", "1", cacheTime*time.Millisecond) - convey.So(r, convey.ShouldEqual, true) - }) - convey.Convey("SetIfNX set success failed", t, func() { - r := cache.SetIfNX("testkey1", "1", cacheTime*time.Millisecond) - convey.So(r, convey.ShouldEqual, false) - }) - time.Sleep(cacheTime * time.Millisecond) - convey.Convey("SetIfNX set success", t, func() { - r := cache.SetIfNX("testkey1", "1", time.Second) - convey.So(r, convey.ShouldEqual, true) - }) - convey.Convey("SetIfNX expireTime -1", t, func() { - r := cache.SetIfNX("testkey", "1", time.Duration(negInt64One)) - convey.So(r, convey.ShouldEqual, true) - r = cache.SetIfNX("testkey", "1", time.Duration(negInt64One)) - convey.So(r, convey.ShouldEqual, false) - }) - -} - -func TestSetIfNXConcurrencyTest(t *testing.T) { - cache := New(1) - convey.Convey("SetIfNX concurrency test", t, func() { - var count = 0 - count = testSetIfNX(cache, count) - convey.So(count, convey.ShouldEqual, 1) - }) -} - -func testSetIfNX(cache *ConcurrencyLRUCache, count int) int { - l := sync.Mutex{} - wg := sync.WaitGroup{} - wg.Add(goRoutineCount) - for i := 0; i < goRoutineCount; i++ { - go func() { - r := cache.SetIfNX("testkey2", "1", time.Second) - if r { - l.Lock() - count++ - l.Unlock() - } - wg.Done() - }() - } - wg.Wait() - return count -} - -func TestINCRConcurrencyTest(t *testing.T) { - cache := New(1) - 
convey.Convey("INCR concurrency test", t, func() { - max := testIncr(cache) - convey.So(max, convey.ShouldEqual, goRoutineCount) - }) -} - -func testIncr(cache *ConcurrencyLRUCache) int64 { - var max = int64Zero - l := sync.Mutex{} - wg := sync.WaitGroup{} - wg.Add(goRoutineCount) - for i := 0; i < goRoutineCount; i++ { - go func() { - r, err := cache.INCR("testkey1", time.Second) - if err != nil { - return - } - l.Lock() - if r > max { - max = r - } - l.Unlock() - wg.Done() - }() - } - wg.Wait() - return max -} - -func TestDECRConcurrencyTest(t *testing.T) { - cache := New(1) - cache.Set("testkey1", int64(goRoutineCount), time.Minute) - convey.Convey("INCR concurrency test", t, func() { - min := testDecr(cache) - convey.So(min, convey.ShouldEqual, 0) - }) -} - -func testDecr(cache *ConcurrencyLRUCache) int64 { - var min = int64(math.MaxInt) - l := sync.Mutex{} - wg := sync.WaitGroup{} - wg.Add(goRoutineCount) - for i := 0; i < goRoutineCount; i++ { - go func() { - r, err := cache.DECR("testkey1", time.Second) - if err != nil { - return - } - l.Lock() - if r < min { - min = r - } - l.Unlock() - wg.Done() - }() - } - wg.Wait() - return min -} - -func TestINCR(t *testing.T) { - cache := New(1) - convey.Convey("not initializes", t, func() { - c := &lruCache{} - _, err := c.increment("test", time.Minute) - convey.So(err, convey.ShouldEqual, notInitErr) - }) - convey.Convey("parameter error", t, func() { - _, err := cache.INCR("testkey", -time.Minute) - convey.So(err, convey.ShouldEqual, paraErr) - }) - convey.Convey("INCR success", t, func() { - r, err := cache.INCR("testkey", time.Minute) - convey.So(r, convey.ShouldEqual, 1) - convey.So(err, convey.ShouldEqual, nil) - r, err = cache.INCR("testkey", time.Minute) - convey.So(r, convey.ShouldEqual, intTwo) - }) - - convey.Convey("INCR success when exits", t, func() { - cache.Set("testkey1", int64Zero, cacheTime*time.Millisecond) - r, err := cache.INCR("testkey1", cacheTime*time.Millisecond) - convey.So(r, 
convey.ShouldEqual, 1) - convey.So(err, convey.ShouldEqual, nil) - time.Sleep(cacheTime * time.Millisecond) - r, err = cache.INCR("testkey1", time.Minute) - convey.So(r, convey.ShouldEqual, 1) - }) -} - -func TestDECR(t *testing.T) { - cache := New(1) - convey.Convey("not initializes", t, func() { - c := &lruCache{} - _, err := c.decrement("test", time.Minute) - convey.So(err, convey.ShouldEqual, notInitErr) - }) - convey.Convey("parameter error", t, func() { - _, err := cache.DECR("testkey1", -time.Minute) - convey.So(err, convey.ShouldEqual, paraErr) - }) - convey.Convey("SetIfNX set success", t, func() { - r, err := cache.DECR("testkey1", time.Minute) - convey.So(r, convey.ShouldEqual, negInt64One) - convey.So(err, convey.ShouldEqual, nil) - cache.Set("testkey1", int64One, time.Minute) - r, err = cache.DECR("testkey1", time.Minute) - convey.So(r, convey.ShouldEqual, 0) - convey.So(err, convey.ShouldEqual, nil) - }) - convey.Convey("Decr success when exits", t, func() { - cache.Set("testkey2", int64One, cacheTime*time.Millisecond) - r, err := cache.DECR("testkey2", cacheTime*time.Millisecond) - convey.So(r, convey.ShouldEqual, 0) - convey.So(err, convey.ShouldEqual, nil) - time.Sleep(cacheTime * time.Millisecond) - r, err = cache.DECR("testkey2", time.Minute) - convey.So(err, convey.ShouldEqual, nil) - convey.So(r, convey.ShouldEqual, negInt64One) - }) -} - -func TestLRU(t *testing.T) { - convey.Convey("not initializes", t, func() { - c := &lruCache{ - maxSize: intTwo, - elemIndex: make(map[string]*list.Element, segmentCount), - List: list.New(), - mu: sync.Mutex{}, - } - c.setValue("test", "1", time.Minute) - c.setValue("test1", "1", time.Minute) - c.setValue("test2", "1", time.Minute) - _, err := c.getValue("test") - convey.So(err.Error(), convey.ShouldEqual, "no value found") - }) -} - -func BenchmarkSetIfNx(b *testing.B) { - cache := New(1) - for n := 0; n < b.N; n++ { - cache.SetIfNX(fmt.Sprintf("key%d", n), "xx", time.Second) - } -} - -func BenchmarkINCR(b 
*testing.B) { - cache := New(1) - for n := 0; n < b.N; n++ { - cache.INCR("sdds", time.Second) - } -} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/api.go b/mind-cluster/component/ascend-common/common-utils/hwlog/api.go deleted file mode 100644 index 65de3e7..0000000 --- a/mind-cluster/component/ascend-common/common-utils/hwlog/api.go +++ /dev/null @@ -1,310 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package hwlog provides the capability of processing Huawei log rules. 
-package hwlog - -import ( - "context" - "fmt" - "io" - "log" - "os" - "path" -) - -const ( - logDebugLv = iota - 1 - logInfoLv - logWarnLv - logErrorLv - logCriticalLv -) - -type logger struct { - lgDebug *log.Logger - lgInfo *log.Logger - lgWarn *log.Logger - lgError *log.Logger - lgCritical *log.Logger - lgCtrl *LogLimiter - lgLevel int - lgMaxLine int -} - -func (lg *logger) initLogWriter(w io.Writer) { - lg.lgDebug = log.New(w, "[DEBUG] ", log.Ldate|log.Lmicroseconds) - lg.lgInfo = log.New(w, "[INFO] ", log.Ldate|log.Lmicroseconds) - lg.lgWarn = log.New(w, "[WARN] ", log.Ldate|log.Lmicroseconds) - lg.lgError = log.New(w, "[ERROR] ", log.Ldate|log.Lmicroseconds) - lg.lgCritical = log.New(w, "[Critical] ", log.Ldate|log.Lmicroseconds) -} - -func (lg *logger) setLoggerLevel(lv int) { - if lv < minLogLevel || lv > maxLogLevel { - lg.lgLevel = 0 - return - } - lg.lgLevel = lv -} - -func (lg *logger) setLoggerMaxLine(lml int) { - if lml <= 0 || lml > maxEachLineLen { - lg.lgMaxLine = defaultMaxEachLineLen - return - } - lg.lgMaxLine = lml -} - -func (lg *logger) setLoggerWriter(config *LogConfig) { - rollLogger := &Logs{ - FileName: config.LogFileName, - Capacity: config.FileMaxSize, // megabytes - SaveVolume: config.MaxBackups, - SaveTime: config.MaxAge, // days - } - logWriter := &LogLimiter{ - Logs: rollLogger, - ExpiredTime: config.ExpiredTime, // seconds - CacheSize: config.CacheSize, - } - if config.OnlyToStdout { - lg.initLogWriter(os.Stdout) - return - } - if config.OnlyToFile { - lg.initLogWriter(logWriter) - return - } - writer := io.MultiWriter(os.Stdout, logWriter) - lg.initLogWriter(writer) - lg.lgCtrl = logWriter -} - -func (lg *logger) setLogger(config *LogConfig) error { - if err := validateLogConfigFiled(config); err != nil { - return err - } - lg.setLoggerWriter(config) - lg.setLoggerLevel(config.LogLevel) - lg.setLoggerMaxLine(config.MaxLineLength) - msg := fmt.Sprintf("%s's logger init success", path.Base(config.LogFileName)) - // skip change 
file mode and fs notify - if config.OnlyToStdout { - msg = fmt.Sprintf("%s, only to stdout", msg) - return nil - } - lg.Info(msg) - if err := os.Chmod(config.LogFileName, LogFileMode); err != nil { - lg.Errorf("change file mode failed: %v", err) - return fmt.Errorf("set log file mode failed") - } - return nil -} - -func (lg *logger) isInit() bool { - return lg.lgDebug != nil && lg.lgInfo != nil && lg.lgWarn != nil && lg.lgError != nil && lg.lgCritical != nil -} - -// Debug record debug not format -func (lg *logger) Debug(args ...interface{}) { - lg.DebugWithCtx(nil, args...) -} - -// Debugf record debug -func (lg *logger) Debugf(format string, args ...interface{}) { - lg.DebugfWithCtx(nil, format, args...) -} - -// DebugWithCtx record Debug not format -func (lg *logger) DebugWithCtx(ctx context.Context, args ...interface{}) { - if lg.lgLevel > logDebugLv { - return - } - if lg.validate() { - printHelper(lg.lgDebug, fmt.Sprint(args...), lg.lgMaxLine, ctx) - } -} - -// DebugfWithCtx record Debug format -func (lg *logger) DebugfWithCtx(ctx context.Context, format string, args ...interface{}) { - if lg.lgLevel > logDebugLv { - return - } - if lg.validate() { - printHelper(lg.lgDebug, fmt.Sprintf(format, args...), lg.lgMaxLine, ctx) - } -} - -// Info record info not format -func (lg *logger) Info(args ...interface{}) { - lg.InfoWithCtx(nil, args...) -} - -// Infof record info -func (lg *logger) Infof(format string, args ...interface{}) { - lg.InfofWithCtx(nil, format, args...) 
-} - -// InfoWithCtx record Info not format with context, if you have no ctx, please use the method with not ctx -func (lg *logger) InfoWithCtx(ctx context.Context, args ...interface{}) { - if lg.lgLevel > logInfoLv { - return - } - if lg.validate() { - printHelper(lg.lgInfo, fmt.Sprint(args...), lg.lgMaxLine, ctx) - } -} - -// InfofWithCtx record Info format with context, if you have no ctx, please use the method with not ctx -func (lg *logger) InfofWithCtx(ctx context.Context, format string, args ...interface{}) { - if lg.lgLevel > logInfoLv { - return - } - if lg.validate() { - printHelper(lg.lgInfo, fmt.Sprintf(format, args...), lg.lgMaxLine, ctx) - } -} - -// Warn record warn not format -func (lg *logger) Warn(args ...interface{}) { - lg.WarnWithCtx(nil, args...) -} - -// Warnf record warn -func (lg *logger) Warnf(format string, args ...interface{}) { - lg.WarnfWithCtx(nil, format, args...) -} - -// WarnWithCtx record Warn not format with context, if you have no ctx, please use the method with not ctx -func (lg *logger) WarnWithCtx(ctx context.Context, args ...interface{}) { - if lg.lgLevel > logWarnLv { - return - } - if lg.validate() { - printHelper(lg.lgWarn, fmt.Sprint(args...), lg.lgMaxLine, ctx) - } -} - -// WarnfWithCtx record Warn format with context, if you have no ctx, please use the method with not ctx -func (lg *logger) WarnfWithCtx(ctx context.Context, format string, args ...interface{}) { - if lg.lgLevel > logWarnLv { - return - } - if lg.validate() { - printHelper(lg.lgWarn, fmt.Sprintf(format, args...), lg.lgMaxLine, ctx) - } -} - -// WarnfWithLimit record warn for default times (default 3),domain is for logType of msg, -// id is a unique identifier of this logType, you can reset the counter by call ResetErrCnt -func (lg *logger) WarnfWithLimit(domain string, id interface{}, format string, args ...interface{}) { - if needPrint, extraErrLog := IsNeedPrintWithSpecifiedCounts(domain, id, ProblemOccurMaxNumbers); needPrint { - format = 
fmt.Sprintf("%s %s", format, extraErrLog) - lg.WarnfWithCtx(nil, format, args...) - } -} - -// Error record error not format -func (lg *logger) Error(args ...interface{}) { - lg.ErrorWithCtx(nil, args...) -} - -// Errorf record error -func (lg *logger) Errorf(format string, args ...interface{}) { - lg.ErrorfWithCtx(nil, format, args...) -} - -// ErrorfWithLimit record error for default times (default 3),domain is for logType of msg, -// id is a unique identifier of this logType, you can reset the counter by call ResetErrCnt -func (lg *logger) ErrorfWithLimit(domain string, id interface{}, format string, args ...interface{}) { - if needPrint, extraErrLog := IsNeedPrintWithSpecifiedCounts(domain, id, ProblemOccurMaxNumbers); needPrint { - format = fmt.Sprintf("%s %s", format, extraErrLog) - lg.ErrorfWithCtx(nil, format, args...) - } -} - -// ErrorfWithSpecifiedCounts record error for specified times,domain is for logType of msg, -// id is a unique identifier of this logType,maxCounts is for max print counts, -// you can reset the counter by call ResetErrCnt -func (lg *logger) ErrorfWithSpecifiedCounts(domain string, id interface{}, maxCounts int, - format string, args ...interface{}) { - if needPrint, extraErrLog := IsNeedPrintWithSpecifiedCounts(domain, id, maxCounts); needPrint { - format = fmt.Sprintf("%s %s", format, extraErrLog) - lg.ErrorfWithCtx(nil, format, args...) 
- } -} - -// ErrorWithCtx record Error not format with context, if you have no ctx, please use the method with not ctx -func (lg *logger) ErrorWithCtx(ctx context.Context, args ...interface{}) { - if lg.lgLevel > logErrorLv { - return - } - if lg.validate() { - printHelper(lg.lgError, fmt.Sprint(args...), lg.lgMaxLine, ctx) - } -} - -// ErrorfWithCtx record Error format with context, if you have no ctx, please use the method with not ctx -func (lg *logger) ErrorfWithCtx(ctx context.Context, format string, args ...interface{}) { - if lg.lgLevel > logErrorLv { - return - } - if lg.validate() { - printHelper(lg.lgError, fmt.Sprintf(format, args...), lg.lgMaxLine, ctx) - } -} - -// Critical record critical not format -func (lg *logger) Critical(args ...interface{}) { - lg.CriticalWithCtx(nil, args...) -} - -// Criticalf record Critical log format -func (lg *logger) Criticalf(format string, args ...interface{}) { - lg.CriticalfWithCtx(nil, format, args...) -} - -// CriticalWithCtx record Critical not format with context, if you have no ctx, please use the method with not ctx -func (lg *logger) CriticalWithCtx(ctx context.Context, args ...interface{}) { - if lg.lgLevel > logCriticalLv { - return - } - if lg.validate() { - printHelper(lg.lgCritical, fmt.Sprint(args...), lg.lgMaxLine, ctx) - } -} - -// CriticalfWithCtx record Critical format with context, if you have no ctx, please use the method with not ctx -func (lg *logger) CriticalfWithCtx(ctx context.Context, format string, args ...interface{}) { - if lg.lgLevel > logCriticalLv { - return - } - if lg.validate() { - printHelper(lg.lgCritical, fmt.Sprintf(format, args...), lg.lgMaxLine, ctx) - } -} - -func (lg *logger) validate() bool { - if lg == nil || !lg.isInit() { - fmt.Println("Fatal function's logger is nil") - return false - } - return true -} - -// FlushMem writes the contents of the memory to the disk -func (lg *logger) FlushMem() error { - return lg.lgCtrl.Flush() -} diff --git 
a/mind-cluster/component/ascend-common/common-utils/hwlog/api_test.go b/mind-cluster/component/ascend-common/common-utils/hwlog/api_test.go deleted file mode 100644 index ecdcef6..0000000 --- a/mind-cluster/component/ascend-common/common-utils/hwlog/api_test.go +++ /dev/null @@ -1,165 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package hwlog test file -package hwlog - -import ( - "fmt" - "io/fs" - "os" - "path" - "path/filepath" - "testing" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" - - "ascend-common/common-utils/utils" -) - -func TestNewLogger(t *testing.T) { - convey.Convey("test api", t, func() { - convey.Convey("test setLogger func", func() { - lgConfig := &LogConfig{ - OnlyToStdout: true, - } - lg := new(logger) - err := lg.setLogger(lgConfig) - convey.So(err, convey.ShouldBeNil) - // test for log file - mockPathCheck := gomonkey.ApplyFunc(utils.CheckPath, func(_ string) (string, error) { - return "", nil - }) - mockMkdir := gomonkey.ApplyFunc(os.Chmod, func(_ string, _ fs.FileMode) error { - return nil - }) - defer mockPathCheck.Reset() - defer mockMkdir.Reset() - lgConfig = &LogConfig{ - LogFileName: path.Join(filepath.Dir(os.Args[0]), "t.log"), - OnlyToFile: true, - MaxBackups: DefaultMaxBackups, - MaxAge: DefaultMinSaveAge, - CacheSize: DefaultCacheSize, - ExpiredTime: DefaultExpiredTime, - } - err = lg.setLogger(lgConfig) - 
convey.So(err, convey.ShouldBeNil) - }) - }) -} - -func TestLoggerPrint(t *testing.T) { - convey.Convey("test api", t, func() { - convey.Convey("test logger print func", func() { - lgConfig := &LogConfig{ - OnlyToStdout: true, - LogLevel: -1, - } - lg := new(logger) - err := lg.setLogger(lgConfig) - convey.So(err, convey.ShouldBeNil) - lg.Debug("test debug") - lg.Debugf("test debugf") - lg.Info("test info") - lg.Infof("test infof") - lg.Warn("test warn") - lg.Warnf("test warnf") - lg.Error("test error") - lg.Errorf("test errorf") - lg.Critical("test critical") - lg.Criticalf("test criticalf") - lg.setLoggerLevel(maxLogLevel + 1) - lg.Debug("test debug") - lg.Debugf("test debugf") - lg.Info("test info") - lg.Infof("test infof") - lg.Warn("test warn") - lg.Warnf("test warnf") - lg.Error("test error") - lg.Errorf("test errorf") - lg.Critical("test critical") - lg.Criticalf("test criticalf") - }) - }) -} -func TestLoggerPrintWithLimit(t *testing.T) { - convey.Convey("test api", t, func() { - convey.Convey("test logger print func with limit", func() { - lgConfig := &LogConfig{ - OnlyToStdout: true, - LogLevel: -1, - } - lg := new(logger) - err := lg.setLogger(lgConfig) - convey.So(err, convey.ShouldBeNil) - domain := "hccs" - logicId := 1 - - errFormat := "collect failed ,err:%v" - collectErr := fmt.Errorf("detail errs,logicId(%d)", logicId) - lg.ErrorfWithLimit(domain, logicId, errFormat, collectErr) - lg.ErrorfWithLimit(domain, logicId, errFormat, collectErr) - lg.ErrorfWithLimit(domain, logicId, errFormat, collectErr) - lg.ErrorfWithLimit(domain, logicId, errFormat, collectErr) - ResetErrCnt(domain, logicId) - lg.ErrorfWithLimit(domain, logicId, errFormat, collectErr) - lg.ErrorfWithLimit(domain, logicId, errFormat, collectErr) - }) - }) -} - -func TestWarnfWithLimit(t *testing.T) { - convey.Convey("test api", t, func() { - convey.Convey("test warn logger print func with limit", func() { - lgConfig := &LogConfig{ - OnlyToStdout: true, - LogLevel: -1, - } - lg := 
new(logger) - err := lg.setLogger(lgConfig) - convey.So(err, convey.ShouldBeNil) - domain := "hccs" - logicId := 1 - - errFormat := "collect failed ,err:%v" - collectErr := fmt.Errorf("detail errs,logicId(%d)", logicId) - lg.WarnfWithLimit(domain, logicId, errFormat, collectErr) - lg.WarnfWithLimit(domain, logicId, errFormat, collectErr) - lg.WarnfWithLimit(domain, logicId, errFormat, collectErr) - lg.WarnfWithLimit(domain, logicId, errFormat, collectErr) - ResetErrCnt(domain, logicId) - lg.WarnfWithLimit(domain, logicId, errFormat, collectErr) - lg.WarnfWithLimit(domain, logicId, errFormat, collectErr) - }) - }) -} - -func TestValidate(t *testing.T) { - convey.Convey("test api", t, func() { - convey.Convey("test validate", func() { - lg := new(logger) - res := lg.validate() - convey.So(res, convey.ShouldBeFalse) - lgConfig := &LogConfig{ - OnlyToStdout: true, - } - err := lg.setLogger(lgConfig) - convey.So(err, convey.ShouldBeNil) - res = lg.validate() - convey.So(res, convey.ShouldBeTrue) - }) - }) -} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/hwlog_adaptor.go b/mind-cluster/component/ascend-common/common-utils/hwlog/hwlog_adaptor.go deleted file mode 100644 index 5e5c567..0000000 --- a/mind-cluster/component/ascend-common/common-utils/hwlog/hwlog_adaptor.go +++ /dev/null @@ -1,174 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package hwlog provides the capability of processing Huawei log rules. -package hwlog - -import ( - "context" - "errors" -) - -// RunLog run logger -var RunLog *logger - -// InitRunLogger initialize run logger -func InitRunLogger(config *LogConfig, ctx context.Context) error { - if config == nil { - return errors.New("run logger config is nil") - } - if RunLog != nil && RunLog.isInit() { - RunLog.Warn("run logger is been initialized") - return nil - } - RunLog = new(logger) - if RunLog == nil { - return errors.New("malloc new logger flied") - } - if err := RunLog.setLogger(config); err != nil { - return err - } - if !RunLog.isInit() { - return errors.New("run logger init failed") - } - return nil -} - -// OpLog operate logger -var OpLog *logger - -// InitOperateLogger initialize operate logger -func InitOperateLogger(config *LogConfig, ctx context.Context) error { - if config == nil { - return errors.New("operate logger config is nil") - } - if OpLog != nil && OpLog.isInit() { - OpLog.Warn("operate logger is been initialized") - return nil - } - OpLog = new(logger) - if OpLog == nil { - return errors.New("malloc new logger flied") - } - if err := OpLog.setLogger(config); err != nil { - return err - } - if !OpLog.isInit() { - return errors.New("operate logger init failed") - } - return nil -} - -// SecLog security logger -var SecLog *logger - -// InitSecurityLogger initialize security logger -func InitSecurityLogger(config *LogConfig, ctx context.Context) error { - if config == nil { - return errors.New("security logger config is nil") - } - if SecLog != nil && SecLog.isInit() { - SecLog.Warn("security logger is been initialized") - return nil - } - SecLog = new(logger) - if SecLog == nil { - return errors.New("malloc new logger flied") - } - if err := SecLog.setLogger(config); err != nil { - return err - } - if !SecLog.isInit() { - return errors.New("security logger init failed") - } - return nil -} - -// UserLog user logger -var UserLog *logger - -// 
InitUserLogger initialize user logger -func InitUserLogger(config *LogConfig, ctx context.Context) error { - if config == nil { - return errors.New("user logger config is nil") - } - if UserLog != nil && UserLog.isInit() { - UserLog.Warn("user logger is been initialized") - return nil - } - UserLog = new(logger) - if UserLog == nil { - return errors.New("malloc new logger flied") - } - if err := UserLog.setLogger(config); err != nil { - return err - } - if !UserLog.isInit() { - return errors.New("user logger init failed") - } - return nil -} - -// DebugLog debug logger -var DebugLog *logger - -// InitDebugLogger initialize debug logger -func InitDebugLogger(config *LogConfig, ctx context.Context) error { - if config == nil { - return errors.New("debug logger config is nil") - } - if DebugLog != nil && DebugLog.isInit() { - DebugLog.Warn("debug logger is been initialized") - return nil - } - DebugLog = new(logger) - if DebugLog == nil { - return errors.New("malloc new logger flied") - } - if err := DebugLog.setLogger(config); err != nil { - return err - } - if !DebugLog.isInit() { - return errors.New("debug logger init failed") - } - return nil -} - -// CustomLogger custom logger -type CustomLogger struct { - *logger -} - -// NewCustomLogger create a new custom logger -func NewCustomLogger(config *LogConfig, ctx context.Context) (*CustomLogger, error) { - if config == nil { - return nil, errors.New("custom logger config is nil") - } - log := new(logger) - if err := log.setLogger(config); err != nil { - return nil, err - } - if !log.isInit() { - return nil, errors.New("logger init failed") - } - return &CustomLogger{logger: log}, nil -} - -// SetCustomLogger set custom logger -func SetCustomLogger(log *logger) *CustomLogger { - if log == nil { - return nil - } - return &CustomLogger{logger: log} -} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/hwlog_adaptor_test.go b/mind-cluster/component/ascend-common/common-utils/hwlog/hwlog_adaptor_test.go 
deleted file mode 100644 index a32e9be..0000000 --- a/mind-cluster/component/ascend-common/common-utils/hwlog/hwlog_adaptor_test.go +++ /dev/null @@ -1,126 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package hwlog test file -package hwlog - -import ( - "context" - "errors" - "testing" - - "github.com/smartystreets/goconvey/convey" -) - -func TestInitRunLogger(t *testing.T) { - convey.Convey("test hwlog adaptor", t, func() { - convey.Convey("test init run log", func() { - ctx, cancel := context.WithCancel(context.TODO()) - err := InitRunLogger(nil, ctx) - convey.So(err, convey.ShouldBeError, errors.New("run logger config is nil")) - lgConfig := &LogConfig{OnlyToStdout: true} - err = InitRunLogger(lgConfig, ctx) - convey.So(err, convey.ShouldBeNil) - // repeat initialize - err = InitRunLogger(lgConfig, ctx) - convey.So(err, convey.ShouldBeNil) - cancel() - }) - }) -} - -func TestNewCustomLogger(t *testing.T) { - convey.Convey("test hwlog adaptor", t, func() { - convey.Convey("test init custom log", func() { - ctx, cancel := context.WithCancel(context.TODO()) - _, err := NewCustomLogger(nil, ctx) - convey.So(err, convey.ShouldBeError, errors.New("custom logger config is nil")) - lgConfig := &LogConfig{OnlyToStdout: true} - _, err = NewCustomLogger(lgConfig, ctx) - convey.So(err, convey.ShouldBeNil) - // repeat initialize - _, err = NewCustomLogger(lgConfig, ctx) - convey.So(err, 
convey.ShouldBeNil) - cancel() - }) - }) -} - -func TestInitOperateLogger(t *testing.T) { - convey.Convey("test hwlog adaptor", t, func() { - convey.Convey("test init operate log", func() { - ctx, cancel := context.WithCancel(context.TODO()) - err := InitOperateLogger(nil, ctx) - convey.So(err, convey.ShouldBeError, errors.New("operate logger config is nil")) - lgConfig := &LogConfig{OnlyToStdout: true} - err = InitOperateLogger(lgConfig, ctx) - convey.So(err, convey.ShouldBeNil) - // repeat initialize - err = InitOperateLogger(lgConfig, ctx) - convey.So(err, convey.ShouldBeNil) - cancel() - }) - }) -} - -func TestInitSecurityLogger(t *testing.T) { - convey.Convey("test hwlog adaptor", t, func() { - convey.Convey("test init security log", func() { - ctx, cancel := context.WithCancel(context.TODO()) - err := InitSecurityLogger(nil, ctx) - convey.So(err, convey.ShouldBeError, errors.New("security logger config is nil")) - lgConfig := &LogConfig{OnlyToStdout: true} - err = InitSecurityLogger(lgConfig, ctx) - convey.So(err, convey.ShouldBeNil) - // repeat initialize - err = InitSecurityLogger(lgConfig, ctx) - convey.So(err, convey.ShouldBeNil) - cancel() - }) - }) -} - -func TestInitUserLogger(t *testing.T) { - convey.Convey("test hwlog adaptor", t, func() { - convey.Convey("test init user log", func() { - ctx, cancel := context.WithCancel(context.TODO()) - err := InitUserLogger(nil, ctx) - convey.So(err, convey.ShouldBeError, errors.New("user logger config is nil")) - lgConfig := &LogConfig{OnlyToStdout: true} - err = InitUserLogger(lgConfig, ctx) - convey.So(err, convey.ShouldBeNil) - // repeat initialize - err = InitUserLogger(lgConfig, ctx) - convey.So(err, convey.ShouldBeNil) - cancel() - }) - }) -} - -func TestInitDebugLogger(t *testing.T) { - convey.Convey("test hwlog adaptor", t, func() { - convey.Convey("test init debug log", func() { - ctx, cancel := context.WithCancel(context.TODO()) - err := InitDebugLogger(nil, ctx) - convey.So(err, convey.ShouldBeError, 
errors.New("debug logger config is nil")) - lgConfig := &LogConfig{OnlyToStdout: true} - err = InitDebugLogger(lgConfig, ctx) - convey.So(err, convey.ShouldBeNil) - // repeat initialize - err = InitDebugLogger(lgConfig, ctx) - convey.So(err, convey.ShouldBeNil) - cancel() - }) - }) -} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/log_limiter.go b/mind-cluster/component/ascend-common/common-utils/hwlog/log_limiter.go deleted file mode 100644 index 88cfb9d..0000000 --- a/mind-cluster/component/ascend-common/common-utils/hwlog/log_limiter.go +++ /dev/null @@ -1,156 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package hwlog provides the capability of processing Huawei log rules. 
-package hwlog - -import ( - "fmt" - "sync" - "time" - - "ascend-common/common-utils/cache" -) - -const ( - // MaxCacheSize indicates the maximum log cache size - MaxCacheSize = 100 * 1024 - // MaxExpiredTime indicates the maximum log cache expired time - MaxExpiredTime = 60 * 60 - // DefaultCacheSize indicates the default log cache size - DefaultCacheSize = 10 * 1024 - // DefaultExpiredTime indicates the default log cache expired time - DefaultExpiredTime = 1 - cutPreLen = 46 - // ProblemOccurMaxNumbers indicates the maximum number of times that the same problem can occur - ProblemOccurMaxNumbers = 3 -) - -var ( - errorMap sync.Map -) - -// LogLimiter encapsulates Logs and provides the log traffic limiting capability -// to prevent too many duplicate logs. -type LogLimiter struct { - // Logs is a log rotate instance - Logs *Logs - logCache *cache.ConcurrencyLRUCache - logMu sync.Mutex - doOnce sync.Once - - logExpiredTime time.Duration - // CacheSize indicates the size of log cache - CacheSize int - // ExpiredTime indicates the expired time of log cache - ExpiredTime int -} - -// Write implements io.Writer. It encapsulates the Write method of Los and uses -// the lru cache to prevent duplicate log writing. -func (l *LogLimiter) Write(d []byte) (int, error) { - if l == nil { - return 0, fmt.Errorf("log limiter pointer does not exist") - } - - l.logMu.Lock() - defer l.logMu.Unlock() - - if l.ExpiredTime == 0 || l.CacheSize == 0 { - return l.Logs.Write(d) - } - - l.doOnce.Do(func() { - l.validateLimiterConf() - l.logCache = cache.New(l.CacheSize) - l.logExpiredTime = time.Duration(int64(l.ExpiredTime) * int64(time.Second)) - }) - - if l.logCache == nil { - l.logCache = cache.New(DefaultCacheSize) - } - if !l.logCache.SetIfNX(string(d[cutPreLen:]), "v", l.logExpiredTime) { - return 0, nil - } - - return l.Logs.Write(d) -} - -// Close implements io.Closer. It encapsulates the Close method of Logs. 
-func (l *LogLimiter) Close() error { - if l == nil { - return fmt.Errorf("log limiter pointer does not exist") - } - - l.logMu.Lock() - defer l.logMu.Unlock() - - return l.Logs.Close() -} - -// Flush encapsulates the Flush method of Logs. -func (l *LogLimiter) Flush() error { - if l == nil { - return fmt.Errorf("log limiter pointer does not exist") - } - - l.logMu.Lock() - defer l.logMu.Unlock() - - return l.Logs.Flush() -} - -// validateLimiterConf verifies the external input parameters in the LogLimiter. -func (l *LogLimiter) validateLimiterConf() { - if l.CacheSize < 0 || l.CacheSize > MaxCacheSize { - l.CacheSize = DefaultCacheSize - } - if l.ExpiredTime < 0 || l.ExpiredTime > MaxExpiredTime { - l.ExpiredTime = DefaultExpiredTime - } -} - -func getKey(domain string, id interface{}) string { - return fmt.Sprintf("%d_%s", id, domain) -} - -// IsNeedPrintWithSpecifiedCounts check whether print the error message, -// if the error message (domain_id as a unique identifier) has been printed -// for problemOccurMaxNumbers times, return false -func IsNeedPrintWithSpecifiedCounts(domain string, id interface{}, problemOccurMaxNumbers int) (bool, string) { - key := getKey(domain, id) - cnt, _ := errorMap.LoadOrStore(key, 0) - intCnt, ok := cnt.(int) - extraErrLog := "" - if !ok { - // the counter type is abnormal, print by default - return true, extraErrLog - } - if intCnt >= problemOccurMaxNumbers { - return false, extraErrLog - } - intCnt += 1 - errorMap.Store(key, intCnt) - if intCnt == problemOccurMaxNumbers { - extraErrLog = fmt.Sprintf(".The error log has been printed for %v times "+ - "and will not be printed any more", problemOccurMaxNumbers) - } - return true, extraErrLog - -} - -// ResetErrCnt reset the error count -func ResetErrCnt(domain string, id interface{}) { - errorMap.Delete(getKey(domain, id)) -} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/logger.go b/mind-cluster/component/ascend-common/common-utils/hwlog/logger.go deleted 
file mode 100644 index f659fbc..0000000 --- a/mind-cluster/component/ascend-common/common-utils/hwlog/logger.go +++ /dev/null @@ -1,242 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package hwlog provides the capability of processing Huawei log rules. -package hwlog - -import ( - "errors" - "fmt" - "os" - "path" - "regexp" - "strings" - - "github.com/fsnotify/fsnotify" - - "ascend-common/common-utils/utils" -) - -const ( - // DefaultFileMaxSize the default maximum size of a single log file is 20 MB - DefaultFileMaxSize = 20 - // DefaultMinSaveAge the minimum storage duration of backup logs is 7 days - DefaultMinSaveAge = 7 - // DefaultMaxSaveAge the maximum storage duration of backup logs is 700 days - DefaultMaxSaveAge = 700 - // DefaultMaxBackups the default number of backup log - DefaultMaxBackups = 30 - // LogFileMode log file mode - LogFileMode os.FileMode = 0640 - // BackupLogFileMode backup log file mode - BackupLogFileMode os.FileMode = 0400 - // LogDirMode log dir mode - LogDirMode = 0750 - backUpLogRegex = `^.+-[0-9]{4}-[0-9]{2}-[0-9T]{5}-[0-9]{2}-[0-9]{2}\.[0-9]{2,4}` - bitsize = 64 - stackDeep = 3 - pathLen = 2 - minLogLevel = -1 - maxLogLevel = 3 - maxEachLineLen = 1048576 - defaultMaxEachLineLen = 256 -) - -// LogConfig log module config -type LogConfig struct { - // log file path - LogFileName string - // only write to std out, default value: false - OnlyToStdout bool - // 
only write to file, default value: false - OnlyToFile bool - // log level, -1-debug, 0-info, 1-warning, 2-error 3-critical default value: 0 - LogLevel int - // size of a single log file (MB), default value: 20MB - FileMaxSize int - // MaxLineLength Max length of each log line, default value: 256 - MaxLineLength int - // maximum number of backup log files, default value: 30 - MaxBackups int - // maximum number of days for backup log files, default value: 7 - MaxAge int - // whether backup files need to be compressed, default value: false - IsCompress bool - // expiration time for log cache, default value: 1s - ExpiredTime int - // Size of log cache space, default: 10240 - CacheSize int -} - -var reg = regexp.MustCompile(backUpLogRegex) - -type validateFunc func(config *LogConfig) error - -func checkDir(fileDir string) error { - if !utils.IsExist(fileDir) { - if err := os.MkdirAll(fileDir, LogDirMode); err != nil { - return fmt.Errorf("create dirs failed") - } - return nil - } - if err := os.Chmod(fileDir, LogDirMode); err != nil { - return fmt.Errorf("change log dir mode failed") - } - return nil -} - -func createFile(filePath string) error { - fileName := path.Base(filePath) - if !utils.IsExist(filePath) { - f, err := os.OpenFile(filePath, os.O_RDWR|os.O_CREATE|os.O_TRUNC, LogFileMode) - if err != nil { - return fmt.Errorf("create file(%s) failed", fileName) - } - defer func() { - if err := f.Close(); err != nil { - fmt.Printf("close file failed: %v\n", err) - return - } - }() - } - return nil -} - -func checkAndCreateLogFile(filePath string) error { - if !utils.IsFile(filePath) { - return fmt.Errorf("config path is not file") - } - fileDir := path.Dir(filePath) - if err := checkDir(fileDir); err != nil { - return err - } - if err := createFile(filePath); err != nil { - return err - } - return nil -} - -func validateLogConfigFileMaxSize(config *LogConfig) error { - if config.FileMaxSize == 0 { - config.FileMaxSize = DefaultFileMaxSize - return nil - } - if 
config.FileMaxSize < 0 || config.FileMaxSize > DefaultFileMaxSize { - return fmt.Errorf("the size of a single log file range is (0, 20] MB") - } - - return nil -} - -func validateLogConfigBackups(config *LogConfig) error { - if config.MaxBackups <= 0 || config.MaxBackups > DefaultMaxBackups { - return fmt.Errorf("the number of backup log file range is (0, 30]") - } - return nil -} - -func validateLogConfigMaxAge(config *LogConfig) error { - fmt.Printf("MaxAge %s", config.MaxAge) - if config.MaxAge < DefaultMinSaveAge || config.MaxAge > DefaultMaxSaveAge { - return fmt.Errorf("the maxage of backup logs range is [7,700]") - } - return nil -} - -func validateLogLevel(config *LogConfig) error { - if config.LogLevel < minLogLevel || config.LogLevel > maxLogLevel { - return fmt.Errorf("the log level range should be [-1, 3]") - } - return nil -} - -func validateMaxLineLength(config *LogConfig) error { - if config.MaxLineLength == 0 { - config.MaxLineLength = defaultMaxEachLineLen - return nil - } - if config.MaxLineLength < 0 || config.MaxLineLength > maxEachLineLen { - return fmt.Errorf("the max length of each log line should be in the range (0, 1048576]") - } - return nil -} - -func getValidateFuncList() []validateFunc { - var funcList []validateFunc - funcList = append(funcList, validateLogConfigFileMaxSize, validateLogConfigBackups, validateMaxLineLength, - validateLogConfigMaxAge, validateLogLevel, validateLogConfigLimiter) - return funcList -} - -func validateLogConfigFiled(config *LogConfig) error { - if config.OnlyToStdout { - return nil - } - if _, err := utils.CheckPath(config.LogFileName); err != nil && err != os.ErrNotExist { - return fmt.Errorf("config log path is not absolute path: %v", err) - } - if strings.Contains(config.LogFileName, "..") || strings.Contains(config.LogFileName, "./") { - return errors.New("log path include invalid char") - } - - if err := checkAndCreateLogFile(config.LogFileName); err != nil { - return err - } - validateFuncList := 
getValidateFuncList() - for _, vaFunc := range validateFuncList { - if err := vaFunc(config); err != nil { - return err - } - } - - return nil -} - -func validateLogConfigLimiter(config *LogConfig) error { - if config.ExpiredTime < 0 || config.ExpiredTime > MaxExpiredTime { - return fmt.Errorf("the expired time of log cache range is [0, 3600], the value 0 disables the limiter") - } - if config.CacheSize < 0 || config.CacheSize > MaxCacheSize { - return fmt.Errorf("the size of log cache range is [0, 102400], the value 0 disables the limiter") - } - return nil -} - -func changeFileMode(l *logger, event fsnotify.Event, logFileFullPath string) { - if l == nil { - fmt.Println("changeFileMode logger is nil") - return - } - var logMode = LogFileMode - logPath := path.Dir(logFileFullPath) - changedFileName := path.Base(event.Name) - if isTargetLog(changedFileName) { - logMode = BackupLogFileMode - } - changedLogFilePath := path.Join(logPath, changedFileName) - if !utils.IsExist(changedLogFilePath) { - return - } - fPath, err := utils.CheckPath(changedLogFilePath) - if err != nil { - l.Errorf("wrong file path: %v", err) - return - } - if errChmod := os.Chmod(fPath, logMode); errChmod != nil { - l.Errorf("set file mode failed, filename: %s", changedFileName) - } -} -func isTargetLog(fileName string) bool { - return reg.MatchString(fileName) -} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/logger_test.go b/mind-cluster/component/ascend-common/common-utils/hwlog/logger_test.go deleted file mode 100644 index f91b663..0000000 --- a/mind-cluster/component/ascend-common/common-utils/hwlog/logger_test.go +++ /dev/null @@ -1,217 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package hwlog test file -package hwlog - -import ( - "errors" - "io/fs" - "os" - "testing" - - "github.com/agiledragon/gomonkey/v2" - "github.com/fsnotify/fsnotify" - "github.com/smartystreets/goconvey/convey" - - "ascend-common/common-utils/utils" -) - -func TestCheckDir(t *testing.T) { - convey.Convey("test logger", t, func() { - convey.Convey("test check dir func", func() { - mockStat := gomonkey.ApplyFunc(os.Stat, func(_ string) (fs.FileInfo, error) { - return nil, os.ErrNotExist - }) - mockMkDir := gomonkey.ApplyFunc(os.MkdirAll, func(_ string, _ fs.FileMode) error { - return nil - }) - defer mockStat.Reset() - defer mockMkDir.Reset() - err := checkDir("log") - convey.So(err, convey.ShouldBeNil) - }) - }) -} - -func TestCreateFile(t *testing.T) { - convey.Convey("test logger", t, func() { - convey.Convey("test create file", func() { - mockExist := gomonkey.ApplyFunc(utils.IsExist, func(_ string) bool { - return false - }) - mockCreate := gomonkey.ApplyFunc(os.Create, func(_ string) (*os.File, error) { - return nil, nil - }) - defer mockExist.Reset() - defer mockCreate.Reset() - err := createFile("log") - convey.So(err, convey.ShouldBeNil) - }) - }) -} - -func TestCheckAndCreateLogFile(t *testing.T) { - convey.Convey("test logger", t, func() { - convey.Convey("test checkAndCreateLogFile func", func() { - mockCreate := gomonkey.ApplyFunc(createFile, func(_ string) error { - return nil - }) - defer mockCreate.Reset() - err := checkAndCreateLogFile("log") - convey.So(err, convey.ShouldBeNil) - }) - }) -} - -func TestValidateLogConfigFileMaxSize(t 
*testing.T) { - convey.Convey("test logger", t, func() { - convey.Convey("test validate max size func", func() { - conf := &LogConfig{} - err := validateLogConfigFileMaxSize(conf) - convey.So(err, convey.ShouldBeNil) - convey.So(conf.FileMaxSize, convey.ShouldEqual, DefaultFileMaxSize) - conf.FileMaxSize = -1 - err = validateLogConfigFileMaxSize(conf) - convey.So(err, convey.ShouldBeError) - conf.FileMaxSize = DefaultFileMaxSize + 1 - err = validateLogConfigFileMaxSize(conf) - convey.So(err, convey.ShouldBeError) - }) - }) -} - -func TestValidateLogConfigBackups(t *testing.T) { - convey.Convey("test logger", t, func() { - convey.Convey("test validate backups func", func() { - conf := &LogConfig{MaxBackups: DefaultMaxBackups} - err := validateLogConfigBackups(conf) - convey.So(err, convey.ShouldBeNil) - conf.MaxBackups = 0 - err = validateLogConfigBackups(conf) - convey.So(err, convey.ShouldBeError) - conf.FileMaxSize = DefaultMaxBackups + 1 - err = validateLogConfigBackups(conf) - convey.So(err, convey.ShouldBeError) - }) - }) -} - -func TestValidateLogConfigMaxAge(t *testing.T) { - convey.Convey("test logger", t, func() { - convey.Convey("test validate max age func", func() { - conf := &LogConfig{MaxAge: DefaultMinSaveAge} - err := validateLogConfigMaxAge(conf) - convey.So(err, convey.ShouldBeNil) - conf.MaxAge = 0 - err = validateLogConfigMaxAge(conf) - convey.So(err, convey.ShouldBeError) - }) - }) -} - -func TestValidateLogLevel(t *testing.T) { - convey.Convey("test logger", t, func() { - convey.Convey("test validate log level func", func() { - conf := &LogConfig{} - err := validateLogLevel(conf) - convey.So(err, convey.ShouldBeNil) - conf.LogLevel = minLogLevel - 1 - err = validateLogLevel(conf) - convey.So(err, convey.ShouldBeError) - conf.LogLevel = maxLogLevel + 1 - err = validateLogLevel(conf) - convey.So(err, convey.ShouldBeError) - }) - }) -} - -func TestValidateMaxLineLength(t *testing.T) { - convey.Convey("test logger", t, func() { - 
convey.Convey("test validate max line length func", func() { - conf := &LogConfig{} - err := validateMaxLineLength(conf) - convey.So(err, convey.ShouldBeNil) - convey.So(conf.MaxLineLength, convey.ShouldEqual, defaultMaxEachLineLen) - conf.MaxLineLength = -1 - err = validateMaxLineLength(conf) - convey.So(err, convey.ShouldNotBeNil) - conf.MaxLineLength = maxEachLineLen + 1 - err = validateMaxLineLength(conf) - convey.So(err, convey.ShouldNotBeNil) - }) - }) -} - -func TestValidateLogConfigFiled(t *testing.T) { - convey.Convey("test logger", t, func() { - convey.Convey("test validate config filed func", func() { - mockCheckPath := gomonkey.ApplyFunc(utils.CheckPath, func(_ string) (string, error) { - return "", nil - }) - mockCheckAndCreate := gomonkey.ApplyFunc(checkAndCreateLogFile, func(_ string) error { - return nil - }) - defer mockCheckPath.Reset() - defer mockCheckAndCreate.Reset() - conf := &LogConfig{ - MaxBackups: DefaultMaxBackups, - MaxAge: DefaultMinSaveAge, - CacheSize: DefaultCacheSize, - ExpiredTime: DefaultExpiredTime, - } - err := validateLogConfigFiled(conf) - convey.So(err, convey.ShouldBeNil) - }) - convey.Convey("test validate config filed func, log file is relative path", func() { - mockCheckPath := gomonkey.ApplyFunc(utils.CheckPath, func(_ string) (string, error) { - return "", nil - }) - mockCheckAndCreate := gomonkey.ApplyFunc(checkAndCreateLogFile, func(_ string) error { - return nil - }) - defer mockCheckPath.Reset() - defer mockCheckAndCreate.Reset() - conf := &LogConfig{ - MaxBackups: DefaultMaxBackups, - MaxAge: DefaultMinSaveAge, - CacheSize: DefaultCacheSize, - ExpiredTime: DefaultExpiredTime, - LogFileName: "../", - } - err := validateLogConfigFiled(conf) - expErr := errors.New("log path include invalid char") - convey.So(err, convey.ShouldResemble, expErr) - }) - }) -} - -func TestChangeFileMode(t *testing.T) { - convey.Convey("test logger", t, func() { - convey.Convey("test changeFileMode func", func() { - changeFileMode(nil, 
fsnotify.Event{}, "log") - mockExist := gomonkey.ApplyFunc(utils.IsExist, func(_ string) bool { - return true - }) - mockChmod := gomonkey.ApplyFunc(os.Chmod, func(_ string, _ fs.FileMode) error { - return nil - }) - defer mockExist.Reset() - defer mockChmod.Reset() - lg := new(logger) - evt := fsnotify.Event{Name: "run-2022-01-01T00-00-00.123.log"} - changeFileMode(lg, evt, "log") - }) - }) -} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/rolog.go b/mind-cluster/component/ascend-common/common-utils/hwlog/rolog.go deleted file mode 100644 index cc07bb2..0000000 --- a/mind-cluster/component/ascend-common/common-utils/hwlog/rolog.go +++ /dev/null @@ -1,447 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package hwlog provides the capability of processing Huawei log rules. -package hwlog - -import ( - "errors" - "fmt" - "io/ioutil" - "os" - "path/filepath" - "sort" - "strings" - "sync" - "time" -) - -const ( - oneDaySeconds = 24 * 60 * 60 - defaultCapacity = 20 - timeFormat = "2006-01-02T15-04-05.000" - kilobytes = 1024 - defaultDirPermission = 0750 - defaultFilePermission = 0600 - defaultBackupPermission = 0400 - maxCapacity = 20 - minSaveVolume = 1 - maxSaveVolume = 30 - maxSaveTime = 700 - minSaveTime = 7 -) - -// Logs is an io.WriteCloser. -type Logs struct { - file *os.File - mutex sync.Mutex - rmOnce sync.Once - - // FileName is the file where logs are written. 
- FileName string `json:"filename" yaml:"filename"` - - // Capacity is the maximum number of bytes before the log file - // is rotated, and the default value is 128 megabytes. - Capacity int `json:"capacity" yaml:"capacity"` - - // SaveTime is the maximum number of days for retaining old log - // files. It calculates the retention time based on the timestamp - // of the old log file name and the current time. - SaveTime int `json:"savetime" yaml:"savetime"` - - // SaveVolume is the maximum number of old log files that can be - // retained. It saves all old files by default. - SaveVolume int `json:"savevolume" yaml:"savevolume"` - - // UTC determines whether to use the local time of the computer - // or the UTC time as the timestamp in the formatted backup file. - LocalOrUTC bool `json:"localorutc" yaml:"localorutc"` - - length int64 - rmCh chan bool -} - -// logFile is a struct that is used to return filename and -// timestamp. -type logFile struct { - fileInfo os.FileInfo - timeStamp time.Time -} - -var ( - // mByte is used to convert capacity into bytes. - mByte = kilobytes * kilobytes -) - -// Write implements io.Writer. If a write would not cause the size of -// the log file to exceed Capacity, the log file is written normally. -// If a write would cause the size of the log file to exceed Capacity, -// but the write length is less than Capacity, the log file is closed, -// renamed to include a timestamp of the current time, and a new log -// is created using the original log file name. If the length of a write -// is greater than the Capacity, an error is returned. 
-func (l *Logs) Write(d []byte) (int, error) { - if l == nil { - return 0, fmt.Errorf("logs pointer does not exist") - } - - l.mutex.Lock() - defer l.mutex.Unlock() - - writeLenth := int64(len(d)) - if writeLenth > l.maxLenth() { - return 0, fmt.Errorf("the write lenth %d is greater than the maximum file size %d", - writeLenth, l.maxLenth(), - ) - } - - if l.file == nil { - if err := l.openOrCreateFile(writeLenth); err != nil { - return 0, err - } - } - fileInfo, err := l.file.Stat() - if err != nil { - return 0, err - } - l.length = fileInfo.Size() - if writeLenth+l.length > l.maxLenth() { - if err := l.roll(); err != nil { - return 0, err - } - } - - n, err := l.file.Write(d) - if err != nil { - return 0, err - } - l.length += int64(n) - return n, err -} - -// Roll causes Logs to close the existing log file and create a new log -// file immediately. The purpose of this function is to provide rotation -// outside the normal rotation rule, e.g. in response to SIGHUP. After -// rotation, the deletion of the old log files is initiated. -func (l *Logs) Roll() error { - if l == nil { - return fmt.Errorf("logs pointer does not exist") - } - - l.mutex.Lock() - defer l.mutex.Unlock() - return l.roll() -} - -// Close implements io.Closer. It closses the current log file. -func (l *Logs) Close() error { - if l == nil { - return fmt.Errorf("logs pointer does not exist") - } - - l.mutex.Lock() - defer l.mutex.Unlock() - - return l.close() -} - -// Flush persist the contents of the current memory. -func (l *Logs) Flush() error { - if l == nil { - return fmt.Errorf("logs pointer does not exist") - } - - l.mutex.Lock() - defer l.mutex.Unlock() - if l.file == nil { - return nil - } - return l.file.Sync() -} - -// maxLenth return the number of bytes of the maximum log size -// before rotating. 
-func (l *Logs) maxLenth() int64 { - if l.Capacity > 0 && l.Capacity < maxCapacity { - return int64(l.Capacity) * int64(mByte) - } - return int64(defaultCapacity * mByte) -} - -// fileName return the name of the log file. -func (l *Logs) fileName() string { - if l.FileName != "" { - return l.FileName - } - logName := filepath.Base(os.Args[0]) + "-mindx-dl.log" - return filepath.Join(os.TempDir(), logName) -} - -// openOrCreateFile opens the log file if it exists and the -// current write would not exceed the Capacity. It will create -// a new file if there is no such file or the write would exceed -// the Capacity. -func (l *Logs) openOrCreateFile(writeLen int64) error { - l.remove() - - name := l.fileName() - message, err := os.Stat(name) - if os.IsNotExist(err) { - return l.create() - } - - if err != nil { - return fmt.Errorf("failed to get log file message: %v", err) - } - - if writeLen+message.Size() >= l.maxLenth() { - return l.roll() - } - - f, err := os.OpenFile(name, os.O_APPEND|os.O_WRONLY, defaultFilePermission) - if err != nil { - return l.create() - } - l.file = f - l.length = message.Size() - return nil -} - -// create creates a new log file for writing, and backs up the -// old log file. The file is closed when this method is invoked -// by default. 
-func (l *Logs) create() error { - if err := os.MkdirAll(l.getDir(), defaultDirPermission); err != nil { - return fmt.Errorf("unable to create directory for new log file: %v", err) - } - - fileName, fileMode := l.fileName(), os.FileMode(defaultFilePermission) - if message, err := os.Stat(fileName); err == nil { - fileMode = message.Mode() - backupName := l.backup() - if err := os.Rename(fileName, backupName); err != nil { - return fmt.Errorf("failed to rename the log file: %v", err) - } - if err := os.Chmod(backupName, defaultBackupPermission); err != nil { - return fmt.Errorf("failed to change backup log file permission: %v", err) - } - } - newFile, err := os.OpenFile(fileName, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, fileMode) - if err != nil { - return fmt.Errorf("unable to open new log file: %v", err) - } - l.length, l.file = 0, newFile - return nil -} - -// backup generates a backup file name based on the original file -// name and inserts a timestamp between the file name and extension. -// The timestamp uses the UTC time by default. -func (l *Logs) backup() string { - prefix, extension := l.getPreAndExt() - return filepath.Join(l.getDir(), fmt.Sprintf("%s%s%s", prefix, l.getTimestamp(), extension)) -} - -// getDir returns the directory for the current filename. -func (l *Logs) getDir() string { - return filepath.Dir(l.fileName()) -} - -// getPreAndExt returns the prefix name and extension name -// from Logs's filename. -func (l *Logs) getPreAndExt() (string, string) { - name := filepath.Base(l.fileName()) - extension := filepath.Ext(name) - prefix := name[:len(name)-len(extension)] + "-" - return prefix, extension -} - -// getTimestamp returns the timestamp of current time, and -// uses UTC time by default. -func (l *Logs) getTimestamp() string { - t := time.Now() - if !l.LocalOrUTC { - t = t.UTC() - } - return t.Format(timeFormat) -} - -// roll rotates the log file, close the existing log file and -// create a new one immediately. 
After rotating, this method -// deletes the old log files according to the configuration. -func (l *Logs) roll() error { - if err := l.close(); err != nil { - return err - } - if err := l.create(); err != nil { - return err - } - l.remove() - return nil -} - -// close closes the file if it is open. -func (l *Logs) close() error { - if l.file == nil { - return nil - } - err := l.file.Sync() - if err != nil { - return err - } - err = l.file.Close() - l.file = nil - return err -} - -// remove delete outdated log files, starting the remove -// goroutine if necessary. -func (l *Logs) remove() { - l.rmOnce.Do(func() { - l.rmCh = make(chan bool, 1) - go l.removeRun() - }) - select { - case l.rmCh <- true: - default: - } -} - -// removeRun manages the deletion of the old log files after -// rotating, which runs in a goroutine. -func (l *Logs) removeRun() { - for range l.rmCh { - if err := l.removeRunOnce(); err != nil { - fmt.Println("failed to remove runonce: ", err) - } - } -} - -// removeRunOnce performs removal of outdated log files. -// Old log files are removed if the number of old files -// exceed the Capacity or the retention time of old files -// is greater than SaveTime. 
-func (l *Logs) removeRunOnce() error { - if l.SaveVolume == 0 && l.SaveTime == 0 { - return nil - } - - if err := checkParam(l.SaveVolume, l.SaveTime); err != nil { - return err - } - - oldFiles, err := l.oldFilesList() - if err != nil { - return err - } - - var removeFiles []logFile - if l.SaveTime > 0 { - delTime := time.Now().Unix() - int64(l.SaveTime)*oneDaySeconds - var remainingFiles []logFile - for _, f := range oldFiles { - if f.timeStamp.Unix() <= delTime { - removeFiles = append(removeFiles, f) - continue - } - remainingFiles = append(remainingFiles, f) - } - oldFiles = remainingFiles - } - - if l.SaveVolume > 0 && l.SaveVolume < len(oldFiles) { - saved := make(map[string]struct{}, len(oldFiles)) - var remainingFiles []logFile - for _, f := range oldFiles { - saved[f.fileInfo.Name()] = struct{}{} - if l.SaveVolume >= len(saved) { - remainingFiles = append(remainingFiles, f) - continue - } - removeFiles = append(removeFiles, f) - } - oldFiles = remainingFiles - } - - for _, f := range removeFiles { - rmError := os.Remove(filepath.Join(l.getDir(), f.fileInfo.Name())) - if rmError != nil { - err = rmError - } - } - return err -} - -// oldFilesList returns the list of backup log files sorted -// by ModTime. These backup log files are stored in the same -// directory as the current log file. 
-func (l *Logs) oldFilesList() ([]logFile, error) { - logFiles, err := ioutil.ReadDir(l.getDir()) - if err != nil { - return nil, fmt.Errorf("unable to open the log file directory: %v", err) - } - - prefix, extension := l.getPreAndExt() - - var oldFiles []logFile - - for _, file := range logFiles { - if file.IsDir() { - continue - } - if timeStamp, err := l.extractTime(file.Name(), prefix, extension); err == nil { - oldFiles = append(oldFiles, logFile{fileInfo: file, timeStamp: timeStamp}) - continue - } - } - sort.Slice(oldFiles, func(i, j int) bool { - if i < 0 || i > len(oldFiles) || j < 0 || j > len(oldFiles) { - return false - } - return oldFiles[i].timeStamp.After(oldFiles[j].timeStamp) - }) - - return oldFiles, nil -} - -// extractTime extracts the formatted time from file name by -// stripping the prefix and extension of the file name. This -// prevents fileName from being confused with time.parse. -func (l *Logs) extractTime(name, prefix, extension string) (time.Time, error) { - if !strings.HasSuffix(name, extension) { - return time.Time{}, errors.New("unmatched extension") - } - - if !strings.HasPrefix(name, prefix) { - return time.Time{}, errors.New("unmatched prefix") - } - - timeStamp := name[len(prefix) : len(name)-len(extension)] - return time.Parse(timeFormat, timeStamp) -} - -// checkParam checks whether the parameters are correct -func checkParam(volume int, time int) error { - if volume != 0 { - if volume < minSaveVolume || volume > maxSaveVolume { - return fmt.Errorf("the value of savevolume is incorrect") - } - } - if time != 0 { - if time < minSaveTime || time > maxSaveTime { - return fmt.Errorf("the value of savetime is incorrect") - } - } - return nil -} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/rolog_test.go b/mind-cluster/component/ascend-common/common-utils/hwlog/rolog_test.go deleted file mode 100644 index 67807bd..0000000 --- a/mind-cluster/component/ascend-common/common-utils/hwlog/rolog_test.go +++ /dev/null 
@@ -1,687 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package hwlog provides the capability of processing Huawei log rules. -package hwlog - -import ( - "encoding/json" - "fmt" - "io/ioutil" - "os" - "path/filepath" - "testing" - "time" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" -) - -const ( - testDirPermission = 0700 - testFilePermission = 0600 - testMByte = 1 - testCapacity = 10 - testCapacity2 = 100 - testCapacity3 = 5 - testSaveTime = 10 - testSaveTime2 = 7 - testSaveVolume = 3 - testSaveVolume2 = 1 - fileCountOne = 1 - fileCountTwo = 2 - fileCountFour = 4 - waitTime = 50 - oneDayHour = 24 - sevenDays = 7 - fourteenDays = 14 - twentyOneDays = 21 - testYear = 2014 - testMonth = 5 - testDay = 4 - testHour = 14 - testMin = 44 - testSec = 33 - testNsec = 555000000 -) - -// TestCreate for test the function of create log file -func TestCreate(t *testing.T) { - convey.Convey("TestCreate", t, func() { - dir := makeTempDir("TestCrate") - defer os.RemoveAll(dir) - l := &Logs{ - FileName: getLogFile(dir), - } - defer l.Close() - - input := []byte("foobarfoobar!") - fileWrite(input, l) - existWithContent(input, getLogFile(dir)) - fileCount(fileCountOne, dir) - }) -} - -// TestOpenFile for test the function of open log file -func TestOpenFile(t *testing.T) { - convey.Convey("TestOpenFile", t, func() { - dir := makeTempDir("TestOpenFile") - defer 
os.RemoveAll(dir) - fileName := getLogFile(dir) - data := []byte("foo!") - err := ioutil.WriteFile(fileName, data, testFilePermission) - convey.So(err, convey.ShouldBeNil) - existWithContent(data, fileName) - - l := &Logs{ - FileName: fileName, - } - defer l.Close() - - b := []byte("boo!") - fileWrite(b, l) - existWithContent(append(data, b...), fileName) - fileCount(fileCountOne, dir) - }) -} - -// TestWriteTooLong for test the processing of the overlong write error -func TestWriteTooLong(t *testing.T) { - convey.Convey("TestWriteTooLong", t, func() { - mByte = testMByte - dir := makeTempDir("TestWriteTooLong") - defer os.RemoveAll(dir) - - l := &Logs{ - FileName: getLogFile(dir), - Capacity: testCapacity3, - } - defer l.Close() - - b := []byte("barrrrrrrrrrrrrrrrr!") - n, err := l.Write(b) - convey.So(err, convey.ShouldNotBeNil) - convey.So(0, convey.ShouldEqual, n) - convey.So(err.Error(), convey.ShouldEqual, fmt.Sprintf( - "the write lenth %d is greater than the maximum file size %d", len(b), l.Capacity)) - _, err = os.Stat(getLogFile(dir)) - convey.So(err, shouldNotBeExist) - }) -} - -// TestMakeLogDir for test the function of make log file directory -func TestMakeLogDir(t *testing.T) { - convey.Convey("TestMakeLogDir", t, func() { - dir := time.Now().Format("TestMakeLogDir" + timeFormat) - dir = filepath.Join(os.TempDir(), dir) - defer os.RemoveAll(dir) - - fileName := getLogFile(dir) - l := &Logs{ - FileName: fileName, - } - defer l.Close() - - b := []byte("boo!") - fileWrite(b, l) - existWithContent(b, getLogFile(dir)) - fileCount(fileCountOne, dir) - }) -} - -// TestDefaultFileName for test default log file name -func TestDefaultFileName(t *testing.T) { - convey.Convey("TestDefaultFileName", t, func() { - dir := os.TempDir() - fileName := filepath.Join(dir, filepath.Base(os.Args[0])+"-mindx-dl.log") - defer os.Remove(fileName) - - l := &Logs{} - defer l.Close() - - b := []byte("boo!") - fileWrite(b, l) - existWithContent(b, fileName) - }) -} - -// 
TestAutoRoll for test the automatic log rolling -func TestAutoRoll(t *testing.T) { - convey.Convey("TestAutoRoll", t, func() { - mByte = testMByte - dir := makeTempDir("TestAutoRoll") - defer os.RemoveAll(dir) - currentTime := time.Now() - - fileName := getLogFile(dir) - l := &Logs{ - FileName: fileName, - Capacity: testCapacity, - } - defer l.Close() - - b := []byte("aoo!") - fileWrite(b, l) - existWithContent(b, fileName) - fileCount(fileCountOne, dir) - - patch1 := gomonkey.ApplyFunc(time.Now, func() time.Time { - time1 := currentTime - return time1.Add(time.Hour * oneDayHour * sevenDays) - }) - defer patch1.Reset() - - b2 := []byte("foooooo!") - fileWrite(b2, l) - existWithContent(b2, fileName) - existWithContent(b, getBackupFile(dir, time.Now())) - fileCount(fileCountTwo, dir) - }) -} - -// TestFirstWriteRoll for test the log rolling on first write -func TestFirstWriteRoll(t *testing.T) { - convey.Convey("TestFirstWriteRoll", t, func() { - mByte = testMByte - dir := makeTempDir("TestFirstWriteRoll") - defer os.RemoveAll(dir) - currentTime := time.Now() - - fileName := getLogFile(dir) - l := &Logs{ - FileName: fileName, - Capacity: testCapacity, - } - defer l.Close() - - start := []byte("boooooo!") - err := ioutil.WriteFile(fileName, start, testFilePermission) - convey.So(err, convey.ShouldBeNil) - patch1 := gomonkey.ApplyFunc(time.Now, func() time.Time { - time1 := currentTime - return time1.Add(time.Hour * oneDayHour * sevenDays) - }) - defer patch1.Reset() - - b := []byte("fooo!") - fileWrite(b, l) - existWithContent(b, fileName) - existWithContent(start, getBackupFile(dir, time.Now())) - fileCount(fileCountTwo, dir) - }) -} - -// TestSaveVolumeCase1 for test the deleting log files that exceed the volume -func TestSaveVolumeCase1(t *testing.T) { - convey.Convey("TestSaveVolumeCase1", t, func() { - mByte = testMByte - dir := makeTempDir("TestSaveVolumeCase1") - defer os.RemoveAll(dir) - currentTime := time.Now() - - fileName := getLogFile(dir) - l := &Logs{ - 
FileName: fileName, - Capacity: testCapacity, - SaveVolume: testSaveVolume2, - } - defer l.Close() - - b := []byte("boo!") - fileWrite(b, l) - existWithContent(b, fileName) - fileCount(fileCountOne, dir) - - patch1 := gomonkey.ApplyFunc(time.Now, func() time.Time { - time1 := currentTime - return time1.Add(time.Hour * oneDayHour * sevenDays) - }) - b2 := []byte("foooooo!") - fileWrite(b2, l) - secondFileName := getBackupFile(dir, time.Now()) - existWithContent(b, secondFileName) - existWithContent(b2, fileName) - fileCount(fileCountTwo, dir) - - patch1.Reset() - patch2 := gomonkey.ApplyFunc(time.Now, func() time.Time { - time2 := currentTime - return time2.Add(time.Hour * oneDayHour * fourteenDays) - }) - defer patch2.Reset() - b3 := []byte("baaaaaar!") - fileWrite(b3, l) - thirdFileName := getBackupFile(dir, time.Now()) - existWithContent(b2, thirdFileName) - existWithContent(b3, fileName) - <-time.After(time.Millisecond * waitTime) - fileCount(fileCountTwo, dir) - existWithContent(b2, thirdFileName) - convey.So(secondFileName, shouldNotExist) - }) -} - -// TestSaveVolumeCase2 for test the deleting log files that exceed the volume when a non-log file exists -func TestSaveVolumeCase2(t *testing.T) { - convey.Convey("TestSaveVolumeCase2", t, func() { - mByte = testMByte - dir := makeTempDir("TestSaveVolumeCase2") - defer os.RemoveAll(dir) - currentTime := time.Now() - - fileName := getLogFile(dir) - l := &Logs{FileName: fileName, Capacity: testCapacity, SaveVolume: testSaveVolume2} - defer l.Close() - - b := []byte("boo!") - fileWrite(b, l) - patch1 := gomonkey.ApplyFunc(time.Now, func() time.Time { - time1 := currentTime - return time1.Add(time.Hour * oneDayHour * sevenDays) - }) - b2 := []byte("baaaaaar!") - fileWrite(b2, l) - secondFileName := getBackupFile(dir, time.Now()) - - patch1.Reset() - patch2 := gomonkey.ApplyFunc(time.Now, func() time.Time { - time2 := currentTime - return time2.Add(time.Hour * oneDayHour * fourteenDays) - }) - notLogFile := 
getLogFile(dir) + ".foo" - err := ioutil.WriteFile(notLogFile, []byte("data"), testFilePermission) - convey.So(err, convey.ShouldBeNil) - notLogFileDir := getBackupFile(dir, time.Now()) - err = os.Mkdir(notLogFileDir, testDirPermission) - convey.So(err, convey.ShouldBeNil) - - patch2.Reset() - patch3 := gomonkey.ApplyFunc(time.Now, func() time.Time { - time3 := currentTime - return time3.Add(time.Hour * oneDayHour * twentyOneDays) - }) - defer patch3.Reset() - thirdFileName := getBackupFile(dir, time.Now()) - b3 := []byte("baaaaaaz!") - fileWrite(b3, l) - existWithContent(b2, thirdFileName) - <-time.After(time.Millisecond * waitTime) - fileCount(fileCountFour, dir) - existWithContent(b3, fileName) - convey.So(secondFileName, shouldNotExist) - convey.So(notLogFile, shouldExist) - convey.So(notLogFileDir, shouldExist) - }) -} - -// TestCleanupExistingBackupFiles fot test the clearing the current backup log files -func TestCleanupExistingBackupFiles(t *testing.T) { - convey.Convey("TestCleanupExistingBackupFiles", t, func() { - mByte = testMByte - dir := makeTempDir("TestCleanupExistingBackupFiles") - defer os.RemoveAll(dir) - currentTime := time.Now() - - data := []byte("data") - backup := getBackupFile(dir, time.Now()) - err := ioutil.WriteFile(backup, data, testFilePermission) - convey.So(err, convey.ShouldBeNil) - - patch1 := gomonkey.ApplyFunc(time.Now, func() time.Time { - time1 := currentTime - return time1.Add(time.Hour * oneDayHour * sevenDays) - }) - backup = getBackupFile(dir, time.Now()) - err = ioutil.WriteFile(backup, data, testFilePermission) - convey.So(err, convey.ShouldBeNil) - fileName := getLogFile(dir) - err = ioutil.WriteFile(fileName, data, testFilePermission) - convey.So(err, convey.ShouldBeNil) - - l := &Logs{ - FileName: fileName, - Capacity: testCapacity, - SaveVolume: testSaveVolume2, - } - defer l.Close() - - patch1.Reset() - patch2 := gomonkey.ApplyFunc(time.Now, func() time.Time { - time2 := currentTime - return time2.Add(time.Hour * 
oneDayHour * fourteenDays) - }) - defer patch2.Reset() - b2 := []byte("foooooo!") - fileWrite(b2, l) - - <-time.After(time.Millisecond * waitTime) - - fileCount(fileCountTwo, dir) - }) -} - -// TestSaveTime for test the deleting log files that exceed the time -func TestSaveTime(t *testing.T) { - convey.Convey("TestSaveTime", t, func() { - mByte = testMByte - dir := makeTempDir("TestSaveTime") - defer os.RemoveAll(dir) - currentTime := time.Now() - - fileName := getLogFile(dir) - l := &Logs{ - FileName: fileName, - Capacity: testCapacity, - SaveTime: testSaveTime2, - } - defer l.Close() - - patch1 := gomonkey.ApplyFunc(time.Now, func() time.Time { - time1 := currentTime - return time1.Add(time.Hour * oneDayHour * sevenDays) - }) - b := []byte("zoo!") - fileWrite(b, l) - existWithContent(b, fileName) - fileCount(fileCountOne, dir) - - patch1.Reset() - patch2 := gomonkey.ApplyFunc(time.Now, func() time.Time { - time2 := currentTime - return time2.Add(time.Hour * oneDayHour * fourteenDays) - }) - b2 := []byte("foooooo!") - fileWrite(b2, l) - existWithContent(b, getBackupFile(dir, time.Now())) - - <-time.After(waitTime * time.Millisecond) - - fileCount(fileCountTwo, dir) - existWithContent(b2, fileName) - existWithContent(b, getBackupFile(dir, time.Now())) - - patch2.Reset() - patch3 := gomonkey.ApplyFunc(time.Now, func() time.Time { - time3 := currentTime - return time3.Add(time.Hour * oneDayHour * twentyOneDays) - }) - defer patch3.Reset() - b3 := []byte("baaaaar!") - fileWrite(b3, l) - existWithContent(b2, getBackupFile(dir, time.Now())) - - <-time.After(waitTime * time.Millisecond) - - fileCount(fileCountTwo, dir) - existWithContent(b3, fileName) - existWithContent(b2, getBackupFile(dir, time.Now())) - }) -} - -// TestOldLogFilesList for test the obtaining the list of old log files -func TestOldLogFilesList(t *testing.T) { - convey.Convey("TestOldLogFilesList", t, func() { - mByte = testMByte - dir := makeTempDir("TestOldLogFiles") - defer os.RemoveAll(dir) - 
currentTime := time.Now() - - fileName := getLogFile(dir) - data := []byte("data") - err := ioutil.WriteFile(fileName, data, testDirPermission) - convey.So(err, convey.ShouldBeNil) - t1, err := time.Parse(timeFormat, currentTime.UTC().Format(timeFormat)) - convey.So(err, convey.ShouldBeNil) - backup := getBackupFile(dir, currentTime) - err = ioutil.WriteFile(backup, data, testDirPermission) - convey.So(err, convey.ShouldBeNil) - - patch := gomonkey.ApplyFunc(time.Now, func() time.Time { - time1 := currentTime - return time1.Add(time.Hour * oneDayHour * sevenDays) - }) - defer patch.Reset() - t2, err := time.Parse(timeFormat, time.Now().UTC().Format(timeFormat)) - convey.So(err, convey.ShouldBeNil) - backup2 := getBackupFile(dir, time.Now()) - err = ioutil.WriteFile(backup2, data, testDirPermission) - convey.So(err, convey.ShouldBeNil) - - l := &Logs{FileName: fileName} - files, err := l.oldFilesList() - convey.So(err, convey.ShouldBeNil) - convey.So(fileCountTwo, convey.ShouldEqual, len(files)) - convey.So(t2, convey.ShouldEqual, files[0].timeStamp) - convey.So(t1, convey.ShouldEqual, files[1].timeStamp) - }) -} - -// TestExtractTime for test obtaining log file timestamp -func TestExtractTime(t *testing.T) { - convey.Convey("TestExtractTime", t, func() { - l := &Logs{FileName: "/var/log/myfoo/foo.log"} - prefix, extention := l.getPreAndExt() - - tests := []struct { - fileName string - want time.Time - wantErr bool - }{ - {"foo-2014-05-04T14-44-33.555.log", time.Date( - testYear, testMonth, testDay, testHour, testMin, testSec, testNsec, time.UTC), false}, - {"foo-2014-05-04T14-44-33.555", time.Time{}, true}, - {"2014-05-04T14-44-33.555.log", time.Time{}, true}, - {"foo.log", time.Time{}, true}, - } - - for _, test := range tests { - got, err := l.extractTime(test.fileName, prefix, extention) - convey.So(got, convey.ShouldEqual, test.want) - convey.So(err != nil, convey.ShouldEqual, test.wantErr) - } - }) -} - -// TestLocalTime for test the situation that current 
time is the local time -func TestLocalTime(t *testing.T) { - convey.Convey("TestLocalTime", t, func() { - mByte = testMByte - dir := makeTempDir("TestLocalTime") - defer os.RemoveAll(dir) - currentTime := time.Now() - - l := &Logs{ - FileName: getLogFile(dir), - Capacity: testCapacity, - LocalOrUTC: true, - } - defer l.Close() - - patch := gomonkey.ApplyFunc(time.Now, func() time.Time { - return currentTime - }) - defer patch.Reset() - b := []byte("boo!") - fileWrite(b, l) - - b2 := []byte("fooooooo!") - fileWrite(b2, l) - existWithContent(b2, getLogFile(dir)) - existWithContent(b, getBackupFileLocal(dir, currentTime)) - }) -} - -// TestRoll for test rolling -func TestRoll(t *testing.T) { - convey.Convey("TestRoll", t, func() { - dir := makeTempDir("TestRotate") - defer os.RemoveAll(dir) - currentTime := time.Now() - - fileName := getLogFile(dir) - l := &Logs{ - FileName: fileName, - SaveVolume: testSaveVolume2, - Capacity: testCapacity2, // megabytes - } - defer l.Close() - - patch1 := gomonkey.ApplyFunc(time.Now, func() time.Time { - time1 := currentTime - return time1.Add(time.Hour * oneDayHour * sevenDays) - }) - b := []byte("boo!") - fileWrite(b, l) - existWithContent(b, fileName) - fileCount(fileCountOne, dir) - - patch1.Reset() - patch2 := gomonkey.ApplyFunc(time.Now, func() time.Time { - time2 := currentTime - return time2.Add(time.Hour * oneDayHour * fourteenDays) - }) - err := l.Roll() - convey.So(err, convey.ShouldBeNil) - - <-time.After(waitTime * time.Millisecond) - - filename2 := getBackupFile(dir, time.Now()) - existWithContent(b, filename2) - existWithContent([]byte{}, fileName) - fileCount(fileCountTwo, dir) - - patch2.Reset() - patch3 := gomonkey.ApplyFunc(time.Now, func() time.Time { - time3 := currentTime - return time3.Add(time.Hour * oneDayHour * twentyOneDays) - }) - defer patch3.Reset() - err = l.Roll() - convey.So(err, convey.ShouldBeNil) - - <-time.After(waitTime * time.Millisecond) - - filename3 := getBackupFile(dir, time.Now()) - 
existWithContent([]byte{}, filename3) - existWithContent([]byte{}, fileName) - fileCount(fileCountTwo, dir) - - b2 := []byte("foooooo!") - fileWrite(b2, l) - existWithContent(b2, fileName) - }) -} - -// TestJson for test JSON conversion -func TestJson(t *testing.T) { - convey.Convey("TestJson", t, func() { - data := []byte(` - { - "filename": "foo", - "capacity": 10, - "savetime": 10, - "savevolume": 3, - "localorutc": true - }`[1:]) - - l := Logs{} - err := json.Unmarshal(data, &l) - convey.So(err, convey.ShouldBeNil) - convey.So("foo", convey.ShouldEqual, l.FileName) - convey.So(testCapacity, convey.ShouldEqual, l.Capacity) - convey.So(testSaveTime, convey.ShouldEqual, l.SaveTime) - convey.So(testSaveVolume, convey.ShouldEqual, l.SaveVolume) - convey.So(true, convey.ShouldEqual, l.LocalOrUTC) - }) -} - -// makeTempDir creates a file in the OS temp directory to keep parallel test -func makeTempDir(name string) string { - dir := time.Now().Format(name + timeFormat) - dir = filepath.Join(os.TempDir(), dir) - err := os.Mkdir(dir, testDirPermission) - convey.So(err, convey.ShouldBeNil) - return dir -} - -// existWithContent checks that the given file exists and has the correct content -func existWithContent(content []byte, dir string) { - info, err := os.Stat(dir) - convey.So(err, convey.ShouldBeNil) - convey.So(int64(len(content)), convey.ShouldEqual, info.Size()) - - b, err := ioutil.ReadFile(dir) - convey.So(err, convey.ShouldBeNil) - convey.So(content, convey.ShouldResemble, b) -} - -// getLogFile returns the log file name in the given directory for the current fake time -func getLogFile(dir string) string { - return filepath.Join(dir, "foobar.log") -} - -func getBackupFile(dir string, t time.Time) string { - return filepath.Join(dir, "foobar-"+t.UTC().Format(timeFormat)+".log") -} - -func getBackupFileLocal(dir string, t time.Time) string { - return filepath.Join(dir, "foobar-"+t.Format(timeFormat)+".log") -} - -// fileCount checks that the number of files in the 
directory is exp. -func fileCount(exp int, dir string) { - files, err := ioutil.ReadDir(dir) - convey.So(err, convey.ShouldBeNil) - convey.So(len(files), convey.ShouldEqual, exp) -} - -func fileWrite(b []byte, l *Logs) { - n, err := l.Write(b) - convey.So(err, convey.ShouldBeNil) - convey.So(len(b), convey.ShouldEqual, n) -} - -func shouldNotBeExist(actual interface{}, expected ...interface{}) string { - err, ok := actual.(error) - if !ok { - return "incorrect parameter type" - } - if os.IsNotExist(err) { - return "" - } - return "File exists, but should not have been created" -} -func shouldNotExist(actual interface{}, expected ...interface{}) string { - path, ok := actual.(string) - if !ok { - return "incorrect parameter type" - } - _, err := os.Stat(path) - if os.IsNotExist(err) { - return "" - } - return fmt.Sprintf("expected to get os.IsNotExist, but instead got %v", err) -} - -func shouldExist(actual interface{}, expected ...interface{}) string { - path, ok := actual.(string) - if !ok { - return "incorrect parameter type" - } - _, err := os.Stat(path) - if err != nil { - return fmt.Sprintf("expected file to exist, but got error from os.Stat: %v", err) - } - return "" -} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/types.go b/mind-cluster/component/ascend-common/common-utils/hwlog/types.go deleted file mode 100644 index e97c80b..0000000 --- a/mind-cluster/component/ascend-common/common-utils/hwlog/types.go +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package hwlog provides the capability of processing Huawei log rules. -package hwlog - -import "errors" - -// ContextKey especially for context value -// to solve problem of "should not use basic type untyped string as key in context.WithValue" -type ContextKey string - -// String the implement of String method -func (c ContextKey) String() string { - return string(c) -} - -const ( - // UserID used for context value key of "ID" - UserID ContextKey = "UserID" - // ReqID used for context value key of "requestID" - ReqID ContextKey = "RequestID" - // extraDeepKey used for context value key of "extraDeepKey" - extraDeepKey ContextKey = "extraDeepKey" -) - -// SelfLogWriter used this to replace some opensource log -type SelfLogWriter struct { -} - -// Write implement the interface of io.writer -func (l *SelfLogWriter) Write(p []byte) (int, error) { - if RunLog == nil { - return -1, errors.New("hwlog is not initialized") - } - RunLog.Info(string(p)) - return len(p), nil -} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/utils.go b/mind-cluster/component/ascend-common/common-utils/hwlog/utils.go deleted file mode 100644 index 40955f4..0000000 --- a/mind-cluster/component/ascend-common/common-utils/hwlog/utils.go +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package hwlog provides the capability of processing Huawei log rules. -package hwlog - -import ( - "bytes" - "context" - "fmt" - "log" - "runtime" - "strings" -) - -// printHelper helper function for log printing -func printHelper(lg *log.Logger, msg string, maxLogLength int, ctx ...context.Context) { - str := getCallerInfo(ctx...) - trimMsg := strings.Replace(msg, "\r", " ", -1) - trimMsg = strings.Replace(trimMsg, "\n", " ", -1) - runeArr := []rune(trimMsg) - if length := len(runeArr); length > maxLogLength { - trimMsg = string(runeArr[:maxLogLength]) - } - lg.Println(str + trimMsg) -} - -// getCallerInfo gets the caller's information -func getCallerInfo(ctx ...context.Context) string { - var deep = stackDeep - var userID interface{} - var traceID interface{} - for _, c := range ctx { - if c == nil { - deep++ - continue - } - userID = c.Value(UserID) - traceID = c.Value(ReqID) - if val := c.Value(extraDeepKey); val != nil { - currentVal, _ := val.(int) // security type assertions, invalid values are automatically zeroed - deep += currentVal - } - } - var funcName string - pc, codePath, codeLine, ok := runtime.Caller(deep) - if ok { - funcName = runtime.FuncForPC(pc).Name() - } - p := strings.Split(codePath, "/") - l := len(p) - if l == pathLen { - funcName = p[l-1] - } else if l > pathLen { - funcName = fmt.Sprintf("%s/%s", p[l-pathLen], p[l-1]) - } - callerPath := fmt.Sprintf("%s:%d", funcName, codeLine) - goroutineID := getGoroutineID() - str := fmt.Sprintf("%-8s%s ", goroutineID, callerPath) - if userID != nil || traceID != nil { - str = fmt.Sprintf("%s{%#v}-{%#v} ", str, userID, traceID) - } - return str -} - -// getCallerGoroutineID gets the goroutineID -func getGoroutineID() string { - b := make([]byte, bitsize, bitsize) - b = b[:runtime.Stack(b, false)] - b = bytes.TrimPrefix(b, []byte("goroutine ")) - b = b[:bytes.IndexByte(b, ' ')] - return 
string(b) -} - -// DeepIncrease increases the stack depth by 1 -func DeepIncrease(ctx context.Context) context.Context { - if ctx == nil { - return context.WithValue(context.Background(), extraDeepKey, 1) - } - - var currentVal int - if val := ctx.Value(extraDeepKey); val != nil { - currentVal, _ = val.(int) // security type assertions, invalid values are automatically zeroed - } - - return context.WithValue(ctx, extraDeepKey, currentVal+1) -} diff --git a/mind-cluster/component/ascend-common/common-utils/hwlog/utils_test.go b/mind-cluster/component/ascend-common/common-utils/hwlog/utils_test.go deleted file mode 100644 index ca2bda2..0000000 --- a/mind-cluster/component/ascend-common/common-utils/hwlog/utils_test.go +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package hwlog test file -package hwlog - -import ( - "context" - "testing" - - "github.com/smartystreets/goconvey/convey" -) - -func TestUtilsFunc(t *testing.T) { - convey.Convey("test utils", t, func() { - convey.Convey("test utils func", func() { - lg := new(logger) - conf := &LogConfig{OnlyToStdout: true} - userCtx := context.TODO() - userCtx = context.WithValue(userCtx, UserID, 0) - userCtx = context.WithValue(userCtx, ReqID, 0) - err := lg.setLogger(conf) - convey.So(err, convey.ShouldBeNil) - printHelper(lg.lgInfo, "test", defaultMaxEachLineLen) - }) - }) -} diff --git a/mind-cluster/component/ascend-common/common-utils/limiter/limit_handler.go b/mind-cluster/component/ascend-common/common-utils/limiter/limit_handler.go deleted file mode 100644 index fdab9a8..0000000 --- a/mind-cluster/component/ascend-common/common-utils/limiter/limit_handler.go +++ /dev/null @@ -1,226 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package limiter implement a token bucket limiter -package limiter - -import ( - "context" - "errors" - "fmt" - "math" - "net/http" - "regexp" - "strconv" - "strings" - "syscall" - "time" - - "ascend-common/common-utils/cache" - "ascend-common/common-utils/hwlog" - "ascend-common/common-utils/utils" -) - -const ( - kilo = 1000.0 - // DefaultDataLimit default http body limit size - DefaultDataLimit = 1024 * 1024 * 10 - defaultMaxConcurrency = 1024 - maxStringLen = 20 - // DefaultCacheSize default cache size - DefaultCacheSize = 1024 * 100 - arrLen = 2 - // IPReqLimitReg ip request limit regex string - IPReqLimitReg = "^[1-9]\\d{0,2}/[1-9]\\d{0,2}$" -) - -type limitHandler struct { - concurrency chan struct{} - httpHandler http.Handler - log bool - method string - limitBytes int64 - ipExpiredTime time.Duration - ipCache *cache.ConcurrencyLRUCache -} - -// HandlerConfig the configuration of the limitHandler -type HandlerConfig struct { - // PrintLog whether you need print access log, when use gin framework, suggest to set false,otherwise set true - PrintLog bool - // Method only allow setting http method pass - Method string - // LimitBytes set the max http body size - LimitBytes int64 - // TotalConCurrency set the program total concurrent http request - TotalConCurrency int - // IPConCurrency set the signle IP concurrent http request "2/1sec" - IPConCurrency string - // CacheSize the local cacheSize - CacheSize int -} - -// StatusResponseWriter the writer record the http status -type StatusResponseWriter struct { - http.ResponseWriter - http.Hijacker - Status int -} - -// WriteHeader override the WriteHeader method -func (w *StatusResponseWriter) WriteHeader(status int) { - w.ResponseWriter.WriteHeader(status) - w.Status = status -} - -// ServeHTTP implement http.Handler -func (h *limitHandler) ServeHTTP(w http.ResponseWriter, req *http.Request) { - req.Body = http.MaxBytesReader(w, req.Body, h.limitBytes) - ctx := initContext(req) - path := req.URL.Path - 
clientUserAgent := req.UserAgent() - clientIP := utils.ClientIP(req) - if clientIP != "" && h.ipCache != nil { - if !h.ipCache.SetIfNX(fmt.Sprintf("key-%s", clientIP), "v", h.ipExpiredTime) { - hwlog.RunLog.WarnfWithCtx(ctx, "Single IP request reject:%s: %s <%3d> |%15s |%s |%d ", req.Method, - path, http.StatusServiceUnavailable, clientIP, clientUserAgent, syscall.Getuid()) - http.Error(w, "503 too busy", http.StatusServiceUnavailable) - return - } - } - select { - case _, ok := <-h.concurrency: - if !ok { - // channel closed and no need return token - return - } - if h.method != "" && req.Method != h.method { - http.NotFound(w, req) - // recover token to the bucket - h.concurrency <- struct{}{} - return - } - hwlog.RunLog.Debugf("token count:%d", len(h.concurrency)) - start := time.Now() - statusRes := newResponse(w) - h.httpHandler.ServeHTTP(statusRes, req) - stop := time.Since(start) - h.concurrency <- struct{}{} - latency := int(math.Ceil(float64(stop.Nanoseconds()) / kilo / kilo)) - if h.log { - hwlog.RunLog.InfofWithCtx(ctx, "%s %s: %s <%3d> (%dms) |%15s |%s |%d", req.Proto, req.Method, path, - statusRes.Status, latency, clientIP, clientUserAgent, syscall.Getuid()) - } - default: - hwlog.RunLog.WarnfWithCtx(ctx, "Total reject request:%s: %s <%3d> |%15s |%s |%d ", req.Method, path, - http.StatusServiceUnavailable, clientIP, clientUserAgent, syscall.Getuid()) - http.Error(w, "503 too busy", http.StatusServiceUnavailable) - } -} - -func newResponse(w http.ResponseWriter) *StatusResponseWriter { - jk, ok := w.(http.Hijacker) - if !ok { - hwlog.RunLog.Warn("hijack not implement") - } - statusRes := &StatusResponseWriter{ - ResponseWriter: w, - Status: http.StatusOK, - Hijacker: jk, - } - return statusRes -} - -func initContext(req *http.Request) context.Context { - ctx := context.Background() - reqID := req.Header.Get(hwlog.ReqID.String()) - if reqID != "" { - ctx = context.WithValue(context.Background(), hwlog.ReqID, reqID) - } - id := 
req.Header.Get(hwlog.UserID.String()) - if id != "" { - ctx = context.WithValue(ctx, hwlog.UserID, id) - } - return ctx -} - -// NewLimitHandler new a bucket-token limiter -func NewLimitHandler(maxConcur, maxConcurrency int, handler http.Handler, printLog bool) (http.Handler, error) { - return NewLimitHandlerWithMethod(maxConcur, maxConcurrency, handler, printLog, "") -} - -// NewLimitHandlerWithMethod new a bucket-token limiter with specific http method -func NewLimitHandlerWithMethod(maxConcur, maxConcurrency int, handler http.Handler, printLog bool, - httpMethod string) (http.Handler, error) { - if maxConcur < 1 || maxConcur > maxConcurrency { - return nil, errors.New("maxConcurrency parameter error") - } - conchan := make(chan struct{}, maxConcur) - return createHandler(conchan, handler, printLog, httpMethod, DefaultDataLimit), nil -} - -func createHandler(ch chan struct{}, handler http.Handler, printLog bool, - httpMethod string, bodySizeLimit int64) *limitHandler { - h := &limitHandler{ - concurrency: ch, - httpHandler: handler, - log: printLog, - method: httpMethod, - limitBytes: bodySizeLimit, - ipExpiredTime: time.Duration(-1), - } - for i := 0; i < cap(ch); i++ { - h.concurrency <- struct{}{} - } - return h -} - -// NewLimitHandlerV2 new a bucket-token limiter which contains limit request by IP -func NewLimitHandlerV2(handler http.Handler, conf *HandlerConfig) (http.Handler, error) { - if conf == nil { - return nil, errors.New("parameter error") - } - if conf.TotalConCurrency < 1 || conf.TotalConCurrency > defaultMaxConcurrency { - return nil, errors.New("totalConCurrency parameter error") - } - if len(conf.Method) > maxStringLen { - return nil, errors.New("method parameter error") - } - if conf.CacheSize <= 0 { - hwlog.RunLog.Info("use default cache size") - conf.CacheSize = DefaultCacheSize - } - reg := regexp.MustCompile(IPReqLimitReg) - if !reg.Match([]byte(conf.IPConCurrency)) { - return nil, errors.New("IPConCurrency parameter error") - } - conchan 
:= make(chan struct{}, conf.TotalConCurrency) - h := createHandler(conchan, handler, conf.PrintLog, conf.Method, conf.LimitBytes) - arr := strings.Split(conf.IPConCurrency, "/") - if len(arr) != arrLen || arr[0] == "0" { - return nil, errors.New("IPConCurrency parameter error") - } - arr1, err := strconv.ParseInt(arr[1], 0, 0) - if err != nil { - return nil, fmt.Errorf("IPConCurrency parameter(%s) error, parse to int failed: %v", arr[1], err) - } - arr0, err := strconv.ParseInt(arr[0], 0, 0) - if err != nil || arr0 == 0 { - return nil, fmt.Errorf("IPConCurrency parameter(%s) error,parse to int failed: %v", arr[0], err) - } - h.ipExpiredTime = time.Duration(arr1 * int64(time.Second) / arr0) - h.ipCache = cache.New(DefaultCacheSize) - return h, nil - -} diff --git a/mind-cluster/component/ascend-common/common-utils/limiter/limit_handler_test.go b/mind-cluster/component/ascend-common/common-utils/limiter/limit_handler_test.go deleted file mode 100644 index 69dbb8e..0000000 --- a/mind-cluster/component/ascend-common/common-utils/limiter/limit_handler_test.go +++ /dev/null @@ -1,119 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package limiter implement a token bucket limiter -package limiter - -import ( - "context" - "net/http" - "net/url" - "testing" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" - - "ascend-common/common-utils/hwlog" -) - -func init() { - config := hwlog.LogConfig{ - OnlyToStdout: true, - } - hwlog.InitRunLogger(&config, context.TODO()) -} -func TestServeHTTP(t *testing.T) { - convey.Convey("test limitHandler serveHTTP", t, func() { - h, w, r := initVarable() - convey.Convey("header contains reqID and userID,", func() { - mock := gomonkey.ApplyMethodFunc(h.httpHandler, "ServeHTTP", func(http.ResponseWriter, - *http.Request) { - return - }) - defer mock.Reset() - h.ServeHTTP(w.ResponseWriter, r) - convey.So(len(h.concurrency), convey.ShouldEqual, 1) - }) - convey.Convey("token channel close,", func() { - mock := gomonkey.ApplyFunc(http.Error, func(http.ResponseWriter, string, int) { - return - }) - defer mock.Reset() - _, ok := <-h.concurrency - if !ok { - return - } - h.ServeHTTP(w.ResponseWriter, r) - convey.So(len(h.concurrency), convey.ShouldEqual, 0) - }) - }) -} - -func initVarable() (*limitHandler, StatusResponseWriter, *http.Request) { - lh, err := NewLimitHandler(1, len2, http.DefaultServeMux, false) - if err != nil { - return nil, StatusResponseWriter{}, nil - } - v, ok := lh.(*limitHandler) - if !ok { - return nil, StatusResponseWriter{}, nil - } - w := StatusResponseWriter{ - ResponseWriter: nil, - Status: 0, - } - r := &http.Request{ - URL: &url.URL{ - Path: "test.com", - }, - Header: map[string][]string{"userID": {"1"}, "reqID": {"requestIDxxxx"}}, - Method: "GET", - } - return v, w, r -} - -func TestNewLimitHandlerV2(t *testing.T) { - conf := &HandlerConfig{ - PrintLog: false, - Method: "", - LimitBytes: DefaultDataLimit, - TotalConCurrency: defaultMaxConcurrency, - IPConCurrency: "2/1", - CacheSize: DefaultCacheSize, - } - convey.Convey("normal situation,no err return", t, func() { - _, err := 
NewLimitHandlerV2(http.DefaultServeMux, conf) - convey.So(err, convey.ShouldEqual, nil) - }) - convey.Convey("IPConCurrency parameter error", t, func() { - conf.IPConCurrency = "2021/1" - _, err := NewLimitHandlerV2(http.DefaultServeMux, conf) - convey.So(err, convey.ShouldNotEqual, nil) - }) - convey.Convey("cacheSize parameter error", t, func() { - conf.CacheSize = 0 - _, err := NewLimitHandlerV2(http.DefaultServeMux, conf) - convey.So(err, convey.ShouldNotEqual, nil) - }) - convey.Convey("method parameter error", t, func() { - conf.Method = "20/iajsdkjas2jhjdklsjkldjsdfasd1" - _, err := NewLimitHandlerV2(http.DefaultServeMux, conf) - convey.So(err, convey.ShouldNotEqual, nil) - }) - convey.Convey("TotalConCurrency parameter error", t, func() { - conf.TotalConCurrency = 0 - _, err := NewLimitHandlerV2(http.DefaultServeMux, conf) - convey.So(err, convey.ShouldNotEqual, nil) - }) -} diff --git a/mind-cluster/component/ascend-common/common-utils/limiter/limit_listener.go b/mind-cluster/component/ascend-common/common-utils/limiter/limit_listener.go deleted file mode 100644 index b81d511..0000000 --- a/mind-cluster/component/ascend-common/common-utils/limiter/limit_listener.go +++ /dev/null @@ -1,161 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package limiter implement a token bucket limit listener, refer to "golang.org/x/net/netutil" and -// change the acquire method, if acquire failed, return false immediately -package limiter - -import ( - "errors" - "fmt" - "net" - "strings" - "sync" - "time" - - "ascend-common/common-utils/cache" - "ascend-common/common-utils/hwlog" -) - -const ( - maxConnection = 1024 - maxIPConnection = 512 - - largeMaxConnection = 16384 -) - -func commonLimitListener(l net.Listener, totalConnLimit, IPConnLimit, cacheSize int) (net.Listener, error) { - if IPConnLimit < 0 || IPConnLimit > maxIPConnection { - return nil, errors.New("the parameter IPConnLimit is illegal") - } - bucket := make(chan struct{}, totalConnLimit) - ll := &localLimitListener{ - Listener: l, - buckets: bucket, - ipConnLimit: int64(IPConnLimit), - } - if cacheSize > 0 { - ll.ipCache = cache.New(cacheSize) - } - return ll, nil -} - -// LimitListener returns a Listener that accepts at most n connections at the same time -func LimitListener(l net.Listener, totalConnLimit, IPConnLimit, cacheSize int) (net.Listener, error) { - if totalConnLimit < 0 || totalConnLimit > maxConnection { - return nil, errors.New("the parameter totalConnLimit is illegal") - } - return commonLimitListener(l, totalConnLimit, IPConnLimit, cacheSize) -} - -type localLimitListener struct { - net.Listener - buckets chan struct{} - closeOnce sync.Once - ipCache *cache.ConcurrencyLRUCache - ipConnLimit int64 -} - -// acquire acquires the limiting semaphore. 
Returns true if successfully -// accquired, false if the listener is closed or reach the max limit -func (l *localLimitListener) acquire() bool { - select { - case l.buckets <- struct{}{}: - return true - default: - return false - } -} -func (l *localLimitListener) release() { <-l.buckets } - -// Accept implement net.Listener interface -func (l *localLimitListener) Accept() (net.Conn, error) { - c, err := l.Listener.Accept() - if err != nil { - return nil, err - } - // ip connection limit - ip, cacheKey := getIpAndKey(c) - if ip != "" && l.ipCache != nil { - if counts, err := l.ipCache.INCR(cacheKey, -1); err == nil && counts > l.ipConnLimit { - hwlog.RunLog.Warn("ip connections reach max limit, connection will to force closed") - return closeImmediately(c, l.ipCache), nil - } - } - // total tcp connection limit - if l.acquire() { - return &limitListenerConn{Conn: c, release: l.release, ipCache: l.ipCache}, nil - } - hwlog.RunLog.Warn("limit forbidden, connection will to force closed") - return closeImmediately(c, l.ipCache), nil - -} - -func getIpAndKey(c net.Conn) (string, string) { - ipWithPort := c.RemoteAddr().String() - if ipWithPort != "" { - s := strings.Split(ipWithPort, ":") - return s[0], fmt.Sprintf("key-conn-%s", s[0]) - } - return "", "" -} - -func closeImmediately(c net.Conn, lruCache *cache.ConcurrencyLRUCache) net.Conn { - // once the connection reach the max limit, force close the connection - tcpConn, ok := c.(*net.TCPConn) - if ok { - if err := tcpConn.SetLinger(0); err != nil { - hwlog.RunLog.Warnf("Error when setting linger: %s", err) - } - } - - err := c.Close() - if err != nil { - hwlog.RunLog.Warn(err) - } - return &limitListenerConn{Conn: c, release: func() {}, ipCache: lruCache} -} - -// Close implement net.Listener interface -func (l *localLimitListener) Close() error { - err := l.Listener.Close() - l.closeOnce.Do(func() { close(l.buckets) }) - return err -} - -type limitListenerConn struct { - net.Conn - releaseOnce sync.Once - release 
func() - ipCache *cache.ConcurrencyLRUCache -} - -// Close override net.Conn interface -func (l *limitListenerConn) Close() error { - err := l.Conn.Close() - if err != nil { - hwlog.RunLog.Debugf("close grpc connect failed: %v", err) - return fmt.Errorf("close grpc connect failed: %v", err) - } - l.releaseOnce.Do(l.release) - ip, cacheKey := getIpAndKey(l.Conn) - if ip != "" && l.ipCache != nil { - d, err := l.ipCache.DECR(cacheKey, time.Hour) - if err != nil { - hwlog.RunLog.Error(err) - } - hwlog.RunLog.Debugf("decrement ip connections %d", d) - } - return err -} diff --git a/mind-cluster/component/ascend-common/common-utils/limiter/limit_listener_test.go b/mind-cluster/component/ascend-common/common-utils/limiter/limit_listener_test.go deleted file mode 100644 index 631e1bb..0000000 --- a/mind-cluster/component/ascend-common/common-utils/limiter/limit_listener_test.go +++ /dev/null @@ -1,125 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package limiter implement a token bucket limiter -package limiter - -import ( - "errors" - "net" - "testing" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" -) - -const ( - len2 = 2 -) - -func TestLimitListenerAccept(t *testing.T) { - convey.Convey("test Accept function", t, func() { - - limitLor, err := LimitListener(&mockLicener{}, len2, len2, DefaultCacheSize) - if err != nil { - return - } - l, ok := limitLor.(*localLimitListener) - if !ok { - return - } - mock2 := gomonkey.ApplyFunc(getIpAndKey, func(net.Conn) (string, string) { - return "127.0.0.1", "key-127.0.0.1" - }) - defer mock2.Reset() - convey.Convey("acquire token success", func() { - _, err = l.Accept() - convey.So(err, convey.ShouldEqual, nil) - }) - - convey.Convey("accept failed", func() { - mock := gomonkey.ApplyMethodFunc(l.Listener, "Accept", func() (net.Conn, error) { - return nil, errors.New("mock error") - }) - defer mock.Reset() - con, err := l.Accept() - convey.So(err, convey.ShouldNotEqual, nil) - convey.So(con, convey.ShouldEqual, nil) - }) - - convey.Convey("acquire token failed", func() { - mock := gomonkey.ApplyPrivateMethod(l, "acquire", func(*localLimitListener) bool { - return false - }) - defer mock.Reset() - con, err := l.Accept() - convey.So(err, convey.ShouldEqual, nil) - conm, ok := con.(*limitListenerConn) - if !ok { - return - } - convey.So(conm.release, convey.ShouldNotEqual, nil) - }) - - }) -} - -type mockLicener struct { -} - -func (l *mockLicener) Accept() (net.Conn, error) { - return &net.TCPConn{}, nil -} - -func (l *mockLicener) Addr() net.Addr { - return &net.IPAddr{ - IP: []byte("127.0.0.1"), - Zone: "", - } -} - -func (l *mockLicener) Close() error { - return nil -} - -func TestGetIpAndKey(t *testing.T) { - convey.Convey("test getIp function", t, func() { - c := net.TCPConn{} - mock := gomonkey.ApplyMethodFunc(&c, "RemoteAddr", func() net.Addr { - return &net.IPAddr{ - IP: []byte("127.0.0.1"), - Zone: "", - } - }) - 
defer mock.Reset() - ip, _ := getIpAndKey(&c) - convey.So(ip, convey.ShouldNotEqual, "") - }) -} - -func TestLimitListener(t *testing.T) { - convey.Convey("test new listener function success", t, func() { - l, err := LimitListener(&mockLicener{}, maxConnection, maxIPConnection, DefaultDataLimit) - convey.So(l, convey.ShouldNotEqual, nil) - convey.So(err, convey.ShouldEqual, nil) - }) - convey.Convey("test new listener function", t, func() { - _, err := LimitListener(&mockLicener{}, maxConnection+1, maxIPConnection, DefaultDataLimit) - convey.So(err, convey.ShouldNotEqual, nil) - }) - convey.Convey("test new listener function", t, func() { - _, err := LimitListener(&mockLicener{}, maxConnection, maxIPConnection+1, DefaultDataLimit) - convey.So(err, convey.ShouldNotEqual, nil) - }) -} diff --git a/mind-cluster/component/ascend-common/common-utils/limiter/limit_writer.go b/mind-cluster/component/ascend-common/common-utils/limiter/limit_writer.go deleted file mode 100644 index 9117d07..0000000 --- a/mind-cluster/component/ascend-common/common-utils/limiter/limit_writer.go +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package limiter implement a writer limiter -package limiter - -import ( - "bytes" - "errors" - - "ascend-common/common-utils/hwlog" -) - -const defaultLimit = 1024 - -// LimitedWriter limit the size of written data -type LimitedWriter struct { - buffer *bytes.Buffer - limit int - size int -} - -// NewLimitedWriter create a LimitedWriter -func NewLimitedWriter(limit int) *LimitedWriter { - if limit <= 0 { - hwlog.RunLog.Warnf("limit: %v is invalid, set default limit: %v", limit, defaultLimit) - limit = defaultLimit - } - return &LimitedWriter{ - buffer: &bytes.Buffer{}, - limit: limit, - } -} - -// Write write bytes to buffer -func (lw *LimitedWriter) Write(p []byte) (int, error) { - if lw.size+len(p) > lw.limit { - return 0, errors.New("buffer limit exceeded") - } - n, err := lw.buffer.Write(p) - if err == nil { - lw.size += n - } - return n, err -} - -// GetBufferBytes get buffer bytes -func (lw *LimitedWriter) GetBufferBytes() []byte { - if lw.buffer == nil { - return []byte{} - } - return lw.buffer.Bytes() -} diff --git a/mind-cluster/component/ascend-common/common-utils/limiter/limit_writer_test.go b/mind-cluster/component/ascend-common/common-utils/limiter/limit_writer_test.go deleted file mode 100644 index 9a308f3..0000000 --- a/mind-cluster/component/ascend-common/common-utils/limiter/limit_writer_test.go +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package limiter implement a writer limiter -package limiter - -import ( - "io" - "testing" - - "github.com/smartystreets/goconvey/convey" -) - -func TestLimitWriterWrite(t *testing.T) { - convey.Convey("test limiter Writer write function", t, func() { - data := []byte("test") - limitBuffer := NewLimitedWriter(len(data)) - - n, err := limitBuffer.Write(data) - convey.So(err, convey.ShouldBeNil) - convey.So(n, convey.ShouldEqual, len(data)) - n, err = limitBuffer.Write(data) - convey.So(err, convey.ShouldEqual, io.EOF) - convey.So(n, convey.ShouldEqual, 0) - }) -} diff --git a/mind-cluster/component/ascend-common/common-utils/rand/rand_linux.go b/mind-cluster/component/ascend-common/common-utils/rand/rand_linux.go deleted file mode 100644 index 1a97a1b..0000000 --- a/mind-cluster/component/ascend-common/common-utils/rand/rand_linux.go +++ /dev/null @@ -1,71 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package rand implement the security rand -package rand - -import ( - "errors" - "fmt" - "io" - "os" - "runtime" - "sync" - "time" -) - -const ( - maxReadSize = 1<<25 - 1 -) - -// A randomReader satisfies reads by reading the file named name. 
-type randomReader struct { - f io.Reader - mu sync.Mutex -} - -func init() { - Reader = &randomReader{} -} - -func warnBlocked() { - fmt.Println("mindx-security/rand: blocked for 60 seconds waiting to read random data from the kernel") -} - -var supportOs = "linux" - -// Read implements the interface of io.Reader -func (r *randomReader) Read(b []byte) (int, error) { - t := time.AfterFunc(time.Minute, warnBlocked) - defer t.Stop() - if len(b) > maxReadSize { - return 0, errors.New("byte size is too large") - } - r.mu.Lock() - defer r.mu.Unlock() - if runtime.GOOS != supportOs { - return 0, errors.New("not supported") - } - f, err := os.Open("/dev/random") - if err != nil { - return 0, err - } - defer func() { - err = f.Close() - if err != nil { - fmt.Println("close random file failed") - } - }() - return f.Read(b) -} diff --git a/mind-cluster/component/ascend-common/common-utils/rand/rand_linux_test.go b/mind-cluster/component/ascend-common/common-utils/rand/rand_linux_test.go deleted file mode 100644 index b02d9d6..0000000 --- a/mind-cluster/component/ascend-common/common-utils/rand/rand_linux_test.go +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package rand implement the security rand -package rand - -import ( - "testing" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" -) - -const ( - illegalSize = 1 << 25 -) - -func TestInnerRead(t *testing.T) { - convey.Convey("test random read func", t, func() { - reader := &randomReader{} - convey.Convey("read size too large, err returned", func() { - bs := make([]byte, illegalSize, illegalSize) - r, err := reader.Read(bs) - convey.So(err.Error(), convey.ShouldEqual, "byte size is too large") - convey.So(r, convey.ShouldEqual, 0) - }) - convey.Convey("windows,err returned", func() { - mock := gomonkey.ApplyGlobalVar(&supportOs, "windows") - defer mock.Reset() - bs := make([]byte, 1, 1) - r, err := reader.Read(bs) - convey.So(err.Error(), convey.ShouldEqual, "not supported") - convey.So(r, convey.ShouldEqual, 0) - }) - convey.Convey("normal situation,no err returned", func() { - // the length of byte is one, to prevent block when generate random - bs := make([]byte, 1, 1) - r, err := reader.Read(bs) - convey.So(err, convey.ShouldEqual, nil) - convey.So(r, convey.ShouldEqual, 1) - }) - }) -} diff --git a/mind-cluster/component/ascend-common/common-utils/rand/random.go b/mind-cluster/component/ascend-common/common-utils/rand/random.go deleted file mode 100644 index 353d868..0000000 --- a/mind-cluster/component/ascend-common/common-utils/rand/random.go +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package rand implement the security io.Reader -package rand - -import ( - "io" -) - -// Reader rand reader to generate security random bytes -var Reader io.Reader - -// Read is a helper function that calls Reader.Read using io.ReadFull. -func Read(b []byte) (int, error) { - return io.ReadFull(Reader, b) -} diff --git a/mind-cluster/component/ascend-common/common-utils/rand/random_test.go b/mind-cluster/component/ascend-common/common-utils/rand/random_test.go deleted file mode 100644 index 04ce333..0000000 --- a/mind-cluster/component/ascend-common/common-utils/rand/random_test.go +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package rand implement the security rand -package rand - -import ( - "testing" - - "github.com/smartystreets/goconvey/convey" -) - -func TestRead(t *testing.T) { - convey.Convey("package function test,normal situation", t, func() { - // the length of byte is one, to prevent block when generate random - bs := make([]byte, 1, 1) - l, err := Read(bs) - convey.So(err, convey.ShouldEqual, nil) - convey.So(l, convey.ShouldEqual, 1) - }) -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/env.go b/mind-cluster/component/ascend-common/common-utils/utils/env.go deleted file mode 100644 index 4402375..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/env.go +++ /dev/null @@ -1,35 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package utils env function -package utils - -import ( - "fmt" - "os/user" - "strconv" -) - -// GetCurrentUid get current uid -func GetCurrentUid() (uint32, error) { - userInfo, err := user.Current() - if err != nil { - return 0, fmt.Errorf("get current user info failed: %v", err) - } - uid, err := strconv.Atoi(userInfo.Uid) - if err != nil { - return 0, fmt.Errorf("convert uid to int failed: %v", err) - } - return uint32(uid), nil -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/env_test.go b/mind-cluster/component/ascend-common/common-utils/utils/env_test.go deleted file mode 100644 index 95d8983..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/env_test.go +++ /dev/null @@ -1,51 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package utils env test -package utils - -import ( - "fmt" - "os/user" - "testing" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" -) - -func TestGetCurrentUid(t *testing.T) { - convey.Convey("test func GetCurrentUid success", t, func() { - var p1 = gomonkey.ApplyFuncReturn(user.Current, &user.User{Uid: "0"}, nil) - defer p1.Reset() - uid, err := GetCurrentUid() - convey.So(err, convey.ShouldBeNil) - convey.So(uid, convey.ShouldEqual, 0) - }) - convey.Convey("test func GetCurrentUid failed, get current user info failed", t, func() { - var p1 = gomonkey.ApplyFuncReturn(user.Current, nil, testErr) - defer p1.Reset() - uid, err := GetCurrentUid() - expErr := fmt.Errorf("get current user info failed: %v", testErr) - convey.So(err, convey.ShouldResemble, expErr) - convey.So(uid, convey.ShouldEqual, 0) - }) - convey.Convey("test func GetCurrentUid failed, uid is invalid", t, func() { - var p1 = gomonkey.ApplyFuncReturn(user.Current, &user.User{Uid: "invalid uid"}, nil) - defer p1.Reset() - uid, err := GetCurrentUid() - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldContainSubstring, "convert uid to int failed") - convey.So(uid, convey.ShouldEqual, 0) - }) -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/file.go b/mind-cluster/component/ascend-common/common-utils/utils/file.go deleted file mode 100644 index 253e2b5..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/file.go +++ /dev/null @@ -1,176 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package utils provides the util func -package utils - -import ( - "errors" - "fmt" - "io" - "io/ioutil" - "os" - "path/filepath" - "reflect" - "strings" -) - -const ( - // FileMode file privilege - FileMode = 0600 - // Size10M bytes of 10M - Size10M = 10 * 1024 * 1024 - maxSize = 1024 * 1024 * 1024 -) - -// ReadLimitBytes read limit length of contents from file path -func ReadLimitBytes(path string, limitLength int) ([]byte, error) { - if limitLength < 0 || limitLength > maxSize { - return nil, errors.New("the limit length is not valid") - } - - key, err := CheckPath(path) - if err != nil { - return nil, err - } - file, err := os.OpenFile(key, os.O_RDONLY, FileMode) - if err != nil { - return nil, errors.New(fmt.Sprintf("open file with read-only and %04o mode failed", FileMode)) - } - defer file.Close() - buf := make([]byte, limitLength, limitLength) - l, err := file.Read(buf) - if err != nil { - return nil, fmt.Errorf("read file failed: %v", err) - } - return buf[0:l], nil -} - -// LoadFile load file content -func LoadFile(filePath string) ([]byte, error) { - if filePath == "" { - return nil, nil - } - absPath, err := filepath.Abs(filePath) - if err != nil { - return nil, fmt.Errorf("the filePath is invalid: %v", err) - } - if !IsExist(absPath) { - return nil, nil - } - - return ReadLimitBytes(absPath, Size10M) -} - -func closeFile(file *os.File) { - if file == nil { - return - } - if err := file.Close(); err != nil { - return - } - return -} - -// CopyFile copy file -func CopyFile(src, dst string) error { - src, err := CheckPath(src) - if err != nil { - 
return err - } - if IsExist(dst) { - dst, err = CheckPath(dst) - if err != nil { - return err - } - } - - srcFile, err := os.Open(src) - if err != nil { - return err - } - defer closeFile(srcFile) - - srcInfo, err := os.Stat(src) - if err != nil { - return err - } - - dstFile, err := os.OpenFile(dst, os.O_RDWR|os.O_CREATE|os.O_TRUNC, srcInfo.Mode()) - if err != nil { - return err - } - defer closeFile(dstFile) - - if _, err = io.Copy(dstFile, srcFile); err != nil { - return err - } - return os.Chmod(dst, srcInfo.Mode()) -} - -// CopyDir recursively copy files -func CopyDir(src string, dst string) error { - var ( - err error - fds []os.FileInfo = nil - dstInfo os.FileInfo - ) - - if dstInfo, err = os.Stat(src); err != nil { - return err - } - if err = os.MkdirAll(dst, dstInfo.Mode()); err != nil { - return err - } - if subFolder(src, dst) { - return errors.New("the destination directory is a subdirectory of the source directory") - } - if fds, err = ioutil.ReadDir(src); err != nil { - return err - } - for _, fd := range fds { - srcFile := filepath.Join(src, fd.Name()) - dstFile := filepath.Join(dst, fd.Name()) - if fd.IsDir() { - if err = CopyDir(srcFile, dstFile); err != nil { - return err - } - } else { - if err = CopyFile(srcFile, dstFile); err != nil { - return err - } - } - } - return nil -} - -func subFolder(src, dst string) bool { - if src == dst { - return true - } - srcReal, err := filepath.EvalSymlinks(src) - if err != nil { - return false - } - dstReal, err := filepath.EvalSymlinks(dst) - if err != nil { - return false - } - srcList := strings.Split(srcReal, string(os.PathSeparator)) - dstList := strings.Split(dstReal, string(os.PathSeparator)) - if len(srcList) > len(dstList) { - return false - } - return reflect.DeepEqual(srcList, dstList[:len(srcList)]) -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/file_check.go b/mind-cluster/component/ascend-common/common-utils/utils/file_check.go deleted file mode 100644 index 
4134245..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/file_check.go +++ /dev/null @@ -1,240 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package utils provides the util func -package utils - -import ( - "fmt" - "io/fs" - "os" - "path/filepath" - "strings" - "syscall" -) - -const ( - notValidPath = "not-valid-file-path" - maxAllowFileSize int64 = 1024 * 100 // in megabytes - oneMegabytes int64 = 1024 * 1024 - // DefaultWhiteList default white list in string - DefaultWhiteList = "-_./~" - // DefaultStringLength default string max length - DefaultStringLength = 256 - // DefaultPathLength default path max length - DefaultPathLength = 4096 -) - -// RealFileChecker Check whether the file is valid -func RealFileChecker(path string, checkParent, allowLink bool, size int64) (string, error) { - realPath, fileInfo, err := realPathChecker(path, checkParent, allowLink) - if err != nil { - return notValidPath, err - } - if fileInfo.IsDir() { - return notValidPath, fmt.Errorf("invalid dir") - } - if !fileInfo.Mode().IsRegular() { - return notValidPath, fmt.Errorf("invalid regular file") - } - if size > maxAllowFileSize || size < 0 { - return notValidPath, fmt.Errorf("invalid size") - } - if fileInfo.Size() > size*oneMegabytes { - return notValidPath, fmt.Errorf("size too large") - } - return realPath, nil -} - -// RealDirChecker Check whether the directory is valid -func 
RealDirChecker(path string, checkParent, allowLink bool) (string, error) { - realPath, fileInfo, err := realPathChecker(path, checkParent, allowLink) - if err != nil { - return notValidPath, err - } - if !fileInfo.IsDir() { - return notValidPath, fmt.Errorf("is not dir") - } - return realPath, nil -} - -// PathStringChecker Check whether the directory string is valid -func PathStringChecker(path string) (string, error) { - realPath, err := filepath.Abs(path) - if err != nil { - return notValidPath, err - } - if len(realPath) > DefaultPathLength { - return notValidPath, fmt.Errorf("path over max path length") - } - if !stringChecker(realPath, 0, DefaultPathLength) { - return notValidPath, fmt.Errorf("invalid path") - } - if err = pathDepthChecker(realPath, 0); err != nil { - return notValidPath, err - } - return realPath, nil -} - -// VerifyFile verify the file after it is opened. -func VerifyFile(file *os.File, size int64) error { - fileInfo, err := file.Stat() - if err != nil { - return err - } - if size > maxAllowFileSize || size < 0 { - return fmt.Errorf("invalid size") - } - if fileInfo.Size() > size*oneMegabytes { - return fmt.Errorf("file size error %v", fileInfo.Size()) - } - if (fileInfo.Mode() & fs.ModeSymlink) != 0 { - return fmt.Errorf("file is softlink") - } - if st := fileInfo.Sys(); st.(*syscall.Stat_t).Uid != uint32(os.Geteuid()) { - return fmt.Errorf("file owner incorrect") - } - return nil -} - -// SafeChmod after the verification is complete, run the chmod command. 
-func SafeChmod(path string, size int64, mode os.FileMode) error { - file, err := os.Open(path) - if err != nil { - return err - } - defer file.Close() - if err = VerifyFile(file, size); err != nil { - return err - } - if err = file.Chmod(mode); err != nil { - return err - } - return nil -} - -func realPathChecker(path string, checkParent, allowLink bool) (string, os.FileInfo, error) { - realPath, err := filepath.Abs(path) - if err != nil { - return notValidPath, nil, err - } - if len(realPath) > DefaultPathLength { - return notValidPath, nil, fmt.Errorf("path over max path length") - } - if !stringChecker(realPath, 0, DefaultPathLength) { - return notValidPath, nil, fmt.Errorf("invalid path") - } - if err = fileChecker(realPath, true, checkParent, allowLink, 0); err != nil { - return notValidPath, nil, err - } - fileInfo, err := os.Stat(realPath) - if err != nil { - return notValidPath, nil, err - } - return realPath, fileInfo, nil -} - -func fileChecker(path string, allowDir, checkParent, allowLink bool, deep int) error { - const maxDepth int = 99 - if deep > maxDepth { - return fmt.Errorf("over maxDepth %d", maxDepth) - } - fileInfo, err := normalFileCheck(path, allowDir, allowLink) - if err != nil { - return err - } - if err = checkOwnerAndPermission(fileInfo, path); err != nil { - return err - } - if path != "/" && checkParent { - return fileChecker(filepath.Dir(path), true, true, allowLink, deep+1) - } - return nil -} - -func pathDepthChecker(path string, deep int) error { - const maxDepth int = 99 - if deep > maxDepth { - return fmt.Errorf("over maxDepth %d", maxDepth) - } - if path != "/" { - return pathDepthChecker(filepath.Dir(path), deep+1) - } - return nil -} - -func checkOwnerAndPermission(fileInfo os.FileInfo, filePath string) error { - const groupWriteIndex, otherWriteIndex, permLength int = 5, 8, 10 - perm := fileInfo.Mode().Perm().String() - if len(perm) != permLength { - return fmt.Errorf("permission not right %v %v", filePath, perm) - } - for 
index, char := range perm { - if (index == groupWriteIndex || index == otherWriteIndex) && char == 'w' { - return fmt.Errorf("write permission not right %v %v", filePath, perm) - } - } - stat, ok := fileInfo.Sys().(*syscall.Stat_t) - if !ok { - return fmt.Errorf("can not get stat %v", filePath) - } - if !(int(stat.Uid) == 0 || int(stat.Uid) == os.Getuid()) { - return fmt.Errorf("owner not right %v %v", filePath, int(stat.Uid)) - } - return nil -} - -func normalFileCheck(filePath string, allowDir, allowLink bool) (os.FileInfo, error) { - realPath, err := filepath.EvalSymlinks(filePath) - if err != nil || (realPath != filePath && !allowLink) { - return nil, fmt.Errorf("symlinks or not existed, failed %v, %v", filePath, err) - } - fileInfo, err := os.Stat(filePath) - if err != nil { - return nil, fmt.Errorf("get file stat failed %v", err) - } - if allowDir && !fileInfo.Mode().IsRegular() && !fileInfo.IsDir() { - return nil, fmt.Errorf("not regular file/dir %v", filePath) - } - if !allowDir && !fileInfo.Mode().IsRegular() { - return nil, fmt.Errorf("not regular file %v", filePath) - } - if fileInfo.Mode()&os.ModeSetuid != 0 { - return nil, fmt.Errorf("setuid not allowed %v", filePath) - } - if fileInfo.Mode()&os.ModeSetgid != 0 { - return nil, fmt.Errorf("setgid not allowed %v", filePath) - } - return fileInfo, nil -} - -func isValidCode(c rune) bool { - return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || ('0' <= c && c <= '9') -} - -func isInWhiteList(c rune) bool { - return strings.Contains(DefaultWhiteList, string(c)) -} - -func stringChecker(text string, minLength, maxLength int) bool { - if len(text) <= minLength || len(text) >= maxLength { - return false - } - for _, char := range text { - if !isValidCode(char) && !isInWhiteList(char) { - return false - } - } - return true -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/file_check_test.go b/mind-cluster/component/ascend-common/common-utils/utils/file_check_test.go deleted file mode 
100644 index 3c8e065..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/file_check_test.go +++ /dev/null @@ -1,194 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package mindxcheckutils is a check utils package -package utils - -import ( - "os" - "strings" - "testing" -) - -func TestNormalFileCheckRegularFile(t *testing.T) { - tmpDir, filePath, err := createTestFile(t, "test_file.txt") - defer removeTmpDir(t, tmpDir) - err = os.Symlink(filePath, tmpDir+"/syslink") - if err != nil { - t.Fatalf("create symlink failed %q: %s", filePath, err) - } - - if _, err = normalFileCheck(tmpDir, true, false); err != nil { - t.Fatalf("check allow dir failed %q: %s", tmpDir+"/__test__", err) - } - - if _, err = normalFileCheck(tmpDir, false, false); !strings.Contains(err.Error(), "not regular file") { - t.Fatalf("check not allow dir failed %q: %s", tmpDir+"/__test__", err) - } - - if _, err = normalFileCheck("/dev/zero", true, false); !strings.Contains(err.Error(), "not regular file/dir") { - t.Fatalf("check /dev/zero failed %q: %s", tmpDir+"/__test__", err) - } - - if _, err = normalFileCheck(tmpDir+"/syslink", false, false); !strings.Contains(err.Error(), "symlinks") { - t.Fatalf("check symlinks failed %q: %s", tmpDir+"/syslink", err) - } - - if _, err = normalFileCheck(filePath, false, false); err != nil { - t.Fatalf("check failed %q: %s", filePath, err) - } - - if _, err = 
normalFileCheck(tmpDir+"/notexisted", false, false); !strings.Contains(err.Error(), "not existed") { - t.Fatalf("check symlinks failed %q: %s", tmpDir+"/syslink", err) - } -} - -func TestRealFileChecker(t *testing.T) { - tmpDir, filePath, err := createTestFile(t, "test_file.txt") - if err != nil { - t.Fatalf("create file failed %q: %s", filePath, err) - } - defer removeTmpDir(t, tmpDir) - const permission os.FileMode = 0700 - err = os.WriteFile(filePath, []byte("hello\n"), permission) - if err != nil { - t.Fatalf("create file failed %q: %s", filePath, err) - } - if _, err = RealFileChecker(filePath, false, true, 0); err == nil { - t.Fatalf("size check wrong 0 %q: %s", filePath, err) - } - if _, err = RealFileChecker(filePath, false, true, 1); err != nil { - t.Fatalf("size check wrong 1 %q: %s", filePath, err) - } -} - -func TestRealFileCheckerInside(t *testing.T) { - tmpDir, filePath, err := createTestFile(t, "test_file.txt") - if err != nil { - t.Fatalf("create file failed %q: %s", filePath, err) - } - defer removeTmpDir(t, tmpDir) - const permission os.FileMode = 0700 - const deep int = 100 - err = os.WriteFile(filePath, []byte("hello\n"), permission) - if err != nil { - t.Fatalf("create file failed %q: %s", filePath, err) - } - if err = fileChecker(filePath, false, false, false, deep); err == nil { - t.Fatalf("size check wrong 0 %q: %s", filePath, err) - } -} - -func TestRealDirChecker(t *testing.T) { - tmpDir, filePath, err := createTestFile(t, "test_file.txt") - if err != nil { - t.Fatalf("create file failed %q: %s", filePath, err) - } - defer removeTmpDir(t, tmpDir) - if _, err = RealDirChecker(filePath, false, true); err == nil { - t.Fatalf("should be dir 0 %q: %s", filePath, err) - } - if _, err = RealDirChecker(tmpDir, false, true); err != nil { - t.Fatalf("should be dir 1 %q: %s", filePath, err) - } -} - -func TestVerifyFile(t *testing.T) { - tmpDir, filePath, err := createTestFile(t, "test_file.txt") - if err != nil { - t.Fatalf("create file failed %q: 
%s", filePath, err) - } - defer removeTmpDir(t, tmpDir) - err = os.Symlink(filePath, tmpDir+"/syslink") - if err != nil { - t.Fatalf("create symlink failed %q: %s", filePath, err) - } - file, err := os.Open(filePath) - if err != nil { - t.Fatalf("open file failed") - } - defer file.Close() - linkFile, err := os.Open(tmpDir + "/syslink") - if err != nil { - t.Fatalf("open file failed") - } - defer linkFile.Close() - const permission os.FileMode = 0700 - err = os.WriteFile(filePath, []byte("hello\n"), permission) - if err != nil { - t.Fatalf("create file failed %q: %s", filePath, err) - } - if err = VerifyFile(file, 0); err == nil { - t.Fatalf("size check wrong 0 %q: %s", filePath, err) - } - if err = VerifyFile(file, 1); err != nil { - t.Fatalf("size check wrong 1 %q: %s", filePath, err) - } - if err = VerifyFile(linkFile, 1); err != nil && !strings.Contains(err.Error(), "symlinks") { - t.Fatalf("check symlinks failed %q: %s", tmpDir+"/syslink", err) - } -} - -func TestStringChecker(t *testing.T) { - if ok := stringChecker("0123456789abcABC", 0, DefaultStringLength); !ok { - t.Fatalf("failed on regular letters") - } - const testSize = 3 - if ok := stringChecker("123", 0, testSize); ok { - t.Fatalf("failed on max length") - } - if ok := stringChecker("1234", 0, testSize); ok { - t.Fatalf("failed on max length") - } - if ok := stringChecker("12", 0, testSize); !ok { - t.Fatalf("failed on max length") - } - if ok := stringChecker("", 0, testSize); ok { - t.Fatalf("failed on min length") - } - if ok := stringChecker("123", testSize, DefaultStringLength); ok { - t.Fatalf("failed on min length") - } - if ok := stringChecker("123%", 0, DefaultStringLength); ok { - t.Fatalf("failed on strange words") - } - if ok := stringChecker("123.-/~", 0, DefaultStringLength); !ok { - t.Fatalf("failed on strange words") - } -} - -func createTestFile(t *testing.T, fileName string) (string, string, error) { - const fileMode os.FileMode = 0600 - tmpDir := os.TempDir() - const permission 
os.FileMode = 0700 - if os.MkdirAll(tmpDir+"/__test__", permission) != nil { - t.Fatalf("MkdirAll failed %q", tmpDir+"/__test__") - } - f, err := os.Create(tmpDir + "/__test__" + fileName) - if err != nil { - t.Fatalf("create file failed %q: %s", tmpDir+"/__test__", err) - } - defer f.Close() - err = f.Chmod(fileMode) - if err != nil { - t.Fatalf("change file mode failed %q: %s", tmpDir+"/__test__", err) - } - return tmpDir + "/__test__", tmpDir + "/__test__" + fileName, err -} - -func removeTmpDir(t *testing.T, tmpDir string) { - if os.RemoveAll(tmpDir) != nil { - t.Logf("removeall %v", tmpDir) - } -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/file_test.go b/mind-cluster/component/ascend-common/common-utils/utils/file_test.go deleted file mode 100644 index 8f91417..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/file_test.go +++ /dev/null @@ -1,169 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package utils provides the util func -package utils - -import ( - "errors" - "fmt" - "os" - "path/filepath" - "reflect" - "testing" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" -) - -func TestReadLimitBytes(t *testing.T) { - convey.Convey("test ReadLimitBytes func", t, func() { - convey.Convey("should return nil given empty string", func() { - emptyString := "" - const limitLength = 10 - res, err := ReadLimitBytes(emptyString, limitLength) - convey.So(res, convey.ShouldBeNil) - convey.So(err, convey.ShouldBeError) - }) - - convey.Convey("should not return nil given valid path", func() { - const limitLength = 10 - res, err := ReadLimitBytes("../../go.mod", limitLength) - convey.So(res, convey.ShouldNotBeNil) - convey.So(err, convey.ShouldBeNil) - }) - - convey.Convey("should return nil given invalid limit length", func() { - const limitLength = -1 - res, err := ReadLimitBytes("../../go.mod", limitLength) - convey.So(res, convey.ShouldBeNil) - convey.So(err.Error(), convey.ShouldEqual, "the limit length is not valid") - }) - - convey.Convey("should return nil when check path failed", func() { - checkStub := gomonkey.ApplyFunc(CheckPath, func(path string) (string, error) { - return "", errors.New("check failed") - }) - defer checkStub.Reset() - const limitLength = 10 - res, err := ReadLimitBytes("../../go.mod", limitLength) - convey.So(res, convey.ShouldBeNil) - convey.So(err.Error(), convey.ShouldEqual, "check failed") - }) - - convey.Convey("should return nil when read file failed", func() { - var file *os.File - checkStub := gomonkey.ApplyMethod(reflect.TypeOf(file), "Read", - func(_ *os.File, _ []byte) (int, error) { - return 0, errors.New("read file failed") - }) - defer checkStub.Reset() - const limitLength = 10 - res, err := ReadLimitBytes("../../go.mod", limitLength) - convey.So(res, convey.ShouldBeNil) - convey.So(err.Error(), convey.ShouldEqual, "read file failed: read file failed") - }) - }) -} - -func 
TestLoadFile(t *testing.T) { - convey.Convey("test LoadFile func", t, func() { - convey.Convey("should return error given empty path", func() { - res, err := LoadFile("") - convey.So(res, convey.ShouldBeNil) - convey.So(err, convey.ShouldBeNil) - }) - - convey.Convey("should return nil given path not existing", func() { - res, err := LoadFile("xxxx") - convey.So(res, convey.ShouldBeNil) - convey.So(err, convey.ShouldBeNil) - }) - - convey.Convey("should not return nil given valid path", func() { - res, err := LoadFile("../../go.mod") - convey.So(res, convey.ShouldNotBeNil) - convey.So(err, convey.ShouldBeNil) - }) - - convey.Convey("should return nil given invalid path", func() { - absStub := gomonkey.ApplyFunc(filepath.Abs, func(path string) (string, error) { - return "", errors.New("the path is invalid") - }) - defer absStub.Reset() - res, err := LoadFile("../../go.mod") - convey.So(res, convey.ShouldBeNil) - convey.So(err.Error(), convey.ShouldEqual, "the filePath is invalid: the path is invalid") - }) - - convey.Convey("should return nil when read file failed", func() { - readStub := gomonkey.ApplyFunc(ReadLimitBytes, func(path string, limitLength int) ([]byte, error) { - return nil, errors.New("read file failed") - }) - defer readStub.Reset() - res, err := LoadFile("../../go.mod") - convey.So(res, convey.ShouldBeNil) - convey.So(err.Error(), convey.ShouldEqual, "read file failed") - }) - }) -} - -func TestCopyDir(t *testing.T) { - convey.Convey("test CopyDir func", t, func() { - convey.Convey("should return error given empty src path", func() { - err := CopyDir("", "") - convey.So(err, convey.ShouldNotBeNil) - }) - convey.Convey("should return error given file src path", func() { - err := CopyDir("../../go.mod", "") - convey.So(err, convey.ShouldNotBeNil) - }) - convey.Convey("should return nil given dir src path", func() { - err := CopyDir("../utils", "../utils_test") - convey.So(err, convey.ShouldBeNil) - }) - convey.Convey("should return error given file 
dst path", func() { - err := CopyDir("../utils", "../utils_test/file_test.go") - convey.So(err, convey.ShouldNotBeNil) - }) - }) -} - -func TestCopyFile(t *testing.T) { - convey.Convey("test CopyFile func", t, func() { - convey.Convey("should return error given empty src file path", func() { - err := CopyFile("", "../utils_test/file_test.go") - convey.So(err, convey.ShouldNotBeNil) - }) - convey.Convey("should return error given empty dst path", func() { - err := CopyFile("../utils_test/file_test.go", "") - convey.So(err, convey.ShouldNotBeNil) - }) - convey.Convey("should return error given dir scr path", func() { - err := CopyFile("../utils", "../utils_test/file_test.go") - convey.So(err, convey.ShouldNotBeNil) - }) - convey.Convey("should return error given dir dst path", func() { - err := CopyFile("../utils/file_test.go", "../utils_test") - convey.So(err, convey.ShouldNotBeNil) - }) - convey.Convey("should return nil given file scr and dst path", func() { - err := CopyFile("../utils/file_test.go", "../utils_test/file_test.go") - convey.So(err, convey.ShouldBeNil) - }) - }) - if err := os.RemoveAll("../utils_test"); err != nil { - fmt.Print("remove util_test file failed") - } -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/file_watcher.go b/mind-cluster/component/ascend-common/common-utils/utils/file_watcher.go deleted file mode 100644 index 78f4266..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/file_watcher.go +++ /dev/null @@ -1,85 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package utils offer utils for file watcher -package utils - -import ( - "fmt" - "os" - - "github.com/fsnotify/fsnotify" -) - -// FileWatcher struct file watcher -type FileWatcher struct { - watcher *fsnotify.Watcher -} - -// NewFileWatcher new FileWatcher -func NewFileWatcher() (*FileWatcher, error) { - watcher, err := fsnotify.NewWatcher() - if err != nil { - return nil, err - } - return &FileWatcher{watcher: watcher}, nil -} - -// WatchFile add file to watch -func (fw *FileWatcher) WatchFile(filePath string) error { - if _, err := os.Stat(filePath); err != nil { - return err - } - if _, err := PathStringChecker(filePath); err != nil { - return err - } - return fw.watcher.Add(filePath) -} - -// Events get event channel -func (fw *FileWatcher) Events() chan fsnotify.Event { - if fw == nil || fw.watcher == nil { - return nil - } - return fw.watcher.Events -} - -// Errors get error channel -func (fw *FileWatcher) Errors() chan error { - if fw == nil || fw.watcher == nil { - return nil - } - return fw.watcher.Errors -} - -// Close to close the file watcher -func (fw *FileWatcher) Close() error { - if fw == nil || fw.watcher == nil { - return nil - } - return fw.watcher.Close() -} - -// GetFileWatcherChan get eventCh and errCh for file watcher -func GetFileWatcherChan(filePath string) (*FileWatcher, error) { - watcher, err := NewFileWatcher() - if err != nil { - return nil, fmt.Errorf("new file watcher failed, error: %v", err) - } - if err = watcher.WatchFile(filePath); err != nil { - return nil, fmt.Errorf("watch file <%s> failed, error: %v", filePath, err) - 
} - fmt.Printf("watching file <%s>...\n", filePath) - return watcher, nil -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/file_watcher_test.go b/mind-cluster/component/ascend-common/common-utils/utils/file_watcher_test.go deleted file mode 100644 index 32220da..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/file_watcher_test.go +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package utils test for file watcher utils -package utils - -import ( - "errors" - "fmt" - "os" - "testing" - - "github.com/agiledragon/gomonkey/v2" - "github.com/fsnotify/fsnotify" - "github.com/smartystreets/goconvey/convey" -) - -var testErr = errors.New("test error") - -const ( - testFilePath = "./test.txt" - errFilePath = "./not_exist_file.txt" -) - -func TestGetFileWatcherChan(t *testing.T) { - prepareTestFile(t) - defer removeFile() - - p1 := gomonkey.ApplyFuncReturn(PathStringChecker, "", nil) - defer p1.Reset() - convey.Convey("test func GetFileWatcherChan success", t, func() { - _, err := GetFileWatcherChan(testFilePath) - convey.So(err, convey.ShouldBeNil) - }) - convey.Convey("test func GetFileWatcherChan failed, new watcher err", t, func() { - p2 := gomonkey.ApplyFuncReturn(fsnotify.NewWatcher, nil, testErr) - defer p2.Reset() - _, err := GetFileWatcherChan(testFilePath) - expErr := fmt.Errorf("new file watcher failed, error: %v", testErr) - convey.So(err, convey.ShouldResemble, expErr) - }) - convey.Convey("test func GetFileWatcherChan failed, file does not exist", t, func() { - _, err := GetFileWatcherChan(errFilePath) - expErr := fmt.Sprintf("watch file <%s> failed", errFilePath) - convey.So(err.Error(), convey.ShouldContainSubstring, expErr) - }) - convey.Convey("test func GetFileWatcherChan failed, watcher is nil", t, func() { - var watcher = &FileWatcher{} - eventCh := watcher.Events() - convey.So(eventCh, convey.ShouldBeNil) - errCh := watcher.Errors() - convey.So(errCh, convey.ShouldBeNil) - err := watcher.Close() - convey.So(err, convey.ShouldBeNil) - }) -} - -func prepareTestFile(t *testing.T) { - const mode644 = 0644 - err := os.WriteFile(testFilePath, []byte("file context"), mode644) - if err != nil { - t.Error(err) - } -} - -func removeFile() { - if err := os.Remove(testFilePath); err != nil && errors.Is(err, os.ErrNotExist) { - fmt.Printf("remove file %s failed, %v\n", testFilePath, err) - } -} diff --git 
a/mind-cluster/component/ascend-common/common-utils/utils/interface.go b/mind-cluster/component/ascend-common/common-utils/utils/interface.go deleted file mode 100644 index 7ccae4d..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/interface.go +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package utils offer the some utils for certificate handling -package utils - -import "reflect" - -// IsNil check whether the interface is nil, including type or data is nil -func IsNil(i interface{}) bool { - if i == nil { - return true - } - defer func() { - recover() - }() - return reflect.ValueOf(i).IsNil() -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/interface_test.go b/mind-cluster/component/ascend-common/common-utils/utils/interface_test.go deleted file mode 100644 index f2ce878..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/interface_test.go +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package utils offer the some utils for certificate handling -package utils - -import ( - "testing" - - "github.com/smartystreets/goconvey/convey" -) - -func TestIsNil(t *testing.T) { - var a interface{} // type = nil, data = nil - var b interface{} = (*int)(nil) // type is *int , data = nil - var c interface{} = "dd" - convey.Convey("test IsNil func, type and data is both nil", t, func() { - convey.So(a == nil, convey.ShouldEqual, true) - convey.So(b == nil, convey.ShouldEqual, false) - convey.So(c == nil, convey.ShouldEqual, false) - convey.So(IsNil(a), convey.ShouldEqual, true) - convey.So(IsNil(b), convey.ShouldEqual, true) - convey.So(IsNil(c), convey.ShouldEqual, false) - }) -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/ip_utils.go b/mind-cluster/component/ascend-common/common-utils/utils/ip_utils.go deleted file mode 100644 index f3ed96e..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/ip_utils.go +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package utils offer the some utils for certificate handling -package utils - -import ( - "errors" - "net" - "net/http" - "regexp" - "strings" -) - -const ( - domainReg = "^[a-zA-Z0-9][a-zA-Z0-9.-]{1,256}[a-zA-Z0-9]$" -) - -// ClientIP try to get the clientIP -func ClientIP(r *http.Request) string { - // get forward ip fistly - var ip string - xForwardedFor := r.Header.Get("X-Forwarded-For") - forwardSlice := strings.Split(xForwardedFor, ",") - if len(forwardSlice) >= 1 { - if ip = strings.TrimSpace(forwardSlice[0]); ip != "" { - return ip - } - } - // try get ip from "X-Real-Ip" - ip = strings.TrimSpace(r.Header.Get("X-Real-Ip")) - if ip != "" { - return ip - } - var err error - if ip, _, err = net.SplitHostPort(strings.TrimSpace(r.RemoteAddr)); err == nil { - return ip - } - return "" -} - -// CheckDomain check domain which by regex and blacklist -func CheckDomain(domain string, forLocalUsage bool) error { - matched, err := regexp.MatchString(domainReg, domain) - if err != nil { - return err - } - if !matched { - return errors.New("domain does not match allowed regex") - } - if !forLocalUsage { - return nil - } - if IsDigitString(domain) { - return errors.New("domain can not be all digits") - } - if strings.Contains(domain, "localhost") { - return errors.New("domain can not contain localhost") - } - return nil -} - -// IsHostValid check if the host is valid -func IsHostValid(host string) error { - parsedIp := net.ParseIP(host) - if parsedIp != nil { - return IsIPValid(parsedIp) - } - return CheckDomain(host, false) -} - -// IsIPValid check ip valid -func IsIPValid(parsedIp net.IP) error { - if parsedIp == nil { - return errors.New("parse ip is nil") - } - if parsedIp.To4() == nil && parsedIp.To16() == nil { - return errors.New("not a valid ipv4 or ipv6 ip") - } - if parsedIp.IsUnspecified() { - return errors.New("is all zeros ip") - } - if 
parsedIp.IsMulticast() { - return errors.New("is multicast ip") - } - return nil -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/ip_utils_test.go b/mind-cluster/component/ascend-common/common-utils/utils/ip_utils_test.go deleted file mode 100644 index 6ad93ab..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/ip_utils_test.go +++ /dev/null @@ -1,182 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package utils offer the some utils for certificate handling -package utils - -import ( - "net/http" - "testing" - - "github.com/smartystreets/goconvey/convey" -) - -const ( - localhost = "127.0.0.1" - localhostLoop = "0.0.0.0" -) - -func TestClientIP(t *testing.T) { - convey.Convey("test ClientIP func", t, func() { - convey.Convey("get IP from X-Forwarded-For", func() { - ip := ClientIP(mockRequest(map[string][]string{"X-Forwarded-For": {localhost, localhostLoop}})) - convey.So(ip, convey.ShouldEqual, localhost) - }) - convey.Convey("get IP from X-Real-Ip", func() { - ip := ClientIP(mockRequest(map[string][]string{"X-Forwarded-For": {}, - "X-Real-Ip": {localhost}})) - convey.So(ip, convey.ShouldEqual, localhost) - }) - convey.Convey("get IP from RemoteAddr", func() { - ip := ClientIP(mockRequest(map[string][]string{"X-Forwarded-For": {}, - "X-Real-Ip": {}})) - convey.So(ip, convey.ShouldEqual, localhost) - }) - convey.Convey("get IP from RemoteAddr failed", func() { - ip := ClientIP(&http.Request{RemoteAddr: localhost}) - convey.So(ip, convey.ShouldEqual, "") - }) - convey.Convey("get IP failed", func() { - ip := ClientIP(&http.Request{}) - convey.So(ip, convey.ShouldEqual, "") - }) - }) -} - -func mockRequest(header map[string][]string) *http.Request { - return &http.Request{ - Method: "GET", - URL: nil, - Proto: "HTTP", - ProtoMajor: 0, - ProtoMinor: 0, - Header: header, - ContentLength: 0, - Close: false, - Host: "www.test.com", - RemoteAddr: "127.0.0.1:8080", - } -} - -func TestCheckDomain(t *testing.T) { - convey.Convey("CheckDomain function test suite", t, func() { - testDomainFormatValidation() - testLocalUsageConstraints() - testParameterCombinations() - }) -} - -// Test domain format validation -func testDomainFormatValidation() { - convey.Convey("Validate domain format rules", func() { - convey.Convey("Valid domain should pass validation", func() { - err := CheckDomain("example.com", false) - convey.So(err, convey.ShouldBeNil) - }) - - 
convey.Convey("Domain with special characters should be rejected", func() { - err := CheckDomain("example@com", false) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldContainSubstring, "domain does not match allowed regex") - }) - - convey.Convey("Domain starting with hyphen should be rejected", func() { - err := CheckDomain("-example.com", false) - convey.So(err, convey.ShouldNotBeNil) - }) - }) -} - -// Test local usage constraints -func testLocalUsageConstraints() { - convey.Convey("Validate constraints for local usage (forLocalUsage=true)", func() { - convey.Convey("All-digit domain should be rejected", func() { - err := CheckDomain("123456", true) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldContainSubstring, "domain can not be all digits") - }) - - convey.Convey("Domain containing 'localhost' should be rejected", func() { - err := CheckDomain("my-localhost.com", true) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldContainSubstring, "domain can not contain localhost") - }) - - convey.Convey("Valid local domain should pass validation", func() { - err := CheckDomain("local-app.example", true) - convey.So(err, convey.ShouldBeNil) - }) - }) -} - -// Test parameter combinations -func testParameterCombinations() { - convey.Convey("Validate parameter combinations", func() { - convey.Convey("All-digit restriction ignored when forLocalUsage=false", func() { - err := CheckDomain("123456", false) - convey.So(err, convey.ShouldBeNil) - }) - - convey.Convey("DNS check skipped when forLocalUsage=false", func() { - err := CheckDomain("unresolvable.test", false) - convey.So(err, convey.ShouldBeNil) - }) - }) -} - -func TestIsHostValid(t *testing.T) { - tests := []struct { - name string - ip string - wantErr bool - errMsg string - }{ - { - name: "invalid IP format but domain", ip: "not.an.ip", - wantErr: false, - }, - { - name: "valid IPv4", ip: "192.168.1.1", wantErr: false, - }, - { 
- name: "valid IPv6", ip: "2001:0db8:85a3:0000:0000:8a2e:0370:7334", - wantErr: false, - }, - { - name: "unspecified IPv4", ip: "0.0.0.0", - wantErr: true, errMsg: "is all zeros ip", - }, - { - name: "unspecified IPv6", ip: "::", - wantErr: true, errMsg: "is all zeros ip", - }, - { - name: "IPv6 multicast", ip: "ff02::1", - wantErr: true, errMsg: "is multicast ip", - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - err := IsHostValid(tt.ip) - if (err != nil) != tt.wantErr { - t.Errorf("IsIPValid() error = %v, wantErr %v", err, tt.wantErr) - return - } - if err != nil && err.Error() != tt.errMsg { - t.Errorf("IsIPValid() error = %v, wantErrMsg %v", - err.Error(), tt.errMsg) - } - }) - } -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/path.go b/mind-cluster/component/ascend-common/common-utils/utils/path.go deleted file mode 100644 index b3150b9..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/path.go +++ /dev/null @@ -1,382 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package utils provides the util func -package utils - -import ( - "bufio" - "errors" - "fmt" - "io" - "io/fs" - "log" - "os" - "os/exec" - "path" - "path/filepath" - "strings" - "syscall" -) - -const ( - dirMode = 0700 - - rootUID = 0 - maxPathDepth = 20 - maxPathLength = 1024 - // DefaultWriteFileMode default file mode for write permission check - DefaultWriteFileMode = 0022 - - ldSplitLen = 2 - ldLibNameIndex = 0 - ldLibPathIndex = 1 - ldCommand = "/sbin/ldconfig" - ldParam = "--print-cache" - // LdLibPath LD_LIBRARY_PATH - LdLibPath = "LD_LIBRARY_PATH" - grepCommand = "/bin/grep" -) - -// IsDir check whether the path is a directory. -func IsDir(path string) bool { - if path == "" { - return false - } - - if !IsExist(path) { - return path[len(path)-1:] == "/" - } - s, err := os.Stat(path) - if err != nil { - return false - } - return s.IsDir() -} - -// IsFile check whether the path is a file -func IsFile(path string) bool { - if path == "" { - return false - } - return !IsDir(path) -} - -// IsSoftlink check whether the path is softlink -func IsSoftlink(path string) (bool, error) { - file, err := os.Open(path) - if err != nil { - return false, err - } - defer file.Close() - fileInfo, err := file.Stat() - if err != nil { - return false, err - } - if (fileInfo.Mode() & fs.ModeSymlink) != 0 { - return true, nil - } - return false, nil -} - -// IsExist check whether the path exists, If the file is a symbolic link, the returned the final FileInfo -func IsExist(filePath string) bool { - _, err := os.Stat(filePath) - if err == nil { - return true - } - if os.IsExist(err) { - return true - } - return false -} - -// IsLexist check whether the path exists, If the file is a symbolic link, the returned FileInfo -// describes the symbolic link -func IsLexist(filePath string) bool { - _, err := os.Lstat(filePath) - if err == nil { - return true - } - if os.IsExist(err) { - return true - } - return false -} - -// CheckPath validate given path and return resolved 
absolute path -func CheckPath(path string) (string, error) { - if path == "" { - return path, nil - } - origin := path - for !IsLexist(path) { - path = filepath.Dir(path) - if path == "." { - return "", os.ErrNotExist - } - } - absPath, err := filepath.Abs(path) - if err != nil { - return "", fmt.Errorf("get the absolute path failed: %v", err) - } - resoledPath, err := filepath.EvalSymlinks(absPath) - if err != nil { - if strings.Contains(err.Error(), "no such file or directory") { - return "", os.ErrNotExist - } - return "", fmt.Errorf("get the symlinks path failed: %v", err) - } - if absPath != resoledPath { - return "", errors.New("can't support symlinks") - } - // get the original full path - absOrigin, err := filepath.Abs(origin) - if err != nil { - return "", fmt.Errorf("get the absolute path failed: %v", err) - } - return absOrigin, nil -} - -// MakeSureDir create directory. The last element of path should end with slash, or it will be omitted. -func MakeSureDir(path string) error { - dir := filepath.Dir(path) - if IsExist(dir) { - return nil - } - - if err := os.MkdirAll(dir, dirMode); err != nil { - return fmt.Errorf("create directory failed: %v", err) - } - - return nil -} - -// CheckMode check input file mode whether includes invalid mode. -// For example, if read operation of group and other is forbidden, then call CheckMode(inputFileMode, 0044). -// All operations are forbidden for group and other, then call CheckMode(inputFileMode, 0077). 
-// Write operation is forbidden for group and other by default, with calling CheckMode(inputFileMode) -func CheckMode(mode os.FileMode, optional ...os.FileMode) bool { - var targetMode os.FileMode - if len(optional) > 0 { - targetMode = optional[0] - } else { - targetMode = DefaultWriteFileMode - } - checkMode := uint32(mode) & uint32(targetMode) - return checkMode == 0 -} - -// CheckOwnerAndPermission check path owner and permission -func CheckOwnerAndPermission(verifyPath string, mode os.FileMode, uid uint32) (string, error) { - if verifyPath == "" { - return verifyPath, errors.New("empty path") - } - absPath, err := filepath.Abs(verifyPath) - if err != nil { - return "", fmt.Errorf("abs failed %v", err) - } - resoledPath, err := filepath.EvalSymlinks(absPath) - if err != nil { - return "", fmt.Errorf("evalSymlinks failed %v", err) - } - // if symlinks - if absPath != resoledPath { - // check symlinks its self owner - pathInfo, err := os.Lstat(absPath) - if err != nil { - return "", fmt.Errorf("lstat failed, %v", err) - } - stat, ok := pathInfo.Sys().(*syscall.Stat_t) - if !ok || stat.Uid != uid { - return "", errors.New("symlinks owner may not root") - } - } - pathInfo, err := os.Stat(resoledPath) - if err != nil { - return "", fmt.Errorf("stat failed %v", err) - } - stat, ok := pathInfo.Sys().(*syscall.Stat_t) - if !ok || stat.Uid != uid || !CheckMode(pathInfo.Mode(), mode) { - return "", errors.New("check uid or mode failed") - } - return resoledPath, nil -} - -// DoCheckOwnerAndPermission check path owner and permission -func DoCheckOwnerAndPermission(path string, mode os.FileMode, uid uint32) error { - if !IsExist(path) { - return nil - } - pathInfo, err := os.Stat(path) - if err != nil { - return fmt.Errorf("stat failed %v", err) - } - stat, ok := pathInfo.Sys().(*syscall.Stat_t) - if !ok || stat.Uid != uid || !CheckMode(pathInfo.Mode(), mode) { - return fmt.Errorf("check uid or mode failed : %v", path) - } - return nil -} - -func checkAbsPath(libPath 
string) (string, error) { - absLibPath, err := CheckOwnerAndPermission(libPath, DefaultWriteFileMode, rootUID) - if err != nil { - return "", fmt.Errorf("%s: %v", libPath, err) - } - count := 0 - fPath := absLibPath - for { - if count >= maxPathDepth { - break - } - count++ - if fPath == "/" { - return absLibPath, nil - } - fPath = filepath.Dir(fPath) - if _, err := CheckOwnerAndPermission(fPath, DefaultWriteFileMode, rootUID); err != nil { - return "", fmt.Errorf("%s: %v", fPath, err) - } - } - return "", errors.New("absolute path check failed") -} - -func checkLibsPath(libraryPaths []string) (string, error) { - errs := make([]string, 0, len(libraryPaths)) - for _, libraryAbsName := range libraryPaths { - absLibPath, err := checkAbsPath(libraryAbsName) - if err == nil { - return absLibPath, nil - } - errs = append(errs, fmt.Sprintf("%s;", err.Error())) - } - return "", fmt.Errorf("lib path is invalid, %v", errs) -} - -func getLibFromEnv(libraryName string) (string, error) { - ldLibraryPath := os.Getenv(LdLibPath) - if len(ldLibraryPath) > maxPathLength { - return "", fmt.Errorf("invalid library path env") - } - libraryPaths := strings.Split(ldLibraryPath, ":") - targetLibs := make([]string, 0, len(ldLibraryPath)) - for _, libraryPath := range libraryPaths { - libraryAbsName := path.Join(libraryPath, libraryName) - if len(libraryAbsName) > maxPathLength || !IsLexist(libraryAbsName) { - continue - } - targetLibs = append(targetLibs, libraryAbsName) - } - if len(libraryPaths) == 0 { - return "", errors.New("file path no exist or too long") - } - return checkLibsPath(targetLibs) -} - -func trimSpaceTable(data string) string { - data = strings.Replace(data, " ", "", -1) - data = strings.Replace(data, "\t", "", -1) - data = strings.Replace(data, "\n", "", -1) - return data -} - -func parserLibPath(line, libraryName string) string { - ldInfo := strings.Split(line, "=>") - if len(ldInfo) < ldSplitLen { - return "" - } - libNames := strings.Split(ldInfo[ldLibNameIndex], " 
") - for index, libName := range libNames { - if index >= maxPathDepth { - break - } - if len(libName) == 0 { - continue - } - if name := trimSpaceTable(libName); name != libraryName { - continue - } - return trimSpaceTable(ldInfo[ldLibPathIndex]) - } - return "" -} - -func parseLibFromLdCmd(libraryName string) (string, error) { - ldCmd := exec.Command(ldCommand, ldParam) - grepCmd := exec.Command(grepCommand, libraryName) - ldCmdStdout, err := ldCmd.StdoutPipe() - if err != nil { - return "", fmt.Errorf("command exec failed: %v", err) - } - grepCmd.Stdin = ldCmdStdout - stdout, err := grepCmd.StdoutPipe() - if err != nil { - return "", fmt.Errorf("get pipe failed: %v", err) - } - if err = grepCmd.Start(); err != nil { - return "", fmt.Errorf("command exec failed: %v", err) - } - if err = ldCmd.Run(); err != nil { - return "", fmt.Errorf("command exec failed: %v", err) - } - defer func() { - if err = grepCmd.Wait(); err != nil { - log.Printf("command exec failed, %v", err) - } - }() - reader := bufio.NewReader(stdout) - count := 0 - line := "" - for { - if count >= maxPathLength { - err = errors.New("too many items in command stdout") - break - } - count++ - line, err = reader.ReadString('\n') - if err != nil || io.EOF == err { - break - } - if libPath := parserLibPath(line, libraryName); libPath != "" { - return libPath, nil - } - } - return "", fmt.Errorf("can't find valid lib: %v", err) -} - -func getLibFromLdCmd(libraryName string) (string, error) { - libraryAbsName, err := parseLibFromLdCmd(libraryName) - if err != nil { - return "", err - } - var absLibPath string - if absLibPath, err = checkAbsPath(libraryAbsName); err == nil { - return absLibPath, nil - } - return "", fmt.Errorf("driver lib is not exist or it's permission is invalid, %v", err) -} - -// GetDriverLibPath get driver lib path from ld config -func GetDriverLibPath(libraryName string) (string, error) { - var libPath string - var envErr, cmdErr error - if libPath, envErr = 
getLibFromEnv(libraryName); envErr == nil { - return libPath, nil - } - if libPath, cmdErr = getLibFromLdCmd(libraryName); cmdErr == nil { - return libPath, nil - } - return "", fmt.Errorf("cannot found valid driver lib, fromEnv: %v, fromLdCmd: %v", envErr, cmdErr) -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/path_test.go b/mind-cluster/component/ascend-common/common-utils/utils/path_test.go deleted file mode 100644 index 4e2346f..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/path_test.go +++ /dev/null @@ -1,232 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package utils provides the util func -package utils - -import ( - "errors" - "fmt" - "os" - "path/filepath" - "syscall" - "testing" - "time" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" -) - -func TestIsDir(t *testing.T) { - convey.Convey("test logger", t, func() { - convey.Convey("test IsDir func", func() { - res := IsDir("/tmp/") - convey.So(res, convey.ShouldBeTrue) - res = IsDir("/utils/") - convey.So(res, convey.ShouldBeTrue) - res = IsDir("") - convey.So(res, convey.ShouldBeFalse) - }) - }) -} - -func TestIsFile(t *testing.T) { - convey.Convey("test IsFile func", t, func() { - res := IsFile("/tmp/") - convey.So(res, convey.ShouldBeFalse) - res = IsFile("") - convey.So(res, convey.ShouldBeFalse) - }) -} - -func TestIsExist(t *testing.T) { - convey.Convey("test IsExist func", t, func() { - res := IsExist("/xxxx/") - convey.So(res, convey.ShouldBeFalse) - }) -} - -func TestIsLexist(t *testing.T) { - convey.Convey("test IsLexist func", t, func() { - res := IsLexist("/xxxx/") - convey.So(res, convey.ShouldBeFalse) - }) -} - -func TestCheckPath(t *testing.T) { - convey.Convey("test CheckPath func", t, func() { - convey.Convey("should return itself given empty string", func() { - res, err := CheckPath("") - convey.So(res, convey.ShouldBeEmpty) - convey.So(err, convey.ShouldBeNil) - }) - - convey.Convey("should return error given not exist path", func() { - res, err := CheckPath("xxxxxxx") - convey.So(res, convey.ShouldBeEmpty) - convey.So(err.Error(), convey.ShouldEqual, "file does not exist") - }) - - convey.Convey("should return resolve path given normal path", func() { - res, err := CheckPath("../../go.mod") - convey.So(res, convey.ShouldNotBeEmpty) - convey.So(err, convey.ShouldBeNil) - }) - - convey.Convey("should return err when get abs path failed", func() { - absStub := gomonkey.ApplyFunc(filepath.Abs, func(path string) (string, error) { - return "", errors.New("abs failed") - }) - defer absStub.Reset() - 
res, err := CheckPath("../../go.mod") - convey.So(res, convey.ShouldBeEmpty) - convey.So(err.Error(), convey.ShouldEqual, "get the absolute path failed: abs failed") - }) - - convey.Convey("should return err when get eval symbol link failed", func() { - symStub := gomonkey.ApplyFunc(filepath.EvalSymlinks, func(path string) (string, error) { - return "", errors.New("symlinks path failed") - }) - defer symStub.Reset() - res, err := CheckPath("../../go.mod") - convey.So(res, convey.ShouldBeEmpty) - convey.So(err.Error(), convey.ShouldEqual, "get the symlinks path failed: symlinks path failed") - }) - - convey.Convey("should return err given symbol link", func() { - symStub := gomonkey.ApplyFunc(filepath.EvalSymlinks, func(path string) (string, error) { - return "xxx", nil - }) - defer symStub.Reset() - res, err := CheckPath("../../go.mod") - convey.So(res, convey.ShouldBeEmpty) - convey.So(err.Error(), convey.ShouldEqual, "can't support symlinks") - }) - - }) -} - -func TestMakeSureDir(t *testing.T) { - convey.Convey("test MakeSureDir func", t, func() { - convey.Convey("normal situation, no err returned", func() { - err := MakeSureDir("./testdata/tmp/test") - convey.So(err, convey.ShouldEqual, nil) - }) - convey.Convey("abnormal situation,err returned", func() { - mock := gomonkey.ApplyFunc(os.MkdirAll, func(name string, perm os.FileMode) error { - return fmt.Errorf("error") - }) - defer mock.Reset() - err := MakeSureDir("./xxxx/xxx") - convey.So(err.Error(), convey.ShouldEqual, "create directory failed: error") - }) - }) -} - -func TestGetDriverLibPath(t *testing.T) { - convey.Convey("test GetDriverLibPath func", t, func() { - convey.Convey("should return itself given empty string", func() { - err := os.Setenv(LdLibPath, "") - convey.So(err, convey.ShouldBeNil) - res, err := GetDriverLibPath("") - convey.So(res, convey.ShouldBeEmpty) - convey.So(err, convey.ShouldBeError) - }) - - convey.Convey("should return path when getLibFromEnv succeed", func() { - envStub := 
gomonkey.ApplyFunc(getLibFromEnv, func(libraryName string) (string, error) { - return "/test", nil - }) - defer envStub.Reset() - res, err := GetDriverLibPath("") - convey.So(res, convey.ShouldEqual, "/test") - convey.So(err, convey.ShouldBeNil) - }) - - convey.Convey("should return path when getLibFromEnv failed but getLibFromLdCmd succeed", func() { - envStub := gomonkey.ApplyFunc(getLibFromEnv, func(libraryName string) (string, error) { - return "", errors.New("failed") - }) - defer envStub.Reset() - cmdStub := gomonkey.ApplyFunc(getLibFromLdCmd, func(libraryName string) (string, error) { - return "/test", nil - }) - defer cmdStub.Reset() - res, err := GetDriverLibPath("") - convey.So(res, convey.ShouldEqual, "/test") - convey.So(err, convey.ShouldBeNil) - }) - - }) -} - -type mockFileInfo struct { - mode os.FileMode - sys interface{} -} - -func (m *mockFileInfo) Name() string { return "mock" } -func (m *mockFileInfo) Size() int64 { return 0 } -func (m *mockFileInfo) Mode() os.FileMode { return m.mode } -func (m *mockFileInfo) ModTime() time.Time { return time.Now() } -func (m *mockFileInfo) IsDir() bool { return false } -func (m *mockFileInfo) Sys() interface{} { return m.sys } - -func TestDoCheckOwnerAndPermission(t *testing.T) { - var testPath = "/test" - var testMode os.FileMode = 0660 - var excludePermissions os.FileMode = 0002 - patch := gomonkey.NewPatches() - defer patch.Reset() - convey.Convey("should return nil when path is not exist", t, func() { - patch.ApplyFuncReturn(IsExist, false) - defer patch.Reset() - err := DoCheckOwnerAndPermission(testPath, excludePermissions, rootUID) - convey.So(err, convey.ShouldBeNil) - }) - - patch.ApplyFuncReturn(IsExist, true) - convey.Convey("should return err when stat failed", t, func() { - patch.ApplyFuncReturn(os.Stat, nil, os.ErrNotExist) - defer patch.Reset() - err := DoCheckOwnerAndPermission(testPath, excludePermissions, rootUID) - convey.So(err.Error(), convey.ShouldContainSubstring, "stat failed") - }) - - 
convey.Convey("should return err when get uid failed", t, func() { - patch.ApplyFuncReturn(os.Stat, &mockFileInfo{mode: testMode, sys: "invalid-type"}, nil) - defer patch.Reset() - - err := DoCheckOwnerAndPermission(testPath, excludePermissions, rootUID) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldContainSubstring, "check uid or mode failed") - }) - - convey.Convey("should return err when permission check failure", t, func() { - patch.ApplyFuncReturn(os.Stat, &mockFileInfo{mode: testMode, sys: &syscall.Stat_t{Uid: rootUID}}, nil) - patch.ApplyFuncReturn(CheckMode, false) - defer patch.Reset() - err := DoCheckOwnerAndPermission(testPath, excludePermissions, rootUID) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldContainSubstring, "check uid or mode failed") - }) - - convey.Convey("should return nil where all checks pass", t, func() { - patch.ApplyFuncReturn(os.Stat, &mockFileInfo{mode: testMode, sys: &syscall.Stat_t{Uid: rootUID}}, nil) - patch.ApplyFuncReturn(CheckMode, true) - defer patch.Reset() - err := DoCheckOwnerAndPermission(testPath, excludePermissions, rootUID) - convey.So(err, convey.ShouldBeNil) - }) -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/pwd_util.go b/mind-cluster/component/ascend-common/common-utils/utils/pwd_util.go deleted file mode 100644 index 49c2f36..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/pwd_util.go +++ /dev/null @@ -1,75 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package utils this file for password handler -package utils - -import ( - "bytes" - "errors" - "regexp" -) - -const ( - lowercaseCharactersRegex = `[a-z]{1,}` - uppercaseCharactersRegex = `[A-Z]{1,}` - baseNumberRegex = `[0-9]{1,}` - specialCharactersRegex = `[!\"#$%&'()*+,\-. /:;<=>?@[\\\]^_\x60{|}~]{1,}` - passWordRegex = `^[a-zA-Z0-9!\"#$%&'()*+,\-. /:;<=>?@[\\\]^_\x60{|}~]{8,64}$` - minComplexCount = 2 -) - -// CheckPassWordComplexity check password complexity -func CheckPassWordComplexity(s []byte) error { - complexCheckRegexArr := []string{ - lowercaseCharactersRegex, - uppercaseCharactersRegex, - baseNumberRegex, - specialCharactersRegex, - } - complexCount := 0 - for _, pattern := range complexCheckRegexArr { - if matched, err := regexp.Match(pattern, s); matched && err == nil { - complexCount++ - } - } - if complexCount < minComplexCount { - return errors.New("password complex not meet the requirement") - } - return nil -} - -// ValidatePassWord validate password -func ValidatePassWord(userName string, passWord []byte) error { - if err := commonCheckForPassWord(userName, passWord); err != nil { - return err - } - return CheckPassWordComplexity(passWord) -} - -func commonCheckForPassWord(userName string, passWord []byte) error { - if matched, err := regexp.Match(passWordRegex, passWord); err != nil || !matched { - return errors.New("password not meet requirement") - } - var userNameByte []byte = []byte(userName) - if bytes.Equal(userNameByte, passWord) { - return errors.New("password cannot equals username") - } - var reverseUserName = 
ReverseString(userName) - var reverseUserNameByte []byte = []byte(reverseUserName) - if bytes.Equal(reverseUserNameByte, passWord) { - return errors.New("password cannot equal reversed username") - } - return nil -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/pwd_util_test.go b/mind-cluster/component/ascend-common/common-utils/utils/pwd_util_test.go deleted file mode 100644 index 808c231..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/pwd_util_test.go +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package utils this file for password handler -package utils - -import ( - "testing" - - "github.com/smartystreets/goconvey/convey" -) - -var ( - truePasswd = []byte("aA0!\"#$%&'()*+,-. 
/:;<=>?@[\\]^_`{|}~") - falsePasswd1 = []byte("userName") - falsePasswd2 = []byte("12345678") - falsePasswd3 = []byte("1234567") - falsePasswd4 = []byte("emaNresu.") - falsePasswd5 = []byte("不支持特殊字符测试test") -) - -// TestCommonCheckForPassWord test common check for passWord -func TestCommonCheckForPassWord(t *testing.T) { - convey.Convey("correct password", t, func() { - err := ValidatePassWord("userName", truePasswd) - convey.So(err, convey.ShouldBeNil) - }) - convey.Convey("username == password", t, func() { - err := ValidatePassWord("userName", falsePasswd1) - convey.So(err.Error(), convey.ShouldEqual, "password cannot equals username") - }) - convey.Convey("complex not meet the requirement", t, func() { - err := ValidatePassWord("userName", falsePasswd2) - convey.So(err.Error(), convey.ShouldEqual, "password complex not meet the requirement") - }) - convey.Convey("password too short", t, func() { - err := ValidatePassWord("userName", falsePasswd3) - convey.So(err.Error(), convey.ShouldEqual, "password not meet requirement") - }) - convey.Convey("username equal reverse password", t, func() { - err := ValidatePassWord(".userName", falsePasswd4) - convey.So(err.Error(), convey.ShouldEqual, "password cannot equal reversed username") - }) - convey.Convey("test special ", t, func() { - err := ValidatePassWord("userName", falsePasswd5) - convey.So(err.Error(), convey.ShouldEqual, "password not meet requirement") - }) -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/slice.go b/mind-cluster/component/ascend-common/common-utils/utils/slice.go deleted file mode 100644 index f673bc1..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/slice.go +++ /dev/null @@ -1,129 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package utils this file for slice utils -package utils - -import ( - "fmt" - "slices" - "strconv" -) - -// hex hexadecimal -const hex = 16 - -type stringTool struct{} - -// StringTool slice for string tool -var StringTool stringTool - -// HexStringToInt hex string slice to int64 slice -func (s stringTool) HexStringToInt(sources []string) map[int64]struct{} { - intMap := make(map[int64]struct{}, len(sources)) - for _, source := range sources { - num, err := strconv.ParseInt(source, hex, 0) - if err != nil { - fmt.Printf("parse hex to int failed, skip it. error: %v\n", err) - continue - } - intMap[num] = struct{}{} - } - return intMap -} - -// Contains check whether slice contains target -func Contains[T comparable](sources []T, target T) bool { - for _, v := range sources { - if v == target { - return true - } - } - return false -} - -// Remove delete the first matching element in the slice -func Remove[T comparable](slice []T, target T) []T { - for i, v := range slice { - if v == target { - return append(slice[:i], slice[i+1:]...) 
- } - } - return slice -} - -// RemoveDuplicates remove duplicates from slice -func RemoveDuplicates[T comparable](slice []T) []T { - existMap := make(map[T]struct{}) - result := make([]T, 0) - for _, str := range slice { - if _, ok := existMap[str]; !ok { - existMap[str] = struct{}{} - result = append(result, str) - } - } - return result -} - -// SameElementInMap whether map contains target -func SameElementInMap[T comparable](sources map[T]struct{}, targets []T) bool { - for _, target := range targets { - if _, ok := sources[target]; ok { - return true - } - } - return false -} - -// RemoveEleSli remove element in sources which is in target -func RemoveEleSli[T comparable](source, target []T) []T { - sliMap := make(map[T]struct{}) - for _, item := range target { - sliMap[item] = struct{}{} - } - - result := make([]T, 0) - for _, ele := range source { - if _, ok := sliMap[ele]; !ok { - result = append(result, ele) - } - } - return result -} - -// RemoveElementsNotInSecond remove elements not in slice2 -func RemoveElementsNotInSecond[T comparable](slice1, slice2 []T) []T { - sliMap := make(map[T]struct{}) - for _, item := range slice2 { - sliMap[item] = struct{}{} - } - - result := make([]T, 0) - for _, item := range slice1 { - if _, ok := sliMap[item]; ok { - result = append(result, item) - } - } - return result -} - -// CheckSliceSupport check elements is supported in expects -func CheckSliceSupport(elements []int64, expects []int64) error { - for _, e := range elements { - if !slices.Contains(expects, e) { - return fmt.Errorf("element %v does not contain %v", e, expects) - } - } - return nil -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/slice_test.go b/mind-cluster/component/ascend-common/common-utils/utils/slice_test.go deleted file mode 100644 index b3bf161..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/slice_test.go +++ /dev/null @@ -1,536 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. 
All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package utils this file for slice utils -package utils - -import ( - "fmt" - "reflect" - "testing" - - "github.com/smartystreets/goconvey/convey" -) - -const ( - decimal1A = 26 - decimalFF = 255 - decimalNeg10 = 16 - decimalNegFF = -255 -) - -func buildHexStringToIntTestCase() []struct { - name string - input []string - expected map[int64]struct{} -} { - return []struct { - name string - input []string - expected map[int64]struct{} - }{ - { - name: "01 - Valid hex strings", - input: []string{"1A", "FF", "10"}, - expected: map[int64]struct{}{ - decimal1A: {}, - decimalFF: {}, - decimalNeg10: {}, - }, - }, - { - name: "02 - Invalid hex strings", - input: []string{"xyz", "ghijk"}, - expected: map[int64]struct{}{}, - }, - { - name: "03 - Empty input array", - input: []string{}, - expected: map[int64]struct{}{}, - }, - { - name: "04 - Duplicate values should be deduplicated", - input: []string{"0x1A", "1A", "0x1a"}, // All represent 26 in decimal - expected: map[int64]struct{}{ - decimal1A: {}, - }, - }, - { - name: "05 - Mixed valid and invalid inputs", - input: []string{"0x1A", "xyz", "0xFF", "invalid", "0x10"}, - expected: map[int64]struct{}{}, - }, - { - name: "06 - Negative hex numbers", - input: []string{"-0x1A", "-FF"}, - expected: map[int64]struct{}{ - decimalNegFF: {}, - }, - }, - } -} - -func TestHexStringToInt(t *testing.T) { - for _, tt := range buildHexStringToIntTestCase() { - t.Run(tt.name, 
func(t *testing.T) { - result := StringTool.HexStringToInt(tt.input) - for i := range tt.expected { - fmt.Println(i) - } - if len(result) != len(tt.expected) { - t.Errorf("Expected map length %d, but got %d", len(tt.expected), len(result)) - return - } - for key := range tt.expected { - if _, exists := result[key]; !exists { - t.Errorf("Expected key %d not found in result", key) - } - } - for key := range result { - if _, exists := tt.expected[key]; !exists { - t.Errorf("Unexpected key %d found in result", key) - } - } - }) - } -} - -func TestSameElementInMap(t *testing.T) { - for _, tt := range buildSameElementInMapTestCase() { - t.Run(tt.name, func(t *testing.T) { - result := SameElementInMap(tt.sources, tt.targets) - if result != tt.expected { - t.Errorf("SameElementInMap() = %v, expected %v", result, tt.expected) - } - }) - } -} - -func buildSameElementInMapTestCase() []struct { - name string - sources map[int]struct{} - targets []int - expected bool -} { - return []struct { - name string - sources map[int]struct{} - targets []int - expected bool - }{ - { - name: "01 There are identical elements present", - sources: map[int]struct{}{1: {}, 2: {}, 3: {}}, - targets: []int{4, 5, 2}, - expected: true, - }, - { - name: "02 There are no identical elements present\n", - sources: map[int]struct{}{1: {}, 2: {}, 3: {}}, - targets: []int{4, 5, 6}, - expected: false, - }, - { - name: "03 target is nil", - sources: map[int]struct{}{1: {}, 2: {}}, - targets: []int{}, - expected: false, - }, - { - name: "04 source is nil", - sources: map[int]struct{}{}, - targets: []int{1, 2, 3}, - expected: false, - }, - { - name: "05 source and target are both nil", - sources: map[int]struct{}{}, - targets: []int{}, - expected: false, - }, - } -} - -func TestSameElementInMap_StringType(t *testing.T) { - sources := map[string]struct{}{ - "apple": {}, - "banana": {}, - "orange": {}, - } - targets := []string{"grape", "apple", "kiwi"} - result := SameElementInMap(sources, targets) - if 
!result { - t.Errorf("SameElementInMap() with string type should return true, got false") - } - targetsNoMatch := []string{"grape", "kiwi", "mango"} - resultNoMatch := SameElementInMap(sources, targetsNoMatch) - if resultNoMatch { - t.Errorf("SameElementInMap() with string type should return false, got true") - } -} - -func TestContains(t *testing.T) { - for _, tt := range buildContainsTestCase() { - t.Run(tt.name, func(t *testing.T) { - switch s1 := tt.source.(type) { - case []int: - s2 := tt.target.(int) - result := Contains(s1, s2) - if !reflect.DeepEqual(result, tt.expected) { - t.Errorf("Contains() = %v, want %v", result, tt.expected) - } - case []string: - s2 := tt.target.(string) - result := Contains(s1, s2) - if !reflect.DeepEqual(result, tt.expected) { - t.Errorf("Contains() = %v, want %v", result, tt.expected) - } - default: - t.Errorf("unsupported type") - } - }) - } -} - -func buildContainsTestCase() []struct { - name string - source interface{} - target interface{} - expected bool -} { - return []struct { - name string - source interface{} - target interface{} - expected bool - }{ - { - name: "01 contains for int type", - source: []int{1, 2, 3, 4}, - target: 1, - expected: true, - }, - { - name: "02 not contains for int type", - source: []int{1, 2, 3, 4}, - target: 0, - expected: false, - }, - { - name: "03 contains for string type", - source: []string{"1", "2", "3", "4"}, - target: "1", - expected: true, - }, - { - name: "04 not contains for string type", - source: []string{"1", "2", "3", "4"}, - target: "0", - expected: false, - }, - { - name: "05 empty source slice", - source: []int{}, - target: 1, - expected: false, - }, - } -} - -func TestRemove(t *testing.T) { - for _, tt := range buildRemoveTestCase() { - t.Run(tt.name, func(t *testing.T) { - switch s1 := tt.source.(type) { - case []int: - s2 := tt.target.(int) - result := Remove(s1, s2) - expected := tt.expected.([]int) - if !reflect.DeepEqual(result, expected) { - t.Errorf("Contains() = %v, 
want %v", result, expected) - } - case []string: - s2 := tt.target.(string) - result := Remove(s1, s2) - expected := tt.expected.([]string) - if !reflect.DeepEqual(result, expected) { - t.Errorf("RemoveElementsNotInSecond() = %v, want %v", result, expected) - } - default: - t.Errorf("unsupported type") - } - }) - } -} - -func buildRemoveTestCase() []struct { - name string - source interface{} - target interface{} - expected interface{} -} { - return []struct { - name string - source interface{} - target interface{} - expected interface{} - }{ - { - name: "01 contains for int type", - source: []int{1, 2, 3, 4}, - target: 1, - expected: []int{2, 3, 4}, - }, - { - name: "02 not contains for int type", - source: []int{1, 2, 3, 4}, - target: 0, - expected: []int{1, 2, 3, 4}, - }, - { - name: "03 contains for string type", - source: []string{"1", "2", "3", "4"}, - target: "1", - expected: []string{"2", "3", "4"}, - }, - { - name: "04 not contains for string type", - source: []string{"1", "2", "3", "4"}, - target: "0", - expected: []string{"1", "2", "3", "4"}, - }, - { - name: "05 empty source slice", - source: []int{}, - target: 1, - expected: []int{}, - }, - } -} - -func buildRemoveElementsNotInSecondTestCase() []struct { - name string - slice1 interface{} - slice2 interface{} - expected interface{} -} { - return []struct { - name string - slice1 interface{} - slice2 interface{} - expected interface{} - }{ - { - name: "01 Basic functionality - integer slices with partial overlap", - slice1: []int{1, 2, 3, 4}, - slice2: []int{2, 4, 6, 8}, - expected: []int{2, 4}, - }, - { - name: "02 Empty first slice", - slice1: []int{}, - slice2: []int{1, 2, 3}, - expected: []int{}, - }, - { - name: "03 Empty second slice", - slice1: []int{1, 2, 3}, - slice2: []int{}, - expected: []int{}, - }, - { - name: "04 Both slices empty", - slice1: []int{}, - slice2: []int{}, - expected: []int{}, - }, - { - name: "05 No intersection between slices", - slice1: []int{1, 2, 3}, - slice2: []int{4, 
5, 6}, - expected: []int{}, - }, - { - name: "06 String type test", - slice1: []string{"1", "2", "3"}, - slice2: []string{"2", "3", "4"}, - expected: []string{"2", "3"}, - }, - } -} - -func TestRemoveElementsNotInSecond(t *testing.T) { - for _, tt := range buildRemoveElementsNotInSecondTestCase() { - t.Run(tt.name, func(t *testing.T) { - switch s1 := tt.slice1.(type) { - case []int: - s2 := tt.slice2.([]int) - expected := tt.expected.([]int) - result := RemoveElementsNotInSecond(s1, s2) - if !reflect.DeepEqual(result, expected) { - t.Errorf("RemoveElementsNotInSecond() = %v, want %v", result, expected) - } - case []string: - s2 := tt.slice2.([]string) - expected := tt.expected.([]string) - result := RemoveElementsNotInSecond(s1, s2) - if !reflect.DeepEqual(result, expected) { - t.Errorf("RemoveElementsNotInSecond() = %v, want %v", result, expected) - } - default: - t.Errorf("unsupported type") - } - }) - } -} - -func buildRemoveEleSliTestCase() []struct { - name string - source interface{} - target interface{} - expected interface{} -} { - return []struct { - name string - source interface{} - target interface{} - expected interface{} - }{ - { - name: "01 int type", - source: []int{1, 2, 3, 4, 5}, - target: []int{2, 4}, - expected: []int{1, 3, 5}, - }, - { - name: "02 source is empty for int type", - source: []int{}, - target: []int{1, 2}, - expected: []int{}, - }, - { - name: "03 target is empty for int type", - source: []int{1, 2, 3}, - target: []int{}, - expected: []int{1, 2, 3}, - }, - { - name: "04 source and target are both empty for int type", - source: []int{}, - target: []int{}, - expected: []int{}, - }, - { - name: "05 string type", - source: []string{"a", "b", "c", "d"}, - target: []string{"b", "d"}, - expected: []string{"a", "c"}, - }, - } -} - -func TestRemoveEleSli(t *testing.T) { - for _, tt := range buildRemoveEleSliTestCase() { - t.Run(tt.name, func(t *testing.T) { - switch s1 := tt.source.(type) { - case []int: - s2 := tt.target.([]int) - expected 
:= tt.expected.([]int) - result := RemoveEleSli(s1, s2) - if !reflect.DeepEqual(result, expected) { - t.Errorf("RemoveEleSli() = %v, want %v", result, expected) - } - case []string: - s2 := tt.target.([]string) - expected := tt.expected.([]string) - result := RemoveEleSli(s1, s2) - if !reflect.DeepEqual(result, expected) { - t.Errorf("RemoveEleSli() = %v, want %v", result, expected) - } - default: - t.Errorf("unsupported type") - } - }) - } -} - -func buildRemoveDuplicatesCase() []struct { - name string - input interface{} - expected interface{} -} { - return []struct { - name string - input interface{} - expected interface{} - }{ - { - name: "01 empty slice for int type", - input: []int{}, - expected: []int{}, - }, - { - name: "02 no duplicates for int type", - input: []int{1, 2, 3}, - expected: []int{1, 2, 3}, - }, - { - name: "03 with duplicates for int type", - input: []int{1, 2, 2, 3, 1, 4}, - expected: []int{1, 2, 3, 4}, - }, - { - name: "04 with duplicates for string type", - input: []string{"1", "3", "3", "4"}, - expected: []string{"1", "3", "4"}, - }, - } -} - -func TestRemoveDuplicates(t *testing.T) { - for _, tt := range buildRemoveDuplicatesCase() { - t.Run(tt.name, func(t *testing.T) { - switch s1 := tt.input.(type) { - case []int: - expected := tt.expected.([]int) - result := RemoveDuplicates(s1) - if !reflect.DeepEqual(result, expected) { - t.Errorf("RemoveDuplicates() = %v, want %v", result, expected) - } - case []string: - expected := tt.expected.([]string) - result := RemoveDuplicates(s1) - if !reflect.DeepEqual(result, expected) { - t.Errorf("RemoveDuplicates() = %v, want %v", result, expected) - } - default: - t.Errorf("unsupported type") - } - }) - } -} - -func TestCheckSliceSupport(t *testing.T) { - convey.Convey("test TestCheckSliceSupport, check ok", t, func() { - elements := []int64{1, 2} - expects := []int64{1, 2, 3} - err := CheckSliceSupport(elements, expects) - convey.So(err, convey.ShouldBeNil) - }) - convey.Convey("test 
TestCheckSliceSupport, check fail", t, func() { - elements := []int64{1, 2, 4} - expects := []int64{1, 2, 3} - err := CheckSliceSupport(elements, expects) - convey.So(err, convey.ShouldNotBeNil) - }) -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/strings.go b/mind-cluster/component/ascend-common/common-utils/utils/strings.go deleted file mode 100644 index c3d98aa..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/strings.go +++ /dev/null @@ -1,75 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package utils provides the util func -package utils - -import ( - "crypto/sha256" - "fmt" - "unicode" -) - -const ( - maskLen = 2 -) - -// ReplacePrefix replace string with prefix -func ReplacePrefix(source, prefix string) string { - if prefix == "" { - prefix = "****" - } - if len(source) <= maskLen { - return prefix - } - end := string([]rune(source)[maskLen:len(source)]) - return prefix + end -} - -// MaskPrefix mask string prefix with **** -func MaskPrefix(source string) string { - return ReplacePrefix(source, "") -} - -// GetSha256Code return the sha256 hash bytes -func GetSha256Code(data []byte) []byte { - hash256 := sha256.New() - if _, err := hash256.Write(data); err != nil { - fmt.Println(err) - return nil - } - return hash256.Sum(nil) -} - -// ReverseString reverse string -func ReverseString(s string) string { - runes := []rune(s) - for start, end := 0, len(runes)-1; start < end; start, end = start+1, end-1 { - runes[start], runes[end] = runes[end], runes[start] - } - return string(runes) -} - -// IsDigitString return string is all digit -func IsDigitString(s string) bool { - if len(s) == 0 { - return false - } - for _, c := range s { - if !unicode.IsDigit(c) { - return false - } - } - return true -} diff --git a/mind-cluster/component/ascend-common/common-utils/utils/strings_test.go b/mind-cluster/component/ascend-common/common-utils/utils/strings_test.go deleted file mode 100644 index 390e424..0000000 --- a/mind-cluster/component/ascend-common/common-utils/utils/strings_test.go +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package utils provides the util func -package utils - -import ( - "testing" - - "github.com/smartystreets/goconvey/convey" -) - -const byteLength = 32 - -func TestReplacePrefix(t *testing.T) { - convey.Convey("relative path", t, func() { - path := ReplacePrefix("./testdata/cert/ca.crt", "****") - convey.So(path, convey.ShouldEqual, "****testdata/cert/ca.crt") - }) - convey.Convey("abconvey.Solute path", t, func() { - path := ReplacePrefix("/testdata/cert/ca.crt", "****") - convey.So(path, convey.ShouldEqual, "****estdata/cert/ca.crt") - }) - convey.Convey("path length less than 2", t, func() { - path := ReplacePrefix("/", "****") - convey.So(path, convey.ShouldEqual, "****") - }) - convey.Convey("empty string", t, func() { - path := ReplacePrefix("", "****") - convey.So(path, convey.ShouldEqual, "****") - }) - -} - -func TestMaskPrefix(t *testing.T) { - convey.Convey("relative path", t, func() { - path := MaskPrefix("./testdata/cert/ca.crt") - convey.So(path, convey.ShouldEqual, "****testdata/cert/ca.crt") - }) - convey.Convey("abconvey.Solute path", t, func() { - path := MaskPrefix("/testdata/cert/ca.crt") - convey.So(path, convey.ShouldEqual, "****estdata/cert/ca.crt") - }) - convey.Convey("path length less than 2", t, func() { - path := MaskPrefix("/") - convey.So(path, convey.ShouldEqual, "****") - }) - convey.Convey("empty string", t, func() { - path := MaskPrefix("") - convey.So(path, convey.ShouldEqual, "****") - }) - -} - -func TestGetSha256Code(t *testing.T) { - convey.Convey("test sha256", t, func() { - hashs := GetSha256Code([]byte("this is a 
test sentence")) - convey.So(len(hashs), convey.ShouldEqual, byteLength) - }) -} - -func TestIsDigitString(t *testing.T) { - convey.Convey("test IsDigitString", t, func() { - convey.Convey("case IsDigitString is true", func() { - str := "123" - convey.ShouldBeTrue(IsDigitString(str)) - }) - convey.Convey("case IsDigitString is false", func() { - str := "123a" - convey.ShouldBeFalse(IsDigitString(str)) - }) - }) -} diff --git a/mind-cluster/component/ascend-common/devmanager/a310mgr.go b/mind-cluster/component/ascend-common/devmanager/a310mgr.go deleted file mode 100644 index 081f167..0000000 --- a/mind-cluster/component/ascend-common/devmanager/a310mgr.go +++ /dev/null @@ -1,25 +0,0 @@ -/* Copyright(C) 2021. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package devmanager this Ascend310 device manager -package devmanager - -import ( - "ascend-common/devmanager/dcmi" -) - -// A310Manager Ascend310 device manager -type A310Manager struct { - dcmi.DcManager -} diff --git a/mind-cluster/component/ascend-common/devmanager/a310pmgr.go b/mind-cluster/component/ascend-common/devmanager/a310pmgr.go deleted file mode 100644 index b32d1fa..0000000 --- a/mind-cluster/component/ascend-common/devmanager/a310pmgr.go +++ /dev/null @@ -1,35 +0,0 @@ -/* Copyright(C) 2021. Huawei Technologies Co.,Ltd. All rights reserved. 
- Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package devmanager this Ascend310P device manager -package devmanager - -import ( - "ascend-common/devmanager/dcmi" -) - -// A310PManager Ascend310P device manager -type A310PManager struct { - dcmi.DcManager -} - -// DcGetDevicePowerInfo query power by mcu interface for 310P -func (d *A310PManager) DcGetDevicePowerInfo(cardID, deviceID int32) (float32, error) { - return d.DcGetMcuPowerInfo(cardID) -} - -// DcGetMcuPowerInfo this function is only for Ascend310P -func (d *A310PManager) DcGetMcuPowerInfo(cardID int32) (float32, error) { - return dcmi.FuncDcmiMcuGetPowerInfo(cardID) -} diff --git a/mind-cluster/component/ascend-common/devmanager/a910mgr.go b/mind-cluster/component/ascend-common/devmanager/a910mgr.go deleted file mode 100644 index 1bb2beb..0000000 --- a/mind-cluster/component/ascend-common/devmanager/a910mgr.go +++ /dev/null @@ -1,31 +0,0 @@ -/* Copyright(C) 2021. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package devmanager this Ascend910 device manager -package devmanager - -import ( - "ascend-common/devmanager/common" - "ascend-common/devmanager/dcmi" -) - -// A910Manager Ascend910 device manager -type A910Manager struct { - dcmi.DcManager -} - -// DcGetHbmInfo get HBM information, only for Ascend910 -func (d *A910Manager) DcGetHbmInfo(cardID, deviceID int32) (*common.HbmInfo, error) { - return dcmi.FuncDcmiGetDeviceHbmInfo(cardID, deviceID) -} diff --git a/mind-cluster/component/ascend-common/devmanager/common/constants.go b/mind-cluster/component/ascend-common/devmanager/common/constants.go deleted file mode 100644 index e39ddac..0000000 --- a/mind-cluster/component/ascend-common/devmanager/common/constants.go +++ /dev/null @@ -1,272 +0,0 @@ -/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package common define common variable -package common - -import ( - "math" - - "k8s.io/apimachinery/pkg/util/sets" -) - -// DeviceType define device type -type DeviceType struct { - // Code device type code - Code int32 - // Name device type name - Name string -} - -var ( - // ProfilingTime for getting PCIe bandwidth - ProfilingTime int - - // HccsBWProfilingTime for getting hccs bandwidth - HccsBWProfilingTime int - - // a3BoardIds for A3 Board IDs - a3BoardIds = sets.NewInt32(A900A3SuperPodBin1BoardId, A900A3SuperPodBin2BoardId, - A900A3SuperPodBin3BoardId, A800IA3BoardId) - - // a900A3SuperPodMainBoardIds for A900 A3 Super Pod Main Board IDs - a900A3SuperPodMainBoardIds = sets.NewInt32(A900A3SuperPodMainBoardId1, A900A3SuperPodMainBoardId2) - - // a9000A3SuperPodMainBoardIds for A9000 A3 Super Pod Main Board IDs - a9000A3SuperPodMainBoardIds = sets.NewInt32(A9000A3SuperPodMainBoardId1, A9000A3SuperPodMainBoardId2) -) - -// DeviceType for utilization -var ( - // AICore Ascend310 & Ascend910 - AICore = DeviceType{Code: 2, Name: "AICore"} - // HbmUtilization utilization rate of hbm - HbmUtilization = DeviceType{Code: 6, Name: "Hbm"} - // VectorCore Ascend310P - VectorCore = DeviceType{Code: 12, Name: "VectorCore"} - // Overall Overall utilization rate of NPU - Overall = DeviceType{Code: 13, Name: "Overall"} -) - -// DeviceType for frequency -var ( - // AICoreCurrentFreq Ascend310 & Ascend910 & Ascend910B & Ascend310P - AICoreCurrentFreq = DeviceType{Code: 7, Name: "AICore Current"} -) - -const ( - // Success for interface return code - Success = 0 - // DeviceNotReadyErrCodeStr for dcmi interface device not ready err code string - DeviceNotReadyErrCodeStr = "-8012" - // DeviceNotReadyErrCode for dcmi interface device not ready err code - DeviceNotReadyErrCode = -8012 - // CardDropFaultCode card drop fault code - CardDropFaultCode = 0x40F84E00 - // RetError return error when the function failed - RetError = -1 - // Percent constant of 100 - Percent = 100 - 
// MaxErrorCodeCount number of error codes - MaxErrorCodeCount = 128 - // UnRetError return unsigned int error - UnRetError = math.MaxUint32 - // Abnormal status of Abnormal - Abnormal = "Abnormal" - // ChannelStateOk means out band channel is ok for resetting - ChannelStateOk = 1 - - // HiAIMaxCardID max card id for Ascend chip - HiAIMaxCardID = math.MaxInt32 - - // HiAIMaxCardNum max card number - HiAIMaxCardNum = 64 - - // HiAIMaxDeviceNum max device number - HiAIMaxDeviceNum = 4 - - // NpuType present npu chip - NpuType = 0 - - // ReduceOnePercent for calculation reduce one percent - ReduceOnePercent = 0.01 - // ReduceTenth for calculation reduce one tenth - ReduceTenth = 0.1 - // DefaultTemperatureWhenQueryFailed when get temperature failed, use this value - DefaultTemperatureWhenQueryFailed = -275 - - // Ascend310P ascend 310P chip - Ascend310P = "Ascend310P" - // Ascend910 ascend 910 chip - Ascend910 = "Ascend910" - // Ascend910B ascend 910B chip - Ascend910B = "Ascend910B" - // Ascend910A3 ascend Ascend910A3 chip - Ascend910A3 = "Ascend910A3" - // Atlas200ISoc 200 soc env - Atlas200ISoc = "Atlas 200I SoC A1" - - // DcmiApiTimeout dcmi interface timeout seconds - DcmiApiTimeout = 1 - - // SubscribeAllDevice subscribe all device ID - SubscribeAllDevice = -1 - // MinVDevID min value of virtual device id - MinVDevID = 100 - // MaxVDevID max value of virtual device id - MaxVDevID = 1124 - - // InvalidID invalid ID - InvalidID = 0xffffffff - - // FailedMetricValue for failed metric value - FailedMetricValue = -1 - - // FailedValue for failed value - FailedValue = 0xffffffff - - // MaxErrorCodeLen max length of error code for Prometheus - MaxErrorCodeLen = 10 -) - -const ( - // BootStartFinish chip hot reset finish - BootStartFinish = 16 -) - -const ( - // FaultRecover device fault recover - FaultRecover = int8(0) - // FaultOccur device fault occur - FaultOccur = int8(1) - // FaultOnce once device fault - FaultOnce = int8(2) -) - -const ( - // AMPMode for AMP chip 
work mode - AMPMode = "AMP" - // SMPMode for SMP chip work mode - SMPMode = "SMP" - - // NetworkInit init status - NetworkInit = 6 - // NetworkSuccess chip network is healthy - NetworkSuccess = 0 - - // MaxProcNum process number in device side - MaxProcNum = 32 - // UnitMB MB - UnitMB float64 = 1024 * 1024 - - // Chip910 chip name 910 - Chip910 = "910" - - // A300IA2BoardId board id of A300I A2 and 910proB - A300IA2BoardId = 0x28 - - // A300IA2GB64BoardId board id of A300I A2 64GB - A300IA2GB64BoardId = 0x29 - - // A900A3SuperPodBin1BoardId board id of A900/A9000 A3 SuperPod Bin1 - A900A3SuperPodBin1BoardId = 0xb0 - - // A900A3SuperPodBin2BoardId board id of A900/A9000 A3 SuperPod Bin2 - A900A3SuperPodBin2BoardId = 0xb1 - - // A900A3SuperPodBin3BoardId board id of A900/A9000 A3 SuperPod Bin3 - A900A3SuperPodBin3BoardId = 0xb2 - - // A800IA3BoardId board id of A800I A3 - A800IA3BoardId = 0xb3 - - // A900A3SuperPodMainBoardId1 board id of A900 A3 SuperPod MainBoard1 - A900A3SuperPodMainBoardId1 = 0x18 - - // A900A3SuperPodMainBoardId2 board id of A900 A3 SuperPod MainBoard2 - A900A3SuperPodMainBoardId2 = 0x19 - - // A800IA3MainBoardId A800I A3 MainBoardId - A800IA3MainBoardId = 0x14 - - // A9000A3SuperPodMainBoardId1 board id of A9000 A3 SuperPod MainBoard1 - A9000A3SuperPodMainBoardId1 = 0x1C - - // A9000A3SuperPodMainBoardId2 board id of A9000 A3 SuperPod MainBoard2 - A9000A3SuperPodMainBoardId2 = 0x1D -) - -// log limit domains for metrics -const ( - // DomainForLogicIdErr domain for faild to get cardId and deviceId by logicID - DomainForLogicIdErr = "logicID" -) - -// DcmiDeviceType used to represent the dcmi device type -type DcmiDeviceType int32 - -const ( - // DcmiDeviceTypeDDR represents the component type DCMI_DEVICE_TYPE_DDR - DcmiDeviceTypeDDR DcmiDeviceType = 0 - // DcmiDeviceTypeSRAM represents the component type DCMI_DEVICE_TYPE_SRAM - DcmiDeviceTypeSRAM DcmiDeviceType = 1 - // DcmiDeviceTypeHBM represents the component type DCMI_DEVICE_TYPE_HBM - 
DcmiDeviceTypeHBM DcmiDeviceType = 2 - // DcmiDeviceTypeNPU represents the component type DCMI_DEVICE_TYPE_NPU - DcmiDeviceTypeNPU DcmiDeviceType = 3 - // DcmiDeviceTypeNONE represents the component type DCMI_DEVICE_TYPE_NONE - DcmiDeviceTypeNONE DcmiDeviceType = 0xff -) - -const ( - // ErrMsgInitCardListFailed is used where initialization of the card list fails - ErrMsgInitCardListFailed = "get card list failed for init" - // ErrMsgGetBoardInfoFailed is used where there is a failure in getting board info - ErrMsgGetBoardInfoFailed = "get board info failed, no card found" -) - -const ( - // MaxHccspingMeshAddr is the max number of hccsping addresses - MaxHccspingMeshAddr = 1024 - // MinPktSize is the min packet size - MinPktSize = 1792 - // MaxPktSize is the max packet size - MaxPktSize = 3000 - // MinPktSendNum is the min packet send number - MinPktSendNum = 1 - // MaxPktSendNum is the max packet send number - MaxPktSendNum = 1000 - // MinPktInterval is the min packet interval - MinPktInterval = 1 - // MaxPktInterval is the max packet interval - MaxPktInterval = 1000 - // MinTaskInterval is the min task interval - MinTaskInterval = 1 - // MaxTaskInterval is the max task interval - MaxTaskInterval = 60 - // InternalPingMeshTaskID is the inner ping mesh task id - InternalPingMeshTaskID uint = 0 - // ExternalPingMeshTaskID is the outer ping mesh task id - ExternalPingMeshTaskID uint = 1 - // DefaultPingMeshPortID is the default ping mesh port - DefaultPingMeshPortID = 0 - // DefaultPktSize is the default packet size - DefaultPktSize = 1792 - // DefaultPktSendNum is the default packet send number - DefaultPktSendNum = 10 - // DefaultPktInterval is the default packet interval - DefaultPktInterval = 10 - // DefaultTimeout is the default timeout - DefaultTimeout = 1 -) diff --git a/mind-cluster/component/ascend-common/devmanager/common/types.go b/mind-cluster/component/ascend-common/devmanager/common/types.go deleted file mode 100644 index 870c716..0000000 --- 
a/mind-cluster/component/ascend-common/devmanager/common/types.go +++ /dev/null @@ -1,435 +0,0 @@ -/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package common define common types -package common - -// MemoryInfo memory information struct -type MemoryInfo struct { - MemorySize uint64 `json:"memory_size"` - MemoryAvailable uint64 `json:"memory_available"` - Frequency uint32 `json:"memory_frequency"` - Utilization uint32 `json:"memory_utilization"` -} - -// HbmInfo high bandwidth memory info -type HbmInfo struct { - MemorySize uint64 `json:"memory_size"` // total size,MB - Frequency uint32 `json:"hbm_frequency"` // frequency MHz - Usage uint64 `json:"memory_usage"` // memory usage,MB - Temp int32 `json:"hbm_temperature"` // temperature - BandWidthUtilRate uint32 `json:"hbm_bandwidth_util"` // bandwidth utilization -} - -// HbmAggregateInfo more comprehensive high bandwidth memory information with ecc information -type HbmAggregateInfo struct { - *HbmInfo - ECCInfo *ECCInfo `json:"hbm_ecc_info"` // ECC information -} - -// ChipInfo chip info -type ChipInfo struct { - Type string `json:"chip_type"` - Name string `json:"chip_name"` - Version string `json:"chip_version"` - NpuName string `json:"npu_name"` - AICoreCnt int `json:"aicore_cnt"` -} - -// ChipBaseInfo all id of chip -type ChipBaseInfo struct { - PhysicID int32 - LogicID int32 - CardID int32 - DeviceID int32 -} - -// CgoCreateVDevOut create 
virtual device output info -type CgoCreateVDevOut struct { - VDevID uint32 - PcieBus uint32 - PcieDevice uint32 - PcieFunc uint32 - VfgID uint32 - Reserved []uint8 -} - -// CgoCreateVDevRes create virtual device input info -type CgoCreateVDevRes struct { - VDevID uint32 - VfgID uint32 - TemplateName string - Reserved []uint8 -} - -// CgoBaseResource base resource info -type CgoBaseResource struct { - Token uint64 - TokenMax uint64 - TaskTimeout uint64 - VfgID uint32 - VipMode uint8 - Reserved []uint8 -} - -// CgoComputingResource compute resource info -type CgoComputingResource struct { - // accelator resource - Aic float32 - Aiv float32 - Dsa uint16 - Rtsq uint16 - Acsq uint16 - Cdqm uint16 - CCore uint16 - Ffts uint16 - Sdma uint16 - PcieDma uint16 - - // memory resource, MB as unit - MemorySize uint64 - - // id resource - EventID uint32 - NotifyID uint32 - StreamID uint32 - ModelID uint32 - - // cpu resource - TopicScheduleAicpu uint16 - HostCtrlCPU uint16 - HostAicpu uint16 - DeviceAicpu uint16 - TopicCtrlCPUSlot uint16 - - Reserved []uint8 -} - -// CgoMediaResource media resource info -type CgoMediaResource struct { - Jpegd float32 - Jpege float32 - Vpc float32 - Vdec float32 - Pngd float32 - Venc float32 - Reserved []uint8 -} - -// CgoVDevQueryInfo virtual resource special info -type CgoVDevQueryInfo struct { - Name string - Status uint32 - IsContainerUsed uint32 - Vfid uint32 - VfgID uint32 - ContainerID uint64 - Base CgoBaseResource - Computing CgoComputingResource - Media CgoMediaResource -} - -// CgoVDevQueryStru virtual resource info -type CgoVDevQueryStru struct { - VDevID uint32 - QueryInfo CgoVDevQueryInfo -} - -// CgoSocFreeResource soc free resource info -type CgoSocFreeResource struct { - VfgNum uint32 - VfgBitmap uint32 - Base CgoBaseResource - Computing CgoComputingResource - Media CgoMediaResource -} - -// CgoSocTotalResource soc total resource info -type CgoSocTotalResource struct { - VDevNum uint32 - VDevID []uint32 - VfgNum uint32 - VfgBitmap 
uint32 - Base CgoBaseResource - Computing CgoComputingResource - Media CgoMediaResource -} - -// CgoSuperPodInfo super pod info -type CgoSuperPodInfo struct { - SdId uint32 - ScaleType uint32 - SuperPodId uint32 - ServerId uint32 - Reserve []uint32 -} - -// VirtualDevInfo virtual device infos -type VirtualDevInfo struct { - TotalResource CgoSocTotalResource - FreeResource CgoSocFreeResource - VDevInfo []CgoVDevQueryStru - VDevActivityInfo []VDevActivityInfo -} - -// DevFaultInfo device's fault info -type DevFaultInfo struct { - EventID int64 - LogicID int32 - ModuleType int8 // ModuleType prototype is dcmi node_type - ModuleID int8 // ModuleID prototype is dcmi node_id - SubModuleType int8 // SubModuleType prototype is dcmi sub_node_type - SubModuleID int8 // SubModuleID prototype is dcmi sub_node_id - Severity int8 - Assertion int8 - AlarmRaisedTime int64 -} - -// DevProcessInfo device process info -type DevProcessInfo struct { - DevProcArray []DevProcInfo - ProcNum int32 -} - -// DevProcInfo process info in device side -type DevProcInfo struct { - Pid int32 - // the total amount of memory occupied by the device side OS and allocated by the business, unit is MB - MemUsage float64 -} - -// BoardInfo board info of device -type BoardInfo struct { - BoardId uint32 - PcbId uint32 - BomId uint32 - SlotId uint32 -} - -// VDevActivityInfo vNPU activity info for 310P -type VDevActivityInfo struct { - VDevID uint32 - VDevAiCoreRate uint32 - VDevTotalMem uint64 - VDevUsedMem uint64 - VDevAiCore float64 - IsVirtualDev bool -} - -// PCIEBwStat contains pcie bandwidth -type PCIEBwStat struct { - PcieRxPBw PcieStatValue - PcieRxNPBw PcieStatValue - PcieRxCPLBw PcieStatValue - PcieTxPBw PcieStatValue - PcieTxNPBw PcieStatValue - PcieTxCPLBw PcieStatValue -} - -// PcieStatValue pcie stat three value, like [min_bw,max_bw,avg_bw] -type PcieStatValue struct { - PcieMinBw int32 - PcieMaxBw int32 - PcieAvgBw int32 -} - -// DeviceNetworkHealth dcmi_get_device_network_health api return 
value -type DeviceNetworkHealth struct { - HealthCode uint32 - RetCode int32 -} - -// ECCInfo dcmi_get_device_ecc_info api return value -type ECCInfo struct { - EnableFlag int32 - SingleBitErrorCnt int64 - DoubleBitErrorCnt int64 - TotalSingleBitErrorCnt int64 - TotalDoubleBitErrorCnt int64 - SingleBitIsolatedPagesCnt int64 - DoubleBitIsolatedPagesCnt int64 -} - -// NpuNetInfo network info of npu -type NpuNetInfo struct { - // The optical info - OpticalInfo *OpticalInfo - // The transfer rate of network port - LinkSpeedInfo *LinkSpeedInfo - // Historical link statistics of network ports - LinkStatInfo *LinkStatInfo - // Statistics about packets - StatInfo *StatInfo - // Network port real-time bandwidth - BandwidthInfo *BandwidthInfo - // LinkStatusInfo refers to the link state - LinkStatusInfo *LinkStatusInfo -} - -// BandwidthInfo contains network port real-time bandwidth -type BandwidthInfo struct { - // TxValue transform speed - TxValue float64 `json:"tx_value"` - // RxValue receive speed - RxValue float64 `json:"rx_value"` -} - -// HccsStatisticInfo contains hccs statistic info -type HccsStatisticInfo struct { - TxCnt []uint64 - RxCnt []uint64 - CrcErrCnt []uint64 - retryCnt []uint64 - reservedFieldCnt []uint64 -} - -// HccsBandwidthInfo contains hccs bandwidth info -type HccsBandwidthInfo struct { - ProfilingTime uint32 - TotalTxbw float64 - TotalRxbw float64 - TxBandwidth []float64 - RxBandwidth []float64 -} - -// SioCrcErrStatisticInfo contains sio crc error statistic info -type SioCrcErrStatisticInfo struct { - TxErrCnt int64 - RxErrCnt int64 - Reserved []uint32 -} - -// StatInfo the statistics about packets -type StatInfo struct { - // Total number of pause frames received by the MAC - MacRxPauseNum float64 - // Total number of pause frames sent by MAC - MacTxPauseNum float64 - // Total number of PFC frames received by MAC - MacRxPfcPktNum float64 - // Total number of PFC frames sent by MAC - MacTxPfcPktNum float64 - // Total number of bad packets received 
by MAC - MacRxBadPktNum float64 - // Total number of bad packets sent by MAC - MacTxBadPktNum float64 - // The total number of packets received by the RoCE network card - RoceRxAllPktNum float64 - // The total number of packets sent by the RoCE network card - RoceTxAllPktNum float64 - // The number of bad packets received by the RoCE network card - RoceRxErrPktNum float64 - // The number of bad packets sent by the RoCE network card - RoceTxErrPktNum float64 - // The number of CNP type packets received by the RoCE network card - RoceRxCnpPktNum float64 - // The number of CNP type packets sent by the RoCE network card - RoceTxCnpPktNum float64 - // Number of RoCE network card retry messages - RoceNewPktRtyNum float64 - // Total number of bytes of bad packets sent by MAC - MacTxBadOctNum float64 - // Total number of bytes of bad packets received by MAC - MacRxBadOctNum float64 - // The number of unexpected ACK messages received by the RoCE network card - RoceUnexpectedAckNum float64 - // The number of out-of-order packets received by the RoCE network card - RoceOutOfOrderNum float64 - // The number of packets with domain segment verification errors received by the RoCE network card - RoceVerificationErrNum float64 - // The number of messages generated by abnormal QP connection status received by the RoCE network card - RoceQpStatusErrNum float64 - // The number of ecn - RoceEcnDBNum float64 - // The number of err info - MacRXFcsErrPktNum float64 -} - -// LinkStatInfo refers to the historical link statistics, including the times of link-up -type LinkStatInfo struct { - // The times of link-up - LinkUPNum float64 -} - -// LinkStatusInfo refers to the link state -type LinkStatusInfo struct { - // The state of link - LinkState string -} - -// LinkSpeedInfo the transfer rate of network port -type LinkSpeedInfo struct { - // The rate of network port - Speed float64 -} - -// OpticalInfo indicates the optical module information -type OpticalInfo struct { - // Optical module 
status, indicating whether it is in place (present) - OpticalState float64 - // Power sent by No.0 optical module - OpticalTxPower0 float64 - // Power sent by No.1 optical module - OpticalTxPower1 float64 - // Power sent by No.2 optical module - OpticalTxPower2 float64 - // Power sent by No.3 optical module - OpticalTxPower3 float64 - // Reception power of No.0 optical module - OpticalRxPower0 float64 - // Reception power of No.1 optical module - OpticalRxPower1 float64 - // Reception power of No.2 optical module - OpticalRxPower2 float64 - // Reception power of No.3 optical module - OpticalRxPower3 float64 - // Optical module voltage - OpticalVcc float64 - // Optical module temperature - OpticalTemp float64 -} - -// HccspingMeshOperate refers to the operation of hccsping mesh -type HccspingMeshOperate struct { - DstAddr string - PktSize int - PktSendNum int - PktInterval int - Timeout int - TaskInterval int - TaskId int -} - -// HccspingMeshInfo refers to the result of hccsping mesh -type HccspingMeshInfo struct { - DstAddr []string - SucPktNum []uint - FailPktNum []uint - MaxTime []int - MinTime []int - AvgTime []int - TP95Time []int - ReplyStatNum []int - PingTotalNum []int - DestNum int -} - -// ElabelInfo elabel information structure -type ElabelInfo struct { - ProductName string - Model string - Manufacturer string - ManufacturerDate string - SerialNumber string -} diff --git a/mind-cluster/component/ascend-common/devmanager/common/utils.go b/mind-cluster/component/ascend-common/devmanager/common/utils.go deleted file mode 100644 index 87e14df..0000000 --- a/mind-cluster/component/ascend-common/devmanager/common/utils.go +++ /dev/null @@ -1,305 +0,0 @@ -/* Copyright(C) 2021. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package common this for util method -package common - -import ( - "fmt" - "math" - "regexp" - "strings" - - "ascend-common/api" - "ascend-common/common-utils/hwlog" -) - -var ( - reg910A = regexp.MustCompile(api.Ascend910APattern) - reg910B = regexp.MustCompile(api.Ascend910BPattern) - reg310P = regexp.MustCompile(api.Ascend310PPattern) -) - -// IsGreaterThanOrEqualInt32 check num range -func IsGreaterThanOrEqualInt32(num int64) bool { - if num >= int64(math.MaxInt32) { - return true - } - - return false -} - -// IsValidUtilizationRate valid utilization rate is 0-100 -func IsValidUtilizationRate(num uint32) bool { - if num > uint32(Percent) || num < 0 { - return false - } - - return true -} - -// IsValidChipInfo valid chip info is or not empty -func IsValidChipInfo(chip *ChipInfo) bool { - return chip.Name != "" || chip.Type != "" || chip.Version != "" -} - -// IsValidBoardInfo check whether the board info is valid -func IsValidBoardInfo(board *BoardInfo) bool { - return board.BoardId != InvalidID || board.PcbId != InvalidID || - board.BomId != InvalidID || board.SlotId != InvalidID -} - -// IsValidMainBoardInfo check whether the mainBoardId is valid -func IsValidMainBoardInfo(mainBoardId uint32) bool { - return mainBoardId != InvalidID -} - -// IsValidCardID valid card id -func IsValidCardID(cardID int32) bool { - // for cardID, please watch the maximum value of the driver is changed in the future version - return cardID >= 0 && cardID < HiAIMaxCardID -} - -// IsValidDeviceID valid device id -func IsValidDeviceID(deviceID int32) bool { - return deviceID 
>= 0 && deviceID < HiAIMaxDeviceNum -} - -// IsValidLogicIDOrPhyID valid logic id -func IsValidLogicIDOrPhyID(id int32) bool { - return id >= 0 && id < HiAIMaxCardNum*HiAIMaxDeviceNum -} - -// IsValidCardIDAndDeviceID check two params both needs meet the requirement -func IsValidCardIDAndDeviceID(cardID, deviceID int32) bool { - if !IsValidCardID(cardID) { - return false - } - - return IsValidDeviceID(deviceID) -} - -// IsValidDevNumInCard valid devNum in card -func IsValidDevNumInCard(num int32) bool { - return num > 0 && num <= HiAIMaxDeviceNum -} - -// IsValidVDevID valid vir device id -func IsValidVDevID(vDevID uint32) bool { - return vDevID >= MinVDevID && vDevID < MaxVDevID -} - -// IsValidPortID valid port id -func IsValidPortID(portID int) bool { - return portID == DefaultPingMeshPortID -} - -// IsValidTaskID valid task id -func IsValidTaskID(taskID uint) bool { - return taskID == InternalPingMeshTaskID || taskID == ExternalPingMeshTaskID -} - -// IsValidHccspingMeshOperate valid hccsping mesh operate -func IsValidHccspingMeshOperate(operate HccspingMeshOperate) error { - if len(operate.DstAddr) > MaxHccspingMeshAddr { - return fmt.Errorf("dst addr length %d is invalid, should not be greater than %d", len(operate.DstAddr), - MaxHccspingMeshAddr) - } - if operate.PktSize < MinPktSize || operate.PktSize > MaxPktSize { - return fmt.Errorf("pkt size %d is invalid, should be between %d and %d", operate.PktSize, MinPktSize, MaxPktSize) - } - if operate.PktSendNum < MinPktSendNum || operate.PktSendNum > MaxPktSendNum { - return fmt.Errorf("pkt send num %d is invalid, should be between %d and %d", operate.PktSendNum, - MinPktSendNum, MaxPktSendNum) - } - if operate.PktInterval < MinPktInterval || operate.PktInterval > MaxPktInterval { - return fmt.Errorf("pkt interval %d is invalid, should be between %d and %d", operate.PktInterval, - MinPktInterval, MaxPktInterval) - } - if operate.TaskInterval < MinTaskInterval || operate.TaskInterval > MaxTaskInterval { - return 
fmt.Errorf("task interval %d is invalid, should be between %d and %d", operate.TaskInterval, - MinTaskInterval, MaxTaskInterval) - } - if !IsValidTaskID(uint(operate.TaskId)) { - return fmt.Errorf("task id %d is invalid", operate.TaskId) - } - return nil -} - -// GetDeviceTypeByChipName get device type by chipName -func GetDeviceTypeByChipName(chipName string) string { - if reg310P.MatchString(chipName) { - return api.Ascend310P - } - if strings.Contains(chipName, api.Ascend310BNo) { - return api.Ascend310B - } - if strings.Contains(chipName, api.Ascend310No) { - return api.Ascend310 - } - if reg910B.MatchString(chipName) { - return api.Ascend910B - } - if reg910A.MatchString(chipName) { - return api.Ascend910A - } - return "" -} - -func get910TemplateNameList() map[string]struct{} { - return map[string]struct{}{"vir16": {}, "vir08": {}, "vir04": {}, "vir02": {}, "vir01": {}} -} - -func get910BTemplateNameList() map[string]struct{} { - return map[string]struct{}{ - "vir03_1c_8g": {}, "vir05_1c_8g": {}, "vir05_1c_16g": {}, - "vir06_1c_16g": {}, "vir10_3c_16g": {}, "vir10_3c_16g_nm": {}, - "vir10_3c_32g": {}, "vir10_4c_16g_m": {}, "vir12_3c_32g": {}} -} - -func get310PTemplateNameList() map[string]struct{} { - return map[string]struct{}{"vir04": {}, "vir02": {}, "vir01": {}, "vir04_3c": {}, "vir02_1c": {}, - "vir04_4c_dvpp": {}, "vir04_3c_ndvpp": {}} -} - -// IsValidTemplateName check template name meet the requirement -func IsValidTemplateName(devType, templateName string) bool { - isTemplateNameValid := false - switch devType { - case api.Ascend310P: - _, isTemplateNameValid = get310PTemplateNameList()[templateName] - case api.Ascend910A: - _, isTemplateNameValid = get910TemplateNameList()[templateName] - case api.Ascend910B: - _, isTemplateNameValid = get910BTemplateNameList()[templateName] - default: - } - return isTemplateNameValid -} - -// RemoveDuplicate remove duplicate device -func RemoveDuplicate(list *[]string) []string { - listValueMap := 
make(map[string]string, len(*list)) - var rmDupValueList []string - for _, value := range *list { - listValueMap[value] = value - } - for _, value := range listValueMap { - rmDupValueList = append(rmDupValueList, value) - } - return rmDupValueList -} - -// GetNpuName get npu name eg: name-type-version -func GetNpuName(chipInfo *ChipInfo) string { - if chipInfo == nil { - return "" - } - if len(chipInfo.Name) == 0 && len(chipInfo.Type) == 0 && len(chipInfo.Version) == 0 { - return "" - } - return fmt.Sprintf("%s-%s-%s", chipInfo.Name, chipInfo.Type, chipInfo.Version) -} - -// SetExternalParams transmit npu-exporter's startup parameters -func SetExternalParams(profilingTime int) { - ProfilingTime = profilingTime -} - -// SetHccsBWProfilingTime set hccs bw profiling time -func SetHccsBWProfilingTime(hccsbwProfilingTime int) { - HccsBWProfilingTime = hccsbwProfilingTime -} - -// DeepCopyChipInfo copy chip info deeply -func DeepCopyChipInfo(chipInfo *ChipInfo) *ChipInfo { - if chipInfo == nil { - return nil - } - - return &ChipInfo{ - Type: chipInfo.Type, - Name: chipInfo.Name, - Version: chipInfo.Version, - } -} - -// DeepCopyVDevActivityInfo copy VDevActivityInfo deeply -func DeepCopyVDevActivityInfo(vDevActivityInfo *VDevActivityInfo) *VDevActivityInfo { - if vDevActivityInfo == nil { - return nil - } - - return &VDevActivityInfo{ - VDevID: vDevActivityInfo.VDevID, - VDevAiCoreRate: vDevActivityInfo.VDevAiCoreRate, - VDevTotalMem: vDevActivityInfo.VDevTotalMem, - VDevUsedMem: vDevActivityInfo.VDevUsedMem, - VDevAiCore: vDevActivityInfo.VDevAiCore, - IsVirtualDev: vDevActivityInfo.IsVirtualDev, - } -} - -// DeepCopySlice Deep copy slice -func deepCopySlice(slice interface{}) interface{} { - - switch v := slice.(type) { - case []int: - newSlice := make([]int, len(v)) - copy(newSlice, v) - return newSlice - case []uint32: - newSlice := make([]uint32, len(v)) - copy(newSlice, v) - return newSlice - case []float64: - newSlice := make([]float64, len(v)) - copy(newSlice, v) 
- return newSlice - default: - hwlog.RunLog.Warn("Unsupported slice type") - return slice - } -} - -// GetDevType get device type by chip name,boardId -func GetDevType(chipName string, boardId uint32) string { - var devType string - if Is910A3Chip(boardId) { - devType = api.Ascend910A3 - } else { - devType = GetDeviceTypeByChipName(chipName) - } - return devType -} - -// Is910A3Chip current chip is 910A3 or not,include A900A3 and A9000A3 -func Is910A3Chip(boardId uint32) bool { - return a3BoardIds.Has(int32(boardId)) -} - -// IsA900A3SuperPod current product is A900A3 super pod or not -func IsA900A3SuperPod(mainBoardId uint32) bool { - return a900A3SuperPodMainBoardIds.Has(int32(mainBoardId)) -} - -// IsA9000A3SuperPod current product is A9000A3 super pod or not -func IsA9000A3SuperPod(mainBoardId uint32) bool { - return a9000A3SuperPodMainBoardIds.Has(int32(mainBoardId)) -} - -// Is800IA3Chip current chip is 800IA3 or not -func Is800IA3Chip(mainBoardId uint32) bool { - return mainBoardId == A800IA3MainBoardId -} diff --git a/mind-cluster/component/ascend-common/devmanager/common/utils_test.go b/mind-cluster/component/ascend-common/devmanager/common/utils_test.go deleted file mode 100644 index 548a1c0..0000000 --- a/mind-cluster/component/ascend-common/devmanager/common/utils_test.go +++ /dev/null @@ -1,163 +0,0 @@ -/* Copyright(C) 2021. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -package common - -import ( - "fmt" - "strings" - "testing" - - "github.com/smartystreets/goconvey/convey" -) - -// TestDeepCopyHccsBandwidthInfo TestDeepCopySlice -func TestDeepCopyHccsBandwidthInfo(t *testing.T) { - - convey.Convey("should copy a new []int", t, func() { - slice := []int{1, 2} - newSlice := deepCopySlice(slice) - convey.So(&newSlice, convey.ShouldNotEqual, &slice) - }) - - convey.Convey("should copy a new []int32", t, func() { - slice := []uint32{1, 2} - - newSlice := deepCopySlice(slice) - convey.So(&newSlice, convey.ShouldNotEqual, &slice) - }) - - convey.Convey("should copy a new []float64", t, func() { - slice := []float64{1, 2} - newSlice := deepCopySlice(slice) - convey.So(&newSlice, convey.ShouldNotEqual, &slice) - }) -} - -func TestIsValidPortID(t *testing.T) { - convey.Convey("Given a port ID", t, func() { - convey.Convey("01-When the port ID is invalid, should return false", func() { - portID1 := 1 - convey.So(IsValidPortID(portID1), convey.ShouldBeFalse) - }) - - convey.Convey("02-When the port ID is the default, should return true", func() { - portID3 := DefaultPingMeshPortID - convey.So(IsValidPortID(portID3), convey.ShouldBeTrue) - }) - }) -} - -func TestIsValidTaskID(t *testing.T) { - convey.Convey("Given a task ID", t, func() { - convey.Convey("01-When the task ID is valid, should return true", func() { - taskID1 := InternalPingMeshTaskID - convey.So(IsValidTaskID(taskID1), convey.ShouldBeTrue) - - taskID2 := ExternalPingMeshTaskID - convey.So(IsValidTaskID(taskID2), convey.ShouldBeTrue) - }) - - convey.Convey("02-When the task ID is invalid, should return false", func() { - const taskID3 = 3 - convey.So(IsValidTaskID(taskID3), convey.ShouldBeFalse) - }) - }) -} - -func defaultHccspingMeshOperate() HccspingMeshOperate { - return HccspingMeshOperate{ - DstAddr: "1111", - PktSize: MinPktSize, - PktSendNum: MinPktSendNum, - PktInterval: MinPktInterval, - TaskInterval: MinTaskInterval, - TaskId: int(InternalPingMeshTaskID), - } 
-} - -func check(op HccspingMeshOperate, expectedErr error) { - err := IsValidHccspingMeshOperate(op) - convey.So(err, convey.ShouldResemble, expectedErr) -} - -func expectedError(pattern string, current, min, max int) error { - return fmt.Errorf(pattern, current, min, max) -} - -func TestIsValidHccspingMeshOperate01(t *testing.T) { - convey.Convey("Given a pingmesh operate", t, func() { - op := defaultHccspingMeshOperate() - convey.Convey("01-When operation valid, should return nil", func() { - check(op, nil) - }) - var expectedErr error - convey.Convey("01-When the dst addr is invalid, should return error", func() { - op.DstAddr = strings.Repeat("a", MaxHccspingMeshAddr+1) - expectedErr = fmt.Errorf("dst addr length %d is invalid, should not be greater than %d", len(op.DstAddr), - MaxHccspingMeshAddr) - check(op, expectedErr) - }) - op.DstAddr = "1111" - convey.Convey("02-When the pkt size is invalid, should return error", func() { - pattern := "pkt size %d is invalid, should be between %d and %d" - op.PktSize = MinPktSize - 1 - check(op, expectedError(pattern, op.PktSize, MinPktSize, MaxPktSize)) - op.PktSize = MaxPktSize + 1 - check(op, expectedError(pattern, op.PktSize, MinPktSize, MaxPktSize)) - }) - op.PktSize = MinPktSize - convey.Convey("03-When the pkt send num is invalid, should return error", func() { - pattern := "pkt send num %d is invalid, should be between %d and %d" - op.PktSendNum = MinPktSendNum - 1 - check(op, expectedError(pattern, op.PktSendNum, MinPktSendNum, MaxPktSendNum)) - op.PktSendNum = MaxPktSendNum + 1 - check(op, expectedError(pattern, op.PktSendNum, MinPktSendNum, MaxPktSendNum)) - }) - op.TaskInterval = MinTaskInterval - convey.Convey("06-When the task id is invalid, should return error", func() { - op.TaskId = int(ExternalPingMeshTaskID) + 1 - expectedErr = fmt.Errorf("task id %d is invalid", op.TaskId) - check(op, expectedErr) - }) - }) -} - -func TestIsValidHccspingMeshOperate02(t *testing.T) { - convey.Convey("Given a pingmesh 
operate", t, func() { - op := defaultHccspingMeshOperate() - convey.Convey("04-When the pkt interval is invalid, should return error", func() { - pattern := "pkt interval %d is invalid, should be between %d and %d" - op.PktInterval = MinPktInterval - 1 - check(op, expectedError(pattern, op.PktInterval, MinPktInterval, MaxPktInterval)) - op.PktInterval = MaxPktInterval + 1 - check(op, expectedError(pattern, op.PktInterval, MinPktInterval, MaxPktInterval)) - }) - op.PktInterval = MinPktInterval - convey.Convey("05-When the task interval is invalid, should return error", func() { - pattern := "task interval %d is invalid, should be between %d and %d" - op.TaskInterval = MinTaskInterval - 1 - check(op, expectedError(pattern, op.TaskInterval, MinTaskInterval, MaxTaskInterval)) - op.TaskInterval = MaxTaskInterval + 1 - check(op, expectedError(pattern, op.TaskInterval, MinTaskInterval, MaxTaskInterval)) - }) - op.TaskInterval = MinTaskInterval - var expectedErr error - convey.Convey("06-When the task id is invalid, should return error", func() { - op.TaskId = int(ExternalPingMeshTaskID) + 1 - expectedErr = fmt.Errorf("task id %d is invalid", op.TaskId) - check(op, expectedErr) - }) - }) -} diff --git a/mind-cluster/component/ascend-common/devmanager/dcmi/constants.go b/mind-cluster/component/ascend-common/devmanager/dcmi/constants.go deleted file mode 100644 index bd68af3..0000000 --- a/mind-cluster/component/ascend-common/devmanager/dcmi/constants.go +++ /dev/null @@ -1,78 +0,0 @@ -/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package dcmi this for constants -package dcmi - -// MainCmd main command enum -type MainCmd uint32 - -// VDevMngSubCmd virtual device manager sub command -type VDevMngSubCmd uint32 - -// DieType present chip die type -type DieType int32 - -const ( - // dcmiMaxVdevNum is max number of vdevice, value is from driver specification - dcmiMaxVdevNum = 32 - // dcmiMaxReserveNum is max number of reserve, value is from driver specification - dcmiMaxReserveNum = 8 - // dcmiVDevResNameLen length of vnpu resource name - dcmiVDevResNameLen = 16 - // dcmiHccsMaxPcsNum max pcs number for hccs - dcmiHccsMaxPcsNum = 16 - - maxChipNameLen = 32 - productTypeLen = 64 - dcmiVersionLen = 32 - - // MainCmdChipInf main cmd chip inf - MainCmdChipInf MainCmd = 12 - // MainCmdHccs main cmd of hccs - MainCmdHccs MainCmd = 16 - // MainCmdVDevMng virtual device manager - MainCmdVDevMng MainCmd = 52 - // MainCmdSio SIO status between die - MainCmdSio MainCmd = 56 - - // VmngSubCmdGetVDevResource get virtual device resource info - VmngSubCmdGetVDevResource VDevMngSubCmd = 0 - // VmngSubCmdGetTotalResource get total resource info - VmngSubCmdGetTotalResource VDevMngSubCmd = 1 - // VmngSubCmdGetFreeResource get free resource info - VmngSubCmdGetFreeResource VDevMngSubCmd = 2 - // VmngSubCmdGetVDevActivity get vir device activity info - VmngSubCmdGetVDevActivity VDevMngSubCmd = 5 - // CinfSubCmdGetSPodInfo get super pod info - CinfSubCmdGetSPodInfo VDevMngSubCmd = 1 - // SioSubCmdCrcErrStatistics get SIO err statistics info - SioSubCmdCrcErrStatistics VDevMngSubCmd = 0 - // 
HccsSubCmdGetStatisticInfo get statistic info - HccsSubCmdGetStatisticInfo VDevMngSubCmd = 3 - // HccsSubCmdGetStatisticInfoU64 get statistic info in u64 - HccsSubCmdGetStatisticInfoU64 VDevMngSubCmd = 5 - - // NDIE NDie ID, only Ascend910 has - NDIE DieType = 0 - // VDIE VDie ID, it can be the uuid of chip - VDIE DieType = 1 - // DieIDCount die id array max length - DieIDCount = 5 - - // ipAddrTypeV6 ip address type of IPv6 - ipAddrTypeV6 = 1 - - agentdrvProfDataNum = 3 -) diff --git a/mind-cluster/component/ascend-common/devmanager/dcmi/dcmi.go b/mind-cluster/component/ascend-common/devmanager/dcmi/dcmi.go deleted file mode 100644 index 834397c..0000000 --- a/mind-cluster/component/ascend-common/devmanager/dcmi/dcmi.go +++ /dev/null @@ -1,2213 +0,0 @@ -/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package dcmi this for dcmi manager -package dcmi - -// #cgo LDFLAGS: -ldl -/* - #include - #include - #include - #include - - #include "dcmi_interface_api.h" - - static void *dcmiHandle; - #define SO_NOT_FOUND -99999 - #define FUNCTION_NOT_FOUND -99998 - #define SUCCESS 0 - #define ERROR_UNKNOWN -99997 - #define CALL_FUNC(name,...) 
if(name##_func==NULL){return FUNCTION_NOT_FOUND;}return name##_func(__VA_ARGS__); - - // dcmi - static int (*dcmi_init_func)(); - static int dcmi_init_new(){ - CALL_FUNC(dcmi_init) - } - - static int (*dcmi_get_card_num_list_func)(int *card_num,int *card_list,int list_length); - static int dcmi_get_card_num_list_new(int *card_num,int *card_list,int list_length){ - CALL_FUNC(dcmi_get_card_num_list,card_num,card_list,list_length) - } - - static int (*dcmi_get_device_num_in_card_func)(int card_id,int *device_num); - static int dcmi_get_device_num_in_card_new(int card_id,int *device_num){ - CALL_FUNC(dcmi_get_device_num_in_card,card_id,device_num) - } - - static int (*dcmi_get_device_logic_id_func)(int *device_logic_id,int card_id,int device_id); - static int dcmi_get_device_logic_id_new(int *device_logic_id,int card_id,int device_id){ - CALL_FUNC(dcmi_get_device_logic_id,device_logic_id,card_id,device_id) - } - - static int (*dcmi_create_vdevice_func)(int card_id, int device_id, struct dcmi_create_vdev_res_stru *vdev, - struct dcmi_create_vdev_out *out); - int dcmi_create_vdevice(int card_id, int device_id, struct dcmi_create_vdev_res_stru *vdev, - struct dcmi_create_vdev_out *out){ - CALL_FUNC(dcmi_create_vdevice,card_id,device_id,vdev,out) - } - - static int (*dcmi_get_device_info_func)(int card_id, int device_id, enum dcmi_main_cmd main_cmd, - unsigned int sub_cmd,void *buf, unsigned int *size); - int dcmi_get_device_info(int card_id, int device_id, enum dcmi_main_cmd main_cmd, unsigned int sub_cmd, void *buf, - unsigned int *size){ - CALL_FUNC(dcmi_get_device_info,card_id,device_id,main_cmd,sub_cmd,buf,size) - } - - static int (*dcmi_get_hccs_link_bandwidth_info_func)(int card_id, int device_id, -struct dcmi_hccs_bandwidth_info *hccs_bandwidth_info); - int dcmi_get_hccs_link_bandwidth_info(int card_id, int device_id, -struct dcmi_hccs_bandwidth_info *hccs_bandwidth_info){ - CALL_FUNC(dcmi_get_hccs_link_bandwidth_info,card_id,device_id,hccs_bandwidth_info) - } - - 
static int (*dcmi_set_destroy_vdevice_func)(int card_id,int device_id, unsigned int VDevid); - int dcmi_set_destroy_vdevice(int card_id,int device_id, unsigned int VDevid){ - CALL_FUNC(dcmi_set_destroy_vdevice,card_id,device_id,VDevid) - } - - static int (*dcmi_get_device_type_func)(int card_id,int device_id,enum dcmi_unit_type *device_type); - int dcmi_get_device_type(int card_id,int device_id,enum dcmi_unit_type *device_type){ - CALL_FUNC(dcmi_get_device_type,card_id,device_id,device_type) - } - - static int (*dcmi_get_device_health_func)(int card_id, int device_id, unsigned int *health); - int dcmi_get_device_health(int card_id, int device_id, unsigned int *health){ - CALL_FUNC(dcmi_get_device_health,card_id,device_id,health) - } - - static int (*dcmi_get_device_utilization_rate_func)(int card_id, int device_id, int input_type, - unsigned int *utilization_rate); - int dcmi_get_device_utilization_rate(int card_id, int device_id, int input_type, unsigned int *utilization_rate){ - CALL_FUNC(dcmi_get_device_utilization_rate,card_id,device_id,input_type,utilization_rate) - } - - static int (*dcmi_get_device_temperature_func)(int card_id, int device_id, int *temperature); - int dcmi_get_device_temperature(int card_id, int device_id, int *temperature){ - CALL_FUNC(dcmi_get_device_temperature,card_id,device_id,temperature) - } - - static int (*dcmi_get_device_voltage_func)(int card_id, int device_id, unsigned int *voltage); - int dcmi_get_device_voltage(int card_id, int device_id, unsigned int *voltage){ - CALL_FUNC(dcmi_get_device_voltage,card_id,device_id,voltage) - } - - static int (*dcmi_get_device_power_info_func)(int card_id, int device_id, int *power); - int dcmi_get_device_power_info(int card_id, int device_id, int *power){ - CALL_FUNC(dcmi_get_device_power_info,card_id,device_id,power) - } - - static int (*dcmi_get_device_frequency_func)(int card_id, int device_id, enum dcmi_freq_type input_type, - unsigned int *frequency); - int dcmi_get_device_frequency(int 
card_id, int device_id, enum dcmi_freq_type input_type, unsigned int *frequency){ - CALL_FUNC(dcmi_get_device_frequency,card_id,device_id,input_type,frequency) - } - - static int (*dcmi_get_device_memory_info_v3_func)(int card_id, int device_id, - struct dcmi_get_memory_info_stru *memory_info); - int dcmi_get_device_memory_info_v3(int card_id, int device_id, struct dcmi_get_memory_info_stru *memory_info){ - CALL_FUNC(dcmi_get_device_memory_info_v3,card_id,device_id,memory_info) - } - - static int (*dcmi_get_device_hbm_info_func)(int card_id, int device_id, struct dcmi_hbm_info *hbm_info); - int dcmi_get_device_hbm_info(int card_id, int device_id, struct dcmi_hbm_info *hbm_info){ - CALL_FUNC(dcmi_get_device_hbm_info,card_id,device_id,hbm_info) - } - - static int (*dcmi_get_device_errorcode_v2_func)(int card_id, int device_id, int *error_count, - unsigned int *error_code_list, unsigned int list_len); - int dcmi_get_device_errorcode_v2(int card_id, int device_id, int *error_count, - unsigned int *error_code_list, unsigned int list_len){ - CALL_FUNC(dcmi_get_device_errorcode_v2,card_id,device_id,error_count,error_code_list,list_len) - } - - static int (*dcmi_get_device_chip_info_func)(int card_id, int device_id, struct dcmi_chip_info *chip_info); - int dcmi_get_device_chip_info(int card_id, int device_id, struct dcmi_chip_info *chip_info){ - CALL_FUNC(dcmi_get_device_chip_info,card_id,device_id,chip_info) - } - - static int (*dcmi_get_device_chip_info_v2_func)(int card_id, int device_id, struct dcmi_chip_info_v2 *chip_info); - int dcmi_get_device_chip_info_v2(int card_id, int device_id, struct dcmi_chip_info_v2 *chip_info){ - CALL_FUNC(dcmi_get_device_chip_info_v2,card_id,device_id,chip_info) - } - - static int (*dcmi_get_device_phyid_from_logicid_func)(unsigned int logicid, unsigned int *phyid); - int dcmi_get_device_phyid_from_logicid(unsigned int logicid, unsigned int *phyid){ - CALL_FUNC(dcmi_get_device_phyid_from_logicid,logicid,phyid) - } - - static int 
(*dcmi_get_device_logicid_from_phyid_func)(unsigned int phyid, unsigned int *logicid); - int dcmi_get_device_logicid_from_phyid(unsigned int phyid, unsigned int *logicid){ - CALL_FUNC(dcmi_get_device_logicid_from_phyid,phyid,logicid) - } - - static int (*dcmi_get_device_ip_func)(int card_id, int device_id, enum dcmi_port_type input_type, int port_id, - struct dcmi_ip_addr *ip, struct dcmi_ip_addr *mask); - int dcmi_get_device_ip(int card_id, int device_id, enum dcmi_port_type input_type, int port_id, - struct dcmi_ip_addr *ip, struct dcmi_ip_addr *mask){ - CALL_FUNC(dcmi_get_device_ip,card_id,device_id,input_type,port_id,ip,mask) - } - - static int (*dcmi_get_device_network_health_func)(int card_id, int device_id, - enum dcmi_rdfx_detect_result *result); - int dcmi_get_device_network_health(int card_id, int device_id, enum dcmi_rdfx_detect_result *result){ - CALL_FUNC(dcmi_get_device_network_health,card_id,device_id,result) - } - - static int (*dcmi_get_card_list_func)(int *card_num, int *card_list, int list_len); - int dcmi_get_card_list(int *card_num, int *card_list, int list_len){ - CALL_FUNC(dcmi_get_card_list,card_num,card_list,list_len) - } - - static int (*dcmi_get_device_id_in_card_func)(int card_id, int *device_id_max, int *mcu_id, int *cpu_id); - int dcmi_get_device_id_in_card(int card_id, int *device_id_max, int *mcu_id, int *cpu_id){ - CALL_FUNC(dcmi_get_device_id_in_card,card_id,device_id_max,mcu_id,cpu_id) - } - - static int (*dcmi_get_memory_info_func)(int card_id, int device_id, - struct dcmi_memory_info_stru *device_memory_info); - int dcmi_get_memory_info(int card_id, int device_id, struct dcmi_memory_info_stru *device_memory_info){ - CALL_FUNC(dcmi_get_memory_info,card_id,device_id,device_memory_info) - } - - static int (*dcmi_get_device_errorcode_func)(int card_id, int device_id, int *error_count, unsigned int *error_code, - int *error_width); - int dcmi_get_device_errorcode(int card_id, int device_id, int *error_count, unsigned int *error_code, 
- int *error_width){ - CALL_FUNC(dcmi_get_device_errorcode,card_id,device_id,error_count,error_code,error_width) - } - - static int (*dcmi_get_card_id_device_id_from_logicid_func)(int *card_id, int *device_id, - unsigned int device_logic_id); - int dcmi_get_card_id_device_id_from_logicid(int *card_id, int *device_id, unsigned int device_logic_id){ - CALL_FUNC(dcmi_get_card_id_device_id_from_logicid,card_id,device_id,device_logic_id) - } - - static int (*dcmi_mcu_get_power_info_func)(int card_id, int *power); - static int dcmi_mcu_get_power_info_new(int card_id, int *power){ - CALL_FUNC(dcmi_mcu_get_power_info,card_id,power) - } - - static int (*dcmi_get_product_type_func)(int card_id, int device_id, char *product_type_str, int buf_size); - int dcmi_get_product_type(int card_id, int device_id, char *product_type_str, int buf_size){ - CALL_FUNC(dcmi_get_product_type,card_id,device_id,product_type_str,buf_size) - } - - static int (*dcmi_get_card_elabel_v2_func)(int card_id, struct dcmi_elabel_info *elabel_info); - int dcmi_get_card_elabel_v2(int card_id, struct dcmi_elabel_info *elabel_info){ - CALL_FUNC(dcmi_get_card_elabel_v2,card_id,elabel_info) - } - - static int (*dcmi_set_device_reset_func)(int card_id, int device_id, enum dcmi_reset_channel channel_type); - int dcmi_set_device_reset(int card_id, int device_id, enum dcmi_reset_channel channel_type){ - CALL_FUNC(dcmi_set_device_reset,card_id,device_id,channel_type) - } - - static int (*dcmi_get_device_outband_channel_state_func)(int card_id, int device_id, int* channel_state); - int dcmi_get_device_outband_channel_state(int card_id, int device_id, int* channel_state){ - CALL_FUNC(dcmi_get_device_outband_channel_state,card_id,device_id,channel_state) - } - - static int (*dcmi_pre_reset_soc_func)(int card_id, int device_id); - int dcmi_pre_reset_soc(int card_id, int device_id){ - CALL_FUNC(dcmi_pre_reset_soc,card_id,device_id) - } - - static int (*dcmi_rescan_soc_func)(int card_id, int device_id); - int 
dcmi_rescan_soc(int card_id, int device_id){ - CALL_FUNC(dcmi_rescan_soc,card_id,device_id) - } - - static int (*dcmi_get_netdev_brother_device_func)(int card_id, int device_id, int* brother_card_id); - int dcmi_get_netdev_brother_device(int card_id, int device_id, int* brother_card_id){ - CALL_FUNC(dcmi_get_netdev_brother_device,card_id,device_id,brother_card_id) - } - - static int (*dcmi_get_device_boot_status_func)(int card_id, int device_id, enum dcmi_boot_status *boot_status); - int dcmi_get_device_boot_status(int card_id, int device_id, enum dcmi_boot_status *boot_status){ - CALL_FUNC(dcmi_get_device_boot_status,card_id,device_id,boot_status) - } - - void goEventFaultCallBack(struct dcmi_dms_fault_event); - static void event_handler(struct dcmi_event *fault_event) { - goEventFaultCallBack(fault_event->event_t.dms_event); - } - - static int (*dcmi_subscribe_fault_event_func)(int card_id, int device_id, struct dcmi_event_filter filter, - void (*f_name)(struct dcmi_event *fault_event)); - int dcmi_subscribe_fault_event(int card_id, int device_id, struct dcmi_event_filter filter){ - CALL_FUNC(dcmi_subscribe_fault_event,card_id,device_id,filter,event_handler) - } - - static int (*dcmi_get_npu_work_mode_func)(int card_id, unsigned char *work_mode); - int dcmi_get_npu_work_mode(int card_id, unsigned char *work_mode){ - CALL_FUNC(dcmi_get_npu_work_mode,card_id,work_mode) - } - - static int (*dcmi_get_device_die_v2_func)(int card_id, int device_id, enum dcmi_die_type input_type, - struct dcmi_die_id *die_id); - int dcmi_get_device_die_v2(int card_id, int device_id, enum dcmi_die_type input_type, struct dcmi_die_id *die_id){ - CALL_FUNC(dcmi_get_device_die_v2,card_id,device_id,input_type,die_id) - } - - static int (*dcmi_get_device_resource_info_func)(int card_id, int device_id, struct dcmi_proc_mem_info *proc_info, - int *proc_num); - int dcmi_get_device_resource_info(int card_id, int device_id, struct dcmi_proc_mem_info *proc_info, int *proc_num){ - 
CALL_FUNC(dcmi_get_device_resource_info,card_id,device_id,proc_info,proc_num) - } - - static int (*dcmi_get_device_pcie_info_v2_func)(int card_id, int device_id, struct dcmi_pcie_info_all *pcie_info); - int dcmi_get_device_pcie_info_v2(int card_id, int device_id, struct dcmi_pcie_info_all *pcie_info){ - CALL_FUNC(dcmi_get_device_pcie_info_v2,card_id,device_id,pcie_info) - } - - static int (*dcmi_get_device_board_info_func)(int card_id, int device_id, struct dcmi_board_info *board_info); - int dcmi_get_device_board_info(int card_id, int device_id, struct dcmi_board_info *board_info){ - CALL_FUNC(dcmi_get_device_board_info,card_id,device_id,board_info) - } - - static int (*dcmi_get_pcie_link_bandwidth_info_func)(int card_id, int device_id, - struct dcmi_pcie_link_bandwidth_info *pcie_link_bandwidth_info); - int dcmi_get_pcie_link_bandwidth_info(int card_id, int device_id, - struct dcmi_pcie_link_bandwidth_info *pcie_link_bandwidth_info){ - CALL_FUNC(dcmi_get_pcie_link_bandwidth_info,card_id,device_id,pcie_link_bandwidth_info) - } - - static int (*dcmi_get_dcmi_version_func)(char *dcmi_ver, int buf_size); - int dcmi_get_dcmi_version(char *dcmi_ver, int buf_size){ - CALL_FUNC(dcmi_get_dcmi_version,dcmi_ver,buf_size) - } - - static int (*dcmi_get_device_ecc_info_func)(int card_id, int device_id, enum dcmi_device_type input_type, - struct dcmi_ecc_info *device_ecc_info); - int dcmi_get_device_ecc_info(int card_id, int device_id, enum dcmi_device_type input_type, - struct dcmi_ecc_info *device_ecc_info){ - CALL_FUNC(dcmi_get_device_ecc_info,card_id,device_id,input_type,device_ecc_info) - } - - static int (*dcmi_get_mainboard_id_func)(int card_id, int device_id, unsigned int *mainboard_id); - int dcmi_get_mainboard_id(int card_id, int device_id, unsigned int *mainboard_id){ - CALL_FUNC(dcmi_get_mainboard_id,card_id,device_id,mainboard_id) - } - - static int (*dcmi_start_hccsping_mesh_func)(int card_id, int device_id, int port_id, -struct dcmi_hccsping_mesh_operate 
*hccsping_mesh); - int dcmi_start_hccsping_mesh(int card_id, int device_id, int port_id, -struct dcmi_hccsping_mesh_operate *hccsping_mesh){ - CALL_FUNC(dcmi_start_hccsping_mesh,card_id,device_id,port_id,hccsping_mesh) -} - static int (*dcmi_stop_hccsping_mesh_func)(int card_id, int device_id, int port_id, unsigned int task_id); - int dcmi_stop_hccsping_mesh(int card_id, int device_id, int port_id, unsigned int task_id){ - CALL_FUNC(dcmi_stop_hccsping_mesh,card_id,device_id,port_id,task_id) - } - - static int (*dcmi_get_hccsping_mesh_info_func)(int card_id, int device_id, int port_id, unsigned int task_id, -struct dcmi_hccsping_mesh_info *hccsping_mesh_info); - int dcmi_get_hccsping_mesh_info(int card_id, int device_id, int port_id, unsigned int task_id, -struct dcmi_hccsping_mesh_info *hccsping_mesh_info){ - CALL_FUNC(dcmi_get_hccsping_mesh_info,card_id,device_id,port_id,task_id,hccsping_mesh_info) -} - - static int (*dcmi_get_hccsping_mesh_state_func)(int card_id, int device_id, int port_id, unsigned int task_id, -unsigned int *state); - int dcmi_get_hccsping_mesh_state(int card_id, int device_id, int port_id, unsigned int task_id, -unsigned int *state){ - CALL_FUNC(dcmi_get_hccsping_mesh_state,card_id,device_id,port_id,task_id,state) -} - - static int (*dcmi_get_spod_node_status_func)(int card_id, int device_id, unsigned int sdid, unsigned int *status); - int dcmi_get_spod_node_status(int card_id, int device_id, unsigned int sdid, unsigned int *status){ - CALL_FUNC(dcmi_get_spod_node_status,card_id,device_id,sdid,status) - } - - static int (*dcmi_set_spod_node_status_func)(int card_id, int device_id, unsigned int sdid, unsigned int status); - int dcmi_set_spod_node_status(int card_id, int device_id, unsigned int sdid, unsigned int status){ - CALL_FUNC(dcmi_set_spod_node_status,card_id,device_id,sdid,status) - } - - // load .so files and functions - static int dcmiInit_dl(const char* dcmiLibPath){ - if (dcmiLibPath == NULL) { - fprintf (stderr,"lib path is 
null\n"); - return SO_NOT_FOUND; - } - dcmiHandle = dlopen(dcmiLibPath,RTLD_LAZY | RTLD_GLOBAL); - if (dcmiHandle == NULL){ - fprintf (stderr,"%s\n",dlerror()); - return SO_NOT_FOUND; - } - - dcmi_init_func = dlsym(dcmiHandle,"dcmi_init"); - - dcmi_get_card_num_list_func = dlsym(dcmiHandle,"dcmi_get_card_num_list"); - - dcmi_get_device_num_in_card_func = dlsym(dcmiHandle,"dcmi_get_device_num_in_card"); - - dcmi_get_device_logic_id_func = dlsym(dcmiHandle,"dcmi_get_device_logic_id"); - - dcmi_create_vdevice_func = dlsym(dcmiHandle,"dcmi_create_vdevice"); - - dcmi_get_device_info_func = dlsym(dcmiHandle,"dcmi_get_device_info"); - - dcmi_set_destroy_vdevice_func = dlsym(dcmiHandle,"dcmi_set_destroy_vdevice"); - - dcmi_get_device_type_func = dlsym(dcmiHandle,"dcmi_get_device_type"); - - dcmi_get_device_health_func = dlsym(dcmiHandle,"dcmi_get_device_health"); - - dcmi_get_device_utilization_rate_func = dlsym(dcmiHandle,"dcmi_get_device_utilization_rate"); - - dcmi_get_device_temperature_func = dlsym(dcmiHandle,"dcmi_get_device_temperature"); - - dcmi_get_device_voltage_func = dlsym(dcmiHandle,"dcmi_get_device_voltage"); - - dcmi_get_device_power_info_func = dlsym(dcmiHandle,"dcmi_get_device_power_info"); - - dcmi_get_device_frequency_func = dlsym(dcmiHandle,"dcmi_get_device_frequency"); - - dcmi_get_device_memory_info_v3_func = dlsym(dcmiHandle,"dcmi_get_device_memory_info_v3"); - - dcmi_get_device_hbm_info_func = dlsym(dcmiHandle,"dcmi_get_device_hbm_info"); - - dcmi_get_device_errorcode_v2_func = dlsym(dcmiHandle,"dcmi_get_device_errorcode_v2"); - - dcmi_get_device_chip_info_func = dlsym(dcmiHandle,"dcmi_get_device_chip_info"); - - dcmi_get_device_chip_info_v2_func = dlsym(dcmiHandle,"dcmi_get_device_chip_info_v2"); - - dcmi_get_device_phyid_from_logicid_func = dlsym(dcmiHandle,"dcmi_get_device_phyid_from_logicid"); - - dcmi_get_device_logicid_from_phyid_func = dlsym(dcmiHandle,"dcmi_get_device_logicid_from_phyid"); - - dcmi_get_device_ip_func = 
dlsym(dcmiHandle,"dcmi_get_device_ip"); - - dcmi_get_device_network_health_func = dlsym(dcmiHandle,"dcmi_get_device_network_health"); - - dcmi_get_card_list_func = dlsym(dcmiHandle,"dcmi_get_card_list"); - - dcmi_get_device_id_in_card_func = dlsym(dcmiHandle,"dcmi_get_device_id_in_card"); - - dcmi_get_memory_info_func = dlsym(dcmiHandle,"dcmi_get_memory_info"); - - dcmi_get_device_errorcode_func = dlsym(dcmiHandle,"dcmi_get_device_errorcode"); - - dcmi_get_card_id_device_id_from_logicid_func = dlsym(dcmiHandle,"dcmi_get_card_id_device_id_from_logicid"); - - dcmi_mcu_get_power_info_func = dlsym(dcmiHandle,"dcmi_mcu_get_power_info"); - - dcmi_get_product_type_func = dlsym(dcmiHandle,"dcmi_get_product_type"); - - dcmi_get_card_elabel_v2_func = dlsym(dcmiHandle,"dcmi_get_card_elabel_v2"); - - dcmi_set_device_reset_func = dlsym(dcmiHandle,"dcmi_set_device_reset"); - - dcmi_get_device_outband_channel_state_func = dlsym(dcmiHandle,"dcmi_get_device_outband_channel_state"); - - dcmi_pre_reset_soc_func = dlsym(dcmiHandle,"dcmi_pre_reset_soc"); - - dcmi_rescan_soc_func = dlsym(dcmiHandle,"dcmi_rescan_soc"); - - dcmi_get_netdev_brother_device_func = dlsym(dcmiHandle,"dcmi_get_netdev_brother_device"); - - dcmi_get_device_boot_status_func = dlsym(dcmiHandle,"dcmi_get_device_boot_status"); - - dcmi_subscribe_fault_event_func = dlsym(dcmiHandle,"dcmi_subscribe_fault_event"); - - dcmi_get_npu_work_mode_func = dlsym(dcmiHandle, "dcmi_get_npu_work_mode"); - - dcmi_get_device_die_v2_func = dlsym(dcmiHandle, "dcmi_get_device_die_v2"); - - dcmi_get_device_resource_info_func = dlsym(dcmiHandle, "dcmi_get_device_resource_info"); - - dcmi_get_device_pcie_info_v2_func = dlsym(dcmiHandle, "dcmi_get_device_pcie_info_v2"); - - dcmi_get_device_board_info_func = dlsym(dcmiHandle, "dcmi_get_device_board_info"); - - dcmi_get_pcie_link_bandwidth_info_func = dlsym(dcmiHandle, "dcmi_get_pcie_link_bandwidth_info"); - - dcmi_get_dcmi_version_func = dlsym(dcmiHandle,"dcmi_get_dcmi_version"); - - 
dcmi_get_device_ecc_info_func = dlsym(dcmiHandle,"dcmi_get_device_ecc_info"); - - dcmi_get_mainboard_id_func = dlsym(dcmiHandle, "dcmi_get_mainboard_id"); - - dcmi_get_hccs_link_bandwidth_info_func = dlsym(dcmiHandle,"dcmi_get_hccs_link_bandwidth_info"); - - dcmi_start_hccsping_mesh_func = dlsym(dcmiHandle,"dcmi_start_hccsping_mesh"); - - dcmi_stop_hccsping_mesh_func = dlsym(dcmiHandle,"dcmi_stop_hccsping_mesh"); - - dcmi_get_hccsping_mesh_info_func = dlsym(dcmiHandle,"dcmi_get_hccsping_mesh_info"); - - dcmi_get_hccsping_mesh_state_func = dlsym(dcmiHandle,"dcmi_get_hccsping_mesh_state"); - - dcmi_get_spod_node_status_func = dlsym(dcmiHandle,"dcmi_get_spod_node_status"); - - dcmi_set_spod_node_status_func = dlsym(dcmiHandle,"dcmi_set_spod_node_status"); - - return SUCCESS; - } - - static int dcmiShutDown(void){ - if (dcmiHandle == NULL) { - return SUCCESS; - } - return (dlclose(dcmiHandle) ? ERROR_UNKNOWN : SUCCESS); - } -*/ -import "C" -import ( - "errors" - "fmt" - "math" - "net" - "strconv" - "strings" - "time" - "unsafe" - - "ascend-common/common-utils/hwlog" - "ascend-common/common-utils/utils" - "ascend-common/devmanager/common" -) - -// CDcmiMemoryInfoV3 the c struct of memoryInfo for v3 -type CDcmiMemoryInfoV3 = C.struct_dcmi_get_memory_info_stru - -// CDcmiMemoryInfoV1 the c struct of memoryInfo for v1 -type CDcmiMemoryInfoV1 = C.struct_dcmi_memory_info_stru - -// DcDriverInterface interface for dcmi -type DcDriverInterface interface { - DcInit() error - DcShutDown() error - - DcGetDcmiVersion() (string, error) - DcGetDeviceCount() (int32, error) - DcGetLogicIDList() (int32, []int32, error) - DcGetDeviceHealth(int32, int32) (int32, error) - DcGetDeviceNetWorkHealth(int32, int32) (uint32, error) - DcGetDeviceUtilizationRate(int32, int32, common.DeviceType) (int32, error) - DcGetDeviceTemperature(int32, int32) (int32, error) - DcGetDeviceVoltage(int32, int32) (float32, error) - DcGetDevicePowerInfo(int32, int32) (float32, error) - DcGetDeviceFrequency(int32, 
int32, common.DeviceType) (uint32, error) - DcGetMemoryInfo(int32, int32) (*common.MemoryInfo, error) - DcGetHbmInfo(int32, int32) (*common.HbmInfo, error) - DcGetDeviceErrorCode(int32, int32) (int32, int64, error) - DcGetChipInfo(int32, int32) (*common.ChipInfo, error) - DcGetPhysicIDFromLogicID(int32) (int32, error) - DcGetLogicIDFromPhysicID(int32) (int32, error) - DcGetDeviceLogicID(int32, int32) (int32, error) - DcGetDeviceIPAddress(int32, int32, int32) (string, error) - DcGetMcuPowerInfo(int32) (float32, error) - DcGetDieID(int32, int32, DieType) (string, error) - DcGetPCIeBusInfo(int32, int32) (string, error) - - DcGetCardList() (int32, []int32, error) - DcGetDeviceNumInCard(int32) (int32, error) - DcSetDestroyVirtualDevice(int32, int32, uint32) error - DcCreateVirtualDevice(int32, int32, common.CgoCreateVDevRes) (common.CgoCreateVDevOut, error) - DcGetDeviceVDevResource(int32, int32, uint32) (common.CgoVDevQueryStru, error) - DcGetDeviceTotalResource(int32, int32) (common.CgoSocTotalResource, error) - DcGetDeviceFreeResource(int32, int32) (common.CgoSocFreeResource, error) - DcGetVDevActivityInfo(int32, int32, uint32) (common.VDevActivityInfo, error) - DcVGetDeviceInfo(int32, int32) (common.VirtualDevInfo, error) - DcGetCardIDDeviceID(int32) (int32, int32, error) - DcCreateVDevice(int32, common.CgoCreateVDevRes) (common.CgoCreateVDevOut, error) - DcGetVDeviceInfo(int32) (common.VirtualDevInfo, error) - DcDestroyVDevice(int32, uint32) error - DcGetProductType(int32, int32) (string, error) - DcGetNpuWorkMode(int32) (int, error) - DcSetDeviceReset(int32, int32) error - DcGetBrotherCardID(int32, int32) (int32, error) - DcPreResetSoc(int32, int32) error - DcGetOutBandChannelState(int32, int32) error - DcSetDeviceResetOutBand(int32, int32) error - DcRescanSoc(int32, int32) error - DcGetDeviceBootStatus(int32) (int, error) - DcGetSuperPodInfo(int32, int32) (common.CgoSuperPodInfo, error) - - DcGetDeviceAllErrorCode(int32, int32) (int32, []int64, error) - 
DcSubscribeDeviceFaultEvent(int32, int32) error - DcSetFaultEventCallFunc(func(common.DevFaultInfo)) - DcGetDevProcessInfo(int32, int32) (*common.DevProcessInfo, error) - DcGetDeviceBoardInfo(int32, int32) (common.BoardInfo, error) - DcGetPCIEBandwidth(int32, int32, int) (common.PCIEBwStat, error) - DcGetDeviceEccInfo(int32, int32, common.DcmiDeviceType) (*common.ECCInfo, error) - DcGetSioInfo(int32, int32) (common.SioCrcErrStatisticInfo, error) - DcGetHccsStatisticInfo(int32, int32) (common.HccsStatisticInfo, error) - DcGetHccsStatisticInfoU64(int32, int32) (common.HccsStatisticInfo, error) - DcGetDeviceMainBoardInfo(int32, int32) (uint32, error) - DcGetHccsBandwidthInfo(int32, int32, int) (common.HccsBandwidthInfo, error) - - DcStartHccsPingMesh(int32, int32, int, common.HccspingMeshOperate) error - DcStopHccsPingMesh(int32, int32, int, uint) error - DcGetHccsPingMeshInfo(int32, int32, int, uint) (*common.HccspingMeshInfo, error) - DcGetHccsPingMeshState(int32, int32, int, uint) (int, error) - DcGetSuperPodStatus(int32, int32, uint32) (int, error) - DcSetSuperPodStatus(int32, int32, uint32, uint32) error - DcGetCardElabelV2(int32) (common.ElabelInfo, error) -} - -const ( - dcmiLibraryName = "libdcmi.so" - templateNameLen = 32 - ipAddrListLen = 1024 - hcclpingMeshMaxNum = 48 -) - -var faultEventCallFunc func(common.DevFaultInfo) = nil -var ( - dcmiErrMap = map[int32]string{ - -8001: "The input parameter is incorrect", - -8002: "Permission error", - -8003: "The memory interface operation failed", - -8004: "The security function failed to be executed", - -8005: "Internal errors", - -8006: "Response timed out", - -8007: "Invalid deviceID", - -8008: "The device does not exist", - -8009: "ioctl returns failed", - -8010: "The message failed to be sent", - -8011: "Message reception failed", - -8012: "Not ready yet,please try again", - -8013: "This API is not supported in containers", - -8014: "The file operation failed", - -8015: "Reset failed", - -8016: "Reset cancels", 
- -8017: "Upgrading", - -8020: "Device resources are occupied", - -8022: "Partition consistency check,inconsistent partitions were found", - -8023: "The configuration information does not exist", - -8255: "Device ID/function is not supported", - -99997: "dcmi shutdown failed", - -99998: "The called function is missing,please upgrade the driver", - -99999: "dcmi libdcmi.so failed to load", - } -) - -// DcManager for manager dcmi interface -type DcManager struct{} - -// DcStartHccsPingMesh start hccs ping mesh -func (d *DcManager) DcStartHccsPingMesh(cardID int32, deviceID int32, portID int, - operate common.HccspingMeshOperate) error { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - if !common.IsValidPortID(portID) { - return fmt.Errorf("portID(%d) is invalid", portID) - } - if err := common.IsValidHccspingMeshOperate(operate); err != nil { - return fmt.Errorf("operate(%v) is invalid, err: %v", operate, err) - } - dtsAddrLsit := [ipAddrListLen]C.char{0} - for i := 0; i < len(operate.DstAddr) && i < len(dtsAddrLsit); i++ { - dtsAddrLsit[i] = C.char(operate.DstAddr[i]) - } - - op := C.struct_dcmi_hccsping_mesh_operate{ - dst_addr_list: dtsAddrLsit, - pkt_size: C.int(operate.PktSize), - pkt_send_num: C.int(operate.PktSendNum), - pkt_interval: C.int(operate.PktInterval), - timeout: C.int(operate.Timeout), - task_interval: C.int(operate.TaskInterval), - task_id: C.int(operate.TaskId), - } - if retCode := C.dcmi_start_hccsping_mesh(C.int(cardID), C.int(deviceID), C.int(portID), - &op); retCode != common.Success { - return fmt.Errorf("dcmi start hccs ping mesh failed cardID(%d) deviceID(%d) error code: %d", - cardID, deviceID, int32(retCode)) - } - - return nil -} - -// DcStopHccsPingMesh stop hccs ping mesh -func (d *DcManager) DcStopHccsPingMesh(cardID int32, deviceID int32, portID int, taskID uint) error { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return 
fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - if !common.IsValidPortID(portID) { - return fmt.Errorf("portID(%d) is invalid", portID) - } - if !common.IsValidTaskID(taskID) { - return fmt.Errorf("taskID(%d) is invalid", taskID) - } - if retCode := C.dcmi_stop_hccsping_mesh(C.int(cardID), C.int(deviceID), C.int(portID), - C.uint(taskID)); retCode != common.Success { - return fmt.Errorf("dcmi stop hccs ping mesh failed cardID(%d) deviceID(%d) error code: %d", - cardID, deviceID, int32(retCode)) - } - return nil -} - -// DcGetHccsPingMeshInfo get hccs ping mesh info -func (d *DcManager) DcGetHccsPingMeshInfo(cardID int32, deviceID int32, portID int, - taskID uint) (*common.HccspingMeshInfo, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return nil, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - if !common.IsValidPortID(portID) { - return nil, fmt.Errorf("portID(%d) is invalid", portID) - } - if !common.IsValidTaskID(taskID) { - return nil, fmt.Errorf("taskID(%d) is invalid", taskID) - } - var info C.struct_dcmi_hccsping_mesh_info - if retCode := C.dcmi_get_hccsping_mesh_info(C.int(cardID), C.int(deviceID), C.int(portID), C.uint(taskID), - &info); retCode != common.Success { - return nil, fmt.Errorf("dcmi get hccs ping mesh info failed cardID(%d) deviceID(%d) error code: %d", - cardID, deviceID, int32(retCode)) - } - return convertHccspingMeshInfo(&info) -} - -func convertHccspingMeshInfo(cInfo *C.struct_dcmi_hccsping_mesh_info) (*common.HccspingMeshInfo, error) { - if int(cInfo.dest_num) > hcclpingMeshMaxNum { - return nil, fmt.Errorf("dest_num(%d) is invalid, should not be greater than %d", int(cInfo.dest_num), - hcclpingMeshMaxNum) - } - info := &common.HccspingMeshInfo{} - for i := 0; i < int(cInfo.dest_num); i++ { - info.DstAddr = append(info.DstAddr, convertToString(cInfo.dst_addr[i])) - info.SucPktNum = append(info.SucPktNum, uint(cInfo.suc_pkt_num[i])) - info.FailPktNum = 
append(info.FailPktNum, uint(cInfo.fail_pkt_num[i])) - info.MaxTime = append(info.MaxTime, int(cInfo.max_time[i])) - info.MinTime = append(info.MinTime, int(cInfo.min_time[i])) - info.AvgTime = append(info.AvgTime, int(cInfo.avg_time[i])) - info.TP95Time = append(info.TP95Time, int(cInfo.tp95_time[i])) - info.ReplyStatNum = append(info.ReplyStatNum, int(cInfo.reply_stat_num[i])) - info.PingTotalNum = append(info.PingTotalNum, int(cInfo.ping_total_num[i])) - } - info.DestNum = int(cInfo.dest_num) - return info, nil -} - -// DcGetHccsPingMeshState get hccs ping mesh state -func (d *DcManager) DcGetHccsPingMeshState(cardID int32, deviceID int32, portID int, taskID uint) (int, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.RetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - if !common.IsValidPortID(portID) { - return common.RetError, fmt.Errorf("portID(%d) is invalid", portID) - } - if !common.IsValidTaskID(taskID) { - return common.RetError, fmt.Errorf("taskID(%d) is invalid", taskID) - } - var state C.uint - if retCode := C.dcmi_get_hccsping_mesh_state(C.int(cardID), C.int(deviceID), C.int(portID), C.uint(taskID), - &state); retCode != common.Success { - return common.RetError, fmt.Errorf("dcmi get hccs ping mesh state failed cardID(%d) deviceID(%d) error "+ - "code: %d", cardID, deviceID, int32(retCode)) - } - return int(state), nil -} - -// DcInit load symbol and initialize dcmi -func (d *DcManager) DcInit() error { - dcmiLibPath, err := utils.GetDriverLibPath(dcmiLibraryName) - if err != nil { - return err - } - cDcmiTemplateName := C.CString(dcmiLibPath) - defer C.free(unsafe.Pointer(cDcmiTemplateName)) - if retCode := C.dcmiInit_dl(cDcmiTemplateName); retCode != C.SUCCESS { - return fmt.Errorf("dcmi lib load failed, error code: %d", int32(retCode)) - } - if retCode := C.dcmi_init_new(); retCode != C.SUCCESS { - return fmt.Errorf("dcmi init failed, error code: %d", int32(retCode)) - } - return 
nil -} - -// DcShutDown clean the dynamically loaded resource -func (d *DcManager) DcShutDown() error { - if retCode := C.dcmiShutDown(); retCode != C.SUCCESS { - return fmt.Errorf("dcmi shut down failed, error code: %d", int32(retCode)) - } - - return nil -} - -// DcGetCardList get card list -func (d *DcManager) DcGetCardList() (int32, []int32, error) { - var ids [common.HiAIMaxCardNum]C.int - var cNum C.int - if retCode := C.dcmi_get_card_list(&cNum, &ids[0], common.HiAIMaxCardNum); int32(retCode) != common. - Success { - return common.RetError, nil, fmt.Errorf("get card list failed, error code: %d", int32(retCode)) - } - // checking card's quantity - if cNum <= 0 || cNum > common.HiAIMaxCardNum { - return common.RetError, nil, fmt.Errorf("get error card quantity: %d", int32(cNum)) - } - var cardNum = int32(cNum) - var i int32 - var cardIDList []int32 - for i = 0; i < cardNum; i++ { - cardID := int32(ids[i]) - if cardID < 0 { - hwlog.RunLog.Errorf("get invalid card ID: %d", cardID) - continue - } - cardIDList = append(cardIDList, cardID) - } - return cardNum, cardIDList, nil -} - -// DcGetDeviceNumInCard get device number in the npu card -func (d *DcManager) DcGetDeviceNumInCard(cardID int32) (int32, error) { - if !common.IsValidCardID(cardID) { - return common.RetError, fmt.Errorf("cardID(%d) is invalid", cardID) - } - var deviceNum C.int - if retCode := C.dcmi_get_device_num_in_card_new(C.int(cardID), &deviceNum); int32(retCode) != common.Success { - return common.RetError, fmt.Errorf("get device count on the card failed, error code: %d", int32(retCode)) - } - if !common.IsValidDevNumInCard(int32(deviceNum)) { - return common.RetError, fmt.Errorf("get error device quantity: %d", int32(deviceNum)) - } - return int32(deviceNum), nil -} - -// DcGetDeviceLogicID get device logicID -func (d *DcManager) DcGetDeviceLogicID(cardID, deviceID int32) (int32, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.RetError, fmt.Errorf("cardID(%d) 
or deviceID(%d) is invalid", cardID, deviceID) - } - var logicID C.int - if retCode := C.dcmi_get_device_logic_id_new(&logicID, C.int(cardID), - C.int(deviceID)); int32(retCode) != common.Success { - return common.RetError, fmt.Errorf("failed to get logicID by cardID(%d) and deviceID(%d), error code: %d", - cardID, deviceID, int32(retCode)) - } - - // check whether logicID is invalid - if !common.IsValidLogicIDOrPhyID(int32(logicID)) { - return common.RetError, fmt.Errorf("get invalid logicID: %d", int32(logicID)) - } - return int32(logicID), nil -} - -// DcSetDestroyVirtualDevice destroy virtual device -func (d *DcManager) DcSetDestroyVirtualDevice(cardID, deviceID int32, vDevID uint32) error { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - if retCode := C.dcmi_set_destroy_vdevice(C.int(cardID), C.int(deviceID), - C.uint(vDevID)); int32(retCode) != common.Success { - return fmt.Errorf("destroy virtual device failed, error code: %d", int32(retCode)) - } - return nil -} - -func convertCreateVDevOut(cCreateVDevOut C.struct_dcmi_create_vdev_out) common.CgoCreateVDevOut { - cgoCreateVDevOut := common.CgoCreateVDevOut{ - VDevID: uint32(cCreateVDevOut.vdev_id), - PcieBus: uint32(cCreateVDevOut.pcie_bus), - PcieDevice: uint32(cCreateVDevOut.pcie_device), - PcieFunc: uint32(cCreateVDevOut.pcie_func), - VfgID: uint32(cCreateVDevOut.vfg_id), - } - return cgoCreateVDevOut -} - -// DcCreateVirtualDevice create virtual device -func (d *DcManager) DcCreateVirtualDevice(cardID, deviceID int32, vDevInfo common.CgoCreateVDevRes) (common. 
- CgoCreateVDevOut, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.CgoCreateVDevOut{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - if len(vDevInfo.TemplateName) > templateNameLen { - return common.CgoCreateVDevOut{}, fmt.Errorf("the length of template name exceeds the upper limit") - } - cTemplateName := [templateNameLen]C.char{0} - for i := 0; i < len(vDevInfo.TemplateName); i++ { - cTemplateName[i] = C.char(vDevInfo.TemplateName[i]) - } - deviceCreateStr := C.struct_dcmi_create_vdev_res_stru{ - vdev_id: C.uint(vDevInfo.VDevID), - vfg_id: C.uint(vDevInfo.VfgID), - template_name: cTemplateName, - } - - var createVDevOut C.struct_dcmi_create_vdev_out - if retCode := C.dcmi_create_vdevice(C.int(cardID), C.int(deviceID), &deviceCreateStr, - &createVDevOut); int32(retCode) != common.Success { - return common.CgoCreateVDevOut{}, fmt.Errorf("create vdevice failed, error is: %d", int32(retCode)) - } - - return convertCreateVDevOut(createVDevOut), nil -} - -func convertToString(cgoArr [dcmiVDevResNameLen]C.char) string { - var charArr []rune - for _, v := range cgoArr { - if v == 0 { - break - } - charArr = append(charArr, rune(v)) - } - return string(charArr) -} - -func convertBaseResource(cBaseResource C.struct_dcmi_base_resource) common.CgoBaseResource { - baseResource := common.CgoBaseResource{ - Token: uint64(cBaseResource.token), - TokenMax: uint64(cBaseResource.token_max), - TaskTimeout: uint64(cBaseResource.task_timeout), - VfgID: uint32(cBaseResource.vfg_id), - VipMode: uint8(cBaseResource.vip_mode), - } - return baseResource -} - -func convertComputingResource(cComputingResource C.struct_dcmi_computing_resource) common.CgoComputingResource { - computingResource := common.CgoComputingResource{ - Aic: float32(cComputingResource.aic), - Aiv: float32(cComputingResource.aiv), - Dsa: uint16(cComputingResource.dsa), - Rtsq: uint16(cComputingResource.rtsq), - Acsq: uint16(cComputingResource.acsq), - 
Cdqm: uint16(cComputingResource.cdqm), - CCore: uint16(cComputingResource.c_core), - Ffts: uint16(cComputingResource.ffts), - Sdma: uint16(cComputingResource.sdma), - PcieDma: uint16(cComputingResource.pcie_dma), - MemorySize: uint64(cComputingResource.memory_size), - EventID: uint32(cComputingResource.event_id), - NotifyID: uint32(cComputingResource.notify_id), - StreamID: uint32(cComputingResource.stream_id), - ModelID: uint32(cComputingResource.model_id), - TopicScheduleAicpu: uint16(cComputingResource.topic_schedule_aicpu), - HostCtrlCPU: uint16(cComputingResource.host_ctrl_cpu), - HostAicpu: uint16(cComputingResource.host_aicpu), - DeviceAicpu: uint16(cComputingResource.device_aicpu), - TopicCtrlCPUSlot: uint16(cComputingResource.topic_ctrl_cpu_slot), - } - return computingResource -} - -func convertMediaResource(cMediaResource C.struct_dcmi_media_resource) common.CgoMediaResource { - mediaResource := common.CgoMediaResource{ - Jpegd: float32(cMediaResource.jpegd), - Jpege: float32(cMediaResource.jpege), - Vpc: float32(cMediaResource.vpc), - Vdec: float32(cMediaResource.vdec), - Pngd: float32(cMediaResource.pngd), - Venc: float32(cMediaResource.venc), - } - return mediaResource -} - -func convertVDevQueryInfo(cVDevQueryInfo C.struct_dcmi_vdev_query_info) common.CgoVDevQueryInfo { - name := convertToString(cVDevQueryInfo.name) - vDevQueryInfo := common.CgoVDevQueryInfo{ - Name: string(name), - Status: uint32(cVDevQueryInfo.status), - IsContainerUsed: uint32(cVDevQueryInfo.is_container_used), - Vfid: uint32(cVDevQueryInfo.vfid), - VfgID: uint32(cVDevQueryInfo.vfg_id), - ContainerID: uint64(cVDevQueryInfo.container_id), - Base: convertBaseResource(cVDevQueryInfo.base), - Computing: convertComputingResource(cVDevQueryInfo.computing), - Media: convertMediaResource(cVDevQueryInfo.media), - } - return vDevQueryInfo -} - -func convertVDevQueryStru(cVDevQueryStru C.struct_dcmi_vdev_query_stru) common.CgoVDevQueryStru { - vDevQueryStru := common.CgoVDevQueryStru{ - 
VDevID: uint32(cVDevQueryStru.vdev_id), - QueryInfo: convertVDevQueryInfo(cVDevQueryStru.query_info), - } - return vDevQueryStru -} - -// DcGetDeviceVDevResource get virtual device resource info -func (d *DcManager) DcGetDeviceVDevResource(cardID, deviceID int32, vDevID uint32) (common.CgoVDevQueryStru, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.CgoVDevQueryStru{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var cMainCmd = C.enum_dcmi_main_cmd(MainCmdVDevMng) - subCmd := VmngSubCmdGetVDevResource - var vDevResource C.struct_dcmi_vdev_query_stru - size := C.uint(unsafe.Sizeof(vDevResource)) - vDevResource.vdev_id = C.uint(vDevID) - if retCode := C.dcmi_get_device_info(C.int(cardID), C.int(deviceID), cMainCmd, C.uint(subCmd), - unsafe.Pointer(&vDevResource), &size); int32(retCode) != common.Success { - return common.CgoVDevQueryStru{}, fmt.Errorf("get device info failed, error is: %d", int32(retCode)) - } - return convertVDevQueryStru(vDevResource), nil -} - -func convertSocTotalResource(cSocTotalResource C.struct_dcmi_soc_total_resource) common.CgoSocTotalResource { - socTotalResource := common.CgoSocTotalResource{ - VDevNum: uint32(cSocTotalResource.vdev_num), - VfgNum: uint32(cSocTotalResource.vfg_num), - VfgBitmap: uint32(cSocTotalResource.vfg_bitmap), - Base: convertBaseResource(cSocTotalResource.base), - Computing: convertComputingResource(cSocTotalResource.computing), - Media: convertMediaResource(cSocTotalResource.media), - } - for i := uint32(0); i < uint32(cSocTotalResource.vdev_num) && i < dcmiMaxVdevNum; i++ { - socTotalResource.VDevID = append(socTotalResource.VDevID, uint32(cSocTotalResource.vdev_id[i])) - } - return socTotalResource -} - -// DcGetDeviceTotalResource get device total resource info -func (d *DcManager) DcGetDeviceTotalResource(cardID, deviceID int32) (common.CgoSocTotalResource, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return 
common.CgoSocTotalResource{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var cMainCmd = C.enum_dcmi_main_cmd(MainCmdVDevMng) - subCmd := VmngSubCmdGetTotalResource - var totalResource C.struct_dcmi_soc_total_resource - size := C.uint(unsafe.Sizeof(totalResource)) - if retCode := C.dcmi_get_device_info(C.int(cardID), C.int(deviceID), cMainCmd, C.uint(subCmd), - unsafe.Pointer(&totalResource), &size); int32(retCode) != common.Success { - return common.CgoSocTotalResource{}, fmt.Errorf("get device info failed, error is: %d", int32(retCode)) - } - if uint32(totalResource.vdev_num) > dcmiMaxVdevNum { - return common.CgoSocTotalResource{}, fmt.Errorf("get error virtual quantity: %d", - uint32(totalResource.vdev_num)) - } - - return convertSocTotalResource(totalResource), nil -} - -func convertSuperPodInfo(cSuperPodInfo C.struct_dcmi_spod_info) common.CgoSuperPodInfo { - superPodInfo := common.CgoSuperPodInfo{ - SdId: uint32(cSuperPodInfo.sdid), - ScaleType: uint32(cSuperPodInfo.scale_type), - SuperPodId: uint32(cSuperPodInfo.super_pod_id), - ServerId: uint32(cSuperPodInfo.server_id), - } - - for i := uint32(0); i < dcmiMaxReserveNum; i++ { - superPodInfo.Reserve = append(superPodInfo.Reserve, uint32(cSuperPodInfo.reserve[i])) - } - - return superPodInfo -} - -// DcGetSuperPodInfo get device total resource info -func (d *DcManager) DcGetSuperPodInfo(cardID, deviceID int32) (common.CgoSuperPodInfo, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.CgoSuperPodInfo{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - - var unitType C.enum_dcmi_unit_type - if retCode := C.dcmi_get_device_type(C.int(cardID), C.int(deviceID), &unitType); int32(retCode) != common.Success { - return common.CgoSuperPodInfo{}, fmt.Errorf("get device type failed, error is: %d", int32(retCode)) - } - if int32(unitType) != common.NpuType { - return common.CgoSuperPodInfo{}, fmt.Errorf("not support unit type: 
%d", int32(unitType)) - } - - var cMainCmd = C.enum_dcmi_main_cmd(MainCmdChipInf) - subCmd := CinfSubCmdGetSPodInfo - var sPodInfo C.struct_dcmi_spod_info - size := C.uint(unsafe.Sizeof(sPodInfo)) - if retCode := C.dcmi_get_device_info(C.int(cardID), C.int(deviceID), cMainCmd, C.uint(subCmd), - unsafe.Pointer(&sPodInfo), &size); int32(retCode) != common.Success { - return common.CgoSuperPodInfo{}, fmt.Errorf("get super pod info failed, error is: %d", int32(retCode)) - } - - return convertSuperPodInfo(sPodInfo), nil -} - -func convertSocFreeResource(cSocFreeResource C.struct_dcmi_soc_free_resource) common.CgoSocFreeResource { - socFreeResource := common.CgoSocFreeResource{ - VfgNum: uint32(cSocFreeResource.vfg_num), - VfgBitmap: uint32(cSocFreeResource.vfg_bitmap), - Base: convertBaseResource(cSocFreeResource.base), - Computing: convertComputingResource(cSocFreeResource.computing), - Media: convertMediaResource(cSocFreeResource.media), - } - return socFreeResource -} - -// DcGetDeviceFreeResource get device free resource info -func (d *DcManager) DcGetDeviceFreeResource(cardID, deviceID int32) (common.CgoSocFreeResource, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.CgoSocFreeResource{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var cMainCmd = C.enum_dcmi_main_cmd(MainCmdVDevMng) - subCmd := VmngSubCmdGetFreeResource - var freeResource C.struct_dcmi_soc_free_resource - size := C.uint(unsafe.Sizeof(freeResource)) - if retCode := C.dcmi_get_device_info(C.int(cardID), C.int(deviceID), cMainCmd, C.uint(subCmd), - unsafe.Pointer(&freeResource), &size); int32(retCode) != common.Success { - return common.CgoSocFreeResource{}, fmt.Errorf("get device info failed, error is: %d", int32(retCode)) - } - return convertSocFreeResource(freeResource), nil -} - -// DcGetVDevActivityInfo get vir device activity info by virtual device id -func (d *DcManager) DcGetVDevActivityInfo(cardID, deviceID int32, vDevID 
uint32) (common.VDevActivityInfo, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.VDevActivityInfo{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - if !common.IsValidVDevID(vDevID) { - return common.VDevActivityInfo{}, fmt.Errorf("vDevID(%d) invalid", vDevID) - } - var cMainCmd = C.enum_dcmi_main_cmd(MainCmdVDevMng) - subCmd := VmngSubCmdGetVDevActivity - var vDevActivityInfo C.struct_dcmi_vdev_query_stru - size := C.uint(unsafe.Sizeof(vDevActivityInfo)) - vDevActivityInfo.vdev_id = C.uint(vDevID) - if retCode := C.dcmi_get_device_info(C.int(cardID), C.int(deviceID), cMainCmd, C.uint(subCmd), - unsafe.Pointer(&vDevActivityInfo), &size); int32(retCode) != common.Success { - return common.VDevActivityInfo{}, fmt.Errorf("retCode: %d", int32(retCode)) - } - totalMemSize := uint64(vDevActivityInfo.query_info.computing.vdev_memory_total) - usedMemSize := totalMemSize - uint64(vDevActivityInfo.query_info.computing.vdev_memory_free) - if usedMemSize < 0 { - return common.VDevActivityInfo{}, errors.New("used memory value abnormal") - } - return common.VDevActivityInfo{ - VDevID: vDevID, - VDevAiCoreRate: uint32(vDevActivityInfo.query_info.computing.vdev_aicore_utilization), - VDevTotalMem: totalMemSize, - VDevUsedMem: usedMemSize, - IsVirtualDev: true, - }, nil -} - -// DcVGetDeviceInfo get vdevice resource info -func (d *DcManager) DcVGetDeviceInfo(cardID, deviceID int32) (common.VirtualDevInfo, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.VirtualDevInfo{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var unitType C.enum_dcmi_unit_type - if retCode := C.dcmi_get_device_type(C.int(cardID), C.int(deviceID), &unitType); int32(retCode) != common.Success { - return common.VirtualDevInfo{}, fmt.Errorf("get device type failed, error is: %d", int32(retCode)) - } - if int32(unitType) != common.NpuType { - return common.VirtualDevInfo{}, 
fmt.Errorf("not support unit type: %d", int32(unitType)) - } - - cgoDcmiSocTotalResource, err := d.DcGetDeviceTotalResource(cardID, deviceID) - if err != nil { - return common.VirtualDevInfo{}, fmt.Errorf("get device total resource failed, error is: %v", err) - } - - cgoDcmiSocFreeResource, err := d.DcGetDeviceFreeResource(cardID, deviceID) - if err != nil { - return common.VirtualDevInfo{}, fmt.Errorf("get device free resource failed, error is: %v", err) - } - dcmiVDevInfo := common.VirtualDevInfo{ - TotalResource: cgoDcmiSocTotalResource, - FreeResource: cgoDcmiSocFreeResource, - } - for _, vDevID := range cgoDcmiSocTotalResource.VDevID { - cgoVDevQueryStru, err := d.DcGetDeviceVDevResource(cardID, deviceID, vDevID) - if err != nil { - return common.VirtualDevInfo{}, fmt.Errorf("get device virtual resource failed, error is: %v", err) - } - dcmiVDevInfo.VDevInfo = append(dcmiVDevInfo.VDevInfo, cgoVDevQueryStru) - vDevActivityInfo, err := d.DcGetVDevActivityInfo(cardID, deviceID, vDevID) - if err != nil { - hwlog.RunLog.Warnf("get cur vDev's activity info failed, err: %s", err) - continue - } - vDevActivityInfo.VDevAiCore = float64(cgoVDevQueryStru.QueryInfo.Computing.Aic) - dcmiVDevInfo.VDevActivityInfo = append(dcmiVDevInfo.VDevActivityInfo, vDevActivityInfo) - } - return dcmiVDevInfo, nil -} - -// DcGetCardIDDeviceID get card id and device id from logic id -func (d *DcManager) DcGetCardIDDeviceID(logicID int32) (int32, int32, error) { - if !common.IsValidLogicIDOrPhyID(logicID) { - return common.RetError, common.RetError, fmt.Errorf("input invalid logicID: %d", logicID) - } - var cardID, deviceID C.int - if retCode := C.dcmi_get_card_id_device_id_from_logicid(&cardID, &deviceID, - C.uint(logicID)); int32(retCode) != common.Success { - return common.RetError, common.RetError, - fmt.Errorf("failed to get card id and device id by logicID(%d), errorcode is: %d", logicID, - int32(retCode)) - } - if !common.IsValidCardIDAndDeviceID(int32(cardID), int32(deviceID)) { - 
return common.RetError, common.RetError, fmt.Errorf("failed to get card id and device id, "+ - "cardID(%d) or deviceID(%d) is invalid", int32(cardID), int32(deviceID)) - } - - return int32(cardID), int32(deviceID), nil -} - -// DcCreateVDevice create virtual device by logic id -func (d *DcManager) DcCreateVDevice(logicID int32, vDevInfo common.CgoCreateVDevRes) (common. - CgoCreateVDevOut, error) { - if !common.IsValidLogicIDOrPhyID(logicID) { - return common.CgoCreateVDevOut{}, fmt.Errorf("input invalid logicID: %d", logicID) - } - cardID, deviceID, err := d.DcGetCardIDDeviceID(logicID) - if err != nil { - return common.CgoCreateVDevOut{}, fmt.Errorf("get card id and device id failed, error is: %v", err) - } - - createVDevOut, err := d.DcCreateVirtualDevice(cardID, deviceID, vDevInfo) - if err != nil { - return common.CgoCreateVDevOut{}, fmt.Errorf("create virtual device failed, error is: %v", err) - } - return createVDevOut, nil -} - -// DcGetVDeviceInfo get virtual device info by logic id -func (d *DcManager) DcGetVDeviceInfo(logicID int32) (common.VirtualDevInfo, error) { - if !common.IsValidLogicIDOrPhyID(logicID) { - return common.VirtualDevInfo{}, fmt.Errorf("input invalid logicID: %d", logicID) - } - cardID, deviceID, err := d.DcGetCardIDDeviceID(logicID) - if err != nil { - return common.VirtualDevInfo{}, fmt.Errorf("get card id and device id failed, error is: %v", err) - } - - dcmiVDevInfo, err := d.DcVGetDeviceInfo(cardID, deviceID) - if err != nil { - return common.VirtualDevInfo{}, fmt.Errorf("get virtual device info failed, error is: %v", err) - } - return dcmiVDevInfo, nil -} - -// DcDestroyVDevice destroy spec virtual device by logic id -func (d *DcManager) DcDestroyVDevice(logicID int32, vDevID uint32) error { - if !common.IsValidLogicIDOrPhyID(logicID) { - return fmt.Errorf("input invalid logicID: %d", logicID) - } - cardID, deviceID, err := d.DcGetCardIDDeviceID(logicID) - if err != nil { - return fmt.Errorf("get card id and device id failed, 
error is: %v", err) - } - - if err = d.DcSetDestroyVirtualDevice(cardID, deviceID, vDevID); err != nil { - return fmt.Errorf("destroy virtual device failed, error is: %v", err) - } - return nil -} - -// DcGetDeviceVoltage the accuracy is 0.01v. -func (d *DcManager) DcGetDeviceVoltage(cardID, deviceID int32) (float32, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.RetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var vol C.uint - if retCode := C.dcmi_get_device_voltage(C.int(cardID), C.int(deviceID), &vol); int32(retCode) != common.Success { - return common.RetError, fmt.Errorf("failed to obtain the voltage based on card_id(%d) and "+ - "device_id(%d), error code: %d", cardID, deviceID, int32(retCode)) - } - // the voltage's value is error if it's greater than or equal to MaxInt32 - if common.IsGreaterThanOrEqualInt32(int64(vol)) { - return common.RetError, fmt.Errorf("voltage value out of range(max is int32), "+ - "card_id(%d) and device_id(%d), voltage: %d", cardID, deviceID, int64(vol)) - } - - return float32(vol) * common.ReduceOnePercent, nil -} - -// DcGetDevicePowerInfo the accuracy is 0.1w, the result like: 8.2 -func (d *DcManager) DcGetDevicePowerInfo(cardID, deviceID int32) (float32, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.RetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var cpower C.int - if retCode := C.dcmi_get_device_power_info(C.int(cardID), C.int(deviceID), - &cpower); int32(retCode) != common.Success { - return common.RetError, fmt.Errorf("failed to obtain the power based on card_id(%d) and device_id(%d)"+ - ", error code: %d", cardID, deviceID, int32(retCode)) - } - parsedPower := float32(cpower) - if parsedPower < 0 { - return common.RetError, fmt.Errorf("get wrong device power, card_id(%d) and device_id(%d), power: %f", - cardID, deviceID, parsedPower) - } - - return parsedPower * 
common.ReduceTenth, nil - -} - -// DcGetDeviceFrequency get device frequency, unit MHz -// Ascend910B with frequency type: 2,6,7,9 -// Ascend910 with frequency type: 2,6,7,9 -// Ascend310 with frequency type: 1,2,6,7,9 -// Ascend310P with frequency type: 1,2,7,9,12 -// more information see common.DeviceType -func (d *DcManager) DcGetDeviceFrequency(cardID, deviceID int32, devType common.DeviceType) (uint32, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.UnRetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var cFrequency C.uint - if retCode := C.dcmi_get_device_frequency(C.int(cardID), C.int(deviceID), C.enum_dcmi_freq_type(devType.Code), - &cFrequency); int32(retCode) != common.Success { - return common.UnRetError, - buildDcmiErr(cardID, deviceID, fmt.Sprintf("frequency (name: %v, code:%d)", devType.Name, devType.Code), retCode) - } - // check whether cFrequency is too big - if common.IsGreaterThanOrEqualInt32(int64(cFrequency)) || int64(cFrequency) < 0 { - return common.UnRetError, fmt.Errorf("frequency value out of range [0, int32),card_id(%d) and device_id(%d), "+ - "frequency (name: %v, code:%d): %d", cardID, deviceID, devType.Name, devType.Code, int64(cFrequency)) - } - return uint32(cFrequency), nil -} - -// DcGetMemoryInfo use v3 interface to query memory info -func (d *DcManager) DcGetMemoryInfo(cardID, deviceID int32) (*common.MemoryInfo, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return nil, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var cmInfoV3 CDcmiMemoryInfoV3 - if retCode := C.dcmi_get_device_memory_info_v3(C.int(cardID), C.int(deviceID), - &cmInfoV3); int32(retCode) != common.Success { - return nil, fmt.Errorf("failed to obtain the memory info by v3 interface based on card_id("+ - "%d) and device_id(%d), error code: %d", cardID, deviceID, int32(retCode)) - } - - if uint64(cmInfoV3.memory_size) < 
uint64(cmInfoV3.memory_available) { - return nil, fmt.Errorf("failed to obtain the memory info by v3 interface based on card_id("+ - "%d) and device_id(%d), total memory is less than available memory", cardID, deviceID) - } - - return &common.MemoryInfo{ - MemorySize: uint64(cmInfoV3.memory_size), - MemoryAvailable: uint64(cmInfoV3.memory_available), - Frequency: uint32(cmInfoV3.freq), - Utilization: uint32(cmInfoV3.utiliza), - }, nil - -} - -// FuncDcmiGetDeviceHbmInfo dcmi_get_device_hbm_info function for outer invoke, only for Ascend910 -func FuncDcmiGetDeviceHbmInfo(cardID, deviceID int32) (*common.HbmInfo, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return nil, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var cHbmInfo C.struct_dcmi_hbm_info - if retCode := C.dcmi_get_device_hbm_info(C.int(cardID), C.int(deviceID), - &cHbmInfo); int32(retCode) != common.Success { - return nil, buildDcmiErr(cardID, deviceID, "high bandwidth memory info", retCode) - } - hbmTemp := int32(cHbmInfo.temp) - if hbmTemp < 0 { - return nil, fmt.Errorf("get wrong device HBM temporary, card_id(%d) and device_id(%d), HBM.temp: %d", - cardID, deviceID, hbmTemp) - } - return &common.HbmInfo{ - MemorySize: uint64(cHbmInfo.memory_size), - Frequency: uint32(cHbmInfo.freq), - Usage: uint64(cHbmInfo.memory_usage), - Temp: hbmTemp, - BandWidthUtilRate: uint32(cHbmInfo.bandwith_util_rate)}, nil -} - -// DcGetHbmInfo get HBM information A310/A310P not support -func (d *DcManager) DcGetHbmInfo(cardID, deviceID int32) (*common.HbmInfo, error) { - return &common.HbmInfo{ - MemorySize: 0, - Frequency: 0, - Usage: 0, - Temp: 0, - BandWidthUtilRate: 0}, nil -} - -// DcGetDeviceErrorCode get the error count and errorcode of the device,only return the first errorcode -func (d *DcManager) DcGetDeviceErrorCode(cardID, deviceID int32) (int32, int64, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.RetError, 
common.RetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, - deviceID) - } - var errCount C.int - var errCodeArray [common.MaxErrorCodeCount]C.uint - if retCode := C.dcmi_get_device_errorcode_v2(C.int(cardID), C.int(deviceID), &errCount, &errCodeArray[0], - common.MaxErrorCodeCount); int32(retCode) != common.Success { - return common.RetError, common.RetError, fmt.Errorf("failed to obtain the device errorcode based on "+ - "card_id(%d) and device_id(%d), error code: %d, error count: %d", cardID, deviceID, int32(retCode), - int32(errCount)) - } - - if int32(errCount) < 0 || int32(errCount) > common.MaxErrorCodeCount { - return common.RetError, common.RetError, fmt.Errorf("get wrong errorcode count, "+ - "card_id(%d) and device_id(%d), errorcode count: %d", cardID, deviceID, int32(errCount)) - } - - return int32(errCount), int64(errCodeArray[0]), nil -} - -// DcGetDeviceCount get device count -func (d *DcManager) DcGetDeviceCount() (int32, error) { - devNum, _, err := d.DcGetLogicIDList() - if err != nil { - return common.RetError, fmt.Errorf("get device count failed, error: %v", err) - } - return devNum, nil -} - -// DcGetLogicIDList get device logic id list -func (d *DcManager) DcGetLogicIDList() (int32, []int32, error) { - logicIDs := make([]int32, 0) - var totalNum int32 - _, cardList, err := d.DcGetCardList() - if err != nil { - return common.RetError, logicIDs, fmt.Errorf("get card list failed, error: %v", err) - } - for _, cardID := range cardList { - devNumInCard, err := d.DcGetDeviceNumInCard(cardID) - if err != nil { - return common.RetError, logicIDs, fmt.Errorf("get device num by cardID: %d failed, error: %v", - cardID, err) - } - totalNum += devNumInCard - if totalNum > common.HiAIMaxDeviceNum*common.HiAIMaxCardNum { - return common.RetError, nil, fmt.Errorf("get device num: %d greater than %d", - totalNum, common.HiAIMaxDeviceNum*common.HiAIMaxCardNum) - } - for devID := int32(0); devID < devNumInCard; devID++ { - logicID, err := 
d.DcGetDeviceLogicID(cardID, devID) - if err != nil { - return common.RetError, nil, fmt.Errorf("get device (cardID: %d, deviceID: %d) logic id "+ - "failed, error: %v", cardID, devID, err) - } - logicIDs = append(logicIDs, logicID) - } - } - return totalNum, logicIDs, nil -} - -// DcGetDeviceHealth get device health -func (d *DcManager) DcGetDeviceHealth(cardID, deviceID int32) (int32, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.RetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var health C.uint - if retCode := C.dcmi_get_device_health(C.int(cardID), C.int(deviceID), - &health); int32(retCode) != common.Success { - return common.RetError, fmt.Errorf("get device (cardID: %d, deviceID: %d) health state failed, ret "+ - "code: %d, health code: %d", cardID, deviceID, int32(retCode), int64(health)) - } - if common.IsGreaterThanOrEqualInt32(int64(health)) { - return common.RetError, fmt.Errorf("get wrong health state , device (cardID: %d, deviceID: %d) "+ - "health: %d", cardID, deviceID, int64(health)) - } - return int32(health), nil -} - -// DcGetDeviceUtilizationRate get device utils rate by id -func (d *DcManager) DcGetDeviceUtilizationRate(cardID, deviceID int32, devType common.DeviceType) (int32, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.RetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var rate C.uint - if retCode := C.dcmi_get_device_utilization_rate(C.int(cardID), C.int(deviceID), C.int(devType.Code), - &rate); int32(retCode) != common.Success { - return common.RetError, - buildDcmiErr(cardID, deviceID, fmt.Sprintf("utilization (name: %v, code:%d)", devType.Name, devType.Code), retCode) - } - if !common.IsValidUtilizationRate(uint32(rate)) { - return common.RetError, fmt.Errorf("get wrong device (cardID: %d, deviceID: %d) "+ - "utilization (name: %v, code:%d): %d", cardID, deviceID, devType.Name, devType.Code, 
uint32(rate)) - } - return int32(rate), nil -} - -// DcGetDeviceTemperature get the device temperature -func (d *DcManager) DcGetDeviceTemperature(cardID, deviceID int32) (int32, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.RetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var temp C.int - if retCode := C.dcmi_get_device_temperature(C.int(cardID), C.int(deviceID), - &temp); int32(retCode) != common.Success { - return common.RetError, fmt.Errorf("get device (cardID: %d, deviceID: %d) temperature failed, error "+ - "code is : %d", cardID, deviceID, int32(retCode)) - } - parsedTemp := int32(temp) - if parsedTemp < int32(common.DefaultTemperatureWhenQueryFailed) { - return common.RetError, fmt.Errorf("get wrong device temperature, devcie (cardID: %d, deviceID: %d), "+ - "temperature: %d", cardID, deviceID, parsedTemp) - } - return parsedTemp, nil -} - -func convertUCharToCharArr(cgoArr [maxChipNameLen]C.uchar) []byte { - var charArr []byte - for _, v := range cgoArr { - if v == 0 { - break - } - charArr = append(charArr, byte(v)) - } - return charArr -} - -// DcGetChipInfo get the chip info by cardID and deviceID -func (d *DcManager) DcGetChipInfo(cardID, deviceID int32) (*common.ChipInfo, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return nil, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var chipInfo C.struct_dcmi_chip_info_v2 - chip := &common.ChipInfo{} - if rCode := C.dcmi_get_device_chip_info_v2(C.int(cardID), C.int(deviceID), &chipInfo); int32(rCode) != common.Success { - hwlog.RunLog.Debugf("get device ChipInfo information failed, cardID(%d), deviceID(%d),"+ - " error code: %d", cardID, deviceID, int32(rCode)) - var oldChipInfo C.struct_dcmi_chip_info - if rCode = C.dcmi_get_device_chip_info(C.int(cardID), C.int(deviceID), &oldChipInfo); int32(rCode) != common.Success { - return nil, fmt.Errorf("get device ChipInfo information failed, 
cardID(%d), deviceID(%d),"+ - " error code: %d", cardID, deviceID, int32(rCode)) - } - chip.Name = string(convertUCharToCharArr(oldChipInfo.chip_name)) - chip.Type = string(convertUCharToCharArr(oldChipInfo.chip_type)) - chip.Version = string(convertUCharToCharArr(oldChipInfo.chip_ver)) - chip.AICoreCnt = int(oldChipInfo.aicore_cnt) - } else { - chip.Name = string(convertUCharToCharArr(chipInfo.chip_name)) - chip.Type = string(convertUCharToCharArr(chipInfo.chip_type)) - chip.Version = string(convertUCharToCharArr(chipInfo.chip_ver)) - chip.AICoreCnt = int(chipInfo.aicore_cnt) - chip.NpuName = string(convertUCharToCharArr(chipInfo.npu_name)) - } - if !common.IsValidChipInfo(chip) { - return nil, fmt.Errorf("get device ChipInfo information failed, chip info is empty,"+ - " cardID(%d), deviceID(%d)", cardID, deviceID) - } - - return chip, nil -} - -// DcGetPhysicIDFromLogicID get physicID from logicID -func (d *DcManager) DcGetPhysicIDFromLogicID(logicID int32) (int32, error) { - if !common.IsValidLogicIDOrPhyID(logicID) { - return common.RetError, fmt.Errorf("logicID(%d) is invalid", logicID) - } - var physicID C.uint - if rCode := C.dcmi_get_device_phyid_from_logicid(C.uint(logicID), &physicID); int32(rCode) != common.Success { - return common.RetError, fmt.Errorf("get physic id from logicID(%d) failed, error code: %d", logicID, int32(rCode)) - } - if !common.IsValidLogicIDOrPhyID(int32(physicID)) { - return common.RetError, fmt.Errorf("get wrong physicID(%d) from logicID(%d)", uint32(physicID), logicID) - } - return int32(physicID), nil -} - -// DcGetDeviceIPAddress get device IP address by cardID and deviceID -func (d *DcManager) DcGetDeviceIPAddress(cardID, deviceID, ipType int32) (string, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return "", fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var portType C.enum_dcmi_port_type = 1 - var portID C.int - var ipAddress C.struct_dcmi_ip_addr - var maskAddress 
C.struct_dcmi_ip_addr - if ipType == ipAddrTypeV6 { - ipAddress.ip_type = ipAddrTypeV6 - } - rCode := C.dcmi_get_device_ip(C.int(cardID), C.int(deviceID), portType, portID, &ipAddress, &maskAddress) - if int32(rCode) != common.Success { - return "", fmt.Errorf("get device IP address failed, cardID(%d), deviceID(%d), error code: %d", - cardID, deviceID, int32(rCode)) - } - if ipType == ipAddrTypeV6 { - return d.buildIPv6Addr(ipAddress) - } - return d.buildIPv4Addr(ipAddress) -} - -func (d *DcManager) buildIPv4Addr(ipAddress C.struct_dcmi_ip_addr) (string, error) { - deviceIP := make([]string, 0, net.IPv4len) - for key, val := range ipAddress.u_addr { - if key >= net.IPv4len { - break - } - deviceIP = append(deviceIP, fmt.Sprintf("%v", val)) - } - if netIP := net.ParseIP(strings.Join(deviceIP, ".")); netIP != nil { - return netIP.String(), nil - } - return "", fmt.Errorf("the device IPv4 address is invalid, value: %v", deviceIP) -} - -func (d *DcManager) buildIPv6Addr(ipAddress C.struct_dcmi_ip_addr) (string, error) { - deviceIP := make([]byte, 0, net.IPv6len) - for key, val := range ipAddress.u_addr { - if key >= net.IPv6len { - break - } - deviceIP = append(deviceIP, byte(val)) - } - if netIP := net.IP(deviceIP); netIP != nil { - return netIP.String(), nil - } - return "", fmt.Errorf("the device IPv6 address is invalid, value: %v", deviceIP) -} - -func callDcmiGetDeviceNetworkHealth(cardID, deviceID int32, result chan<- common.DeviceNetworkHealth) { - var healthCode C.enum_dcmi_rdfx_detect_result - rCode := C.dcmi_get_device_network_health(C.int(cardID), C.int(deviceID), &healthCode) - result <- common.DeviceNetworkHealth{HealthCode: uint32(healthCode), RetCode: int32(rCode)} -} - -// DcGetDeviceNetWorkHealth get device network health by cardID and deviceID -func (d *DcManager) DcGetDeviceNetWorkHealth(cardID, deviceID int32) (uint32, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.UnRetError, fmt.Errorf("cardID(%d) or 
deviceID(%d) is invalid", cardID, deviceID) - } - - result := make(chan common.DeviceNetworkHealth, 1) - go callDcmiGetDeviceNetworkHealth(cardID, deviceID, result) - select { - case res := <-result: - if res.RetCode != common.Success { - return common.UnRetError, fmt.Errorf("get device network healthCode failed, cardID(%d),"+ - " deviceID(%d), ret code: %d, health code: %d", cardID, deviceID, res.RetCode, res.HealthCode) - } - - if int32(res.HealthCode) < 0 || int32(res.HealthCode) > int32(math.MaxInt8) { - return common.UnRetError, fmt.Errorf("get wrong device network healthCode, cardID(%d), deviceID(%d),"+ - " error healthCode: %d", cardID, deviceID, int32(res.HealthCode)) - } - - return res.HealthCode, nil - // dcmi_get_device_network_health is occasionally blocked for a long time, because of retrying, - // after the card dropped. This method is used to interrupt the execution of the dcmi interface, - // if invoking time excceeds 1 second. - case <-time.After(common.DcmiApiTimeout * time.Second): - return common.UnRetError, fmt.Errorf("accessing dcmi_get_device_network_health interface timeout, "+ - "cardID(%d), deviceID(%d)", cardID, deviceID) - } -} - -// DcGetLogicIDFromPhysicID get logicID from physicID -func (d *DcManager) DcGetLogicIDFromPhysicID(physicID int32) (int32, error) { - if !common.IsValidLogicIDOrPhyID(physicID) { - return common.RetError, fmt.Errorf("physicID(%d) is invalid", physicID) - } - var logicID C.uint - if rCode := C.dcmi_get_device_logicid_from_phyid(C.uint(physicID), &logicID); int32(rCode) != common.Success { - return common.RetError, fmt.Errorf("get logicID from physicID(%d) failed, error code: %d", - physicID, int32(rCode)) - } - - if !common.IsValidLogicIDOrPhyID(int32(logicID)) { - return common.RetError, fmt.Errorf("get wrong logicID(%d) from physicID(%d)", uint32(logicID), physicID) - } - return int32(logicID), nil -} - -// FuncDcmiMcuGetPowerInfo dcmi_mcu_get_power_info_new function for outer invoke -func 
FuncDcmiMcuGetPowerInfo(cardID int32) (float32, error) { - var power C.int - if retCode := C.dcmi_mcu_get_power_info_new(C.int(cardID), &power); int32(retCode) != common.Success { - return common.RetError, fmt.Errorf("mcu_get_power_info failed, error code is:%d", int32(retCode)) - } - parsedPower := float32(power) - if parsedPower < 0 { - return common.RetError, fmt.Errorf("get wrong mcu_get_power_info, cardID: %d, power: %f", cardID, - parsedPower) - } - return parsedPower * common.ReduceTenth, nil -} - -// DcGetMcuPowerInfo this function is only for Ascend310P, A910/A310 not support -func (d *DcManager) DcGetMcuPowerInfo(cardID int32) (float32, error) { - return 0, nil -} - -// DcGetProductType get product type by dcmi interface -func (d *DcManager) DcGetProductType(cardID, deviceID int32) (string, error) { - cProductType := C.CString(string(make([]byte, productTypeLen))) - defer C.free(unsafe.Pointer(cProductType)) - err := C.dcmi_get_product_type(C.int(cardID), C.int(deviceID), (*C.char)(cProductType), productTypeLen+1) - if err != 0 { - return "", fmt.Errorf("get product type failed, errCode: %d", int32(err)) - } - return C.GoString(cProductType), nil -} - -// DcGetNpuWorkMode get npu work mode, this function is only for Ascend910, A310/310P not support -func (d *DcManager) DcGetNpuWorkMode(cardID int32) (int, error) { - var cWorkMode C.uchar - err := C.dcmi_get_npu_work_mode(C.int(cardID), &cWorkMode) - if err != 0 { - return common.RetError, fmt.Errorf("get npu work mode failed, errCode: %d", int32(err)) - } - return int(cWorkMode), nil -} - -// DcSetDeviceReset reset spec device chip -func (d *DcManager) DcSetDeviceReset(cardID, deviceID int32) error { - var channelType C.enum_dcmi_reset_channel = C.INBAND_CHANNEL - return d.setDeviceReset(cardID, deviceID, channelType) -} - -// DcGetBrotherCardID get brother card id -func (d *DcManager) DcGetBrotherCardID(cardID, deviceID int32) (int32, error) { - var broCardID C.int - errCode := 
C.dcmi_get_netdev_brother_device(C.int(cardID), C.int(deviceID), &broCardID) - if errCode != common.Success { - return common.RetError, fmt.Errorf("unable to get brother card, errCode: %v", errCode) - } - return int32(broCardID), nil -} - -// DcGetOutBandChannelState get out band channel state -func (d *DcManager) DcGetOutBandChannelState(cardID, deviceID int32) error { - var channelState C.int - errCode := C.dcmi_get_device_outband_channel_state(C.int(cardID), C.int(deviceID), &channelState) - if errCode != common.Success { - return fmt.Errorf("get out band channel state error, errCode: %v", errCode) - } - if channelState != common.ChannelStateOk { - return fmt.Errorf("chip reset not support, channel state: %v", channelState) - } - return nil -} - -// DcPreResetSoc pre reset soc, used before reset out band -func (d *DcManager) DcPreResetSoc(cardID, deviceID int32) error { - errCode := C.dcmi_pre_reset_soc(C.int(cardID), C.int(deviceID)) - if errCode != common.Success { - return fmt.Errorf("pre reset failed, cardID: %v, deviceID: %v, errCode: %v", cardID, deviceID, errCode) - } - return nil -} - -// DcSetDeviceResetOutBand reset spec device chip out band -func (d *DcManager) DcSetDeviceResetOutBand(cardID, deviceID int32) error { - var channelType C.enum_dcmi_reset_channel = C.OUTBAND_CHANNEL - return d.setDeviceReset(cardID, deviceID, channelType) -} - -// DcRescanSoc trigger soc rescan, non-blocking -func (d *DcManager) DcRescanSoc(cardID, deviceID int32) error { - errCode := C.dcmi_rescan_soc(C.int(cardID), C.int(deviceID)) - if errCode != common.Success { - return fmt.Errorf("fail to rescan chip cardID %d, deviceID %v, errCode: %v", cardID, deviceID, errCode) - } - return nil -} - -func (d *DcManager) setDeviceReset(cardID, deviceID int32, channelType C.enum_dcmi_reset_channel) error { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - if errCode := 
C.dcmi_set_device_reset(C.int(cardID), C.int(deviceID), channelType); errCode != 0 { - return fmt.Errorf("cardID(%d) and deviceID(%d) hot reset errCode: %v", cardID, deviceID, errCode) - } - return nil -} - -// DcGetDeviceBootStatus get NPU boot status -func (d *DcManager) DcGetDeviceBootStatus(logicID int32) (int, error) { - if !common.IsValidLogicIDOrPhyID(logicID) { - return common.RetError, fmt.Errorf("input invalid logicID: %d", logicID) - } - cardID, deviceID, err := d.DcGetCardIDDeviceID(logicID) - if err != nil { - return common.RetError, fmt.Errorf("failed to get cardID and deviceID by logicID(%d)", logicID) - } - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.RetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var bootStatus C.enum_dcmi_boot_status = C.DCMI_BOOT_STATUS_FINISH - if errCode := C.dcmi_get_device_boot_status(C.int(cardID), C.int(deviceID), &bootStatus); errCode != 0 { - return common.RetError, fmt.Errorf("device boot status errCode: %v", errCode) - } - return int(bootStatus), nil -} - -// DcGetDeviceAllErrorCode get the error count and all error codes of the device -func (d *DcManager) DcGetDeviceAllErrorCode(cardID, deviceID int32) (int32, []int64, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.RetError, nil, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, - deviceID) - } - var errCount C.int - var errCodeArray [common.MaxErrorCodeCount]C.uint - retCode := C.dcmi_get_device_errorcode_v2(C.int(cardID), C.int(deviceID), &errCount, &errCodeArray[0], - common.MaxErrorCodeCount) - - var health C.uint - healthRetCode := C.dcmi_get_device_health(C.int(cardID), C.int(deviceID), &health) - - if int32(retCode) != common.Success && int32(healthRetCode) != common.DeviceNotReadyErrCode { - return common.RetError, nil, fmt.Errorf("failed to obtain the device errorcode based on cardID("+ - "%d) and deviceID(%d), error code: %d, error count: %d", 
cardID, deviceID, int32(retCode), int32(errCount)) - } - - errCodes := make([]int64, 0, len(errCodeArray)) - for _, errCode := range errCodeArray { - if int64(errCode) != 0 { - errCodes = append(errCodes, int64(errCode)) - } - } - - if int32(healthRetCode) == common.DeviceNotReadyErrCode { - hwlog.RunLog.Errorf("device errorcode v2 ret code: %d, device health ret code: %d, device not ready, "+ - "maybe a card drop fault occurred on cardID(%d) and deviceID(%d)", int32(retCode), int32(healthRetCode), - cardID, deviceID) - errCount += 1 - errCodes = append(errCodes, common.CardDropFaultCode) - } - - if int32(errCount) < 0 || int32(errCount) > common.MaxErrorCodeCount { - return common.RetError, nil, fmt.Errorf("get wrong errorcode count, "+ - "cardID(%d) and deviceID(%d), errorcode count: %d", cardID, deviceID, int32(errCount)) - } - - return int32(errCount), errCodes, nil -} - -// DcSubscribeDeviceFaultEvent subscribe device fault, callback with func 'faultEventCallFunc' -func (d *DcManager) DcSubscribeDeviceFaultEvent(cardID, deviceID int32) error { - if faultEventCallFunc == nil { - return errors.New("callFunc is invalid, can't start subscribe") - } - - var filter C.struct_dcmi_event_filter - if rCode := C.dcmi_subscribe_fault_event(C.int(cardID), C.int(deviceID), filter); int32(rCode) != common.Success { - return fmt.Errorf("subscribe fault event failed, cardID(%d) and deviceID(%d), error code: %d", - cardID, deviceID, int32(rCode)) - } - return nil -} - -// DcSetFaultEventCallFunc set fault event call back func -func (d *DcManager) DcSetFaultEventCallFunc(businessFunc func(common.DevFaultInfo)) { - faultEventCallFunc = businessFunc -} - -//export goEventFaultCallBack -func goEventFaultCallBack(event C.struct_dcmi_dms_fault_event) { - if faultEventCallFunc == nil { - hwlog.RunLog.Errorf("no fault event call back func") - return - } - // recovery event recorded fault event occurrence time, the recovery event time cannot be obtained. 
- // Therefore, all event occurrence time is recorded as the current host time when the event is received. - devFaultInfo := common.DevFaultInfo{ - EventID: int64(event.event_id), - LogicID: int32(event.deviceid), - ModuleType: int8(event.node_type), - ModuleID: int8(event.node_id), - SubModuleType: int8(event.sub_node_type), - SubModuleID: int8(event.sub_node_id), - Severity: int8(event.severity), - Assertion: int8(event.assertion), - AlarmRaisedTime: time.Now().UnixMilli(), - } - faultEventCallFunc(devFaultInfo) -} - -// DcGetDieID get chip die ID, like VDieID or NDieID, only Ascend910 has NDieID -func (d *DcManager) DcGetDieID(cardID, deviceID int32, dcmiDieType DieType) (string, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return "", fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - - if dcmiDieType != VDIE && dcmiDieType != NDIE { - return "", fmt.Errorf("dcmi die type can only be one of %d or %d", VDIE, NDIE) - } - - var dieIDObj C.struct_dcmi_die_id - if retCode := C.dcmi_get_device_die_v2(C.int(cardID), C.int(deviceID), - C.enum_dcmi_die_type(dcmiDieType), &dieIDObj); int32(retCode) != common.Success { - return "", buildDcmiErr(cardID, deviceID, "chip die ID", retCode) - } - - const hexBase = 16 - dieIDStr := make([]string, DieIDCount) - - hwlog.RunLog.Debugf("cardID(%d), deviceID(%d) get die type(%d) value %v", cardID, deviceID, dcmiDieType, - dieIDObj.soc_die) - for i := 0; i < DieIDCount; i++ { - s := strconv.FormatUint(uint64(dieIDObj.soc_die[i]), hexBase) - // Each part of the die id consists of 8 characters, and if the length is not enough, - // zero is added at the beginning - dieIDStr[i] = fmt.Sprintf("%08s", s) - } - return strings.ToUpper(strings.Join(dieIDStr, "-")), nil -} - -// DcGetDevProcessInfo chip process info -func (d *DcManager) DcGetDevProcessInfo(cardID, deviceID int32) (*common.DevProcessInfo, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return nil, 
fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - - var procList [common.MaxProcNum]C.struct_dcmi_proc_mem_info - var procNum C.int - - if retCode := C.dcmi_get_device_resource_info(C.int(cardID), C.int(deviceID), &procList[0], - &procNum); int32(retCode) != common.Success { - return nil, buildDcmiErr(cardID, deviceID, "device resource", retCode) - } - - if int32(procNum) < 0 || int32(procNum) > common.MaxProcNum { - return nil, fmt.Errorf("get invalid proccess num (%d), cardID(%d) and deviceID(%d)", int32(procNum), cardID, - deviceID) - } - - return convertToDevResourceInfo(procList, int32(procNum)), nil -} - -func convertToDevResourceInfo(procList [common.MaxProcNum]C.struct_dcmi_proc_mem_info, - procNum int32) *common.DevProcessInfo { - if procNum < 0 || procNum > common.MaxProcNum { - hwlog.RunLog.Errorf("process num %v is not within in the range [0~%v]", procNum, common.MaxProcNum) - return nil - } - - info := new(common.DevProcessInfo) - if procNum == 0 { - return info - } - - info.ProcNum = procNum - for i := int32(0); i < procNum; i++ { - proc := common.DevProcInfo{ - Pid: int32(procList[i].proc_id), - MemUsage: float64(procList[i].proc_mem_usage) / common.UnitMB, // convert byte to MB - } - info.DevProcArray = append(info.DevProcArray, proc) - } - - return info -} - -// DcGetPCIeBusInfo pcie bus info -func (d *DcManager) DcGetPCIeBusInfo(cardID, deviceID int32) (string, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return "", fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - - var pcieInfo C.struct_dcmi_pcie_info_all - - if retCode := C.dcmi_get_device_pcie_info_v2(C.int(cardID), - C.int(deviceID), &pcieInfo); int32(retCode) != common.Success { - return "", buildDcmiErr(cardID, deviceID, "pcie bus", retCode) - } - - info := fmt.Sprintf("%04X:%02X:%02X.%-4X", int32(pcieInfo.domain), uint32(pcieInfo.bdf_busid), - uint32(pcieInfo.bdf_deviceid), uint32(pcieInfo.bdf_funcid)) - 
hwlog.RunLog.Debugf("pcie bus info is: '%s'", info) - - return strings.TrimRight(info, " "), nil -} - -// DcGetDeviceBoardInfo return board info of device -func (d *DcManager) DcGetDeviceBoardInfo(cardID, deviceID int32) (common.BoardInfo, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.BoardInfo{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - - var cBoardInfo C.struct_dcmi_board_info - - if retCode := C.dcmi_get_device_board_info(C.int(cardID), C.int(deviceID), - &cBoardInfo); int32(retCode) != common.Success { - return common.BoardInfo{}, buildDcmiErr(cardID, deviceID, "board info", retCode) - } - - return common.BoardInfo{ - BoardId: uint32(cBoardInfo.board_id), - PcbId: uint32(cBoardInfo.pcb_id), - BomId: uint32(cBoardInfo.bom_id), - SlotId: uint32(cBoardInfo.slot_id), - }, nil -} - -// DcGetPCIEBandwidth get pcie bandwidth value -func (d *DcManager) DcGetPCIEBandwidth(cardID, deviceID int32, profilingTime int) (common.PCIEBwStat, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.PCIEBwStat{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var dcmiPCIEBandwidth C.struct_dcmi_pcie_link_bandwidth_info - var pcieBandwidth common.PCIEBwStat - dcmiPCIEBandwidth.profiling_time = C.int(profilingTime) - retCode := C.dcmi_get_pcie_link_bandwidth_info(C.int(cardID), C.int(deviceID), &dcmiPCIEBandwidth) - if int32(retCode) != common.Success { - return pcieBandwidth, buildDcmiErr(cardID, deviceID, "PCIEBandwidth", retCode) - } - - pcieBandwidth.PcieRxPBw = d.convertPcieBw(dcmiPCIEBandwidth.rx_p_bw) - pcieBandwidth.PcieRxNPBw = d.convertPcieBw(dcmiPCIEBandwidth.rx_np_bw) - pcieBandwidth.PcieRxCPLBw = d.convertPcieBw(dcmiPCIEBandwidth.rx_cpl_bw) - - pcieBandwidth.PcieTxPBw = d.convertPcieBw(dcmiPCIEBandwidth.tx_p_bw) - pcieBandwidth.PcieTxNPBw = d.convertPcieBw(dcmiPCIEBandwidth.tx_np_bw) - pcieBandwidth.PcieTxCPLBw = 
d.convertPcieBw(dcmiPCIEBandwidth.tx_cpl_bw) - - return pcieBandwidth, nil -} - -func (d *DcManager) convertPcieBw(pcieBwArr [agentdrvProfDataNum]C.uint) common.PcieStatValue { - return common.PcieStatValue{ - PcieMinBw: int32(pcieBwArr[0]), - PcieMaxBw: int32(pcieBwArr[1]), - PcieAvgBw: int32(pcieBwArr[agentdrvProfDataNum-1]), - } -} - -// DcGetDcmiVersion return dcmi version -func (d *DcManager) DcGetDcmiVersion() (string, error) { - cDcmiVer := C.CString(string(make([]byte, dcmiVersionLen))) - defer C.free(unsafe.Pointer(cDcmiVer)) - if retCode := C.dcmi_get_dcmi_version((*C.char)(cDcmiVer), dcmiVersionLen+1); int32(retCode) != common.Success { - return "", fmt.Errorf("get dcmi version failed, errCode: %d", int32(retCode)) - } - return C.GoString(cDcmiVer), nil -} - -// DcGetDeviceEccInfo get ECC info -func (d *DcManager) DcGetDeviceEccInfo(cardID, deviceID int32, inputType common.DcmiDeviceType) ( - *common.ECCInfo, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return nil, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - dcmiDeviceType, err := d.getInputType(inputType) - if err != nil { - return nil, err - } - var deviceEccInfo C.struct_dcmi_ecc_info - if retCode := C.dcmi_get_device_ecc_info(C.int(cardID), C.int(deviceID), dcmiDeviceType, - &deviceEccInfo); retCode != 0 { - return nil, buildDcmiErr(cardID, deviceID, "dcmi device ECC", retCode) - } - eccInfo := &common.ECCInfo{ - EnableFlag: int32(deviceEccInfo.enable_flag), - SingleBitErrorCnt: int64(deviceEccInfo.single_bit_error_cnt), - DoubleBitErrorCnt: int64(deviceEccInfo.double_bit_error_cnt), - TotalSingleBitErrorCnt: int64(deviceEccInfo.total_single_bit_error_cnt), - TotalDoubleBitErrorCnt: int64(deviceEccInfo.total_double_bit_error_cnt), - SingleBitIsolatedPagesCnt: int64(deviceEccInfo.single_bit_isolated_pages_cnt), - DoubleBitIsolatedPagesCnt: int64(deviceEccInfo.double_bit_isolated_pages_cnt), - } - return eccInfo, nil -} - -// 
DcGetHccsStatisticInfo get HCCS statistic info -func (d *DcManager) DcGetHccsStatisticInfo(cardID, deviceID int32) (common.HccsStatisticInfo, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.HccsStatisticInfo{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var cMainCmd = C.enum_dcmi_main_cmd(MainCmdHccs) - subCmd := HccsSubCmdGetStatisticInfo - var hccsStatisticInfo C.struct_dcmi_hccs_statistic_info - // Use a secure function to get the address (for cleanCode) - addr, err := getAddrWithOffset(unsafe.Pointer(&hccsStatisticInfo), unsafe.Sizeof(hccsStatisticInfo), 0) - if err != nil { - return common.HccsStatisticInfo{}, fmt.Errorf("get hccsStatisticInfo addr failed, error is: %v", err) - } - size := C.uint(unsafe.Sizeof(hccsStatisticInfo)) - if retCode := C.dcmi_get_device_info(C.int(cardID), C.int(deviceID), cMainCmd, C.uint(subCmd), - addr, &size); int32(retCode) != common.Success { - return common.HccsStatisticInfo{}, buildDcmiErr(cardID, deviceID, "hccs statistic", retCode) - } - return convertHccsStatisticInfoStruct(hccsStatisticInfo), nil -} - -// DcGetHccsStatisticInfoU64 get HCCS statistic info -func (d *DcManager) DcGetHccsStatisticInfoU64(cardID, deviceID int32) (common.HccsStatisticInfo, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.HccsStatisticInfo{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var cMainCmd = C.enum_dcmi_main_cmd(MainCmdHccs) - subCmd := HccsSubCmdGetStatisticInfoU64 - var hccsStatisticInfo C.struct_dcmi_hccs_statistic_info_u64 - // Use a secure function to get the address (for cleanCode) - addr, err := getAddrWithOffset(unsafe.Pointer(&hccsStatisticInfo), unsafe.Sizeof(hccsStatisticInfo), 0) - if err != nil { - return common.HccsStatisticInfo{}, fmt.Errorf("get hccsStatisticInfo addr failed, error is: %v", err) - } - size := C.uint(unsafe.Sizeof(hccsStatisticInfo)) - if retCode := 
C.dcmi_get_device_info(C.int(cardID), C.int(deviceID), cMainCmd, C.uint(subCmd), - addr, &size); int32(retCode) != common.Success { - return common.HccsStatisticInfo{}, buildDcmiErr(cardID, deviceID, "hccs statistic", retCode) - } - return convertHccsStatisticInfoStructU64(hccsStatisticInfo), nil -} - -func convertHccsStatisticInfoStruct(hccsStatisticInfo C.struct_dcmi_hccs_statistic_info) common.HccsStatisticInfo { - cgoHccsStatisticInfo := common.HccsStatisticInfo{} - for i := uint32(0); i < dcmiHccsMaxPcsNum; i++ { - cgoHccsStatisticInfo.TxCnt = append(cgoHccsStatisticInfo.TxCnt, uint64(hccsStatisticInfo.tx_cnt[i])) - cgoHccsStatisticInfo.CrcErrCnt = append(cgoHccsStatisticInfo.CrcErrCnt, uint64(hccsStatisticInfo.crc_err_cnt[i])) - cgoHccsStatisticInfo.RxCnt = append(cgoHccsStatisticInfo.RxCnt, uint64(hccsStatisticInfo.rx_cnt[i])) - } - return cgoHccsStatisticInfo -} - -func convertHccsStatisticInfoStructU64(hccsStatisticInfo C.struct_dcmi_hccs_statistic_info_u64) common.HccsStatisticInfo { - cgoHccsStatisticInfo := common.HccsStatisticInfo{} - for i := uint32(0); i < dcmiHccsMaxPcsNum; i++ { - cgoHccsStatisticInfo.TxCnt = append(cgoHccsStatisticInfo.TxCnt, uint64(hccsStatisticInfo.tx_cnt[i])) - cgoHccsStatisticInfo.CrcErrCnt = append(cgoHccsStatisticInfo.CrcErrCnt, uint64(hccsStatisticInfo.crc_err_cnt[i])) - cgoHccsStatisticInfo.RxCnt = append(cgoHccsStatisticInfo.RxCnt, uint64(hccsStatisticInfo.rx_cnt[i])) - } - return cgoHccsStatisticInfo -} - -// DcGetHccsBandwidthInfo get HCCS bandwidth info -func (d *DcManager) DcGetHccsBandwidthInfo(cardID int32, deviceID int32, - profilingTime int) (common.HccsBandwidthInfo, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.HccsBandwidthInfo{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var hccsBandwidthInfo C.struct_dcmi_hccs_bandwidth_info - hccsBandwidthInfo.profiling_time = C.int(profilingTime) - if retCode := 
C.dcmi_get_hccs_link_bandwidth_info(C.int(cardID), C.int(deviceID), - &hccsBandwidthInfo); int32(retCode) != common.Success { - return common.HccsBandwidthInfo{}, buildDcmiErr(cardID, deviceID, "hccs bandwidth", retCode) - } - return convertHccsBandwidthInfoStruct(hccsBandwidthInfo), nil -} - -func convertHccsBandwidthInfoStruct(hccsBandwidthInfo C.struct_dcmi_hccs_bandwidth_info) common.HccsBandwidthInfo { - cgoHccsBWInfo := common.HccsBandwidthInfo{} - cgoHccsBWInfo.ProfilingTime = uint32(hccsBandwidthInfo.profiling_time) - cgoHccsBWInfo.TotalTxbw = float64(hccsBandwidthInfo.total_txbw) - cgoHccsBWInfo.TotalRxbw = float64(hccsBandwidthInfo.total_rxbw) - for i := uint32(0); i < dcmiHccsMaxPcsNum; i++ { - cgoHccsBWInfo.TxBandwidth = append(cgoHccsBWInfo.TxBandwidth, float64(hccsBandwidthInfo.tx_bandwidth[i])) - cgoHccsBWInfo.RxBandwidth = append(cgoHccsBWInfo.RxBandwidth, float64(hccsBandwidthInfo.rx_bandwidth[i])) - } - return cgoHccsBWInfo -} - -// DcGetSioInfo get SIO info -func (d *DcManager) DcGetSioInfo(cardID, deviceID int32) (common.SioCrcErrStatisticInfo, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.SioCrcErrStatisticInfo{}, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var cMainCmd = C.enum_dcmi_main_cmd(MainCmdSio) - subCmd := SioSubCmdCrcErrStatistics - var sioInfo C.struct_dcmi_sio_crc_err_statistic_info - // Use a secure function to get the address (for cleanCode) - addr, err := getAddrWithOffset(unsafe.Pointer(&sioInfo), unsafe.Sizeof(sioInfo), 0) - if err != nil { - return common.SioCrcErrStatisticInfo{}, fmt.Errorf("get sioInfo addr failed, error is: %v", err) - } - size := C.uint(unsafe.Sizeof(sioInfo)) - if retCode := C.dcmi_get_device_info(C.int(cardID), C.int(deviceID), cMainCmd, C.uint(subCmd), - addr, &size); int32(retCode) != common.Success { - return common.SioCrcErrStatisticInfo{}, buildDcmiErr(cardID, deviceID, "super pod sio", retCode) - } - return 
convertSioInfoStruct(sioInfo), nil -} - -func convertSioInfoStruct(sPodSioInfo C.struct_dcmi_sio_crc_err_statistic_info) common.SioCrcErrStatisticInfo { - cgoSPodSioInfo := common.SioCrcErrStatisticInfo{ - TxErrCnt: int64(sPodSioInfo.tx_error_count), - RxErrCnt: int64(sPodSioInfo.rx_error_count), - } - for i := uint32(0); i < dcmiMaxReserveNum; i++ { - cgoSPodSioInfo.Reserved = append(cgoSPodSioInfo.Reserved, uint32(sPodSioInfo.reserved[i])) - } - return cgoSPodSioInfo -} - -func (d *DcManager) getInputType(inputType common.DcmiDeviceType) (C.enum_dcmi_device_type, error) { - switch inputType { - case common.DcmiDeviceTypeDDR: - return C.DCMI_DEVICE_TYPE_DDR, nil - case common.DcmiDeviceTypeSRAM: - return C.DCMI_DEVICE_TYPE_SRAM, nil - case common.DcmiDeviceTypeHBM: - return C.DCMI_DEVICE_TYPE_HBM, nil - case common.DcmiDeviceTypeNPU: - return C.DCMI_DEVICE_TYPE_NPU, nil - case common.DcmiDeviceTypeNONE: - return C.DCMI_DEVICE_TYPE_NONE, nil - default: - return C.DCMI_DEVICE_TYPE_NONE, fmt.Errorf("invalid input type for getting device ecc info") - } -} - -// Define a safe function to get address offsets (for cleanCode) -func getAddrWithOffset(addr unsafe.Pointer, length uintptr, offset uintptr) (unsafe.Pointer, error) { - if offset > length { - return nil, fmt.Errorf("offset(%d) is invalid, length(%d)", offset, length) - } - return (unsafe.Pointer)(uintptr(addr) + offset), nil -} - -// DcGetDeviceMainBoardInfo return mainboardId of device -func (d *DcManager) DcGetDeviceMainBoardInfo(cardID, deviceID int32) (uint32, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return 0, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var cMainBoardId C.uint - if retCode := C.dcmi_get_mainboard_id(C.int(cardID), C.int(deviceID), - &cMainBoardId); int32(retCode) != common.Success { - return 0, buildDcmiErr(cardID, deviceID, "mainBoardId", retCode) - } - - return uint32(cMainBoardId), nil -} -func buildDcmiErr(cardID, deviceID int32, 
msg string, errCode C.int) error { - errDesc, ok := dcmiErrMap[int32(errCode)] - if !ok { - errDesc = "unknown error code" - } - return fmt.Errorf("cardID(%d),deviceID(%d):get %s info failed,error code: %v,error desc: %v", - cardID, deviceID, msg, errCode, errDesc) -} - -// DcGetSuperPodStatus get super pod status -func (d *DcManager) DcGetSuperPodStatus(cardID, deviceID int32, sdid uint32) (int, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return 0, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - var status C.uint - if retCode := C.dcmi_get_spod_node_status(C.int(cardID), C.int(deviceID), - C.unsigned(sdid), &status); int32(retCode) != common.Success { - return 0, buildDcmiErr(cardID, deviceID, "GetSuperPodStatus", retCode) - } - return int(status), nil -} - -// DcSetSuperPodStatus set super pod status -func (d *DcManager) DcSetSuperPodStatus(cardID, deviceID int32, sdid, status uint32) error { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - if retCode := C.dcmi_set_spod_node_status(C.int(cardID), C.int(deviceID), - C.uint(sdid), C.uint(status)); int32(retCode) != common.Success { - return buildDcmiErr(cardID, deviceID, "DcSetSuperPodStatus", retCode) - } - return nil -} - -// DcGetCardElabelV2 get card elabel information -func (d *DcManager) DcGetCardElabelV2(cardID int32) (common.ElabelInfo, error) { - if !common.IsValidCardID(cardID) { - return common.ElabelInfo{}, fmt.Errorf("cardID(%d) is invalid", cardID) - } - var elabelInfo C.struct_dcmi_elabel_info - if retCode := C.dcmi_get_card_elabel_v2(C.int(cardID), &elabelInfo); int32(retCode) != common.Success { - return common.ElabelInfo{}, fmt.Errorf("cardID(%d): get elabel info failed, error code: %v", cardID, retCode) - } - return common.ElabelInfo{ - ProductName: C.GoString(&elabelInfo.product_name[0]), - Model: C.GoString(&elabelInfo.model[0]), - Manufacturer: 
C.GoString(&elabelInfo.manufacturer[0]), - ManufacturerDate: C.GoString(&elabelInfo.manufacturer_date[0]), - SerialNumber: C.GoString(&elabelInfo.serial_number[0]), - }, nil -} diff --git a/mind-cluster/component/ascend-common/devmanager/dcmi/dcmi_interface_api.h b/mind-cluster/component/ascend-common/devmanager/dcmi/dcmi_interface_api.h deleted file mode 100644 index 7ffe468..0000000 --- a/mind-cluster/component/ascend-common/devmanager/dcmi/dcmi_interface_api.h +++ /dev/null @@ -1,596 +0,0 @@ -/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -#ifndef __DCMI_INTERFACE_API_H__ -#define __DCMI_INTERFACE_API_H__ - -#ifdef __cplusplus -#if __cplusplus -extern "C" { -#endif -#endif /* __cplusplus */ - -#define DCMIDLLEXPORT static - -#define MAX_CHIP_NAME_LEN 32 // Maximum length of chip name -#define TEMPLATE_NAME_LEN 32 -#define DIE_ID_COUNT 5 // Number of die ID characters -#define AGENTDRV_PROF_DATA_NUM 3 -#define MAX_LENGTH 256 // Maximum length for elabel info fields - -/*----------------------------------------------* - * Structure description * - *----------------------------------------------*/ -struct dcmi_chip_info { - unsigned char chip_type[MAX_CHIP_NAME_LEN]; - unsigned char chip_name[MAX_CHIP_NAME_LEN]; - unsigned char chip_ver[MAX_CHIP_NAME_LEN]; - unsigned int aicore_cnt; -}; - -struct dcmi_chip_info_v2 { - unsigned char chip_type[MAX_CHIP_NAME_LEN]; - unsigned char chip_name[MAX_CHIP_NAME_LEN]; - unsigned char chip_ver[MAX_CHIP_NAME_LEN]; - unsigned int aicore_cnt; - unsigned char npu_name[MAX_CHIP_NAME_LEN]; -}; - -struct dcmi_pcie_info_all { - unsigned int venderid; /* 厂商id */ - unsigned int subvenderid; /* 厂商子id */ - unsigned int deviceid; /* 设备id */ - unsigned int subdeviceid; /* 设备子id */ - int domain; - unsigned int bdf_busid; - unsigned int bdf_deviceid; - unsigned int bdf_funcid; - unsigned char reserve[32]; /* the size of dcmi_pcie_info_all is 64 */ -}; - -struct dcmi_die_id { - unsigned int soc_die[DIE_ID_COUNT]; -}; - -struct dcmi_ecc_info { - int enable_flag; - unsigned int single_bit_error_cnt; - unsigned int double_bit_error_cnt; - unsigned int total_single_bit_error_cnt; - unsigned int total_double_bit_error_cnt; - unsigned int single_bit_isolated_pages_cnt; - unsigned int double_bit_isolated_pages_cnt; - unsigned int single_bit_next_isolated_pages_cnt; - unsigned int double_bit_next_isolated_pages_cnt; -}; - -struct dcmi_hbm_info { - unsigned long long memory_size; - unsigned int freq; - unsigned long long memory_usage; - int temp; - unsigned int bandwith_util_rate; -}; 
- -struct dcmi_get_memory_info_stru { - unsigned long long memory_size; /* unit:MB */ - unsigned long long memory_available; /* free + hugepages_free * hugepagesize */ - unsigned int freq; - unsigned long hugepagesize; /* unit:KB */ - unsigned long hugepages_total; - unsigned long hugepages_free; - unsigned int utiliza; /* ddr memory info usages */ - unsigned char reserve[60]; /* the size of dcmi_memory_info is 96 */ -}; - -enum dcmi_ip_addr_type { - DCMI_IPADDR_TYPE_V4 = 0, /** IPv4 */ - DCMI_IPADDR_TYPE_V6 = 1, /** IPv6 */ - DCMI_IPADDR_TYPE_ANY = 2 /** IPv4+IPv6 ("dual-stack") */ -}; - -struct dcmi_ip_addr { - union { - unsigned char ip6[16]; - unsigned char ip4[4]; - } u_addr; - enum dcmi_ip_addr_type ip_type; -}; - -enum dcmi_unit_type { - NPU_TYPE = 0, - MCU_TYPE = 1, - CPU_TYPE = 2, - INVALID_TYPE = 0xFF -}; - -enum dcmi_rdfx_detect_result { - DCMI_RDFX_DETECT_OK = 0, - DCMI_RDFX_DETECT_SOCK_FAIL = 1, - DCMI_RDFX_DETECT_RECV_TIMEOUT = 2, - DCMI_RDFX_DETECT_UNREACH = 3, - DCMI_RDFX_DETECT_TIME_EXCEEDED = 4, - DCMI_RDFX_DETECT_FAULT = 5, - DCMI_RDFX_DETECT_INIT = 6, - DCMI_RDFX_DETECT_THREAD_ERR = 7, - DCMI_RDFX_DETECT_IP_SET = 8, - DCMI_RDFX_DETECT_MAX = 0xFF -}; - -enum dcmi_port_type { - DCMI_VNIC_PORT = 0, - DCMI_ROCE_PORT = 1, - DCMI_INVALID_PORT -}; - -enum dcmi_main_cmd { - DCMI_MAIN_CMD_DVPP = 0, - DCMI_MAIN_CMD_ISP, - DCMI_MAIN_CMD_TS_GROUP_NUM, - DCMI_MAIN_CMD_CAN, - DCMI_MAIN_CMD_UART, - DCMI_MAIN_CMD_UPGRADE = 5, - DCMI_MAIN_CMD_HCCS = 16, - DCMI_MAIN_CMD_TEMP = 50, - DCMI_MAIN_CMD_SVM = 51, - DCMI_MAIN_CMD_VDEV_MNG, - DCMI_MAIN_CMD_SIO = 56, - DCMI_MAIN_CMD_DEVICE_SHARE = 0x8001, - DCMI_MAIN_CMD_MAX -}; - -enum dcmi_freq_type { - DCMI_FREQ_DDR = 1, - DCMI_FREQ_CTRLCPU = 2, - DCMI_FREQ_HBM = 6, - DCMI_FREQ_AICORE_CURRENT_ = 7, - DCMI_FREQ_AICORE_MAX = 9, - DCMI_FREQ_VECTORCORE_CURRENT = 12 -}; - -enum dcmi_reset_channel { - OUTBAND_CHANNEL = 0, // out-of-band reset - INBAND_CHANNEL // in-band reset -}; - -enum dcmi_boot_status { - 
DCMI_BOOT_STATUS_UNINIT = 0, // not init - DCMI_BOOT_STATUS_BIOS, // BIOS starting - DCMI_BOOT_STATUS_OS, // OS starting - DCMI_BOOT_STATUS_FINISH // started -}; - -enum dcmi_device_type { - DCMI_DEVICE_TYPE_DDR, - DCMI_DEVICE_TYPE_SRAM, - DCMI_DEVICE_TYPE_HBM, - DCMI_DEVICE_TYPE_NPU, - DCMI_DEVICE_TYPE_NONE = 0xff -}; - -enum dcmi_event_type { - DCMI_DMS_FAULT_EVENT = 0, -}; - -enum dcmi_die_type { - NDIE, - VDIE -}; - -#define DCMI_VDEV_RES_NAME_LEN 16 -#define DCMI_VDEV_SIZE 20 -#define DCMI_VDEV_FOR_RESERVE 32 -#define DCMI_SOC_SPLIT_MAX 32 -#define DCMI_MAX_EVENT_NAME_LENGTH 256 -#define DCMI_MAX_EVENT_DATA_LENGTH 32 -#define DCMI_EVENT_FILTER_FLAG_EVENT_ID (1UL << 0) -#define DCMI_EVENT_FILTER_FLAG_SERVERITY (1UL << 1) -#define DCMI_EVENT_FILTER_FLAG_NODE_TYPE (1UL << 2) -#define DCMI_MAX_EVENT_RESV_LENGTH 32 -#define HCCS_MAX_PCS_NUM 16 -#define HCCS_RES_PCS_NUM 64 -#define IP_ADDR_LIST_LEN 1024 -#define HCCS_PING_MESH_MAX_NUM 48 -#define ADDR_MAX_LEN 16 - -struct dcmi_base_resource { - unsigned long long token; - unsigned long long token_max; - unsigned long long task_timeout; - unsigned int vfg_id; - unsigned char vip_mode; - unsigned char reserved[DCMI_VDEV_FOR_RESERVE - 1]; /* bytes aligned */ -}; - -/* total types of computing resource */ -struct dcmi_computing_resource { - /* accelator resource */ - float aic; - float aiv; - unsigned short dsa; - unsigned short rtsq; - unsigned short acsq; - unsigned short cdqm; - unsigned short c_core; - unsigned short ffts; - unsigned short sdma; - unsigned short pcie_dma; - - /* memory resource, MB as unit */ - unsigned long long memory_size; - - /* id resource */ - unsigned int event_id; - unsigned int notify_id; - unsigned int stream_id; - unsigned int model_id; - - /* cpu resource */ - unsigned short topic_schedule_aicpu; - unsigned short host_ctrl_cpu; - unsigned short host_aicpu; - unsigned short device_aicpu; - unsigned short topic_ctrl_cpu_slot; - - /* vnpu resource */ - unsigned int vdev_aicore_utilization; 
- unsigned long long vdev_memory_total; - unsigned long long vdev_memory_free; - - unsigned char reserved[DCMI_VDEV_FOR_RESERVE-DCMI_VDEV_SIZE]; -}; - -struct dcmi_media_resource { - /* dvpp resource */ - float jpegd; - float jpege; - float vpc; - float vdec; - float pngd; - float venc; - unsigned char reserved[DCMI_VDEV_FOR_RESERVE]; -}; - -struct dcmi_create_vdev_out { - unsigned int vdev_id; - unsigned int pcie_bus; - unsigned int pcie_device; - unsigned int pcie_func; - unsigned int vfg_id; - unsigned char reserved[DCMI_VDEV_FOR_RESERVE]; -}; - -struct dcmi_create_vdev_res_stru { - unsigned int vdev_id; - unsigned int vfg_id; - char template_name[TEMPLATE_NAME_LEN]; - unsigned char reserved[64]; -}; - -struct dcmi_vdev_query_info { - char name[DCMI_VDEV_RES_NAME_LEN]; - unsigned int status; - unsigned int is_container_used; - unsigned int vfid; - unsigned int vfg_id; - unsigned long long container_id; - struct dcmi_base_resource base; - struct dcmi_computing_resource computing; - struct dcmi_media_resource media; -}; - -/* for single search */ -struct dcmi_vdev_query_stru { - unsigned int vdev_id; - struct dcmi_vdev_query_info query_info; -}; - -struct dcmi_soc_free_resource { - unsigned int vfg_num; - unsigned int vfg_bitmap; - struct dcmi_base_resource base; - struct dcmi_computing_resource computing; - struct dcmi_media_resource media; -}; - -struct dcmi_soc_total_resource { - unsigned int vdev_num; - unsigned int vdev_id[DCMI_SOC_SPLIT_MAX]; - unsigned int vfg_num; - unsigned int vfg_bitmap; - struct dcmi_base_resource base; - struct dcmi_computing_resource computing; - struct dcmi_media_resource media; -}; - -struct dcmi_spod_info { - unsigned int sdid; - unsigned int scale_type; - unsigned int super_pod_id; - unsigned int server_id; - unsigned int reserve[8]; -}; - -struct dcmi_dms_fault_event { - unsigned int event_id; /* Event ID */ - unsigned short deviceid; /* Device ID */ - unsigned char node_type; /* Node type */ - unsigned char node_id; /* Node ID 
*/ - unsigned char sub_node_type; /* Subnode type */ - unsigned char sub_node_id; /* Subnode ID */ - unsigned char severity; /* Event severity. 0: warning; 1: minor; 2: major; 3: critical */ - unsigned char assertion; /* Event type. 0: fault recovery; 1: fault generation; 2: one-off event */ - int event_serial_num; /* Alarm serial number */ - int notify_serial_num; /* Notification serial number*/ - /* Time when the event occurs, presenting as the number of seconds that have elapsed since the Unix epoch. */ - unsigned long long alarm_raised_time; - char event_name[DCMI_MAX_EVENT_NAME_LENGTH]; /* Event description */ - char additional_info[DCMI_MAX_EVENT_DATA_LENGTH]; /* Additional event information */ - unsigned char resv[DCMI_MAX_EVENT_RESV_LENGTH]; /**< Reserves 32 bytes */ -}; - -struct dcmi_event { - enum dcmi_event_type type; /* Event type */ - union { - struct dcmi_dms_fault_event dms_event; /* Event content */ - } event_t; -}; - -struct dcmi_event_filter { - /* It can be used to enable one or all filter criteria. The filter criteria are as follows: - 0: disables the filter criteria. - DCMI_EVENT_FILTER_FLAG_EVENT_ID: receives only specified events. - DCMI_EVENT_FILTER_FLAG_SERVERITY: receives only the events of a specified level and higher levels. - DCMI_EVENT_FILTER_FLAG_NODE_TYPE: receives only events of a specified node type. */ - unsigned long long filter_flag; - /* Receives a specified event. For details, see the Health Management Error Definition. */ - unsigned int event_id; - /* Receives events of a specified level and higher levels. For details, - see the severity definition in the struct dcmi_dms_fault_event structure. */ - unsigned char severity; - /* Receives only events of a specified node type. For details, see the Health Management Error Definition. */ - unsigned char node_type; - unsigned char resv[DCMI_MAX_EVENT_RESV_LENGTH]; /* < Reserves 32 bytes. 
*/ -}; - -struct dcmi_proc_mem_info { - int proc_id; - // unit is byte - unsigned long proc_mem_usage; -}; - -struct dcmi_board_info { - unsigned int board_id; - unsigned int pcb_id; - unsigned int bom_id; - unsigned int slot_id; // slot_id indicates pcie slot ID of the chip -}; - -struct dcmi_pcie_link_bandwidth_info { - int profiling_time; - unsigned int tx_p_bw[AGENTDRV_PROF_DATA_NUM]; - unsigned int tx_np_bw[AGENTDRV_PROF_DATA_NUM]; - unsigned int tx_cpl_bw[AGENTDRV_PROF_DATA_NUM]; - unsigned int tx_np_lantency[AGENTDRV_PROF_DATA_NUM]; - unsigned int rx_p_bw[AGENTDRV_PROF_DATA_NUM]; - unsigned int rx_np_bw[AGENTDRV_PROF_DATA_NUM]; - unsigned int rx_cpl_bw[AGENTDRV_PROF_DATA_NUM]; -}; - -struct dcmi_hccs_statistic_info { - unsigned int tx_cnt[HCCS_MAX_PCS_NUM]; - unsigned int rx_cnt[HCCS_MAX_PCS_NUM]; - unsigned int crc_err_cnt[HCCS_MAX_PCS_NUM]; - unsigned int retry_cnt[HCCS_MAX_PCS_NUM]; - unsigned int reserved_field_cnt[HCCS_RES_PCS_NUM]; -}; - -struct dcmi_hccs_statistic_info_u64 { - unsigned long long tx_cnt[HCCS_MAX_PCS_NUM]; - unsigned long long rx_cnt[HCCS_MAX_PCS_NUM]; - unsigned long long crc_err_cnt[HCCS_MAX_PCS_NUM]; - unsigned long long retry_cnt[HCCS_MAX_PCS_NUM]; - unsigned long long reserved[HCCS_RES_PCS_NUM]; -}; - -struct dcmi_hccs_bandwidth_info { - int profiling_time; - double total_txbw; - double total_rxbw; - double tx_bandwidth[HCCS_MAX_PCS_NUM]; - double rx_bandwidth[HCCS_MAX_PCS_NUM]; -}; - -struct dcmi_sio_crc_err_statistic_info { - unsigned short tx_error_count; - unsigned short rx_error_count; - unsigned char reserved[8]; -}; - -struct dcmi_elabel_info { - char product_name[MAX_LENGTH]; - char model[MAX_LENGTH]; - char manufacturer[MAX_LENGTH]; - char manufacturer_date[MAX_LENGTH]; - char serial_number[MAX_LENGTH]; -}; - -struct dcmi_hccsping_mesh_operate { - char dst_addr_list[IP_ADDR_LIST_LEN]; - int pkt_size; - int pkt_send_num; - int pkt_interval; - int timeout; - int task_interval; - int task_id; -}; - -struct 
dcmi_hccsping_mesh_info { - char dst_addr[HCCS_PING_MESH_MAX_NUM][ADDR_MAX_LEN]; - unsigned int suc_pkt_num[HCCS_PING_MESH_MAX_NUM]; - unsigned int fail_pkt_num[HCCS_PING_MESH_MAX_NUM]; - long max_time[HCCS_PING_MESH_MAX_NUM]; - long min_time[HCCS_PING_MESH_MAX_NUM]; - long avg_time[HCCS_PING_MESH_MAX_NUM]; - long tp95_time[HCCS_PING_MESH_MAX_NUM]; - int reply_stat_num[HCCS_PING_MESH_MAX_NUM]; - unsigned long long ping_total_num[HCCS_PING_MESH_MAX_NUM]; - int dest_num; -}; - -#define DCMI_VERSION_1 -#define DCMI_VERSION_2 - -#if defined DCMI_VERSION_2 - -DCMIDLLEXPORT int dcmi_init(void); - -DCMIDLLEXPORT int dcmi_get_card_list(int *card_num, int *card_list, int list_len); - -DCMIDLLEXPORT int dcmi_get_device_num_in_card(int card_id, int *device_num); - -DCMIDLLEXPORT int dcmi_get_device_id_in_card(int card_id, int *device_id_max, int *mcu_id, int *cpu_id); - -DCMIDLLEXPORT int dcmi_get_device_type(int card_id, int device_id, enum dcmi_unit_type *device_type); - -DCMIDLLEXPORT int dcmi_get_device_pcie_info_v2(int card_id, int device_id, struct dcmi_pcie_info_all *pcie_info); - -DCMIDLLEXPORT int dcmi_get_device_chip_info(int card_id, int device_id, struct dcmi_chip_info *chip_info); - -DCMIDLLEXPORT int dcmi_get_device_chip_info_v2(int card_id, int device_id, struct dcmi_chip_info_v2 *chip_info); - -DCMIDLLEXPORT int dcmi_get_device_power_info(int card_id, int device_id, int *power); - -DCMIDLLEXPORT int dcmi_get_device_health(int card_id, int device_id, unsigned int *health); - -DCMIDLLEXPORT int dcmi_get_device_errorcode_v2( - int card_id, int device_id, int *error_count, unsigned int *error_code_list, unsigned int list_len); - -DCMIDLLEXPORT int dcmi_get_device_temperature(int card_id, int device_id, int *temperature); - -DCMIDLLEXPORT int dcmi_get_device_voltage(int card_id, int device_id, unsigned int *voltage); - -DCMIDLLEXPORT int dcmi_get_device_ecc_info(int card_id, int device_id, enum dcmi_device_type input_type, - struct dcmi_ecc_info *device_ecc_info); 
- -DCMIDLLEXPORT int dcmi_get_device_frequency( - int card_id, int device_id, enum dcmi_freq_type input_type, unsigned int *frequency); - -DCMIDLLEXPORT int dcmi_get_device_hbm_info(int card_id, int device_id, struct dcmi_hbm_info *hbm_info); - -DCMIDLLEXPORT int dcmi_get_device_memory_info_v3(int card_id, int device_id, - struct dcmi_get_memory_info_stru *memory_info); - -DCMIDLLEXPORT int dcmi_get_device_utilization_rate( - int card_id, int device_id, int input_type, unsigned int *utilization_rate); - -DCMIDLLEXPORT int dcmi_get_device_info( - int card_id, int device_id, enum dcmi_main_cmd main_cmd, unsigned int sub_cmd, void *buf, unsigned int *size); - -DCMIDLLEXPORT int dcmi_get_device_ip(int card_id, int device_id, enum dcmi_port_type input_type, int port_id, - struct dcmi_ip_addr *ip, struct dcmi_ip_addr *mask); - -DCMIDLLEXPORT int dcmi_get_device_network_health(int card_id, int device_id, enum dcmi_rdfx_detect_result *result); - -DCMIDLLEXPORT int dcmi_get_device_logic_id(int *device_logic_id, int card_id, int device_id); - -DCMIDLLEXPORT int dcmi_create_vdevice(int card_id, int device_id, struct dcmi_create_vdev_res_stru *vdev, - struct dcmi_create_vdev_out *out); - -DCMIDLLEXPORT int dcmi_set_destroy_vdevice(int card_id, int device_id, unsigned int vdevid); - -DCMIDLLEXPORT int dcmi_get_device_phyid_from_logicid(unsigned int logicid, unsigned int *phyid); - -DCMIDLLEXPORT int dcmi_get_device_logicid_from_phyid(unsigned int phyid, unsigned int *logicid); - -DCMIDLLEXPORT int dcmi_get_card_id_device_id_from_logicid(int *card_id, int *device_id, unsigned int device_logic_id); - -DCMIDLLEXPORT int dcmi_get_card_id_device_id_from_phyid(int *card_id, int *device_id, unsigned int device_phy_id); - -DCMIDLLEXPORT int dcmi_get_product_type(int card_id, int device_id, char *product_type_str, int buf_size); - -DCMIDLLEXPORT int dcmi_set_device_reset(int card_id, int device_id, enum dcmi_reset_channel channel_type); - -DCMIDLLEXPORT int 
dcmi_get_device_outband_channel_state(int card_id, int device_id, int* channel_state); - -DCMIDLLEXPORT int dcmi_pre_reset_soc(int card_id, int device_id); - -DCMIDLLEXPORT int dcmi_rescan_soc(int card_id, int device_id); - -DCMIDLLEXPORT int dcmi_get_netdev_brother_device(int card_id, int device_id, int* brother_card_id); - -DCMIDLLEXPORT int dcmi_get_device_boot_status(int card_id, int device_id, enum dcmi_boot_status *boot_status); - -DCMIDLLEXPORT int dcmi_subscribe_fault_event(int card_id, int device_id, struct dcmi_event_filter filter); - -DCMIDLLEXPORT int dcmi_get_npu_work_mode(int card_id, unsigned char *work_mode); - -DCMIDLLEXPORT int dcmi_get_device_die_v2( - int card_id, int device_id, enum dcmi_die_type input_type, struct dcmi_die_id *die_id); - -DCMIDLLEXPORT int dcmi_get_device_resource_info (int card_id, int device_id, struct dcmi_proc_mem_info *proc_info, - int *proc_num); - -DCMIDLLEXPORT int dcmi_get_device_board_info (int card_id, int device_id, struct dcmi_board_info *board_info); - -DCMIDLLEXPORT int dcmi_get_pcie_link_bandwidth_info(int card_id, int device_id, - struct dcmi_pcie_link_bandwidth_info *pcie_link_bandwidth_info); - -DCMIDLLEXPORT int dcmi_get_dcmi_version (char *dcmi_ver, int buf_size); - -DCMIDLLEXPORT int dcmi_get_mainboard_id (int card_id, int device_id, unsigned int *mainboard_id); - -DCMIDLLEXPORT int dcmi_get_hccs_link_bandwidth_info (int card_id, int device_id, struct dcmi_hccs_bandwidth_info *hccs_bandwidth_info); - -DCMIDLLEXPORT int dcmi_start_hccsping_mesh(int card_id, int device_id, int port_id, struct dcmi_hccsping_mesh_operate *hccsping_mesh); - -DCMIDLLEXPORT int dcmi_stop_hccsping_mesh(int card_id, int device_id, int port_id, unsigned int task_id); - -DCMIDLLEXPORT int dcmi_get_hccsping_mesh_info(int card_id, int device_id, int port_id, unsigned int task_id, struct dcmi_hccsping_mesh_info *hccsping_mesh_reply); - -DCMIDLLEXPORT int dcmi_get_hccsping_mesh_state(int card_id, int device_id, int port_id, unsigned int 
task_id, unsigned int *state); - -DCMIDLLEXPORT int dcmi_get_spod_node_status(int card_id, int device_id, unsigned int sdid, unsigned int *status); - -DCMIDLLEXPORT int dcmi_set_spod_node_status(int card_id, int device_id, unsigned int sdid, unsigned int status); - -#endif - -#if defined DCMI_VERSION_1 -/* The following interfaces are V1 version interfaces. In order to ensure the compatibility is temporarily reserved, - * the later version will be deleted. Please switch to the V2 version interface as soon as possible */ - -struct dcmi_memory_info_stru { - unsigned long long memory_size; - unsigned int freq; - unsigned int utiliza; -}; - -DCMIDLLEXPORT int dcmi_get_memory_info(int card_id, int device_id, struct dcmi_memory_info_stru *device_memory_info); - -DCMIDLLEXPORT int dcmi_get_device_errorcode( - int card_id, int device_id, int *error_count, unsigned int *error_code, int *error_width); - -DCMIDLLEXPORT int dcmi_mcu_get_power_info(int card_id, int *power); - -DCMIDLLEXPORT int dcmi_get_card_elabel_v2(int card_id, struct dcmi_elabel_info *elabel_info); -#endif - -#ifdef __cplusplus -#if __cplusplus -} -#endif -#endif /* __cplusplus */ - -#endif /* __DCMI_INTERFACE_API_H__ */ diff --git a/mind-cluster/component/ascend-common/devmanager/devmanager.go b/mind-cluster/component/ascend-common/devmanager/devmanager.go deleted file mode 100644 index fe21931..0000000 --- a/mind-cluster/component/ascend-common/devmanager/devmanager.go +++ /dev/null @@ -1,1197 +0,0 @@ -/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package devmanager this for device driver manager -package devmanager - -import ( - "errors" - "fmt" - "math" - "strings" - "sync" - "time" - - "ascend-common/api" - "ascend-common/common-utils/hwlog" - "ascend-common/devmanager/common" - "ascend-common/devmanager/dcmi" -) - -// DeviceInterface for common device interface -type DeviceInterface interface { - Init() error - ShutDown() error - GetDcmiVersion() string - GetDeviceCount() (int32, error) - GetCardList() (int32, []int32, error) - GetDeviceNumInCard(cardID int32) (int32, error) - GetDeviceList() (int32, []int32, error) - GetChipBaseInfos() ([]*common.ChipBaseInfo, error) - GetDeviceHealth(logicID int32) (uint32, error) - GetDeviceNetWorkHealth(logicID int32) (uint32, error) - GetDeviceUtilizationRate(logicID int32, deviceType common.DeviceType) (uint32, error) - GetDeviceTemperature(logicID int32) (int32, error) - GetDeviceVoltage(logicID int32) (float32, error) - GetDevicePowerInfo(logicID int32) (float32, error) - GetMcuPowerInfo(cardID int32) (float32, error) - GetDeviceFrequency(logicID int32, deviceType common.DeviceType) (uint32, error) - GetDeviceMemoryInfo(logicID int32) (*common.MemoryInfo, error) - GetDeviceHbmInfo(logicID int32) (*common.HbmInfo, error) - GetDeviceErrorCode(logicID int32) (int32, int64, error) - GetChipInfo(logicID int32) (*common.ChipInfo, error) - GetPhysicIDFromLogicID(logicID int32) (int32, error) - GetLogicIDFromPhysicID(physicID int32) (int32, error) - GetDeviceLogicID(cardID, deviceID int32) (int32, error) - GetCardIDDeviceID(logicID int32) (int32, int32, error) - 
GetDeviceIPAddress(logicID, ipType int32) (string, error) - CreateVirtualDevice(logicID int32, vDevInfo common.CgoCreateVDevRes) (common.CgoCreateVDevOut, error) - GetVirtualDeviceInfo(logicID int32) (common.VirtualDevInfo, error) - DestroyVirtualDevice(logicID int32, vDevID uint32) error - GetDevType() string - GetProductTypeArray() []string - GetProductType(cardID, deviceID int32) (string, error) - GetAllProductType() ([]string, error) - GetNpuWorkMode() string - SetDeviceReset(cardID, deviceID int32) error - GetBrotherCardID(int32, int32) (int32, error) - PreResetSoc(int32, int32) error - GetOutBandChannelState(int32, int32) error - SetDeviceResetOutBand(int32, int32) error - RescanSoc(int32, int32) error - GetDeviceBootStatus(logicID int32) (int, error) - GetDeviceAllErrorCode(logicID int32) (int32, []int64, error) - SubscribeDeviceFaultEvent(logicID int32) error - SetFaultEventCallFunc(func(common.DevFaultInfo)) error - GetDieID(logicID int32, dcmiDieType dcmi.DieType) (string, error) - GetDevProcessInfo(logicID int32) (*common.DevProcessInfo, error) - GetPCIeBusInfo(logicID int32) (string, error) - GetBoardInfo(logicID int32) (common.BoardInfo, error) - GetCardElabelV2(cardID int32) (common.ElabelInfo, error) - GetPCIEBandwidth(logicID int32, profilingTime int) (common.PCIEBwStat, error) - SetIsTrainingCard() error - IsTrainingCard() bool - GetValidChipInfo() (common.ChipInfo, error) - GetDeviceEccInfo(logicID int32, dcmiDeviceType common.DcmiDeviceType) (*common.ECCInfo, error) - GetSuperPodInfo(int32) (common.CgoSuperPodInfo, error) - GetSioInfo(logicID int32) (*common.SioCrcErrStatisticInfo, error) - GetHccsStatisticInfo(logicID int32) (*common.HccsStatisticInfo, error) - GetHccsStatisticInfoInU64(logicID int32) (*common.HccsStatisticInfo, error) - GetMainBoardId() uint32 - GetHccsBandwidthInfo(logicID int32) (*common.HccsBandwidthInfo, error) - - DcStartHccsPingMesh(int32, int32, int, common.HccspingMeshOperate) error - DcStopHccsPingMesh(int32, int32, 
int, uint) error - DcGetHccsPingMeshInfo(int32, int32, int, uint) (*common.HccspingMeshInfo, error) - DcGetHccsPingMeshState(int32, int32, int, uint) (int, error) - DcGetSuperPodStatus(int32, int32, uint32) (int, error) - DcSetSuperPodStatus(int32, int32, uint32, uint32) error -} - -const ( - // init dcmi interface max retry times - maxRetries = 6 - // init dcmi interface retry delay - defaultRetryDelay = 10 -) - -var ( - devManager *DeviceManager = nil - devManagerOnce sync.Once - idCache sync.Map -) - -// npuIdMapping the mapping between the three IDs -type npuIdMapping struct { - logicId int32 - cardId int32 - deviceId int32 -} - -// GetDeviceManager singleton to init global device manager and init dcmi interface -func GetDeviceManager(resetTimeout int) (*DeviceManager, error) { - devManagerOnce.Do(func() { - // a common dcmi Manager is initiated for init dcmi interface, you can specify an specific manager in later - dcMgr := dcmi.DcManager{} - var retryDelay time.Duration = defaultRetryDelay - hwlog.RunLog.Infof("get card list from dcmi reset timeout is %d", resetTimeout) - for currentTime, retryCount := 0, 0; currentTime <= resetTimeout; currentTime += int(retryDelay) { - if err := dcMgr.DcInit(); err != nil { - hwlog.RunLog.Errorf("deviceManager init failed, prepare dcmi failed, err: %v", err) - return - } - cardNum, cardList, err := dcMgr.DcGetCardList() - if err == nil && int(cardNum) == len(cardList) { - hwlog.RunLog.Infof("deviceManager get cardList is %v, cardList length equal to cardNum: %v", - cardList, cardNum) - break - } - if diffTime := float64(resetTimeout - currentTime); diffTime > 0 { - retryDelay = time.Duration(math.Min(float64(defaultRetryDelay), diffTime)) - } - retryCount++ - hwlog.RunLog.Warnf("deviceManager get card list failed (attempt %d), cardNum=%d, cardList=%v, "+ - "err: %v", retryCount, cardNum, cardList, err) - if currentTime+int(retryDelay) <= resetTimeout { - if err = dcMgr.DcShutDown(); err != nil { - 
hwlog.RunLog.Errorf("deviceManager shut down failed, err: %v", err) - return - } - time.Sleep(retryDelay * time.Second) - continue - } - if int(cardNum) != len(cardList) { - hwlog.RunLog.Warnf("deviceManager get cardList is %v, but cardNum is %v, "+ - "please check whether the real number of npu matches the cardList", cardList, cardNum) - } - } - devManager = &DeviceManager{} - devManager.DcMgr = &dcMgr - dcmiVer, err := dcMgr.DcGetDcmiVersion() - if err != nil { - hwlog.RunLog.Warnf("deviceManager get dcmi version failed, err: %v", err) - } - hwlog.RunLog.Infof("the dcmi version is %s", dcmiVer) - devManager.dcmiVersion = dcmiVer - }) - if devManager == nil { - return nil, errors.New("device Manager is nil, may encounter an exception during initialization. " + - "You can check the system log to confirm") - } - return devManager, nil -} - -// DeviceManager common device manager for Ascend910/310P/310 -type DeviceManager struct { - // DcMgr for common dev manager - DcMgr dcmi.DcDriverInterface - // DevType the value is the same as the device type corresponding to the DcMgr variable. 
- // Options: api.Ascend310,api.Ascend310P,api.Ascend910 - DevType string - // ProductTypes product type in server, multi type will be in 310P mix scene - ProductTypes []string - // isTrainingCard whether the device is used for training - isTrainingCard bool - dcmiVersion string - // mainBoardId used to distinguish between A900A3SuperPod and A9000A3SuperPod - mainBoardId uint32 -} - -// GetProductTypeArray return product types -func (d *DeviceManager) GetProductTypeArray() []string { - return d.ProductTypes -} - -// GetDevType return dev type -func (d *DeviceManager) GetDevType() string { - return d.DevType -} - -// AutoInit auto detect npu chip type and return the corresponding processing object -func AutoInit(dType string, resetTimeout int) (*DeviceManager, error) { - chipInfo, boardInfo, err := getDeviceInfoForInit(resetTimeout) - if err != nil { - return nil, fmt.Errorf("auto init failed, err: %s", err) - } - var devMgr *DeviceManager - if devMgr, err = GetDeviceManager(resetTimeout); err != nil || devMgr == nil { - return nil, err - } - mainBoardId, err := getValidMainBoardInfo(devMgr.DcMgr) - if err != nil { - // Non-blocking when the main board ID is not found - hwlog.RunLog.Warn(err) - } - devMgr.mainBoardId = mainBoardId - var devType = common.GetDevType(chipInfo.Name, boardInfo.BoardId) - - switch devType { - case api.Ascend910A, api.Ascend910B, api.Ascend910A3: - devMgr.DcMgr = &A910Manager{} - case api.Ascend310P: - devMgr.DcMgr = &A310PManager{} - case api.Ascend310, api.Ascend310B: - devMgr.DcMgr = &A310Manager{} - default: - return nil, fmt.Errorf("unsupport device type (%s)", devType) - } - hwlog.RunLog.Infof("chipName: %v, devType: %v", chipInfo.Name, devType) - if dType != "" && devType != dType { - return nil, fmt.Errorf("the value of dType(%s) is inconsistent with the actual chip type(%s)", - dType, devType) - } - devMgr.DevType = devType - if err := devMgr.SetIsTrainingCard(); err != nil { - hwlog.RunLog.Errorf("auto recognize training card 
failed, err: %s", err) - } - - pTypes, err := devMgr.GetAllProductType() - if err != nil { - hwlog.RunLog.Debugf("auto init product types failed, err: %s", err) - } - devMgr.ProductTypes = pTypes - return devMgr, nil -} - -func getDeviceInfoForInit(resetTimeout int) (common.ChipInfo, common.BoardInfo, error) { - var mgr *DeviceManager - var err error - if mgr, err = GetDeviceManager(resetTimeout); err != nil || mgr == nil { - return common.ChipInfo{}, common.BoardInfo{}, fmt.Errorf("get chip info failed, err: %v", err) - } - dcMgr := mgr.DcMgr - chipInfo, err := getValidChipInfo(dcMgr) - if err != nil { - hwlog.RunLog.Error(err) - return common.ChipInfo{}, common.BoardInfo{}, err - } - boardInfo, err := getValidBoardInfo(dcMgr) - if err != nil { - hwlog.RunLog.Error(err) - return chipInfo, common.BoardInfo{}, err - } - - return chipInfo, boardInfo, nil -} - -func getValidChipInfo(dcMgr dcmi.DcDriverInterface) (common.ChipInfo, error) { - // get card list - cardNum, cardList, err := dcMgr.DcGetCardList() - if err != nil { - hwlog.RunLog.Error(err) - return common.ChipInfo{}, fmt.Errorf(common.ErrMsgInitCardListFailed) - } - if cardNum == 0 { - return common.ChipInfo{}, fmt.Errorf("get chip info failed, no card found") - } - // get device in card, then get chip info by cardID and deviceID - for _, cardID := range cardList { - devNum, err := dcMgr.DcGetDeviceNumInCard(cardID) - if err != nil || devNum == 0 { - hwlog.RunLog.Debugf("get device num by cardID(%d) failed, error: %v", cardID, err) - continue - } - for devID := int32(0); devID < devNum; devID++ { - chipInfo, err := dcMgr.DcGetChipInfo(cardID, devID) - if err != nil { - hwlog.RunLog.Debugf("get chip info failed by cardID(%d), deviceID(%d), error: %v", cardID, devID, - err) - continue - } - if !common.IsValidChipInfo(chipInfo) { - hwlog.RunLog.Debugf("invalid chip info by cardID(%d), deviceID(%d), error: %v", cardID, devID, - err) - continue - } - return *chipInfo, nil - } - } - return common.ChipInfo{}, 
errors.New("cannot get valid chip info") -} - -func getValidBoardInfo(dcMgr dcmi.DcDriverInterface) (common.BoardInfo, error) { - // get card list - cardNum, cardList, err := dcMgr.DcGetCardList() - if err != nil { - hwlog.RunLog.Error(err) - return common.BoardInfo{}, fmt.Errorf(common.ErrMsgInitCardListFailed) - } - if cardNum == 0 { - return common.BoardInfo{}, fmt.Errorf(common.ErrMsgGetBoardInfoFailed) - } - // get device in card, then get board info by cardID and deviceID - for _, cardID := range cardList { - devNum, err := dcMgr.DcGetDeviceNumInCard(cardID) - if err != nil || devNum == 0 { - hwlog.RunLog.Debugf("get device num by cardID %d failed, error is: %v", cardID, err) - continue - } - for devID := int32(0); devID < devNum; devID++ { - boardInfo, err := dcMgr.DcGetDeviceBoardInfo(cardID, devID) - if err != nil { - hwlog.RunLog.Debugf("get board info failed by cardID(%d), deviceID(%d), error: %v", cardID, devID, - err) - continue - } - if !common.IsValidBoardInfo(&boardInfo) { - hwlog.RunLog.Debugf("invalid board info by cardID(%d), deviceID(%d), error: %v", cardID, devID, - err) - continue - } - return boardInfo, nil - } - } - return common.BoardInfo{}, errors.New("cannot get valid board info") -} -func getValidMainBoardInfo(dcMgr dcmi.DcDriverInterface) (uint32, error) { - // get card list - cardNum, cardList, err := dcMgr.DcGetCardList() - if err != nil { - hwlog.RunLog.Error(err) - return 0, fmt.Errorf(common.ErrMsgInitCardListFailed) - } - if cardNum == 0 { - return 0, fmt.Errorf(common.ErrMsgGetBoardInfoFailed) - } - // get device in card, then get board info by cardID and deviceID - for _, cardID := range cardList { - devNum, err := dcMgr.DcGetDeviceNumInCard(cardID) - if err != nil || devNum == 0 { - hwlog.RunLog.Debugf("get device num by cardID %d failed, error is: %v", cardID, err) - continue - } - for devID := int32(0); devID < devNum; devID++ { - mainBoardId, err := dcMgr.DcGetDeviceMainBoardInfo(cardID, devID) - if err != nil { - 
hwlog.RunLog.Debug(err) - continue - } - if !common.IsValidMainBoardInfo(mainBoardId) { - hwlog.RunLog.Warnf("invalid mainBoardId info by cardID(%d), deviceID(%d), error: %v", cardID, devID, err) - continue - } - return mainBoardId, nil - } - } - return 0, errors.New("cannot get main board id") -} - -// Init load symbol and initialize dcmi -func (d *DeviceManager) Init() error { - return d.DcMgr.DcInit() -} - -// ShutDown clean the dynamically loaded resource -func (d *DeviceManager) ShutDown() error { - return d.DcMgr.DcShutDown() -} - -// GetDeviceCount get npu device count -func (d *DeviceManager) GetDeviceCount() (int32, error) { - return d.DcMgr.DcGetDeviceCount() -} - -// GetCardList get all card list -func (d *DeviceManager) GetCardList() (int32, []int32, error) { - return d.DcMgr.DcGetCardList() -} - -// GetDeviceNumInCard get all device list in one card -func (d *DeviceManager) GetDeviceNumInCard(cardID int32) (int32, error) { - return d.DcMgr.DcGetDeviceNumInCard(cardID) -} - -// GetDeviceList get all device logicID list -func (d *DeviceManager) GetDeviceList() (int32, []int32, error) { - return d.DcMgr.DcGetLogicIDList() -} - -// GetDeviceHealth query npu device health status -func (d *DeviceManager) GetDeviceHealth(logicID int32) (uint32, error) { - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return common.UnRetError, fmt.Errorf("failed to get health code by logicID(%d)", logicID) - } - healthCode, err := d.DcMgr.DcGetDeviceHealth(cardID, deviceID) - if err != nil { - hwlog.RunLog.Error(err) - return common.UnRetError, err - } - - return uint32(healthCode), nil -} - -// GetDeviceNetWorkHealth query npu device network health status -func (d *DeviceManager) GetDeviceNetWorkHealth(logicID int32) (uint32, error) { - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return common.UnRetError, fmt.Errorf("failed to get network health code by 
logicID(%d)", logicID) - } - healthCode, err := d.DcMgr.DcGetDeviceNetWorkHealth(cardID, deviceID) - if err != nil { - hwlog.RunLog.Error(err) - return common.UnRetError, err - } - - return healthCode, nil -} - -// GetDeviceUtilizationRate get npu device utilization -func (d *DeviceManager) GetDeviceUtilizationRate(logicID int32, deviceType common.DeviceType) (uint32, error) { - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return common.UnRetError, fmt.Errorf("failed to get utilization by logicID(%d)", logicID) - } - rate, err := d.DcMgr.DcGetDeviceUtilizationRate(cardID, deviceID, deviceType) - if err != nil { - return common.UnRetError, err - } - - return uint32(rate), nil -} - -// GetDeviceTemperature get npu device temperature -func (d *DeviceManager) GetDeviceTemperature(logicID int32) (int32, error) { - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return common.RetError, fmt.Errorf("failed to get temperature by logicID(%d)", logicID) - } - temp, err := d.DcMgr.DcGetDeviceTemperature(cardID, deviceID) - if err != nil { - hwlog.RunLog.Error(err) - return common.RetError, fmt.Errorf("failed to get temperature by logicID(%d)", logicID) - } - - return temp, nil -} - -// GetDeviceVoltage get npu device voltage -func (d *DeviceManager) GetDeviceVoltage(logicID int32) (float32, error) { - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return common.UnRetError, fmt.Errorf("failed to get voltage by logicID(%d)", logicID) - } - voltage, err := d.DcMgr.DcGetDeviceVoltage(cardID, deviceID) - if err != nil { - hwlog.RunLog.Error(err) - return common.UnRetError, fmt.Errorf("failed to get voltage by logicID(%d)", logicID) - } - - return voltage, nil -} - -// GetDevicePowerInfo get npu device power info -func (d *DeviceManager) GetDevicePowerInfo(logicID int32) (float32, error) { - cardID, deviceID, err := 
d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return common.UnRetError, fmt.Errorf("failed to get power by logicID(%d)", logicID) - } - power, err := d.DcMgr.DcGetDevicePowerInfo(cardID, deviceID) - if err != nil { - hwlog.RunLog.Error(err) - return common.UnRetError, fmt.Errorf("failed to get power by logicID(%d)", logicID) - } - - return power, nil -} - -// GetDeviceFrequency get npu device work frequency -func (d *DeviceManager) GetDeviceFrequency(logicID int32, deviceType common.DeviceType) (uint32, error) { - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return common.UnRetError, fmt.Errorf("failed to get frequency by logicID(%d)", logicID) - } - frequency, err := d.DcMgr.DcGetDeviceFrequency(cardID, deviceID, deviceType) - if err != nil { - hwlog.RunLog.Error(err) - return common.UnRetError, fmt.Errorf("failed to get frequency by logicID(%d)", logicID) - } - - return frequency, nil -} - -// GetDeviceMemoryInfo get npu memory information -func (d *DeviceManager) GetDeviceMemoryInfo(logicID int32) (*common.MemoryInfo, error) { - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return nil, fmt.Errorf("failed to get memory info by logicID(%d)", logicID) - } - - // 910B and 910A3 don't have DDR module. Therefore, DDR information cannot be queried. - if d.DevType == api.Ascend910B || d.DevType == api.Ascend910A3 { - hwlog.RunLog.Debugf("%v doesn't have DDR module. 
Therefore, DDR information cannot be queried", d.DevType) - return nil, nil - } - - memInfo, err := d.DcMgr.DcGetMemoryInfo(cardID, deviceID) - if err != nil { - hwlog.RunLog.Error(err) - return nil, fmt.Errorf("failed to get memory info by logicID(%d)", logicID) - } - - return memInfo, nil -} - -// GetDeviceHbmInfo get npu HBM module memory and frequency information -func (d *DeviceManager) GetDeviceHbmInfo(logicID int32) (*common.HbmInfo, error) { - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return nil, fmt.Errorf("failed to get hbm info by logicID(%d)", logicID) - } - hbmInfo, err := d.DcMgr.DcGetHbmInfo(cardID, deviceID) - if err != nil { - return nil, err - } - - return hbmInfo, nil -} - -// GetDeviceErrorCode get npu device error code -func (d *DeviceManager) GetDeviceErrorCode(logicID int32) (int32, int64, error) { - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return common.RetError, common.RetError, fmt.Errorf("failed to get device error code by logicID(%d)", - logicID) - } - errCount, errCode, err := d.DcMgr.DcGetDeviceErrorCode(cardID, deviceID) - if err != nil { - hwlog.RunLog.Error(err) - return common.RetError, common.RetError, fmt.Errorf("failed to get device error code by logicID(%d)", - logicID) - } - - return errCount, errCode, nil -} - -// GetChipInfo get npu device error code -func (d *DeviceManager) GetChipInfo(logicID int32) (*common.ChipInfo, error) { - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return nil, fmt.Errorf("failed to get cardID and deviceID by logicID(%d), error: %v", logicID, err) - } - chipInfo, err := d.DcMgr.DcGetChipInfo(cardID, deviceID) - if err != nil { - hwlog.RunLog.Error(err) - return nil, fmt.Errorf("failed to get chip info code by logicID(%d)", logicID) - } - - return chipInfo, nil -} - -// GetPhysicIDFromLogicID get device physic id from 
logic id -func (d *DeviceManager) GetPhysicIDFromLogicID(logicID int32) (int32, error) { - physicID, err := d.DcMgr.DcGetPhysicIDFromLogicID(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return common.RetError, fmt.Errorf("failed to get physicID by logicID(%d)", logicID) - } - - return physicID, nil -} - -// GetLogicIDFromPhysicID get device logic id from physic id -func (d *DeviceManager) GetLogicIDFromPhysicID(physicID int32) (int32, error) { - logicID, err := d.DcMgr.DcGetLogicIDFromPhysicID(physicID) - if err != nil { - hwlog.RunLog.Error(err) - return common.RetError, fmt.Errorf("failed to get logicID by physicID(%d)", physicID) - } - - return logicID, nil -} - -// GetDeviceLogicID get device logic id from card id and device id -func (d *DeviceManager) GetDeviceLogicID(cardID, deviceID int32) (int32, error) { - return d.DcMgr.DcGetDeviceLogicID(cardID, deviceID) -} - -// GetDeviceIPAddress get device ip address -func (d *DeviceManager) GetDeviceIPAddress(logicID, ipType int32) (string, error) { - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - return "", fmt.Errorf("failed to get cardID and deviceID by logicID(%d), %w", logicID, err) - } - return d.DcMgr.DcGetDeviceIPAddress(cardID, deviceID, ipType) -} - -// CreateVirtualDevice create virtual device -func (d *DeviceManager) CreateVirtualDevice( - logicID int32, vDevInfo common.CgoCreateVDevRes) (common.CgoCreateVDevOut, error) { - if !common.IsValidTemplateName(d.DevType, vDevInfo.TemplateName) { - return common.CgoCreateVDevOut{}, fmt.Errorf("input invalid template name: %s", vDevInfo.TemplateName) - } - return d.DcMgr.DcCreateVDevice(logicID, vDevInfo) -} - -// GetVirtualDeviceInfo get virtual device info -func (d *DeviceManager) GetVirtualDeviceInfo(logicID int32) (common.VirtualDevInfo, error) { - cgoVDevInfo, err := d.DcMgr.DcGetVDeviceInfo(logicID) - if err != nil { - hwlog.RunLog.Debug(err) - return common.VirtualDevInfo{}, fmt.Errorf("get virtual device info 
failed, error is: %v "+ - "and vdev num is: %d", err, int32(cgoVDevInfo.TotalResource.VDevNum)) - } - for _, vDevInfo := range cgoVDevInfo.VDevInfo { - if !common.IsValidTemplateName(d.DevType, vDevInfo.QueryInfo.Name) { - return common.VirtualDevInfo{}, fmt.Errorf("vdevice id %d, it's template name is invalid: %s", - vDevInfo.VDevID, vDevInfo.QueryInfo.Name) - } - } - return cgoVDevInfo, nil -} - -// DestroyVirtualDevice destroy virtual device -func (d *DeviceManager) DestroyVirtualDevice(logicID int32, vDevID uint32) error { - return d.DcMgr.DcDestroyVDevice(logicID, vDevID) -} - -// GetMcuPowerInfo get mcu power info for cardID -func (d *DeviceManager) GetMcuPowerInfo(cardID int32) (float32, error) { - return d.DcMgr.DcGetMcuPowerInfo(cardID) -} - -// GetCardIDDeviceID get cardID and deviceID by logicID -func (d *DeviceManager) GetCardIDDeviceID(logicID int32) (int32, int32, error) { - return d.getCardIdAndDeviceId(logicID) -} - -// GetProductType get product type by cardID and deviceID -func (d *DeviceManager) GetProductType(cardID, deviceID int32) (string, error) { - return d.DcMgr.DcGetProductType(cardID, deviceID) -} - -// GetAllProductType get all product type -func (d *DeviceManager) GetAllProductType() ([]string, error) { - productTypes := make([]string, 0) - cardNum, cardList, err := d.GetCardList() - if err != nil || cardNum == 0 { - hwlog.RunLog.Errorf("failed to get card list, err: %v", err) - return productTypes, err - } - for _, cardID := range cardList { - devNum, err := d.GetDeviceNumInCard(cardID) - if err != nil { - hwlog.RunLog.Debugf("get device num by cardID(%d) failed, error: %v", cardID, err) - continue - } - if devNum == 0 { - hwlog.RunLog.Debugf("not found device on card %d", cardID) - continue - } - for devID := int32(0); devID < devNum; devID++ { - productType, err := d.GetProductType(cardID, devID) - if err != nil { - hwlog.RunLog.Debugf("get product type by card %d deviceID %d failed, err: %v", cardID, devID, err) - continue - } - 
productTypes = append(productTypes, productType) - break - } - } - if len(productTypes) != 0 { - productTypes = common.RemoveDuplicate(&productTypes) - } - return productTypes, nil -} - -// GetNpuWorkMode get work mode of NPU -func (d *DeviceManager) GetNpuWorkMode() string { - if d.DevType == api.Ascend910B || d.DevType == api.Ascend910A3 { - hwlog.RunLog.Warnf("only AMP mode is available on %s", d.DevType) - return common.AMPMode - } - - _, cardList, err := d.DcMgr.DcGetCardList() - if err != nil { - hwlog.RunLog.Error(err) - return "" - } - if len(cardList) > 0 { - mode, err := d.DcMgr.DcGetNpuWorkMode(cardList[0]) - if err != nil { - hwlog.RunLog.Error(err) - return "" - } - if mode == 0 { - return common.AMPMode - } - return common.SMPMode - } - return "" -} - -// SetDeviceReset reset spec device -func (d *DeviceManager) SetDeviceReset(cardID, deviceID int32) error { - return d.DcMgr.DcSetDeviceReset(cardID, deviceID) -} - -// GetBrotherCardID get brother card id -func (d *DeviceManager) GetBrotherCardID(cardID, deviceID int32) (int32, error) { - return d.DcMgr.DcGetBrotherCardID(cardID, deviceID) -} - -// GetOutBandChannelState get out band channel state -func (d *DeviceManager) GetOutBandChannelState(cardID, deviceID int32) error { - return d.DcMgr.DcGetOutBandChannelState(cardID, deviceID) -} - -// PreResetSoc pre reset soc, used before reset out band -func (d *DeviceManager) PreResetSoc(cardID, deviceID int32) error { - return d.DcMgr.DcPreResetSoc(cardID, deviceID) -} - -// SetDeviceResetOutBand reset spec device out band -func (d *DeviceManager) SetDeviceResetOutBand(cardID, deviceID int32) error { - return d.DcMgr.DcSetDeviceResetOutBand(cardID, deviceID) -} - -// RescanSoc trigger soc rescan, non-blocking -func (d *DeviceManager) RescanSoc(cardID, deviceID int32) error { - return d.DcMgr.DcRescanSoc(cardID, deviceID) -} - -// GetDeviceBootStatus get device boot status -func (d *DeviceManager) GetDeviceBootStatus(logicID int32) (int, error) { - return 
d.DcMgr.DcGetDeviceBootStatus(logicID) -} - -// GetDeviceAllErrorCode get npu device all error code -func (d *DeviceManager) GetDeviceAllErrorCode(logicID int32) (int32, []int64, error) { - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return common.RetError, nil, fmt.Errorf("failed to get cardID in get device error code by logicID(%d)", - logicID) - } - errCount, errCodes, err := d.DcMgr.DcGetDeviceAllErrorCode(cardID, deviceID) - if err != nil { - hwlog.RunLog.Error(err) - return common.RetError, nil, fmt.Errorf("failed to get device error code by logicID(%d)", logicID) - } - return errCount, errCodes, nil -} - -// SubscribeDeviceFaultEvent get npu device error code by subscribe -func (d *DeviceManager) SubscribeDeviceFaultEvent(logicID int32) error { - var cardID, deviceID int32 - if logicID == common.SubscribeAllDevice { - cardID = common.SubscribeAllDevice - deviceID = common.SubscribeAllDevice - } else { - var err error - cardID, deviceID, err = d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return fmt.Errorf("failed to get cardID in subscribe device error code by logicID(%d)", logicID) - } - } - if err := d.DcMgr.DcSubscribeDeviceFaultEvent(cardID, deviceID); err != nil { - hwlog.RunLog.Error(err) - return fmt.Errorf("failed to subscribe device error code by logicID(%d)", logicID) - } - return nil -} - -// SetFaultEventCallFunc set fault event call func -func (d *DeviceManager) SetFaultEventCallFunc(businessFunc func(common.DevFaultInfo)) error { - if businessFunc == nil { - return errors.New("business func can't be nil") - } - d.DcMgr.DcSetFaultEventCallFunc(businessFunc) - return nil -} - -// GetDieID return die id by dcmi die type, vdie id or ndie id -func (d *DeviceManager) GetDieID(logicID int32, dcmiDieType dcmi.DieType) (string, error) { - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return "", 
fmt.Errorf("failed to get cardID in get device error code by logicID(%d)", logicID) - } - - return d.DcMgr.DcGetDieID(cardID, deviceID, dcmiDieType) -} - -// GetDevProcessInfo get process and process memory in device side -func (d *DeviceManager) GetDevProcessInfo(logicID int32) (*common.DevProcessInfo, error) { - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return nil, fmt.Errorf("failed to get cardID in get device error code by logicID(%d)", logicID) - } - - return d.DcMgr.DcGetDevProcessInfo(cardID, deviceID) -} - -// GetPCIeBusInfo pcie bus info -func (d *DeviceManager) GetPCIeBusInfo(logicID int32) (string, error) { - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return "", fmt.Errorf("failed to get cardID in get device error code by logicID(%d)", logicID) - } - - return d.DcMgr.DcGetPCIeBusInfo(cardID, deviceID) -} - -// GetBoardInfo return board info of device -func (d *DeviceManager) GetBoardInfo(logicID int32) (common.BoardInfo, error) { - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return common.BoardInfo{}, fmt.Errorf("failed to get cardID in "+ - "get device error code by logicID(%d)", logicID) - } - - return d.DcMgr.DcGetDeviceBoardInfo(cardID, deviceID) -} - -// GetCardElabelV2 get card elabel information -func (d *DeviceManager) GetCardElabelV2(cardID int32) (common.ElabelInfo, error) { - return d.DcMgr.DcGetCardElabelV2(cardID) -} - -// GetPCIEBandwidth get pcie bandwidth -func (d *DeviceManager) GetPCIEBandwidth(logicID int32, profilingTime int) (common.PCIEBwStat, error) { - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Error(err) - return common.PCIEBwStat{}, fmt.Errorf("get cardID(deviceID) failed, error by logicID(%d)", logicID) - } - pciePCIEBw, err := d.DcMgr.DcGetPCIEBandwidth(cardID, deviceID, profilingTime) - if err != nil { 
- return common.PCIEBwStat{}, err - } - return pciePCIEBw, nil -} - -// SetIsTrainingCard identifies whether it is a training card according to the usage of card -func (d *DeviceManager) SetIsTrainingCard() error { - devType := d.GetDevType() - if strings.HasPrefix(devType, api.Ascend310) { - d.isTrainingCard = false - return nil - } - - boardInfo := common.BoardInfo{} - cardNum, cardList, err := d.GetCardList() - if err != nil || cardNum == 0 { - hwlog.RunLog.Errorf("failed to get card list when set 'IsTrainingCard' err: %v", err) - return err - } - for _, cardID := range cardList { - devNum, err := d.GetDeviceNumInCard(cardID) - if err != nil { - hwlog.RunLog.Warnf("get device num by cardID(%d) failed when set 'IsTrainingCard', error: %v", cardID, err) - continue - } - if devNum == 0 { - hwlog.RunLog.Warnf("not found device on card %d when set 'IsTrainingCard'", cardID) - continue - } - - for devID := int32(0); devID < devNum; devID++ { - boardInfo, err = d.DcMgr.DcGetDeviceBoardInfo(cardID, devID) - if err != nil { - hwlog.RunLog.Warnf("get board info by card %d deviceID %d failed, err: %v", cardID, devID, err) - continue - } - break - } - if err == nil { - break - } - } - - if devType == api.Ascend910B && - (boardInfo.BoardId == common.A300IA2BoardId || boardInfo.BoardId == common.A300IA2GB64BoardId) { - d.isTrainingCard = false - return nil - } - - d.isTrainingCard = true - return nil -} - -// IsTrainingCard return true if it is a training card -func (d *DeviceManager) IsTrainingCard() bool { - return d.isTrainingCard -} - -// GetDcmiVersion get dcmi version -func (d *DeviceManager) GetDcmiVersion() string { - return d.dcmiVersion -} - -// GetMainBoardId get mainBoardId -func (d *DeviceManager) GetMainBoardId() uint32 { - return d.mainBoardId -} - -// GetValidChipInfo find a valid chip info from all cards -func (d *DeviceManager) GetValidChipInfo() (common.ChipInfo, error) { - chipInfo, err := getValidChipInfo(d.DcMgr) - if err != nil { - 
hwlog.RunLog.Error("failed to get valid chip info") - return common.ChipInfo{}, err - } - return chipInfo, nil -} - -// GetDeviceEccInfo query device ECC info -func (d *DeviceManager) GetDeviceEccInfo(logicID int32, dcmiDeviceType common.DcmiDeviceType) (*common.ECCInfo, error) { - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - hwlog.RunLog.Errorf("get cardID and deviceID by logicID(%d) failed, error: %v", logicID, err) - return nil, err - } - return d.DcMgr.DcGetDeviceEccInfo(cardID, deviceID, dcmiDeviceType) -} - -// GetSuperPodInfo get 910A3 super pod info -func (d *DeviceManager) GetSuperPodInfo(logicID int32) (common.CgoSuperPodInfo, error) { - if !common.IsValidLogicIDOrPhyID(logicID) { - return common.CgoSuperPodInfo{}, fmt.Errorf("input invalid logicID: %d", logicID) - } - - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - return common.CgoSuperPodInfo{}, fmt.Errorf("failed to get cardID and deviceID by logicID(%d) "+ - "when get super pod info, error: %v", logicID, err) - } - cgoSuperPodInfo, err := d.DcMgr.DcGetSuperPodInfo(cardID, deviceID) - if err != nil { - return common.CgoSuperPodInfo{}, fmt.Errorf("failed to get super pod info by logicID(%d), error: %v", - logicID, err) - } - - return cgoSuperPodInfo, nil -} - -// GetSioInfo get SIO info -func (d *DeviceManager) GetSioInfo(logicID int32) (*common.SioCrcErrStatisticInfo, error) { - if !common.IsValidLogicIDOrPhyID(logicID) { - return nil, fmt.Errorf("input invalid logicID when get sio info: %d", logicID) - } - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - return nil, fmt.Errorf("failed to get cardID and deviceID by logicID(%d) when get sio info , error: %v", logicID, err) - } - cgoSPodSioInfo, err := d.DcMgr.DcGetSioInfo(cardID, deviceID) - if err != nil { - return nil, err - } - - return &cgoSPodSioInfo, nil -} - -// GetHccsStatisticInfo get HCCS statistic info -func (d *DeviceManager) 
GetHccsStatisticInfo(logicID int32) (*common.HccsStatisticInfo, error) { - if !common.IsValidLogicIDOrPhyID(logicID) { - return buildFailedHccsInfo(), fmt.Errorf("input invalid logicID when get hccs statistic info: %d", logicID) - } - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - return buildFailedHccsInfo(), fmt.Errorf("failed to get cardID and deviceID by logicID(%d) "+ - "when get hccs statistic info, error: %v", logicID, err) - } - cgoHccsStatusInfo, err := d.DcMgr.DcGetHccsStatisticInfo(cardID, deviceID) - if err != nil { - return buildFailedHccsInfo(), err - - } - - return &cgoHccsStatusInfo, nil -} - -// GetHccsStatisticInfoInU64 get hccs statistic info in u64 -func (d *DeviceManager) GetHccsStatisticInfoInU64(logicID int32) (*common.HccsStatisticInfo, error) { - if !common.IsValidLogicIDOrPhyID(logicID) { - return buildFailedHccsInfo(), fmt.Errorf("input invalid logicID when get hccs statistic info: %d", logicID) - } - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - return buildFailedHccsInfo(), fmt.Errorf("failed to get cardID and deviceID by logicID(%d) "+ - "when get hccs statistic info, error: %v", logicID, err) - } - cgoHccsStatusInfo, err := d.DcMgr.DcGetHccsStatisticInfoU64(cardID, deviceID) - if err != nil { - return buildFailedHccsInfo(), err - } - return &cgoHccsStatusInfo, nil -} - -// GetHccsBandwidthInfo get hccs bandwidth info -func (d *DeviceManager) GetHccsBandwidthInfo(logicID int32) (*common.HccsBandwidthInfo, error) { - - if !common.IsValidLogicIDOrPhyID(logicID) { - return buildFailedHccsBWInfo(), fmt.Errorf("input invalid logicID when get hccs bandwidth info: %d", logicID) - } - cardID, deviceID, err := d.getCardIdAndDeviceId(logicID) - if err != nil { - return buildFailedHccsBWInfo(), fmt.Errorf("failed to get cardID and deviceID by logicID(%d) "+ - "when get hccs bandwidth info, error: %v", logicID, err) - } - cgoHccsBandwidthInfo, err := 
d.DcMgr.DcGetHccsBandwidthInfo(cardID, deviceID, common.HccsBWProfilingTime) - if err != nil { - return buildFailedHccsBWInfo(), fmt.Errorf("failed to get hccs bandwidth info by cardId(%d) deviceID(%d), error: %v", - cardID, deviceID, err) - } - - return &cgoHccsBandwidthInfo, nil -} - -// buildFailedHccsInfo build failed hccs info -func buildFailedHccsInfo() *common.HccsStatisticInfo { - errorResult := &common.HccsStatisticInfo{ - TxCnt: make([]uint64, 8), - RxCnt: make([]uint64, 8), - CrcErrCnt: make([]uint64, 8), - } - for i := 0; i < 8; i++ { - errorResult.TxCnt[i] = common.FailedValue - errorResult.RxCnt[i] = common.FailedValue - errorResult.CrcErrCnt[i] = common.FailedValue - } - return errorResult -} - -// buildFailedHccsBWInfo build failed hccs bandwidth info -func buildFailedHccsBWInfo() *common.HccsBandwidthInfo { - errorResult := &common.HccsBandwidthInfo{ - ProfilingTime: uint32(common.HccsBWProfilingTime), - TotalTxbw: common.FailedValue, - TotalRxbw: common.FailedValue, - TxBandwidth: make([]float64, 8), - RxBandwidth: make([]float64, 8), - } - for i := 0; i < 8; i++ { - errorResult.TxBandwidth[i] = common.FailedValue - errorResult.RxBandwidth[i] = common.FailedValue - } - return errorResult -} - -func (d *DeviceManager) getCardIdAndDeviceId(logicID int32) (int32, int32, error) { - - if !common.IsValidLogicIDOrPhyID(logicID) { - return common.RetError, common.RetError, fmt.Errorf("input invalid logicID: %d", logicID) - } - - result, ok := idCache.Load(logicID) - if !ok { - return d.doGetCardIDAndDeviceID(logicID) - } - idMapping, ok := result.(npuIdMapping) - if !ok { - idCache.Delete(logicID) - return d.doGetCardIDAndDeviceID(logicID) - } - hwlog.RunLog.Debugf("get cardId and deviceId by logicID(%d) from cache, cardId:%v, deviceId:%v", - logicID, idMapping.cardId, idMapping.deviceId) - return idMapping.cardId, idMapping.deviceId, nil -} - -func (d *DeviceManager) doGetCardIDAndDeviceID(logicID int32) (int32, int32, error) { - cardId, deviceId, err := 
d.DcMgr.DcGetCardIDDeviceID(logicID) - if err != nil { - hwlog.RunLog.ErrorfWithLimit(common.DomainForLogicIdErr, logicID, - "failed to get cardId and deviceId by logicID(%d), error: %v", logicID, err) - return common.RetError, common.RetError, err - } - hwlog.ResetErrCnt(common.DomainForLogicIdErr, logicID) - hwlog.RunLog.Debugf("get cardId and deviceId by logicID(%d) from dcmi, cardId:%v, deviceId:%v", - logicID, cardId, deviceId) - idCache.Store(logicID, npuIdMapping{logicId: logicID, cardId: cardId, deviceId: deviceId}) - return cardId, deviceId, nil -} - -// GetChipBaseInfos get chip base info -func (d *DeviceManager) GetChipBaseInfos() ([]*common.ChipBaseInfo, error) { - _, cardList, err := d.DcMgr.DcGetCardList() - if err != nil { - return nil, fmt.Errorf("get card list failed, error: %v", err) - } - var chips = []*common.ChipBaseInfo{} - for _, cardID := range cardList { - devNumInCard, err := d.DcMgr.DcGetDeviceNumInCard(cardID) - if err != nil { - return nil, fmt.Errorf("get device num by cardID: %d failed, error: %v", - cardID, err) - } - for devID := int32(0); devID < devNumInCard; devID++ { - logicID, err := d.DcMgr.DcGetDeviceLogicID(cardID, devID) - if err != nil { - return nil, fmt.Errorf("get device (cardID: %d, deviceID: %d) logic id "+ - "failed, error: %v", cardID, devID, err) - } - physicID, err := d.DcMgr.DcGetPhysicIDFromLogicID(logicID) - if err != nil { - return nil, fmt.Errorf("get device (cardID: %d, deviceID: %d) physic id "+"failed, error: %v", - cardID, devID, err) - } - hwlog.RunLog.Infof("get chip base info, cardID: %d, deviceID: %d, logicID: %d, physicID: %d", cardID, - devID, logicID, physicID) - chips = append(chips, &common.ChipBaseInfo{ - PhysicID: physicID, - LogicID: logicID, - CardID: cardID, - DeviceID: devID, - }) - } - } - return chips, nil -} - -// DcStartHccsPingMesh start hccs ping mesh -func (d *DeviceManager) DcStartHccsPingMesh(cardID int32, deviceID int32, portID int, - operate common.HccspingMeshOperate) error { - 
return d.DcMgr.DcStartHccsPingMesh(cardID, deviceID, portID, operate) -} - -// DcStopHccsPingMesh stop hccs ping mesh -func (d *DeviceManager) DcStopHccsPingMesh(cardID int32, deviceID int32, portID int, taskID uint) error { - return d.DcMgr.DcStopHccsPingMesh(cardID, deviceID, portID, taskID) -} - -// DcGetHccsPingMeshInfo get hccs ping mesh info -func (d *DeviceManager) DcGetHccsPingMeshInfo(cardID int32, deviceID int32, portID int, - taskID uint) (*common.HccspingMeshInfo, error) { - return d.DcMgr.DcGetHccsPingMeshInfo(cardID, deviceID, portID, taskID) -} - -// DcGetHccsPingMeshState get hccs ping mesh state -func (d *DeviceManager) DcGetHccsPingMeshState(cardID int32, deviceID int32, portID int, taskID uint) (int, error) { - return d.DcMgr.DcGetHccsPingMeshState(cardID, deviceID, portID, taskID) -} - -// DcGetSuperPodStatus get super pod status -func (d *DeviceManager) DcGetSuperPodStatus(cardID int32, deviceID int32, sdid uint32) (int, error) { - var err error - var status int - for i := 0; i < maxRetries; i++ { - if status, err = d.DcMgr.DcGetSuperPodStatus(cardID, deviceID, sdid); err != nil { - hwlog.RunLog.Errorf("get super pod status failed, retry %d, cardID: %d, deviceID: %d, "+ - "sdid: %d, error: %v", i, cardID, deviceID, sdid, err) - continue - } - break - } - return status, err -} - -// DcSetSuperPodStatus set super pod status -func (d *DeviceManager) DcSetSuperPodStatus(cardID int32, deviceID int32, sdid, status uint32) error { - var err error - for i := 0; i < maxRetries; i++ { - if err = d.DcMgr.DcSetSuperPodStatus(cardID, deviceID, sdid, status); err != nil { - hwlog.RunLog.Errorf("set super pod status failed, retry %d, cardID: %d, deviceID: %d, "+ - "sdid: %d, status: %d, error: %v", i, cardID, deviceID, sdid, status, err) - continue - } - break - } - return err -} diff --git a/mind-cluster/component/ascend-common/devmanager/devmanager_910a3_mock.go b/mind-cluster/component/ascend-common/devmanager/devmanager_910a3_mock.go deleted file mode 
100644 index ca7121b..0000000 --- a/mind-cluster/component/ascend-common/devmanager/devmanager_910a3_mock.go +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright(C) 2024. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package devmanager this for device driver manager mock -package devmanager - -import ( - "ascend-common/api" -) - -// DeviceManager910A3Mock common device manager mock for Ascend910A3 -type DeviceManager910A3Mock struct { - DeviceManagerMock -} - -// GetDevType return mock type -func (d *DeviceManager910A3Mock) GetDevType() string { - return api.Ascend910A3 -} diff --git a/mind-cluster/component/ascend-common/devmanager/devmanager_910a3_mock_err.go b/mind-cluster/component/ascend-common/devmanager/devmanager_910a3_mock_err.go deleted file mode 100644 index 817f06e..0000000 --- a/mind-cluster/component/ascend-common/devmanager/devmanager_910a3_mock_err.go +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright(C) 2024. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package devmanager this for device driver manager error mock -package devmanager - -import ( - "errors" - - "ascend-common/api" - "ascend-common/devmanager/common" -) - -// DeviceManager910A3MockErr common device manager mock error for Ascend910A3 -type DeviceManager910A3MockErr struct { - DeviceManagerMockErr -} - -// GetDevType return mock type -func (d *DeviceManager910A3MockErr) GetDevType() string { - return api.Ascend910A3 -} - -// GetHccsStatisticInfo get hccs statistic info -func (d *DeviceManager910A3MockErr) GetHccsStatisticInfo(logicID int32) (*common.HccsStatisticInfo, error) { - return &common.HccsStatisticInfo{}, errors.New(errorMsg) -} - -// GetHccsBandwidthInfo get hccs statistic info -func (d *DeviceManager910A3MockErr) GetHccsBandwidthInfo(logicID int32) (*common.HccsBandwidthInfo, error) { - return &common.HccsBandwidthInfo{}, errors.New(errorMsg) -} diff --git a/mind-cluster/component/ascend-common/devmanager/devmanager_hccs_test.go b/mind-cluster/component/ascend-common/devmanager/devmanager_hccs_test.go deleted file mode 100644 index 3d7fff4..0000000 --- a/mind-cluster/component/ascend-common/devmanager/devmanager_hccs_test.go +++ /dev/null @@ -1,166 +0,0 @@ -/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package devmanager for device driver manager -package devmanager - -import ( - "errors" - "testing" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" - - "ascend-common/devmanager/common" - "ascend-common/devmanager/dcmi" -) - -const ( - mockLogicID int32 = 0 - mockCardID int32 = 0 - mockDeviceID int32 = 0 - invalidLogicID int32 = -1 - mockErrorMsg string = "mock error" - hccsArrayLen int = 8 -) - -type getHccsStatisticInfoInU64TestCase struct { - name string - logicID int32 - isValidID bool - getCardIDErr error - dcmiCallErr error - expectedErr bool -} - -func TestGetHccsStatisticInfoInU64(t *testing.T) { - testCases := buildGetHccsStatisticInfoInU64TestCases() - - for _, tc := range testCases { - convey.Convey(tc.name, t, func() { - patches := gomonkey.NewPatches() - defer patches.Reset() - - clearIdCache(tc.logicID) - manager := createMockDeviceManager() - setupGetHccsStatisticInfoInU64Patches(patches, manager, tc) - result, err := manager.GetHccsStatisticInfoInU64(tc.logicID) - verifyGetHccsStatisticInfoInU64Result(result, err, tc) - }) - } -} - -func clearIdCache(logicID int32) { - idCache.Delete(logicID) -} - -func buildGetHccsStatisticInfoInU64TestCases() []getHccsStatisticInfoInU64TestCase { - return []getHccsStatisticInfoInU64TestCase{ - {name: "should return failed info when logicID is invalid", - logicID: invalidLogicID, - isValidID: false, - expectedErr: true}, - {name: "should return failed info when getCardIdAndDeviceId fails", - logicID: mockLogicID, - isValidID: true, - getCardIDErr: errors.New(mockErrorMsg), - expectedErr: true}, - {name: "should return failed info when DcGetHccsStatisticInfoU64 fails", - logicID: mockLogicID, - isValidID: true, - dcmiCallErr: errors.New(mockErrorMsg), - expectedErr: true}, - {name: "should return success info when all operations succeed", - logicID: mockLogicID, - isValidID: true, - expectedErr: false}, - } -} - -func createMockDeviceManager() *DeviceManager { - return 
&DeviceManager{ - DcMgr: &dcmi.DcManager{}, - } -} - -func setupGetHccsStatisticInfoInU64Patches(patches *gomonkey.Patches, - manager *DeviceManager, tc getHccsStatisticInfoInU64TestCase) { - patches.ApplyFuncReturn(common.IsValidLogicIDOrPhyID, tc.isValidID) - if !tc.isValidID { - return - } - if tc.getCardIDErr != nil { - patches.ApplyMethodReturn(manager.DcMgr, "DcGetCardIDDeviceID", - mockCardID, mockDeviceID, tc.getCardIDErr) - } else { - patches.ApplyMethodReturn(manager.DcMgr, "DcGetCardIDDeviceID", - mockCardID, mockDeviceID, nil) - if tc.dcmiCallErr != nil { - patches.ApplyMethodReturn(manager.DcMgr, "DcGetHccsStatisticInfoU64", - common.HccsStatisticInfo{}, tc.dcmiCallErr) - } else { - mockHccsInfo := createMockHccsStatisticInfo() - patches.ApplyMethodReturn(manager.DcMgr, "DcGetHccsStatisticInfoU64", - mockHccsInfo, nil) - } - } -} - -func createMockHccsStatisticInfo() common.HccsStatisticInfo { - txCnt := make([]uint64, hccsArrayLen) - rxCnt := make([]uint64, hccsArrayLen) - crcErrCnt := make([]uint64, hccsArrayLen) - for i := 0; i < hccsArrayLen; i++ { - txCnt[i] = uint64(i + 1) - rxCnt[i] = uint64(i + 1) - crcErrCnt[i] = 0 - } - return common.HccsStatisticInfo{ - TxCnt: txCnt, - RxCnt: rxCnt, - CrcErrCnt: crcErrCnt, - } -} - -func verifyGetHccsStatisticInfoInU64Result(result *common.HccsStatisticInfo, - err error, tc getHccsStatisticInfoInU64TestCase) { - if tc.expectedErr { - convey.So(err, convey.ShouldNotBeNil) - convey.So(result, convey.ShouldNotBeNil) - verifyFailedHccsInfo(result) - } else { - convey.So(err, convey.ShouldBeNil) - convey.So(result, convey.ShouldNotBeNil) - verifySuccessHccsInfo(result) - } -} - -func verifyFailedHccsInfo(result *common.HccsStatisticInfo) { - convey.So(len(result.TxCnt), convey.ShouldEqual, hccsArrayLen) - convey.So(len(result.RxCnt), convey.ShouldEqual, hccsArrayLen) - convey.So(len(result.CrcErrCnt), convey.ShouldEqual, hccsArrayLen) - for i := 0; i < hccsArrayLen; i++ { - convey.So(result.TxCnt[i], 
convey.ShouldEqual, common.FailedValue) - convey.So(result.RxCnt[i], convey.ShouldEqual, common.FailedValue) - convey.So(result.CrcErrCnt[i], convey.ShouldEqual, common.FailedValue) - } -} - -func verifySuccessHccsInfo(result *common.HccsStatisticInfo) { - convey.So(len(result.TxCnt), convey.ShouldEqual, hccsArrayLen) - convey.So(len(result.RxCnt), convey.ShouldEqual, hccsArrayLen) - convey.So(len(result.CrcErrCnt), convey.ShouldEqual, hccsArrayLen) - convey.So(result.TxCnt[0], convey.ShouldEqual, uint64(1)) - convey.So(result.RxCnt[0], convey.ShouldEqual, uint64(1)) -} diff --git a/mind-cluster/component/ascend-common/devmanager/devmanager_mock.go b/mind-cluster/component/ascend-common/devmanager/devmanager_mock.go deleted file mode 100644 index c3bde2b..0000000 --- a/mind-cluster/component/ascend-common/devmanager/devmanager_mock.go +++ /dev/null @@ -1,370 +0,0 @@ -/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package devmanager this for device driver manager mock -package devmanager - -import ( - "ascend-common/api" - "ascend-common/devmanager/common" - "ascend-common/devmanager/dcmi" -) - -// DeviceManagerMock common device manager mock for Ascend910/310P/310 -type DeviceManagerMock struct { -} - -// DcStartHccsPingMesh start hccs ping mesh -func (d *DeviceManagerMock) DcStartHccsPingMesh(i int32, i2 int32, i3 int, operate common.HccspingMeshOperate) error { - return nil -} - -// DcStopHccsPingMesh stop hccs ping mesh -func (d *DeviceManagerMock) DcStopHccsPingMesh(i int32, i2 int32, i3 int, u uint) error { - return nil -} - -// DcGetHccsPingMeshInfo get hccs ping mesh info -func (d *DeviceManagerMock) DcGetHccsPingMeshInfo(i int32, i2 int32, i3 int, u uint) (*common.HccspingMeshInfo, error) { - return &common.HccspingMeshInfo{}, nil -} - -// DcGetHccsPingMeshState get hccs ping mesh state -func (d *DeviceManagerMock) DcGetHccsPingMeshState(i int32, i2 int32, i3 int, u uint) (int, error) { - return 0, nil -} - -// Init load symbol and initialize dcmi -func (d *DeviceManagerMock) Init() error { - return nil -} - -// ShutDown clean the dynamically loaded resource -func (d *DeviceManagerMock) ShutDown() error { - return nil -} - -// GetDevType return mock type -func (d *DeviceManagerMock) GetDevType() string { - return api.Ascend910A -} - -// GetDeviceCount get npu device count -func (d *DeviceManagerMock) GetDeviceCount() (int32, error) { - return 1, nil -} - -// GetCardList get all card list -func (d *DeviceManagerMock) GetCardList() (int32, []int32, error) { - return 1, []int32{0}, nil -} - -// GetDeviceNumInCard get all device list in one card -func (d *DeviceManagerMock) GetDeviceNumInCard(cardID int32) (int32, error) { - return 1, nil -} - -// GetDeviceList get all device logicID list -func (d *DeviceManagerMock) GetDeviceList() (int32, []int32, error) { - return 1, []int32{0}, nil -} - -// GetDeviceHealth query npu device health status -func (d 
*DeviceManagerMock) GetDeviceHealth(logicID int32) (uint32, error) { - return 0, nil -} - -// GetDeviceNetWorkHealth query npu device network health status -func (d *DeviceManagerMock) GetDeviceNetWorkHealth(logicID int32) (uint32, error) { - return 0, nil -} - -// GetDeviceUtilizationRate get npu device utilization -func (d *DeviceManagerMock) GetDeviceUtilizationRate(logicID int32, deviceType common.DeviceType) (uint32, error) { - return 1, nil -} - -// GetDeviceTemperature get npu device temperature -func (d *DeviceManagerMock) GetDeviceTemperature(logicID int32) (int32, error) { - return 1, nil -} - -// GetDeviceVoltage get npu device voltage -func (d *DeviceManagerMock) GetDeviceVoltage(logicID int32) (float32, error) { - return 1, nil -} - -// GetDevicePowerInfo get npu device power info -func (d *DeviceManagerMock) GetDevicePowerInfo(logicID int32) (float32, error) { - return 1, nil -} - -// GetDeviceFrequency get npu device work frequency -func (d *DeviceManagerMock) GetDeviceFrequency(logicID int32, deviceType common.DeviceType) (uint32, error) { - return 1, nil -} - -// GetDeviceMemoryInfo get npu memory information -func (d *DeviceManagerMock) GetDeviceMemoryInfo(logicID int32) (*common.MemoryInfo, error) { - return &common.MemoryInfo{ - MemorySize: 1, - MemoryAvailable: 1, - Frequency: 1, - Utilization: 1, - }, nil -} - -// GetDeviceHbmInfo get npu HBM module memory and frequency information -func (d *DeviceManagerMock) GetDeviceHbmInfo(logicID int32) (*common.HbmInfo, error) { - return &common.HbmInfo{ - MemorySize: 1, - Frequency: 1, - Usage: 1, - Temp: 1, - BandWidthUtilRate: 1, - }, nil -} - -// GetDeviceErrorCode get npu device error code -func (d *DeviceManagerMock) GetDeviceErrorCode(logicID int32) (int32, int64, error) { - return int32(0), int64(0), nil -} - -// GetChipInfo get npu device error code -func (d *DeviceManagerMock) GetChipInfo(logicID int32) (*common.ChipInfo, error) { - chip := &common.ChipInfo{ - Type: "ascend", - Name: 
common.Chip910, - Version: "v1", - } - return chip, nil -} - -// GetPhysicIDFromLogicID get device physic id from logic id -func (d *DeviceManagerMock) GetPhysicIDFromLogicID(logicID int32) (int32, error) { - return 1, nil -} - -// GetLogicIDFromPhysicID get device logic id from physic id -func (d *DeviceManagerMock) GetLogicIDFromPhysicID(physicID int32) (int32, error) { - return 1, nil -} - -// GetDeviceLogicID get device logic id from card id and device id -func (d *DeviceManagerMock) GetDeviceLogicID(cardID, deviceID int32) (int32, error) { - return 1, nil -} - -// GetDeviceIPAddress get device ip address -func (d *DeviceManagerMock) GetDeviceIPAddress(logicID, ipType int32) (string, error) { - if ipType == 0 { - return "127.0.0.1", nil - } - return "::1", nil -} - -// CreateVirtualDevice create virtual device -func (d *DeviceManagerMock) CreateVirtualDevice(logicID int32, vDevInfo common.CgoCreateVDevRes) (common. - CgoCreateVDevOut, error) { - return common.CgoCreateVDevOut{}, nil -} - -// GetVirtualDeviceInfo get virtual device info -func (d *DeviceManagerMock) GetVirtualDeviceInfo(logicID int32) (common.VirtualDevInfo, error) { - return common.VirtualDevInfo{}, nil -} - -// DestroyVirtualDevice destroy virtual device -func (d *DeviceManagerMock) DestroyVirtualDevice(logicID int32, vDevID uint32) error { - return nil -} - -// GetMcuPowerInfo get mcu power info for cardID -func (d *DeviceManagerMock) GetMcuPowerInfo(cardID int32) (float32, error) { - return 1, nil -} - -// GetCardIDDeviceID get cardID and deviceID by logicID -func (d *DeviceManagerMock) GetCardIDDeviceID(logicID int32) (int32, int32, error) { - return 0, 0, nil -} - -// GetProductType get product type success -func (d *DeviceManagerMock) GetProductType(cardID, deviceID int32) (string, error) { - return "", nil -} - -// GetAllProductType get all product type success -func (d *DeviceManagerMock) GetAllProductType() ([]string, error) { - return []string{}, nil -} - -// GetNpuWorkMode get npu 
chip work mode SMP success -func (d *DeviceManagerMock) GetNpuWorkMode() string { - return common.SMPMode -} - -// SetDeviceReset set device reset success -func (d *DeviceManagerMock) SetDeviceReset(cardID, deviceID int32) error { - return nil -} - -// GetDeviceBootStatus get device boot status success -func (d *DeviceManagerMock) GetDeviceBootStatus(logicID int32) (int, error) { - return common.BootStartFinish, nil -} - -// GetDeviceAllErrorCode get device all error code success -func (d *DeviceManagerMock) GetDeviceAllErrorCode(logicID int32) (int32, []int64, error) { - return 0, []int64{}, nil -} - -// SubscribeDeviceFaultEvent subscribe device fault event success -func (d *DeviceManagerMock) SubscribeDeviceFaultEvent(logicID int32) error { - return nil -} - -// SetFaultEventCallFunc set fault event call func success -func (d *DeviceManagerMock) SetFaultEventCallFunc(businessFunc func(common.DevFaultInfo)) error { - return nil -} - -// GetDieID get die id success -func (d *DeviceManagerMock) GetDieID(logicID int32, dcmiDieType dcmi.DieType) (string, error) { - return "ABCDEFGHIGKLMNOPQRSTUVWXYZ01234567890123", nil -} - -// GetDevProcessInfo get process info -func (d *DeviceManagerMock) GetDevProcessInfo(logicID int32) (*common.DevProcessInfo, error) { - return &common.DevProcessInfo{}, nil -} - -// GetPCIeBusInfo get pcie bus info -func (d *DeviceManagerMock) GetPCIeBusInfo(logicID int32) (string, error) { - return "0000:61:00.0", nil -} - -// GetBoardInfo Get board info -func (d *DeviceManagerMock) GetBoardInfo(logicID int32) (common.BoardInfo, error) { - return common.BoardInfo{}, nil -} - -// GetCardElabelV2 get card elabel information -func (d *DeviceManagerMock) GetCardElabelV2(cardID int32) (common.ElabelInfo, error) { - return common.ElabelInfo{}, nil -} - -// GetProductTypeArray test for get product type array -func (d *DeviceManagerMock) GetProductTypeArray() []string { - return []string{common.Atlas200ISoc} -} - -// GetPCIEBandwidth get pcie bandwidth 
-func (d *DeviceManagerMock) GetPCIEBandwidth(logicID int32, _ int) (common.PCIEBwStat, error) { - return common.PCIEBwStat{}, nil -} - -// SetIsTrainingCard set IsTrainingCard -func (d *DeviceManagerMock) SetIsTrainingCard() error { - return nil -} - -// IsTrainingCard get IsTrainingCard -func (d *DeviceManagerMock) IsTrainingCard() bool { - return true -} - -// GetDcmiVersion get dcmi version -func (d *DeviceManagerMock) GetDcmiVersion() string { - return "v1" -} - -// GetValidChipInfo get valid chip info from all npu -func (d *DeviceManagerMock) GetValidChipInfo() (common.ChipInfo, error) { - return common.ChipInfo{}, nil -} - -// GetDeviceEccInfo get device ECC info -func (d *DeviceManagerMock) GetDeviceEccInfo(logicID int32, - dcmiDeviceType common.DcmiDeviceType) (*common.ECCInfo, error) { - return &common.ECCInfo{EnableFlag: 1}, nil -} - -// GetSuperPodInfo get super pod info -func (d *DeviceManagerMock) GetSuperPodInfo(logicID int32) (common.CgoSuperPodInfo, error) { - return common.CgoSuperPodInfo{}, nil -} - -// GetSioInfo get sio info -func (d *DeviceManagerMock) GetSioInfo(logicID int32) (*common.SioCrcErrStatisticInfo, error) { - return &common.SioCrcErrStatisticInfo{ - TxErrCnt: 0, - RxErrCnt: 0, - }, nil -} - -// GetHccsStatisticInfo get hccs statistic info -func (d *DeviceManagerMock) GetHccsStatisticInfo(logicID int32) (*common.HccsStatisticInfo, error) { - return &common.HccsStatisticInfo{}, nil -} - -// GetHccsStatisticInfoInU64 get hccs statistic info in u64 -func (d *DeviceManagerMock) GetHccsStatisticInfoInU64(logicID int32) (*common.HccsStatisticInfo, error) { - return &common.HccsStatisticInfo{}, nil -} - -// GetMainBoardId get main board id -func (d *DeviceManagerMock) GetMainBoardId() uint32 { - return 0 -} - -// GetHccsBandwidthInfo get hccs statistic info -func (d *DeviceManagerMock) GetHccsBandwidthInfo(logicID int32) (*common.HccsBandwidthInfo, error) { - return &common.HccsBandwidthInfo{}, nil -} - -// GetBrotherCardID get brother 
card id -func (d *DeviceManagerMock) GetBrotherCardID(cardID, deviceID int32) (int32, error) { - const noneBroCard = -1 - return noneBroCard, nil -} - -// GetOutBandChannelState get out band channel state -func (d *DeviceManagerMock) GetOutBandChannelState(cardID, deviceID int32) error { - return nil -} - -// PreResetSoc pre reset soc, used before reset out band -func (d *DeviceManagerMock) PreResetSoc(cardID, deviceID int32) error { - return nil -} - -// SetDeviceResetOutBand reset spec device out band -func (d *DeviceManagerMock) SetDeviceResetOutBand(cardID, deviceID int32) error { - return nil -} - -// RescanSoc trigger soc rescan, non-blocking -func (d *DeviceManagerMock) RescanSoc(cardID, deviceID int32) error { - return nil -} - -// GetChipBaseInfos get chip base info -func (d *DeviceManagerMock) GetChipBaseInfos() ([]*common.ChipBaseInfo, error) { - return nil, nil -} - -func (d *DeviceManagerMock) DcGetSuperPodStatus(int32, int32, uint32) (int, error) { return 0, nil } - -func (d *DeviceManagerMock) DcSetSuperPodStatus(int32, int32, uint32, uint32) error { return nil } diff --git a/mind-cluster/component/ascend-common/devmanager/devmanager_mock_err.go b/mind-cluster/component/ascend-common/devmanager/devmanager_mock_err.go deleted file mode 100644 index 8ad8d7c..0000000 --- a/mind-cluster/component/ascend-common/devmanager/devmanager_mock_err.go +++ /dev/null @@ -1,369 +0,0 @@ -/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package devmanager this for device driver manager error mock -package devmanager - -import ( - "errors" - - "ascend-common/api" - "ascend-common/devmanager/common" - "ascend-common/devmanager/dcmi" -) - -var errorMsg = "mock error" - -// DeviceManagerMockErr common device manager mock error for Ascend910/310P/310 -type DeviceManagerMockErr struct { -} - -// DcStartHccsPingMesh start hccs ping mesh -func (d *DeviceManagerMockErr) DcStartHccsPingMesh(i int32, i2 int32, i3 int, - operate common.HccspingMeshOperate) error { - return errors.New(errorMsg) -} - -// DcStopHccsPingMesh stop hccs ping mesh -func (d *DeviceManagerMockErr) DcStopHccsPingMesh(i int32, i2 int32, i3 int, u uint) error { - return errors.New(errorMsg) -} - -// DcGetHccsPingMeshInfo get hccs ping mesh info -func (d *DeviceManagerMockErr) DcGetHccsPingMeshInfo(i int32, i2 int32, i3 int, u uint) (*common.HccspingMeshInfo, - error) { - return nil, errors.New(errorMsg) -} - -// DcGetHccsPingMeshState get hccs ping mesh state -func (d *DeviceManagerMockErr) DcGetHccsPingMeshState(i int32, i2 int32, i3 int, u uint) (int, error) { - return 1, errors.New(errorMsg) -} - -// Init load symbol and initialize dcmi -func (d *DeviceManagerMockErr) Init() error { - return errors.New(errorMsg) -} - -// ShutDown clean the dynamically loaded resource -func (d *DeviceManagerMockErr) ShutDown() error { - return errors.New(errorMsg) -} - -// GetDevType return mock type -func (d *DeviceManagerMockErr) GetDevType() string { - return api.Ascend910A -} - -// GetDeviceCount get npu device count -func (d *DeviceManagerMockErr) GetDeviceCount() (int32, error) { - return 1, errors.New(errorMsg) -} - -// GetCardList get all card list -func (d *DeviceManagerMockErr) GetCardList() (int32, []int32, error) { - return 1, []int32{0}, errors.New(errorMsg) -} - -// GetDeviceNumInCard get all device list in one card -func (d 
*DeviceManagerMockErr) GetDeviceNumInCard(cardID int32) (int32, error) { - return 1, errors.New(errorMsg) -} - -// GetDeviceList get all device logicID list -func (d *DeviceManagerMockErr) GetDeviceList() (int32, []int32, error) { - return 1, []int32{0}, errors.New(errorMsg) -} - -// GetDeviceHealth query npu device health status -func (d *DeviceManagerMockErr) GetDeviceHealth(logicID int32) (uint32, error) { - return 0, errors.New(errorMsg) -} - -// GetDeviceNetWorkHealth query npu device network health status -func (d *DeviceManagerMockErr) GetDeviceNetWorkHealth(logicID int32) (uint32, error) { - return 0, errors.New(errorMsg) -} - -// GetDeviceUtilizationRate get npu device utilization -func (d *DeviceManagerMockErr) GetDeviceUtilizationRate(logicID int32, deviceType common.DeviceType) (uint32, error) { - return 1, errors.New(errorMsg) -} - -// GetDeviceTemperature get npu device temperature -func (d *DeviceManagerMockErr) GetDeviceTemperature(logicID int32) (int32, error) { - return 1, errors.New(errorMsg) -} - -// GetDeviceVoltage get npu device voltage -func (d *DeviceManagerMockErr) GetDeviceVoltage(logicID int32) (float32, error) { - return 1, errors.New(errorMsg) -} - -// GetDevicePowerInfo get npu device power info -func (d *DeviceManagerMockErr) GetDevicePowerInfo(logicID int32) (float32, error) { - return 1, errors.New(errorMsg) -} - -// GetDeviceFrequency get npu device work frequency -func (d *DeviceManagerMockErr) GetDeviceFrequency(logicID int32, deviceType common.DeviceType) (uint32, error) { - return 1, errors.New(errorMsg) -} - -// GetDeviceMemoryInfo get npu memory information -func (d *DeviceManagerMockErr) GetDeviceMemoryInfo(logicID int32) (*common.MemoryInfo, error) { - return &common.MemoryInfo{ - MemorySize: 1, - MemoryAvailable: 1, - Frequency: 1, - Utilization: 1, - }, errors.New(errorMsg) -} - -// GetDeviceHbmInfo get npu HBM module memory and frequency information -func (d *DeviceManagerMockErr) GetDeviceHbmInfo(logicID int32) 
(*common.HbmInfo, error) { - return &common.HbmInfo{ - MemorySize: 1, - Frequency: 1, - Usage: 1, - Temp: 1, - BandWidthUtilRate: 1, - }, errors.New(errorMsg) -} - -// GetDeviceErrorCode get npu device error code -func (d *DeviceManagerMockErr) GetDeviceErrorCode(logicID int32) (int32, int64, error) { - return int32(0), int64(0), errors.New(errorMsg) -} - -// GetChipInfo get npu device error code -func (d *DeviceManagerMockErr) GetChipInfo(logicID int32) (*common.ChipInfo, error) { - chip := &common.ChipInfo{ - Type: "ascend", - Name: common.Chip910, - Version: "v1", - } - return chip, errors.New(errorMsg) -} - -// GetPhysicIDFromLogicID get device physic id from logic id -func (d *DeviceManagerMockErr) GetPhysicIDFromLogicID(logicID int32) (int32, error) { - return 1, errors.New(errorMsg) -} - -// GetLogicIDFromPhysicID get device logic id from physic id -func (d *DeviceManagerMockErr) GetLogicIDFromPhysicID(physicID int32) (int32, error) { - return 1, errors.New(errorMsg) -} - -// GetDeviceLogicID get device logic id from card id and device id -func (d *DeviceManagerMockErr) GetDeviceLogicID(cardID, deviceID int32) (int32, error) { - return 1, errors.New(errorMsg) -} - -// GetDeviceIPAddress get device ip address -func (d *DeviceManagerMockErr) GetDeviceIPAddress(logicID, ipType int32) (string, error) { - return "127.0.0.1", errors.New(errorMsg) -} - -// CreateVirtualDevice create virtual device -func (d *DeviceManagerMockErr) CreateVirtualDevice(logicID int32, - vDevInfo common.CgoCreateVDevRes) (common.CgoCreateVDevOut, error) { - return common.CgoCreateVDevOut{}, errors.New(errorMsg) -} - -// GetVirtualDeviceInfo get virtual device info -func (d *DeviceManagerMockErr) GetVirtualDeviceInfo(logicID int32) (common.VirtualDevInfo, error) { - return common.VirtualDevInfo{}, errors.New(errorMsg) -} - -// DestroyVirtualDevice destroy virtual device -func (d *DeviceManagerMockErr) DestroyVirtualDevice(logicID int32, vDevID uint32) error { - return errors.New(errorMsg) 
-} - -// GetMcuPowerInfo get mcu power info for cardID -func (d *DeviceManagerMockErr) GetMcuPowerInfo(cardID int32) (float32, error) { - return 1, errors.New(errorMsg) -} - -// GetCardIDDeviceID get cardID and deviceID by logicID -func (d *DeviceManagerMockErr) GetCardIDDeviceID(logicID int32) (int32, int32, error) { - return 0, 0, errors.New(errorMsg) -} - -// GetProductType get product type failed -func (d *DeviceManagerMockErr) GetProductType(cardID, deviceID int32) (string, error) { - return "", errors.New("not found product type name") -} - -// GetAllProductType get all product type failed -func (d *DeviceManagerMockErr) GetAllProductType() ([]string, error) { - return []string{}, errors.New("not found product type name") -} - -// GetNpuWorkMode get npu work mode failed -func (d *DeviceManagerMockErr) GetNpuWorkMode() string { - return "" -} - -// SetDeviceReset set device reset failed -func (d *DeviceManagerMockErr) SetDeviceReset(cardID, deviceID int32) error { - return errors.New(errorMsg) -} - -// GetDeviceBootStatus get device boot status failed -func (d *DeviceManagerMockErr) GetDeviceBootStatus(logicID int32) (int, error) { - return common.RetError, errors.New(errorMsg) -} - -// GetDeviceAllErrorCode get device all error code failed -func (d *DeviceManagerMockErr) GetDeviceAllErrorCode(logicID int32) (int32, []int64, error) { - return common.RetError, nil, errors.New(errorMsg) -} - -// SubscribeDeviceFaultEvent subscribe device fault event failed -func (d *DeviceManagerMockErr) SubscribeDeviceFaultEvent(logicID int32) error { - return errors.New(errorMsg) -} - -// SetFaultEventCallFunc set fault event call func failed -func (d *DeviceManagerMockErr) SetFaultEventCallFunc(businessFunc func(common.DevFaultInfo)) error { - return errors.New(errorMsg) -} - -// GetDieID get die id failed -func (d *DeviceManagerMockErr) GetDieID(logicID int32, dcmiDieType dcmi.DieType) (string, error) { - return "", errors.New(errorMsg) -} - -// GetDevProcessInfo get process 
info -func (d *DeviceManagerMockErr) GetDevProcessInfo(logicID int32) (*common.DevProcessInfo, error) { - return nil, errors.New(errorMsg) -} - -// GetPCIeBusInfo get PCIe bus info -func (d *DeviceManagerMockErr) GetPCIeBusInfo(logicID int32) (string, error) { - return "", errors.New(errorMsg) -} - -// GetBoardInfo get board info -func (d *DeviceManagerMockErr) GetBoardInfo(logicID int32) (common.BoardInfo, error) { - return common.BoardInfo{}, errors.New(errorMsg) -} - -// GetProductTypeArray test for get empty product type array -func (d *DeviceManagerMockErr) GetProductTypeArray() []string { - return nil -} - -// GetPCIEBandwidth get pcie bandwidth -func (d *DeviceManagerMockErr) GetPCIEBandwidth(logicID int32, _ int) (common.PCIEBwStat, error) { - return common.PCIEBwStat{}, errors.New(errorMsg) -} - -// SetIsTrainingCard set IsTrainingCard -func (d *DeviceManagerMockErr) SetIsTrainingCard() error { - return errors.New(errorMsg) -} - -// IsTrainingCard get IsTrainingCard -func (d *DeviceManagerMockErr) IsTrainingCard() bool { - return false -} - -// GetDcmiVersion get dcmi version failed -func (d *DeviceManagerMockErr) GetDcmiVersion() string { - return "" -} - -// GetValidChipInfo get valid chip info from all npu -func (d *DeviceManagerMockErr) GetValidChipInfo() (common.ChipInfo, error) { - return common.ChipInfo{}, errors.New("failed to find chip info") -} - -// GetDeviceEccInfo get device ECC info -func (d *DeviceManagerMockErr) GetDeviceEccInfo(logicID int32, - dcmiDeviceType common.DcmiDeviceType) (*common.ECCInfo, error) { - return nil, errors.New("failed to get device ECC info") -} - -// GetSuperPodInfo get super pod info -func (d *DeviceManagerMockErr) GetSuperPodInfo(logicID int32) (common.CgoSuperPodInfo, error) { - return common.CgoSuperPodInfo{}, nil -} - -// GetSioInfo get sio info -func (d *DeviceManagerMockErr) GetSioInfo(logicID int32) (*common.SioCrcErrStatisticInfo, error) { - return nil, errors.New(errorMsg) -} - -// GetHccsStatisticInfo get 
hccs statistic info -func (d *DeviceManagerMockErr) GetHccsStatisticInfo(logicID int32) (*common.HccsStatisticInfo, error) { - return nil, errors.New(errorMsg) -} - -// GetHccsStatisticInfoInU64 get hccs statistic info in u64 -func (d *DeviceManagerMockErr) GetHccsStatisticInfoInU64(logicID int32) (*common.HccsStatisticInfo, error) { - return nil, errors.New(errorMsg) -} - -// GetMainBoardId get main board id -func (d *DeviceManagerMockErr) GetMainBoardId() uint32 { - return 0 -} - -// GetHccsBandwidthInfo get hccs statistic info -func (d *DeviceManagerMockErr) GetHccsBandwidthInfo(logicID int32) (*common.HccsBandwidthInfo, error) { - return nil, errors.New(errorMsg) -} - -// GetBrotherCardID get brother card id -func (d *DeviceManagerMockErr) GetBrotherCardID(cardID, deviceID int32) (int32, error) { - return -1, nil -} - -// GetOutBandChannelState get out band channel state -func (d *DeviceManagerMockErr) GetOutBandChannelState(cardID, deviceID int32) error { - return nil -} - -// PreResetSoc pre reset soc, used before reset out band -func (d *DeviceManagerMockErr) PreResetSoc(cardID, deviceID int32) error { - return nil -} - -// SetDeviceResetOutBand reset spec device out band -func (d *DeviceManagerMockErr) SetDeviceResetOutBand(cardID, deviceID int32) error { - return nil -} - -// RescanSoc trigger soc rescan, non-blocking -func (d *DeviceManagerMockErr) RescanSoc(cardID, deviceID int32) error { - return nil -} - -// GetChipBaseInfos get chip base info -func (d *DeviceManagerMockErr) GetChipBaseInfos() ([]*common.ChipBaseInfo, error) { - return nil, errors.New(errorMsg) -} - -func (d *DeviceManagerMockErr) DcGetSuperPodStatus(int32, int32, uint32) (int, error) { return 0, nil } - -func (d *DeviceManagerMockErr) DcSetSuperPodStatus(int32, int32, uint32, uint32) error { return nil } - -// GetCardElabelV2 get card elabel information -func (d *DeviceManagerMockErr) GetCardElabelV2(cardID int32) (common.ElabelInfo, error) { - return common.ElabelInfo{}, nil -} diff 
--git a/mind-cluster/component/ascend-common/devmanager/devmanager_test.go b/mind-cluster/component/ascend-common/devmanager/devmanager_test.go deleted file mode 100644 index 221a812..0000000 --- a/mind-cluster/component/ascend-common/devmanager/devmanager_test.go +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Package devmanager for device driver manager -package devmanager - -import ( - "errors" - "testing" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" - - "ascend-common/common-utils/hwlog" - "ascend-common/devmanager/common" - "ascend-common/devmanager/dcmi" -) - -// TestGetCardIdAndDeviceId test the getCardIdAndDeviceId function -func TestGetCardIdAndDeviceId(t *testing.T) { - - var ( - cardId, deviceId = int32(0), int32(0) - err error - returnValue = int32(0) - errReturnValue = int32(-1) - ) - manager := &DeviceManager{DcMgr: &dcmi.DcManager{}} - convey.Convey("failed to get info by dcmi", t, func() { - mk2 := gomonkey.ApplyMethodReturn(manager.DcMgr, "DcGetCardIDDeviceID", - errReturnValue, errReturnValue, errors.New("mock err")) - defer mk2.Reset() - cardId, deviceId, err = manager.getCardIdAndDeviceId(0) - - convey.So(cardId, convey.ShouldEqual, common.RetError) - convey.So(deviceId, convey.ShouldEqual, common.RetError) - convey.So(err, convey.ShouldNotBeNil) - - }) - - mk := 
gomonkey.ApplyMethodReturn(manager.DcMgr, "DcGetCardIDDeviceID", returnValue, returnValue, nil) - defer mk.Reset() - - convey.Convey("get info from dcmi", t, func() { - testGetCardIdAndDeviceId(t, cardId, deviceId, err, manager) - }) - convey.Convey("get info from cache", t, func() { - testGetCardIdAndDeviceId(t, cardId, deviceId, err, manager) - }) - -} - -func testGetCardIdAndDeviceId(t *testing.T, cardId int32, deviceId int32, err error, manager *DeviceManager) { - cardId, deviceId, err = manager.getCardIdAndDeviceId(0) - - convey.So(cardId, convey.ShouldEqual, 0) - convey.So(deviceId, convey.ShouldEqual, 0) - convey.So(err, convey.ShouldBeNil) - -} -func init() { - config := hwlog.LogConfig{ - OnlyToStdout: true, - } - hwlog.InitRunLogger(&config, nil) -} diff --git a/mind-cluster/component/ascend-common/devmanager/hccn/hccn_tool.go b/mind-cluster/component/ascend-common/devmanager/hccn/hccn_tool.go deleted file mode 100644 index b6388f4..0000000 --- a/mind-cluster/component/ascend-common/devmanager/hccn/hccn_tool.go +++ /dev/null @@ -1,335 +0,0 @@ -/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package hccn this for npu hccn info -package hccn - -import ( - "fmt" - "os" - "os/exec" - "strconv" - "strings" - - "ascend-common/common-utils/hwlog" - "ascend-common/common-utils/limiter" - "ascend-common/common-utils/utils" - "ascend-common/devmanager/common" -) - -const ( - space = " " - newLine = "\n" - colon = ":" - - // LinkUp npu interface up - LinkUp string = "UP" - // LinkDown npu interface down - LinkDown string = "DOWN" - - opticalPartLen = 2 - secondIndex = 2 - linkStatusPart = 3 - base64 = 64 - - cardHealthy = 0 - - normalCode = 1 - abnormalCode = 0 - - naValue = "NA" - notSupport = "not supported" - unknownStr = "Unknown!" - - limitSize = 1024 * 1024 -) - -func getInfoFromHccnTool(args ...string) (string, error) { - const hccnTool = "/usr/local/Ascend/driver/tools/hccn_tool" - if _, err := utils.CheckPath(hccnTool); err != nil { - return "", err - } - cmd := exec.Command(hccnTool, args...) - cmd.Env = []string{ - "PATH=" + os.Getenv("PATH"), - utils.LdLibPath + "=" + os.Getenv(utils.LdLibPath), - } - limitStdout := limiter.NewLimitedWriter(limitSize) - cmd.Stdout = limitStdout - cmd.Stderr = limiter.NewLimitedWriter(limitSize) - err := cmd.Run() - if err != nil { - return "", err - } - - return string(limitStdout.GetBufferBytes()), nil -} - -// GetNPULinkStatus exec "hccn_tool -i * -link -g" to get link status -func GetNPULinkStatus(phyID int32) (string, error) { - args := []string{"-i", strconv.Itoa(int(phyID)), "-link", "-g"} - // command example: hccn_tool -i 0 -link -g - // success result example is: link status: DOWN - outStr, err := getInfoFromHccnTool(args...) 
- hwlog.RunLog.Debugf("hccn_tool command exec result: %v", outStr) - if err != nil { - return common.Abnormal, buildHccnErr(phyID, "link status", err) - } - replacedStr := strings.ReplaceAll(outStr, newLine, "") - outArr := strings.Split(replacedStr, space) - if len(outArr) != linkStatusPart { - return common.Abnormal, buildHccnErr(phyID, "link status", - fmt.Errorf("length of output %v is not equal to %v", outArr, linkStatusPart)) - } - - status := outArr[secondIndex] - hwlog.RunLog.Debugf("hccn_tool get npu link status: %s", status) - return status, nil -} - -// GetNPULinkSpeed exec "hccn_tool -i * -speed -g" to get link speed -func GetNPULinkSpeed(phyID int32) (int, error) { - args := []string{"-i", strconv.Itoa(int(phyID)), "-speed", "-g"} - // command example: hccn_tool -i 0 -speed -g - // success result example is: Speed: 100000 Mb/s - outStr, err := getInfoFromHccnTool(args...) - if err != nil { - return common.RetError, buildHccnErr(phyID, "link speed", err) - } - return getSpeedFromOutStr(outStr, phyID) -} - -func getSpeedFromOutStr(outStr string, phyID int32) (int, error) { - if strings.Contains(outStr, unknownStr) { - return common.RetError, buildHccnErr(phyID, "link speed", fmt.Errorf("npu link speed is unknown")) - } - replacedStr := strings.ReplaceAll(outStr, newLine, "") - outArr := strings.Split(replacedStr, space) - if len(outArr) != linkStatusPart { - return common.RetError, buildHccnErr(phyID, "link speed", fmt.Errorf("length of output %v is not equal to %v", - outArr, linkStatusPart)) - } - const midIndex = 1 - speed, err := strconv.Atoi(outArr[midIndex]) - if err != nil { - return common.RetError, buildHccnErr(phyID, "link speed", fmt.Errorf("covert speed from string failed: %s", err)) - } - - return speed, nil -} - -// GetNPULinkUpNum exec "hccn_tool -i * -link_stat -g" to get link up count -func GetNPULinkUpNum(phyID int32) (int, error) { - args := []string{"-i", strconv.Itoa(int(phyID)), "-link_stat", "-g"} - // command example: hccn_tool -i 
0 -link_stat -g - // success result include: [device x]link up count : y - outStr, err := getInfoFromHccnTool(args...) - if err != nil { - return common.RetError, buildHccnErr(phyID, "link stat", err) - } - - const ( - linkUpArrLen = 6 - linkUpStr = "link up count" - ) - linkUPCount := 0 - lines := strings.Split(outStr, newLine) - for _, line := range lines { - if line == "" || !strings.Contains(line, linkUpStr) { - continue - } - - linkUpArr := strings.Fields(line) - if len(linkUpArr) != linkUpArrLen { - return common.RetError, buildHccnErr(phyID, "link up num", fmt.Errorf("length of output %v is not "+ - "equal to %v", linkUpArr, linkUpArrLen)) - } - if linkUPCount, err = strconv.Atoi(linkUpArr[linkUpArrLen-1]); err != nil { - return common.RetError, buildHccnErr(phyID, "link up num", - fmt.Errorf("covert link up num from string failed: %s", err)) - } - return linkUPCount, nil - } - - return common.RetError, buildHccnErr(phyID, "link up num", fmt.Errorf("did not find link up count")) -} - -// GetNPUStatInfo exec "hccn_tool -i * -stat -g" to get stat info -func GetNPUStatInfo(phyID int32) (map[string]int, error) { - args := []string{"-i", strconv.Itoa(int(phyID)), "-stat", "-g"} - // command example: hccn_tool -i 0 -stat -g - // success result include: [device x]link up count : y - outStr, err := getInfoFromHccnTool(args...) 
- if err != nil { - return nil, buildHccnErr(phyID, "stat", err) - } - lines := strings.Split(outStr, newLine) - statInfoMap := make(map[string]int) - const statPartLen = 2 - for _, line := range lines { - statParts := strings.Split(line, colon) - if len(statParts) != statPartLen || statParts[1] == "" { - continue - } - statNum, err := strconv.Atoi(statParts[1]) - if err != nil { - hwlog.RunLog.Errorf("covert stat num of [%s] from string failed: %s", statParts[1], err) - continue - } - statInfoMap[statParts[0]] = statNum - } - - return statInfoMap, nil -} - -// GetNPUOpticalInfo exec "hccn_tool -i * -optical -g" to get optical info -func GetNPUOpticalInfo(phyID int32) (map[string]string, error) { - args := []string{"-i", strconv.Itoa(int(phyID)), "-optical", "-g"} - // command example: hccn_tool -i 0 -optical -g - // success result include: [device x]link up count : y - outStr, err := getInfoFromHccnTool(args...) - if err != nil { - return nil, buildHccnErr(phyID, "optical", err) - } - lines := strings.Split(outStr, newLine) - opticalInfoMap := make(map[string]string) - for _, line := range lines { - opticalParts := strings.Split(line, colon) - if len(opticalParts) != opticalPartLen { - continue - } - opticalKey := strings.ReplaceAll(strings.TrimSpace(opticalParts[0]), space, "_") - opticalValue := strings.TrimSpace(opticalParts[1]) - opticalInfoMap[opticalKey] = opticalValue - } - - return opticalInfoMap, nil -} - -// GetNPUInterfaceTraffic exec "hccn_tool -i * -bandwidth -g" to get bandwidth info -func GetNPUInterfaceTraffic(phyID int32) (float64, float64, error) { - const ( - noTraffic = common.RetError - trafficPartLen = 4 - txStr = "TX:" - rxStr = "RX:" - ) - - args := []string{"-i", strconv.Itoa(int(phyID)), "-bandwidth", "-g"} - // command example: hccn_tool -i 0 -bandwidth -g - // success result has two lines: - // Bandwidth TX: 0.00 MB/sec - // Bandwidth RX: 0.00 MB/sec - outStr, err := getInfoFromHccnTool(args...) 
- hwlog.RunLog.Debugf("hccn_tool command exec result: %v", outStr) - if err != nil { - return noTraffic, noTraffic, buildHccnErr(phyID, "interface traffic", err) - } - - var ( - tx = float64(noTraffic) - rx = float64(noTraffic) - ) - - lines := strings.Split(outStr, newLine) - for _, line := range lines { - if line == "" { - continue - } - - trafficArr := strings.Fields(line) - hwlog.RunLog.Debugf("npu bandwidth split as: %v", trafficArr) - if len(trafficArr) != trafficPartLen { - continue - } - if strings.Contains(line, txStr) { - tmpTx, err := strconv.ParseFloat(trafficArr[secondIndex], base64) - if err != nil { - hwlog.RunLog.Errorf("get float data from Bandwidth TX err: %s", err) - continue - } - tx = tmpTx - } - if strings.Contains(line, rxStr) { - tmpRx, err := strconv.ParseFloat(trafficArr[secondIndex], base64) - if err != nil { - hwlog.RunLog.Errorf("get float data from Bandwidth RX err: %s", err) - continue - } - rx = tmpRx - } - } - return tx, rx, nil -} - -// GetFloatDataFromStr get float data from string with space -func GetFloatDataFromStr(str, dataType string) float64 { - if str == "" || strings.Contains(str, naValue) || strings.Contains(str, notSupport) { - return common.RetError - } - dataParts := strings.Split(str, space) - if len(dataParts) != opticalPartLen { - errMsg := fmt.Sprintf("convert %v optical data type failed, "+ - "the length of optical data %v is %v not equal to %d. 
", dataType, dataParts, len(dataParts), opticalPartLen) - hwlog.RunLog.Error(errMsg) - return common.RetError - } - floatData, err := strconv.ParseFloat(dataParts[0], base64) - if err != nil { - hwlog.RunLog.Errorf("convert %v optical data type to a floating-point number failed, "+ - "get float data from string %v failed, err: %v", dataType, dataParts[0], err) - return common.RetError - } - return floatData -} - -// GetHealthCode return union healthy code -func GetHealthCode(healthCode uint32) int { - if healthCode == common.UnRetError { - return common.RetError - } - - if healthCode == cardHealthy { - return normalCode - } - return abnormalCode -} - -// GetLinkStatusCode return union link status code -func GetLinkStatusCode(status string) int { - if status == common.Abnormal { - return common.RetError - } - - if status == LinkUp { - return normalCode - } - return abnormalCode -} - -// GetNetworkHealthy return union network healthy code -func GetNetworkHealthy(netCode uint32) int { - if netCode == common.UnRetError { - return common.RetError - } - - if netCode == common.NetworkInit || netCode == common.NetworkSuccess { - return normalCode - } - return abnormalCode -} - -func buildHccnErr(phyID int32, msg string, err error) error { - return fmt.Errorf("phyID(%d),get npu %s info failed,error is :%v", phyID, msg, err) -} diff --git a/mind-cluster/component/ascend-common/devmanager/hccn/hccn_tool_test.go b/mind-cluster/component/ascend-common/devmanager/hccn/hccn_tool_test.go deleted file mode 100644 index 7d4fe17..0000000 --- a/mind-cluster/component/ascend-common/devmanager/hccn/hccn_tool_test.go +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package hccn this for npu hccn info -package hccn - -import ( - "fmt" - "strings" - "testing" -) - -func TestBuildHccnErr(t *testing.T) { - t.Run("normal error", func(t *testing.T) { - phyID := int32(1) - msg := "status" - originalErr := fmt.Errorf("permission denied") - - err := buildHccnErr(phyID, msg, originalErr) - - if !strings.Contains(err.Error(), "phyID(1)") { - t.Error("should contain phyID") - } - if !strings.Contains(err.Error(), "npu status") { - t.Error("should contain npu message") - } - if !strings.Contains(err.Error(), "permission denied") { - t.Error("should contain original error") - } - }) - - t.Run("nil error", func(t *testing.T) { - err := buildHccnErr(0, "", nil) - if !strings.Contains(err.Error(), "error is :nil") { - t.Error("should handle nil error") - } - }) -} diff --git a/mind-cluster/component/ascend-common/go.mod b/mind-cluster/component/ascend-common/go.mod deleted file mode 100644 index e1e3bbb..0000000 --- a/mind-cluster/component/ascend-common/go.mod +++ /dev/null @@ -1,55 +0,0 @@ -module ascend-common - -go 1.18 - -require ( - github.com/agiledragon/gomonkey/v2 v2.8.0 - github.com/fsnotify/fsnotify v1.6.0 - github.com/kubeflow/common v0.4.3 - github.com/smartystreets/goconvey v1.6.4 - k8s.io/api v0.25.3 - k8s.io/apimachinery v0.25.3 - k8s.io/client-go v0.25.3 -) - -require ( - github.com/PuerkitoBio/purell v1.1.1 // indirect - github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 // indirect - github.com/davecgh/go-spew v1.1.1 // indirect - github.com/emicklei/go-restful/v3 v3.8.0 // indirect - 
github.com/go-logr/logr v1.2.3 // indirect - github.com/go-openapi/jsonpointer v0.19.5 // indirect - github.com/go-openapi/jsonreference v0.19.5 // indirect - github.com/go-openapi/swag v0.19.14 // indirect - github.com/gogo/protobuf v1.3.2 // indirect - github.com/golang/protobuf v1.5.2 // indirect - github.com/google/gnostic v0.5.7-v3refs // indirect - github.com/google/go-cmp v0.5.8 // indirect - github.com/google/gofuzz v1.1.0 // indirect - github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 // indirect - github.com/josharian/intern v1.0.0 // indirect - github.com/json-iterator/go v1.1.12 // indirect - github.com/jtolds/gls v4.20.0+incompatible // indirect - github.com/mailru/easyjson v0.7.6 // indirect - github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect - github.com/modern-go/reflect2 v1.0.2 // indirect - github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect - github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d // indirect - golang.org/x/net v0.0.0-20220722155237-a158d28d115b // indirect - golang.org/x/oauth2 v0.0.0-20211104180415-d3ed0bb246c8 // indirect - golang.org/x/sys v0.8.0 // indirect - golang.org/x/term v0.0.0-20210927222741-03fcf44c2211 // indirect - golang.org/x/text v0.3.7 // indirect - golang.org/x/time v0.0.0-20220210224613-90d013bbcef8 // indirect - google.golang.org/appengine v1.6.7 // indirect - google.golang.org/protobuf v1.28.0 // indirect - gopkg.in/inf.v0 v0.9.1 // indirect - gopkg.in/yaml.v2 v2.4.0 // indirect - gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/klog/v2 v2.70.1 // indirect - k8s.io/kube-openapi v0.0.0-20220803162953-67bda5d908f1 // indirect - k8s.io/utils v0.0.0-20220728103510-ee6ede2d64ed // indirect - sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 // indirect - sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect - sigs.k8s.io/yaml v1.3.0 // indirect -) diff --git a/mind-cluster/component/ascend-common/go.sum 
b/mind-cluster/component/ascend-common/go.sum deleted file mode 100644 index 000ced7..0000000 --- a/mind-cluster/component/ascend-common/go.sum +++ /dev/null @@ -1,492 +0,0 @@ -cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= -cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= -cloud.google.com/go v0.38.0/go.mod h1:990N+gfupTy94rShfmMCWGDn0LpTmnzTp2qbd1dvSRU= -cloud.google.com/go v0.44.1/go.mod h1:iSa0KzasP4Uvy3f1mN/7PiObzGgflwredwwASm/v6AU= -cloud.google.com/go v0.44.2/go.mod h1:60680Gw3Yr4ikxnPRS/oxxkBccT6SA1yMk63TGekxKY= -cloud.google.com/go v0.45.1/go.mod h1:RpBamKRgapWJb87xiFSdk4g1CME7QZg3uwTez+TSTjc= -cloud.google.com/go v0.46.3/go.mod h1:a6bKKbmY7er1mI7TEI4lsAkts/mkhTSZK8w33B4RAg0= -cloud.google.com/go v0.50.0/go.mod h1:r9sluTvynVuxRIOHXQEHMFffphuXHOMZMycpNR5e6To= -cloud.google.com/go v0.52.0/go.mod h1:pXajvRH/6o3+F9jDHZWQ5PbGhn+o8w9qiu/CffaVdO4= -cloud.google.com/go v0.53.0/go.mod h1:fp/UouUEsRkN6ryDKNW/Upv/JBKnv6WDthjR6+vze6M= -cloud.google.com/go v0.54.0/go.mod h1:1rq2OEkV3YMf6n/9ZvGWI3GWw0VoqH/1x2nd8Is/bPc= -cloud.google.com/go v0.56.0/go.mod h1:jr7tqZxxKOVYizybht9+26Z/gUq7tiRzu+ACVAMbKVk= -cloud.google.com/go v0.57.0/go.mod h1:oXiQ6Rzq3RAkkY7N6t3TcE6jE+CIBBbA36lwQ1JyzZs= -cloud.google.com/go v0.62.0/go.mod h1:jmCYTdRCQuc1PHIIJ/maLInMho30T/Y0M4hTdTShOYc= -cloud.google.com/go v0.65.0/go.mod h1:O5N8zS7uWy9vkA9vayVHs65eM1ubvY4h553ofrNHObY= -cloud.google.com/go/bigquery v1.0.1/go.mod h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o= -cloud.google.com/go/bigquery v1.3.0/go.mod h1:PjpwJnslEMmckchkHFfq+HTD2DmtT67aNFKH1/VBDHE= -cloud.google.com/go/bigquery v1.4.0/go.mod h1:S8dzgnTigyfTmLBfrtrhyYhwRxG72rYxvftPBK2Dvzc= -cloud.google.com/go/bigquery v1.5.0/go.mod h1:snEHRnqQbz117VIFhE8bmtwIDY80NLUZUMb4Nv6dBIg= -cloud.google.com/go/bigquery v1.7.0/go.mod h1://okPTzCYNXSlb24MZs83e2Do+h+VXtc4gLoIoXIAPc= -cloud.google.com/go/bigquery v1.8.0/go.mod h1:J5hqkt3O0uAFnINi6JXValWIb1v0goeZM77hZzJN/fQ= 
-cloud.google.com/go/datastore v1.0.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7TKLgZqpHYE= -cloud.google.com/go/datastore v1.1.0/go.mod h1:umbIZjpQpHh4hmRpGhH4tLFup+FVzqBi1b3c64qFpCk= -cloud.google.com/go/pubsub v1.0.1/go.mod h1:R0Gpsv3s54REJCy4fxDixWD93lHJMoZTyQ2kNxGRt3I= -cloud.google.com/go/pubsub v1.1.0/go.mod h1:EwwdRX2sKPjnvnqCa270oGRyludottCI76h+R3AArQw= -cloud.google.com/go/pubsub v1.2.0/go.mod h1:jhfEVHT8odbXTkndysNHCcx0awwzvfOlguIAii9o8iA= -cloud.google.com/go/pubsub v1.3.1/go.mod h1:i+ucay31+CNRpDW4Lu78I4xXG+O1r/MAHgjpRVR+TSU= -cloud.google.com/go/storage v1.0.0/go.mod h1:IhtSnM/ZTZV8YYJWCY8RULGVqBDmpoyjwiyrjsg+URw= -cloud.google.com/go/storage v1.5.0/go.mod h1:tpKbwo567HUNpVclU5sGELwQWBDZ8gh0ZeosJ0Rtdos= -cloud.google.com/go/storage v1.6.0/go.mod h1:N7U0C8pVQ/+NIKOBQyamJIeKQKkZ+mxpohlUTyfDhBk= -cloud.google.com/go/storage v1.8.0/go.mod h1:Wv1Oy7z6Yz3DshWRJFhqM/UCfaWIRTdp0RXyy7KQOVs= -cloud.google.com/go/storage v1.10.0/go.mod h1:FLPqc6j+Ki4BU591ie1oL6qBQGu2Bl/tZ9ullr3+Kg0= -dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= -github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= -github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= -github.com/PuerkitoBio/purell v1.1.1 h1:WEQqlqaGbrPkxLJWfBwQmfEAE1Z7ONdDLqrN38tNFfI= -github.com/PuerkitoBio/purell v1.1.1/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0= -github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 h1:d+Bc7a5rLufV/sSk/8dngufqelfh6jnri85riMAaF/M= -github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE= -github.com/agiledragon/gomonkey/v2 v2.8.0 h1:u2K2nNGyk0ippzklz1CWalllEB9ptD+DtSXeCX5O000= -github.com/agiledragon/gomonkey/v2 v2.8.0/go.mod h1:ap1AmDzcVOAz1YpeJ3TCzIgstoaWLA6jbbgxfB4w2iY= -github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod 
h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= -github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI= -github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= -github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= -github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= -github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= -github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= -github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= -github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/docopt/docopt-go v0.0.0-20180111231733-ee0de3bc6815/go.mod h1:WwZ+bS3ebgob9U8Nd0kOddGdZWjyMGR8Wziv+TBNwSE= -github.com/emicklei/go-restful/v3 v3.8.0 h1:eCZ8ulSerjdAiaNpF7GxXIE7ZCMo1moN1qX+S609eVw= -github.com/emicklei/go-restful/v3 v3.8.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= -github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= -github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= -github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= -github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= -github.com/fsnotify/fsnotify v1.6.0 h1:n+5WquG0fcWoWp6xPWfHdbskMCQaFnG6PfBrh1Ky4HY= -github.com/fsnotify/fsnotify v1.6.0/go.mod h1:sl3t1tCWJFWoRz9R8WJCbQihKKwmorjAbSClcnxKAGw= -github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU= -github.com/go-gl/glfw/v3.3/glfw 
v0.0.0-20191125211704-12ad95a8df72/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= -github.com/go-gl/glfw/v3.3/glfw v0.0.0-20200222043503-6f7a984d4dc4/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= -github.com/go-logr/logr v0.1.0/go.mod h1:ixOQHD9gLJUVQQ2ZOR7zLEifBX6tGkNJF4QyIY7sIas= -github.com/go-logr/logr v1.2.0/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= -github.com/go-logr/logr v1.2.3 h1:2DntVwHkVopvECVRSlL5PSo9eG+cAkDCuckLubN+rq0= -github.com/go-logr/logr v1.2.3/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= -github.com/go-openapi/jsonpointer v0.19.3/go.mod h1:Pl9vOtqEWErmShwVjC8pYs9cog34VGT37dQOVbmoatg= -github.com/go-openapi/jsonpointer v0.19.5 h1:gZr+CIYByUqjcgeLXnQu2gHYQC9o73G2XUeOFYEICuY= -github.com/go-openapi/jsonpointer v0.19.5/go.mod h1:Pl9vOtqEWErmShwVjC8pYs9cog34VGT37dQOVbmoatg= -github.com/go-openapi/jsonreference v0.19.5 h1:1WJP/wi4OjB4iV8KVbH73rQaoialJrqv8gitZLxGLtM= -github.com/go-openapi/jsonreference v0.19.5/go.mod h1:RdybgQwPxbL4UEjuAruzK1x3nE69AqPYEJeo/TWfEeg= -github.com/go-openapi/swag v0.19.5/go.mod h1:POnQmlKehdgb5mhVOsnJFsivZCEZ/vjK9gh66Z9tfKk= -github.com/go-openapi/swag v0.19.14 h1:gm3vOOXfiuw5i9p5N9xJvfjvuofpyvLA9Wr6QfK5Fng= -github.com/go-openapi/swag v0.19.14/go.mod h1:QYRuS/SOXUCsnplDa677K7+DxSOj6IPNl/eQntq43wQ= -github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= -github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= -github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= -github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= -github.com/golang/groupcache v0.0.0-20191227052852-215e87163ea7/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= -github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= -github.com/golang/mock v1.1.1/go.mod 
h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= -github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= -github.com/golang/mock v1.3.1/go.mod h1:sBzyDLLjw3U8JLTeZvSv8jJB+tU5PVekmnlKIyFUx0Y= -github.com/golang/mock v1.4.0/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= -github.com/golang/mock v1.4.1/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= -github.com/golang/mock v1.4.3/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= -github.com/golang/mock v1.4.4/go.mod h1:l3mdAwkq5BuhzHwde/uurv3sEJeZMXNpwsxVWU71h+4= -github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= -github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= -github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= -github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= -github.com/golang/protobuf v1.3.4/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= -github.com/golang/protobuf v1.3.5/go.mod h1:6O5/vntMXwX2lRkT1hjjk0nAC1IDOTvTlVgjlRvqsdk= -github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8= -github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA= -github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= -github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= -github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= -github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8= -github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= -github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= -github.com/golang/protobuf v1.5.2 h1:ROPKBNFfQgOUMifHyP+KYbvpjbdoFNs+aK7DXlji0Tw= 
-github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= -github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= -github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= -github.com/google/gnostic v0.5.7-v3refs h1:FhTMOKj2VhjpouxvWJAV1TL304uMlb9zcDqkl6cEI54= -github.com/google/gnostic v0.5.7-v3refs/go.mod h1:73MKFl6jIHelAJNaBGFzt3SPtZULs9dYrGFt8OiIsHQ= -github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= -github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= -github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= -github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.4.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.5.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.5.8 h1:e6P7q2lk1O+qJJb4BtCQXlK8vWEO8V1ZeuEdJNOqZyg= -github.com/google/go-cmp v0.5.8/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= -github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/gofuzz v1.1.0 h1:Hsa8mG0dQ46ij8Sl2AYJDUv1oA9/d6Vk+3LG99Oe02g= -github.com/google/gofuzz v1.1.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs= -github.com/google/martian/v3 v3.0.0/go.mod h1:y5Zk1BBys9G+gd6Jrk0W3cC1+ELVxBWuIGO+w/tUAp0= -github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= -github.com/google/pprof v0.0.0-20190515194954-54271f7e092f/go.mod 
h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= -github.com/google/pprof v0.0.0-20191218002539-d4f498aebedc/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= -github.com/google/pprof v0.0.0-20200212024743-f11f1df84d12/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= -github.com/google/pprof v0.0.0-20200229191704-1ebb73c60ed3/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= -github.com/google/pprof v0.0.0-20200430221834-fc25d7d30c6d/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= -github.com/google/pprof v0.0.0-20200708004538-1a94d8640e99/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= -github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= -github.com/google/uuid v1.1.2 h1:EVhdT+1Kseyi1/pUmXKaFxYsDNy9RQYkMWRH68J/W7Y= -github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg= -github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk= -github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 h1:EGx4pi6eqNxGaHF6qqu48+N2wcFQ5qg5FXgOdqsJ5d8= -github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= -github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= -github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= -github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= -github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= -github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= -github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= -github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= -github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod 
h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU= -github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk= -github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo= -github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU= -github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= -github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= -github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= -github.com/kr/pretty v0.2.0/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= -github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= -github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= -github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= -github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= -github.com/kubeflow/common v0.4.3 h1:vVoOMNPOZK4wzZvQ4rsRLvC3SDi+J1fVKNHSXC/QRvU= -github.com/kubeflow/common v0.4.3/go.mod h1:Qb/5aON7/OWVkN8OnjRqqT0i8X/XzMekRIZ8lkLosj4= -github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= -github.com/mailru/easyjson v0.0.0-20190626092158-b2ccc519800e/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= -github.com/mailru/easyjson v0.7.6 h1:8yTIVnZgCoiM1TgqoeTl+LfU5Jg6/xL3QhGQnimLYnA= -github.com/mailru/easyjson v0.7.6/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= -github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= -github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= -github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= -github.com/modern-go/reflect2 v1.0.2 
h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= -github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= -github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= -github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= -github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e h1:fD57ERR4JtEqsWbfPhv4DMiApHyliiK5xCTNVSPiaAs= -github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= -github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= -github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= -github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d h1:zE9ykElWQ6/NYmHa3jpm/yHnI4xSofP+UP6SpjHcSeM= -github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc= -github.com/smartystreets/goconvey v1.6.4 h1:fv0U8FUIMPNf1L9lnHLvLhgicrIVChEkdzIKYqbNC9s= -github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA= -github.com/spf13/afero v1.2.2/go.mod h1:9ZxEEn6pIJ8Rxe320qSDBk6AsU0r9pR7Q4OcevTdifk= -github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= -github.com/stoewer/go-strcase v1.2.0/go.mod h1:IBiWB2sKIp3wVVQ3Y035++gc+knqhUQag1KpM8ahLw8= -github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= -github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= -github.com/stretchr/testify 
v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= -github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PKk= -github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= -go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= -go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= -go.opencensus.io v0.22.3/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= -go.opencensus.io v0.22.4/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= -golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= -golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= -golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= -golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= -golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= -golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= -golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= -golang.org/x/exp v0.0.0-20190829153037-c13cbed26979/go.mod h1:86+5VVa7VpoJ4kLfm080zCjGlMRFzhUhsZKEZO7MGek= 
-golang.org/x/exp v0.0.0-20191030013958-a1ab85dbe136/go.mod h1:JXzH8nQsPlswgeRAPE3MuO9GYsAcnJvJ4vnMwN/5qkY= -golang.org/x/exp v0.0.0-20191129062945-2f5052295587/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= -golang.org/x/exp v0.0.0-20191227195350-da58074b4299/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= -golang.org/x/exp v0.0.0-20200119233911-0405dc783f0a/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= -golang.org/x/exp v0.0.0-20200207192155-f17229e696bd/go.mod h1:J/WKrq2StrnmMY6+EHIKF9dgMWnmCNThgcyBT1FY9mM= -golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6/go.mod h1:3jZMyOhIsHpP37uCMkUooju7aAi5cS1Q23tOzKc+0MU= -golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= -golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= -golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= -golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= -golang.org/x/lint v0.0.0-20190301231843-5614ed5bae6f/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= -golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= -golang.org/x/lint v0.0.0-20190409202823-959b441ac422/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= -golang.org/x/lint v0.0.0-20190909230951-414d861bb4ac/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= -golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= -golang.org/x/lint v0.0.0-20191125180803-fdd1cda4f05f/go.mod h1:5qLYkcX4OjUUV8bRuDixDT3tpyyb+LUpUlRWLxfhWrs= -golang.org/x/lint v0.0.0-20200130185559-910be7a94367/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= -golang.org/x/lint v0.0.0-20200302205851-738671d3881b/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= -golang.org/x/mobile 
v0.0.0-20190312151609-d3739f865fa6/go.mod h1:z+o9i4GpDbdi3rU15maQ/Ox0txvL9dWGYEHz965HBQE= -golang.org/x/mobile v0.0.0-20190719004257-d2bd2a29d028/go.mod h1:E/iHnbuqvinMTCcRqshq8CkpyQDoeVncDDYHnLhea+o= -golang.org/x/mod v0.0.0-20190513183733-4bf6d317e70e/go.mod h1:mXi4GBBbnImb6dmsKGUJ2LatrhH/nqhxcFungHvyanc= -golang.org/x/mod v0.1.0/go.mod h1:0QHyrYULN0/3qlju5TqG8bIK38QM8yzMo5ekMj3DlcY= -golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= -golang.org/x/mod v0.1.1-0.20191107180719-034126e5016b/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= -golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= -golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= -golang.org/x/net v0.0.0-20190501004415-9ce7a6920f09/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= -golang.org/x/net v0.0.0-20190503192946-f4e77d36d62c/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= -golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= -golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20190628185345-da137c7871d7/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net 
v0.0.0-20190724013045-ca1201d0de80/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20190827160401-ba9fcec4b297/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20191209160850-c0dbc17a3553/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20200114155413-6afb5195e5aa/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20200222125558-5a598a2470a0/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20200301022130-244492dfa37a/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= -golang.org/x/net v0.0.0-20200501053045-e0ff5e5a1de5/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= -golang.org/x/net v0.0.0-20200506145744-7e3656a0809f/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= -golang.org/x/net v0.0.0-20200513185701-a91f0712d120/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= -golang.org/x/net v0.0.0-20200520182314-0ba52f642ac2/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= -golang.org/x/net v0.0.0-20200625001655-4c5254603344/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= -golang.org/x/net v0.0.0-20200707034311-ab3426394381/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= -golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= -golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.0.0-20220722155237-a158d28d115b h1:PxfKdU9lEEDYjdIzOtC4qFWgkU2rGHdKlKowJSMN9h0= -golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod 
h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= -golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= -golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= -golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= -golang.org/x/oauth2 v0.0.0-20191202225959-858c2ad4c8b6/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= -golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= -golang.org/x/oauth2 v0.0.0-20211104180415-d3ed0bb246c8 h1:RerP+noqYHUQ8CMRcPlC2nvTa4dcBIjegkuWdcUDuqg= -golang.org/x/oauth2 v0.0.0-20211104180415-d3ed0bb246c8/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= -golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20200317015054-43a5402ce75a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20200625203802-6e8e738ad208/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod 
h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190502145724-3ef323f4f1fd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190507160741-ecd444e8653b/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190606165138-5da285871e9c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190624142023-c5567b49c5d0/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190726091711-fc99dfbffb4e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20191001151750-bb3f8db39f24/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20191204072324-ce4227a45e2e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20191228213918-04cbcbbfeed8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200113162924-86b910548bc1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200122134326-e047566fdf82/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200212091648-12a6c2dcc1e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200302150141-5c8b2ff67527/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200331124033-c3d80250170d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 
-golang.org/x/sys v0.0.0-20200501052902-10377860bb8e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200511232937-7e40ca221e25/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200515095857-1151b9dac4a9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200523222454-059865788121/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200803210538-64077c9b5642/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.8.0 h1:EBmGv8NaZBZTWvrbjNoL6HVt+IVy3QDQpJs7VRIw3tU= -golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/term v0.0.0-20210927222741-03fcf44c2211 h1:JGgROgKl9N8DuW20oFS5gxc+lE67/N3FcwmBPMe7ArY= -golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= -golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= -golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk= -golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= -golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= -golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= -golang.org/x/time 
v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= -golang.org/x/time v0.0.0-20220210224613-90d013bbcef8 h1:vVKdlvoWBphwdxWKrFZEuM0kGgGLxUOYcY4U/2Vjg44= -golang.org/x/time v0.0.0-20220210224613-90d013bbcef8/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= -golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= -golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= -golang.org/x/tools v0.0.0-20190312151545-0bb0c0a6e846/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= -golang.org/x/tools v0.0.0-20190312170243-e65039ee4138/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= -golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= -golang.org/x/tools v0.0.0-20190425150028-36563e24a262/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= -golang.org/x/tools v0.0.0-20190506145303-2d16b83fe98c/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= -golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= -golang.org/x/tools v0.0.0-20190606124116-d0a3d012864b/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= -golang.org/x/tools v0.0.0-20190621195816-6e04913cbbac/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= -golang.org/x/tools v0.0.0-20190628153133-6cdbf07be9d0/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= -golang.org/x/tools v0.0.0-20190816200558-6889da9d5479/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20190911174233-4f2ddba30aff/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools 
v0.0.0-20191012152004-8de300cfc20a/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20191113191852-77e3bb0ad9e7/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20191115202509-3a792d9c32b2/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20191125144606-a911d9008d1f/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20191130070609-6e064ea0cf2d/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20191216173652-a0e659d51361/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= -golang.org/x/tools v0.0.0-20191227053925-7b8e75db28f4/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= -golang.org/x/tools v0.0.0-20200117161641-43d50277825c/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= -golang.org/x/tools v0.0.0-20200122220014-bf1340f18c4a/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= -golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= -golang.org/x/tools v0.0.0-20200204074204-1cc6d1ef6c74/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= -golang.org/x/tools v0.0.0-20200207183749-b753a1ba74fa/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= -golang.org/x/tools v0.0.0-20200212150539-ea181f53ac56/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= -golang.org/x/tools v0.0.0-20200224181240-023911ca70b2/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= -golang.org/x/tools v0.0.0-20200227222343-706bc42d1f0d/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= -golang.org/x/tools v0.0.0-20200304193943-95d2e580d8eb/go.mod h1:o4KQGtdN14AW+yjsvvwRTJJuXz8XRtIHtEnmAXLyFUw= -golang.org/x/tools v0.0.0-20200312045724-11d5b4c81c7d/go.mod h1:o4KQGtdN14AW+yjsvvwRTJJuXz8XRtIHtEnmAXLyFUw= -golang.org/x/tools 
v0.0.0-20200331025713-a30bf2db82d4/go.mod h1:Sl4aGygMT6LrqrWclx+PTx3U+LnKx/seiNR+3G19Ar8= -golang.org/x/tools v0.0.0-20200501065659-ab2804fb9c9d/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= -golang.org/x/tools v0.0.0-20200512131952-2bc93b1c0c88/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= -golang.org/x/tools v0.0.0-20200515010526-7d3b6ebf133d/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= -golang.org/x/tools v0.0.0-20200618134242-20370b0cb4b2/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= -golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= -golang.org/x/tools v0.0.0-20200729194436-6467de6f59a7/go.mod h1:njjCfa9FT2d7l9Bc6FUM5FLjQPp3cFF28FI3qnDFljA= -golang.org/x/tools v0.0.0-20200804011535-6c149bb5ef0d/go.mod h1:njjCfa9FT2d7l9Bc6FUM5FLjQPp3cFF28FI3qnDFljA= -golang.org/x/tools v0.0.0-20200825202427-b303f430e36d/go.mod h1:njjCfa9FT2d7l9Bc6FUM5FLjQPp3cFF28FI3qnDFljA= -golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE= -google.golang.org/api v0.7.0/go.mod h1:WtwebWUNSVBH/HAw79HIFXZNqEvBhG+Ra+ax0hx3E3M= -google.golang.org/api v0.8.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg= -google.golang.org/api v0.9.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg= -google.golang.org/api v0.13.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= -google.golang.org/api v0.14.0/go.mod 
h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= -google.golang.org/api v0.15.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= -google.golang.org/api v0.17.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= -google.golang.org/api v0.18.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= -google.golang.org/api v0.19.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= -google.golang.org/api v0.20.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= -google.golang.org/api v0.22.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= -google.golang.org/api v0.24.0/go.mod h1:lIXQywCXRcnZPGlsd8NbLnOjtAoL6em04bJ9+z0MncE= -google.golang.org/api v0.28.0/go.mod h1:lIXQywCXRcnZPGlsd8NbLnOjtAoL6em04bJ9+z0MncE= -google.golang.org/api v0.29.0/go.mod h1:Lcubydp8VUV7KeIHD9z2Bys/sm/vGKnG1UHuDBSrHWM= -google.golang.org/api v0.30.0/go.mod h1:QGmEvQ87FHZNiUVJkT14jQNYJ4ZJjdRF23ZXz5138Fc= -google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= -google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= -google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= -google.golang.org/appengine v1.6.1/go.mod h1:i06prIuMbXzDqacNJfV5OdTW448YApPu5ww/cMBSeb0= -google.golang.org/appengine v1.6.5/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= -google.golang.org/appengine v1.6.6/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= -google.golang.org/appengine v1.6.7 h1:FZR1q0exgwxzPzp/aF+VccGrSfxfPpkBqjIIEq3ru6c= -google.golang.org/appengine v1.6.7/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= -google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= -google.golang.org/genproto v0.0.0-20190307195333-5fe7a883aa19/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= -google.golang.org/genproto v0.0.0-20190418145605-e7d98fc518a7/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= 
-google.golang.org/genproto v0.0.0-20190425155659-357c62f0e4bb/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= -google.golang.org/genproto v0.0.0-20190502173448-54afdca5d873/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= -google.golang.org/genproto v0.0.0-20190801165951-fa694d86fc64/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= -google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= -google.golang.org/genproto v0.0.0-20190911173649-1774047e7e51/go.mod h1:IbNlFCBrqXvoKpeg0TB2l7cyZUmoaFKYIwrEpbDKLA8= -google.golang.org/genproto v0.0.0-20191108220845-16a3f7862a1a/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= -google.golang.org/genproto v0.0.0-20191115194625-c23dd37a84c9/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= -google.golang.org/genproto v0.0.0-20191216164720-4f79533eabd1/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= -google.golang.org/genproto v0.0.0-20191230161307-f3c370f40bfb/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= -google.golang.org/genproto v0.0.0-20200115191322-ca5a22157cba/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= -google.golang.org/genproto v0.0.0-20200122232147-0452cf42e150/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= -google.golang.org/genproto v0.0.0-20200204135345-fa8e72b47b90/go.mod h1:GmwEX6Z4W5gMy59cAlVYjN9JhxgbQH6Gn+gFDQe2lzA= -google.golang.org/genproto v0.0.0-20200212174721-66ed5ce911ce/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= -google.golang.org/genproto v0.0.0-20200224152610-e50cd9704f63/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= -google.golang.org/genproto v0.0.0-20200228133532-8c2c7df3a383/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= -google.golang.org/genproto v0.0.0-20200305110556-506484158171/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= -google.golang.org/genproto v0.0.0-20200312145019-da6875a35672/go.mod 
h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= -google.golang.org/genproto v0.0.0-20200331122359-1ee6d9798940/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= -google.golang.org/genproto v0.0.0-20200430143042-b979b6f78d84/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= -google.golang.org/genproto v0.0.0-20200511104702-f5ebc3bea380/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= -google.golang.org/genproto v0.0.0-20200515170657-fc4c6c6a6587/go.mod h1:YsZOwe1myG/8QRHRsmBRE1LrgQY60beZKjly0O1fX9U= -google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= -google.golang.org/genproto v0.0.0-20200618031413-b414f8b61790/go.mod h1:jDfRM7FcilCzHH/e9qn6dsT145K34l5v+OpcnNgKAAA= -google.golang.org/genproto v0.0.0-20200729003335-053ba62fc06f/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= -google.golang.org/genproto v0.0.0-20200804131852-c06518451d9c/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= -google.golang.org/genproto v0.0.0-20200825200019-8632dd797987/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= -google.golang.org/genproto v0.0.0-20201019141844-1ed22bb0c154/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= -google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= -google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= -google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= -google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= -google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= -google.golang.org/grpc v1.26.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= -google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= -google.golang.org/grpc v1.27.1/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= -google.golang.org/grpc v1.28.0/go.mod 
h1:rpkK4SK4GF4Ach/+MFLZUBavHOvF2JJB5uozKKal+60= -google.golang.org/grpc v1.29.1/go.mod h1:itym6AZVZYACWQqET3MqgPpjcuV5QH3BxFS3IjizoKk= -google.golang.org/grpc v1.30.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak= -google.golang.org/grpc v1.31.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak= -google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= -google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= -google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= -google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= -google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= -google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= -google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= -google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= -google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGjtUeSXeh4= -google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c= -google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= -google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= -google.golang.org/protobuf v1.28.0 h1:w43yiav+6bVFTBQFZX0r7ipe9JQ1QsbMgHwbBziscLw= -google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= -gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 
v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f h1:BLraFXnmrev5lT+xlilqcH8XK9/i0At2xKjWk4p6zsU= -gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= -gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= -gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= -gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= -gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= -gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= -honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= -honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= -honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= -honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt0JzvZhAg= -honnef.co/go/tools v0.0.1-2020.1.3/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k= -honnef.co/go/tools v0.0.1-2020.1.4/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k= -k8s.io/api v0.25.3 h1:Q1v5UFfYe87vi5H7NU0p4RXC26PPMT8KOpr1TLQbCMQ= 
-k8s.io/api v0.25.3/go.mod h1:o42gKscFrEVjHdQnyRenACrMtbuJsVdP+WVjqejfzmI= -k8s.io/apimachinery v0.25.3 h1:7o9ium4uyUOM76t6aunP0nZuex7gDf8VGwkR5RcJnQc= -k8s.io/apimachinery v0.25.3/go.mod h1:jaF9C/iPNM1FuLl7Zuy5b9v+n35HGSh6AQ4HYRkCqwo= -k8s.io/client-go v0.25.3 h1:oB4Dyl8d6UbfDHD8Bv8evKylzs3BXzzufLiO27xuPs0= -k8s.io/client-go v0.25.3/go.mod h1:t39LPczAIMwycjcXkVc+CB+PZV69jQuNx4um5ORDjQA= -k8s.io/klog/v2 v2.0.0/go.mod h1:PBfzABfn139FHAV07az/IF9Wp1bkk3vpT2XSJ76fSDE= -k8s.io/klog/v2 v2.70.1 h1:7aaoSdahviPmR+XkS7FyxlkkXs6tHISSG03RxleQAVQ= -k8s.io/klog/v2 v2.70.1/go.mod h1:y1WjHnz7Dj687irZUWR/WLkLc5N1YHtjLdmgWjndZn0= -k8s.io/kube-openapi v0.0.0-20220803162953-67bda5d908f1 h1:MQ8BAZPZlWk3S9K4a9NCkIFQtZShWqoha7snGixVgEA= -k8s.io/kube-openapi v0.0.0-20220803162953-67bda5d908f1/go.mod h1:C/N6wCaBHeBHkHUesQOQy2/MZqGgMAFPqGsGQLdbZBU= -k8s.io/utils v0.0.0-20220728103510-ee6ede2d64ed h1:jAne/RjBTyawwAy0utX5eqigAwz/lQhTmy+Hr/Cpue4= -k8s.io/utils v0.0.0-20220728103510-ee6ede2d64ed/go.mod h1:jPW/WVKK9YHAvNhRxK0md/EJ228hCsBRufyofKtW8HA= -rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= -rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= -rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA= -sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 h1:iXTIw73aPyC+oRdyqqvVJuloN1p0AC/kzH07hu3NE+k= -sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= -sigs.k8s.io/structured-merge-diff/v4 v4.2.3 h1:PRbqxJClWWYMNV1dhaG4NsibJbArud9kFxnAMREiWFE= -sigs.k8s.io/structured-merge-diff/v4 v4.2.3/go.mod h1:qjx8mGObPmV2aSZepjQjbmb2ihdVs8cGKBraizNC69E= -sigs.k8s.io/yaml v1.3.0 h1:a2VclLzOGrwOHDiV8EfBGhvjHvP46CtW5j6POvhYGGo= -sigs.k8s.io/yaml v1.3.0/go.mod h1:GeOyir5tyXNByN85N/dRIT9es5UQNerPYEKK56eTBm8= diff --git a/mind-cluster/component/npu-exporter/.gitignore b/mind-cluster/component/npu-exporter/.gitignore deleted file mode 100644 index 723ef36..0000000 --- 
a/mind-cluster/component/npu-exporter/.gitignore +++ /dev/null @@ -1 +0,0 @@ -.idea \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/LICENSE b/mind-cluster/component/npu-exporter/LICENSE deleted file mode 100644 index f49a4e1..0000000 --- a/mind-cluster/component/npu-exporter/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. 
- - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. 
- - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. 
- - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/README.md b/mind-cluster/component/npu-exporter/README.md deleted file mode 100644 index 4bde4a9..0000000 --- a/mind-cluster/component/npu-exporter/README.md +++ /dev/null @@ -1,42 +0,0 @@ -# NPU-Exporter - -# 组件介绍 - - -Prometheus(普罗米修斯)是一个开源的系统监测和警报工具包,Exporter就是专门为Prometheus提供数据源的组件。由于Prometheus社区的活跃和大量的使用,已经有很多厂商或者服务提供了Exporter,如Prometheus官方的Node Exporter,MySQL官方出的MySQL Server Exporter和NVIDA的NVIDIA GPU Exporter。这些Exporter负责将特定监测对象的指标,转成Prometheus能够识别的数据格式,供Prometheus集成。NPU-Expoter是华为自研的专门收集华为NPU各种监测信息和指标,并封装成Prometheus专用数据格式的一个服务组件。 - - -# 编译NPU-Exporter - -1. 通过git拉取源码,获得npu-exporter。 - - 示例:Npu-Exporter源码放在/home/mind-cluster/component/npu-exporter目录下 - -2. 执行以下命令,进入Npu-Exporter构建目录,执行构建脚本,在“output“目录下生成二进制npu-exporter、yaml文件和Dockerfile等文件。 - - **cd** _/home/mind-cluster/component/_**npu-exporter/build/** - - **chmod +x build.sh** - - **./build.sh** - -3. 执行以下命令,查看**output**生成的软件列表。 - - **ll** _/home/mind-cluster/component/_**npu-exporter/output** - - ``` - drwxr-xr-x 2 root root 4096 Feb 23 07:10 . - drwxr-xr-x 10 root root 4096 Feb 23 07:10 .. 
- -r-------- 1 root root 623 Feb 23 07:10 Dockerfile - -r-------- 1 root root 623 Feb 23 07:10 Dockerfile-310P-1usoc - -r-------- 1 root root 623 Feb 23 07:10 metricConfiguration.json - -r-x------ 1 root root 25481072 Feb 23 07:10 npu-exporter - -r-------- 1 root root 3438 Feb 23 07:10 npu-exporter-310P-1usoc-v6.0.0.yaml - -r-------- 1 root root 3438 Feb 23 07:10 npu-exporter-v6.0.0.yaml - -r-------- 1 root root 623 Feb 23 07:10 pluginConfiguration.json - -r-x------ 1 root root 2579 Feb 23 07:10 run_for_310P_1usoc.sh - ``` - -# 说明 - -1. 当前Npu-Exporter仅支持http启动,如果需要使用https启动,请自行完成代码修改并适配Prometheus \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/build/Dockerfile b/mind-cluster/component/npu-exporter/build/Dockerfile deleted file mode 100644 index 24f9943..0000000 --- a/mind-cluster/component/npu-exporter/build/Dockerfile +++ /dev/null @@ -1,21 +0,0 @@ -FROM ubuntu:22.04 - -RUN useradd -d /home/HwHiAiUser -u 1000 -m -s /usr/sbin/nologin HwHiAiUser &&\ - usermod root -s /usr/sbin/nologin - -COPY ./npu-exporter /usr/local/bin/ -COPY ./metricConfiguration.json /usr/local/metricConfiguration.json -COPY ./pluginConfiguration.json /usr/local/pluginConfiguration.json - -RUN chown root:root /usr/local/bin/npu-exporter &&\ - chmod 750 -R /home/HwHiAiUser &&\ - chmod 550 /usr/local/bin/ &&\ - chmod 500 /usr/local/bin/npu-exporter &&\ - chmod 440 /usr/local/metricConfiguration.json &&\ - chmod 440 /usr/local/pluginConfiguration.json &&\ - echo 'umask 027' >> /etc/profile && \ - echo 'source /etc/profile' >> ~/.bashrc -ENV LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/add-ons:/usr/local/Ascend/driver/lib64:/usr/local/dcmi - -CMD /usr/local/bin/npu-exporter - diff --git a/mind-cluster/component/npu-exporter/build/Dockerfile-310P-1usoc b/mind-cluster/component/npu-exporter/build/Dockerfile-310P-1usoc deleted file mode 100644 index 5927f7d..0000000 --- 
a/mind-cluster/component/npu-exporter/build/Dockerfile-310P-1usoc +++ /dev/null @@ -1,31 +0,0 @@ -FROM ubuntu:22.04 - -RUN groupadd -g 1000 HwHiAiUser && useradd -u 1000 -g HwHiAiUser -d /home/HwHiAiUser -m HwHiAiUser &&\ - groupadd -g 1101 HwDmUser && useradd -u 1101 -g HwDmUser -d /home/HwDmUser -m HwDmUser &&\ - groupadd -g 1102 HwBaseUser && useradd -u 1102 -g HwBaseUser -d /home/HwBaseUser -m HwBaseUser &&\ - usermod -a -G HwBaseUser HwHiAiUser &&\ - usermod -a -G HwDmUser HwHiAiUser &&\ - usermod -a -G HwBaseUser HwDmUser &&\ - usermod -a -G HwHiAiUser HwDmUser &&\ - usermod root -s /usr/sbin/nologin - -COPY ./npu-exporter /usr/local/bin/ -COPY ./run_for_310P_1usoc.sh / -COPY ./metricConfiguration.json /usr/local/metricConfiguration.json -COPY ./pluginConfiguration.json /usr/local/pluginConfiguration.json - -RUN chown root:root /usr/local/bin/npu-exporter &&\ - chmod 500 /run_for_310P_1usoc.sh &&\ - chmod 550 /usr/local/bin/ &&\ - chmod 500 /usr/local/bin/npu-exporter &&\ - chmod 440 /usr/local/metricConfiguration.json &&\ - chmod 440 /usr/local/pluginConfiguration.json &&\ - echo 'umask 027' >> /etc/profile && \ - echo 'source /etc/profile' >> ~/.bashrc - -RUN ln -s /lib /lib64 2>&1 >> /dev/null &&\ - mkdir -m 750 /var/driver -m 750 /var/dmp -m 750 /usr/slog -p -m 750 /home/drv/hdc_ppc &&\ - chown HwDmUser:HwDmUser /var/dmp &&\ - chown HwHiAiUser:HwHiAiUser /var/driver &&\ - chown HwHiAiUser:HwHiAiUser /home/drv/hdc_ppc &&\ - chown HwHiAiUser:HwHiAiUser /usr/slog \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/build/build.sh b/mind-cluster/component/npu-exporter/build/build.sh deleted file mode 100644 index 16c101d..0000000 --- a/mind-cluster/component/npu-exporter/build/build.sh +++ /dev/null @@ -1,80 +0,0 @@ -#!/bin/bash -# Perform build npu-exporter -# Copyright @ Huawei Technologies CO., Ltd. 2020-2023. 
All rights reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ - -set -e -CUR_DIR=$(dirname $(readlink -f $0)) -TOP_DIR=$(realpath "${CUR_DIR}"/..) -export GO111MODULE="on" -VER_FILE="${TOP_DIR}"/service_config.ini -build_version="v6.0.0" -if [ -f "$VER_FILE" ]; then - line=$(sed -n '1p' "$VER_FILE" 2>&1) - #cut the chars after ':' and add char 'v', the final example is v3.0.0 - build_version="v"${line#*=} -fi - -arch=$(arch 2>&1) -echo "Build Architecture is" "${arch}" - -OUTPUT_NAME="npu-exporter" -DOCKER_FILE_NAME="Dockerfile" -A200ISOC_DOCKER_FILE_NAME="Dockerfile-310P-1usoc" -A200ISOC_RUN_SHELL="run_for_310P_1usoc.sh" - -function clean() { - rm -rf "${TOP_DIR}"/output - mkdir -p "${TOP_DIR}"/output -} - -function build() { - cd "${TOP_DIR}/cmd/npu-exporter" - CGO_CFLAGS="-fstack-protector-strong -D_FORTIFY_SOURCE=2 -O2 -fPIC -ftrapv" - CGO_CPPFLAGS="-fstack-protector-strong -D_FORTIFY_SOURCE=2 -O2 -fPIC -ftrapv" - go build -mod=mod -buildmode=pie -ldflags "-s -extldflags=-Wl,-z,now -X huawei.com/npu-exporter/v6/versions.BuildName=${OUTPUT_NAME} \ - -X huawei.com/npu-exporter/v6/versions.BuildVersion=${build_version}_linux-${arch}" \ - -o ${OUTPUT_NAME} - ls ${OUTPUT_NAME} - if [ $? 
-ne 0 ]; then - echo "fail to find npu-exporter" - exit 1 - fi -} - -function mv_file() { - mv "${TOP_DIR}"/cmd/npu-exporter/${OUTPUT_NAME} "${TOP_DIR}"/output - cp "${TOP_DIR}"/build/npu-exporter.yaml "${TOP_DIR}"/output/npu-exporter-"${build_version}".yaml - cp "${TOP_DIR}"/build/npu-exporter-310P-1usoc.yaml "${TOP_DIR}"/output/npu-exporter-310P-1usoc-"${build_version}".yaml - cp "${TOP_DIR}"/build/metricConfiguration.json "${TOP_DIR}"/output/ - cp "${TOP_DIR}"/build/pluginConfiguration.json "${TOP_DIR}"/output/ - sed -i "s/npu-exporter:.*/npu-exporter:${build_version}/" "${TOP_DIR}"/output/npu-exporter-"${build_version}".yaml - sed -i "s/npu-exporter:.*/npu-exporter:${build_version}/" "${TOP_DIR}"/output/npu-exporter-310P-1usoc-"${build_version}".yaml - cp "${TOP_DIR}"/build/${DOCKER_FILE_NAME} "${TOP_DIR}"/output - cp "${TOP_DIR}"/build/${A200ISOC_DOCKER_FILE_NAME} "${TOP_DIR}"/output - cp "${TOP_DIR}"/build/${A200ISOC_RUN_SHELL} "${TOP_DIR}"/output - chmod 400 "${TOP_DIR}"/output/* - chmod 500 "${TOP_DIR}"/output/${OUTPUT_NAME} - chmod 500 "${TOP_DIR}"/output/${A200ISOC_RUN_SHELL} - -} - -function main() { - clean - build - mv_file -} - -main diff --git a/mind-cluster/component/npu-exporter/build/build_ch.sh b/mind-cluster/component/npu-exporter/build/build_ch.sh deleted file mode 100644 index 878fcbd..0000000 --- a/mind-cluster/component/npu-exporter/build/build_ch.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/bin/bash -# Perform build npu-exporter -# Copyright @ Huawei Technologies CO., Ltd. 2025-2025. All rights reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ - -set -e -CUR_DIR=$(dirname $(readlink -f $0)) -TOP_DIR=$(realpath "${CUR_DIR}"/..) -export GO111MODULE="on" -VER_FILE="${TOP_DIR}"/service_config.ini -build_version="v6.0.0" -if [ -f "$VER_FILE" ]; then - line=$(sed -n '1p' "$VER_FILE" 2>&1) - #cut the chars after ':' and add char 'v', the final example is v3.0.0 - build_version="v"${line#*=} -fi - -arch=$(arch 2>&1) -echo "Build Architecture is" "${arch}" - -OUTPUT_NAME="npu-exporter" -DOCKER_FILE_NAME="Dockerfile" - - -function clean() { - rm -rf "${TOP_DIR}"/output - mkdir -p "${TOP_DIR}"/output -} - -function build() { - cd "${TOP_DIR}/cmd/npu-exporter" - CGO_CFLAGS="-fstack-protector-strong -D_FORTIFY_SOURCE=2 -O2 -fPIC -ftrapv" - CGO_CPPFLAGS="-fstack-protector-strong -D_FORTIFY_SOURCE=2 -O2 -fPIC -ftrapv" - go build -mod=mod -buildmode=pie -ldflags "-s -extldflags=-Wl,-z,now -X huawei.com/npu-exporter/v6/versions.BuildName=${OUTPUT_NAME} \ - -X huawei.com/npu-exporter/v6/versions.BuildVersion=${build_version}_linux-${arch}" \ - -o ${OUTPUT_NAME} - ls ${OUTPUT_NAME} - if [ $? 
-ne 0 ]; then - echo "fail to find npu-exporter" - exit 1 - fi -} - -function mv_file() { - mv "${TOP_DIR}"/cmd/npu-exporter/${OUTPUT_NAME} "${TOP_DIR}"/output - cp "${TOP_DIR}"/build/npu-exporter.yaml "${TOP_DIR}"/output/npu-exporter-"${build_version}".yaml - sed -i "s/npu-exporter:.*/npu-exporter:${build_version}/" "${TOP_DIR}"/output/npu-exporter-"${build_version}".yaml - sed -i "s/ascend*/alan/" "${TOP_DIR}"/output/npu-exporter-"${build_version}".yaml - - cp "${TOP_DIR}"/build/${DOCKER_FILE_NAME} "${TOP_DIR}"/output - chmod 400 "${TOP_DIR}"/output/* - chmod 500 "${TOP_DIR}"/output/${OUTPUT_NAME} - -} - -function main() { - clean - build - mv_file -} - -main diff --git a/mind-cluster/component/npu-exporter/build/metricConfiguration.json b/mind-cluster/component/npu-exporter/build/metricConfiguration.json deleted file mode 100644 index 3dbd82b..0000000 --- a/mind-cluster/component/npu-exporter/build/metricConfiguration.json +++ /dev/null @@ -1,13 +0,0 @@ -[ - {"metricsGroup": "ddr", "state": "ON"}, - {"metricsGroup": "hccs", "state": "ON"}, - {"metricsGroup": "npu", "state": "ON"}, - {"metricsGroup": "network", "state": "ON"}, - {"metricsGroup": "pcie", "state": "ON"}, - {"metricsGroup": "roce", "state": "ON"}, - {"metricsGroup": "sio", "state": "ON"}, - {"metricsGroup": "vnpu", "state": "ON"}, - {"metricsGroup": "version", "state": "ON"}, - {"metricsGroup": "optical", "state": "ON"}, - {"metricsGroup": "hbm", "state": "ON"} -] \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/build/npu-exporter-310P-1usoc.yaml b/mind-cluster/component/npu-exporter/build/npu-exporter-310P-1usoc.yaml deleted file mode 100644 index 3b6e22f..0000000 --- a/mind-cluster/component/npu-exporter/build/npu-exporter-310P-1usoc.yaml +++ /dev/null @@ -1,167 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - name: npu-exporter ---- -apiVersion: networking.k8s.io/v1 -kind: NetworkPolicy -metadata: - name: exporter-network-policy - namespace: npu-exporter -spec: - 
podSelector: - matchLabels: - app: npu-exporter - policyTypes: - - Ingress - - Egress - ingress: - - from: - - namespaceSelector: {} - podSelector: - matchLabels: - app: prometheus - egress: - - to: - - namespaceSelector: {} - podSelector: - matchLabels: - app: prometheus ---- -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: npu-exporter-310p-1usoc - namespace: npu-exporter -spec: - selector: - matchLabels: - app: npu-exporter - template: - metadata: - ##### For Kubernetes versions lower than 1.19, seccomp is used with annotations. - annotations: - seccomp.security.alpha.kubernetes.io/pod: runtime/default - labels: - app: npu-exporter - spec: - ##### For Kubernetes version 1.19 and above, seccomp is used with securityContext:seccompProfile -# securityContext: -# seccompProfile: -# type: RuntimeDefault - automountServiceAccountToken: false - nodeSelector: - workerselector: dls-worker-node - servertype: soc - containers: - - name: npu-exporter - image: npu-exporter:v5.0.RC1 - resources: - requests: - memory: 1000Mi - cpu: 1000m - limits: - memory: 1000Mi - cpu: 1000m - imagePullPolicy: Never - command: [ "/bin/bash", "-c", "/run_for_310P_1usoc.sh"] - # pair firstly - securityContext: - privileged: true - readOnlyRootFilesystem: true - runAsUser: 0 - runAsGroup: 0 - ports: - - name: http - containerPort: 8082 - protocol: TCP - volumeMounts: - - name: log-npu-exporter - mountPath: /var/log/mindx-dl/npu-exporter - - name: localtime - mountPath: /etc/localtime - readOnly: true - - name: ascend-driver - mountPath: /usr/local/Ascend/driver - readOnly: true - - name: ascend-dcmi - mountPath: /usr/local/dcmi - readOnly: true - - name: libyaml - mountPath: /usr/lib64/libyaml-0.so.2 - readOnly: true - - name: docker-shim # delete when only use containerd - mountPath: /run/dockershim.sock - readOnly: true - - name: docker # delete when only use containerd - mountPath: /run/docker/containerd/containerd.sock - readOnly: true - - name: cri-dockerd # reserve when k8s 
version is 1.24+ and the container runtime is docker - mountPath: /var/run/cri-dockerd.sock - readOnly: true - - name: containerd - mountPath: /run/containerd - readOnly: true - - name: tmp - mountPath: /tmp - - name: dmp - mountPath: /var/dmp_daemon - readOnly: true - - name: slogd - mountPath: /var/slogd - readOnly: true - - name: hbasic - mountPath: /etc/hdcBasic.cfg - readOnly: true - - name: slogconf - mountPath: /etc/slog.conf - readOnly: true - volumes: - - name: log-npu-exporter - hostPath: - path: /var/log/mindx-dl/npu-exporter - type: Directory - - name: localtime - hostPath: - path: /etc/localtime - - name: libyaml - hostPath: - path: /usr/lib64/libyaml-0.so.2 - type: File - - name: ascend-driver - hostPath: - path: /usr/local/Ascend/driver - - name: ascend-dcmi - hostPath: - path: /usr/local/dcmi - - name: docker-shim # delete when only use containerd - hostPath: - path: /run/dockershim.sock - - name: docker # delete when only use containerd - hostPath: - path: /run/docker/containerd/containerd.sock - - name: cri-dockerd # reserve when k8s version is 1.24+ and the container runtime is docker - hostPath: - path: /var/run/cri-dockerd.sock - - name: containerd - hostPath: - path: /run/containerd - - name: tmp - hostPath: - path: /tmp - - name: dmp - hostPath: - path: /var/dmp_daemon - type: File - - name: slogd - hostPath: - path: /var/slogd - type: File - - name: hbasic - hostPath: - path: /etc/hdcBasic.cfg - type: File - - name: slogconf - hostPath: - path: /etc/slog.conf - type: File \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/build/npu-exporter.yaml b/mind-cluster/component/npu-exporter/build/npu-exporter.yaml deleted file mode 100644 index 970e3cf..0000000 --- a/mind-cluster/component/npu-exporter/build/npu-exporter.yaml +++ /dev/null @@ -1,140 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - name: npu-exporter ---- -apiVersion: networking.k8s.io/v1 -kind: NetworkPolicy -metadata: - name: exporter-network-policy - 
namespace: npu-exporter -spec: - podSelector: - matchLabels: - app: npu-exporter - policyTypes: - - Ingress - - Egress - ingress: - - from: - - namespaceSelector: {} - podSelector: - matchLabels: - app: prometheus - egress: - - to: - - namespaceSelector: {} - podSelector: - matchLabels: - app: prometheus ---- -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: npu-exporter - namespace: npu-exporter -spec: - selector: - matchLabels: - app: npu-exporter - template: - metadata: - ##### For Kubernetes versions lower than 1.19, seccomp is used with annotations. - annotations: - seccomp.security.alpha.kubernetes.io/pod: runtime/default - labels: - app: npu-exporter - spec: - ##### For Kubernetes version 1.19 and above, seccomp is used with securityContext:seccompProfile -# securityContext: -# seccompProfile: -# type: RuntimeDefault - automountServiceAccountToken: false - nodeSelector: - workerselector: dls-worker-node - containers: - - name: npu-exporter - image: npu-exporter:v5.0.RC1 - resources: - requests: - memory: 1000Mi - cpu: 1000m - limits: - memory: 1000Mi - cpu: 1000m - imagePullPolicy: Never - command: [ "/bin/bash", "-c", "--"] - # pair firstly - args: [ "umask 027;npu-exporter -port=8082 -ip=0.0.0.0 -updateTime=5 - -logFile=/var/log/mindx-dl/npu-exporter/npu-exporter.log -logLevel=0 -containerMode=docker" ] - securityContext: - privileged: true - readOnlyRootFilesystem: true - runAsUser: 0 - runAsGroup: 0 - ports: - - name: http - containerPort: 8082 - protocol: TCP - volumeMounts: - - name: log-npu-exporter - mountPath: /var/log/mindx-dl/npu-exporter - - name: localtime - mountPath: /etc/localtime - readOnly: true - - name: ascend-driver - mountPath: /usr/local/Ascend/driver - readOnly: true - - name: ascend-dcmi - mountPath: /usr/local/dcmi - readOnly: true - - name: docker-shim # delete when only use containerd or isula - mountPath: /var/run/dockershim.sock - readOnly: true - - name: docker # delete when only use containerd or isula - mountPath: 
/var/run/docker - readOnly: true - - name: cri-dockerd # reserve when k8s version is 1.24+ and the container runtime is docker - mountPath: /var/run/cri-dockerd.sock - readOnly: true - - name: containerd # delete when only use isula - mountPath: /run/containerd - readOnly: true - - name: isulad # delete when use containerd or docker - mountPath: /run/isulad.sock - readOnly: true - - name: tmp - mountPath: /tmp - volumes: - - name: log-npu-exporter - hostPath: - path: /var/log/mindx-dl/npu-exporter - type: Directory - - name: localtime - hostPath: - path: /etc/localtime - - name: ascend-driver - hostPath: - path: /usr/local/Ascend/driver - - name: ascend-dcmi - hostPath: - path: /usr/local/dcmi - - name: docker-shim # delete when only use containerd or isula - hostPath: - path: /var/run/dockershim.sock - - name: docker # delete when only use containerd or isula - hostPath: - path: /var/run/docker - - name: cri-dockerd # reserve when k8s version is 1.24+ and the container runtime is docker - hostPath: - path: /var/run/cri-dockerd.sock - - name: containerd # delete when only use isula - hostPath: - path: /run/containerd - - name: isulad # delete when use containerd or docker - hostPath: - path: /run/isulad.sock - - name: tmp - hostPath: - path: /tmp - diff --git a/mind-cluster/component/npu-exporter/build/pluginConfiguration.json b/mind-cluster/component/npu-exporter/build/pluginConfiguration.json deleted file mode 100644 index 68823e0..0000000 --- a/mind-cluster/component/npu-exporter/build/pluginConfiguration.json +++ /dev/null @@ -1,4 +0,0 @@ -[ - {"metricsGroup": "MyPlugin", "state": "OFF"}, - {"metricsGroup": "text", "state": "ON"} -] \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/build/run_for_310P_1usoc.sh b/mind-cluster/component/npu-exporter/build/run_for_310P_1usoc.sh deleted file mode 100644 index 055ed41..0000000 --- a/mind-cluster/component/npu-exporter/build/run_for_310P_1usoc.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash 
-# Perform build npu-exporter -# Copyright @ Huawei Technologies CO., Ltd. 2022-2022. All rights reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ - -set -e - -# log process run in background -echo -e "[INFO]\t $(date +"%F %T:%N")\t start slogd server in background" -su - HwHiAiUser -c "export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/:/usr/lib64 && /var/slogd -d &" -echo -e "[INFO]\t $(date +"%F %T:%N")\t start dmp_daemon server in background" -# dcmi interface process run in background -su - HwDmUser -c "export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/:/usr/lib64 && /var/dmp_daemon -I -M -U 8087 &" - -export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/add-ons:/usr/local/Ascend/driver/lib64:/usr/local/dcmi -# the host is openEuler, so the parameters "endpoint" and "containerd" are set to adapt to "-containerMode=docker" in default -# in openEuler os, the path of parameters "endpoint" and "containerd" are not in the default place -echo -e "[INFO]\t $(date +"%F %T:%N")\t start npu-exporter server" -/usr/local/bin/npu-exporter -port=8082 -ip=0.0.0.0 -updateTime=5 -logFile=/var/log/mindx-dl/npu-exporter/npu-exporter.log -logLevel=0 -containerMode=docker -endpoint=/run/dockershim.sock -containerd=/run/docker/containerd/containerd.sock - diff --git 
a/mind-cluster/component/npu-exporter/build/test.sh b/mind-cluster/component/npu-exporter/build/test.sh deleted file mode 100644 index 097eb3a..0000000 --- a/mind-cluster/component/npu-exporter/build/test.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/bin/bash -# Perform test for npu-exporter -# Copyright @ Huawei Technologies CO., Ltd. 2020-2020. All rights reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -set -e - - -# execute go test and echo result to report files -function execute_test() { - if ! (go test -v -race -coverprofile cov.out "${TOP_DIR}"/... >./"$file_input") - then - echo '****** go test cases error! ******' - cat $file_input - exit 1 - else - gocov convert cov.out | gocov-html >"$file_detail_output" - gotestsum --junitfile unit-tests.xml "${TOP_DIR}"/... - - total_coverage=$(go tool cover -func=cov.out | grep "total:" | awk '{print $3}'| sed 's/%//') - # round up - coverage=$(echo "$total_coverage" | awk '{if ($1 >= 0) print ($1 == int($1)) ? int($1) : int($1) + 1;\ - else print ($1 == int($1)) ? int($1) : int($1)}') - if [[ $coverage -ge 80 ]]; then - echo "coverage passed: $coverage%" - exit 0 - else - echo "coverage failed: $coverage%, it needs to be greater than 80%." 
- exit 1 - fi - fi -} - - -export GO111MODULE="on" -export PATH=$GOPATH/bin:$PATH -export GOFLAGS="-gcflags=all=-l" -unset GOPATH -# if didn't install the following tools, please install firstly -#go get -insecure github.com/axw/gocov/gocov -#go get github.com/matm/gocov-html -CUR_DIR=$(dirname "$(readlink -f "$0")") -TOP_DIR=$(realpath "${CUR_DIR}"/..) - -file_input='testExporter.txt' -file_detail_output='api.html' - -if [ -f "${TOP_DIR}"/test ]; then - rm -rf "${TOP_DIR}"/test -fi -mkdir -p "${TOP_DIR}"/test -cd "${TOP_DIR}"/test -echo "clean old version test results" - -if [ -f "$file_input" ]; then - rm -rf "$file_input" -fi -if [ -f "$file_detail_output" ]; then - rm -rf "$file_detail_output" -fi - -echo "************************************* Start LLT Test *************************************" -execute_test -echo "************************************* End LLT Test *************************************" diff --git a/mind-cluster/component/npu-exporter/cmd/npu-exporter/main.go b/mind-cluster/component/npu-exporter/cmd/npu-exporter/main.go deleted file mode 100644 index 700b248..0000000 --- a/mind-cluster/component/npu-exporter/cmd/npu-exporter/main.go +++ /dev/null @@ -1,545 +0,0 @@ -/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package main -package main - -import ( - "context" - "errors" - "flag" - "fmt" - "log" - "net" - "net/http" - "os" - "regexp" - "strconv" - "strings" - "sync" - "time" - - "github.com/influxdata/telegraf/plugins/common/shim" - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promhttp" - - "ascend-common/api" - "ascend-common/common-utils/hwlog" - "ascend-common/common-utils/limiter" - "ascend-common/devmanager" - "ascend-common/devmanager/common" - colcommon "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/config" - "huawei.com/npu-exporter/v6/collector/container" - _ "huawei.com/npu-exporter/v6/platforms/inputs/npu" - "huawei.com/npu-exporter/v6/platforms/prom" - "huawei.com/npu-exporter/v6/plugins" - "huawei.com/npu-exporter/v6/utils/logger" - "huawei.com/npu-exporter/v6/versions" -) - -var ( - port int - updateTime int - ip = "" - version bool - concurrency int - containerMode = "" - containerd = "" - endpoint = "" - limitIPReq = "" - platform = "" - textMetricsFilePath = "" - limitIPConn int - limitTotalConn int - cacheSize int - profilingTime int - hccsBWProfilingTime int - pollInterval time.Duration - deviceResetTimeout int -) - -const ( - portConst = 8082 - updateTimeConst = 5 - cacheTime = 100 * time.Second - portLeft = 1025 - portRight = 40000 - oneMinute = 60 - defaultConcurrency = 5 - defaultLogFile = "/var/log/mindx-dl/npu-exporter/npu-exporter.log" - containerModeDocker = "docker" - containerModeContainerd = "containerd" - containerModeIsula = "isula" - unixPre = "unix://" - timeout = 10 - maxHeaderBytes = 1024 - // tenDays ten days - tenDays = 10 - maxIPConnLimit = 128 - maxConcurrency = 512 - defaultConnection = 20 - maxProfilingTime = 2000 - minHccsBWProfilingTime = 1 - maxHccsBWProfilingTime = 1000 - defaultShutDownTimeout = 30 * time.Second -) - -const ( - prometheusPlatform = "Prometheus" - telegrafPlatform = "Telegraf" - pollIntervalStr = 
"poll_interval" - platformStr = "platform" - defaultProfilingTime = 200 - defaultHccsBwProfilingTime = 200 -) - -func main() { - flag.Parse() - if version { - fmt.Printf("NPU-exporter version: %s \n", versions.BuildVersion) - return - } - err := logger.InitLogger(platform) - if err != nil { - fmt.Fprintf(os.Stderr, "%v", err) - return - } - initPaprams() - err = paramValid(platform) - if err != nil { - return - } - dmgr, err := devmanager.AutoInit("", deviceResetTimeout) - if err != nil { - logger.Errorf("new npu collector failed, error is %v", err) - return - } - logger.Infof("npu exporter starting and the version is %s", versions.BuildVersion) - deviceParser := container.MakeDevicesParser(readCntMonitoringFlags()) - defer deviceParser.Close() - - if err := deviceParser.Init(); err != nil { - logger.Errorf("failed to init devices parser: %v", err) - } - deviceParser.Timeout = time.Duration(updateTime) * time.Second - - colcommon.Collector = colcommon.NewNpuCollector(cacheTime, time.Duration(updateTime)*time.Second, deviceParser, dmgr) - plugins.InitTextMetricsDesc(textMetricsFilePath) - plugins.RegisterPlugin() - config.Register(colcommon.Collector) - - ctx, cancel := context.WithCancel(context.Background()) - wg := &sync.WaitGroup{} - colcommon.InitCardInfo(wg, ctx, colcommon.Collector) - colcommon.StartContainerInfoCollect(ctx, cancel, wg, colcommon.Collector) - - colcommon.StartCollect(wg, ctx, colcommon.Collector) - switch platform { - case prometheusPlatform: - prometheusProcss(wg, ctx, cancel) - case telegrafPlatform: - telegrafProcess() - default: - err = fmt.Errorf("err platform input") - } - wg.Wait() -} - -func prometheusProcss(wg *sync.WaitGroup, ctx context.Context, cancel context.CancelFunc) { - c := prom.NewPrometheusCollector(colcommon.Collector) - reg := prometheus.NewRegistry() - reg.MustRegister(c) - - wg.Add(1) - go func() { - startServe(ctx, cancel, reg) - wg.Done() - }() -} - -func initPaprams() { - 
common.SetHccsBWProfilingTime(hccsBWProfilingTime) - common.SetExternalParams(profilingTime) -} - -func paramValid(platform string) error { - var err error - switch platform { - case prometheusPlatform: - err = paramValidInPrometheus() - case telegrafPlatform: - err = paramValidInTelegraf() - default: - err = fmt.Errorf("err platform input") - } - if err != nil { - logger.Error(err) - return err - } - return nil -} - -func initConfig() *limiter.HandlerConfig { - conf := &limiter.HandlerConfig{ - PrintLog: true, - Method: http.MethodGet, - LimitBytes: limiter.DefaultDataLimit, - TotalConCurrency: concurrency, - IPConCurrency: limitIPReq, - CacheSize: limiter.DefaultCacheSize, - } - return conf -} - -func newServerAndListener(conf *limiter.HandlerConfig) (*http.Server, net.Listener) { - handler, err := limiter.NewLimitHandlerV2(http.DefaultServeMux, conf) - if err != nil { - hwlog.RunLog.Error(err) - return nil, nil - } - s := &http.Server{ - Addr: ip + ":" + strconv.Itoa(port), - Handler: handler, - ReadTimeout: timeout * time.Second, - WriteTimeout: timeout * time.Second, - MaxHeaderBytes: maxHeaderBytes, - ErrorLog: log.New(&hwlog.SelfLogWriter{}, "", log.Lshortfile), - } - ln, err := net.Listen("tcp", s.Addr) - if err != nil { - logger.Errorf("listen ip and port error: %v", err) - return nil, nil - } - limitLs, err := limiter.LimitListener(ln, limitTotalConn, limitIPConn, limiter.DefaultCacheSize) - if err != nil { - hwlog.RunLog.Error(err) - return nil, nil - } - return s, limitLs -} - -func readCntMonitoringFlags() container.CntNpuMonitorOpts { - opts := container.CntNpuMonitorOpts{UseOciBackup: true, UseCriBackup: true} - switch containerMode { - case containerModeDocker: - opts.EndpointType = container.EndpointTypeDockerd - opts.OciEndpoint = container.DefaultDockerAddr - opts.CriEndpoint = container.DefaultDockerShim - case containerModeContainerd: - opts.EndpointType = container.EndpointTypeContainerd - opts.OciEndpoint = container.DefaultContainerdAddr - 
opts.CriEndpoint = container.DefaultContainerdAddr - case containerModeIsula: - opts.EndpointType = container.EndpointTypeIsula - opts.OciEndpoint = container.DefaultIsuladAddr - opts.CriEndpoint = container.DefaultIsuladAddr - default: - hwlog.RunLog.Error("invalid container mode setting,reset to docker") - opts.EndpointType = container.EndpointTypeDockerd - opts.OciEndpoint = container.DefaultDockerAddr - opts.CriEndpoint = container.DefaultDockerShim - } - if containerd != "" { - opts.OciEndpoint = containerd - opts.UseOciBackup = false - } - if endpoint != "" { - opts.CriEndpoint = endpoint - opts.UseCriBackup = false - } - return opts -} - -func checkIPAndPortInPrometheus() error { - if port < portLeft || port > portRight { - return errors.New("the port is invalid") - } - parsedIP := net.ParseIP(ip) - if parsedIP == nil { - return errors.New("the listen ip is invalid") - } - ip = parsedIP.String() - logger.Infof("listen on: %s", ip) - return nil -} - -func paramValidInPrometheus() error { - checks := []func() error{ - checkIPAndPortInPrometheus, - checkUpdateTime, - containerSockCheck, - checkLimitIPReqFormat, - checkLimitIPConn, - checkLimitTotalConn, - checkCacheSize, - checkConcurrency, - checkProfilingTime, - checkHccsBWProfilingTime, - checkDeviceResetTimeout, - checkPollIntervalInCmdLine, - } - - for _, check := range checks { - if err := check(); err != nil { - return err - } - } - return nil -} - -func checkUpdateTime() error { - if updateTime > oneMinute || updateTime < 1 { - return errors.New("the updateTime is invalid") - } - return nil -} - -func checkLimitIPReqFormat() error { - reg := regexp.MustCompile(limiter.IPReqLimitReg) - if !reg.Match([]byte(limitIPReq)) { - return errors.New("limitIPReq format error") - } - return nil -} - -func checkLimitIPConn() error { - if limitIPConn < 1 || limitIPConn > maxIPConnLimit { - return errors.New("limitIPConn is invalid") - } - return nil -} - -func checkLimitTotalConn() error { - if limitTotalConn < 1 || 
limitTotalConn > maxConcurrency { - return errors.New("limitTotalConn is invalid") - } - return nil -} - -func checkCacheSize() error { - if cacheSize < 1 || cacheSize > limiter.DefaultCacheSize*tenDays { - return errors.New("cacheSize is invalid") - } - return nil -} - -func checkConcurrency() error { - if concurrency < 1 || concurrency > maxConcurrency { - return errors.New("concurrency is invalid") - } - return nil -} - -func checkProfilingTime() error { - if profilingTime < 1 || profilingTime > maxProfilingTime { - return errors.New("profilingTime range error") - } - return nil -} - -func checkHccsBWProfilingTime() error { - if hccsBWProfilingTime < minHccsBWProfilingTime || hccsBWProfilingTime > maxHccsBWProfilingTime { - return errors.New("hccsBWProfilingTime range error") - } - return nil -} - -func checkDeviceResetTimeout() error { - if deviceResetTimeout < api.MinDeviceResetTimeout || deviceResetTimeout > api.MaxDeviceResetTimeout { - return errors.New("deviceResetTimeout range error") - } - return nil -} - -func checkPollIntervalInCmdLine() error { - cmdLine := strings.Join(os.Args[1:], "") - if strings.Contains(cmdLine, pollIntervalStr) { - return fmt.Errorf("%s is not support this scene", pollIntervalStr) - } - return nil -} - -func containerSockCheck() error { - if endpoint != "" && !strings.Contains(endpoint, ".sock") { - return errors.New("endpoint file is not sock address") - } - if containerd != "" && !strings.Contains(containerd, ".sock") { - return errors.New("containerd file is not sock address") - } - if endpoint != "" && !strings.Contains(endpoint, unixPre) { - endpoint = unixPre + endpoint - } - if containerd != "" && !strings.Contains(containerd, unixPre) { - containerd = unixPre + containerd - } - return nil -} - -func init() { - flag.IntVar(&port, "port", portConst, - "The server port of the http service,range[1025-40000]") - flag.StringVar(&ip, "ip", "", - "The listen ip of the service,0.0.0.0 is not recommended when install on Multi-NIC 
host") - flag.IntVar(&updateTime, "updateTime", updateTimeConst, - "Interval (seconds) to update the npu metrics cache,range[1-60]") - flag.BoolVar(&version, "version", false, - "If true,query the version of the program (default false)") - flag.StringVar(&containerMode, "containerMode", containerModeDocker, - "Set 'docker' for monitoring docker containers or 'containerd' for CRI & containerd") - flag.StringVar(&containerd, "containerd", "", - "The endpoint of containerd used for listening containers' events") - flag.StringVar(&endpoint, "endpoint", "", - "The endpoint of the CRI server to which will be connected") - flag.IntVar(&concurrency, "concurrency", defaultConcurrency, - "The max concurrency of the http server, range is [1-512]") - // hwlog configuration - flag.IntVar(&logger.HwLogConfig.LogLevel, "logLevel", 0, - "Log level, -1-debug, 0-info, 1-warning, 2-error, 3-critical(default 0)") - flag.IntVar(&logger.HwLogConfig.MaxAge, "maxAge", hwlog.DefaultMinSaveAge, - "Maximum number of days for backup log files, range [7, 700] days") - flag.StringVar(&logger.HwLogConfig.LogFileName, "logFile", defaultLogFile, - "Log file path. 
If the file size exceeds 20MB, will be rotated") - flag.IntVar(&logger.HwLogConfig.MaxBackups, "maxBackups", hwlog.DefaultMaxBackups, - "Maximum number of backup log files, range is (0, 30]") - flag.IntVar(&cacheSize, "cacheSize", limiter.DefaultCacheSize, "the cacheSize for ip limit,"+ - "range is [1,1024000],keep default normally") - flag.IntVar(&limitIPConn, "limitIPConn", defaultConcurrency, "the tcp connection limit for each Ip,"+ - "range is [1,128]") - flag.IntVar(&limitTotalConn, "limitTotalConn", defaultConnection, "the tcp connection limit for all"+ - " request,range is [1,512]") - flag.StringVar(&limitIPReq, "limitIPReq", "20/1", - "the http request limit counts for each Ip,20/1 means allow 20 request in 1 seconds") - flag.StringVar(&platform, "platform", "Prometheus", "the data reporting platform, "+ - "just support Prometheus and Telegraf") - flag.StringVar(&textMetricsFilePath, "textMetricsFilePath", "", - "text indicator collection path, only support specified one file path") - flag.DurationVar(&pollInterval, pollIntervalStr, 1*time.Second, - "how often to send metrics when use Telegraf plugin, "+ - "needs to be used with -platform=Telegraf, otherwise, it does not take effect") - flag.IntVar(&profilingTime, "profilingTime", defaultProfilingTime, - "config pcie bandwidth profiling time, range is [1, 2000]") - flag.IntVar(&hccsBWProfilingTime, api.HccsBWProfilingTimeStr, defaultHccsBwProfilingTime, - "config "+api.Hccs+" bandwidth profiling time, range is [1, 1000]") - flag.IntVar(&deviceResetTimeout, api.DeviceResetTimeout, api.DefaultDeviceResetTimeout, - "when npu-exporter starts, if the number of chips is insufficient, the maximum duration to wait for "+ - "the driver to report all chips, unit second, range [10, 600]") -} - -func indexHandler(w http.ResponseWriter, _ *http.Request) { - var proposal = "http" - _, err := w.Write([]byte( - ` - NPU-Exporter - -

NPU-Exporter

-

Welcome to use NPU-Exporter,the Prometheus metrics url is ` + proposal + `://ip:` + - strconv.Itoa(port) + `/metrics: Metrics

- - `)) - if err != nil { - logger.Errorf("Write to response error: %v", err) - } -} - -func prometheusProcess() { - -} - -func startServe(ctx context.Context, cancel context.CancelFunc, reg *prometheus.Registry) { - http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{ErrorHandling: promhttp.ContinueOnError})) - http.Handle("/", http.HandlerFunc(indexHandler)) - conf := initConfig() - s, limitLs := newServerAndListener(conf) - if s == nil || limitLs == nil { - cancel() - return - } - - go func() { - logger.Warn("enable unsafe http server") - if err := s.Serve(limitLs); err != nil { - logger.Errorf("Http server error: %v and stopped", err) - cancel() - } - }() - - <-ctx.Done() - shutErr := func() error { - logger.Info("received stop signal, STOP http server") - ctxShutDown, timeOut := context.WithTimeout(context.Background(), defaultShutDownTimeout) - defer timeOut() - return s.Shutdown(ctxShutDown) - }() - if shutErr != nil { - logger.Errorf("shutdown http server error: %v", shutErr) - } -} - -func paramValidInTelegraf() error { - // cmdLine here must contain "-platform=Telegraf", otherwise, it will enter the Prometheus process - cmdLine := os.Args[1:] - - // store the preset parameter names in the map - presetParamsMap := map[string]bool{ - platformStr: true, - pollIntervalStr: true, - api.HccsBWProfilingTimeStr: true, - } - - if len(cmdLine) > len(presetParamsMap) { - return errors.New("too many parameters") - } - - var paramLen = 2 - // check every input params - for _, param := range cmdLine { - param = strings.TrimPrefix(param, "-") - split := strings.Split(param, "=") - if len(split) != paramLen { - return fmt.Errorf("the param [%s] is a wrong format", param) - } - paramName := split[0] - if !presetParamsMap[paramName] { - return fmt.Errorf("not support [%s] in Telegraf", paramName) - } - } - - if hccsBWProfilingTime < minHccsBWProfilingTime || hccsBWProfilingTime > maxHccsBWProfilingTime { - return errors.New(api.Hccs + "BWProfilingTime 
range error") - } - return nil -} - -func telegrafProcess() { - // create the shim. This is what will run your plugins. - shim := shim.New() - - // If no config is specified, all imported plugins are loaded. - // otherwise follow what the config asks for. - // Check for settings from a config toml file, - // (or just use whatever plugins were imported above) - configFile := "" - err := shim.LoadConfig(&configFile) - if err != nil { - fmt.Fprintf(os.Stderr, "Err loading input: %s\n", err) - return - } - - // run the input plugin(s) until stdin closes, or we receive a termination signal - if err := shim.Run(pollInterval); err != nil { - fmt.Fprintf(os.Stderr, "Err: %s\n", err) - return - } -} diff --git a/mind-cluster/component/npu-exporter/collector/common/collector_for_container.go b/mind-cluster/component/npu-exporter/collector/common/collector_for_container.go deleted file mode 100644 index af46251..0000000 --- a/mind-cluster/component/npu-exporter/collector/common/collector_for_container.go +++ /dev/null @@ -1,109 +0,0 @@ -/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package common for general collector -package common - -import ( - "context" - "strings" - "sync" - "time" - - "ascend-common/common-utils/hwlog" - "huawei.com/npu-exporter/v6/collector/container" - "huawei.com/npu-exporter/v6/utils/logger" -) - -// StartContainerInfoCollect start collect container info -func StartContainerInfoCollect(ctx context.Context, cancelFunc context.CancelFunc, group *sync.WaitGroup, - n *NpuCollector) { - group.Add(1) - - go func() { - defer group.Done() - retryCount := 0 - collectContainerInfo := func() { - logger.Info("start to collect container info") - n.devicesParser.FetchAndParse(nil) - select { - case result := <-n.devicesParser.RecvResult(): - if err := n.cache.Set(containersDevicesCacheKey, result, n.cacheTime); err != nil { - logger.Error(err) - } - logger.Infof(UpdateCachePattern, containersDevicesCacheKey) - retryCount = 0 - case err := <-n.devicesParser.RecvErr(): - logger.Errorf("received error from device parser: %v", err) - if strings.Contains(err.Error(), "connection refused") { - retryCount++ - if retryCount == connectRefusedMaxRetry { - logger.Error("connection refused, task shutdown") - cancelFunc() - } - } - } - } - ticker := time.NewTicker(n.updateTime) - defer ticker.Stop() - - for { - select { - case <-ctx.Done(): - logger.Info("received the stop signal,stop container info collect") - return - default: - collectContainerInfo() - if _, ok := <-ticker.C; !ok { - logger.Errorf(tickerFailedPattern, containersDevicesCacheKey) - return - } - } - } - }() -} - -// GetContainerNPUInfo get container npu info -func GetContainerNPUInfo(n *NpuCollector) map[int32]container.DevicesInfo { - obj, err := n.cache.Get(containersDevicesCacheKey) - // only run once to prevent wait when container info get failed - npuContainerInfoInit.Do(func() { - if err != nil { - logger.Warn("containers' devices info not found in cache, rebuilding") - resultChan := make(chan container.DevicesInfos, 1) - 
n.devicesParser.FetchAndParse(resultChan) - select { - case obj = <-resultChan: - case <-time.After(time.Second): - logger.Warn("rebuild container info cache timeout") - return - } - logger.Info("rebuild cache successfully") - } - }) - cntNpuInfos, ok := obj.(container.DevicesInfos) - if !ok { - logger.LogfWithOptions(logger.ErrorLevel, logger.LogOptions{Domain: DomainForContainerInfo, ID: 0}, - "error container npu info cache and convert failed") - return nil - } - hwlog.ResetErrCnt(DomainForContainerInfo, 0) - res := make(map[int32]container.DevicesInfo, initSize) - for _, v := range cntNpuInfos { - for _, deviceID := range v.Devices { - res[int32(deviceID)] = v - } - } - return res -} diff --git a/mind-cluster/component/npu-exporter/collector/common/collector_for_container_test.go b/mind-cluster/component/npu-exporter/collector/common/collector_for_container_test.go deleted file mode 100644 index 6412e12..0000000 --- a/mind-cluster/component/npu-exporter/collector/common/collector_for_container_test.go +++ /dev/null @@ -1,137 +0,0 @@ -/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package common for general collector -package common - -import ( - "sync" - "testing" - "time" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" - - "ascend-common/common-utils/cache" - "huawei.com/npu-exporter/v6/collector/container" -) - -const ( - testCacheTime = 60 * time.Second - testUpdateTime = 10 * time.Millisecond - testDeviceID0 = 0 - testDeviceID1 = 1 - testDeviceID2 = 2 - testContainerID1 = "container1" - testContainerID2 = "container2" - testContainerName1 = "test-container-1" - testContainerName2 = "test-container-2" -) - -var ( - testDevicesInfos = container.DevicesInfos{ - testContainerID1: { - ID: testContainerID1, - Name: testContainerName1, - Devices: []int{testDeviceID0, testDeviceID1}, - }, - testContainerID2: { - ID: testContainerID2, - Name: testContainerName2, - Devices: []int{testDeviceID2}, - }, - } -) - -func createTestNpuCollector() *NpuCollector { - parser := &container.DevicesParser{} - return &NpuCollector{ - cache: cache.New(cacheSize), - devicesParser: parser, - updateTime: testUpdateTime, - cacheTime: testCacheTime, - } -} - -func resetNpuContainerInfoInit() { - npuContainerInfoInit = sync.Once{} -} - -type getContainerNPUInfoTestCase struct { - name string - setupCache func(*NpuCollector) - mockParser func(*gomonkey.Patches, *container.DevicesParser) - expectedResult map[int32]container.DevicesInfo -} - -func createGetContainerNPUInfoTestCases() []getContainerNPUInfoTestCase { - return []getContainerNPUInfoTestCase{ - { - name: "should return container npu info when cache exists", - setupCache: func(n *NpuCollector) { - n.cache.Set(containersDevicesCacheKey, testDevicesInfos, testCacheTime) - }, - mockParser: func(patches *gomonkey.Patches, parser *container.DevicesParser) {}, - expectedResult: map[int32]container.DevicesInfo{ - int32(testDeviceID0): testDevicesInfos[testContainerID1], - int32(testDeviceID1): testDevicesInfos[testContainerID1], - int32(testDeviceID2): 
testDevicesInfos[testContainerID2], - }, - }, - { - name: "should rebuild cache when cache not exists", - setupCache: func(n *NpuCollector) {}, - mockParser: func(patches *gomonkey.Patches, parser *container.DevicesParser) { - patches.ApplyMethod(parser, "FetchAndParse", - func(p *container.DevicesParser, resultOut chan<- container.DevicesInfos) { - if resultOut != nil { - resultOut <- testDevicesInfos - } - }) - }, - expectedResult: map[int32]container.DevicesInfo{ - int32(testDeviceID0): testDevicesInfos[testContainerID1], - int32(testDeviceID1): testDevicesInfos[testContainerID1], - int32(testDeviceID2): testDevicesInfos[testContainerID2], - }, - }, - { - name: "should return nil when cache type conversion failed", - setupCache: func(n *NpuCollector) { - n.cache.Set(containersDevicesCacheKey, "invalid type", testCacheTime) - }, - mockParser: func(patches *gomonkey.Patches, parser *container.DevicesParser) {}, - expectedResult: nil, - }, - } -} - -func TestGetContainerNPUInfo(t *testing.T) { - testCases := createGetContainerNPUInfoTestCases() - - for _, tc := range testCases { - convey.Convey(tc.name, t, func() { - resetNpuContainerInfoInit() - n := createTestNpuCollector() - tc.setupCache(n) - - patches := gomonkey.NewPatches() - defer patches.Reset() - tc.mockParser(patches, n.devicesParser) - - result := GetContainerNPUInfo(n) - convey.So(result, convey.ShouldResemble, tc.expectedResult) - }) - } -} diff --git a/mind-cluster/component/npu-exporter/collector/common/constants.go b/mind-cluster/component/npu-exporter/collector/common/constants.go deleted file mode 100644 index d7e1409..0000000 --- a/mind-cluster/component/npu-exporter/collector/common/constants.go +++ /dev/null @@ -1,140 +0,0 @@ -/* Copyright(C) 2021-2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package common for general constants -package common - -// metric label name -const ( - npuID = "id" - modelName = "model_name" - npuUUID = "vdie_id" - npuPCIEInfo = "pcie_bus_info" - namespace = "namespace" - podName = "pod_name" - cntrName = "container_name" -) - -const ( - // Healthy status of Health - Healthy = "Healthy" - // UnHealthy status of unhealth - UnHealthy = "UnHealthy" - // Abnormal status of Abnormal - Abnormal = "Abnormal" - - // LinkUp npu interface up - LinkUp = "UP" - // LinkDown npu interface down - LinkDown = "DOWN" - - // Base convert base - Base = 10 - // ContainerNameLen container name length - ContainerNameLen = 3 - // npuListCacheKey Cache key - npuListCacheKey = "npu-exporter-npu-list" - // Cache key for parsing-device result - containersDevicesCacheKey = "npu-exporter-containers-devices" - initSize = 8 - tickerFailedPattern = "%s ticker failed, task shutdown" - // UpdateCachePattern Update cache pattern - UpdateCachePattern = "update Cache,key is %s" - connectRefusedMaxRetry = 3 -) - -const ( - cacheSize = 128 - // NameSpaceIdx is the index of namespace in container name - NameSpaceIdx = 0 - // PodNameIdx is the index of pod name in container name - PodNameIdx = 1 - // ConNameIdx is the index of container name in container name - ConNameIdx = 2 - - // DecimalPlaces is the decimal places of float64 - DecimalPlaces = 2 - // BitSize is the bit size of float64 - BitSize = 64 - // GeneralDevTagKey is the default value of devTagKey in telegraf, it means the metric is not related to any device - GeneralDevTagKey = "GeneralDevTagKey" 
-) - -// log limit domains for metrics -const ( - // DomainForLogicIdErr domain for faild to get cardId and deviceId by logicID - DomainForLogicIdErr = "logicID" - - // DomainForHccs domain for hccs - DomainForHccs = "hccs" - - // DomainForDDR domain for DDR - DomainForDDR = "DDR" - - // DomainForSio domain for sio - DomainForSio = "sio" - - // DomainForHBM domain for HBM - DomainForHBM = "hbm" - - // DomainForHBMECC domain for hbmEcc - DomainForHBMECC = "hbmEcc" - - // DomainForHccsBW domain for hccs bandwidth - DomainForHccsBW = "hccsBw" - - // DomainForOptical domain for Optical - DomainForOptical = "optical" - - // DomainForLinkState domain for linkState - DomainForLinkState = "linkState" - - // DomainForBandwidth domain for bandwidth - DomainForBandwidth = "bandwidth" - - // DomainForLinkStat domain for linkStat - DomainForLinkStat = "linkStat" - - // DomainForLinkSpeed domain for linkSpeed - DomainForLinkSpeed = "linkSpeed" - - // DomainForRoce domain for roce - DomainForRoce = "roce" - - // DomainForMcuPower domain for mcu power - DomainForMcuPower = "mcuPower" - - // DomainForChipPower domain for chip power - DomainForChipPower = "chipPower" - - // DomainForAICoreUtilization domain for ai core utilization - DomainForAICoreUtilization = "AICoreUtilization" - - // DomainForVectorCoreUtilization domain for vector core utilization - DomainForVectorCoreUtilization = "vectorCoreUtilization" - - // DomainForProcess domain for process info - DomainForProcess = "processInfo" - - // DomainForHbmUtilization domain for High Bandwidth Memory Utilization - DomainForHbmUtilization = "hbmUtilization" - - // DomainForOverallUtilization domain for overall utilization - DomainForOverallUtilization = "overallUtilization" - - // DomainForPcieBandwidth domain for pcie bandwidth - DomainForPcieBandwidth = "pcieBandwidth" - // DomainForContainerInfo domain for pcie container info - DomainForContainerInfo = "containerInfo" -) diff --git 
a/mind-cluster/component/npu-exporter/collector/common/metrics_collector.go b/mind-cluster/component/npu-exporter/collector/common/metrics_collector.go deleted file mode 100644 index d891649..0000000 --- a/mind-cluster/component/npu-exporter/collector/common/metrics_collector.go +++ /dev/null @@ -1,192 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package common for general collector -package common - -import ( - "reflect" - "strings" - "sync" - - "github.com/prometheus/client_golang/prometheus" - - "ascend-common/api" - "huawei.com/npu-exporter/v6/collector/container" - "huawei.com/npu-exporter/v6/utils/logger" -) - -var ( - // CardLabel general card label - CardLabel = []string{npuID, modelName, npuUUID, npuPCIEInfo, namespace, podName, cntrName} - - noNeedToPrintUpdateLog = map[string]bool{ - "NetworkCollector": true, - "RoceCollector": true, - "OpticalCollector": true, - } -) - -// BuildDescSlice build desc slice -func BuildDescSlice(slice *[]*prometheus.Desc, name string, help string) { - *slice = append(*slice, BuildDesc(name, help)) -} - -// BuildDesc build desc -func BuildDesc(name string, help string) *prometheus.Desc { - return prometheus.NewDesc(name, help, CardLabel, nil) -} - -// BuildDescWithLabel build desc with label -func BuildDescWithLabel(name string, help string, label []string) *prometheus.Desc { - return prometheus.NewDesc(name, help, label, nil) -} - -// 
MetricsCollector metrics collector -type MetricsCollector interface { - // Describe report metrics to prometheus - Describe(ch chan<- *prometheus.Desc) - - // CollectToCache collect data to cache - CollectToCache(n *NpuCollector, chipList []HuaWeiAIChip) - - // UpdatePrometheus update prometheus - UpdatePrometheus(ch chan<- prometheus.Metric, n *NpuCollector, containerMap map[int32]container.DevicesInfo, - chips []HuaWeiAIChip) - - // UpdateTelegraf update telegraf - UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []HuaWeiAIChip) map[string]map[string]interface{} - - // PreCollect pre handle before collect - PreCollect(*NpuCollector, []HuaWeiAIChip) - - // PostCollect post handle after collect - PostCollect(*NpuCollector) - - // IsSupported Check whether the current hardware supports this metric - IsSupported(*NpuCollector) bool -} - -// MetricsCollectorAdapter base collector for metrics collector -type MetricsCollectorAdapter struct { - LocalCache sync.Map - Is910Series bool - ContainerMap map[int32]container.DevicesInfo - Chips []HuaWeiAIChip -} - -// Describe report metrics to prometheus -func (c *MetricsCollectorAdapter) Describe(ch chan<- *prometheus.Desc) { -} - -// CollectToCache collect data to cache -func (c *MetricsCollectorAdapter) CollectToCache(n *NpuCollector, chipList []HuaWeiAIChip) { -} - -// UpdatePrometheus update prometheus -func (c *MetricsCollectorAdapter) UpdatePrometheus(ch chan<- prometheus.Metric, n *NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []HuaWeiAIChip) { -} - -// UpdateTelegraf update telegraf -func (c *MetricsCollectorAdapter) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []HuaWeiAIChip) map[string]map[string]interface{} { - return fieldsMap -} - -// PreCollect pre handle before collect -func (c *MetricsCollectorAdapter) PreCollect(n 
*NpuCollector, chipList []HuaWeiAIChip) { - if strings.Contains(n.Dmgr.GetDevType(), api.Ascend910A) { - c.Is910Series = true - } -} - -// PostCollect post handle after collect -func (c *MetricsCollectorAdapter) PostCollect(*NpuCollector) { -} - -// IsSupported Check whether the current hardware supports this metric -func (c *MetricsCollectorAdapter) IsSupported(*NpuCollector) bool { - return true -} - -// UpdateCache update cache -func UpdateCache[T any](n *NpuCollector, cacheKey string, localCache *sync.Map) { - var cacheInfo = make(map[int32]T) - obj, err := n.cache.Get(cacheKey) - if err != nil { - logger.Debugf("get info of %s failed: %v, use initial data", cacheKey, err) - } else { - if oldCacheInfo, ok := obj.(map[int32]T); ok { - cacheInfo = copyMap(oldCacheInfo) - } else { - logger.Debug("cache format invalid, reset") - } - } - - localCache.Range(func(key, value interface{}) bool { - finalKey, okKey := key.(int32) - finalValue, okValue := value.(T) - if okKey && okValue { - cacheInfo[finalKey] = finalValue - } - return true - }) - - err = n.cache.Set(cacheKey, cacheInfo, n.cacheTime) - if noNeedToPrintUpdateLog[cacheKey] { - return - } - if err != nil { - logger.Error(err) - } -} - -func copyMap[T any](oldCacheInfo map[int32]T) map[int32]T { - var cacheInfo = make(map[int32]T) - for key, value := range oldCacheInfo { - cacheInfo[key] = value - } - return cacheInfo -} - -// GetInfoFromCache get info from cache -func GetInfoFromCache[T any](n *NpuCollector, cacheKey string) map[int32]T { - res := make(map[int32]T) - obj, err := n.cache.Get(cacheKey) - if err != nil { - logger.Warn("cache not found, please wait for rebuild") - return res - } - - if data, ok := obj.(map[int32]T); ok { - return data - } - logger.Error("cache type mismatch") - return res -} - -// GetCacheKey Obtain the name of the struct pointer as the key of the cache -func GetCacheKey(ptr interface{}) string { - v := reflect.ValueOf(ptr) - if v.Kind() != reflect.Ptr { - return "" - } - v = 
v.Elem() - if v.Kind() != reflect.Struct { - return "" - } - return v.Type().Name() -} diff --git a/mind-cluster/component/npu-exporter/collector/common/metrics_collector_test.go b/mind-cluster/component/npu-exporter/collector/common/metrics_collector_test.go deleted file mode 100644 index f66ceb5..0000000 --- a/mind-cluster/component/npu-exporter/collector/common/metrics_collector_test.go +++ /dev/null @@ -1,231 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package common for general collector -package common - -import ( - "reflect" - "sync" - "testing" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" - - "ascend-common/api" -) - -// TestCopyMap test copyMap -func TestCopyMap(t *testing.T) { - type testStruct struct { - name string - age int - } - mockString := "mock" - tests := []struct { - name string - input map[int32]testStruct - validate func(*testing.T, interface{}) - }{ - {name: "NilInput", input: (map[int32]testStruct)(nil), - validate: func(t *testing.T, got interface{}) { - g, ok := got.(map[int32]testStruct) - if !ok || g == nil || len(g) != 0 { - t.Errorf("should return empty map for nil input") - } - }}, - {name: "EmptyMap", input: map[int32]testStruct{}, - validate: func(t *testing.T, got interface{}) { - if len(got.(map[int32]testStruct)) != 0 { - t.Errorf("expected empty map") - } - }}, - {name: "SingleElement", input: map[int32]testStruct{1: {name: mockString, age: 1}}, - validate: func(t *testing.T, got interface{}) { - g, ok := got.(map[int32]testStruct) - if !ok || g[1].name != mockString || g[1].age != 1 || len(g) != 1 { - t.Errorf("element mismatch") - } - }}, - {name: "MultipleElements", input: map[int32]testStruct{1: {name: mockString, age: 1}, 2: {name: mockString, age: 1}}, - validate: func(t *testing.T, got interface{}) { - expected := map[int32]testStruct{1: {name: mockString, age: 1}, 2: {name: mockString, age: 1}} - if !reflect.DeepEqual(got, expected) { - t.Errorf("deepEqual failed") - } - }}, - } - - for _, tt := range tests { - convey.Convey(tt.name, t, func() { - got := copyMap[testStruct](tt.input) - tt.validate(t, got) - }) - } -} - -func TestPreCollect(t *testing.T) { - tests := []struct { - name string - deviceType string - expected bool - }{ - {name: "TestPreCollect_" + api.Ascend910, - deviceType: api.Ascend910, - expected: true, - }, - {name: "TestPreCollect_" + api.Ascend310, - deviceType: api.Ascend310, - expected: false, - }, - 
} - convey.Convey("TestPreCollect", t, func() { - n := mockNewNpuCollector() - adapter := MetricsCollectorAdapter{ - Is910Series: false, - ContainerMap: nil, - Chips: nil, - } - for _, tt := range tests { - convey.Convey(tt.name, func() { - patches := gomonkey.NewPatches() - defer patches.Reset() - patches.ApplyMethodReturn(n.Dmgr, "GetDevType", tt.deviceType) - adapter.PreCollect(n, nil) - convey.So(adapter.Is910Series, convey.ShouldEqual, tt.expected) - }) - } - }) -} - -type cacheCase struct { - name string - cacheKey string - preHandle func() - expected int -} - -func buildTestsForUpdateCache(expected int) []cacheCase { - tests := []cacheCase{ - {name: "TestUpdateCache_save info to cache", - cacheKey: "mockKey1", - preHandle: func() {}, - expected: expected, - }, - {name: "TestUpdateCache_update old cache", - cacheKey: "mockKey2", - preHandle: func() { - noNeedToPrintUpdateLog["mockKey2"] = true - }, - expected: expected, - }, - {name: "TestUpdateCache_old cache is in incorrect type", - cacheKey: "mockKey3", - preHandle: func() {}, - expected: expected, - }, - } - return tests -} - -func TestUpdateCache(t *testing.T) { - const key = int32(0) - const expected = 1 - tests := buildTestsForUpdateCache(expected) - - n := mockNewNpuCollector() - // data init - n.cache.Set("mockKey2", map[int32]string{key: "0"}, n.cacheTime) - n.cache.Set("mockKey3", map[int32]int{key: 0}, n.cacheTime) - - convey.Convey("TestUpdateCache", t, func() { - - for _, tt := range tests { - convey.Convey(tt.name, func() { - localCache := sync.Map{} - localCache.Store(key, "mockValue") - tt.preHandle() - UpdateCache[string](n, tt.cacheKey, &localCache) - - data, err := n.cache.Get(tt.cacheKey) - convey.So(err, convey.ShouldBeNil) - map2, ok := data.(map[int32]string) - convey.So(ok, convey.ShouldBeTrue) - convey.So(len(map2), convey.ShouldEqual, tt.expected) - }) - } - - }) -} - -func TestGetInfoFromCache(t *testing.T) { - const key = int32(0) - tests := []struct { - name string - cacheKey 
string - expected int - }{ - {name: "TestGetInfoFromCache_no info in cache", - cacheKey: "mockKey1", - expected: 0, - }, - {name: "TestGetInfoFromCache_correct", - cacheKey: "mockKey2", - expected: 1, - }, - {name: "TestGetInfoFromCache_info in cache is in incorrect type", - cacheKey: "mockKey3", - expected: 0, - }, - } - n := mockNewNpuCollector() - // data init - n.cache.Set("mockKey2", map[int32]string{key: "mockValue"}, n.cacheTime) - n.cache.Set("mockKey3", map[int32]int{key: 0}, n.cacheTime) - for _, tt := range tests { - convey.Convey(tt.name, t, func() { - cache := GetInfoFromCache[string](n, tt.cacheKey) - convey.So(len(cache), convey.ShouldEqual, tt.expected) - }) - } -} - -func TestGetCacheKey(t *testing.T) { - tests := []struct { - name string - args interface{} - expected string - }{ - {name: "TestGetCacheKey_ptr", - args: &MetricsCollectorAdapter{}, - expected: "MetricsCollectorAdapter", - }, - {name: "TestGetCacheKey_int", - args: 0, - expected: "", - }, - {name: "TestGetCacheKey_struct", - args: MetricsCollectorAdapter{}, - expected: "", - }, - } - - convey.Convey("TestGetCacheKey", t, func() { - for _, tt := range tests { - convey.Convey(tt.name, func() { - convey.So(GetCacheKey(tt.args), convey.ShouldEqual, tt.expected) - }) - } - }) -} diff --git a/mind-cluster/component/npu-exporter/collector/common/npu_collector.go b/mind-cluster/component/npu-exporter/collector/common/npu_collector.go deleted file mode 100644 index fee5312..0000000 --- a/mind-cluster/component/npu-exporter/collector/common/npu_collector.go +++ /dev/null @@ -1,423 +0,0 @@ -/* Copyright(C) 2021-2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package common for general collector -package common - -import ( - "context" - "sync" - "time" - - "ascend-common/api" - "ascend-common/common-utils/cache" - "ascend-common/devmanager" - "ascend-common/devmanager/common" - "ascend-common/devmanager/dcmi" - "huawei.com/npu-exporter/v6/collector/container" - "huawei.com/npu-exporter/v6/utils/logger" -) - -var ( - npuContainerInfoInit sync.Once - npuChipInfoInit sync.Once - // Collector base collector for prometheus and telegraf - Collector *NpuCollector - - // ChainForSingleGoroutine a list of collectors for single goroutine - ChainForSingleGoroutine []MetricsCollector - - // ChainForMultiGoroutine a list of collectors for multi goroutine - ChainForMultiGoroutine []MetricsCollector - - // ChainForCustomPlugin a list of collectors for plugin - ChainForCustomPlugin []MetricsCollector - - updateTimeForCardIds = time.Minute -) - -const ( - maxCollectTimeout = 10 * time.Second -) - -// NpuCollector for collect metrics -type NpuCollector struct { - cache *cache.ConcurrencyLRUCache - devicesParser *container.DevicesParser - updateTime time.Duration - cacheTime time.Duration - Dmgr *devmanager.DeviceManager -} - -// NewNpuCollector create a new collector -func NewNpuCollector(cacheTime time.Duration, updateTime time.Duration, - deviceParser *container.DevicesParser, dmgr *devmanager.DeviceManager) *NpuCollector { - CommonCollector := &NpuCollector{ - cache: cache.New(cacheSize), - cacheTime: cacheTime, - updateTime: updateTime, - devicesParser: deviceParser, - Dmgr: dmgr, - } - return CommonCollector -} - -// 
StartCollect start collect -func StartCollect(group *sync.WaitGroup, ctx context.Context, n *NpuCollector) { - npuChipInfoInitAtFirstTime(n) - startCollectSingleGoroutine(group, ctx, n) - startCollectForMultiGoroutine(group, ctx, n) - startCollectForPluginGoroutine(group, ctx, n) -} - -func startCollectForPluginGoroutine(group *sync.WaitGroup, ctx context.Context, n *NpuCollector) { - group.Add(1) - go func() { - defer group.Done() - ticker := time.NewTicker(n.updateTime) - defer ticker.Stop() - goroutinePreCollect(ChainForCustomPlugin, n) - defer goroutinePostCollect(ChainForCustomPlugin, n) - runPluginCollect(ctx, n, ticker) - }() -} - -func runPluginCollect(ctx context.Context, n *NpuCollector, ticker *time.Ticker) { - for { - select { - case <-ctx.Done(): - logger.Info("received the stop signal,stop plugin collect") - return - default: - collectPluginMetrics(n) - if _, ok := <-ticker.C; !ok { - logger.Errorf(tickerFailedPattern, "handling plugin collectors") - return - } - } - } -} - -func collectPluginMetrics(n *NpuCollector) { - chipList := getChipListCache(n) - for _, c := range ChainForCustomPlugin { - resultChan := make(chan struct{}, 1) - go func(cur MetricsCollector) { - cur.CollectToCache(n, chipList) - resultChan <- struct{}{} - }(c) - select { - case <-resultChan: - continue - case <-time.After(maxCollectTimeout): - logger.Errorf("collect timeout for %v", GetCacheKey(c)) - continue - } - - } -} - -func startCollectForMultiGoroutine(group *sync.WaitGroup, ctx context.Context, n *NpuCollector) { - chips := getChipListCache(n) - - group.Add(len(chips)) - for _, chip := range chips { - go func(chip HuaWeiAIChip) { - defer group.Done() - runChipCollector(ctx, n, chip) - }(chip) - } -} - -func runChipCollector(ctx context.Context, n *NpuCollector, chip HuaWeiAIChip) { - ticker := time.NewTicker(n.updateTime) - defer ticker.Stop() - goroutinePreCollect(ChainForMultiGoroutine, n) - defer goroutinePostCollect(ChainForMultiGoroutine, n) - for { - select { - 
case <-ctx.Done(): - logger.Infof("received the stop signal,stop collect network info of npu(%d)", chip.LogicID) - return - default: - singleChipSlice := []HuaWeiAIChip{chip} - for _, c := range ChainForMultiGoroutine { - c.CollectToCache(n, singleChipSlice) - } - if _, ok := <-ticker.C; !ok { - logger.Errorf(tickerFailedPattern, "collect for multigroutine ") - return - } - } - } -} - -func goroutinePreCollect(collectors []MetricsCollector, n *NpuCollector) { - chipList := getChipListCache(n) - for _, c := range collectors { - c.PreCollect(n, chipList) - } -} - -func goroutinePostCollect(collectors []MetricsCollector, n *NpuCollector) { - for _, c := range collectors { - c.PostCollect(n) - } -} - -func startCollectSingleGoroutine(group *sync.WaitGroup, ctx context.Context, n *NpuCollector) { - group.Add(1) - go func() { - defer group.Done() - ticker := time.NewTicker(n.updateTime) - defer ticker.Stop() - goroutinePreCollect(ChainForSingleGoroutine, n) - defer goroutinePostCollect(ChainForSingleGoroutine, n) - for { - select { - case <-ctx.Done(): - logger.Info("received the stop signal,stop npu base info collect") - return - default: - chipList := getChipListCache(n) - for _, c := range ChainForSingleGoroutine { - c.CollectToCache(n, chipList) - } - if _, ok := <-ticker.C; !ok { - logger.Errorf(tickerFailedPattern, "handling all collectors") - return - } - } - } - }() -} - -// npuChipInfoInitAtFirstTime When first enter, the cache data is empty, -// need to get the data from the device, and build the cache -func npuChipInfoInitAtFirstTime(n *NpuCollector) { - npuChipInfoInit.Do(func() { - _, err := n.cache.Get(npuListCacheKey) - if err != nil { - logger.Debug("no cache in first time, start to collect chip list and rebuild cache") - - npuInfo := getNPUChipList(n.Dmgr) - if err := n.cache.Set(npuListCacheKey, npuInfo, n.cacheTime); err != nil { - logger.Error(err) - } else { - logger.Infof(UpdateCachePattern, npuListCacheKey) - } - logger.Debug("rebuild cache 
successfully") - } - }) -} - -// InitCardInfo init card info -func InitCardInfo(group *sync.WaitGroup, ctx context.Context, n *NpuCollector) { - - group.Add(1) - go func() { - defer group.Done() - ticker := time.NewTicker(updateTimeForCardIds) - defer ticker.Stop() - for { - logger.Info("start to collect npu chip list info") - select { - case <-ctx.Done(): - logger.Info("received the stop signal,stop card info collect") - return - default: - npuInfo := getNPUChipList(n.Dmgr) - if err := n.cache.Set(npuListCacheKey, npuInfo, n.cacheTime); err != nil { - logger.Error(err) - } else { - logger.Infof(UpdateCachePattern, npuListCacheKey) - } - if _, ok := <-ticker.C; !ok { - logger.Errorf(tickerFailedPattern, npuListCacheKey) - return - } - } - } - }() -} - -func getNPUChipList(dmgr devmanager.DeviceInterface) (npuInfo []HuaWeiAIChip) { - chipList := make([]HuaWeiAIChip, 0) - - cardNum, cards, err := dmgr.GetCardList() - if err != nil || cardNum == 0 { - logger.Errorf("failed to get npu info, error is: %v", err) - return chipList - } - - chipListIDs := make([]int32, 0) - - for _, cardID := range cards { - deviceNum, _ := dmgr.GetDeviceNumInCard(cardID) - for deviceID := int32(0); deviceID < deviceNum; deviceID++ { - var chip HuaWeiAIChip - // get logicID - logicID, err := dmgr.GetDeviceLogicID(cardID, deviceID) - if err != nil { - logger.Errorf("get logic ID of card: %v device:%v failed: %v", cardID, deviceID, err) - continue - } - - chip.LogicID = logicID - chip.CardId = cardID - chip.MainBoardId = dmgr.GetMainBoardId() - - setPhyId(&chip, dmgr, cardID, deviceID) - setChipInfo(&chip, dmgr, cardID, deviceID) - setBoardInfo(&chip, dmgr, cardID, deviceID) - setVdieID(&chip, dmgr, cardID, deviceID) - assemblevNPUInfo(dmgr, logicID, &chip) - setPCIeBusInfo(logicID, dmgr, &chip) - setElabelInfo(&chip, dmgr, cardID) - - chipList = append(chipList, chip) - chipListIDs = append(chipListIDs, logicID) - } - } - - logger.Debugf("flush chip info list successed,chip num is : %v, 
chipLogicIDs: %v", - len(chipList), chipListIDs) - return chipList -} - -func setBoardInfo(chip *HuaWeiAIChip, dmgr devmanager.DeviceInterface, cardID int32, deviceID int32) { - boardInfo, err := dmgr.GetBoardInfo(chip.LogicID) - if err != nil { - logger.Errorf("get board info of card: %v device:%v failed: %v", cardID, deviceID, err) - boardInfo = common.BoardInfo{} - } - chip.BoardInfo = &boardInfo -} -func setVdieID(chip *HuaWeiAIChip, dmgr devmanager.DeviceInterface, cardID int32, deviceID int32) { - vdieID, err := dmgr.GetDieID(chip.LogicID, dcmi.VDIE) - if err != nil { - logger.Debug(err) - } - chip.VDieID = vdieID -} - -func setPhyId(chip *HuaWeiAIChip, dmgr devmanager.DeviceInterface, cardID int32, deviceID int32) { - phyID, err := dmgr.GetPhysicIDFromLogicID(chip.LogicID) - if err != nil { - logger.Errorf("get phy ID of card: %v device:%v failed: %v", cardID, deviceID, err) - } - chip.PhyId = phyID - chip.DeviceID = phyID -} -func setChipInfo(chip *HuaWeiAIChip, dmgr devmanager.DeviceInterface, cardID int32, deviceID int32) { - // get chip info - chipInfo, err := dmgr.GetChipInfo(chip.LogicID) - if err != nil { - logger.Errorf("get chip info of card: %v device:%v failed: %v", cardID, deviceID, err) - chipInfo = &common.ChipInfo{} - } - chip.ChipInfo = chipInfo -} - -func setPCIeBusInfo(logicID int32, dmgr devmanager.DeviceInterface, hwChip *HuaWeiAIChip) { - productTypes := dmgr.GetProductTypeArray() - pcieInfo, err := dmgr.GetPCIeBusInfo(logicID) - if err != nil { - if len(productTypes) == 1 && productTypes[0] == common.Atlas200ISoc { - logger.Debugf("pcie bus info is not supported on %s", common.Atlas200ISoc) - hwChip.PCIeBusInfo = "" - return - } - logger.Error(err) - pcieInfo = "" - } - hwChip.PCIeBusInfo = pcieInfo -} - -func setElabelInfo(chip *HuaWeiAIChip, dmgr devmanager.DeviceInterface, cardID int32) { - elabelInfo, err := dmgr.GetCardElabelV2(cardID) - if err != nil { - logger.Errorf("get elabel info of card: %v failed: %v", cardID, err) - 
chip.ElabelInfo = &common.ElabelInfo{SerialNumber: "NA"} - return - } - chip.ElabelInfo = &common.ElabelInfo{ - SerialNumber: elabelInfo.SerialNumber, - } -} - -func assemblevNPUInfo(dmgr devmanager.DeviceInterface, logicID int32, baseChipInfo *HuaWeiAIChip) { - if dmgr.GetDevType() != api.Ascend310P { - return - } - vDevInfos, err := dmgr.GetVirtualDeviceInfo(logicID) - if err != nil { - logger.Warnf("failed to get virtual device info,logicID(%d),err: %v", logicID, err) - baseChipInfo.VDevInfos = nil - } - if vDevInfos.TotalResource.VDevNum == 0 { - baseChipInfo.VDevInfos = &common.VirtualDevInfo{} - } - baseChipInfo.VDevInfos = &vDevInfos -} - -// GetChipListWithVNPU get chip list with vnpu -func GetChipListWithVNPU(n *NpuCollector) []HuaWeiAIChip { - result := make([]HuaWeiAIChip, 0) - chips := getChipListCache(n) - - for _, chipInfo := range chips { - isNeedHandleVnpu := n.Dmgr.GetDevType() == api.Ascend310P && chipInfo.VDevInfos != nil && - len(chipInfo.VDevInfos.VDevActivityInfo) > 0 - - if !isNeedHandleVnpu { - result = append(result, chipInfo) - continue - } - - for _, activityVDev := range chipInfo.VDevInfos.VDevActivityInfo { - vDevInfo := chipInfo - activityVDevCopy := activityVDev - vDevInfo.VDevActivityInfo = &activityVDevCopy - result = append(result, vDevInfo) - } - } - - return result - -} -func getChipListCache(n *NpuCollector) []HuaWeiAIChip { - obj, err := n.cache.Get(npuListCacheKey) - if err != nil { - logger.Errorf("get npu chip list from cache failed,err is : %v", err) - return make([]HuaWeiAIChip, 0) - } - if obj == nil { - logger.LogfWithOptions(logger.ErrorLevel, logger.LogOptions{Domain: "getChipListCache"}, - "there is no chip list info in cache,please check collect logs") - return make([]HuaWeiAIChip, 0) - } - - chipList, ok := obj.([]HuaWeiAIChip) - if !ok { - logger.Errorf("error npu chip info cache and convert failed,real type is (%T)", obj) - n.cache.Delete(npuListCacheKey) - return make([]HuaWeiAIChip, 0) - } - // if cache is empty 
or nil, return empty list - if len(chipList) == 0 { - return make([]HuaWeiAIChip, 0) - } - return chipList -} diff --git a/mind-cluster/component/npu-exporter/collector/common/npu_collector_test.go b/mind-cluster/component/npu-exporter/collector/common/npu_collector_test.go deleted file mode 100644 index 722079b..0000000 --- a/mind-cluster/component/npu-exporter/collector/common/npu_collector_test.go +++ /dev/null @@ -1,547 +0,0 @@ -/* - * Copyright (c) Huawei Technologies Co., Ltd. 2021-2024. All rights reserved. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// Package common for general collector -package common - -import ( - "context" - "errors" - "strconv" - "sync" - "testing" - "time" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" - "github.com/stretchr/testify/assert" - - "ascend-common/api" - "ascend-common/common-utils/hwlog" - "ascend-common/devmanager" - "ascend-common/devmanager/common" - "huawei.com/npu-exporter/v6/collector/container" - "huawei.com/npu-exporter/v6/collector/container/isula" - "huawei.com/npu-exporter/v6/collector/container/v1" - "huawei.com/npu-exporter/v6/utils/logger" -) - -var ( - mockErr = errors.New("mockErr") - testError = errors.New(testErrorMsg) -) - -const ( - cacheTime = 60 * time.Second - npuCount = 8 - defaultUpdateTime = 10 * time.Millisecond - num2 = 2 - num100 = 100 - mockKey = "mockKey" - mockValue = "mockValue" - - // Test constants for setElabelInfo - testCardID = int32(1) - testProductName = "Atlas 900" - testModel = "Atlas-900-9000" - testManufacturer = "Huawei" - testManufacturerDate = "2023-01-01" - testSerialNumber = "SN123456789" - testDefaultSerial = "NA" - testErrorMsg = "get elabel info failed" -) - -type mockContainerRuntimeOperator struct{} - -// Init implements ContainerRuntimeOperator -func (operator *mockContainerRuntimeOperator) Init() error { - return nil -} - -// Close implements ContainerRuntimeOperator -func (operator *mockContainerRuntimeOperator) Close() error { - return nil -} - -// ContainerIDs implements ContainerRuntimeOperator -func (operator *mockContainerRuntimeOperator) GetContainers(ctx context.Context) ([]*container.CommonContainer, error) { - return []*container.CommonContainer{}, nil -} - -// GetContainerInfoByID implements ContainerRuntimeOperator -func (operator *mockContainerRuntimeOperator) GetContainerInfoByID(ctx context.Context, id string) (v1.Spec, error) { - return v1.Spec{}, nil -} - -// GetIsulaContainerInfoByID implements ContainerRuntimeOperator -func (operator 
*mockContainerRuntimeOperator) GetIsulaContainerInfoByID(ctx context.Context, - id string) (isula.ContainerJson, error) { - return isula.ContainerJson{}, nil -} - -// GetContainerType implements ContainerRuntimeOperator -func (operator *mockContainerRuntimeOperator) GetContainerType() string { - return container.DefaultContainer -} - -func mockScan4AscendDevices(_ string) ([]int, bool, error) { - return []int{1}, true, nil -} - -func mockGetCgroupPath(controller, specCgroupsPath string) (string, error) { - return "", nil -} - -func makeMockDevicesParser() *container.DevicesParser { - return &container.DevicesParser{ - RuntimeOperator: new(mockContainerRuntimeOperator), - } -} - -type newNpuCollectorTestCase struct { - cacheTime time.Duration - updateTime time.Duration - deviceParser *container.DevicesParser - dmgr *devmanager.DeviceManager -} - -// TestNewNpuCollector test method of NewNpuCollector -func TestNewNpuCollector(t *testing.T) { - tc := newNpuCollectorTestCase{ - cacheTime: cacheTime, - updateTime: defaultUpdateTime, - deviceParser: &container.DevicesParser{}, - dmgr: &devmanager.DeviceManager{}, - } - - c := NewNpuCollector(tc.cacheTime, tc.updateTime, tc.deviceParser, tc.dmgr) - - assert.NotNil(t, c) -} - -type testCase struct { - name string - wantErr bool - mockPart interface{} - expectValue interface{} - expectCount interface{} -} - -func newTestCase(name string, wantErr bool, mockPart interface{}) testCase { - return testCase{ - name: name, - wantErr: wantErr, - mockPart: mockPart, - } -} - -// TestGetChipInfo test method getChipInfo -func TestGetChipInfo(t *testing.T) { - tests := []testCase{ - newTestCase("should return chip info successfully when dsmi works normally", false, - &devmanager.DeviceManagerMock{}), - newTestCase("should return nil when dsmi works abnormally", true, &devmanager.DeviceManagerMockErr{}), - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - chipInfo := 
getNPUChipList(tt.mockPart.(devmanager.DeviceInterface)) - t.Logf("%#v", chipInfo) - assert.NotNil(t, chipInfo) - if tt.wantErr { - assert.Len(t, chipInfo, 0) - } else { - assert.NotNil(t, chipInfo) - } - }) - } -} - -func init() { - logger.HwLogConfig = &hwlog.LogConfig{ - OnlyToStdout: true, - } - logger.InitLogger("Prometheus") -} - -func mockGetNPUChipList() []HuaWeiAIChip { - chips := make([]HuaWeiAIChip, 0) - for id := int32(0); id < npuCount; id++ { - chip := HuaWeiAIChip{ - CardId: id, - PhyId: id, - DeviceID: id, - LogicID: id, - } - - chips = append(chips, chip) - } - return chips -} - -// TestInitCardInfo test method getChipInfo -func TestInitCardInfo(t *testing.T) { - patches := gomonkey.ApplyFuncReturn(getNPUChipList, mockGetNPUChipList()) - defer patches.Reset() - convey.Convey("test InitCardInfo", t, func() { - - ctx, cancelFunc := context.WithCancel(context.Background()) - defer cancelFunc() - npuCollector := mockNewNpuCollector() - - InitCardInfo(&sync.WaitGroup{}, ctx, npuCollector) - time.Sleep(time.Millisecond * num100) - cancelFunc() - chips := getChipListCache(npuCollector) - convey.So(len(chips), convey.ShouldEqual, npuCount) - }) -} - -// TestGetChipListCache test method getChipListCache -func TestGetChipListCache(t *testing.T) { - npuCollector := mockNewNpuCollector() - tests := []testCase{ - {name: "should return 0 chips when cache is nil", wantErr: false, mockPart: func() {}, expectCount: 0}, - {name: "should return chips : " + strconv.Itoa(npuCount), expectCount: npuCount, wantErr: false, - mockPart: func() { npuCollector.cache.Set(npuListCacheKey, mockGetNPUChipList(), cacheTime) }}, - {name: "should return 0 chips when cache value is nil", wantErr: false, expectCount: 0, - mockPart: func() { npuCollector.cache.Set(npuListCacheKey, nil, cacheTime) }}, - {name: "should return 0 chips when value is a incorrect type", expectCount: 0, wantErr: false, - mockPart: func() { npuCollector.cache.Set(npuListCacheKey, &HuaWeiAIChip{}, cacheTime) 
}}, - {name: "should return 0 chips when cache is empty", expectCount: 0, wantErr: false, - mockPart: func() { npuCollector.cache.Set(npuListCacheKey, []HuaWeiAIChip{}, cacheTime) }, - }, - } - - convey.Convey("getChipListCache", t, func() { - for _, tt := range tests { - convey.Convey(tt.name, func() { - tt.mockPart.(func())() - chips := getChipListCache(npuCollector) - assert.Len(t, chips, tt.expectCount.(int)) - convey.So(len(chips), convey.ShouldEqual, tt.expectCount) - }) - } - }) -} - -func mockNewNpuCollector() *NpuCollector { - tc := newNpuCollectorTestCase{ - cacheTime: cacheTime, - updateTime: defaultUpdateTime, - deviceParser: &container.DevicesParser{}, - dmgr: &devmanager.DeviceManager{}, - } - c := NewNpuCollector(tc.cacheTime, tc.updateTime, tc.deviceParser, tc.dmgr) - return c -} - -func TestNpuChipInfoInitAtFirstTime(t *testing.T) { - n := mockNewNpuCollector() - convey.Convey("TestNpuChipInfoInitAtFirstTime", t, func() { - patches := gomonkey.NewPatches() - defer patches.Reset() - patches.ApplyFuncReturn(getNPUChipList, []HuaWeiAIChip{{CardId: 0}}) - // do test - npuChipInfoInitAtFirstTime(n) - // valid cache - data, err := n.cache.Get(npuListCacheKey) - convey.So(err, convey.ShouldBeNil) - chips, ok := data.([]HuaWeiAIChip) - convey.So(ok, convey.ShouldBeTrue) - convey.So(len(chips), convey.ShouldEqual, 1) - }) -} - -func patchCollectToCache() *gomonkey.Patches { - return gomonkey.ApplyMethod(&MetricsCollectorAdapter{}, "CollectToCache", - func(_ *MetricsCollectorAdapter, n *NpuCollector, chipList []HuaWeiAIChip) { - n.cache.Set(mockKey, mockValue, n.cacheTime) - }) -} - -func TestStartCollectForMultiGoroutine(t *testing.T) { - n := mockNewNpuCollector() - wg := sync.WaitGroup{} - ChainForMultiGoroutine = []MetricsCollector{ - &MetricsCollectorAdapter{}, - &MetricsCollectorAdapter{}, - } - patches := patchCollectToCache() - defer patches.Reset() - patches.ApplyFuncReturn(getChipListCache, []HuaWeiAIChip{createChip()}) - 
convey.Convey("TestStartCollectForMultiGoroutine", t, func() { - ctx, cancel := context.WithCancel(context.Background()) - startCollectForMultiGoroutine(&wg, ctx, n) - time.Sleep(n.updateTime) - cancel() - data, err := n.cache.Get(mockKey) - convey.So(err, convey.ShouldBeNil) - value, ok := data.(string) - convey.So(ok, convey.ShouldBeTrue) - convey.So(value, convey.ShouldEqual, mockValue) - }) -} - -func TestRunChipCollector(t *testing.T) { - n := mockNewNpuCollector() - patches := patchCollectToCache() - defer patches.Reset() - convey.Convey("TestRunChipCollector", t, func() { - ctx, cancel := context.WithCancel(context.Background()) - tickCh := make(chan time.Time) - patches.ApplyFuncReturn(time.NewTicker, &time.Ticker{C: tickCh}) - close(tickCh) - go runChipCollector(ctx, n, createChip()) - time.Sleep(n.updateTime) - cancel() - data, err := n.cache.Get(mockKey) - convey.So(err, convey.ShouldBeNil) - value, ok := data.(string) - convey.So(ok, convey.ShouldBeTrue) - convey.So(value, convey.ShouldEqual, mockValue) - }) -} - -func TestStartCollectSingleGoroutine(t *testing.T) { - n := mockNewNpuCollector() - wg := sync.WaitGroup{} - ChainForSingleGoroutine = []MetricsCollector{ - &MetricsCollectorAdapter{}, - } - patches := patchCollectToCache() - defer patches.Reset() - convey.Convey("TestStartCollectSingleGoroutine", t, func() { - ctx, cancel := context.WithCancel(context.Background()) - startCollectSingleGoroutine(&wg, ctx, n) - time.Sleep(n.updateTime) - cancel() - data, err := n.cache.Get(mockKey) - convey.So(err, convey.ShouldBeNil) - value, ok := data.(string) - convey.So(ok, convey.ShouldBeTrue) - convey.So(value, convey.ShouldEqual, mockValue) - }) -} - -type chipsCase struct { - name string - devType string - buildChips func() - expectValue int -} - -func TestGetChipListWithVNPU(t *testing.T) { - n := mockNewNpuCollector() - chip := HuaWeiAIChip{} - tests := []chipsCase{ - {name: "TestGetChipListWithVNPU_310p_no_vnpu", - devType: api.Ascend310P, - 
buildChips: func() { - chip = createChip() - }, - expectValue: 1, - }, - {name: "TestGetChipListWithVNPU_310p_2_vnpus", - devType: api.Ascend310P, - buildChips: func() { - chip = createValidVnpuChip() - }, - expectValue: num2, - }, - {name: "TestGetChipListWithVNPU_910", - devType: api.Ascend910, - buildChips: func() { - chip = createChip() - }, - expectValue: 1, - }, - } - - convey.Convey("TestGetChipListWithVNPU", t, func() { - for _, tt := range tests { - convey.Convey(tt.name, func() { - tt.buildChips() - patches := gomonkey.NewPatches() - defer patches.Reset() - patches.ApplyMethodReturn(n.Dmgr, "GetDevType", tt.devType) - patches.ApplyFuncReturn(getChipListCache, []HuaWeiAIChip{chip}) - - chips := GetChipListWithVNPU(n) - convey.So(len(chips), convey.ShouldEqual, tt.expectValue) - }) - } - }) -} - -func createValidVnpuChip() HuaWeiAIChip { - chip := createChip() - chip.VDevInfos = &common.VirtualDevInfo{ - VDevActivityInfo: []common.VDevActivityInfo{ - { - VDevID: 0, - VDevAiCore: 0, - VDevTotalMem: 0, - VDevUsedMem: 0, - IsVirtualDev: true, - }, - { - VDevID: 1, - VDevAiCore: 1, - VDevTotalMem: 1, - VDevUsedMem: 1, - IsVirtualDev: true, - }, - }, - } - return chip -} - -func createChip() HuaWeiAIChip { - return HuaWeiAIChip{ - CardId: 0, - PhyId: 0, - DeviceID: 0, - LogicID: 0, - ChipInfo: &common.ChipInfo{ - Name: api.Ascend910, - Type: "Ascend", - Version: "V1", - }, - } -} - -func TestSetPCIeBusInfo(t *testing.T) { - const mockPcieBus = "0000:01:00.0" - tests := []struct { - name string - productTypes []string - err error - expectValue string - }{{ - name: "TestSetPCIeBusInfo_910", - productTypes: []string{api.Ascend910}, - err: nil, - expectValue: mockPcieBus, - }, { - name: "TestSetPCIeBusInfo_910_err", - productTypes: []string{api.Ascend910}, - err: mockErr, - expectValue: "", - }, { - name: "TestSetPCIeBusInfo_Atlas200ISoc", - productTypes: []string{common.Atlas200ISoc}, - err: nil, - expectValue: mockPcieBus, - }, { - name: 
"TestSetPCIeBusInfo_Atlas200ISoc_err", - productTypes: []string{common.Atlas200ISoc}, - err: mockErr, - expectValue: "", - }} - chip := createChip() - convey.Convey("TestSetPCIeBusInfo", t, func() { - for _, tt := range tests { - convey.Convey(tt.name, func() { - dmgr := &devmanager.DeviceManager{ProductTypes: tt.productTypes} - patches := gomonkey.NewPatches() - defer patches.Reset() - patches.ApplyMethodReturn(dmgr, "GetPCIeBusInfo", mockPcieBus, tt.err) - - setPCIeBusInfo(0, dmgr, &chip) - convey.So(chip.PCIeBusInfo, convey.ShouldEqual, tt.expectValue) - }) - } - }) -} - -type setElabelInfoTestCase struct { - name string - cardID int32 - mockElabelInfo common.ElabelInfo - mockError error - expectSerial string - expectProduct string - expectModel string - expectManufacturer string - expectManufacturerDate string -} - -func createSetElabelInfoTestCases() []setElabelInfoTestCase { - return []setElabelInfoTestCase{ - { - name: "should set elabel info successfully when GetCardElabelV2 returns valid data", - cardID: testCardID, - mockElabelInfo: common.ElabelInfo{ - ProductName: testProductName, - Model: testModel, - Manufacturer: testManufacturer, - ManufacturerDate: testManufacturerDate, - SerialNumber: testSerialNumber, - }, - mockError: nil, - expectSerial: testSerialNumber, - expectProduct: testProductName, - expectModel: testModel, - expectManufacturer: testManufacturer, - expectManufacturerDate: testManufacturerDate, - }, - { - name: "should set default elabel info when GetCardElabelV2 returns error", - cardID: testCardID, - mockElabelInfo: common.ElabelInfo{}, - mockError: testError, - expectSerial: testDefaultSerial, - expectProduct: "", - expectModel: "", - expectManufacturer: "", - expectManufacturerDate: "", - }, - } -} - -func executeSetElabelInfoTest(tc setElabelInfoTestCase) { - // Create mock device manager - mockDmgr := &devmanager.DeviceManager{} - - // Create test chip - chip := &HuaWeiAIChip{} - - // Apply gomonkey patches - patches := 
gomonkey.NewPatches() - defer patches.Reset() - - patches.ApplyMethodReturn(mockDmgr, "GetCardElabelV2", - tc.mockElabelInfo, tc.mockError) - - // Execute the function under test - setElabelInfo(chip, mockDmgr, tc.cardID) - - // Verify results - convey.So(chip.ElabelInfo, convey.ShouldNotBeNil) - convey.So(chip.ElabelInfo.SerialNumber, convey.ShouldEqual, tc.expectSerial) -} - -// TestSetElabelInfo test setElabelInfo method -func TestSetElabelInfo(t *testing.T) { - testCases := createSetElabelInfoTestCases() - - convey.Convey("TestSetElabelInfo", t, func() { - for _, tc := range testCases { - convey.Convey(tc.name, func() { - executeSetElabelInfoTest(tc) - }) - } - }) -} diff --git a/mind-cluster/component/npu-exporter/collector/common/types.go b/mind-cluster/component/npu-exporter/collector/common/types.go deleted file mode 100644 index 4576c85..0000000 --- a/mind-cluster/component/npu-exporter/collector/common/types.go +++ /dev/null @@ -1,50 +0,0 @@ -/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package common for collector -package common - -import ( - "ascend-common/devmanager/common" -) - -// HuaWeiAIChip chip info -type HuaWeiAIChip struct { - - // CardId npu card id - CardId int32 `json:"card_id"` - // PhyId npu chip phy id - PhyId int32 `json:"phy_id"` - // DeviceID the chip physic ID - DeviceID int32 `json:"device_id"` - // the chip logic ID - LogicID int32 `json:"logic_id"` - // VDieID the vdie id - VDieID string `json:"vdie_id"` - // MainBoardId main board id , used to distinguish between A900A3SuperPod and A9000A3SuperPod - MainBoardId uint32 - // ChipInfo the chip info - ChipInfo *common.ChipInfo `json:"chip_info"` - // BoardInfo board info of device, but not display - BoardInfo *common.BoardInfo - - // VDevActivityInfo the activity virtual device info - VDevActivityInfo *common.VDevActivityInfo `json:"v_dev_activity_info"` - // VDevInfos the virtual device info - VDevInfos *common.VirtualDevInfo `json:"v_dev_infos"` - // PCIeBusInfo bus info - PCIeBusInfo string - // ElabelInfo elabel info including SN - ElabelInfo *common.ElabelInfo `json:"elabel_info"` -} diff --git a/mind-cluster/component/npu-exporter/collector/config/metrics_config.go b/mind-cluster/component/npu-exporter/collector/config/metrics_config.go deleted file mode 100644 index be32832..0000000 --- a/mind-cluster/component/npu-exporter/collector/config/metrics_config.go +++ /dev/null @@ -1,208 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package config for general collector -package config - -import ( - "encoding/json" - "fmt" - "reflect" - - "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/metrics" - "huawei.com/npu-exporter/v6/utils/logger" - - "ascend-common/common-utils/utils" -) - -var ( - // singleGoroutineMap metrics in this map will be collected in single goroutine - singleGoroutineMap = map[string]common.MetricsCollector{ - groupHccs: &metrics.HccsCollector{}, - groupNpu: &metrics.BaseInfoCollector{}, - groupSio: &metrics.SioCollector{}, - groupVersion: &metrics.VersionCollector{}, - groupHbm: &metrics.HbmCollector{}, - groupDDR: &metrics.DdrCollector{}, - groupVnpu: &metrics.VnpuCollector{}, - groupPcie: &metrics.PcieCollector{}, - } - // multiGoroutineMap metrics in this map will be collected in multi goroutine - multiGoroutineMap = map[string]common.MetricsCollector{ - groupNetwork: &metrics.NetworkCollector{}, - groupRoce: &metrics.RoceCollector{}, - groupOptical: &metrics.OpticalCollector{}, - } - // pluginCollectorMap metrics in this map will be collected in plugin goroutine - pluginCollectorMap = map[string]common.MetricsCollector{} - presetConfigs = make([]map[string]string, 0) - pluginConfigs = make([]map[string]string, 0) - - defaultPresetConfigs = []map[string]string{ - {metricsGroup: groupDDR, state: stateOn}, - {metricsGroup: groupHccs, state: stateOn}, - {metricsGroup: groupNpu, state: stateOn}, - {metricsGroup: groupNetwork, state: stateOn}, - {metricsGroup: groupPcie, state: stateOn}, - {metricsGroup: groupRoce, state: stateOn}, - {metricsGroup: groupSio, state: stateOn}, - {metricsGroup: groupVnpu, state: stateOn}, - {metricsGroup: groupVersion, state: stateOn}, - {metricsGroup: groupOptical, state: stateOn}, - {metricsGroup: groupHbm, state: stateOn}, - } - defaultPluginConfigs = []map[string]string{ - {metricsGroup: 
groupText, state: stateOn}, - } -) - -const ( - metricsGroup = "metricsGroup" - state = "state" - - groupDDR = "ddr" - groupHccs = "hccs" - groupNpu = "npu" - groupNetwork = "network" - groupPcie = "pcie" - groupRoce = "roce" - groupSio = "sio" - groupVnpu = "vnpu" - groupVersion = "version" - groupOptical = "optical" - groupHbm = "hbm" - groupText = "text" - - stateOn = "ON" - stateOFF = "OFF" -) - -const ( - PresetConfigPath = "/usr/local/metricConfiguration.json" - PluginConfigPath = "/usr/local/pluginConfiguration.json" -) - -func loadConfiguration() { - if fileBytes := loadFromFile(PresetConfigPath); fileBytes == nil { - logger.Warnf("load config from file %s failed, use default config", PresetConfigPath) - presetConfigs = defaultPresetConfigs - } else { - initConfiguration(fileBytes, &presetConfigs) - } - if fileBytes := loadFromFile(PluginConfigPath); fileBytes == nil { - logger.Warnf("load config from file %s failed, use default config", PluginConfigPath) - pluginConfigs = defaultPluginConfigs - } else { - initConfiguration(fileBytes, &pluginConfigs) - } -} - -func loadFromFile(filePath string) []byte { - fileBytes, err := utils.LoadFile(filePath) - if err != nil { - return nil - } - return fileBytes -} - -func initConfiguration(fileBytes []byte, configs *[]map[string]string) { - if err := json.Unmarshal(fileBytes, configs); err != nil { - logger.Errorf("unmarshal config byte failed: %v", err) - return - } -} - -// AddPluginCollector add plugin collector to cache -func AddPluginCollector(name string, collector common.MetricsCollector) error { - if _, exist := pluginCollectorMap[name]; exist { - logger.Errorf("plugin collector %v already exist", name) - return fmt.Errorf("plugin collector %v already exist", name) - } - logger.Infof("add plugin collector %v ok", name) - pluginCollectorMap[name] = collector - return nil -} - -// DeletePluginCollector delete plugin collector from cache -func DeletePluginCollector(name string) { - if _, exist := 
pluginCollectorMap[name]; !exist { - logger.Warnf("plugin collector %v does not exist", name) - return - } - logger.Infof("delete plugin collector %v ok", name) - delete(pluginCollectorMap, name) -} - -// Register register collector to cache -func Register(n *common.NpuCollector) { - loadConfiguration() - - for _, config := range presetConfigs { - metricsGroupName := config[metricsGroup] - - if config[state] != stateOn { - logger.Infof("metricsGroup [%v] is off", metricsGroupName) - continue - } - logger.Infof("metricsGroup [%v] is on", metricsGroupName) - collector, exist := singleGoroutineMap[metricsGroupName] - if exist && collector.IsSupported(n) { - common.ChainForSingleGoroutine = append(common.ChainForSingleGoroutine, collector) - } - - collector, exist = multiGoroutineMap[metricsGroupName] - if exist && collector.IsSupported(n) { - common.ChainForMultiGoroutine = append(common.ChainForMultiGoroutine, collector) - } - } - - for _, config := range pluginConfigs { - metricsGroupName := config[metricsGroup] - - if config[state] != stateOn { - logger.Infof("plugin collector [%v] is off", metricsGroupName) - continue - } - logger.Infof("plugin collector [%v] is on", metricsGroupName) - collector, exist := pluginCollectorMap[metricsGroupName] - if exist && collector.IsSupported(n) { - logger.Infof("add plugin collector:%v", metricsGroupName) - common.ChainForCustomPlugin = append(common.ChainForCustomPlugin, collector) - } - - } - - logger.Infof("ChainForSingleGoroutine:%#v", common.ChainForSingleGoroutine) - logger.Infof("ChainForMultiGoroutine:%#v", common.ChainForMultiGoroutine) - logger.Infof("ChainForCustomPlugin:%#v", common.ChainForCustomPlugin) -} - -// UnRegister delete collector from chain -func UnRegister(worker reflect.Type) { - logger.Debugf("unRegister collector:%v", worker) - unRegisterChain(worker, &common.ChainForSingleGoroutine) - unRegisterChain(worker, &common.ChainForMultiGoroutine) - unRegisterChain(worker, &common.ChainForCustomPlugin) -} - 
-func unRegisterChain(worker reflect.Type, chain *[]common.MetricsCollector) { - newChain := make([]common.MetricsCollector, 0) - for _, collector := range *chain { - if reflect.TypeOf(collector) != worker { - newChain = append(newChain, collector) - } - } - *chain = newChain -} diff --git a/mind-cluster/component/npu-exporter/collector/config/metrics_config_test.go b/mind-cluster/component/npu-exporter/collector/config/metrics_config_test.go deleted file mode 100644 index 974ed3e..0000000 --- a/mind-cluster/component/npu-exporter/collector/config/metrics_config_test.go +++ /dev/null @@ -1,216 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package config for general collector -package config - -import ( - "ascend-common/common-utils/utils" - "reflect" - "testing" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" - - "ascend-common/common-utils/hwlog" - "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/metrics" - "huawei.com/npu-exporter/v6/utils/logger" -) - -func init() { - logger.HwLogConfig = &hwlog.LogConfig{ - OnlyToStdout: true, - } - logger.InitLogger("Prometheus") - initChain() -} - -func initChain() { - common.ChainForSingleGoroutine = []common.MetricsCollector{} - common.ChainForMultiGoroutine = []common.MetricsCollector{} -} - -func TestInitConfiguration(t *testing.T) { - convey.Convey("TestInitConfiguration", t, func() { - initConfiguration([]byte("test"), &presetConfigs) - convey.So(len(presetConfigs), convey.ShouldEqual, 0) - }) -} - -func TestLoadConfiguration(t *testing.T) { - convey.Convey("TestLoadConfiguration", t, func() { - patches := gomonkey.NewPatches() - defer patches.Reset() - convey.Convey("load config ok", func() { - patches.ApplyFunc(loadFromFile, func(filePath string) []byte { - if filePath == PresetConfigPath { - filePath = "../../build/metricConfiguration.json" - } else if filePath == PluginConfigPath { - filePath = "../../build/pluginConfiguration.json" - } - fileBytes, _ := utils.LoadFile(filePath) - return fileBytes - }) - defer func() { - presetConfigs = make([]map[string]string, 0) - pluginConfigs = make([]map[string]string, 0) - }() - loadConfiguration() - convey.So(len(presetConfigs), convey.ShouldBeGreaterThan, 0) - convey.So(len(pluginConfigs), convey.ShouldBeGreaterThan, 0) - }) - convey.Convey("load config fail", func() { - presetConfigs = make([]map[string]string, 0) - pluginConfigs = make([]map[string]string, 0) - patches.ApplyFunc(loadFromFile, func(filePath string) []byte { - return nil - }) - loadConfiguration() - convey.So(len(presetConfigs), convey.ShouldEqual, 
len(defaultPresetConfigs)) - convey.So(len(pluginConfigs), convey.ShouldEqual, len(defaultPluginConfigs)) - }) - }) -} - -func TestAddPluginCollector(t *testing.T) { - convey.Convey("TestAddPluginCollector", t, func() { - convey.Convey("add plugin ok", func() { - pluginCollectorMap = make(map[string]common.MetricsCollector) - defer func() { - pluginCollectorMap = make(map[string]common.MetricsCollector) - }() - err := AddPluginCollector("test", &metrics.HccsCollector{}) - convey.So(err, convey.ShouldBeNil) - }) - convey.Convey("add plugin fail", func() { - pluginCollectorMap["test"] = &metrics.HccsCollector{} - defer func() { - pluginCollectorMap = make(map[string]common.MetricsCollector) - }() - err := AddPluginCollector("test", &metrics.HccsCollector{}) - convey.So(err, convey.ShouldNotBeNil) - }) - }) -} - -func TestDeletePluginCollector(t *testing.T) { - convey.Convey("TestDeletePluginCollector", t, func() { - convey.Convey("delete plugin ok", func() { - pluginCollectorMap["test"] = &metrics.HccsCollector{} - DeletePluginCollector("test") - convey.So(pluginCollectorMap["test"], convey.ShouldBeNil) - }) - convey.Convey("delete plugin fail", func() { - pluginCollectorMap = make(map[string]common.MetricsCollector) - DeletePluginCollector("test") - convey.So(len(pluginCollectorMap), convey.ShouldEqual, 0) - }) - }) -} - -func TestRegister(t *testing.T) { - convey.Convey("TestRegister", t, func() { - n := &common.NpuCollector{} - patches := gomonkey.NewPatches() - defer patches.Reset() - // Mock IsSupported method to always return true - patches.ApplyMethodReturn(&metrics.HccsCollector{}, "IsSupported", true) - patches.ApplyMethodReturn(&metrics.BaseInfoCollector{}, "IsSupported", true) - patches.ApplyMethodReturn(&metrics.SioCollector{}, "IsSupported", true) - patches.ApplyMethodReturn(&metrics.VersionCollector{}, "IsSupported", true) - patches.ApplyMethodReturn(&metrics.HbmCollector{}, "IsSupported", true) - patches.ApplyMethodReturn(&metrics.DdrCollector{}, 
"IsSupported", true) - patches.ApplyMethodReturn(&metrics.VnpuCollector{}, "IsSupported", true) - patches.ApplyMethodReturn(&metrics.PcieCollector{}, "IsSupported", true) - patches.ApplyMethodReturn(&metrics.NetworkCollector{}, "IsSupported", true) - patches.ApplyMethodReturn(&metrics.RoceCollector{}, "IsSupported", true) - patches.ApplyMethodReturn(&metrics.OpticalCollector{}, "IsSupported", true) - patches.ApplyFunc(loadConfiguration, func() { - initConfiguration(loadFromFile("../../build/metricConfiguration.json"), &presetConfigs) - initConfiguration(loadFromFile("../../build/pluginConfiguration.json"), &pluginConfigs) - }) - Register(n) - convey.Convey("Should add collectors to ChainForSingleGoroutine", func() { - convey.So(len(common.ChainForSingleGoroutine), convey.ShouldBeGreaterThan, 0) - }) - convey.Convey("Should add collectors to ChainForMultiGoroutine", func() { - convey.So(len(common.ChainForMultiGoroutine), convey.ShouldBeGreaterThan, 0) - }) - }) -} - -func TestUnRegister(t *testing.T) { - convey.Convey("TestUnRegister", t, func() { - // Initialize chains with some collectors - common.ChainForSingleGoroutine = []common.MetricsCollector{ - &metrics.HccsCollector{}, - &metrics.BaseInfoCollector{}, - } - common.ChainForMultiGoroutine = []common.MetricsCollector{ - &metrics.NetworkCollector{}, - &metrics.RoceCollector{}, - } - - convey.Convey("When UnRegister is called with HccsCollector type", func() { - UnRegister(reflect.TypeOf(&metrics.HccsCollector{})) - - convey.Convey("Should remove HccsCollector from ChainForSingleGoroutine", func() { - expected := []common.MetricsCollector{ - &metrics.BaseInfoCollector{}, - } - convey.So(len(common.ChainForSingleGoroutine), convey.ShouldEqual, len(expected)) - for i, collector := range common.ChainForSingleGoroutine { - convey.So(reflect.TypeOf(collector), convey.ShouldEqual, reflect.TypeOf(expected[i])) - } - }) - - convey.Convey("Should not affect ChainForMultiGoroutine", func() { - expected := 
[]common.MetricsCollector{ - &metrics.NetworkCollector{}, - &metrics.RoceCollector{}, - } - convey.So(len(common.ChainForMultiGoroutine), convey.ShouldEqual, len(expected)) - for i, collector := range common.ChainForMultiGoroutine { - convey.So(reflect.TypeOf(collector), convey.ShouldEqual, reflect.TypeOf(expected[i])) - } - }) - }) - }) -} - -func TestUnRegisterChain(t *testing.T) { - convey.Convey("TestUnRegisterChain", t, func() { - // Initialize a chain with some collectors - chain := []common.MetricsCollector{ - &metrics.HccsCollector{}, - &metrics.BaseInfoCollector{}, - &metrics.NetworkCollector{}, - } - - convey.Convey("When unRegisterChain is called with BaseInfoCollector type", func() { - unRegisterChain(reflect.TypeOf(&metrics.BaseInfoCollector{}), &chain) - convey.Convey("Should remove BaseInfoCollector from the chain", func() { - expected := []common.MetricsCollector{ - &metrics.HccsCollector{}, - &metrics.NetworkCollector{}, - } - convey.So(len(chain), convey.ShouldEqual, len(expected)) - for i, collector := range chain { - convey.So(reflect.TypeOf(collector), convey.ShouldEqual, reflect.TypeOf(expected[i])) - } - }) - }) - }) -} diff --git a/mind-cluster/component/npu-exporter/collector/container/isula/isula_api.pb.go b/mind-cluster/component/npu-exporter/collector/container/isula/isula_api.pb.go deleted file mode 100644 index 5ee3c7f..0000000 --- a/mind-cluster/component/npu-exporter/collector/container/isula/isula_api.pb.go +++ /dev/null @@ -1,870 +0,0 @@ -// -//Copyright 2018 The Kubernetes Authors. -//Copyright (c) Huawei Technologies Co., Ltd. 2019. All rights reserved. -//modify descripe: remove unused options for example: -//remove import "github.com/gogo/protobuf/gogoproto/gogo.proto" -// -//Licensed under the Apache License, Version 2.0 (the "License"); -//you may not use this file except in compliance with the License. 
-//You may obtain a copy of the License at -// -//http://www.apache.org/licenses/LICENSE-2.0 -// -//Unless required by applicable law or agreed to in writing, software -//distributed under the License is distributed on an "AS IS" BASIS, -//WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -//See the License for the specific language governing permissions and -//limitations under the License. - -// To regenerate api.pb.go run hack/update-generated-runtime.sh - -// Code generated by protoc-gen-go. DO NOT EDIT. -// versions: -// protoc-gen-go v1.28.1 -// protoc v3.13.0 -// source: isula_api.proto - -package isula - -import ( - protoreflect "google.golang.org/protobuf/reflect/protoreflect" - protoimpl "google.golang.org/protobuf/runtime/protoimpl" - reflect "reflect" - sync "sync" -) - -const ( - // Verify that this generated code is sufficiently up-to-date. - _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) - // Verify that runtime/protoimpl is sufficiently up-to-date. - _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) -) - -type ContainerState int32 - -const ( - ContainerState_CONTAINER_CREATED ContainerState = 0 - ContainerState_CONTAINER_RUNNING ContainerState = 1 - ContainerState_CONTAINER_EXITED ContainerState = 2 - ContainerState_CONTAINER_UNKNOWN ContainerState = 3 -) - -// Enum value maps for ContainerState. 
-var ( - ContainerState_name = map[int32]string{ - 0: "CONTAINER_CREATED", - 1: "CONTAINER_RUNNING", - 2: "CONTAINER_EXITED", - 3: "CONTAINER_UNKNOWN", - } - ContainerState_value = map[string]int32{ - "CONTAINER_CREATED": 0, - "CONTAINER_RUNNING": 1, - "CONTAINER_EXITED": 2, - "CONTAINER_UNKNOWN": 3, - } -) - -func (x ContainerState) Enum() *ContainerState { - p := new(ContainerState) - *p = x - return p -} - -func (x ContainerState) String() string { - return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) -} - -func (ContainerState) Descriptor() protoreflect.EnumDescriptor { - return file_isula_api_proto_enumTypes[0].Descriptor() -} - -func (ContainerState) Type() protoreflect.EnumType { - return &file_isula_api_proto_enumTypes[0] -} - -func (x ContainerState) Number() protoreflect.EnumNumber { - return protoreflect.EnumNumber(x) -} - -// Deprecated: Use ContainerState.Descriptor instead. -func (ContainerState) EnumDescriptor() ([]byte, []int) { - return file_isula_api_proto_rawDescGZIP(), []int{0} -} - -// ImageSpec is an internal representation of an image. Currently, it wraps the -// value of a Container's Image field (e.g. imageID or imageDigest), but in the -// future it will include more detailed information about the different image types. -type ImageSpec struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - Image string `protobuf:"bytes,1,opt,name=image,proto3" json:"image,omitempty"` - // Unstructured key-value map holding arbitrary metadata. - // ImageSpec Annotations can be used to help the runtime target specific - // images in multi-arch images. 
- Annotations map[string]string `protobuf:"bytes,2,rep,name=annotations,proto3" json:"annotations,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` -} - -func (x *ImageSpec) Reset() { - *x = ImageSpec{} - if protoimpl.UnsafeEnabled { - mi := &file_isula_api_proto_msgTypes[0] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *ImageSpec) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*ImageSpec) ProtoMessage() {} - -func (x *ImageSpec) ProtoReflect() protoreflect.Message { - mi := &file_isula_api_proto_msgTypes[0] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use ImageSpec.ProtoReflect.Descriptor instead. -func (*ImageSpec) Descriptor() ([]byte, []int) { - return file_isula_api_proto_rawDescGZIP(), []int{0} -} - -func (x *ImageSpec) GetImage() string { - if x != nil { - return x.Image - } - return "" -} - -func (x *ImageSpec) GetAnnotations() map[string]string { - if x != nil { - return x.Annotations - } - return nil -} - -// ContainerMetadata holds all necessary information for building the container -// name. The container runtime is encouraged to expose the metadata in its user -// interface for better user experience. E.g., runtime can construct a unique -// container name based on the metadata. Note that (name, attempt) is unique -// within a sandbox for the entire lifetime of the sandbox. -type ContainerMetadata struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - // Name of the container. Same as the container name in the PodSpec. - Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"` - // Attempt number of creating the container. Default: 0. 
- Attempt uint32 `protobuf:"varint,2,opt,name=attempt,proto3" json:"attempt,omitempty"` -} - -func (x *ContainerMetadata) Reset() { - *x = ContainerMetadata{} - if protoimpl.UnsafeEnabled { - mi := &file_isula_api_proto_msgTypes[1] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *ContainerMetadata) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*ContainerMetadata) ProtoMessage() {} - -func (x *ContainerMetadata) ProtoReflect() protoreflect.Message { - mi := &file_isula_api_proto_msgTypes[1] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use ContainerMetadata.ProtoReflect.Descriptor instead. -func (*ContainerMetadata) Descriptor() ([]byte, []int) { - return file_isula_api_proto_rawDescGZIP(), []int{1} -} - -func (x *ContainerMetadata) GetName() string { - if x != nil { - return x.Name - } - return "" -} - -func (x *ContainerMetadata) GetAttempt() uint32 { - if x != nil { - return x.Attempt - } - return 0 -} - -// ContainerStateValue is the wrapper of ContainerState. -type ContainerStateValue struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - // State of the container. 
- State ContainerState `protobuf:"varint,1,opt,name=state,proto3,enum=runtime.v1alpha2.ContainerState" json:"state,omitempty"` -} - -func (x *ContainerStateValue) Reset() { - *x = ContainerStateValue{} - if protoimpl.UnsafeEnabled { - mi := &file_isula_api_proto_msgTypes[2] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *ContainerStateValue) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*ContainerStateValue) ProtoMessage() {} - -func (x *ContainerStateValue) ProtoReflect() protoreflect.Message { - mi := &file_isula_api_proto_msgTypes[2] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use ContainerStateValue.ProtoReflect.Descriptor instead. -func (*ContainerStateValue) Descriptor() ([]byte, []int) { - return file_isula_api_proto_rawDescGZIP(), []int{2} -} - -func (x *ContainerStateValue) GetState() ContainerState { - if x != nil { - return x.State - } - return ContainerState_CONTAINER_CREATED -} - -// ContainerFilter is used to filter containers. -// All those fields are combined with 'AND' -type ContainerFilter struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - // ID of the container. - Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"` - // State of the container. - State *ContainerStateValue `protobuf:"bytes,2,opt,name=state,proto3" json:"state,omitempty"` - // ID of the PodSandbox. - PodSandboxId string `protobuf:"bytes,3,opt,name=pod_sandbox_id,json=podSandboxId,proto3" json:"pod_sandbox_id,omitempty"` - // LabelSelector to select matches. - // Only api.MatchLabels is supported for now and the requirements - // are ANDed. MatchExpressions is not supported yet. 
- LabelSelector map[string]string `protobuf:"bytes,4,rep,name=label_selector,json=labelSelector,proto3" json:"label_selector,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` -} - -func (x *ContainerFilter) Reset() { - *x = ContainerFilter{} - if protoimpl.UnsafeEnabled { - mi := &file_isula_api_proto_msgTypes[3] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *ContainerFilter) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*ContainerFilter) ProtoMessage() {} - -func (x *ContainerFilter) ProtoReflect() protoreflect.Message { - mi := &file_isula_api_proto_msgTypes[3] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use ContainerFilter.ProtoReflect.Descriptor instead. -func (*ContainerFilter) Descriptor() ([]byte, []int) { - return file_isula_api_proto_rawDescGZIP(), []int{3} -} - -func (x *ContainerFilter) GetId() string { - if x != nil { - return x.Id - } - return "" -} - -func (x *ContainerFilter) GetState() *ContainerStateValue { - if x != nil { - return x.State - } - return nil -} - -func (x *ContainerFilter) GetPodSandboxId() string { - if x != nil { - return x.PodSandboxId - } - return "" -} - -func (x *ContainerFilter) GetLabelSelector() map[string]string { - if x != nil { - return x.LabelSelector - } - return nil -} - -type ListContainersRequest struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - Filter *ContainerFilter `protobuf:"bytes,1,opt,name=filter,proto3" json:"filter,omitempty"` -} - -func (x *ListContainersRequest) Reset() { - *x = ListContainersRequest{} - if protoimpl.UnsafeEnabled { - mi := &file_isula_api_proto_msgTypes[4] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - 
ms.StoreMessageInfo(mi) - } -} - -func (x *ListContainersRequest) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*ListContainersRequest) ProtoMessage() {} - -func (x *ListContainersRequest) ProtoReflect() protoreflect.Message { - mi := &file_isula_api_proto_msgTypes[4] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use ListContainersRequest.ProtoReflect.Descriptor instead. -func (*ListContainersRequest) Descriptor() ([]byte, []int) { - return file_isula_api_proto_rawDescGZIP(), []int{4} -} - -func (x *ListContainersRequest) GetFilter() *ContainerFilter { - if x != nil { - return x.Filter - } - return nil -} - -// Container provides the runtime information for a container, such as ID, hash, -// state of the container. -type Container struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - // ID of the container, used by the container runtime to identify - // a container. - Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"` - // ID of the sandbox to which this container belongs. - PodSandboxId string `protobuf:"bytes,2,opt,name=pod_sandbox_id,json=podSandboxId,proto3" json:"pod_sandbox_id,omitempty"` - // Metadata of the container. - Metadata *ContainerMetadata `protobuf:"bytes,3,opt,name=metadata,proto3" json:"metadata,omitempty"` - // Spec of the image. - Image *ImageSpec `protobuf:"bytes,4,opt,name=image,proto3" json:"image,omitempty"` - // Reference to the image in use. For most runtimes, this should be an - // image ID. - ImageRef string `protobuf:"bytes,5,opt,name=image_ref,json=imageRef,proto3" json:"image_ref,omitempty"` - // State of the container. 
- State ContainerState `protobuf:"varint,6,opt,name=state,proto3,enum=runtime.v1alpha2.ContainerState" json:"state,omitempty"` - // Creation time of the container in nanoseconds. - CreatedAt int64 `protobuf:"varint,7,opt,name=created_at,json=createdAt,proto3" json:"created_at,omitempty"` - // Key-value pairs that may be used to scope and select individual resources. - Labels map[string]string `protobuf:"bytes,8,rep,name=labels,proto3" json:"labels,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` - // Unstructured key-value map holding arbitrary metadata. - // Annotations MUST NOT be altered by the runtime; the value of this field - // MUST be identical to that of the corresponding ContainerConfig used to - // instantiate this Container. - Annotations map[string]string `protobuf:"bytes,9,rep,name=annotations,proto3" json:"annotations,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` -} - -func (x *Container) Reset() { - *x = Container{} - if protoimpl.UnsafeEnabled { - mi := &file_isula_api_proto_msgTypes[5] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *Container) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*Container) ProtoMessage() {} - -func (x *Container) ProtoReflect() protoreflect.Message { - mi := &file_isula_api_proto_msgTypes[5] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use Container.ProtoReflect.Descriptor instead. 
-func (*Container) Descriptor() ([]byte, []int) { - return file_isula_api_proto_rawDescGZIP(), []int{5} -} - -func (x *Container) GetId() string { - if x != nil { - return x.Id - } - return "" -} - -func (x *Container) GetPodSandboxId() string { - if x != nil { - return x.PodSandboxId - } - return "" -} - -func (x *Container) GetMetadata() *ContainerMetadata { - if x != nil { - return x.Metadata - } - return nil -} - -func (x *Container) GetImage() *ImageSpec { - if x != nil { - return x.Image - } - return nil -} - -func (x *Container) GetImageRef() string { - if x != nil { - return x.ImageRef - } - return "" -} - -func (x *Container) GetState() ContainerState { - if x != nil { - return x.State - } - return ContainerState_CONTAINER_CREATED -} - -func (x *Container) GetCreatedAt() int64 { - if x != nil { - return x.CreatedAt - } - return 0 -} - -func (x *Container) GetLabels() map[string]string { - if x != nil { - return x.Labels - } - return nil -} - -func (x *Container) GetAnnotations() map[string]string { - if x != nil { - return x.Annotations - } - return nil -} - -type ListContainersResponse struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - // List of containers. 
- Containers []*Container `protobuf:"bytes,1,rep,name=containers,proto3" json:"containers,omitempty"` -} - -func (x *ListContainersResponse) Reset() { - *x = ListContainersResponse{} - if protoimpl.UnsafeEnabled { - mi := &file_isula_api_proto_msgTypes[6] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *ListContainersResponse) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*ListContainersResponse) ProtoMessage() {} - -func (x *ListContainersResponse) ProtoReflect() protoreflect.Message { - mi := &file_isula_api_proto_msgTypes[6] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use ListContainersResponse.ProtoReflect.Descriptor instead. -func (*ListContainersResponse) Descriptor() ([]byte, []int) { - return file_isula_api_proto_rawDescGZIP(), []int{6} -} - -func (x *ListContainersResponse) GetContainers() []*Container { - if x != nil { - return x.Containers - } - return nil -} - -var File_isula_api_proto protoreflect.FileDescriptor - -var file_isula_api_proto_rawDesc = []byte{ - 0x0a, 0x0f, 0x69, 0x73, 0x75, 0x6c, 0x61, 0x5f, 0x61, 0x70, 0x69, 0x2e, 0x70, 0x72, 0x6f, 0x74, - 0x6f, 0x12, 0x10, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, - 0x68, 0x61, 0x32, 0x22, 0xb1, 0x01, 0x0a, 0x09, 0x49, 0x6d, 0x61, 0x67, 0x65, 0x53, 0x70, 0x65, - 0x63, 0x12, 0x14, 0x0a, 0x05, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, - 0x52, 0x05, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x12, 0x4e, 0x0a, 0x0b, 0x61, 0x6e, 0x6e, 0x6f, 0x74, - 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x2c, 0x2e, 0x72, - 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, - 0x49, 0x6d, 0x61, 0x67, 0x65, 0x53, 0x70, 0x65, 0x63, 0x2e, 0x41, 
0x6e, 0x6e, 0x6f, 0x74, 0x61, - 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x52, 0x0b, 0x61, 0x6e, 0x6e, 0x6f, - 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x1a, 0x3e, 0x0a, 0x10, 0x41, 0x6e, 0x6e, 0x6f, 0x74, - 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x12, 0x10, 0x0a, 0x03, 0x6b, - 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x6b, 0x65, 0x79, 0x12, 0x14, 0x0a, - 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x22, 0x41, 0x0a, 0x11, 0x43, 0x6f, 0x6e, 0x74, 0x61, - 0x69, 0x6e, 0x65, 0x72, 0x4d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x12, 0x12, 0x0a, 0x04, - 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x6e, 0x61, 0x6d, 0x65, - 0x12, 0x18, 0x0a, 0x07, 0x61, 0x74, 0x74, 0x65, 0x6d, 0x70, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, - 0x0d, 0x52, 0x07, 0x61, 0x74, 0x74, 0x65, 0x6d, 0x70, 0x74, 0x22, 0x4d, 0x0a, 0x13, 0x43, 0x6f, - 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x53, 0x74, 0x61, 0x74, 0x65, 0x56, 0x61, 0x6c, 0x75, - 0x65, 0x12, 0x36, 0x0a, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0e, - 0x32, 0x20, 0x2e, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, - 0x68, 0x61, 0x32, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x53, 0x74, 0x61, - 0x74, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x22, 0xa3, 0x02, 0x0a, 0x0f, 0x43, 0x6f, - 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x46, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x12, 0x0e, 0x0a, - 0x02, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x3b, 0x0a, - 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x25, 0x2e, 0x72, - 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, - 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x53, 0x74, 0x61, 0x74, 0x65, 0x56, 0x61, - 0x6c, 
0x75, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x12, 0x24, 0x0a, 0x0e, 0x70, 0x6f, - 0x64, 0x5f, 0x73, 0x61, 0x6e, 0x64, 0x62, 0x6f, 0x78, 0x5f, 0x69, 0x64, 0x18, 0x03, 0x20, 0x01, - 0x28, 0x09, 0x52, 0x0c, 0x70, 0x6f, 0x64, 0x53, 0x61, 0x6e, 0x64, 0x62, 0x6f, 0x78, 0x49, 0x64, - 0x12, 0x5b, 0x0a, 0x0e, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x5f, 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, - 0x6f, 0x72, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x34, 0x2e, 0x72, 0x75, 0x6e, 0x74, 0x69, - 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, 0x43, 0x6f, 0x6e, 0x74, - 0x61, 0x69, 0x6e, 0x65, 0x72, 0x46, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x2e, 0x4c, 0x61, 0x62, 0x65, - 0x6c, 0x53, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x52, 0x0d, - 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x53, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x1a, 0x40, 0x0a, - 0x12, 0x4c, 0x61, 0x62, 0x65, 0x6c, 0x53, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x45, 0x6e, - 0x74, 0x72, 0x79, 0x12, 0x10, 0x0a, 0x03, 0x6b, 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, - 0x52, 0x03, 0x6b, 0x65, 0x79, 0x12, 0x14, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, - 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x22, - 0x52, 0x0a, 0x15, 0x4c, 0x69, 0x73, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, - 0x73, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x39, 0x0a, 0x06, 0x66, 0x69, 0x6c, 0x74, - 0x65, 0x72, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x21, 0x2e, 0x72, 0x75, 0x6e, 0x74, 0x69, - 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, 0x43, 0x6f, 0x6e, 0x74, - 0x61, 0x69, 0x6e, 0x65, 0x72, 0x46, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x52, 0x06, 0x66, 0x69, 0x6c, - 0x74, 0x65, 0x72, 0x22, 0xb5, 0x04, 0x0a, 0x09, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, - 0x72, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, - 0x64, 0x12, 0x24, 0x0a, 0x0e, 0x70, 0x6f, 
0x64, 0x5f, 0x73, 0x61, 0x6e, 0x64, 0x62, 0x6f, 0x78, - 0x5f, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0c, 0x70, 0x6f, 0x64, 0x53, 0x61, - 0x6e, 0x64, 0x62, 0x6f, 0x78, 0x49, 0x64, 0x12, 0x3f, 0x0a, 0x08, 0x6d, 0x65, 0x74, 0x61, 0x64, - 0x61, 0x74, 0x61, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x23, 0x2e, 0x72, 0x75, 0x6e, 0x74, - 0x69, 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, 0x43, 0x6f, 0x6e, - 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x4d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x52, 0x08, - 0x6d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x12, 0x31, 0x0a, 0x05, 0x69, 0x6d, 0x61, 0x67, - 0x65, 0x18, 0x04, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1b, 0x2e, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, - 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, 0x49, 0x6d, 0x61, 0x67, 0x65, - 0x53, 0x70, 0x65, 0x63, 0x52, 0x05, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x12, 0x1b, 0x0a, 0x09, 0x69, - 0x6d, 0x61, 0x67, 0x65, 0x5f, 0x72, 0x65, 0x66, 0x18, 0x05, 0x20, 0x01, 0x28, 0x09, 0x52, 0x08, - 0x69, 0x6d, 0x61, 0x67, 0x65, 0x52, 0x65, 0x66, 0x12, 0x36, 0x0a, 0x05, 0x73, 0x74, 0x61, 0x74, - 0x65, 0x18, 0x06, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x20, 0x2e, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, - 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x61, - 0x69, 0x6e, 0x65, 0x72, 0x53, 0x74, 0x61, 0x74, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, - 0x12, 0x1d, 0x0a, 0x0a, 0x63, 0x72, 0x65, 0x61, 0x74, 0x65, 0x64, 0x5f, 0x61, 0x74, 0x18, 0x07, - 0x20, 0x01, 0x28, 0x03, 0x52, 0x09, 0x63, 0x72, 0x65, 0x61, 0x74, 0x65, 0x64, 0x41, 0x74, 0x12, - 0x3f, 0x0a, 0x06, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x73, 0x18, 0x08, 0x20, 0x03, 0x28, 0x0b, 0x32, - 0x27, 0x2e, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, - 0x61, 0x32, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x2e, 0x4c, 0x61, 0x62, - 0x65, 0x6c, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x52, 0x06, 0x6c, 0x61, 0x62, 
0x65, 0x6c, 0x73, - 0x12, 0x4e, 0x0a, 0x0b, 0x61, 0x6e, 0x6e, 0x6f, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x18, - 0x09, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x2c, 0x2e, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x2e, - 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, - 0x65, 0x72, 0x2e, 0x41, 0x6e, 0x6e, 0x6f, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x45, 0x6e, - 0x74, 0x72, 0x79, 0x52, 0x0b, 0x61, 0x6e, 0x6e, 0x6f, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, - 0x1a, 0x39, 0x0a, 0x0b, 0x4c, 0x61, 0x62, 0x65, 0x6c, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x12, - 0x10, 0x0a, 0x03, 0x6b, 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x6b, 0x65, - 0x79, 0x12, 0x14, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, - 0x52, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x1a, 0x3e, 0x0a, 0x10, 0x41, - 0x6e, 0x6e, 0x6f, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x12, - 0x10, 0x0a, 0x03, 0x6b, 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x6b, 0x65, - 0x79, 0x12, 0x14, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, - 0x52, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x22, 0x55, 0x0a, 0x16, 0x4c, - 0x69, 0x73, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x73, 0x52, 0x65, 0x73, - 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x3b, 0x0a, 0x0a, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, - 0x65, 0x72, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x1b, 0x2e, 0x72, 0x75, 0x6e, 0x74, - 0x69, 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, 0x43, 0x6f, 0x6e, - 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x52, 0x0a, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, - 0x72, 0x73, 0x2a, 0x6b, 0x0a, 0x0e, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x53, - 0x74, 0x61, 0x74, 0x65, 0x12, 0x15, 0x0a, 0x11, 0x43, 0x4f, 0x4e, 0x54, 0x41, 0x49, 0x4e, 0x45, - 0x52, 0x5f, 0x43, 
0x52, 0x45, 0x41, 0x54, 0x45, 0x44, 0x10, 0x00, 0x12, 0x15, 0x0a, 0x11, 0x43, - 0x4f, 0x4e, 0x54, 0x41, 0x49, 0x4e, 0x45, 0x52, 0x5f, 0x52, 0x55, 0x4e, 0x4e, 0x49, 0x4e, 0x47, - 0x10, 0x01, 0x12, 0x14, 0x0a, 0x10, 0x43, 0x4f, 0x4e, 0x54, 0x41, 0x49, 0x4e, 0x45, 0x52, 0x5f, - 0x45, 0x58, 0x49, 0x54, 0x45, 0x44, 0x10, 0x02, 0x12, 0x15, 0x0a, 0x11, 0x43, 0x4f, 0x4e, 0x54, - 0x41, 0x49, 0x4e, 0x45, 0x52, 0x5f, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x10, 0x03, 0x32, - 0x77, 0x0a, 0x0e, 0x52, 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x53, 0x65, 0x72, 0x76, 0x69, 0x63, - 0x65, 0x12, 0x65, 0x0a, 0x0e, 0x4c, 0x69, 0x73, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, - 0x65, 0x72, 0x73, 0x12, 0x27, 0x2e, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x2e, 0x76, 0x31, - 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, 0x4c, 0x69, 0x73, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x61, - 0x69, 0x6e, 0x65, 0x72, 0x73, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x28, 0x2e, 0x72, - 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x2e, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x32, 0x2e, - 0x4c, 0x69, 0x73, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x73, 0x52, 0x65, - 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x42, 0x0a, 0x5a, 0x08, 0x2e, 0x2f, 0x3b, 0x69, - 0x73, 0x75, 0x6c, 0x61, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, -} - -var ( - file_isula_api_proto_rawDescOnce sync.Once - file_isula_api_proto_rawDescData = file_isula_api_proto_rawDesc -) - -func file_isula_api_proto_rawDescGZIP() []byte { - file_isula_api_proto_rawDescOnce.Do(func() { - file_isula_api_proto_rawDescData = protoimpl.X.CompressGZIP(file_isula_api_proto_rawDescData) - }) - return file_isula_api_proto_rawDescData -} - -var file_isula_api_proto_enumTypes = make([]protoimpl.EnumInfo, 1) -var file_isula_api_proto_msgTypes = make([]protoimpl.MessageInfo, 11) -var file_isula_api_proto_goTypes = []interface{}{ - (ContainerState)(0), // 0: runtime.v1alpha2.ContainerState - (*ImageSpec)(nil), // 1: 
runtime.v1alpha2.ImageSpec - (*ContainerMetadata)(nil), // 2: runtime.v1alpha2.ContainerMetadata - (*ContainerStateValue)(nil), // 3: runtime.v1alpha2.ContainerStateValue - (*ContainerFilter)(nil), // 4: runtime.v1alpha2.ContainerFilter - (*ListContainersRequest)(nil), // 5: runtime.v1alpha2.ListContainersRequest - (*Container)(nil), // 6: runtime.v1alpha2.Container - (*ListContainersResponse)(nil), // 7: runtime.v1alpha2.ListContainersResponse - nil, // 8: runtime.v1alpha2.ImageSpec.AnnotationsEntry - nil, // 9: runtime.v1alpha2.ContainerFilter.LabelSelectorEntry - nil, // 10: runtime.v1alpha2.Container.LabelsEntry - nil, // 11: runtime.v1alpha2.Container.AnnotationsEntry -} -var file_isula_api_proto_depIdxs = []int32{ - 8, // 0: runtime.v1alpha2.ImageSpec.annotations:type_name -> runtime.v1alpha2.ImageSpec.AnnotationsEntry - 0, // 1: runtime.v1alpha2.ContainerStateValue.state:type_name -> runtime.v1alpha2.ContainerState - 3, // 2: runtime.v1alpha2.ContainerFilter.state:type_name -> runtime.v1alpha2.ContainerStateValue - 9, // 3: runtime.v1alpha2.ContainerFilter.label_selector:type_name -> runtime.v1alpha2.ContainerFilter.LabelSelectorEntry - 4, // 4: runtime.v1alpha2.ListContainersRequest.filter:type_name -> runtime.v1alpha2.ContainerFilter - 2, // 5: runtime.v1alpha2.Container.metadata:type_name -> runtime.v1alpha2.ContainerMetadata - 1, // 6: runtime.v1alpha2.Container.image:type_name -> runtime.v1alpha2.ImageSpec - 0, // 7: runtime.v1alpha2.Container.state:type_name -> runtime.v1alpha2.ContainerState - 10, // 8: runtime.v1alpha2.Container.labels:type_name -> runtime.v1alpha2.Container.LabelsEntry - 11, // 9: runtime.v1alpha2.Container.annotations:type_name -> runtime.v1alpha2.Container.AnnotationsEntry - 6, // 10: runtime.v1alpha2.ListContainersResponse.containers:type_name -> runtime.v1alpha2.Container - 5, // 11: runtime.v1alpha2.RuntimeService.ListContainers:input_type -> runtime.v1alpha2.ListContainersRequest - 7, // 12: 
runtime.v1alpha2.RuntimeService.ListContainers:output_type -> runtime.v1alpha2.ListContainersResponse - 12, // [12:13] is the sub-list for method output_type - 11, // [11:12] is the sub-list for method input_type - 11, // [11:11] is the sub-list for extension type_name - 11, // [11:11] is the sub-list for extension extendee - 0, // [0:11] is the sub-list for field type_name -} - -func init() { file_isula_api_proto_init() } -func file_isula_api_proto_init() { - if File_isula_api_proto != nil { - return - } - if !protoimpl.UnsafeEnabled { - file_isula_api_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { - value, ok := v.(*ImageSpec) - if !ok { - return nil - } - - switch v := value; i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_isula_api_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} { - value, ok := v.(*ContainerMetadata) - if !ok { - return nil - } - - switch v := value; i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_isula_api_proto_msgTypes[2].Exporter = func(v interface{}, i int) interface{} { - value, ok := v.(*ContainerStateValue) - if !ok { - return nil - } - - switch v := value; i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_isula_api_proto_msgTypes[3].Exporter = func(v interface{}, i int) interface{} { - value, ok := v.(*ContainerFilter) - if !ok { - return nil - } - - switch v := value; i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_isula_api_proto_msgTypes[4].Exporter = func(v interface{}, i int) interface{} { - value, ok := v.(*ListContainersRequest) - if !ok { - return nil - } - - switch v := value; i { - case 0: - return &v.state - case 1: - return 
&v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_isula_api_proto_msgTypes[5].Exporter = func(v interface{}, i int) interface{} { - value, ok := v.(*Container) - if !ok { - return nil - } - - switch v := value; i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_isula_api_proto_msgTypes[6].Exporter = func(v interface{}, i int) interface{} { - value, ok := v.(*ListContainersResponse) - if !ok { - return nil - } - - switch v := value; i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - } - type x struct{} - out := protoimpl.TypeBuilder{ - File: protoimpl.DescBuilder{ - GoPackagePath: reflect.TypeOf(x{}).PkgPath(), - RawDescriptor: file_isula_api_proto_rawDesc, - NumEnums: 1, - NumMessages: 11, - NumExtensions: 0, - NumServices: 1, - }, - GoTypes: file_isula_api_proto_goTypes, - DependencyIndexes: file_isula_api_proto_depIdxs, - EnumInfos: file_isula_api_proto_enumTypes, - MessageInfos: file_isula_api_proto_msgTypes, - }.Build() - File_isula_api_proto = out.File - file_isula_api_proto_rawDesc = nil - file_isula_api_proto_goTypes = nil - file_isula_api_proto_depIdxs = nil -} diff --git a/mind-cluster/component/npu-exporter/collector/container/isula/isula_api.proto b/mind-cluster/component/npu-exporter/collector/container/isula/isula_api.proto deleted file mode 100644 index 3f1f9f9..0000000 --- a/mind-cluster/component/npu-exporter/collector/container/isula/isula_api.proto +++ /dev/null @@ -1,118 +0,0 @@ -/* -Copyright 2018 The Kubernetes Authors. -Copyright (c) Huawei Technologies Co., Ltd. 2019. All rights reserved. - modify descripe: remove unused options for example: - remove import "github.com/gogo/protobuf/gogoproto/gogo.proto" - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// To regenerate api.pb.go run hack/update-generated-runtime.sh -syntax = 'proto3'; - -package runtime.v1alpha2; -option go_package = "./;isula"; - -// Runtime service defines the public APIs for remote container runtimes -service RuntimeService { - // ListContainers lists all containers by filters. - rpc ListContainers(ListContainersRequest) returns (ListContainersResponse) {} -} - -// ImageSpec is an internal representation of an image. Currently, it wraps the -// value of a Container's Image field (e.g. imageID or imageDigest), but in the -// future it will include more detailed information about the different image types. -message ImageSpec { - string image = 1; - // Unstructured key-value map holding arbitrary metadata. - // ImageSpec Annotations can be used to help the runtime target specific - // images in multi-arch images. - map annotations = 2; -} - -// ContainerMetadata holds all necessary information for building the container -// name. The container runtime is encouraged to expose the metadata in its user -// interface for better user experience. E.g., runtime can construct a unique -// container name based on the metadata. Note that (name, attempt) is unique -// within a sandbox for the entire lifetime of the sandbox. -message ContainerMetadata { - // Name of the container. Same as the container name in the PodSpec. - string name = 1; - // Attempt number of creating the container. Default: 0. 
- uint32 attempt = 2; -} - -enum ContainerState { - CONTAINER_CREATED = 0; - CONTAINER_RUNNING = 1; - CONTAINER_EXITED = 2; - CONTAINER_UNKNOWN = 3; -} - -// ContainerStateValue is the wrapper of ContainerState. -message ContainerStateValue { - // State of the container. - ContainerState state = 1; -} - -// ContainerFilter is used to filter containers. -// All those fields are combined with 'AND' -message ContainerFilter { - // ID of the container. - string id = 1; - // State of the container. - ContainerStateValue state = 2; - // ID of the PodSandbox. - string pod_sandbox_id = 3; - // LabelSelector to select matches. - // Only api.MatchLabels is supported for now and the requirements - // are ANDed. MatchExpressions is not supported yet. - map label_selector = 4; -} - -message ListContainersRequest { - ContainerFilter filter = 1; -} - -// Container provides the runtime information for a container, such as ID, hash, -// state of the container. -message Container { - // ID of the container, used by the container runtime to identify - // a container. - string id = 1; - // ID of the sandbox to which this container belongs. - string pod_sandbox_id = 2; - // Metadata of the container. - ContainerMetadata metadata = 3; - // Spec of the image. - ImageSpec image = 4; - // Reference to the image in use. For most runtimes, this should be an - // image ID. - string image_ref = 5; - // State of the container. - ContainerState state = 6; - // Creation time of the container in nanoseconds. - int64 created_at = 7; - // Key-value pairs that may be used to scope and select individual resources. - map labels = 8; - // Unstructured key-value map holding arbitrary metadata. - // Annotations MUST NOT be altered by the runtime; the value of this field - // MUST be identical to that of the corresponding ContainerConfig used to - // instantiate this Container. - map annotations = 9; -} - -message ListContainersResponse { - // List of containers. 
- repeated Container containers = 1; -} diff --git a/mind-cluster/component/npu-exporter/collector/container/isula/isula_api_grpc.pb.go b/mind-cluster/component/npu-exporter/collector/container/isula/isula_api_grpc.pb.go deleted file mode 100644 index a503e15..0000000 --- a/mind-cluster/component/npu-exporter/collector/container/isula/isula_api_grpc.pb.go +++ /dev/null @@ -1,107 +0,0 @@ -// Code generated by protoc-gen-go-grpc. DO NOT EDIT. -// versions: -// - protoc-gen-go-grpc v1.2.0 -// - protoc v3.13.0 -// source: isula_api.proto - -package isula - -import ( - context "context" - grpc "google.golang.org/grpc" - codes "google.golang.org/grpc/codes" - status "google.golang.org/grpc/status" -) - -// This is a compile-time assertion to ensure that this generated file -// is compatible with the grpc package it is being compiled against. -// Requires gRPC-Go v1.32.0 or later. -const _ = grpc.SupportPackageIsVersion7 - -// RuntimeServiceClient is the client API for RuntimeService service. -// -// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream. -type RuntimeServiceClient interface { - // ListContainers lists all containers by filters. - ListContainers(ctx context.Context, in *ListContainersRequest, opts ...grpc.CallOption) (*ListContainersResponse, error) -} - -type runtimeServiceClient struct { - cc grpc.ClientConnInterface -} - -func NewRuntimeServiceClient(cc grpc.ClientConnInterface) RuntimeServiceClient { - return &runtimeServiceClient{cc} -} - -func (c *runtimeServiceClient) ListContainers(ctx context.Context, in *ListContainersRequest, opts ...grpc.CallOption) (*ListContainersResponse, error) { - out := new(ListContainersResponse) - err := c.cc.Invoke(ctx, "/runtime.v1alpha2.RuntimeService/ListContainers", in, out, opts...) - if err != nil { - return nil, err - } - return out, nil -} - -// RuntimeServiceServer is the server API for RuntimeService service. 
-// All implementations must embed UnimplementedRuntimeServiceServer -// for forward compatibility -type RuntimeServiceServer interface { - // ListContainers lists all containers by filters. - ListContainers(context.Context, *ListContainersRequest) (*ListContainersResponse, error) - mustEmbedUnimplementedRuntimeServiceServer() -} - -// UnimplementedRuntimeServiceServer must be embedded to have forward compatible implementations. -type UnimplementedRuntimeServiceServer struct { -} - -func (UnimplementedRuntimeServiceServer) ListContainers(context.Context, *ListContainersRequest) (*ListContainersResponse, error) { - return nil, status.Errorf(codes.Unimplemented, "method ListContainers not implemented") -} -func (UnimplementedRuntimeServiceServer) mustEmbedUnimplementedRuntimeServiceServer() {} - -// UnsafeRuntimeServiceServer may be embedded to opt out of forward compatibility for this service. -// Use of this interface is not recommended, as added methods to RuntimeServiceServer will -// result in compilation errors. 
-type UnsafeRuntimeServiceServer interface { - mustEmbedUnimplementedRuntimeServiceServer() -} - -func RegisterRuntimeServiceServer(s grpc.ServiceRegistrar, srv RuntimeServiceServer) { - s.RegisterService(&RuntimeService_ServiceDesc, srv) -} - -func _RuntimeService_ListContainers_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { - in := new(ListContainersRequest) - if err := dec(in); err != nil { - return nil, err - } - if interceptor == nil { - return srv.(RuntimeServiceServer).ListContainers(ctx, in) - } - info := &grpc.UnaryServerInfo{ - Server: srv, - FullMethod: "/runtime.v1alpha2.RuntimeService/ListContainers", - } - handler := func(ctx context.Context, req interface{}) (interface{}, error) { - return srv.(RuntimeServiceServer).ListContainers(ctx, req.(*ListContainersRequest)) - } - return interceptor(ctx, in, info, handler) -} - -// RuntimeService_ServiceDesc is the grpc.ServiceDesc for RuntimeService service. -// It's only intended for direct use with grpc.RegisterService, -// and not to be introspected or modified (even as a copy) -var RuntimeService_ServiceDesc = grpc.ServiceDesc{ - ServiceName: "runtime.v1alpha2.RuntimeService", - HandlerType: (*RuntimeServiceServer)(nil), - Methods: []grpc.MethodDesc{ - { - MethodName: "ListContainers", - Handler: _RuntimeService_ListContainers_Handler, - }, - }, - Streams: []grpc.StreamDesc{}, - Metadata: "isula_api.proto", -} diff --git a/mind-cluster/component/npu-exporter/collector/container/isula/isula_container.go b/mind-cluster/component/npu-exporter/collector/container/isula/isula_container.go deleted file mode 100644 index e31fea9..0000000 --- a/mind-cluster/component/npu-exporter/collector/container/isula/isula_container.go +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) Huawei Technologies Co., Ltd. 2021-2024. All rights reserved. 
- * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Package isula for monitoring isula' npu allocation -package isula - -// Config represents env -type Config struct { - Env []string `json:"Env,omitempty" platform:"linux"` -} - -// DeviceInfo represents device info -type DeviceInfo struct { - PathInContainer string `json:"PathInContainer,omitempty" platform:"linux"` -} - -// HostConfig represents host config content -type HostConfig struct { - Devices []DeviceInfo `json:"Devices,omitempty" platform:"linux"` - Privileged bool `json:"Privileged,omitempty" platform:"linux"` -} - -// ContainerJson represents container json content -type ContainerJson struct { - Config *Config `json:"Config,omitempty" platform:"linux"` - HostConfig *HostConfig `json:"HostConfig,omitempty" platform:"linux"` -} diff --git a/mind-cluster/component/npu-exporter/collector/container/isula/isulad.pb.go b/mind-cluster/component/npu-exporter/collector/container/isula/isulad.pb.go deleted file mode 100644 index 5e4f83f..0000000 --- a/mind-cluster/component/npu-exporter/collector/container/isula/isulad.pb.go +++ /dev/null @@ -1,278 +0,0 @@ -// ####################################################################### -// ##- Copyright (c) Huawei Technologies Co., Ltd. 2019. All rights reserved. -// # - iSulad licensed under the Mulan PSL v2. -// # - You can use this software according to the terms and conditions of the Mulan PSL v2. 
-// # - You may obtain a copy of Mulan PSL v2 at: -// # - http://license.coscl.org.cn/MulanPSL2 -// # - THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR -// # - IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR -// # - PURPOSE. -// # - See the Mulan PSL v2 for more details. -// ##- @Description: generate grpc -// ##- @Author: wujing -// ##- @Create: 2019-04-25 -// ####################################################################### - -// Code generated by protoc-gen-go. DO NOT EDIT. -// versions: -// protoc-gen-go v1.28.1 -// protoc v3.13.0 -// source: isulad.proto - -package isula - -import ( - protoreflect "google.golang.org/protobuf/reflect/protoreflect" - protoimpl "google.golang.org/protobuf/runtime/protoimpl" - reflect "reflect" - sync "sync" -) - -const ( - // Verify that this generated code is sufficiently up-to-date. - _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) - // Verify that runtime/protoimpl is sufficiently up-to-date. 
- _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) -) - -type InspectContainerRequest struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"` - Bformat bool `protobuf:"varint,2,opt,name=bformat,proto3" json:"bformat,omitempty"` - Timeout int32 `protobuf:"varint,3,opt,name=timeout,proto3" json:"timeout,omitempty"` -} - -func (x *InspectContainerRequest) Reset() { - *x = InspectContainerRequest{} - if protoimpl.UnsafeEnabled { - mi := &file_isulad_proto_msgTypes[0] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *InspectContainerRequest) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*InspectContainerRequest) ProtoMessage() {} - -func (x *InspectContainerRequest) ProtoReflect() protoreflect.Message { - mi := &file_isulad_proto_msgTypes[0] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use InspectContainerRequest.ProtoReflect.Descriptor instead. 
-func (*InspectContainerRequest) Descriptor() ([]byte, []int) { - return file_isulad_proto_rawDescGZIP(), []int{0} -} - -func (x *InspectContainerRequest) GetId() string { - if x != nil { - return x.Id - } - return "" -} - -func (x *InspectContainerRequest) GetBformat() bool { - if x != nil { - return x.Bformat - } - return false -} - -func (x *InspectContainerRequest) GetTimeout() int32 { - if x != nil { - return x.Timeout - } - return 0 -} - -type InspectContainerResponse struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - ContainerJSON string `protobuf:"bytes,1,opt,name=ContainerJSON,proto3" json:"ContainerJSON,omitempty"` - Cc uint32 `protobuf:"varint,2,opt,name=cc,proto3" json:"cc,omitempty"` - Errmsg string `protobuf:"bytes,3,opt,name=errmsg,proto3" json:"errmsg,omitempty"` -} - -func (x *InspectContainerResponse) Reset() { - *x = InspectContainerResponse{} - if protoimpl.UnsafeEnabled { - mi := &file_isulad_proto_msgTypes[1] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *InspectContainerResponse) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*InspectContainerResponse) ProtoMessage() {} - -func (x *InspectContainerResponse) ProtoReflect() protoreflect.Message { - mi := &file_isulad_proto_msgTypes[1] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use InspectContainerResponse.ProtoReflect.Descriptor instead. 
-func (*InspectContainerResponse) Descriptor() ([]byte, []int) { - return file_isulad_proto_rawDescGZIP(), []int{1} -} - -func (x *InspectContainerResponse) GetContainerJSON() string { - if x != nil { - return x.ContainerJSON - } - return "" -} - -func (x *InspectContainerResponse) GetCc() uint32 { - if x != nil { - return x.Cc - } - return 0 -} - -func (x *InspectContainerResponse) GetErrmsg() string { - if x != nil { - return x.Errmsg - } - return "" -} - -var File_isulad_proto protoreflect.FileDescriptor - -var file_isulad_proto_rawDesc = []byte{ - 0x0a, 0x0c, 0x69, 0x73, 0x75, 0x6c, 0x61, 0x64, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x0a, - 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x73, 0x22, 0x5d, 0x0a, 0x17, 0x49, 0x6e, - 0x73, 0x70, 0x65, 0x63, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x52, 0x65, - 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, - 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x18, 0x0a, 0x07, 0x62, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, - 0x18, 0x02, 0x20, 0x01, 0x28, 0x08, 0x52, 0x07, 0x62, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x12, - 0x18, 0x0a, 0x07, 0x74, 0x69, 0x6d, 0x65, 0x6f, 0x75, 0x74, 0x18, 0x03, 0x20, 0x01, 0x28, 0x05, - 0x52, 0x07, 0x74, 0x69, 0x6d, 0x65, 0x6f, 0x75, 0x74, 0x22, 0x68, 0x0a, 0x18, 0x49, 0x6e, 0x73, - 0x70, 0x65, 0x63, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x52, 0x65, 0x73, - 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x24, 0x0a, 0x0d, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, - 0x65, 0x72, 0x4a, 0x53, 0x4f, 0x4e, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0d, 0x43, 0x6f, - 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x4a, 0x53, 0x4f, 0x4e, 0x12, 0x0e, 0x0a, 0x02, 0x63, - 0x63, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x02, 0x63, 0x63, 0x12, 0x16, 0x0a, 0x06, 0x65, - 0x72, 0x72, 0x6d, 0x73, 0x67, 0x18, 0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x65, 0x72, 0x72, - 0x6d, 0x73, 0x67, 0x32, 0x68, 0x0a, 0x10, 0x43, 0x6f, 0x6e, 0x74, 
0x61, 0x69, 0x6e, 0x65, 0x72, - 0x53, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x12, 0x54, 0x0a, 0x07, 0x49, 0x6e, 0x73, 0x70, 0x65, - 0x63, 0x74, 0x12, 0x23, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x73, 0x2e, - 0x49, 0x6e, 0x73, 0x70, 0x65, 0x63, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, - 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x24, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, - 0x6e, 0x65, 0x72, 0x73, 0x2e, 0x49, 0x6e, 0x73, 0x70, 0x65, 0x63, 0x74, 0x43, 0x6f, 0x6e, 0x74, - 0x61, 0x69, 0x6e, 0x65, 0x72, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x42, 0x0c, 0x48, - 0x02, 0x5a, 0x08, 0x2e, 0x2f, 0x3b, 0x69, 0x73, 0x75, 0x6c, 0x61, 0x62, 0x06, 0x70, 0x72, 0x6f, - 0x74, 0x6f, 0x33, -} - -var ( - file_isulad_proto_rawDescOnce sync.Once - file_isulad_proto_rawDescData = file_isulad_proto_rawDesc -) - -func file_isulad_proto_rawDescGZIP() []byte { - file_isulad_proto_rawDescOnce.Do(func() { - file_isulad_proto_rawDescData = protoimpl.X.CompressGZIP(file_isulad_proto_rawDescData) - }) - return file_isulad_proto_rawDescData -} - -var file_isulad_proto_msgTypes = make([]protoimpl.MessageInfo, 2) -var file_isulad_proto_goTypes = []interface{}{ - (*InspectContainerRequest)(nil), // 0: containers.InspectContainerRequest - (*InspectContainerResponse)(nil), // 1: containers.InspectContainerResponse -} -var file_isulad_proto_depIdxs = []int32{ - 0, // 0: containers.ContainerService.Inspect:input_type -> containers.InspectContainerRequest - 1, // 1: containers.ContainerService.Inspect:output_type -> containers.InspectContainerResponse - 1, // [1:2] is the sub-list for method output_type - 0, // [0:1] is the sub-list for method input_type - 0, // [0:0] is the sub-list for extension type_name - 0, // [0:0] is the sub-list for extension extendee - 0, // [0:0] is the sub-list for field type_name -} - -func init() { file_isulad_proto_init() } -func file_isulad_proto_init() { - if File_isulad_proto != nil { - return - } - if 
!protoimpl.UnsafeEnabled { - file_isulad_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { - value, ok := v.(*InspectContainerRequest) - if !ok { - return nil - } - - switch v := value; i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_isulad_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} { - value, ok := v.(*InspectContainerResponse) - if !ok { - return nil - } - - switch v := value; i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - } - type x struct{} - out := protoimpl.TypeBuilder{ - File: protoimpl.DescBuilder{ - GoPackagePath: reflect.TypeOf(x{}).PkgPath(), - RawDescriptor: file_isulad_proto_rawDesc, - NumEnums: 0, - NumMessages: 2, - NumExtensions: 0, - NumServices: 1, - }, - GoTypes: file_isulad_proto_goTypes, - DependencyIndexes: file_isulad_proto_depIdxs, - MessageInfos: file_isulad_proto_msgTypes, - }.Build() - File_isulad_proto = out.File - file_isulad_proto_rawDesc = nil - file_isulad_proto_goTypes = nil - file_isulad_proto_depIdxs = nil -} diff --git a/mind-cluster/component/npu-exporter/collector/container/isula/isulad.proto b/mind-cluster/component/npu-exporter/collector/container/isula/isulad.proto deleted file mode 100644 index af5f85c..0000000 --- a/mind-cluster/component/npu-exporter/collector/container/isula/isulad.proto +++ /dev/null @@ -1,35 +0,0 @@ -// ####################################################################### -// ##- Copyright (c) Huawei Technologies Co., Ltd. 2019. All rights reserved. -// # - iSulad licensed under the Mulan PSL v2. -// # - You can use this software according to the terms and conditions of the Mulan PSL v2. 
-// # - You may obtain a copy of Mulan PSL v2 at: -// # - http://license.coscl.org.cn/MulanPSL2 -// # - THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR -// # - IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR -// # - PURPOSE. -// # - See the Mulan PSL v2 for more details. -// ##- @Description: generate grpc -// ##- @Author: wujing -// ##- @Create: 2019-04-25 -// ####################################################################### -syntax = "proto3"; -option optimize_for = CODE_SIZE; - -package containers; -option go_package = "./;isula"; - -service ContainerService { - rpc Inspect(InspectContainerRequest) returns (InspectContainerResponse); -} - -message InspectContainerRequest { - string id = 1; - bool bformat = 2; - int32 timeout = 3; -} - -message InspectContainerResponse { - string ContainerJSON = 1; - uint32 cc = 2; - string errmsg = 3; -} \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/collector/container/isula/isulad_grpc.pb.go b/mind-cluster/component/npu-exporter/collector/container/isula/isulad_grpc.pb.go deleted file mode 100644 index c563e0a..0000000 --- a/mind-cluster/component/npu-exporter/collector/container/isula/isulad_grpc.pb.go +++ /dev/null @@ -1,105 +0,0 @@ -// Code generated by protoc-gen-go-grpc. DO NOT EDIT. -// versions: -// - protoc-gen-go-grpc v1.2.0 -// - protoc v3.13.0 -// source: isulad.proto - -package isula - -import ( - context "context" - grpc "google.golang.org/grpc" - codes "google.golang.org/grpc/codes" - status "google.golang.org/grpc/status" -) - -// This is a compile-time assertion to ensure that this generated file -// is compatible with the grpc package it is being compiled against. -// Requires gRPC-Go v1.32.0 or later. -const _ = grpc.SupportPackageIsVersion7 - -// ContainerServiceClient is the client API for ContainerService service. 
-// -// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream. -type ContainerServiceClient interface { - Inspect(ctx context.Context, in *InspectContainerRequest, opts ...grpc.CallOption) (*InspectContainerResponse, error) -} - -type containerServiceClient struct { - cc grpc.ClientConnInterface -} - -func NewContainerServiceClient(cc grpc.ClientConnInterface) ContainerServiceClient { - return &containerServiceClient{cc} -} - -func (c *containerServiceClient) Inspect(ctx context.Context, in *InspectContainerRequest, opts ...grpc.CallOption) (*InspectContainerResponse, error) { - out := new(InspectContainerResponse) - err := c.cc.Invoke(ctx, "/containers.ContainerService/Inspect", in, out, opts...) - if err != nil { - return nil, err - } - return out, nil -} - -// ContainerServiceServer is the server API for ContainerService service. -// All implementations must embed UnimplementedContainerServiceServer -// for forward compatibility -type ContainerServiceServer interface { - Inspect(context.Context, *InspectContainerRequest) (*InspectContainerResponse, error) - mustEmbedUnimplementedContainerServiceServer() -} - -// UnimplementedContainerServiceServer must be embedded to have forward compatible implementations. -type UnimplementedContainerServiceServer struct { -} - -func (UnimplementedContainerServiceServer) Inspect(context.Context, *InspectContainerRequest) (*InspectContainerResponse, error) { - return nil, status.Errorf(codes.Unimplemented, "method Inspect not implemented") -} -func (UnimplementedContainerServiceServer) mustEmbedUnimplementedContainerServiceServer() {} - -// UnsafeContainerServiceServer may be embedded to opt out of forward compatibility for this service. -// Use of this interface is not recommended, as added methods to ContainerServiceServer will -// result in compilation errors. 
-type UnsafeContainerServiceServer interface { - mustEmbedUnimplementedContainerServiceServer() -} - -func RegisterContainerServiceServer(s grpc.ServiceRegistrar, srv ContainerServiceServer) { - s.RegisterService(&ContainerService_ServiceDesc, srv) -} - -func _ContainerService_Inspect_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { - in := new(InspectContainerRequest) - if err := dec(in); err != nil { - return nil, err - } - if interceptor == nil { - return srv.(ContainerServiceServer).Inspect(ctx, in) - } - info := &grpc.UnaryServerInfo{ - Server: srv, - FullMethod: "/containers.ContainerService/Inspect", - } - handler := func(ctx context.Context, req interface{}) (interface{}, error) { - return srv.(ContainerServiceServer).Inspect(ctx, req.(*InspectContainerRequest)) - } - return interceptor(ctx, in, info, handler) -} - -// ContainerService_ServiceDesc is the grpc.ServiceDesc for ContainerService service. -// It's only intended for direct use with grpc.RegisterService, -// and not to be introspected or modified (even as a copy) -var ContainerService_ServiceDesc = grpc.ServiceDesc{ - ServiceName: "containers.ContainerService", - HandlerType: (*ContainerServiceServer)(nil), - Methods: []grpc.MethodDesc{ - { - MethodName: "Inspect", - Handler: _ContainerService_Inspect_Handler, - }, - }, - Streams: []grpc.StreamDesc{}, - Metadata: "isulad.proto", -} diff --git a/mind-cluster/component/npu-exporter/collector/container/parser.go b/mind-cluster/component/npu-exporter/collector/container/parser.go deleted file mode 100644 index 4531374..0000000 --- a/mind-cluster/component/npu-exporter/collector/container/parser.go +++ /dev/null @@ -1,630 +0,0 @@ -/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package container for monitoring containers' npu allocation -package container - -import ( - "bufio" - "context" - "errors" - "fmt" - "math" - "os" - "regexp" - "strconv" - "strings" - "sync" - "time" - - "ascend-common/api" - "ascend-common/common-utils/hwlog" - "ascend-common/common-utils/utils" - "huawei.com/npu-exporter/v6/collector/container/isula" - "huawei.com/npu-exporter/v6/collector/container/v1" - "huawei.com/npu-exporter/v6/utils/logger" -) - -const ( - namespaceMoby = "moby" // Docker - namespaceK8s = "k8s.io" // CRI + Containerd - sliceLen8 = 8 - ascendEnvPart = 2 - charDevice = "c" - - minus = "-" - comma = "," - ascend = "Ascend" - maxEnvLength = 1024 - parsingNpuDefaultTimeoutDuration = 3 -) - -const ( - // EndpointTypeContainerd K8S + Containerd - EndpointTypeContainerd = iota - // EndpointTypeDockerd Docker with or without K8S - EndpointTypeDockerd - // EndpointTypeIsula K8S + isula - EndpointTypeIsula = 2 -) - -var ( - // ErrFromContext error is from the context - ErrFromContext = errors.New("error from context") - - npuMajorID []string - npuMajorFetchCtrl sync.Once - parsingNpuDefaultTimeout = parsingNpuDefaultTimeoutDuration * time.Second -) - -var ( - envErrDescribe = func(ctrID, devID, env string, err error) string { - return fmt.Sprintf("container (%s) has an invalid device ID (%s) in %s, err is %v", ctrID, devID, env, err) - } - minusStyle = func(s string) bool { - return strings.Contains(s, minus) - } - commaMinusStyle = func(s string) bool { - return strings.Contains(s, minus) && strings.Contains(s, comma) - } - ascendStyle = 
func(s string) bool { - return strings.Contains(s, ascend) - } -) - -// CntNpuMonitorOpts contains setting options for monitoring containers -type CntNpuMonitorOpts struct { - EndpointType int // containerd or docker - CriEndpoint string // CRI server address - UseCriBackup bool // whether try to use cri backup address - OciEndpoint string // OCI server, now is containerd address - UseOciBackup bool // whether try to use oci backup address -} - -// MakeDevicesParser evaluates option settings and make an instance according to it -func MakeDevicesParser(opts CntNpuMonitorOpts) *DevicesParser { - runtimeOperator := &RuntimeOperatorTool{ - UseCriBackup: opts.UseCriBackup, - UseOciBackup: opts.UseOciBackup, - CriEndpoint: opts.CriEndpoint, - OciEndpoint: opts.OciEndpoint, - } - parser := &DevicesParser{ - RuntimeOperator: runtimeOperator, - } - - switch opts.EndpointType { - case EndpointTypeContainerd: - runtimeOperator.Namespace = namespaceK8s - case EndpointTypeDockerd: - runtimeOperator.Namespace = namespaceMoby - case EndpointTypeIsula: - runtimeOperator.Namespace = namespaceK8s - default: - logger.Errorf("invalid type value %d", opts.EndpointType) - } - - return parser -} - -// DevicesInfo the container device information struct -type DevicesInfo struct { - // container id - ID string - // container name, the format is: PodNameSpace_PodName_ContainerName - Name string - Devices []int -} - -// DevicesInfos the device information storage map -type DevicesInfos = map[string]DevicesInfo - -// DevicesParser the parser which parse device info -type DevicesParser struct { - // instances - result chan DevicesInfos - err chan error - // configuration - RuntimeOperator RuntimeOperator - Timeout time.Duration -} - -// Init initializes connection to containerd daemon and to CRI server or dockerd daemon based on name fetcher setting -func (dp *DevicesParser) Init() error { - if err := dp.RuntimeOperator.Init(); err != nil { - return contactError(err, "connecting to container 
runtime failed") - } - dp.result = make(chan DevicesInfos, 1) - dp.err = make(chan error, 1) - return nil -} - -// RecvResult exposes the channel used for receiving devices info analyzing result -func (dp *DevicesParser) RecvResult() <-chan DevicesInfos { - return dp.result -} - -// RecvErr exposes the channel used for receiving errors occurred during analyzing -func (dp *DevicesParser) RecvErr() <-chan error { - return dp.err -} - -// Close closes all connections and channels established during initializing -func (dp *DevicesParser) Close() { - _ = dp.RuntimeOperator.Close() -} - -func (dp *DevicesParser) parseDevices(ctx context.Context, c *CommonContainer, rs chan<- DevicesInfo) error { - if dp.RuntimeOperator.GetContainerType() == IsulaContainer { - return dp.parseDeviceInIsula(ctx, c, rs) - } - - return dp.parseDevicesInContainerd(ctx, c, rs) -} - -func (dp *DevicesParser) parseDevicesInContainerd(ctx context.Context, c *CommonContainer, - rs chan<- DevicesInfo) error { - if rs == nil { - return errors.New("empty result channel") - } - deviceInfo := DevicesInfo{} - defer func(di *DevicesInfo) { - rs <- *di - }(&deviceInfo) - - spec, err := dp.RuntimeOperator.GetContainerInfoByID(ctx, c.Id) - if err != nil { - return contactError(err, fmt.Sprintf("cannot get container devices by container id (%s)", c.Id)) - } - if spec.Linux == nil || spec.Linux.Resources == nil || len(spec.Linux.Resources.Devices) > maxDevicesNum { - return contactError(errors.New("device error"), - fmt.Sprintf("devices in container is too much (%v) or empty", maxDevicesNum)) - } - if spec.Process == nil || len(spec.Process.Env) > maxEnvNum { - return contactError(errors.New("env error"), fmt.Sprintf("env in container is too much (%v) or empty", - maxEnvNum)) - } - - envs := spec.Process.Env - for i := len(envs) - 1; i >= 0; i-- { - e := envs[i] - if strings.Contains(e, api.AscendDeviceInfo) { - deviceInfo, err = dp.getDevicesWithAscendRuntime(e, c) - return err - } - } - - deviceInfo, err = 
dp.getDevicesWithoutAscendRuntime(spec, c) - return err -} - -func (dp *DevicesParser) getDevicesWithoutAscendRuntime(spec v1.Spec, c *CommonContainer) (DevicesInfo, error) { - deviceInfo := DevicesInfo{} - devicesIDs, err := filterNPUDevices(spec) - if err != nil { - logger.Debugf("filter npu devices failed by container id (%s), err is %v", c.Id, err) - return DevicesInfo{}, nil - } - logger.Debugf("filter npu devices %v in container (%s)", devicesIDs, c.Id) - - if len(devicesIDs) != 0 { - if deviceInfo, err = makeUpDeviceInfo(c); err == nil { - deviceInfo.Devices = devicesIDs - return deviceInfo, nil - } else { - logger.Errorf("makeUpDeviceInfo failed: %s", err) - } - return DevicesInfo{}, err - } - - return DevicesInfo{}, nil -} - -func (dp *DevicesParser) getDevicesWithAscendRuntime(ascendDevEnv string, c *CommonContainer) (DevicesInfo, error) { - logger.Debugf("get device info by env (%s) in %s", ascendDevEnv, c.Id) - devInfo := strings.Split(ascendDevEnv, "=") - if len(devInfo) != ascendEnvPart { - return DevicesInfo{}, fmt.Errorf("an invalid %s env(%s)", api.AscendDeviceInfo, ascendDevEnv) - } - devicesIDs := dp.parseDiffEnvFmt(devInfo[1], c.Id) - if len(devicesIDs) == 0 { - return DevicesInfo{}, nil - } - - deviceInfo, err := makeUpDeviceInfo(c) - if err != nil { - hwlog.RunLog.Error(err) - return DevicesInfo{}, err - } - deviceInfo.Devices = devicesIDs - return deviceInfo, nil -} - -func (dp *DevicesParser) parseDiffEnvFmt(devices, containerID string) []int { - if len(devices) > maxEnvLength { - return []int{} - } - if ascendStyle(devices) { - return dp.getDeviceIDsByAscendStyle(devices, containerID) - } - if commaMinusStyle(devices) { - return dp.getDeviceIDsByCommaMinusStyle(devices, containerID) - } - if minusStyle(devices) { - return dp.getDeviceIDsByMinusStyle(devices, containerID) - } - return dp.getDeviceIDsByCommaStyle(devices, containerID) -} - -func (dp *DevicesParser) getDeviceIDsByCommaStyle(devices, containerID string) []int { - devList := 
strings.Split(devices, comma) - devicesIDs := make([]int, 0, len(devList)) - for _, devID := range devList { - id, err := strconv.Atoi(devID) - if err != nil { - logger.Errorf("container (%s) has an invalid device ID (%v) in %s, error is %s", containerID, - devID, api.AscendDeviceInfo, err) - continue - } - devicesIDs = append(devicesIDs, id) - } - return devicesIDs -} - -func (dp *DevicesParser) getDeviceIDsByAscendStyle(devices, containerID string) []int { - devList := strings.Split(devices, comma) - deviceIDs := make([]int, 0, len(devList)) - for _, subDevice := range devList { - deviceName := strings.Split(subDevice, minus) - if len(deviceName) != ascendEnvPart { - logger.Errorf(envErrDescribe(containerID, "", api.AscendDeviceInfo, nil)) - continue - } - id, err := strconv.Atoi(deviceName[1]) - if err != nil { - logger.Errorf(envErrDescribe(containerID, deviceName[1], api.AscendDeviceInfo, err)) - continue - } - deviceIDs = append(deviceIDs, id) - } - return deviceIDs -} - -func (dp *DevicesParser) getDeviceIDsByMinusStyle(devices, containerID string) []int { - deviceIDs := make([]int, 0) - devIDRange := strings.Split(devices, minus) - if len(devIDRange) != ascendEnvPart { - logger.Errorf(envErrDescribe(containerID, "range", api.AscendDeviceInfo, nil)) - return deviceIDs - } - minDevID, err := strconv.Atoi(devIDRange[0]) - if err != nil { - logger.Errorf(envErrDescribe(containerID, devIDRange[0], api.AscendDeviceInfo, err)) - return deviceIDs - } - maxDevID, err := strconv.Atoi(devIDRange[1]) - if err != nil { - logger.Errorf(envErrDescribe(containerID, devIDRange[1], api.AscendDeviceInfo, err)) - return deviceIDs - } - if minDevID > maxDevID { - logger.Errorf(envErrDescribe(containerID, "", - api.AscendDeviceInfo, errors.New("min id bigger than max id"))) - return deviceIDs - } - if maxDevID > math.MaxInt16 { - logger.Errorf(envErrDescribe(containerID, "", api.AscendDeviceInfo, errors.New("max id invalid"))) - return deviceIDs - } - for deviceID := minDevID; 
deviceID <= maxDevID; deviceID++ { - deviceIDs = append(deviceIDs, deviceID) - } - return deviceIDs -} - -func (dp *DevicesParser) getDeviceIDsByCommaMinusStyle(devices, containerID string) []int { - var deviceIDs []int - devList := strings.Split(devices, comma) - for _, subDevices := range devList { - if minusStyle(subDevices) { - deviceIDs = append(deviceIDs, dp.getDeviceIDsByMinusStyle(subDevices, containerID)...) - continue - } - deviceIDs = append(deviceIDs, dp.getDeviceIDsByCommaStyle(subDevices, containerID)...) - } - return deviceIDs -} - -func (dp *DevicesParser) getDevWithoutAscendRuntimeInIsula(containerInfo isula.ContainerJson, - c *CommonContainer) (DevicesInfo, error) { - deviceInfo := DevicesInfo{} - devicesIDs, err := filterNPUDevicesInIsula(containerInfo) - if err != nil { - logger.Debugf("filter npu devices failed by container id (%s), err is %v", c.Id, err) - return DevicesInfo{}, nil - } - logger.Debugf("filter npu devices %v in container (%s)", devicesIDs, c.Id) - - if len(devicesIDs) == 0 { - return DevicesInfo{}, nil - } - - deviceInfo, err = makeUpDeviceInfo(c) - if err != nil { - hwlog.RunLog.Error(err) - return DevicesInfo{}, err - } - deviceInfo.Devices = devicesIDs - return deviceInfo, nil -} - -func (dp *DevicesParser) parseDeviceInIsula(ctx context.Context, c *CommonContainer, rs chan<- DevicesInfo) error { - if rs == nil { - return errors.New("empty result channel") - } - - deviceInfo := DevicesInfo{} - defer func(di *DevicesInfo) { - rs <- *di - }(&deviceInfo) - - if len(c.Id) > maxCgroupPath { - return fmt.Errorf("the containerId (%s) is too long", c.Id) - } - containerInfo, err := dp.RuntimeOperator.GetIsulaContainerInfoByID(ctx, c.Id) - if err != nil { - return contactError(err, fmt.Sprintf("getting config of container(%s) fail", c.Id)) - } - if containerInfo.HostConfig == nil || containerInfo.Config == nil { - return errors.New("empty container info") - } - - envs := containerInfo.Config.Env - for i := len(envs) - 1; i >= 0; i-- 
{ - e := envs[i] - if strings.Contains(e, api.AscendDeviceInfo) { - deviceInfo, err = dp.getDevicesWithAscendRuntime(e, c) - return err - } - } - - deviceInfo, err = dp.getDevWithoutAscendRuntimeInIsula(containerInfo, c) - return err -} - -func (dp *DevicesParser) collect(ctx context.Context, r <-chan DevicesInfo, ct int32) (DevicesInfos, error) { - if r == nil { - return nil, errors.New("receiving channel is empty") - } - if ct < 0 { - return nil, nil - } - - results := make(map[string]DevicesInfo, ct) - for { - select { - case info, ok := <-r: - if !ok { - return nil, nil - } - if info.ID != "" { - results[info.ID] = info - } - if ct -= 1; ct <= 0 { - return results, nil - } - case <-ctx.Done(): - hwlog.RunLog.Error("ctx is timeout") - dp.err <- ErrFromContext - return nil, nil - } - } -} - -func (dp *DevicesParser) doParse(resultOut chan<- DevicesInfos) { - var result DevicesInfos = nil - defer func(rslt DevicesInfos) { - if resultOut != nil { - resultOut <- rslt - close(resultOut) - } - }(result) - - ctx := context.Background() - containers, err := dp.RuntimeOperator.GetContainers(ctx) - if err != nil { - dp.err <- err - return - } - - l := len(containers) - if l == 0 || l > maxContainers { - logger.Debugf("get %d containers from cri interface, return empty data", l) - dp.result <- make(DevicesInfos) - return - } - - r := make(chan DevicesInfo) - defer close(r) - wg := sync.WaitGroup{} - wg.Add(l) - - for _, container := range containers { - go func(container *CommonContainer, c context.Context) { - if err := dp.parseDevices(c, container, r); err != nil { - dp.err <- err - } - wg.Done() - }(container, ctx) - } - ctx, cancelFn := context.WithTimeout(ctx, withDefault(dp.Timeout, parsingNpuDefaultTimeout)) - defer cancelFn() - result, err = dp.collect(ctx, r, int32(l)) - if err != nil { - logger.Errorf("collect info error: %v", err) - } - - if result != nil { - dp.result <- result - } - wg.Wait() -} - -// FetchAndParse triggers the asynchronous process of querying 
and analyzing all containers -// resultOut channel is for fetching the current result -func (dp *DevicesParser) FetchAndParse(resultOut chan<- DevicesInfos) { - if dp.err == nil { - logger.Debug("device paster is not initialized") - return - } - go dp.doParse(resultOut) -} - -func withDefault(v time.Duration, d time.Duration) time.Duration { - if v == 0 { - return d - } - - return v -} - -// query the MajorID of NPU devices -func getNPUMajorID() ([]string, error) { - const ( - deviceCount = 2 - maxSearchLine = 512 - ) - - path, err := utils.CheckPath("/proc/devices") - if err != nil { - return nil, err - } - majorID := make([]string, 0, deviceCount) - f, err := os.Open(path) - if err != nil { - return majorID, err - } - defer func() { - err = f.Close() - if err != nil { - hwlog.RunLog.Error(err) - } - }() - s := bufio.NewScanner(f) - count := 0 - for s.Scan() { - // prevent from searching too many lines - if count > maxSearchLine { - break - } - count++ - text := s.Text() - matched, err := regexp.MatchString("^[0-9]{1,3}\\s[v]?devdrv-cdev$", text) - if err != nil { - return majorID, err - } - if !matched { - continue - } - fields := strings.Fields(text) - majorID = append(majorID, fields[0]) - } - return majorID, nil -} - -func npuMajor() []string { - npuMajorFetchCtrl.Do(func() { - var err error - npuMajorID, err = getNPUMajorID() - if err != nil { - return - } - }) - return npuMajorID -} - -func contains(slice []string, target string) bool { - for _, v := range slice { - if v == target { - return true - } - } - return false -} - -func contactError(err error, msg string) error { - return fmt.Errorf("%s->%s", err.Error(), msg) -} - -func filterNPUDevices(spec v1.Spec) ([]int, error) { - if spec.Linux == nil || spec.Linux.Resources == nil { - return nil, errors.New("empty spec info") - } - - const base = 10 - devIDs := make([]int, 0, sliceLen8) - majorIDs := npuMajor() - for _, dev := range spec.Linux.Resources.Devices { - if dev.Minor == nil || dev.Major == nil { - 
// do not monitor privileged container - continue - } - if *dev.Minor > math.MaxInt32 { - return nil, fmt.Errorf("get wrong device ID (%v)", dev.Minor) - } - major := strconv.FormatInt(*dev.Major, base) - if dev.Type == charDevice && contains(majorIDs, major) { - devIDs = append(devIDs, int(*dev.Minor)) - } - } - - return devIDs, nil -} - -// filterNPUDevicesInIsula get id of device from containerJson(containerInfo) -func filterNPUDevicesInIsula(containerInfo isula.ContainerJson) ([]int, error) { - privileged := containerInfo.HostConfig.Privileged - if privileged { - return nil, errors.New("it's a privileged container and skip it") - } - - devIDs := make([]int, 0, sliceLen8) - devices := containerInfo.HostConfig.Devices - for _, dev := range devices { - Id, err := getDevIdFromPath(api.DevicePathPattern, dev.PathInContainer) - if err != nil { - logger.Warn(err) - continue - } - devIDs = append(devIDs, Id) - } - - return devIDs, nil -} - -func getDevIdFromPath(pattern, path string) (int, error) { - if match, err := regexp.MatchString(pattern, path); err != nil || !match { - return -1, fmt.Errorf("unexpected path of device: %s or match error: %v", path, err) - } - number := regexp.MustCompile(`\d+`) - IdStr := number.FindString(path) - Id, err := strconv.Atoi(IdStr) - if err != nil { - return -1, fmt.Errorf("unexpected device ID (%v)", IdStr) - } - if Id > math.MaxInt32 { - return -1, fmt.Errorf("get wrong device ID (%v)", Id) - } - return Id, nil -} diff --git a/mind-cluster/component/npu-exporter/collector/container/parser_test.go b/mind-cluster/component/npu-exporter/collector/container/parser_test.go deleted file mode 100644 index f2975b9..0000000 --- a/mind-cluster/component/npu-exporter/collector/container/parser_test.go +++ /dev/null @@ -1,1027 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package container provides utilities for container monitoring and testing. -package container - -import ( - "context" - "errors" - "os" - "testing" - "time" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" - - "ascend-common/common-utils/utils" - "huawei.com/npu-exporter/v6/collector/container/isula" - "huawei.com/npu-exporter/v6/collector/container/v1" - "huawei.com/npu-exporter/v6/utils/logger" -) - -const ( - // Test endpoint constants - testContainerdEndpoint = "unix:///run/containerd.sock" - testDockerEndpoint = "unix:///run/docker.sock" - - device0 = 0 - device1 = 1 - device2 = 2 - device3 = 3 - testDeviceRange = "0-2" - testDeviceComma = "0,1,2" - testDeviceCommaRange = "0-1,2-3" - testAscendDevice0 = "Ascend-0" - testAscendDevices = "Ascend-0,Ascend-1" - testMixedDevices = "0-1,3" - - // Test error constants - testOriginalError = "original error" - testErrorMessage = "test message" - testContactedError = "original error->test message" - - // Test path constants - testDevicePattern = "/dev/npu([0-9]+)" - - // Test duration constants - testZeroDuration = 0 -) - -func TestMakeDevicesParser(t *testing.T) { - testCases := []struct { - name string - opts CntNpuMonitorOpts - expected *DevicesParser - }{ - {name: "should create parser when options are valid for containerd", - opts: CntNpuMonitorOpts{CriEndpoint: testContainerdEndpoint, EndpointType: EndpointTypeContainerd, - OciEndpoint: testContainerdEndpoint, UseOciBackup: false, UseCriBackup: false}, - expected: &DevicesParser{RuntimeOperator: 
&RuntimeOperatorTool{UseOciBackup: false, UseCriBackup: false, - CriEndpoint: testContainerdEndpoint, OciEndpoint: testContainerdEndpoint}, Timeout: testZeroDuration}}, - {name: "should create parser when options are valid for docker", - opts: CntNpuMonitorOpts{CriEndpoint: testDockerEndpoint, EndpointType: EndpointTypeDockerd, - OciEndpoint: testDockerEndpoint, UseOciBackup: true, UseCriBackup: false}, - expected: &DevicesParser{RuntimeOperator: &RuntimeOperatorTool{UseOciBackup: true, UseCriBackup: true, - CriEndpoint: testDockerEndpoint, OciEndpoint: testDockerEndpoint}, Timeout: testZeroDuration}}, - {name: "should create parser when options are valid for isula", - opts: CntNpuMonitorOpts{CriEndpoint: testContainerdEndpoint, EndpointType: EndpointTypeIsula, - OciEndpoint: testContainerdEndpoint, UseOciBackup: true, UseCriBackup: true}, - expected: &DevicesParser{RuntimeOperator: &RuntimeOperatorTool{UseOciBackup: true, UseCriBackup: true, - CriEndpoint: testContainerdEndpoint, OciEndpoint: testContainerdEndpoint}, Timeout: testZeroDuration}}, - } - - for _, tc := range testCases { - convey.Convey(tc.name, t, func() { - result := MakeDevicesParser(tc.opts) - convey.So(result, convey.ShouldNotBeNil) - convey.So(result.RuntimeOperator, convey.ShouldNotBeNil) - convey.So(result.Timeout, convey.ShouldEqual, tc.expected.Timeout) - }) - } -} - -func TestDevicesParserInit(t *testing.T) { - convey.Convey("TestDevicesParserInit", t, func() { - convey.Convey("should initialize successfully when runtime operator init succeeds", func() { - dp := &DevicesParser{ - RuntimeOperator: &RuntimeOperatorTool{}, - } - - patches := gomonkey.ApplyMethodReturn(dp.RuntimeOperator, "Init", nil) - defer patches.Reset() - - err := dp.Init() - convey.So(err, convey.ShouldBeNil) - }) - - convey.Convey("should return error when initialization fails", func() { - dp := &DevicesParser{ - RuntimeOperator: &RuntimeOperatorTool{}, - } - patches := gomonkey.ApplyMethodReturn(dp.RuntimeOperator, 
"Init", errors.New("init failed")) - defer patches.Reset() - err := dp.Init() - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldContainSubstring, "init failed") - }) - }) -} - -func TestDevicesParserRecvResult(t *testing.T) { - convey.Convey("TestDevicesParserRecvResult", t, func() { - convey.Convey("should return result channel when initialized", func() { - dp := &DevicesParser{ - result: make(chan DevicesInfos, 1), - } - resultChan := dp.RecvResult() - convey.So(resultChan, convey.ShouldNotBeNil) - }) - }) -} - -func TestDevicesParserRecvErr(t *testing.T) { - convey.Convey("TestDevicesParserRecvErr", t, func() { - convey.Convey("should return error channel when initialized", func() { - dp := &DevicesParser{ - err: make(chan error, 1), - } - errChan := dp.RecvErr() - convey.So(errChan, convey.ShouldNotBeNil) - }) - }) -} - -func TestDevicesParserClose(t *testing.T) { - convey.Convey("TestDevicesParserClose", t, func() { - convey.Convey("should close runtime operator when called", func() { - mockOperator := &RuntimeOperatorTool{} - dp := &DevicesParser{ - RuntimeOperator: mockOperator, - } - - visited := false - patches := gomonkey.ApplyMethod(mockOperator, "Close", func(*RuntimeOperatorTool) error { - visited = true - return nil - }) - defer patches.Reset() - - dp.Close() - convey.So(visited, convey.ShouldBeTrue) - }) - }) -} - -func TestDevicesParserParseDevices(t *testing.T) { - convey.Convey("TestDevicesParserParseDevices", t, func() { - convey.Convey("should parse isula devices when container type is isula", func() { - dp := &DevicesParser{} - mockOperator := &RuntimeOperatorTool{} - dp.RuntimeOperator = mockOperator - - patches := gomonkey.ApplyMethodReturn(mockOperator, "GetContainerType", IsulaContainer). 
- ApplyFuncReturn((*DevicesParser).parseDeviceInIsula, nil) - defer patches.Reset() - - ctx := context.Background() - container := &CommonContainer{Id: "test-container"} - resultChan := make(chan DevicesInfo, 1) - err := dp.parseDevices(ctx, container, resultChan) - convey.So(err, convey.ShouldBeNil) - }) - - convey.Convey("should parse containerd devices when container type is not isula", func() { - dp := &DevicesParser{} - mockOperator := &RuntimeOperatorTool{} - dp.RuntimeOperator = mockOperator - - patches := gomonkey.ApplyMethodReturn(mockOperator, "GetContainerType", DefaultContainer). - ApplyFuncReturn((*DevicesParser).parseDevicesInContainerd, nil) - defer patches.Reset() - - ctx := context.Background() - container := &CommonContainer{Id: "test-container"} - resultChan := make(chan DevicesInfo, 1) - err := dp.parseDevices(ctx, container, resultChan) - convey.So(err, convey.ShouldBeNil) - }) - }) -} - -func TestDevicesParserParseDevicesInContainerd(t *testing.T) { - convey.Convey("TestDevicesParserParseDevicesInContainerd", t, func() { - convey.Convey("should return error when result channel is nil", func() { - dp := &DevicesParser{} - ctx := context.Background() - container := &CommonContainer{Id: "test-container"} - - err := dp.parseDevicesInContainerd(ctx, container, nil) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldContainSubstring, "empty result channel") - }) - - convey.Convey("should return error when get container info fails", func() { - dp := &DevicesParser{} - mockOperator := &RuntimeOperatorTool{} - dp.RuntimeOperator = mockOperator - - patches := gomonkey.ApplyMethod(mockOperator, "GetContainerInfoByID", - func(*RuntimeOperatorTool, context.Context, string) (v1.Spec, error) { - return v1.Spec{}, errors.New("get container info failed") - }) - defer patches.Reset() - - ctx := context.Background() - container := &CommonContainer{Id: "test-container"} - resultChan := make(chan DevicesInfo, 1) - - err := 
dp.parseDevicesInContainerd(ctx, container, resultChan) - convey.So(err, convey.ShouldNotBeNil) - }) - }) -} - -func TestDevicesParserGetDevicesWithoutAscendRuntime(t *testing.T) { - convey.Convey("TestDevicesParserGetDevicesWithoutAscendRuntime", t, func() { - convey.Convey("should return devices when filter succeeds", func() { - dp := &DevicesParser{} - - patches := gomonkey.ApplyFuncReturn(filterNPUDevices, []int{device0, device1, device2}, nil) - defer patches.Reset() - - patches.ApplyFuncReturn(makeUpDeviceInfo, DevicesInfo{ID: "test", Name: "test-name"}, nil) - - spec := v1.Spec{} - container := &CommonContainer{Id: "test-container"} - - result, err := dp.getDevicesWithoutAscendRuntime(spec, container) - convey.So(err, convey.ShouldBeNil) - convey.So(result.Devices, convey.ShouldResemble, []int{device0, device1, device2}) - }) - - convey.Convey("should return empty when filter fails", func() { - dp := &DevicesParser{} - - patches := gomonkey.ApplyFuncReturn(filterNPUDevices, nil, errors.New("filter failed")) - defer patches.Reset() - - spec := v1.Spec{} - container := &CommonContainer{Id: "test-container"} - - result, err := dp.getDevicesWithoutAscendRuntime(spec, container) - convey.So(err, convey.ShouldBeNil) - convey.So(result, convey.ShouldResemble, DevicesInfo{}) - }) - }) -} - -func TestDevicesParserGetDevicesWithAscendRuntime(t *testing.T) { - convey.Convey("TestDevicesParserGetDevicesWithAscendRuntime", t, func() { - convey.Convey("should return error when env format is invalid", func() { - dp := &DevicesParser{} - ascendDevEnv := "invalid-env" - container := &CommonContainer{Id: "test-container"} - - result, err := dp.getDevicesWithAscendRuntime(ascendDevEnv, container) - convey.So(err, convey.ShouldNotBeNil) - convey.So(result, convey.ShouldResemble, DevicesInfo{}) - }) - - convey.Convey("should return devices when env format is valid", func() { - dp := &DevicesParser{} - ascendDevEnv := "ASCEND_VISIBLE_DEVICES=0,1,2" - container := 
&CommonContainer{Id: "test-container"} - - patches := gomonkey.ApplyFunc(makeUpDeviceInfo, func(*CommonContainer) (DevicesInfo, error) { - return DevicesInfo{ID: "test", Name: "test-name"}, nil - }) - defer patches.Reset() - - result, err := dp.getDevicesWithAscendRuntime(ascendDevEnv, container) - convey.So(err, convey.ShouldBeNil) - convey.So(result.Devices, convey.ShouldResemble, []int{device0, device1, device2}) - }) - }) -} - -func TestDevicesParserGetDevWithoutAscendRuntimeInIsula(t *testing.T) { - convey.Convey("TestDevicesParserGetDevWithoutAscendRuntimeInIsula", t, func() { - convey.Convey("should return devices when filter succeeds", func() { - dp := &DevicesParser{} - containerInfo := isula.ContainerJson{} - container := &CommonContainer{Id: "test-container"} - - patches := gomonkey.ApplyFuncReturn(filterNPUDevicesInIsula, []int{device0, device1, device2}, nil) - defer patches.Reset() - - patches.ApplyFuncReturn(makeUpDeviceInfo, DevicesInfo{ID: "test", Name: "test-name"}, nil) - - result, err := dp.getDevWithoutAscendRuntimeInIsula(containerInfo, container) - convey.So(err, convey.ShouldBeNil) - convey.So(result.Devices, convey.ShouldResemble, []int{device0, device1, device2}) - }) - - convey.Convey("should return empty when filter fails", func() { - dp := &DevicesParser{} - containerInfo := isula.ContainerJson{} - container := &CommonContainer{Id: "test-container"} - - patches := gomonkey.ApplyFuncReturn(filterNPUDevicesInIsula, nil, errors.New("filter failed")) - defer patches.Reset() - - result, err := dp.getDevWithoutAscendRuntimeInIsula(containerInfo, container) - convey.So(err, convey.ShouldBeNil) - convey.So(result, convey.ShouldResemble, DevicesInfo{}) - }) - }) -} - -func TestDevicesParserParseDeviceInIsula(t *testing.T) { - convey.Convey("TestDevicesParserParseDeviceInIsula", t, func() { - convey.Convey("should return error when result channel is nil", func() { - dp := &DevicesParser{} - ctx := context.Background() - container := 
&CommonContainer{Id: "test-container"} - - err := dp.parseDeviceInIsula(ctx, container, nil) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldContainSubstring, "empty result channel") - }) - - convey.Convey("should return error when container id is too long", func() { - dp := &DevicesParser{} - ctx := context.Background() - longId := string(make([]byte, maxCgroupPath+1)) - container := &CommonContainer{Id: longId} - resultChan := make(chan DevicesInfo, 1) - - err := dp.parseDeviceInIsula(ctx, container, resultChan) - convey.So(err, convey.ShouldNotBeNil) - }) - }) -} - -func TestDevicesParserCollect(t *testing.T) { - convey.Convey("TestDevicesParserCollect", t, func() { - convey.Convey("should return error when receiving channel is nil", func() { - dp := &DevicesParser{} - ctx := context.Background() - - result, err := dp.collect(ctx, nil, 1) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldContainSubstring, "receiving channel is empty") - convey.So(result, convey.ShouldBeNil) - }) - - convey.Convey("should return nil when count is negative", func() { - dp := &DevicesParser{} - ctx := context.Background() - resultChan := make(chan DevicesInfo) - - result, err := dp.collect(ctx, resultChan, -1) - convey.So(err, convey.ShouldBeNil) - convey.So(result, convey.ShouldBeNil) - }) - }) -} - -func TestDevicesParserDoParse(t *testing.T) { - convey.Convey("TestDevicesParserDoParse", t, func() { - const time100ms = 100 * time.Millisecond - convey.Convey("should handle error when get containers fails", func() { - dp := &DevicesParser{ - err: make(chan error, 1), - } - mockOperator := &RuntimeOperatorTool{} - dp.RuntimeOperator = mockOperator - - patches := gomonkey.ApplyMethod(mockOperator, "GetContainers", - func(*RuntimeOperatorTool, context.Context) ([]*CommonContainer, error) { - return nil, errors.New("get containers failed") - }) - defer patches.Reset() - - resultChan := make(chan DevicesInfos, 1) - 
dp.doParse(resultChan) - - select { - case err := <-dp.err: - convey.So(err, convey.ShouldNotBeNil) - case <-time.After(time100ms): - convey.So("timeout", convey.ShouldEqual, "should receive error") - } - }) - }) -} - -func TestDevicesParserFetchAndParse(t *testing.T) { - const time10ms = 10 * time.Millisecond - convey.Convey("TestDevicesParserFetchAndParse", t, func() { - convey.Convey("should return early when err channel is nil", func() { - dp := &DevicesParser{ - err: nil, - } - visited := make(chan bool, 1) - patches := gomonkey.ApplyPrivateMethod(dp, "doParse", - func(*DevicesParser, chan<- DevicesInfos) error { - visited <- true - return nil - }) - defer patches.Reset() - - dp.FetchAndParse(nil) - time.Sleep(time10ms) - convey.So(len(visited), convey.ShouldEqual, 0) - }) - - convey.Convey("should start parsing when initialized", func() { - dp := &DevicesParser{ - err: make(chan error, 1), - RuntimeOperator: &RuntimeOperatorTool{}, - } - visited := make(chan bool, 1) - patches := gomonkey.ApplyPrivateMethod(dp, "doParse", - func(*DevicesParser, chan<- DevicesInfos) error { - visited <- true - return nil - }) - defer patches.Reset() - - dp.FetchAndParse(nil) - time.Sleep(time10ms) - convey.So(len(visited), convey.ShouldEqual, 1) - }) - }) -} - -func TestDevicesParserGetDeviceIDsByMinusStyle(t *testing.T) { - convey.Convey("TestDevicesParserGetDeviceIDsByMinusStyle", t, func() { - testCases := []struct { - name string - devices string - expected []int - }{ - {name: "should return empty slice when devices string is invalid", devices: "invalid-devices", expected: []int{}}, - {name: "should return empty slice when min device ID is invalid", devices: "invalid-5", expected: []int{}}, - {name: "should return empty slice when max device ID is invalid", devices: "0-invalid", expected: []int{}}, - {name: "should return empty slice when min ID is bigger than max ID", devices: "5-3", expected: []int{}}, - {name: "should return empty slice when max ID is too large", 
devices: "0-99999", expected: []int{}}, - {name: "should return device IDs when range is valid", devices: "0-2", expected: []int{0, 1, 2}}, - {name: "should return single device ID when min equals max", devices: "1-1", expected: []int{1}}, - } - for _, tc := range testCases { - convey.Convey(tc.name, func() { - dp := &DevicesParser{} - result := dp.getDeviceIDsByMinusStyle(tc.devices, "test-container") - convey.So(result, convey.ShouldResemble, tc.expected) - }) - } - }) -} - -func TestGetNPUMajorID(t *testing.T) { - testCases := builderTestGetNPUMajorIDCases() - for _, tc := range testCases { - convey.Convey(tc.name, t, func() { - _, cleanup := tc.setup(t) - defer cleanup() - result, err := getNPUMajorID() - if tc.hasError { - convey.So(err, convey.ShouldNotBeNil) - } else { - convey.So(err, convey.ShouldBeNil) - } - convey.So(result, convey.ShouldResemble, tc.expected) - }) - } -} - -type TestGetNPUMajorIDCase struct { - name string - setup func(*testing.T) (*gomonkey.Patches, func()) - expected []string - hasError bool -} - -func builderTestGetNPUMajorIDCases() []TestGetNPUMajorIDCase { - testCases := []TestGetNPUMajorIDCase{{name: "should return error when path check fails", - setup: func(*testing.T) (*gomonkey.Patches, func()) { - patches := gomonkey.ApplyFuncReturn(utils.CheckPath, "", errors.New("path check failed")) - return patches, func() { patches.Reset() } - }, expected: nil, hasError: true}, - {name: "should return error when file open fails", - setup: func(*testing.T) (*gomonkey.Patches, func()) { - p1 := gomonkey.ApplyFuncReturn(utils.CheckPath, "/proc/devices", nil) - p1.ApplyFuncReturn(os.Open, nil, errors.New("file open failed")) - return p1, func() { p1.Reset() } - }, expected: []string{}, hasError: true}, - {name: "should return empty slice when no NPU devices found", - setup: func(t *testing.T) (*gomonkey.Patches, func()) { - tmpFile, clean, err := mkTemp("1 mem\n2 pty\n") - if err != nil { - t.Fatalf("failed to create temp file: %v", err) - } 
- p1 := gomonkey.ApplyFuncReturn(utils.CheckPath, tmpFile, nil) - return p1, func() { clean(); p1.Reset() } - }, expected: []string{}, hasError: false}, - {name: "should return major IDs when NPU devices found", - setup: func(t *testing.T) (*gomonkey.Patches, func()) { - tmpFile, clean, err := mkTemp("195 devdrv-cdev\n196 devdrv-cdev\n") - if err != nil { - t.Fatalf("failed to create temp file: %v", err) - } - p1 := gomonkey.ApplyFuncReturn(utils.CheckPath, tmpFile, nil) - return p1, func() { clean(); p1.Reset() } - }, expected: []string{"195", "196"}, hasError: false}, - {name: "should return major IDs when mixed devices found", - setup: func(t *testing.T) (*gomonkey.Patches, func()) { - tmpFile, clean, err := mkTemp("1 mem\n195 devdrv-cdev\n2 pty\n196 devdrv-cdev\n") - if err != nil { - t.Fatalf("failed to create temp file: %v", err) - } - p1 := gomonkey.ApplyFuncReturn(utils.CheckPath, tmpFile, nil) - return p1, func() { clean(); p1.Reset() } - }, expected: []string{"195", "196"}, hasError: false}, - } - return testCases -} - -func TestNpuMajor(t *testing.T) { - convey.Convey("TestNpuMajor", t, func() { - convey.Convey("should return cached major IDs", func() { - patches := gomonkey.ApplyFuncReturn(getNPUMajorID, []string{"123", "456"}, nil) - defer patches.Reset() - - result := npuMajor() - convey.So(result, convey.ShouldNotBeNil) - }) - }) -} - -func TestFilterNPUDevices(t *testing.T) { - convey.Convey("TestFilterNPUDevices", t, func() { - const mockMajorID = 236 - convey.Convey("should return error when spec is empty", func() { - spec := v1.Spec{} - result, err := filterNPUDevices(spec) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldContainSubstring, "empty spec info") - convey.So(result, convey.ShouldBeNil) - }) - - convey.Convey("should return devices when spec is valid", func() { - spec := v1.Spec{ - Linux: &v1.Linux{ - Resources: &v1.LinuxResources{ - Devices: []v1.LinuxDeviceCgroup{{Type: "c", Major: int64Ptr(mockMajorID), 
Minor: int64Ptr(0)}}, - }, - }, - } - patches := gomonkey.ApplyFuncReturn(npuMajor, []string{"236"}) - defer patches.Reset() - - result, err := filterNPUDevices(spec) - convey.So(err, convey.ShouldBeNil) - convey.So(result, convey.ShouldNotBeNil) - }) - }) -} - -// mkTemp creates a temporary file with the given content and returns the file name, -// a cleanup function, and an error. The file is closed before returning. -func mkTemp(content string) (string, func(), error) { - f, err := os.CreateTemp("", "test_*") - if err != nil { - return "", func() {}, err - } - if _, err = f.WriteString(content); err != nil { - clean(f) - return "", func() {}, err - } - if _, err = f.Seek(0, 0); err != nil { - clean(f) - return "", func() {}, err - } - name := f.Name() - return name, func() { clean(f) }, nil -} - -func clean(f *os.File) { - if f == nil { - return - } - if err := f.Close(); err != nil { - logger.Errorf("an error occurred where close file [%v],err :%v", f.Name(), err) - } - if err := os.Remove(f.Name()); err != nil { - logger.Errorf("an error occurred where remove file [%v],err :%v", f.Name(), err) - } -} - -func TestFilterNPUDevicesInIsula(t *testing.T) { - convey.Convey("TestFilterNPUDevicesInIsula", t, func() { - convey.Convey("should return error when container is privileged", func() { - containerInfo := isula.ContainerJson{ - HostConfig: &isula.HostConfig{ - Privileged: true, - }, - } - - result, err := filterNPUDevicesInIsula(containerInfo) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldContainSubstring, "privileged container") - convey.So(result, convey.ShouldBeNil) - }) - - convey.Convey("should return devices when container is not privileged", func() { - containerInfo := isula.ContainerJson{ - HostConfig: &isula.HostConfig{ - Privileged: false, - Devices: []isula.DeviceInfo{ - { - PathInContainer: "/dev/npu0", - }, - }, - }, - } - - patches := gomonkey.ApplyFuncReturn(getDevIdFromPath, 0, nil) - defer patches.Reset() - - 
result, err := filterNPUDevicesInIsula(containerInfo) - convey.So(err, convey.ShouldBeNil) - convey.So(result, convey.ShouldNotBeNil) - }) - }) -} - -// Helper function for creating int64 pointers -func int64Ptr(v int64) *int64 { - return &v -} - -func TestParseDiffEnvFmt(t *testing.T) { - convey.Convey("TestParseDiffEnvFmt", t, func() { - dp := &DevicesParser{} - testCases := []struct { - name string - devices string - containerID string - expected []int - }{ - {name: "should parse comma style devices when valid", - devices: testDeviceComma, - containerID: "test-container", - expected: []int{device0, device1, device2}, - }, - {name: "should parse minus style devices when valid", - devices: testDeviceRange, - containerID: "test-container", - expected: []int{device0, device1, device2}, - }, - {name: "should parse ascend style devices when valid", - devices: testAscendDevices, - containerID: "test-container", - expected: []int{device0, device1}, - }, - {name: "should parse comma minus style devices when valid", - devices: testDeviceCommaRange, - containerID: "test-container", - expected: []int{device0, device1, device2, device3}, - }, - {name: "should return empty slice when devices are empty", - devices: "", - containerID: "test-container", - expected: []int{}, - }, - } - - for _, tc := range testCases { - convey.Convey(tc.name, func() { - result := dp.parseDiffEnvFmt(tc.devices, tc.containerID) - convey.So(result, convey.ShouldResemble, tc.expected) - }) - } - }) -} - -func TestGetDeviceIDsByCommaStyle(t *testing.T) { - convey.Convey("TestGetDeviceIDsByCommaStyle", t, func() { - dp := &DevicesParser{} - testCases := []struct { - name string - devices string - containerID string - expected []int - }{ - {name: "should parse comma separated devices when valid", - devices: "0,1,2,3", - containerID: "test-container", - expected: []int{device0, device1, device2, device3}, - }, - {name: "should parse single device when valid", - devices: "0", - containerID: 
"test-container", - expected: []int{device0}, - }, - {name: "should return empty slice when devices are empty", - devices: "", - containerID: "test-container", - expected: []int{}, - }, - {name: "should parse devices with spaces when valid", - devices: testDeviceComma, - containerID: "test-container", - expected: []int{device0, device1, device2}, - }, - } - - for _, tc := range testCases { - convey.Convey(tc.name, func() { - result := dp.getDeviceIDsByCommaStyle(tc.devices, tc.containerID) - convey.So(result, convey.ShouldResemble, tc.expected) - }) - } - }) -} - -func TestGetDeviceIDsByAscendStyle(t *testing.T) { - convey.Convey("TestGetDeviceIDsByAscendStyle", t, func() { - dp := &DevicesParser{} - - testCases := []struct { - name string - devices string - containerID string - expected []int - }{ - { - name: "should parse ascend devices when valid", - devices: "Ascend-0,Ascend-1,Ascend-2", - containerID: "test-container", - expected: []int{device0, device1, device2}, - }, - { - name: "should parse single ascend device when valid", - devices: testAscendDevice0, - containerID: "test-container", - expected: []int{0}, - }, - { - name: "should return empty slice when devices are empty", - devices: "", - containerID: "test-container", - expected: []int{}, - }, - { - name: "should parse mixed case ascend devices when valid", - devices: "ascend-0,ASCEND-1", - containerID: "test-container", - expected: []int{device0, device1}, - }, - } - - for _, tc := range testCases { - convey.Convey(tc.name, func() { - result := dp.getDeviceIDsByAscendStyle(tc.devices, tc.containerID) - convey.So(result, convey.ShouldResemble, tc.expected) - }) - } - }) -} - -func TestGetDeviceIDsByMinusStyle(t *testing.T) { - convey.Convey("TestGetDeviceIDsByMinusStyle", t, func() { - dp := &DevicesParser{} - - testCases := []struct { - name string - devices string - containerID string - expected []int - }{ - { - name: "should parse range devices when valid", - devices: "0-3", - containerID: 
"test-container", - expected: []int{device0, device1, device2, device3}, - }, - { - name: "should parse single device range when valid", - devices: "0-0", - containerID: "test-container", - expected: []int{device0}, - }, - { - name: "should return empty slice when devices are empty", - devices: "", - containerID: "test-container", - expected: []int{}, - }, - } - - for _, tc := range testCases { - convey.Convey(tc.name, func() { - result := dp.getDeviceIDsByMinusStyle(tc.devices, tc.containerID) - convey.So(result, convey.ShouldResemble, tc.expected) - }) - } - }) -} - -func TestGetDeviceIDsByCommaMinusStyle(t *testing.T) { - convey.Convey("TestGetDeviceIDsByCommaMinusStyle", t, func() { - dp := &DevicesParser{} - - testCases := []struct { - name string - devices string - containerID string - expected []int - }{ - { - name: "should parse comma minus devices when valid", - devices: testDeviceCommaRange, - containerID: "test-container", - expected: []int{device0, device1, device2, device3}, - }, - { - name: "should parse single range when valid", - devices: testDeviceRange, - containerID: "test-container", - expected: []int{device0, device1, device2}, - }, - { - name: "should return nil when devices are empty", - devices: "", - containerID: "test-container", - expected: nil, - }, - { - name: "should parse mixed ranges when valid", - devices: testMixedDevices, - containerID: "test-container", - expected: []int{device0, device1, device3}, - }, - } - - for _, tc := range testCases { - convey.Convey(tc.name, func() { - result := dp.getDeviceIDsByCommaMinusStyle(tc.devices, tc.containerID) - convey.So(result, convey.ShouldResemble, tc.expected) - }) - } - }) -} - -func TestContains(t *testing.T) { - convey.Convey("TestContains", t, func() { - testCases := []struct { - name string - slice []string - target string - expected bool - }{ - { - name: "should return true when target exists in slice", - slice: []string{"a", "b", "c"}, - target: "b", - expected: true, - }, - { - 
name: "should return false when target does not exist in slice", - slice: []string{"a", "b", "c"}, - target: "d", - expected: false, - }, - { - name: "should return false when slice is empty", - slice: []string{}, - target: "a", - expected: false, - }, - { - name: "should return false when slice is nil", - slice: nil, - target: "a", - expected: false, - }, - { - name: "should return false when target is empty string", - slice: []string{"a", "b", "c"}, - target: "", - expected: false, - }, - } - - for _, tc := range testCases { - convey.Convey(tc.name, func() { - result := contains(tc.slice, tc.target) - convey.So(result, convey.ShouldEqual, tc.expected) - }) - } - }) -} - -func TestContactError(t *testing.T) { - convey.Convey("TestContactError", t, func() { - testCases := []struct { - name string - err error - msg string - expected string - }{ - { - name: "should concatenate error with message when both provided", - err: errors.New(testOriginalError), - msg: testErrorMessage, - expected: testContactedError, - }, - } - - for _, tc := range testCases { - convey.Convey(tc.name, func() { - result := contactError(tc.err, tc.msg) - convey.So(result.Error(), convey.ShouldEqual, tc.expected) - }) - } - }) -} - -func TestGetDevIdFromPath(t *testing.T) { - convey.Convey("TestGetDevIdFromPath", t, func() { - testCases := []struct { - name string - pattern string - path string - expected int - hasError bool - }{ - {name: "should extract device id when path is valid", - pattern: testDevicePattern, - path: "/dev/npu0", - expected: 0, - hasError: false, - }, - {name: "should extract device id when path has multiple digits", - pattern: testDevicePattern, - path: "/dev/npu123", - expected: 123, - hasError: false, - }, - {name: "should return error when device path is invalid", - pattern: testDevicePattern, - path: "/dev/cpu0", - expected: 0, - hasError: true, - }, - {name: "should return error when path is empty", - pattern: testDevicePattern, - path: "", - expected: 0, - hasError: 
true, - }, - } - - for _, tc := range testCases { - convey.Convey(tc.name, func() { - result, err := getDevIdFromPath(tc.pattern, tc.path) - if tc.hasError { - convey.So(err, convey.ShouldNotBeNil) - } else { - convey.So(err, convey.ShouldBeNil) - convey.So(result, convey.ShouldEqual, tc.expected) - } - }) - } - }) -} - -func TestWithDefault(t *testing.T) { - convey.Convey("TestWithDefault", t, func() { - const time0s = 0 - const time3s = 3 * time.Second - const time5s = 5 * time.Second - testCases := []struct { - name string - v time.Duration - d time.Duration - expected time.Duration - }{ - {name: "should return default when duration is zero", - v: time0s, - d: time5s, - expected: time5s, - }, - {name: "should return value when duration is non-zero", - v: time3s, - d: time5s, - expected: time3s, - }, - {name: "should return value when duration is negative", - v: -1 * time.Second, - d: time5s, - expected: -1 * time.Second, - }, - } - - for _, tc := range testCases { - convey.Convey(tc.name, func() { - result := withDefault(tc.v, tc.d) - convey.So(result, convey.ShouldEqual, tc.expected) - }) - } - }) -} diff --git a/mind-cluster/component/npu-exporter/collector/container/runtime_ops.go b/mind-cluster/component/npu-exporter/collector/container/runtime_ops.go deleted file mode 100644 index daab834..0000000 --- a/mind-cluster/component/npu-exporter/collector/container/runtime_ops.go +++ /dev/null @@ -1,413 +0,0 @@ -/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package container for monitoring containers' npu allocation -package container - -import ( - "context" - "encoding/json" - "errors" - "fmt" - "strings" - "syscall" - - "ascend-common/common-utils/hwlog" - "ascend-common/common-utils/utils" - "google.golang.org/grpc" - "google.golang.org/grpc/codes" - "google.golang.org/grpc/metadata" - "google.golang.org/grpc/status" - criv1 "k8s.io/cri-api/pkg/apis/runtime/v1" - "k8s.io/cri-api/pkg/apis/runtime/v1alpha2" - - "huawei.com/npu-exporter/v6/collector/container/isula" - "huawei.com/npu-exporter/v6/collector/container/v1" - "huawei.com/npu-exporter/v6/utils/logger" -) - -const ( - labelK8sPodNamespace = "io.kubernetes.pod.namespace" - labelK8sPodName = "io.kubernetes.pod.name" - labelContainerName = "io.kubernetes.container.name" - - // DefaultIsuladAddr default isulad sock adress - DefaultIsuladAddr = "unix:///run/isulad.sock" - // DefaultDockerShim default docker shim sock address - DefaultDockerShim = "unix:///run/dockershim.sock" - // DefaultCRIDockerd default cri-dockerd sock address - DefaultCRIDockerd = "unix:///run/cri-dockerd.sock" - // DefaultContainerdAddr default containerd sock address - DefaultContainerdAddr = "unix:///run/containerd/containerd.sock" - // DefaultDockerAddr default docker containerd sock address - DefaultDockerAddr = "unix:///run/docker/containerd/docker-containerd.sock" - defaultDockerOnEuler = "unix:///run/docker/containerd/containerd.sock" - grpcHeader = "containerd-namespace" - unixPre = "unix://" - - // IsulaContainer represents isula container type - IsulaContainer = "isula" - // DefaultContainer represents default container type - DefaultContainer = "docker-containerd" - excludePermissions = 0002 - - criV1alpha2 = "runtime.v1alpha2.RuntimeService" -) - -// CommonContainer wraps some common container attribute of isulad and containerd -type CommonContainer struct { - Id 
string - Labels map[string]string -} - -// RuntimeOperator wraps operations against container runtime -type RuntimeOperator interface { - Init() error - Close() error - GetContainers(ctx context.Context) ([]*CommonContainer, error) - GetContainerInfoByID(ctx context.Context, id string) (v1.Spec, error) - GetIsulaContainerInfoByID(ctx context.Context, id string) (isula.ContainerJson, error) - GetContainerType() string -} - -// RuntimeOperatorTool implements RuntimeOperator interface -type RuntimeOperatorTool struct { - criConn *grpc.ClientConn - conn *grpc.ClientConn - criClient interface{} - client interface{} - // CriEndpoint CRI server endpoint - CriEndpoint string - // OciEndpoint containerd Server endpoint - OciEndpoint string - // Namespace the namespace of containerd - Namespace string - // UseCriBackup use cri back up address or not - UseCriBackup bool - // UseOciBackup use oci back up address or not - UseOciBackup bool -} - -// Init initializes container runtime operator -func (operator *RuntimeOperatorTool) Init() error { - start := syscall.Getuid() - logger.Debugf("the init uid is:%d", start) - if start != 0 { - err := syscall.Setuid(0) - if err != nil { - return fmt.Errorf("raise uid failed: %v", err) - } - logger.Debugf("raise uid to:%d", 0) - defer func() { - err = syscall.Setuid(start) - if err != nil { - logger.Errorf("recover uid failed: %v", err) - } - logger.Debugf("recover uid to:%d", start) - }() - } - if err := sockCheck(operator); err != nil { - hwlog.RunLog.Error("check socket path failed") - return err - } - - if err := operator.initCriClient(); err != nil { - return fmt.Errorf("init CRI client failed, %s", err) - } - - if err := operator.initOciClient(); err != nil { - return fmt.Errorf("init OCI client failed, %s", err) - } - return nil -} - -func (operator *RuntimeOperatorTool) initCriClient() error { - criConn, err := GetConnection(operator.CriEndpoint) - if err != nil || criConn == nil { - msg := fmt.Sprintf("connecting to CRI server 
failed: %v", err) - if operator.UseCriBackup { - logger.Warnf("%v, will use cri-dockerd address to try again", msg) - if utils.IsExist(strings.TrimPrefix(DefaultCRIDockerd, unixPre)) { - criConn, err = GetConnection(DefaultCRIDockerd) - } - } else { - logger.Warn(msg) - } - } - if err != nil { - return fmt.Errorf("connecting to CRI server failed: %v", err) - } - if operator.CriEndpoint == DefaultIsuladAddr { - operator.criClient = isula.NewRuntimeServiceClient(criConn) - } else { - operator.criClient = v1alpha2.NewRuntimeServiceClient(criConn) - } - operator.criConn = criConn - return nil -} - -func (operator *RuntimeOperatorTool) initOciClient() error { - conn, err := GetConnection(operator.OciEndpoint) - if err != nil || conn == nil { - msg := fmt.Sprintf("failed to get OCI connection: %v", err) - if operator.UseOciBackup { - logger.Warnf("%v, will use backup address to try again", msg) - if utils.IsExist(strings.TrimPrefix(DefaultContainerdAddr, unixPre)) { - conn, err = GetConnection(DefaultContainerdAddr) - - } else if utils.IsExist(strings.TrimPrefix(defaultDockerOnEuler, unixPre)) { - conn, err = GetConnection(defaultDockerOnEuler) - } - } else { - logger.Warn(msg) - } - } - if err != nil { - return fmt.Errorf("connecting to OCI server failed: %v", err) - } - if operator.OciEndpoint == DefaultIsuladAddr { - operator.client = isula.NewContainerServiceClient(conn) - } else { - operator.client = v1.NewContainersClient(conn) - } - operator.conn = conn - return nil -} - -func sockCheck(operator *RuntimeOperatorTool) error { - absPath, err := utils.CheckPath(strings.TrimPrefix(operator.CriEndpoint, unixPre)) - if err != nil { - return err - } - if err := utils.DoCheckOwnerAndPermission(absPath, excludePermissions, 0); err != nil { - return err - } - - absPath, err = utils.CheckPath(strings.TrimPrefix(operator.OciEndpoint, unixPre)) - if err != nil { - return err - } - if err := utils.DoCheckOwnerAndPermission(absPath, excludePermissions, 0); err != nil { - return 
err - } - return nil -} - -// Close closes container runtime operator -func (operator *RuntimeOperatorTool) Close() error { - err := operator.conn.Close() - if err != nil { - return err - } - err = operator.criConn.Close() - if err != nil { - return err - } - return nil -} - -// GetContainers returns all containers' IDs -func (operator *RuntimeOperatorTool) GetContainers(ctx context.Context) ([]*CommonContainer, error) { - if utils.IsNil(operator.criClient) || operator.criConn == nil { - return nil, errors.New("criClient is empty") - } - if client, ok := operator.criClient.(v1alpha2.RuntimeServiceClient); ok { - containers, err := getContainersByContainerdV1alpha2(ctx, client) - if isUnimplementedError(err, criV1alpha2) { - v1Client := criv1.NewRuntimeServiceClient(operator.criConn) - return getContainersByContainerdV1(ctx, v1Client) - } - return containers, err - } - if client, ok := operator.criClient.(isula.RuntimeServiceClient); ok { - return getContainersByIsulad(ctx, client) - } - - logger.Errorf("client %v is unexpected", operator.criClient) - return nil, errors.New("unexpected client type") -} - -func isUnimplementedError(err error, serviceName string) bool { - if err == nil { - return false - } - st, ok := status.FromError(err) - if ok { - return st.Code() == codes.Unimplemented && strings.Contains(st.Message(), serviceName) - } - errStr := err.Error() - if strings.Contains(errStr, "code = Unimplemented") && - strings.Contains(errStr, "desc = ") && strings.Contains(errStr, serviceName) { - return true - } - return false -} - -// GetContainerInfoByID use oci interface to get container -func (operator *RuntimeOperatorTool) GetContainerInfoByID(ctx context.Context, id string) (v1.Spec, error) { - if utils.IsNil(operator.client) || operator.conn == nil { - return v1.Spec{}, errors.New("oci client is empty") - } - - s := v1.Spec{} - if client, ok := operator.client.(v1.ContainersClient); ok { - resp, err := client.Get(setGrpcNamespaceHeader(ctx, 
operator.Namespace), &v1.GetContainerRequest{ - Id: id, - }) - if err != nil { - hwlog.RunLog.Error("get call OCI get method failed") - return v1.Spec{}, err - } - if err = json.Unmarshal(resp.Container.Spec.Value, &s); err != nil { - hwlog.RunLog.Error("unmarshal OCI response failed") - return v1.Spec{}, err - } - return s, nil - } - - return s, errors.New("unexpected containerd client") -} - -// GetIsulaContainerInfoByID return isula container info -func (operator *RuntimeOperatorTool) GetIsulaContainerInfoByID(ctx context.Context, - id string) (isula.ContainerJson, error) { - containerJsonInfo := isula.ContainerJson{} - if utils.IsNil(operator.client) || operator.conn == nil { - return containerJsonInfo, errors.New("oci client is empty") - } - - if client, ok := operator.client.(isula.ContainerServiceClient); ok { - resp, err := client.Inspect(setGrpcNamespaceHeader(ctx, operator.Namespace), &isula.InspectContainerRequest{ - Id: id, - }) - if err != nil { - hwlog.RunLog.Error("call isula OCI Inspect method failed") - return containerJsonInfo, err - } - if err = json.Unmarshal([]byte(resp.ContainerJSON), &containerJsonInfo); err != nil { - logger.Errorf("unmarshal err: %v", err) - return containerJsonInfo, err - } - return containerJsonInfo, nil - } - - return containerJsonInfo, errors.New("unexpected isula client") -} - -// GetContainerType return container type -func (operator *RuntimeOperatorTool) GetContainerType() string { - if operator.OciEndpoint == DefaultIsuladAddr { - return IsulaContainer - } - return DefaultContainer -} - -type nsKey struct{} - -func setGrpcNamespaceHeader(ctx context.Context, namespace string) context.Context { - context.WithValue(ctx, nsKey{}, namespace) - ns := metadata.Pairs(grpcHeader, namespace) - md, ok := metadata.FromOutgoingContext(ctx) - if !ok { - md = ns - } else { - md = metadata.Join(ns, md) - } - return metadata.NewOutgoingContext(ctx, md) -} - -func getContainersByContainerdV1alpha2(ctx context.Context, - client 
v1alpha2.RuntimeServiceClient) ([]*CommonContainer, error) { - var allContainers []*CommonContainer - request := genContainerRequestV1alpha2() - r, err := client.ListContainers(ctx, request) - if err != nil { - hwlog.RunLog.Warn(err) - return nil, err - } - for _, container := range r.Containers { - allContainers = append(allContainers, &CommonContainer{ - Id: container.Id, - Labels: container.Labels, - }) - } - return allContainers, nil -} - -func getContainersByContainerdV1(ctx context.Context, client criv1.RuntimeServiceClient) ([]*CommonContainer, error) { - var allContainers []*CommonContainer - request := genContainerRequestV1() - r, err := client.ListContainers(ctx, request) - if err != nil { - hwlog.RunLog.Error(err) - return nil, err - } - for _, container := range r.Containers { - allContainers = append(allContainers, &CommonContainer{ - Id: container.Id, - Labels: container.Labels, - }) - } - return allContainers, nil -} - -func getContainersByIsulad(ctx context.Context, client isula.RuntimeServiceClient) ([]*CommonContainer, error) { - var allContainers []*CommonContainer - request := genIsulaRequest() - r, err := client.ListContainers(ctx, request) - if err != nil { - hwlog.RunLog.Error(err) - return nil, err - } - for _, container := range r.Containers { - allContainers = append(allContainers, &CommonContainer{ - Id: container.Id, - Labels: container.Labels, - }) - } - return allContainers, nil -} - -func genContainerRequestV1alpha2() *v1alpha2.ListContainersRequest { - filter := &v1alpha2.ContainerFilter{} - st := &v1alpha2.ContainerStateValue{} - st.State = v1alpha2.ContainerState_CONTAINER_RUNNING - filter.State = st - request := &v1alpha2.ListContainersRequest{ - Filter: filter, - } - return request -} - -func genContainerRequestV1() *criv1.ListContainersRequest { - filter := &criv1.ContainerFilter{} - st := &criv1.ContainerStateValue{} - st.State = criv1.ContainerState_CONTAINER_RUNNING - filter.State = st - request := 
&criv1.ListContainersRequest{ - Filter: filter, - } - return request -} - -func genIsulaRequest() *isula.ListContainersRequest { - filter := &isula.ContainerFilter{} - st := &isula.ContainerStateValue{} - st.State = isula.ContainerState_CONTAINER_RUNNING - filter.State = st - request := &isula.ListContainersRequest{ - Filter: filter, - } - return request -} diff --git a/mind-cluster/component/npu-exporter/collector/container/runtime_ops_test.go b/mind-cluster/component/npu-exporter/collector/container/runtime_ops_test.go deleted file mode 100644 index 2bc135c..0000000 --- a/mind-cluster/component/npu-exporter/collector/container/runtime_ops_test.go +++ /dev/null @@ -1,568 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package container provides utilities for container monitoring and testing. 
-package container - -import ( - "context" - "errors" - "fmt" - "testing" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" - "google.golang.org/grpc" - "google.golang.org/grpc/codes" - "google.golang.org/grpc/status" - criv1 "k8s.io/cri-api/pkg/apis/runtime/v1" - "k8s.io/cri-api/pkg/apis/runtime/v1alpha2" - - "ascend-common/common-utils/utils" - "huawei.com/npu-exporter/v6/collector/container/isula" - "huawei.com/npu-exporter/v6/collector/container/v1" -) - -const ( - // Test constants for runtime operations - testNamespace = "test-namespace" - - // Test error messages - testInitCriError = "init CRI client failed" - testInitOciError = "init OCI client failed" - testSockCheckError = "socket check failed" - testCriClientEmptyError = "criClient is empty" - testOciClientEmptyError = "oci client is empty" - testUnexpectedClientError = "unexpected client type" - testUnexpectedContainerdClientError = "unexpected containerd client" - testUnexpectedIsulaClientError = "unexpected isula client" - testCriV1alpha2 = "runtime.v1alpha2.RuntimeService" - testCriV1 = "runtime.v1.RuntimeService" -) - -func TestRuntimeOperatorToolInit(t *testing.T) { - r := &RuntimeOperatorTool{ - CriEndpoint: testContainerdEndpoint, - OciEndpoint: testContainerdEndpoint, - } - convey.Convey("should initialize successfully when all components succeed", t, func() { - operator := r - patches := gomonkey.ApplyFuncReturn(sockCheck, nil) - defer patches.Reset() - patches.ApplyFuncReturn((*RuntimeOperatorTool).initCriClient, nil) - patches.ApplyFuncReturn((*RuntimeOperatorTool).initOciClient, nil) - err := operator.Init() - convey.So(err, convey.ShouldBeNil) - }) - convey.Convey("should return error when socket check fails", t, func() { - operator := r - patches := gomonkey.ApplyFuncReturn(sockCheck, errors.New(testSockCheckError)) - defer patches.Reset() - err := operator.Init() - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), 
convey.ShouldContainSubstring, testSockCheckError) - }) - convey.Convey("should return error when CRI client init fails", t, func() { - operator := r - patches := gomonkey.ApplyFuncReturn(sockCheck, nil) - defer patches.Reset() - patches.ApplyFuncReturn((*RuntimeOperatorTool).initCriClient, errors.New(testInitCriError)) - patches.ApplyFuncReturn((*RuntimeOperatorTool).initOciClient, nil) - err := operator.Init() - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldContainSubstring, testInitCriError) - }) - convey.Convey("should return error when OCI client init fails", t, func() { - operator := r - patches := gomonkey.ApplyFuncReturn(sockCheck, nil) - defer patches.Reset() - patches.ApplyFuncReturn((*RuntimeOperatorTool).initCriClient, nil) - patches.ApplyFuncReturn((*RuntimeOperatorTool).initOciClient, errors.New(testInitOciError)) - err := operator.Init() - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldContainSubstring, testInitOciError) - }) -} - -func TestRuntimeOperatorToolInitCriClient(t *testing.T) { - convey.Convey("TestRuntimeOperatorToolInitCriClient", t, func() { - convey.Convey("should initialize CRI client successfully for containerd", func() { - operator := &RuntimeOperatorTool{ - CriEndpoint: testContainerdEndpoint, - UseOciBackup: false, - UseCriBackup: false, - } - - patches := gomonkey.ApplyFuncReturn(GetConnection, &grpc.ClientConn{}, nil) - defer patches.Reset() - - err := operator.initCriClient() - convey.So(err, convey.ShouldBeNil) - }) - - convey.Convey("should initialize CRI client successfully for isulad", func() { - operator := &RuntimeOperatorTool{ - CriEndpoint: DefaultIsuladAddr, - UseOciBackup: false, - UseCriBackup: false, - } - - patches := gomonkey.ApplyFuncReturn(GetConnection, &grpc.ClientConn{}, nil) - defer patches.Reset() - - err := operator.initCriClient() - convey.So(err, convey.ShouldBeNil) - }) - - convey.Convey("should return error when connection fails and no backup", 
func() { - operator := &RuntimeOperatorTool{ - CriEndpoint: testContainerdEndpoint, - UseOciBackup: false, - UseCriBackup: false, - } - - patches := gomonkey.ApplyFuncReturn(GetConnection, nil, errors.New("connection failed")) - defer patches.Reset() - - err := operator.initCriClient() - convey.So(err, convey.ShouldNotBeNil) - }) - }) -} - -func TestRuntimeOperatorToolInitOciClient(t *testing.T) { - testCases := buildInitOciClientTestCases() - for _, tc := range testCases { - convey.Convey(tc.name, t, func() { - operator, patches := tc.setup() - if patches != nil { - defer patches.Reset() - } - err := operator.initOciClient() - if tc.hasError { - convey.So(err, convey.ShouldNotBeNil) - } else { - convey.So(err, convey.ShouldBeNil) - } - }) - } -} - -type initOciClientTestCase struct { - name string - setup func() (*RuntimeOperatorTool, *gomonkey.Patches) - hasError bool -} - -func buildInitOciClientTestCases() []initOciClientTestCase { - return []initOciClientTestCase{ - {name: "should initialize OCI client successfully for containerd", - setup: func() (*RuntimeOperatorTool, *gomonkey.Patches) { - op := &RuntimeOperatorTool{OciEndpoint: testContainerdEndpoint, UseOciBackup: false} - p := gomonkey.ApplyFuncReturn(GetConnection, &grpc.ClientConn{}, nil) - return op, p - }, - hasError: false}, - {name: "should initialize OCI client successfully for isulad", - setup: func() (*RuntimeOperatorTool, *gomonkey.Patches) { - op := &RuntimeOperatorTool{OciEndpoint: DefaultIsuladAddr, UseOciBackup: false} - p := gomonkey.ApplyFuncReturn(GetConnection, &grpc.ClientConn{}, nil) - return op, p - }, - hasError: false}, - {name: "should return error when connection fails and no backup", - setup: func() (*RuntimeOperatorTool, *gomonkey.Patches) { - op := &RuntimeOperatorTool{OciEndpoint: testContainerdEndpoint, UseOciBackup: false} - p := gomonkey.ApplyFuncReturn(GetConnection, nil, errors.New("connection failed")) - return op, p - }, - hasError: true}, - {name: "should return error 
when OCI endpoint is empty", - setup: func() (*RuntimeOperatorTool, *gomonkey.Patches) { - op := &RuntimeOperatorTool{OciEndpoint: "", UseOciBackup: false} - return op, nil - }, - hasError: true}, - {name: "should try backup when primary connection fails", - setup: func() (*RuntimeOperatorTool, *gomonkey.Patches) { - op := &RuntimeOperatorTool{OciEndpoint: testContainerdEndpoint, UseOciBackup: true} - p := gomonkey.ApplyFunc(GetConnection, func(endpoint string) (*grpc.ClientConn, error) { - if endpoint == testContainerdEndpoint { - return nil, errors.New("primary failed") - } - return nil, errors.New("backup failed") - }) - return op, p - }, - hasError: true}, - {name: "should return error when all connections fail", - setup: func() (*RuntimeOperatorTool, *gomonkey.Patches) { - op := &RuntimeOperatorTool{OciEndpoint: testContainerdEndpoint, UseOciBackup: true} - p := gomonkey.ApplyFuncReturn(GetConnection, nil, errors.New("all failed")) - return op, p - }, - hasError: true}, - } -} - -func TestSockCheck(t *testing.T) { - convey.Convey("TestSockCheck", t, func() { - convey.Convey("should pass when socket paths are valid", func() { - operator := &RuntimeOperatorTool{ - CriEndpoint: testContainerdEndpoint, - OciEndpoint: testContainerdEndpoint, - } - - patches := gomonkey.ApplyFuncReturn(utils.CheckPath, "/run/containerd.sock", nil) - defer patches.Reset() - patches.ApplyFuncReturn(utils.DoCheckOwnerAndPermission, nil) - - err := sockCheck(operator) - convey.So(err, convey.ShouldBeNil) - }) - - convey.Convey("should return error when CRI endpoint check fails", func() { - operator := &RuntimeOperatorTool{ - CriEndpoint: testContainerdEndpoint, - OciEndpoint: testContainerdEndpoint, - } - - patches := gomonkey.ApplyFuncReturn(utils.CheckPath, "", errors.New("path check failed")) - defer patches.Reset() - - err := sockCheck(operator) - convey.So(err, convey.ShouldNotBeNil) - }) - - convey.Convey("should return error when CRI endpoint permission check fails", func() { - 
operator := &RuntimeOperatorTool{ - CriEndpoint: testContainerdEndpoint, - OciEndpoint: testContainerdEndpoint, - } - - patches := gomonkey.ApplyFuncReturn(utils.CheckPath, "/run/containerd.sock", nil) - defer patches.Reset() - patches.ApplyFuncReturn(utils.DoCheckOwnerAndPermission, errors.New("permission check failed")) - - err := sockCheck(operator) - convey.So(err, convey.ShouldNotBeNil) - }) - }) -} - -func TestRuntimeOperatorToolClose(t *testing.T) { - convey.Convey("TestRuntimeOperatorToolClose", t, func() { - convey.Convey("should close connections successfully", func() { - operator := &RuntimeOperatorTool{ - conn: &grpc.ClientConn{}, - criConn: &grpc.ClientConn{}, - } - - patches := gomonkey.ApplyFunc((*grpc.ClientConn).Close, func(*grpc.ClientConn) error { - return nil - }) - defer patches.Reset() - - err := operator.Close() - convey.So(err, convey.ShouldBeNil) - }) - - convey.Convey("should return error when OCI connection close fails", func() { - operator := &RuntimeOperatorTool{ - conn: &grpc.ClientConn{}, - criConn: &grpc.ClientConn{}, - } - - patches := gomonkey.ApplyFunc((*grpc.ClientConn).Close, func(*grpc.ClientConn) error { - return errors.New("close failed") - }) - defer patches.Reset() - - err := operator.Close() - convey.So(err, convey.ShouldNotBeNil) - }) - }) -} - -func TestRuntimeOperatorToolGetContainers(t *testing.T) { - convey.Convey("TestRuntimeOperatorToolGetContainers", t, func() { - convey.Convey("should return error when CRI client is empty", func() { - operator := &RuntimeOperatorTool{} - - patches := gomonkey.ApplyFuncReturn(utils.IsNil, true) - defer patches.Reset() - - containers, err := operator.GetContainers(context.Background()) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldEqual, testCriClientEmptyError) - convey.So(containers, convey.ShouldBeNil) - }) - - convey.Convey("should return error when CRI connection is nil", func() { - operator := &RuntimeOperatorTool{ - criClient: "mock-client", - 
} - - patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) - defer patches.Reset() - - containers, err := operator.GetContainers(context.Background()) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldEqual, testCriClientEmptyError) - convey.So(containers, convey.ShouldBeNil) - }) - - convey.Convey("should return error when client type is unexpected", func() { - operator := &RuntimeOperatorTool{ - criClient: "unexpected", - criConn: &grpc.ClientConn{}, - } - - patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) - defer patches.Reset() - - containers, err := operator.GetContainers(context.Background()) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldEqual, testUnexpectedClientError) - convey.So(containers, convey.ShouldBeNil) - }) - }) -} - -func TestIsUnimplementedError(t *testing.T) { - tests := []struct { - name string - err error - serviceName string - want bool - }{ - { - name: "nil error returns false", - err: nil, - serviceName: testCriV1alpha2, - want: false, - }, - { - name: "non-grpc error returns false", - err: errors.New("unknown service " + testCriV1alpha2), - serviceName: testCriV1alpha2, - want: false, - }, - { - name: "mismatched code returns false", - err: status.Error(codes.NotFound, "unknown service "+testCriV1alpha2), - serviceName: testCriV1alpha2, - want: false, - }, - { - name: "mismatched message returns false", - err: status.Error(codes.Unimplemented, "unknown service "+testCriV1), - serviceName: testCriV1alpha2, - want: false, - }, - { - name: "matched unimplemented error returns true", - err: status.Error(codes.Unimplemented, "unknown service "+testCriV1alpha2), - serviceName: testCriV1alpha2, - want: true, - }, - { - name: "real grpc error format returns true", - err: fmt.Errorf("rpc error: code = Unimplemented desc = unknown service " + testCriV1alpha2), - serviceName: testCriV1alpha2, - want: true, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) 
{ - if got := isUnimplementedError(tt.err, tt.serviceName); got != tt.want { - t.Errorf("isUnimplementedError() = %v, want %v (err: %v)", got, tt.want, tt.err) - } - }) - } -} - -func TestRuntimeOperatorToolGetContainerInfoByID(t *testing.T) { - convey.Convey("TestRuntimeOperatorToolGetContainerInfoByID", t, func() { - convey.Convey("should return error when OCI client is empty", func() { - operator := &RuntimeOperatorTool{} - patches := gomonkey.ApplyFuncReturn(utils.IsNil, true) - defer patches.Reset() - spec, err := operator.GetContainerInfoByID(context.Background(), testContainerID) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldEqual, testOciClientEmptyError) - convey.So(spec, convey.ShouldResemble, v1.Spec{}) - }) - convey.Convey("should return error when OCI connection is nil", func() { - operator := &RuntimeOperatorTool{client: "mock-client"} - patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) - defer patches.Reset() - spec, err := operator.GetContainerInfoByID(context.Background(), testContainerID) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldEqual, testOciClientEmptyError) - convey.So(spec, convey.ShouldResemble, v1.Spec{}) - }) - convey.Convey("should return error when client type is unexpected", func() { - operator := &RuntimeOperatorTool{client: "unexpected", conn: &grpc.ClientConn{}} - patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) - defer patches.Reset() - spec, err := operator.GetContainerInfoByID(context.Background(), testContainerID) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldEqual, testUnexpectedContainerdClientError) - convey.So(spec, convey.ShouldResemble, v1.Spec{}) - }) - convey.Convey("should return error when GetContainer call fails", func() { - operator := &RuntimeOperatorTool{client: "mock-containers-client", conn: &grpc.ClientConn{}} - patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) - defer patches.Reset() - spec, err 
:= operator.GetContainerInfoByID(context.Background(), testContainerID) - convey.So(err, convey.ShouldNotBeNil) - convey.So(spec, convey.ShouldResemble, v1.Spec{}) - }) - convey.Convey("should return error when JSON unmarshal fails", func() { - operator := &RuntimeOperatorTool{client: "mock-containers-client", conn: &grpc.ClientConn{}} - patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) - defer patches.Reset() - spec, err := operator.GetContainerInfoByID(context.Background(), testContainerID) - convey.So(err, convey.ShouldNotBeNil) - convey.So(spec, convey.ShouldResemble, v1.Spec{}) - }) - - }) -} - -func TestRuntimeOperatorToolGetIsulaContainerInfoByID(t *testing.T) { - convey.Convey("TestRuntimeOperatorToolGetIsulaContainerInfoByID", t, func() { - convey.Convey("should return error when OCI client is empty", func() { - operator := &RuntimeOperatorTool{} - patches := gomonkey.ApplyFuncReturn(utils.IsNil, true) - defer patches.Reset() - containerInfo, err := operator.GetIsulaContainerInfoByID(context.Background(), testContainerID) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldEqual, testOciClientEmptyError) - convey.So(containerInfo, convey.ShouldResemble, isula.ContainerJson{}) - }) - convey.Convey("should return error when OCI connection is nil", func() { - operator := &RuntimeOperatorTool{client: "mock-client"} - patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) - defer patches.Reset() - containerInfo, err := operator.GetIsulaContainerInfoByID(context.Background(), testContainerID) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldEqual, testOciClientEmptyError) - convey.So(containerInfo, convey.ShouldResemble, isula.ContainerJson{}) - }) - convey.Convey("should return error when client type is unexpected", func() { - operator := &RuntimeOperatorTool{client: "unexpected", conn: &grpc.ClientConn{}} - patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) - defer patches.Reset() - 
containerInfo, err := operator.GetIsulaContainerInfoByID(context.Background(), testContainerID) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldEqual, testUnexpectedIsulaClientError) - convey.So(containerInfo, convey.ShouldResemble, isula.ContainerJson{}) - }) - convey.Convey("should return error when Inspect call fails", func() { - operator := &RuntimeOperatorTool{client: "mock-isula-client", conn: &grpc.ClientConn{}} - patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) - defer patches.Reset() - containerInfo, err := operator.GetIsulaContainerInfoByID(context.Background(), testContainerID) - convey.So(err, convey.ShouldNotBeNil) - convey.So(containerInfo, convey.ShouldResemble, isula.ContainerJson{}) - }) - convey.Convey("should return error when JSON unmarshal fails", func() { - operator := &RuntimeOperatorTool{client: "mock-isula-client", conn: &grpc.ClientConn{}} - patches := gomonkey.ApplyFuncReturn(utils.IsNil, false) - defer patches.Reset() - containerInfo, err := operator.GetIsulaContainerInfoByID(context.Background(), testContainerID) - convey.So(err, convey.ShouldNotBeNil) - convey.So(containerInfo, convey.ShouldResemble, isula.ContainerJson{}) - }) - - }) -} - -func TestRuntimeOperatorToolGetContainerType(t *testing.T) { - convey.Convey("TestRuntimeOperatorToolGetContainerType", t, func() { - convey.Convey("should return isula when endpoint is isulad", func() { - operator := &RuntimeOperatorTool{ - OciEndpoint: DefaultIsuladAddr, - } - - containerType := operator.GetContainerType() - convey.So(containerType, convey.ShouldEqual, IsulaContainer) - }) - - convey.Convey("should return default when endpoint is not isulad", func() { - operator := &RuntimeOperatorTool{ - OciEndpoint: testContainerdEndpoint, - } - - containerType := operator.GetContainerType() - convey.So(containerType, convey.ShouldEqual, DefaultContainer) - }) - }) -} - -func TestSetGrpcNamespaceHeader(t *testing.T) { - 
convey.Convey("TestSetGrpcNamespaceHeader", t, func() { - convey.Convey("should set namespace header when context has no metadata", func() { - ctx := context.Background() - result := setGrpcNamespaceHeader(ctx, testNamespace) - convey.So(result, convey.ShouldNotBeNil) - }) - - convey.Convey("should set namespace header when context has existing metadata", func() { - ctx := context.Background() - ctx = context.WithValue(ctx, "test", "value") - result := setGrpcNamespaceHeader(ctx, testNamespace) - convey.So(result, convey.ShouldNotBeNil) - }) - }) -} - -func TestGenContainerRequestV1alpha2(t *testing.T) { - convey.Convey("TestGenContainerRequestV1alpha2", t, func() { - convey.Convey("should generate valid container request", func() { - request := genContainerRequestV1alpha2() - convey.So(request, convey.ShouldNotBeNil) - convey.So(request.Filter, convey.ShouldNotBeNil) - convey.So(request.Filter.State, convey.ShouldNotBeNil) - convey.So(request.Filter.State.State, convey.ShouldEqual, v1alpha2.ContainerState_CONTAINER_RUNNING) - }) - }) -} - -func TestGenContainerRequestV1(t *testing.T) { - convey.Convey("TestGenContainerRequestV1", t, func() { - convey.Convey("should generate valid container request", func() { - request := genContainerRequestV1() - convey.So(request, convey.ShouldNotBeNil) - convey.So(request.Filter, convey.ShouldNotBeNil) - convey.So(request.Filter.State, convey.ShouldNotBeNil) - convey.So(request.Filter.State.State, convey.ShouldEqual, criv1.ContainerState_CONTAINER_RUNNING) - }) - }) -} - -func TestGenIsulaRequest(t *testing.T) { - convey.Convey("TestGenIsulaRequest", t, func() { - convey.Convey("should generate valid isula request", func() { - request := genIsulaRequest() - convey.So(request, convey.ShouldNotBeNil) - convey.So(request.Filter, convey.ShouldNotBeNil) - convey.So(request.Filter.State, convey.ShouldNotBeNil) - convey.So(request.Filter.State.State, convey.ShouldEqual, isula.ContainerState_CONTAINER_RUNNING) - }) - }) -} diff --git 
a/mind-cluster/component/npu-exporter/collector/container/utils.go b/mind-cluster/component/npu-exporter/collector/container/utils.go deleted file mode 100644 index b5ff57e..0000000 --- a/mind-cluster/component/npu-exporter/collector/container/utils.go +++ /dev/null @@ -1,133 +0,0 @@ -/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package container for monitoring containers' npu allocation -package container - -import ( - "context" - "errors" - "fmt" - "net" - "net/url" - "strings" - "time" - - "google.golang.org/grpc" - - "ascend-common/common-utils/hwlog" - "ascend-common/common-utils/utils" - "huawei.com/npu-exporter/v6/utils/logger" -) - -const ( - defaultTimeout = 5 * time.Second - unixPrefix = "unix" - // MaxLenDNS configName max len - MaxLenDNS = 512 - // MinLenDNS configName min len - MinLenDNS = 1 - maxContainers = 1024 - maxCgroupPath = 2048 - - maxDevicesNum = 100000 - maxEnvNum = 10000 -) - -// CgroupVersion is the cgroups mode of the host system -type CgroupVersion int - -// GetConnection return the grpc connection -func GetConnection(endPoint string) (*grpc.ClientConn, error) { - if endPoint == "" { - return nil, fmt.Errorf("endpoint is not set") - } - logger.Debugf("connect using endpoint '%s' with '%s' timeout", - utils.MaskPrefix(strings.TrimPrefix(endPoint, unixPrefix+"://")), defaultTimeout) - addr, dialer, err := getAddressAndDialer(endPoint) - if err != nil { - 
hwlog.RunLog.Error(err) - return nil, err - } - ctx, cancelFn := context.WithTimeout(context.Background(), defaultTimeout) - defer cancelFn() - conn, err := grpc.DialContext(ctx, addr, grpc.WithInsecure(), grpc.WithBlock(), grpc.WithContextDialer(dialer)) - if err != nil { - return nil, err - } - logger.Debugf("connected successfully using endpoint: %s", - utils.MaskPrefix(strings.TrimPrefix(endPoint, unixPrefix+"://"))) - return conn, nil -} - -func parseSocketEndpoint(endpoint string) (string, string, error) { - u, err := url.Parse(endpoint) - if err != nil { - return "", "", err - } - - switch u.Scheme { - case "unix": - return "unix", u.Path, nil - case "tcp": - return "tcp", u.Host, nil - default: - return u.Scheme, "", fmt.Errorf("protocol %q not supported", u.Scheme) - } -} - -// getAddressAndDialer returns the address parsed from the given socket endpoint and dialer -func getAddressAndDialer(endpoint string) (string, func(ctx context.Context, addr string) (net.Conn, error), error) { - prefix, addr, err := parseSocketEndpoint(endpoint) - if err != nil { - return "", nil, err - } - if prefix != unixPrefix { - return "", nil, fmt.Errorf("only support unix socket") - } - return addr, dial, nil -} - -// dial return the context dialer -func dial(ctx context.Context, addr string) (net.Conn, error) { - return (&net.Dialer{}).DialContext(ctx, unixPrefix, addr) -} - -func validDNSRe(dnsContent string) error { - if len(dnsContent) < MinLenDNS || len(dnsContent) > MaxLenDNS { - return errors.New("param len invalid") - } - return nil -} - -func makeUpDeviceInfo(c *CommonContainer) (DevicesInfo, error) { - deviceInfo := DevicesInfo{} - var names []string - - ns := c.Labels[labelK8sPodNamespace] - names = append(names, ns) - podName := c.Labels[labelK8sPodName] - names = append(names, podName) - containerName := c.Labels[labelContainerName] - names = append(names, containerName) - for _, v := range names { - if err := validDNSRe(v); err != nil { - return DevicesInfo{}, 
err - } - } - - deviceInfo.ID = c.Id - deviceInfo.Name = ns + "_" + podName + "_" + containerName - return deviceInfo, nil -} diff --git a/mind-cluster/component/npu-exporter/collector/container/utils_test.go b/mind-cluster/component/npu-exporter/collector/container/utils_test.go deleted file mode 100644 index 32e6716..0000000 --- a/mind-cluster/component/npu-exporter/collector/container/utils_test.go +++ /dev/null @@ -1,329 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// package container test methods in utils -package container - -import ( - "context" - "errors" - "net" - "testing" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" - "google.golang.org/grpc" - - "ascend-common/common-utils/hwlog" - "huawei.com/npu-exporter/v6/utils/logger" -) - -const ( - testContainerID = "container123" - testPodNamespace = "default" - testPodName = "test-pod" - testContainerName = "test-container" - testUnixSocket = "unix:///test.sock" - testInvalidEndpoint = "invalid://endpoint" - testDialError = "dial error" - testGrpcDialError = "grpc dial error" - testInvalidEndpointError = "invalid endpoint" - testEndpointNotSetError = "endpoint is not set" - testDNSContent = "test-dns" - testMinDNSContent = "a" - testEmptyDNSContent = "" - testTarget = "test" - testUnixScheme = "unix" - testTcpScheme = "tcp" - testUnixAddr = "/tmp/test.sock" - testTcpAddr = "localhost:8080" - testInvalidURL = "://invalid" - testEmptyNamespace = "" - testEmptyPodName = "" - testEmptyContainerName = "" -) - -func init() { - logger.HwLogConfig = &hwlog.LogConfig{ - OnlyToStdout: true, - } - logger.InitLogger("Prometheus") -} - -func TestGetConnection(t *testing.T) { - convey.Convey("TestGetConnection", t, func() { - convey.Convey("should return error when endpoint is empty", func() { - testEmptyEndpoint() - }) - convey.Convey("should return error when endpoint is invalid", func() { - testInvalidEndpointFunc() - }) - convey.Convey("should return error when grpc dial context fails", func() { - testGrpcDialErrorFunc() - }) - convey.Convey("should return connection when successful", func() { - testSuccessfulConnection() - }) - }) -} - -func testEmptyEndpoint() { - conn, err := GetConnection("") - convey.So(conn, convey.ShouldBeNil) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldContainSubstring, testEndpointNotSetError) -} - -func testInvalidEndpointFunc() { - patches := 
gomonkey.ApplyFuncReturn(getAddressAndDialer, "", nil, errors.New(testInvalidEndpointError)) - defer patches.Reset() - conn, err := GetConnection(testInvalidEndpoint) - convey.So(conn, convey.ShouldBeNil) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldContainSubstring, testInvalidEndpointError) -} - -func testGrpcDialErrorFunc() { - patches := gomonkey.ApplyFunc(getAddressAndDialer, - func(endpoint string) (string, func(ctx context.Context, addr string) (net.Conn, error), error) { - return testTarget, func(ctx context.Context, addr string) (net.Conn, error) { - return nil, errors.New(testDialError) - }, nil - }) - defer patches.Reset() - patches.ApplyFuncReturn(grpc.DialContext, nil, errors.New(testGrpcDialError)) - conn, err := GetConnection(testUnixSocket) - convey.So(conn, convey.ShouldBeNil) - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldContainSubstring, testGrpcDialError) -} - -func testSuccessfulConnection() { - mockConn := &grpc.ClientConn{} - patches := gomonkey.ApplyFunc(getAddressAndDialer, - func(endpoint string) (string, func(ctx context.Context, addr string) (net.Conn, error), error) { - return testTarget, func(ctx context.Context, addr string) (net.Conn, error) { - return nil, nil - }, nil - }) - defer patches.Reset() - patches.ApplyFuncReturn(grpc.DialContext, mockConn, nil) - conn, err := GetConnection(testUnixSocket) - convey.So(conn, convey.ShouldEqual, mockConn) - convey.So(err, convey.ShouldBeNil) -} - -func TestParseSocketEndpoint(t *testing.T) { - testCases := []struct { - name string - endpoint string - expectedScheme string - expectedAddr string - expectedError bool - }{ - {name: "should parse unix endpoint when valid", endpoint: "unix:///tmp/test.sock", - expectedScheme: testUnixScheme, expectedAddr: testUnixAddr, expectedError: false}, - {name: "should parse tcp endpoint when valid", endpoint: "tcp://localhost:8080", - expectedScheme: testTcpScheme, expectedAddr: testTcpAddr, 
expectedError: false}, - {name: "should return error when scheme is invalid", endpoint: "http://localhost:8080", - expectedScheme: "http", expectedAddr: "", expectedError: true}, - {name: "should return error when url is invalid", endpoint: testInvalidURL, - expectedScheme: "", expectedAddr: "", expectedError: true}, - } - - for _, tc := range testCases { - convey.Convey(tc.name, t, func() { - scheme, addr, err := parseSocketEndpoint(tc.endpoint) - convey.So(scheme, convey.ShouldEqual, tc.expectedScheme) - convey.So(addr, convey.ShouldEqual, tc.expectedAddr) - if tc.expectedError { - convey.So(err, convey.ShouldNotBeNil) - } else { - convey.So(err, convey.ShouldBeNil) - } - }) - } -} - -func TestGetAddressAndDialer(t *testing.T) { - convey.Convey("TestGetAddressAndDialer", t, func() { - testCases := []struct { - name string - endpoint string - expectedAddr string - expectedError bool - }{ - { - name: "should return address when unix endpoint is valid", - endpoint: "unix:///tmp/test.sock", - expectedAddr: "/tmp/test.sock", - expectedError: false, - }, - { - name: "should return error when scheme is invalid", - endpoint: "tcp://localhost:8080", - expectedAddr: "", - expectedError: true, - }, - { - name: "should return error when parse fails", - endpoint: "://invalid", - expectedAddr: "", - expectedError: true, - }, - } - - for _, tc := range testCases { - convey.Convey(tc.name, func() { - addr, dialer, err := getAddressAndDialer(tc.endpoint) - convey.So(addr, convey.ShouldEqual, tc.expectedAddr) - if tc.expectedError { - convey.So(dialer, convey.ShouldBeNil) - convey.So(err, convey.ShouldNotBeNil) - } else { - convey.So(dialer, convey.ShouldNotBeNil) - convey.So(err, convey.ShouldBeNil) - } - }) - } - }) -} - -func TestDial(t *testing.T) { - convey.Convey("should call net.Dialer.DialContext when dialing", t, func() { - var dialerCalled bool - patches := gomonkey.ApplyMethod(&net.Dialer{}, "DialContext", - func(d *net.Dialer, ctx context.Context, network, address 
string) (net.Conn, error) { - dialerCalled = true - return nil, errors.New("mock dial error") - }) - defer patches.Reset() - ctx := context.Background() - conn, err := dial(ctx, "/tmp/test.sock") - convey.So(conn, convey.ShouldBeNil) - convey.So(err, convey.ShouldNotBeNil) - convey.So(dialerCalled, convey.ShouldBeTrue) - }) -} - -func TestValidDNSRe(t *testing.T) { - convey.Convey("TestValidDNSRe", t, func() { - testCases := []struct { - name string - dnsContent string - expectedError bool - }{ - {name: "should pass validation when dns content has valid length", - dnsContent: testDNSContent, expectedError: false}, - {name: "should return error when dns content is empty", - dnsContent: testEmptyDNSContent, expectedError: true}, - {name: "should return error when dns content is too long", - dnsContent: string(make([]byte, MaxLenDNS+1)), expectedError: true}, - {name: "should pass validation when dns content has minimum valid length", - dnsContent: testMinDNSContent, expectedError: false}, - {name: "should pass validation when dns content has maximum valid length", - dnsContent: string(make([]byte, MaxLenDNS)), expectedError: false}, - } - - for _, tc := range testCases { - convey.Convey(tc.name, func() { - err := validDNSRe(tc.dnsContent) - if tc.expectedError { - convey.So(err, convey.ShouldNotBeNil) - convey.So(err.Error(), convey.ShouldContainSubstring, "param len invalid") - } else { - convey.So(err, convey.ShouldBeNil) - } - }) - } - }) -} - -func TestMakeUpDeviceInfo(t *testing.T) { - testCases := getMakeUpDeviceInfoTestCases() - for _, tc := range testCases { - convey.Convey(tc.name, t, func() { - deviceInfo, err := makeUpDeviceInfo(tc.container) - validateMakeUpDeviceInfoResult(deviceInfo, err, tc) - }) - } -} - -func getMakeUpDeviceInfoTestCases() []struct { - name string - container *CommonContainer - expectedError bool - expectedName string -} { - return []struct { - name string - container *CommonContainer - expectedError bool - expectedName string - }{ - 
{name: "should return valid device info when container has all labels", - container: createValidContainer(), expectedError: false, expectedName: "default_test-pod_test-container"}, - {name: "should return error when container has invalid namespace length", - container: createContainerWithEmptyNamespace(), expectedError: true, expectedName: ""}, - {name: "should return error when container has invalid pod name length", - container: createContainerWithEmptyPodName(), expectedError: true, expectedName: ""}, - {name: "should return error when container has invalid container name length", - container: createContainerWithEmptyContainerName(), expectedError: true, expectedName: ""}, - {name: "should return error when container has too long namespace", - container: createContainerWithLongNamespace(), expectedError: true, expectedName: ""}, - } -} - -func createValidContainer() *CommonContainer { - return &CommonContainer{Id: testContainerID, Labels: map[string]string{ - labelK8sPodNamespace: testPodNamespace, labelK8sPodName: testPodName, - labelContainerName: testContainerName}} -} -func createContainerWithEmptyNamespace() *CommonContainer { - return &CommonContainer{Id: testContainerID, Labels: map[string]string{ - labelK8sPodNamespace: testEmptyNamespace, labelK8sPodName: testPodName, - labelContainerName: testContainerName}} -} -func createContainerWithEmptyPodName() *CommonContainer { - return &CommonContainer{Id: testContainerID, Labels: map[string]string{ - labelK8sPodNamespace: testPodNamespace, labelK8sPodName: testEmptyPodName, - labelContainerName: testContainerName}} -} -func createContainerWithEmptyContainerName() *CommonContainer { - return &CommonContainer{Id: testContainerID, Labels: map[string]string{ - labelK8sPodNamespace: testPodNamespace, labelK8sPodName: testPodName, - labelContainerName: testEmptyContainerName}} -} - -func createContainerWithLongNamespace() *CommonContainer { - return &CommonContainer{Id: testContainerID, Labels: map[string]string{ - 
labelK8sPodNamespace: string(make([]byte, MaxLenDNS+1)), - labelK8sPodName: testPodName, labelContainerName: testContainerName}} -} - -func validateMakeUpDeviceInfoResult(deviceInfo DevicesInfo, err error, tc struct { - name string - container *CommonContainer - expectedError bool - expectedName string -}) { - if tc.expectedError { - convey.So(err, convey.ShouldNotBeNil) - convey.So(deviceInfo, convey.ShouldResemble, DevicesInfo{}) - } else { - convey.So(err, convey.ShouldBeNil) - convey.So(deviceInfo.ID, convey.ShouldEqual, tc.container.Id) - convey.So(deviceInfo.Name, convey.ShouldEqual, tc.expectedName) - } -} diff --git a/mind-cluster/component/npu-exporter/collector/container/v1/containerd.pb.go b/mind-cluster/component/npu-exporter/collector/container/v1/containerd.pb.go deleted file mode 100644 index 46762f3..0000000 --- a/mind-cluster/component/npu-exporter/collector/container/v1/containerd.pb.go +++ /dev/null @@ -1,310 +0,0 @@ -// Code generated by protoc-gen-go. DO NOT EDIT. -// source: containerd.proto -// protoc:3.13.0 -// protoc-gen-go 1.3.5 - -package v1 - -import ( - "context" - "fmt" - "math" - - "github.com/golang/protobuf/proto" - "github.com/golang/protobuf/ptypes/any" - "google.golang.org/grpc" - "google.golang.org/grpc/codes" - "google.golang.org/grpc/status" -) - -// Reference imports to suppress errors if they are not otherwise used. -var _ = fmt.Errorf -var _ = math.Inf -var _ = proto.Marshal - -// This is a compile-time assertion to ensure that this generated file -// is compatible with the proto package it is being compiled against. -// A compilation error at this line likely means your copy of the -// proto package needs to be updated. 
-const _ = proto.ProtoPackageIsVersion3 // please upgrade the proto package - -type Container struct { - // ID the container id - Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"` - // Labels the container labels - Labels map[string]string `protobuf:"bytes,2,rep,name=labels,proto3" json:"labels,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` - // Image the container image - Image string `protobuf:"bytes,3,opt,name=image,proto3" json:"image,omitempty"` - // Spec runtime specific. - Spec *any.Any `protobuf:"bytes,5,opt,name=spec,proto3" json:"spec,omitempty"` - XXX_NoUnkeyedLiteral struct{} `json:"-"` - XXX_unrecognized []byte `json:"-"` - XXX_sizecache int32 `json:"-"` -} - -// Reset reset the object -func (m *Container) Reset() { *m = Container{} } - -// String -func (m *Container) String() string { return proto.CompactTextString(m) } - -// ProtoMessage -func (*Container) ProtoMessage() {} - -// Descriptor -func (*Container) Descriptor() ([]byte, []int) { - return fileDescriptor_29bcc067d8d1b7d0, []int{0} -} - -// XXX_Unmarshal -func (m *Container) XXX_Unmarshal(b []byte) error { - return xxx_messageInfo_Container.Unmarshal(m, b) -} - -// XXX_Marshal -func (m *Container) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { - return xxx_messageInfo_Container.Marshal(b, m, deterministic) -} - -// XXX_Merge -func (m *Container) XXX_Merge(src proto.Message) { - xxx_messageInfo_Container.Merge(m, src) -} - -// XXX_Size -func (m *Container) XXX_Size() int { - return xxx_messageInfo_Container.Size(m) -} - -// XXX_DiscardUnknown -func (m *Container) XXX_DiscardUnknown() { - xxx_messageInfo_Container.DiscardUnknown(m) -} - -var xxx_messageInfo_Container proto.InternalMessageInfo - -// GetId -func (m *Container) GetId() string { - if m != nil { - return m.Id - } - return "" -} - -// GetLabels -func (m *Container) GetLabels() map[string]string { - if m != nil { - return m.Labels - } - return nil -} 
- -// GetImage -func (m *Container) GetImage() string { - if m != nil { - return m.Image - } - return "" -} - -// GetSpec -func (m *Container) GetSpec() *any.Any { - if m != nil { - return m.Spec - } - return nil -} - -type GetContainerRequest struct { - Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"` - XXX_NoUnkeyedLiteral struct{} `json:"-"` - XXX_unrecognized []byte `json:"-"` - XXX_sizecache int32 `json:"-"` -} - -func (m *GetContainerRequest) Reset() { *m = GetContainerRequest{} } -func (m *GetContainerRequest) String() string { return proto.CompactTextString(m) } -func (*GetContainerRequest) ProtoMessage() {} -func (*GetContainerRequest) Descriptor() ([]byte, []int) { - return fileDescriptor_29bcc067d8d1b7d0, []int{1} -} - -func (m *GetContainerRequest) XXX_Unmarshal(b []byte) error { - return xxx_messageInfo_GetContainerRequest.Unmarshal(m, b) -} -func (m *GetContainerRequest) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { - return xxx_messageInfo_GetContainerRequest.Marshal(b, m, deterministic) -} -func (m *GetContainerRequest) XXX_Merge(src proto.Message) { - xxx_messageInfo_GetContainerRequest.Merge(m, src) -} -func (m *GetContainerRequest) XXX_Size() int { - return xxx_messageInfo_GetContainerRequest.Size(m) -} -func (m *GetContainerRequest) XXX_DiscardUnknown() { - xxx_messageInfo_GetContainerRequest.DiscardUnknown(m) -} - -var xxx_messageInfo_GetContainerRequest proto.InternalMessageInfo - -func (m *GetContainerRequest) GetId() string { - if m != nil { - return m.Id - } - return "" -} - -type GetContainerResponse struct { - Container *Container `protobuf:"bytes,1,opt,name=container,proto3" json:"container,omitempty"` - XXX_NoUnkeyedLiteral struct{} `json:"-"` - XXX_unrecognized []byte `json:"-"` - XXX_sizecache int32 `json:"-"` -} - -func (m *GetContainerResponse) Reset() { *m = GetContainerResponse{} } -func (m *GetContainerResponse) String() string { return proto.CompactTextString(m) } -func (*GetContainerResponse) 
ProtoMessage() {} -func (*GetContainerResponse) Descriptor() ([]byte, []int) { - return fileDescriptor_29bcc067d8d1b7d0, []int{2} -} - -func (m *GetContainerResponse) XXX_Unmarshal(b []byte) error { - return xxx_messageInfo_GetContainerResponse.Unmarshal(m, b) -} -func (m *GetContainerResponse) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { - return xxx_messageInfo_GetContainerResponse.Marshal(b, m, deterministic) -} -func (m *GetContainerResponse) XXX_Merge(src proto.Message) { - xxx_messageInfo_GetContainerResponse.Merge(m, src) -} -func (m *GetContainerResponse) XXX_Size() int { - return xxx_messageInfo_GetContainerResponse.Size(m) -} -func (m *GetContainerResponse) XXX_DiscardUnknown() { - xxx_messageInfo_GetContainerResponse.DiscardUnknown(m) -} - -var xxx_messageInfo_GetContainerResponse proto.InternalMessageInfo - -func (m *GetContainerResponse) GetContainer() *Container { - if m != nil { - return m.Container - } - return nil -} - -func init() { - proto.RegisterType((*Container)(nil), "containerd.services.containers.v1.Container") - proto.RegisterMapType((map[string]string)(nil), "containerd.services.containers.v1.Container.LabelsEntry") - proto.RegisterType((*GetContainerRequest)(nil), "containerd.services.containers.v1.GetContainerRequest") - proto.RegisterType((*GetContainerResponse)(nil), "containerd.services.containers.v1.GetContainerResponse") -} - -func init() { - proto.RegisterFile("containerd.proto", fileDescriptor_29bcc067d8d1b7d0) -} - -var fileDescriptor_29bcc067d8d1b7d0 = []byte{ - // 327 bytes of a gzipped FileDescriptorProto - 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x94, 0x51, 0x4f, 0x4b, 0xc3, 0x30, - 0x14, 0xa7, 0xad, 0x1b, 0xec, 0x15, 0x64, 0xc4, 0x1d, 0xea, 0x4e, 0x73, 0x20, 0xf4, 0xa0, 0xa9, - 0xab, 0xa0, 0x53, 0x4f, 0x2a, 0x32, 0x10, 0x0f, 0xd2, 0xa3, 0xb7, 0xb6, 0x7b, 0xce, 0x62, 0x96, - 0xd4, 0x24, 0xad, 0xf6, 0xee, 0x87, 0xf5, 0x63, 0xc8, 0xd2, 0xad, 0x4e, 0x11, 0x74, 0xb7, 0xf7, - 0x5e, 0x7f, 0x7f, 
0x1b, 0xe8, 0xa6, 0x82, 0xeb, 0x38, 0xe3, 0x28, 0xa7, 0x34, 0x97, 0x42, 0x0b, - 0xb2, 0xb7, 0x76, 0x51, 0x28, 0xcb, 0x2c, 0x45, 0x45, 0x9b, 0x9b, 0xa2, 0xe5, 0xa8, 0xbf, 0x3b, - 0x13, 0x62, 0xc6, 0x30, 0x30, 0x84, 0xa4, 0x78, 0x0c, 0x62, 0x5e, 0xd5, 0xec, 0xe1, 0x87, 0x05, - 0x9d, 0xeb, 0x15, 0x98, 0x6c, 0x83, 0x9d, 0x4d, 0x3d, 0x6b, 0x60, 0xf9, 0x9d, 0xc8, 0xce, 0xa6, - 0xe4, 0x1e, 0xda, 0x2c, 0x4e, 0x90, 0x29, 0xcf, 0x1e, 0x38, 0xbe, 0x1b, 0x8e, 0xe9, 0x9f, 0x66, - 0xb4, 0x51, 0xa3, 0x77, 0x86, 0x7a, 0xc3, 0xb5, 0xac, 0xa2, 0xa5, 0x0e, 0xe9, 0x41, 0x2b, 0x9b, - 0xc7, 0x33, 0xf4, 0x1c, 0x63, 0x52, 0x2f, 0xc4, 0x87, 0x2d, 0x95, 0x63, 0xea, 0xb5, 0x06, 0x96, - 0xef, 0x86, 0x3d, 0x5a, 0xe7, 0xa5, 0xab, 0xbc, 0xf4, 0x92, 0x57, 0x91, 0x41, 0xf4, 0xcf, 0xc0, - 0x5d, 0x93, 0x25, 0x5d, 0x70, 0x9e, 0xb1, 0x5a, 0x26, 0x5e, 0x8c, 0x0b, 0x83, 0x32, 0x66, 0x05, - 0x7a, 0x76, 0x6d, 0x60, 0x96, 0x73, 0x7b, 0x6c, 0x0d, 0xf7, 0x61, 0x67, 0x82, 0xba, 0x89, 0x17, - 0xe1, 0x4b, 0x81, 0x4a, 0xff, 0xec, 0x3c, 0x4c, 0xa0, 0xf7, 0x1d, 0xa6, 0x72, 0xc1, 0x15, 0x92, - 0x5b, 0xe8, 0x34, 0x45, 0x0d, 0xdc, 0x0d, 0x0f, 0x36, 0xf9, 0x1d, 0xd1, 0x17, 0x3d, 0x7c, 0xb7, - 0x00, 0x9a, 0x0f, 0x8a, 0x94, 0xe0, 0x4c, 0x50, 0x93, 0x93, 0x7f, 0xc8, 0xfd, 0xd2, 0xa0, 0x7f, - 0xba, 0x31, 0xaf, 0xae, 0x74, 0x75, 0xf4, 0x40, 0x9f, 0x8a, 0xf8, 0x15, 0x33, 0x9a, 0x8a, 0x79, - 0xc0, 0xf3, 0xe2, 0x10, 0xdf, 0x72, 0x21, 0x35, 0xca, 0x20, 0x15, 0x8c, 0x61, 0xaa, 0xc5, 0x62, - 0x5a, 0xd2, 0x2e, 0xca, 0x51, 0xd2, 0x36, 0x4f, 0x72, 0xfc, 0x19, 0x00, 0x00, 0xff, 0xff, 0x30, - 0xcc, 0x1c, 0x74, 0x87, 0x02, 0x00, 0x00, -} - -// Reference imports to suppress errors if they are not otherwise used. -var _ context.Context -var _ grpc.ClientConnInterface - -// This is a compile-time assertion to ensure that this generated file -// is compatible with the grpc package it is being compiled against. -const _ = grpc.SupportPackageIsVersion6 - -// ContainersClient is the client API for Containers service. 
-// -// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://godoc.org/google.golang.org/grpc#ClientConn.NewStream. -type ContainersClient interface { - Get(ctx context.Context, in *GetContainerRequest, opts ...grpc.CallOption) (*GetContainerResponse, error) -} - -type containersClient struct { - cc grpc.ClientConnInterface -} - -func NewContainersClient(cc grpc.ClientConnInterface) ContainersClient { - return &containersClient{cc} -} - -func (c *containersClient) Get(ctx context.Context, in *GetContainerRequest, opts ...grpc.CallOption) (*GetContainerResponse, error) { - out := new(GetContainerResponse) - err := c.cc.Invoke(ctx, "/containerd.services.containers.v1.Containers/Get", in, out, opts...) - if err != nil { - return nil, err - } - return out, nil -} - -// ContainersServer is the server API for Containers service. -type ContainersServer interface { - Get(context.Context, *GetContainerRequest) (*GetContainerResponse, error) -} - -// UnimplementedContainersServer can be embedded to have forward compatible implementations. 
-type UnimplementedContainersServer struct { -} - -func (*UnimplementedContainersServer) Get(context.Context, *GetContainerRequest) (*GetContainerResponse, error) { - return nil, status.Errorf(codes.Unimplemented, "method Get not implemented") -} - -func RegisterContainersServer(s *grpc.Server, srv ContainersServer) { - s.RegisterService(&_Containers_desc, srv) -} - -func _Containers_Get_Method(srv interface{}, ctx context.Context, desc func(interface{}) error, itcpt grpc.UnaryServerInterceptor) (interface{}, error) { - in := new(GetContainerRequest) - if err := desc(in); err != nil { - return nil, err - } - if itcpt == nil { - return srv.(ContainersServer).Get(ctx, in) - } - info := &grpc.UnaryServerInfo{ - Server: srv, - FullMethod: "/containerd.services.containers.v1.Containers/Get", - } - handler := func(ctx context.Context, request interface{}) (interface{}, error) { - return srv.(ContainersServer).Get(ctx, request.(*GetContainerRequest)) - } - return itcpt(ctx, in, info, handler) -} - -var _Containers_desc = grpc.ServiceDesc{ - ServiceName: "containerd.services.containers.v1.Containers", - HandlerType: (*ContainersServer)(nil), - Methods: []grpc.MethodDesc{ - { - MethodName: "Get", - Handler: _Containers_Get_Method, - }, - }, - Streams: []grpc.StreamDesc{}, - Metadata: "containerd.proto", -} diff --git a/mind-cluster/component/npu-exporter/collector/container/v1/containerd.proto b/mind-cluster/component/npu-exporter/collector/container/v1/containerd.proto deleted file mode 100644 index 48a4a4b..0000000 --- a/mind-cluster/component/npu-exporter/collector/container/v1/containerd.proto +++ /dev/null @@ -1,62 +0,0 @@ -syntax = "proto3"; - -package containerd.services.containers.v1; - - -import "google/protobuf/any.proto"; -import "google/protobuf/timestamp.proto"; - -option go_package = "huawei.com/npu-exporter/v6/collector/container;v1"; - -// Containers provides metadata storage for containers used in the execution -// service. 
-service Containers { - rpc Get(GetContainerRequest) returns (GetContainerResponse); -} - -message Container { - // ID is the user-specified identifier. - string id = 1; - - // Labels provides an area to include arbitrary data on containers. - map labels = 2; - - // Image contains the reference of the image used to build the - string image = 3; - - message Runtime { - // Name is the name of the runtime. - string name = 1; - // Options runtime initialization options. - google.protobuf.Any options = 2; - } - // Runtime specifies runtime. - Runtime runtime = 4; - - // Spec opencotainer spec. - google.protobuf.Any spec = 5; - - // Snapshotter is the snapshotter name used for rootfs - string snapshotter = 6; - - // SnapshotKey the snapshot key to use for the container's root - string snapshot_key = 7; - - // CreatedAt is the create time of container. - google.protobuf.Timestamp created_at = 8 ; - - // UpdatedAt is the last update of container. - google.protobuf.Timestamp updated_at = 9 ; - - // Extensions allow clients to provide zero or more blobs that are directly - map extensions = 10 ; -} - -message GetContainerRequest { - string id = 1; -} - -message GetContainerResponse { - Container container = 1 ; -} - diff --git a/mind-cluster/component/npu-exporter/collector/container/v1/spec.go b/mind-cluster/component/npu-exporter/collector/container/v1/spec.go deleted file mode 100644 index 2efa216..0000000 --- a/mind-cluster/component/npu-exporter/collector/container/v1/spec.go +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package v1 implement the containerd client -package v1 - -// Spec is the base configuration for the container. -type Spec struct { - // Linux is platform-specific configuration for Linux based containers. - Linux *Linux `json:"linux,omitempty" platform:"linux"` - // Process for get capabilities - Process *Process `json:"process,omitempty" platform:"linux"` -} - -// Process is the base configuration for the container. -type Process struct { - // Env for container env - Env []string `json:"env,omitempty" platform:"linux"` -} - -// Linux contains platform-specific configuration for Linux based containers. -type Linux struct { - // Resources contain cgroup information for handling resource constraints - // for the container - Resources *LinuxResources `json:"resources,omitempty"` - // Devices are a list of device nodes that are created for the container -} - -// LinuxResources has container runtime resource constraints -type LinuxResources struct { - // Devices configures the device allowlist. - Devices []LinuxDeviceCgroup `json:"devices,omitempty"` -} - -// LinuxDeviceCgroup represents a device rule for the devices specified to -// the device controller -type LinuxDeviceCgroup struct { - // Allow or deny - Allow bool `json:"allow"` - // Device type, block, char, etc. - Type string `json:"type,omitempty"` - // Major is the device's major number. - Major *int64 `json:"major,omitempty"` - // Minor is the device's minor number. - Minor *int64 `json:"minor,omitempty"` - // Cgroup access permissions format, rwm. 
- Access string `json:"access,omitempty"` -} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_ddr.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_ddr.go deleted file mode 100644 index 53a7645..0000000 --- a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_ddr.go +++ /dev/null @@ -1,142 +0,0 @@ -/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package metrics for general collector -package metrics - -import ( - "time" - - "github.com/prometheus/client_golang/prometheus" - - "ascend-common/api" - "ascend-common/common-utils/hwlog" - "ascend-common/devmanager/common" - colcommon "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" - "huawei.com/npu-exporter/v6/utils/logger" -) - -var ( - descTotalMemory = colcommon.BuildDesc("npu_chip_info_total_memory", "the npu total memory") - descUsedMemory = colcommon.BuildDesc("npu_chip_info_used_memory", "the npu used memory") - - notSupportedDdrDevices = map[string]bool{ - api.Ascend910B: true, - api.Ascend910A3: true, - } -) - -type ddrCache struct { - chip colcommon.HuaWeiAIChip - timestamp time.Time - // extInfo the memoryInfo of the chip - extInfo *common.MemoryInfo -} - -// DdrCollector collect ddr info -type DdrCollector struct { - colcommon.MetricsCollectorAdapter -} - -// IsSupported check whether the metric is supported -func (c 
*DdrCollector) IsSupported(n *colcommon.NpuCollector) bool { - isSupport := !notSupportedDdrDevices[n.Dmgr.GetDevType()] - logForUnSupportDevice(isSupport, n.Dmgr.GetDevType(), colcommon.GetCacheKey(c), - "there is no DDR module. DDR information cannot be queried.") - return isSupport -} - -// Describe description of the metric -func (c *DdrCollector) Describe(ch chan<- *prometheus.Desc) { - ch <- descTotalMemory - ch <- descUsedMemory -} - -// CollectToCache collect the metric to cache -func (c *DdrCollector) CollectToCache(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) { - - for _, chip := range chipList { - logicID := chip.LogicID - mem, err := n.Dmgr.GetDeviceMemoryInfo(logicID) - if err != nil { - logErrMetricsWithLimit(colcommon.DomainForDDR, logicID, err) - continue - } - hwlog.ResetErrCnt(colcommon.DomainForDDR, logicID) - - c.LocalCache.Store(chip.PhyId, ddrCache{chip: chip, timestamp: time.Now(), extInfo: mem}) - } - colcommon.UpdateCache[ddrCache](n, colcommon.GetCacheKey(c), &c.LocalCache) - -} - -// UpdatePrometheus update prometheus metrics -func (c *DdrCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) { - - updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache ddrCache, cardLabel []string) { - extInfo := cache.extInfo - if extInfo == nil { - return - } - memorySize := extInfo.MemorySize - memoryAvailable := extInfo.MemoryAvailable - - doUpdateMetric(ch, cache.timestamp, memorySize, cardLabel, descTotalMemory) - doUpdateMetric(ch, cache.timestamp, memorySize-memoryAvailable, cardLabel, descUsedMemory) - - // vnpu not support this metrics - vDevActivityInfo := chipWithVnpu.VDevActivityInfo - if vDevActivityInfo != nil && common.IsValidVDevID(vDevActivityInfo.VDevID) { - return - } - - containerNameArray := getContainerNameArray(geenContainerInfo(&chipWithVnpu, containerMap)) - if !c.Is910Series && 
len(containerNameArray) == colcommon.ContainerNameLen { - doUpdateMetric(ch, cache.timestamp, memorySize, cardLabel, npuCtrTotalMemory) - doUpdateMetric(ch, cache.timestamp, memorySize-memoryAvailable, cardLabel, npuCtrUsedMemory) - } - } - - updateFrame[ddrCache](colcommon.GetCacheKey(c), n, containerMap, chips, updateSingleChip) -} - -// UpdateTelegraf update telegraf metrics -func (c *DdrCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} { - - caches := colcommon.GetInfoFromCache[ddrCache](n, colcommon.GetCacheKey(c)) - for _, chip := range chips { - cache, ok := caches[chip.PhyId] - if !ok { - logger.Debugf("cacheKey(%v) not found", chip.PhyId) - continue - } - fieldMap := getFieldMap(fieldsMap, cache.chip.LogicID) - - memoryInfo := cache.extInfo - if memoryInfo == nil { - logger.Debugf("info in cache is nil,cacheKey(%v)", chip.PhyId) - continue - } - memorySize := memoryInfo.MemorySize - memoryAvailable := memoryInfo.MemoryAvailable - - doUpdateTelegraf(fieldMap, descTotalMemory, memorySize, "") - doUpdateTelegraf(fieldMap, descUsedMemory, memorySize-memoryAvailable, "") - - } - return fieldsMap -} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hbm.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hbm.go deleted file mode 100644 index d9f5601..0000000 --- a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hbm.go +++ /dev/null @@ -1,228 +0,0 @@ -/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package metrics for general collector -package metrics - -import ( - "time" - - "github.com/prometheus/client_golang/prometheus" - - "ascend-common/api" - "ascend-common/devmanager" - "ascend-common/devmanager/common" - colcommon "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" -) - -var ( - descHbmUsedMemory = colcommon.BuildDesc("npu_chip_info_hbm_used_memory", "the npu hbm used memory") - descHbmTotalMemory = colcommon.BuildDesc("npu_chip_info_hbm_total_memory", "the npu hbm total memory") - descHbmUtilization = colcommon.BuildDesc("npu_chip_info_hbm_utilization", "the npu hbm utilization") - descHbmTemperature = colcommon.BuildDesc("npu_chip_info_hbm_temperature", "the npu hbm temperature") - descHbmBWUtil = colcommon.BuildDesc("npu_chip_info_hbm_bandwidth_utilization", "the npu hbm bandwidth util rate") - - descEccEnableFlag = colcommon.BuildDesc("npu_chip_info_hbm_ecc_enable_flag", - "whether HBM ecc detection is enabled") - descEccSingleBitErrorCnt = colcommon.BuildDesc("npu_chip_info_hbm_ecc_single_bit_error_cnt", - "HBM Single Bit Error Count") - descEccDoubleBitErrorCnt = colcommon.BuildDesc("npu_chip_info_hbm_ecc_double_bit_error_cnt", - "HBM Double Bit Error Count") - - descEccTotalSingleBitErrorCnt = colcommon.BuildDesc("npu_chip_info_hbm_ecc_total_single_bit_error_cnt", - "HBM Single Bit Aggregate Total Err Cnt") - descEccTotalDoubleBitErrorCnt = colcommon.BuildDesc("npu_chip_info_hbm_ecc_total_double_bit_error_cnt", - "HBM Double Bit Aggregate Total Err Cnt") - descEccSingleBitIoslatedPagesCnt 
= colcommon.BuildDesc("npu_chip_info_hbm_ecc_single_bit_isolated_pages_cnt", - "HBM Single Bit Isolated Pages Count") - descEccDoubleBitIoslatedPagesCnt = colcommon.BuildDesc("npu_chip_info_hbm_ecc_double_bit_isolated_pages_cnt", - "HBM Double Bit Isolated Pages Count") -) - -var ( - supportedHbmDevices = map[string]bool{ - api.Ascend910A: true, - api.Ascend910B: true, - api.Ascend910A3: true, - } -) - -type hbmCache struct { - chip colcommon.HuaWeiAIChip - timestamp time.Time - // extInfo the hbm info - extInfo *common.HbmAggregateInfo - // hbmUtilization the hbm utilization - hbmUtilization uint32 -} - -// HbmCollector collects hbm info -type HbmCollector struct { - colcommon.MetricsCollectorAdapter -} - -// IsSupported check whether the collector is supported -func (c *HbmCollector) IsSupported(n *colcommon.NpuCollector) bool { - isSupport := supportedHbmDevices[n.Dmgr.GetDevType()] - logForUnSupportDevice(isSupport, n.Dmgr.GetDevType(), colcommon.GetCacheKey(c), "") - return isSupport -} - -// Describe describes all the metrics that will be exposed. -func (c *HbmCollector) Describe(ch chan<- *prometheus.Desc) { - ch <- descHbmUsedMemory - ch <- descHbmTotalMemory - ch <- descHbmUtilization - ch <- descHbmTemperature - ch <- descHbmBWUtil - - ch <- descEccEnableFlag - ch <- descEccSingleBitErrorCnt - ch <- descEccDoubleBitErrorCnt - ch <- descEccTotalSingleBitErrorCnt - ch <- descEccTotalDoubleBitErrorCnt - ch <- descEccSingleBitIoslatedPagesCnt - ch <- descEccDoubleBitIoslatedPagesCnt -} - -// CollectToCache collects hbm info -func (c *HbmCollector) CollectToCache(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) { - for _, chip := range chipList { - getAllHBMEccInfo(c, chip.LogicID, n.Dmgr, &chip) - } - colcommon.UpdateCache[hbmCache](n, colcommon.GetCacheKey(c), &c.LocalCache) -} - -// UpdatePrometheus updates the prometheus metrics. 
-func (c *HbmCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) { - - updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache hbmCache, cardLabel []string) { - extInfo := cache.extInfo - if extInfo == nil { - return - } - timestamp := cache.timestamp - doUpdateMetricWithValidateNum(ch, timestamp, float64(cache.hbmUtilization), cardLabel, descHbmUtilization) - - c.updateHbmInfo(ch, cache, cardLabel, containerMap, chipWithVnpu) - - eccInfo := extInfo.ECCInfo - updateHbmEccInfo(ch, eccInfo, timestamp, cardLabel) - } - - updateFrame[hbmCache](colcommon.GetCacheKey(c), n, containerMap, chips, updateSingleChip) -} - -// UpdateTelegraf updates the telegraf metrics. -func (c *HbmCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} { - caches := colcommon.GetInfoFromCache[hbmCache](n, colcommon.GetCacheKey(c)) - for _, chip := range chips { - cache, ok := caches[chip.PhyId] - if !ok { - continue - } - fieldMap := getFieldMap(fieldsMap, cache.chip.LogicID) - - extInfo := cache.extInfo - if extInfo == nil { - continue - } - - doUpdateTelegrafWithValidateNum(fieldMap, descHbmUtilization, float64(cache.hbmUtilization), "") - - hbmInfo := extInfo.HbmInfo - if hbmInfo != nil { - doUpdateTelegraf(fieldMap, descHbmUsedMemory, hbmInfo.Usage, "") - doUpdateTelegraf(fieldMap, descHbmTotalMemory, hbmInfo.MemorySize, "") - doUpdateTelegraf(fieldMap, descHbmTemperature, hbmInfo.Temp, "") - doUpdateTelegraf(fieldMap, descHbmBWUtil, hbmInfo.BandWidthUtilRate, "") - } - - eccInfo := extInfo.ECCInfo - if eccInfo != nil { - doUpdateTelegraf(fieldMap, descEccEnableFlag, eccInfo.EnableFlag, "") - doUpdateTelegraf(fieldMap, descEccSingleBitErrorCnt, eccInfo.SingleBitErrorCnt, "") - doUpdateTelegraf(fieldMap, 
descEccDoubleBitErrorCnt, eccInfo.DoubleBitErrorCnt, "") - doUpdateTelegraf(fieldMap, descEccTotalSingleBitErrorCnt, eccInfo.TotalSingleBitErrorCnt, "") - doUpdateTelegraf(fieldMap, descEccTotalDoubleBitErrorCnt, eccInfo.TotalDoubleBitErrorCnt, "") - doUpdateTelegraf(fieldMap, descEccSingleBitIoslatedPagesCnt, eccInfo.SingleBitIsolatedPagesCnt, "") - doUpdateTelegraf(fieldMap, descEccDoubleBitIoslatedPagesCnt, eccInfo.DoubleBitIsolatedPagesCnt, "") - - } - } - return fieldsMap - -} - -func getAllHBMEccInfo(c *HbmCollector, logicID int32, dmgr devmanager.DeviceInterface, chip *colcommon.HuaWeiAIChip) { - - hbmInfo := &common.HbmAggregateInfo{} - var utilizationRate uint32 - var err error - hbmInfo.HbmInfo, err = dmgr.GetDeviceHbmInfo(logicID) - handleErr(err, colcommon.DomainForHBM, logicID) - - utilizationRate, err = dmgr.GetDeviceUtilizationRate(logicID, common.HbmUtilization) - handleErr(err, colcommon.DomainForHbmUtilization, logicID) - - hbmInfo.ECCInfo, err = dmgr.GetDeviceEccInfo(logicID, common.DcmiDeviceTypeHBM) - handleErr(err, colcommon.DomainForHBMECC, logicID) - c.LocalCache.Store(chip.PhyId, hbmCache{ - chip: *chip, - timestamp: time.Now(), - extInfo: hbmInfo, - hbmUtilization: utilizationRate}, - ) -} - -func updateHbmEccInfo(ch chan<- prometheus.Metric, eccInfo *common.ECCInfo, timestamp time.Time, cardLabel []string) { - if eccInfo == nil { - return - } - doUpdateMetric(ch, timestamp, eccInfo.EnableFlag, cardLabel, descEccEnableFlag) - doUpdateMetric(ch, timestamp, eccInfo.SingleBitErrorCnt, cardLabel, descEccSingleBitErrorCnt) - doUpdateMetric(ch, timestamp, eccInfo.DoubleBitErrorCnt, cardLabel, descEccDoubleBitErrorCnt) - doUpdateMetric(ch, timestamp, eccInfo.TotalSingleBitErrorCnt, cardLabel, descEccTotalSingleBitErrorCnt) - doUpdateMetric(ch, timestamp, eccInfo.TotalDoubleBitErrorCnt, cardLabel, descEccTotalDoubleBitErrorCnt) - doUpdateMetric(ch, timestamp, eccInfo.SingleBitIsolatedPagesCnt, cardLabel, descEccSingleBitIoslatedPagesCnt) - 
doUpdateMetric(ch, timestamp, eccInfo.DoubleBitIsolatedPagesCnt, cardLabel, descEccDoubleBitIoslatedPagesCnt) -} - -func (c *HbmCollector) updateHbmInfo(ch chan<- prometheus.Metric, cache hbmCache, cardLabel []string, - containerMap map[int32]container.DevicesInfo, chipWithVnpu colcommon.HuaWeiAIChip) { - hbmInfo := cache.extInfo - if hbmInfo == nil || hbmInfo.HbmInfo == nil { - return - } - timestamp := cache.timestamp - doUpdateMetric(ch, timestamp, hbmInfo.Usage, cardLabel, descHbmUsedMemory) - doUpdateMetric(ch, timestamp, hbmInfo.MemorySize, cardLabel, descHbmTotalMemory) - doUpdateMetric(ch, timestamp, hbmInfo.Temp, cardLabel, descHbmTemperature) - doUpdateMetric(ch, timestamp, hbmInfo.BandWidthUtilRate, cardLabel, descHbmBWUtil) - - // vnpu not support this metrics - vDevActivityInfo := chipWithVnpu.VDevActivityInfo - if vDevActivityInfo != nil && common.IsValidVDevID(vDevActivityInfo.VDevID) { - return - } - - containerNameArray := getContainerNameArray(geenContainerInfo(&chipWithVnpu, containerMap)) - if c.Is910Series && len(containerNameArray) == colcommon.ContainerNameLen { - doUpdateMetric(ch, timestamp, hbmInfo.MemorySize, cardLabel, npuCtrTotalMemory) - doUpdateMetric(ch, timestamp, hbmInfo.Usage, cardLabel, npuCtrUsedMemory) - } -} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hbm_test.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hbm_test.go deleted file mode 100644 index 4bf59cd..0000000 --- a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hbm_test.go +++ /dev/null @@ -1,115 +0,0 @@ -/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package metrics for general collector -package metrics - -import ( - "testing" - "time" - - "github.com/agiledragon/gomonkey/v2" - "github.com/prometheus/client_golang/prometheus" - "github.com/smartystreets/goconvey/convey" - - "ascend-common/devmanager/common" - colcommon "huawei.com/npu-exporter/v6/collector/common" -) - -type TestCase struct { - name string - initFunc func() - expectMetricLen int -} - -const ( - expectMetricLen4 = 4 - expectMetricLen6 = 6 - vdevId = 132 - maxMetrics = 10 - mockNs = "mockNs" - mockPodName = "mockPodName" -) - -func TestUpdateHbmInfo(t *testing.T) { - collector := HbmCollector{} - ch := make(chan int, maxMetrics) - defer close(ch) - cache := buildHbmCache() - chipWithVnpu := &colcommon.HuaWeiAIChip{} - cases := buildTestCases(&collector, chipWithVnpu, &cache) - patch := gomonkey.NewPatches() - patch.ApplyFunc(doUpdateMetric, func(_ chan<- prometheus.Metric, _ time.Time, _ interface{}, _ []string, - desc *prometheus.Desc) { - ch <- 0 - }) - patch.ApplyFuncReturn(geenContainerInfo, nil) - patch.ApplyFuncReturn(getContainerNameArray, []string{mockNs, mockPodName, mockContainerName}) - defer patch.Reset() - - for _, c := range cases { - convey.Convey(c.name, t, func() { - ch = make(chan int, maxMetrics) - c.initFunc() - collector.updateHbmInfo(nil, cache, nil, nil, *chipWithVnpu) - convey.So(len(ch), convey.ShouldEqual, c.expectMetricLen) - }) - } -} - -func buildTestCases(collector *HbmCollector, chipWithVnpu *colcommon.HuaWeiAIChip, cache *hbmCache) []TestCase { - cases := []TestCase{ - {name: "when npu is not 910 series 
", initFunc: func() {}, expectMetricLen: expectMetricLen4}, - {name: "when vnpu is nil and with container info", initFunc: func() { - collector.Is910Series = true - }, expectMetricLen: expectMetricLen6}, - {name: "when chip is vnpu", initFunc: func() { - chipWithVnpu.VDevActivityInfo = &common.VDevActivityInfo{ - VDevID: vdevId, - } - }, expectMetricLen: expectMetricLen4}, - {name: "when extInfo.HbmInfo is nil", initFunc: func() { cache.extInfo.HbmInfo = nil }, expectMetricLen: 0}, - {name: "when extInfo is nil", initFunc: func() { cache.extInfo = nil }, expectMetricLen: 0}, - } - return cases -} - -func buildHbmCache() hbmCache { - cache := hbmCache{ - chip: colcommon.HuaWeiAIChip{ - PhyId: 0, - }, - hbmUtilization: 0, - timestamp: time.Now(), - extInfo: &common.HbmAggregateInfo{ - HbmInfo: &common.HbmInfo{ - BandWidthUtilRate: 0, - Frequency: 0, - MemorySize: 0, - Temp: 0, - Usage: 0, - }, - ECCInfo: &common.ECCInfo{ - EnableFlag: 0, - SingleBitErrorCnt: 0, - DoubleBitErrorCnt: 0, - TotalSingleBitErrorCnt: 0, - TotalDoubleBitErrorCnt: 0, - SingleBitIsolatedPagesCnt: 0, - DoubleBitIsolatedPagesCnt: 0, - }, - }, - } - return cache -} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hccs.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hccs.go deleted file mode 100644 index 1ecc3a9..0000000 --- a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hccs.go +++ /dev/null @@ -1,312 +0,0 @@ -/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package metrics for general collector -package metrics - -import ( - "fmt" - "strconv" - "time" - - "github.com/prometheus/client_golang/prometheus" - - "ascend-common/api" - "ascend-common/devmanager/common" - colcommon "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" - "huawei.com/npu-exporter/v6/utils/logger" -) - -var ( - hccsTxDescs []*prometheus.Desc - hccsRxDescs []*prometheus.Desc - hccsErrDescs []*prometheus.Desc - hccsBWTxDescs []*prometheus.Desc - hccsBWRxDescs []*prometheus.Desc - hccsBWProfilingTime *prometheus.Desc = nil - hccsBWTotalTx *prometheus.Desc = nil - hccsBWTotalRx *prometheus.Desc = nil - - supportedHccsDevices = map[string]bool{ - api.Ascend910B: true, - api.Ascend910A3: true, - } -) - -const ( - // MaxHccsNum max hccs num - MaxHccsNum int = 8 - // hccs info begin index, 1 or 2 - num1 = 1 - num2 = 2 -) - -// init add descs in init method -func init() { - for i := 0; i < MaxHccsNum; i++ { - index := strconv.Itoa(i) - colcommon.BuildDescSlice(&hccsTxDescs, api.Prefix+"tx_cnt_"+index, - "transmitted message count for "+api.Hccs+" "+index) - colcommon.BuildDescSlice(&hccsRxDescs, api.Prefix+"rx_cnt_"+index, - "received message count for "+api.Hccs+" "+index) - colcommon.BuildDescSlice(&hccsErrDescs, api.Prefix+"crc_err_cnt_"+index, - "crc error count for "+api.Hccs+" "+index) - colcommon.BuildDescSlice(&hccsBWTxDescs, api.BwPrefix+"tx_"+index, - "single-link transmission data bandwidth for "+api.Hccs+" "+index) - colcommon.BuildDescSlice(&hccsBWRxDescs, api.BwPrefix+"rx_"+index, - "single-link receive data bandwidth for "+api.Hccs+" "+index) - } - hccsBWProfilingTime = colcommon.BuildDesc(api.BwPrefix+"profiling_time", - "sampling interval for "+api.Hccs+" bandwidth") - hccsBWTotalTx = colcommon.BuildDesc(api.BwPrefix+"total_tx", "total sent data bandwidth") - hccsBWTotalRx = 
colcommon.BuildDesc(api.BwPrefix+"total_rx", "total received data bandwidth") -} - -type hccsCache struct { - chip colcommon.HuaWeiAIChip - timestamp time.Time - // hccsStat hccs info of npu chip - hccsStat *common.HccsStatisticInfo - - // hccsBW hccs bandwidth info of npu chip - hccsBW *common.HccsBandwidthInfo -} - -// HccsCollector collect hccs info -type HccsCollector struct { - colcommon.MetricsCollectorAdapter - hccsBeginIndex int - - // Automatically adapt according to the interface call - realGetStatisticInfoFunc func(logicID int32) (*common.HccsStatisticInfo, error) -} - -// IsSupported judge whether the collector is supported -func (c *HccsCollector) IsSupported(n *colcommon.NpuCollector) bool { - isSupport := supportedHccsDevices[n.Dmgr.GetDevType()] - logForUnSupportDevice(isSupport, n.Dmgr.GetDevType(), colcommon.GetCacheKey(c), "") - return isSupport -} - -// Describe description of the metric -func (c *HccsCollector) Describe(ch chan<- *prometheus.Desc) { - for _, desc := range hccsTxDescs { - ch <- desc - } - for _, desc := range hccsRxDescs { - ch <- desc - } - for _, desc := range hccsErrDescs { - ch <- desc - } - for _, desc := range hccsBWTxDescs { - ch <- desc - } - for _, desc := range hccsBWRxDescs { - ch <- desc - } - ch <- hccsBWProfilingTime - ch <- hccsBWTotalTx - ch <- hccsBWTotalRx -} - -// CollectToCache collect the metric to cache -func (c *HccsCollector) CollectToCache(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) { - for _, chip := range chipList { - logicID := chip.LogicID - var hccsStatisticInfo *common.HccsStatisticInfo - var err error - if c.realGetStatisticInfoFunc != nil { - hccsStatisticInfo, err = c.realGetStatisticInfoFunc(logicID) - } else { - hccsStatisticInfo = buildFailedHccsInfo() - err = fmt.Errorf("realGetStatisticInfoFunc is nil when get hccs info, " + - "maybe both GetHccsStatisticInfoInU64 and GetHccsStatisticInfo can't be unreached") - } - handleErr(err, colcommon.DomainForHccs, logicID) - - 
hccsBandwidthInfo, err := n.Dmgr.GetHccsBandwidthInfo(logicID) - handleErr(err, colcommon.DomainForHccsBW, logicID) - c.LocalCache.Store(chip.PhyId, hccsCache{ - chip: chip, - timestamp: time.Now(), - hccsStat: hccsStatisticInfo, - hccsBW: hccsBandwidthInfo}, - ) - } - - colcommon.UpdateCache[hccsCache](n, colcommon.GetCacheKey(c), &c.LocalCache) -} - -// PreCollect pre collect hccs info -func (c *HccsCollector) PreCollect(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) { - if len(chipList) == 0 { - return - } - chipOne := chipList[0] - devType := n.Dmgr.GetDevType() - if devType == api.Ascend910B || common.IsA900A3SuperPod(chipOne.MainBoardId) || - common.Is800IA3Chip(chipOne.MainBoardId) { - // A2 or A900A3 SuperPod or 800IA3 begin at 1st bit - c.hccsBeginIndex = num1 - } else if common.IsA9000A3SuperPod(chipOne.MainBoardId) { - // A9000A3SuperPod begin at 2nd bit - c.hccsBeginIndex = num2 - } else { - logger.LogfWithOptions(logger.ErrorLevel, logger.LogOptions{Domain: api.Hccs, ID: "0"}, - "not support main board id:%d", chipOne.MainBoardId) - } - - // Both failed, retry 3 times with 2s interval - const retryTimes = 3 - const retryInterval = 2 * time.Second - var success bool - var err1, err2 error - for i := 0; i < retryTimes; i++ { - _, err1 = n.Dmgr.GetHccsStatisticInfoInU64(chipOne.LogicID) - if err1 == nil { - logger.Infof("get hccs statistic info by subCmd(5) succeeded, will use subCmd(5) to get hccs info") - c.realGetStatisticInfoFunc = n.Dmgr.GetHccsStatisticInfoInU64 - success = true - break - } - _, err2 = n.Dmgr.GetHccsStatisticInfo(chipOne.LogicID) - if err2 == nil { - logger.Infof("get hccs statistic info by subCmd(3) succeeded, will use subCmd(3) to get hccs info") - c.realGetStatisticInfoFunc = n.Dmgr.GetHccsStatisticInfo - success = true - break - } - time.Sleep(retryInterval) - } - // If still failed after retries, set to nil and log error - if !success { - logger.Errorf("get hccs statistic info failed after trying both subCmd(5) 
and subCmd(3) with 3 retries, "+ - "err1: %v, err2: %v", err1, err2) - c.realGetStatisticInfoFunc = nil - } - -} - -// UpdatePrometheus update prometheus -func (c *HccsCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) { - - updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache hccsCache, cardLabel []string) { - timestamp := cache.timestamp - promUpdateHccsStatisticInfo(ch, cache, c, timestamp, cardLabel) - promUpdateHccsBwInfo(ch, cache, c, timestamp, cardLabel) - } - updateFrame[hccsCache](colcommon.GetCacheKey(c), n, containerMap, chips, updateSingleChip) -} - -func promUpdateHccsBwInfo(ch chan<- prometheus.Metric, cache hccsCache, c *HccsCollector, - timestamp time.Time, cardLabel []string) { - bandwidthInfo := cache.hccsBW - if bandwidthInfo == nil { - return - } - if c.hccsBeginIndex < 0 { - logger.Errorf("invalid %sBeginIndex %v", api.Hccs, c.hccsBeginIndex) - return - } - for i := c.hccsBeginIndex; i < MaxHccsNum; i++ { - doUpdateMetric(ch, timestamp, bandwidthInfo.TxBandwidth[i], cardLabel, hccsBWTxDescs[i]) - doUpdateMetric(ch, timestamp, bandwidthInfo.RxBandwidth[i], cardLabel, hccsBWRxDescs[i]) - } - doUpdateMetric(ch, timestamp, bandwidthInfo.ProfilingTime, cardLabel, hccsBWProfilingTime) - doUpdateMetric(ch, timestamp, bandwidthInfo.TotalTxbw, cardLabel, hccsBWTotalTx) - doUpdateMetric(ch, timestamp, bandwidthInfo.TotalRxbw, cardLabel, hccsBWTotalRx) -} - -func promUpdateHccsStatisticInfo(ch chan<- prometheus.Metric, cache hccsCache, c *HccsCollector, - timestamp time.Time, cardLabel []string) { - statisticInfo := cache.hccsStat - - if statisticInfo == nil { - return - } - if c.hccsBeginIndex < 0 { - logger.Errorf("invalid %sBeginIndex %v", api.Hccs, c.hccsBeginIndex) - return - } - for i := c.hccsBeginIndex; i < MaxHccsNum; i++ { - doUpdateMetric(ch, timestamp, statisticInfo.TxCnt[i], cardLabel, hccsTxDescs[i]) - 
doUpdateMetric(ch, timestamp, statisticInfo.RxCnt[i], cardLabel, hccsRxDescs[i]) - doUpdateMetric(ch, timestamp, statisticInfo.CrcErrCnt[i], cardLabel, hccsErrDescs[i]) - } -} - -// UpdateTelegraf update telegraf -func (c *HccsCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} { - - caches := colcommon.GetInfoFromCache[hccsCache](n, colcommon.GetCacheKey(c)) - for _, chip := range chips { - cache, ok := caches[chip.PhyId] - if !ok { - continue - } - fieldMap := getFieldMap(fieldsMap, cache.chip.LogicID) - - telegrafUpdateHccsStatisticInfo(cache, c, fieldMap) - telegrafUpdateHccsBwInfo(cache, c, fieldMap) - } - - return fieldsMap - -} - -func telegrafUpdateHccsBwInfo(cache hccsCache, c *HccsCollector, fieldMap map[string]interface{}) { - bandwidthInfo := cache.hccsBW - if bandwidthInfo == nil || c.hccsBeginIndex < 0 { - return - } - for i := c.hccsBeginIndex; i < MaxHccsNum; i++ { - doUpdateTelegraf(fieldMap, hccsBWTxDescs[i], bandwidthInfo.TxBandwidth[i], "") - doUpdateTelegraf(fieldMap, hccsBWRxDescs[i], bandwidthInfo.RxBandwidth[i], "") - } - doUpdateTelegraf(fieldMap, hccsBWProfilingTime, bandwidthInfo.ProfilingTime, "") - doUpdateTelegraf(fieldMap, hccsBWTotalTx, bandwidthInfo.TotalTxbw, "") - doUpdateTelegraf(fieldMap, hccsBWTotalRx, bandwidthInfo.TotalRxbw, "") -} - -func telegrafUpdateHccsStatisticInfo(cache hccsCache, c *HccsCollector, fieldMap map[string]interface{}) { - statisticInfo := cache.hccsStat - - if statisticInfo == nil || c.hccsBeginIndex < 0 { - return - } - for i := c.hccsBeginIndex; i < MaxHccsNum; i++ { - doUpdateTelegraf(fieldMap, hccsTxDescs[i], statisticInfo.TxCnt[i], "") - doUpdateTelegraf(fieldMap, hccsRxDescs[i], statisticInfo.RxCnt[i], "") - doUpdateTelegraf(fieldMap, hccsErrDescs[i], statisticInfo.CrcErrCnt[i], "") - } -} - -// buildFailedHccsInfo build failed hccs info -func 
buildFailedHccsInfo() *common.HccsStatisticInfo { - errorResult := &common.HccsStatisticInfo{ - TxCnt: make([]uint64, 8), - RxCnt: make([]uint64, 8), - CrcErrCnt: make([]uint64, 8), - } - for i := 0; i < 8; i++ { - errorResult.TxCnt[i] = common.FailedValue - errorResult.RxCnt[i] = common.FailedValue - errorResult.CrcErrCnt[i] = common.FailedValue - } - return errorResult -} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hccs_test.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hccs_test.go deleted file mode 100644 index 4b596df..0000000 --- a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_hccs_test.go +++ /dev/null @@ -1,150 +0,0 @@ -/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package metrics for general collector -package metrics - -import ( - "testing" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" - - "ascend-common/api" - "ascend-common/devmanager/common" - colcommon "huawei.com/npu-exporter/v6/collector/common" -) - -const ( - mockLogicID int32 = 0 - mockMainBoardId uint32 = 100 - errorMsgWith8001 string = "error code 8001 occurred" - errorMsgWithout8001 string = "error code 8002 occurred" - singleChipList int = 1 - unsupportedBoardId uint32 = 999 -) - -type preCollectTestCase struct { - name string - chipList []colcommon.HuaWeiAIChip - devType string - mainBoardId uint32 - isA900A3SuperPod bool - isA9000A3SuperPod bool - is800IA3Chip bool - getStatInfoErr error - expectedBeginIndex int - expectedFuncSet bool -} - -func TestPreCollect(t *testing.T) { - n := mockNewNpuCollector() - testCases := buildPreCollectTestCases() - - for _, tc := range testCases { - convey.Convey(tc.name, t, func() { - patches := gomonkey.NewPatches() - defer patches.Reset() - - setupPatches(patches, n, tc) - collector := &HccsCollector{} - collector.PreCollect(n, tc.chipList) - verifyPreCollectResult(collector, tc) - }) - } -} - -func buildPreCollectTestCases() []preCollectTestCase { - cases := []preCollectTestCase{ - {name: "should return early when chipList is empty", - chipList: []colcommon.HuaWeiAIChip{}, - expectedBeginIndex: 0, - expectedFuncSet: false}, - {name: "should not set beginIndex when mainBoardId is not supported", - chipList: createMockChipList(singleChipList, unsupportedBoardId), - devType: api.Ascend910A3, - mainBoardId: unsupportedBoardId, - getStatInfoErr: nil, - expectedBeginIndex: 0, - expectedFuncSet: true}, - } - cases = append(cases, buildBeginIndexCases()...) 
- return cases -} - -func buildBeginIndexCases() []preCollectTestCase { - return []preCollectTestCase{ - {name: "should set beginIndex to num1 when devType is Ascend910B", - chipList: createMockChipList(singleChipList, mockMainBoardId), - devType: api.Ascend910B, - mainBoardId: mockMainBoardId, - getStatInfoErr: nil, - expectedBeginIndex: num1, - expectedFuncSet: true}, - {name: "should set beginIndex to num1 when IsA900A3SuperPod returns true", - chipList: createMockChipList(singleChipList, mockMainBoardId), - devType: api.Ascend910A3, - mainBoardId: mockMainBoardId, - isA900A3SuperPod: true, - getStatInfoErr: nil, - expectedBeginIndex: num1, - expectedFuncSet: true}, - {name: "should set beginIndex to num1 when Is800IA3Chip returns true", - chipList: createMockChipList(singleChipList, mockMainBoardId), - devType: api.Ascend910A3, - mainBoardId: mockMainBoardId, - is800IA3Chip: true, - getStatInfoErr: nil, - expectedBeginIndex: num1, - expectedFuncSet: true}, - {name: "should set beginIndex to num2 when IsA9000A3SuperPod returns true", - chipList: createMockChipList(singleChipList, mockMainBoardId), - devType: api.Ascend910A3, - mainBoardId: mockMainBoardId, - isA9000A3SuperPod: true, - getStatInfoErr: nil, - expectedBeginIndex: num2, - expectedFuncSet: true}, - } -} - -func createMockChipList(count int, mainBoardId uint32) []colcommon.HuaWeiAIChip { - if count == 0 { - return []colcommon.HuaWeiAIChip{} - } - return []colcommon.HuaWeiAIChip{ - { - LogicID: mockLogicID, - MainBoardId: mainBoardId, - }, - } -} - -func setupPatches(patches *gomonkey.Patches, n *colcommon.NpuCollector, tc preCollectTestCase) { - patches.ApplyMethodReturn(n.Dmgr, "GetDevType", tc.devType) - patches.ApplyFuncReturn(common.IsA900A3SuperPod, tc.isA900A3SuperPod) - patches.ApplyFuncReturn(common.IsA9000A3SuperPod, tc.isA9000A3SuperPod) - patches.ApplyFuncReturn(common.Is800IA3Chip, tc.is800IA3Chip) - patches.ApplyMethodReturn(n.Dmgr, "GetHccsStatisticInfoInU64", - 
&common.HccsStatisticInfo{}, tc.getStatInfoErr) -} - -func verifyPreCollectResult(collector *HccsCollector, tc preCollectTestCase) { - convey.So(collector.hccsBeginIndex, convey.ShouldEqual, tc.expectedBeginIndex) - if tc.expectedFuncSet { - convey.So(collector.realGetStatisticInfoFunc, convey.ShouldNotBeNil) - } else { - convey.So(collector.realGetStatisticInfoFunc, convey.ShouldBeNil) - } -} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_network.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_network.go deleted file mode 100644 index 018a370..0000000 --- a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_network.go +++ /dev/null @@ -1,190 +0,0 @@ -/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package metrics for general collector -package metrics - -import ( - "time" - - "github.com/prometheus/client_golang/prometheus" - - "ascend-common/common-utils/hwlog" - "ascend-common/devmanager/common" - "ascend-common/devmanager/hccn" - colcommon "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" -) - -var ( - // bandwidth - descBandwidthTx = colcommon.BuildDesc("npu_chip_info_bandwidth_tx", - "the npu interface transport speed, unit is 'MB/s'") - descBandwidthRx = colcommon.BuildDesc("npu_chip_info_bandwidth_rx", - "the npu interface receive speed, unit is 'MB/s'") - - // linkspeed - npuChipLinkSpeed = colcommon.BuildDesc("npu_chip_link_speed", - "the npu interface receive link speed, unit is 'Mb/s'") - - // linkupNum - npuChipLinkUpNum = colcommon.BuildDesc("npu_chip_link_up_num", "the npu interface receive link-up num") - - // linkstatus - descLinkStatus = colcommon.BuildDesc("npu_chip_info_link_status", "the npu link status") -) - -type netInfoCache struct { - chip colcommon.HuaWeiAIChip - timestamp time.Time - extInfo *common.NpuNetInfo -} - -// NetworkCollector collects the network info -type NetworkCollector struct { - colcommon.MetricsCollectorAdapter -} - -// IsSupported check if the collector is supported -func (c *NetworkCollector) IsSupported(n *colcommon.NpuCollector) bool { - isSupport := n.Dmgr.IsTrainingCard() - logForUnSupportDevice(isSupport, n.Dmgr.GetDevType(), colcommon.GetCacheKey(c), - "only training card supports network related info") - return isSupport -} - -// Describe description of the metric -func (c *NetworkCollector) Describe(ch chan<- *prometheus.Desc) { - // bandwidth - ch <- descBandwidthTx - ch <- descBandwidthRx - // linkspeed - ch <- npuChipLinkSpeed - // linkupNum - ch <- npuChipLinkUpNum - // linkstatus - ch <- descLinkStatus -} - -// CollectToCache collect the metric to cache -func (c *NetworkCollector) CollectToCache(n *colcommon.NpuCollector, chipList 
[]colcommon.HuaWeiAIChip) { - for _, chip := range chipList { - netInfo := collectNetworkInfo(chip.PhyId) - c.LocalCache.Store(chip.PhyId, netInfoCache{chip: chip, timestamp: time.Now(), extInfo: &netInfo}) - } - colcommon.UpdateCache[netInfoCache](n, colcommon.GetCacheKey(c), &c.LocalCache) -} - -// UpdatePrometheus update prometheus metrics -func (c *NetworkCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) { - - updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache netInfoCache, cardLabel []string) { - netInfo := cache.extInfo - if netInfo == nil { - return - } - time := cache.timestamp - if validateNotNilForEveryElement(netInfo.BandwidthInfo) { - doUpdateMetricWithValidateNum(ch, time, netInfo.BandwidthInfo.TxValue, cardLabel, descBandwidthTx) - doUpdateMetricWithValidateNum(ch, time, netInfo.BandwidthInfo.RxValue, cardLabel, descBandwidthRx) - } - if validateNotNilForEveryElement(netInfo.LinkSpeedInfo) { - doUpdateMetricWithValidateNum(ch, time, netInfo.LinkSpeedInfo.Speed, cardLabel, npuChipLinkSpeed) - } - if validateNotNilForEveryElement(netInfo.LinkStatInfo) { - doUpdateMetricWithValidateNum(ch, time, netInfo.LinkStatInfo.LinkUPNum, cardLabel, npuChipLinkUpNum) - } - if validateNotNilForEveryElement(netInfo.LinkStatusInfo) { - doUpdateMetricWithValidateNum(ch, time, float64(hccn.GetLinkStatusCode(netInfo.LinkStatusInfo.LinkState)), - cardLabel, descLinkStatus) - } - } - updateFrame[netInfoCache](colcommon.GetCacheKey(c), n, containerMap, chips, updateSingleChip) -} - -// UpdateTelegraf update telegraf metrics -func (c *NetworkCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} { - - caches := colcommon.GetInfoFromCache[netInfoCache](n, colcommon.GetCacheKey(c)) - for _, chip 
:= range chips { - cache, ok := caches[chip.PhyId] - if !ok { - continue - } - fieldMap := getFieldMap(fieldsMap, cache.chip.LogicID) - extInfo := cache.extInfo - if extInfo == nil { - continue - } - if validateNotNilForEveryElement(extInfo.BandwidthInfo) { - doUpdateTelegrafWithValidateNum(fieldMap, descBandwidthTx, extInfo.BandwidthInfo.TxValue, "") - doUpdateTelegrafWithValidateNum(fieldMap, descBandwidthRx, extInfo.BandwidthInfo.RxValue, "") - } - if validateNotNilForEveryElement(extInfo.LinkSpeedInfo) { - doUpdateTelegrafWithValidateNum(fieldMap, npuChipLinkSpeed, extInfo.LinkSpeedInfo.Speed, "") - } - if validateNotNilForEveryElement(extInfo.LinkStatInfo) { - doUpdateTelegrafWithValidateNum(fieldMap, npuChipLinkUpNum, extInfo.LinkStatInfo.LinkUPNum, "") - } - if validateNotNilForEveryElement(extInfo.LinkStatusInfo) { - doUpdateTelegrafWithValidateNum(fieldMap, descLinkStatus, - float64(hccn.GetLinkStatusCode(extInfo.LinkStatusInfo.LinkState)), "") - } - } - return fieldsMap -} - -func collectNetworkInfo(phyID int32) common.NpuNetInfo { - newNetInfo := common.NpuNetInfo{} - - newNetInfo.LinkStatusInfo = &common.LinkStatusInfo{} - if linkState, err := hccn.GetNPULinkStatus(phyID); err == nil { - newNetInfo.LinkStatusInfo.LinkState = linkState - hwlog.ResetErrCnt(colcommon.DomainForLinkState, phyID) - } else { - logErrMetricsWithLimit(colcommon.DomainForLinkState, phyID, err) - newNetInfo.LinkStatusInfo.LinkState = colcommon.Abnormal - } - - if tx, rx, err := hccn.GetNPUInterfaceTraffic(phyID); err == nil { - newNetInfo.BandwidthInfo = &common.BandwidthInfo{} - newNetInfo.BandwidthInfo.RxValue = rx - newNetInfo.BandwidthInfo.TxValue = tx - hwlog.ResetErrCnt(colcommon.DomainForBandwidth, phyID) - } else { - newNetInfo.BandwidthInfo = nil - logErrMetricsWithLimit(colcommon.DomainForBandwidth, phyID, err) - } - if linkUpNum, err := hccn.GetNPULinkUpNum(phyID); err == nil { - newNetInfo.LinkStatInfo = &common.LinkStatInfo{} - newNetInfo.LinkStatInfo.LinkUPNum = 
float64(linkUpNum) - hwlog.ResetErrCnt(colcommon.DomainForLinkStat, phyID) - } else { - newNetInfo.LinkStatInfo = nil - logErrMetricsWithLimit(colcommon.DomainForLinkStat, phyID, err) - } - - if speed, err := hccn.GetNPULinkSpeed(phyID); err == nil { - newNetInfo.LinkSpeedInfo = &common.LinkSpeedInfo{} - newNetInfo.LinkSpeedInfo.Speed = float64(speed) - hwlog.ResetErrCnt(colcommon.DomainForLinkSpeed, phyID) - } else { - newNetInfo.LinkSpeedInfo = nil - logErrMetricsWithLimit(colcommon.DomainForLinkSpeed, phyID, err) - } - - return newNetInfo -} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_npu.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_npu.go deleted file mode 100644 index 975ffcf..0000000 --- a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_npu.go +++ /dev/null @@ -1,453 +0,0 @@ -/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package metrics for general collector -package metrics - -import ( - "math" - "strconv" - "strings" - "time" - - "github.com/prometheus/client_golang/prometheus" - - "ascend-common/api" - "ascend-common/devmanager" - "ascend-common/devmanager/common" - colcommon "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" - "huawei.com/npu-exporter/v6/utils/logger" -) - -var ( - errorCodeDescs []*prometheus.Desc - cardLabelForProcess = append(colcommon.CardLabel, "process_id", "container_id") - cardLabelForContainer []string - cardLabelForSN []string - cardLabelForNpuName = make([]string, len(colcommon.CardLabel)) -) - -var ( - machineInfoNPUDesc = colcommon.BuildDescWithLabel("machine_npu_nums", "Amount of npu installed on the machine.", nil) - - descUtil = colcommon.BuildDesc("npu_chip_info_utilization", "the ai core utilization") - descOverUtil = colcommon.BuildDesc("npu_chip_info_overall_utilization", "the overall utilization of npu") - descVectorUtil = colcommon.BuildDesc("npu_chip_info_vector_utilization", "the vector ai core utilization") - descTemp = colcommon.BuildDesc("npu_chip_info_temperature", "the npu temperature") - descPower = colcommon.BuildDesc("npu_chip_info_power", "the npu power") - descVoltage = colcommon.BuildDesc("npu_chip_info_voltage", "the npu voltage") - - descAICoreFreq = colcommon.BuildDesc("npu_chip_info_aicore_current_freq", - "the npu ai core current frequency, unit is 'MHz'") - descHealthStatus = colcommon.BuildDesc("npu_chip_info_health_status", "the npu health status") - descDevProcessNum = colcommon.BuildDesc("npu_chip_info_process_info_num", - "the npu process num") - - descDevProcessInfo = colcommon.BuildDescWithLabel("npu_chip_info_process_info", - "the npu process info, unit is 'MB'. 
if process run on host, container_id and container_name will be empty", - cardLabelForProcess) - - // net status - descNetworkStatus = colcommon.BuildDesc("npu_chip_info_network_status", "the npu network health status") - - // container (vnpu not support this metrics), only report to prometheus - npuCtrUtilization = colcommon.BuildDesc("container_npu_utilization", - "npu ai core utilization in container, unit is '%'") - npuCtrTotalMemory = colcommon.BuildDesc("container_npu_total_memory", - "npu total memory in container, unit is 'MB'") - npuCtrUsedMemory = colcommon.BuildDesc("container_npu_used_memory", - "the npu used memory in container, unit is 'MB'") - - npuCtrInfo *prometheus.Desc = nil - descNpuName *prometheus.Desc = nil - descNPUSerialNumber *prometheus.Desc = nil -) - -func init() { - - colcommon.BuildDescSlice(&errorCodeDescs, "npu_chip_info_error_code", "the npu error code") - for i := 1; i < common.MaxErrorCodeLen; i++ { - colcommon.BuildDescSlice(&errorCodeDescs, "npu_chip_info_error_code_"+strconv.Itoa(i), "the npu error code") - } - - cardLabelForContainer = append(colcommon.CardLabel, "containerID", "containerName") - cardLabelForContainer[0] = "npuID" - npuCtrInfo = colcommon.BuildDescWithLabel("npu_container_info", "the container name and deviceID relationship", - cardLabelForContainer) - - cardLabelForSN = append(colcommon.CardLabel, "serial_number") - // NPU SN related metrics - descNPUSerialNumber = colcommon.BuildDescWithLabel("npu_chip_info_serial_number", - "the npu serial number information", cardLabelForSN) - - copy(cardLabelForNpuName, colcommon.CardLabel) - cardLabelForNpuName[1] = "name" - descNpuName = colcommon.BuildDescWithLabel("npu_chip_info_name", "the Ascend npu name with value '1'", - cardLabelForNpuName) -} - -type chipCache struct { - chip colcommon.HuaWeiAIChip - timestamp time.Time - - // the healthy status of the AI chip - HealthStatus string `json:"health_status"` - // the all error codes of the chip - ErrorCodes []int64 
`json:"error_codes"` - // the utilization of the chip - Utilization int `json:"utilization"` - // the overall utilization of the chip - OverallUtilization int `json:"overall_utilization"` - // the vector utilization of the chip - VectorUtilization int `json:"vector_utilization"` - // the temperature of the chip - Temperature int `json:"temperature"` - // the work power of the chip - Power float32 `json:"power"` - // the work voltage of the chip - Voltage float32 `json:"voltage"` - // the AI core current frequency of the chip - AICoreCurrentFreq uint32 `json:"aicore_current_freq"` - // NetHealthStatus chip network health status - NetHealthStatus string `json:"net_health_status"` - // DevProcessInfo chip process info - DevProcessInfo *common.DevProcessInfo -} - -// BaseInfoCollector collects the base info of the chip -type BaseInfoCollector struct { - colcommon.MetricsCollectorAdapter -} - -// Describe collects the base info of the chip -func (c *BaseInfoCollector) Describe(ch chan<- *prometheus.Desc) { - // base info - ch <- machineInfoNPUDesc - ch <- descUtil - ch <- descVectorUtil - ch <- descOverUtil - ch <- descTemp - ch <- descPower - ch <- descVoltage - ch <- descHealthStatus - ch <- descNpuName - ch <- descAICoreFreq - ch <- descNPUSerialNumber - ch <- descDevProcessInfo - // status - ch <- descNetworkStatus - // container - ch <- npuCtrInfo - ch <- npuCtrUtilization - ch <- npuCtrTotalMemory - ch <- npuCtrUsedMemory - - // error code - for _, desc := range errorCodeDescs { - ch <- desc - } -} - -// CollectToCache collects the base info of the chip -func (c *BaseInfoCollector) CollectToCache(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) { - for _, chip := range chipList { - logicID := chip.LogicID - - dmgr := n.Dmgr - - freq, err := dmgr.GetDeviceFrequency(logicID, common.AICoreCurrentFreq) - if err != nil { - freq = common.UnRetError - } - temp, err := dmgr.GetDeviceTemperature(logicID) - if err != nil { - temp = common.RetError - } - vol, 
err := dmgr.GetDeviceVoltage(logicID) - if err != nil { - vol = common.UnRetError - } - - _, errCodes, err := dmgr.GetDeviceAllErrorCode(logicID) - if err != nil { - errCodes = make([]int64, 0) - } - - cache := &chipCache{ - chip: chip, - AICoreCurrentFreq: freq, - Temperature: int(temp), - Voltage: vol, - HealthStatus: getHealth(logicID, dmgr), - ErrorCodes: errCodes, - } - collectPower(logicID, dmgr, cache) - collectUtil(logicID, dmgr, cache) - setNetHealthStatus(logicID, dmgr, cache) - setProcessInfo(logicID, dmgr, cache) - - cache.timestamp = time.Now() - c.LocalCache.Store(chip.PhyId, *cache) - } - colcommon.UpdateCache[chipCache](n, colcommon.GetCacheKey(c), &c.LocalCache) -} - -func collectPower(logicID int32, dmgr devmanager.DeviceInterface, chip *chipCache) { - if dmgr.GetDevType() == api.Ascend310P { - cardPower, err := dmgr.GetMcuPowerInfo(chip.chip.CardId) - handleErr(err, colcommon.DomainForMcuPower, chip.chip.CardId) - // Ascend310P use cardPower to replace chipPower - chip.Power = cardPower - } else { - power, err := dmgr.GetDevicePowerInfo(logicID) - handleErr(err, colcommon.DomainForChipPower, logicID) - chip.Power = power - } -} - -// UpdatePrometheus updates the base info of the chip -func (c *BaseInfoCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) { - - updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache chipCache, cardLabel []string) { - containerInfo := geenContainerInfo(&chipWithVnpu, containerMap) - timestamp := cache.timestamp - doUpdateMetricWithValidateNum(ch, timestamp, float64(cache.Power), cardLabel, descPower) - doUpdateMetricWithValidateNum(ch, timestamp, float64(cache.Voltage), cardLabel, descVoltage) - doUpdateMetricWithValidateNum(ch, timestamp, float64(cache.AICoreCurrentFreq), cardLabel, descAICoreFreq) - doUpdateMetricWithValidateNum(ch, timestamp, float64(cache.Temperature), cardLabel, descTemp) - 
doUpdateMetricWithValidateNum(ch, timestamp, float64(cache.Utilization), cardLabel, descUtil) - doUpdateMetricWithValidateNum(ch, timestamp, float64(cache.OverallUtilization), cardLabel, descOverUtil) - doUpdateMetricWithValidateNum(ch, timestamp, float64(cache.VectorUtilization), cardLabel, descVectorUtil) - doUpdateMetricWithValidateNum(ch, timestamp, 1, cardLabel, descNpuName) - doUpdateMetricWithValidateNum(ch, timestamp, float64(getHealthCode(cache.HealthStatus)), cardLabel, descHealthStatus) - doUpdateMetricWithValidateNum(ch, timestamp, float64(getHealthCode(cache.NetHealthStatus)), - cardLabel, descNetworkStatus) - - updateContainerInfo(ch, containerInfo, cardLabel, &cache, chipWithVnpu) - - updateProcessInfoForPrometheus(ch, &cache, containerInfo, timestamp, cardLabel) - updateErrorCodesInfo(ch, &cache, timestamp, cardLabel) - // Update NPU serial number info - if cache.chip.ElabelInfo != nil { - snLabel := append(cardLabel, cache.chip.ElabelInfo.SerialNumber) - doUpdateMetricWithValidateNum(ch, timestamp, 1, snLabel, descNPUSerialNumber) - } - } - updateFrame[chipCache](colcommon.GetCacheKey(c), n, containerMap, chips, updateSingleChip) - - ch <- prometheus.MustNewConstMetric(machineInfoNPUDesc, prometheus.GaugeValue, float64(len(chips))) -} - -func updateContainerInfo(ch chan<- prometheus.Metric, containerInfo container.DevicesInfo, - cardLabel []string, chip *chipCache, chipWithVnpu colcommon.HuaWeiAIChip) { - containerName := getContainerNameArray(containerInfo) - if len(containerName) != colcommon.ContainerNameLen { - return - } - // based on chipType , container_npu_total_memory、container_npu_used_memory reported in hbm or ddr group - doUpdateMetric(ch, chip.timestamp, 1, append(cardLabel, containerInfo.ID, strings.Join(containerName, "_")), - npuCtrInfo) - - // vnpu not support this metrics - vDevActivityInfo := chipWithVnpu.VDevActivityInfo - if vDevActivityInfo != nil && common.IsValidVDevID(vDevActivityInfo.VDevID) { - return - } - - 
doUpdateMetricWithValidateNum(ch, chip.timestamp, float64(chip.Utilization), cardLabel, npuCtrUtilization) -} - -func updateErrorCodesInfo(ch chan<- prometheus.Metric, chip *chipCache, timestamp time.Time, cardLabel []string) { - if len(chip.ErrorCodes) > common.MaxErrorCodeLen { - logger.Warnf("Error code number is larger than %v, only the first %v will be reported, "+ - "all errorCode is: %v", common.MaxErrorCodeLen, common.MaxErrorCodeLen, chip.ErrorCodes) - } - for i := 0; i < len(chip.ErrorCodes) && i < len(errorCodeDescs); i++ { - doUpdateMetricWithValidateNum(ch, timestamp, float64(chip.ErrorCodes[i]), cardLabel, errorCodeDescs[i]) - } -} - -func updateProcessInfoForPrometheus(ch chan<- prometheus.Metric, chip *chipCache, - containerInfo container.DevicesInfo, timestamp time.Time, cardLabel []string) { - devProcessInfo := chip.DevProcessInfo - if devProcessInfo == nil { - return - } - doUpdateMetric(ch, timestamp, devProcessInfo.ProcNum, cardLabel, descDevProcessNum) - - containerID := "" - containerName := "" - cNameArray := getContainerNameArray(containerInfo) - if len(cNameArray) == colcommon.ContainerNameLen { - containerID = containerInfo.ID - containerName = strings.Join(cNameArray, "_") - } - - newCardLabel := make([]string, len(cardLabel)) - copy(newCardLabel, cardLabel) - // containerName in process info is namespace_podName_containerName - newCardLabel[len(newCardLabel)-1] = containerName - - if devProcessInfo.ProcNum == 0 { - doUpdateMetric(ch, timestamp, 0, append(newCardLabel, "", containerID), descDevProcessInfo) - return - } - - for i := int32(0); i < devProcessInfo.ProcNum; i++ { - procInfo := devProcessInfo.DevProcArray[i] - doUpdateMetric(ch, timestamp, procInfo.MemUsage, - append(newCardLabel, strconv.FormatInt(int64(procInfo.Pid), colcommon.Base), containerID), descDevProcessInfo) - } -} - -// UpdateTelegraf updates the base info of the chip -func (c *BaseInfoCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n 
*colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} { - caches := colcommon.GetInfoFromCache[chipCache](n, colcommon.GetCacheKey(c)) - for _, chip := range chips { - cache, ok := caches[chip.PhyId] - if !ok { - continue - } - fieldMap := getFieldMap(fieldsMap, cache.chip.LogicID) - - doUpdateTelegrafWithValidateNum(fieldMap, descTemp, float64(cache.Temperature), "") - doUpdateTelegrafWithValidateNum(fieldMap, descPower, float64(cache.Power), "") - doUpdateTelegrafWithValidateNum(fieldMap, descVoltage, float64(cache.Voltage), "") - doUpdateTelegrafWithValidateNum(fieldMap, descAICoreFreq, float64(cache.AICoreCurrentFreq), "") - doUpdateTelegrafWithValidateNum(fieldMap, descUtil, float64(cache.Utilization), "") - doUpdateTelegrafWithValidateNum(fieldMap, descVectorUtil, float64(cache.VectorUtilization), "") - doUpdateTelegrafWithValidateNum(fieldMap, descOverUtil, float64(cache.OverallUtilization), "") - doUpdateTelegrafWithValidateNum(fieldMap, descHealthStatus, float64(getHealthCode(cache.HealthStatus)), "") - doUpdateTelegrafWithValidateNum(fieldMap, descNetworkStatus, float64(getHealthCode(cache.NetHealthStatus)), "") - doUpdateTelegraf(fieldMap, descNpuName, chip.ChipInfo.Name, "") - - updateProcessInfoForTelegraf(&cache, fieldMap) - updateErrorCode(&cache, fieldMap) - // Update NPU serial number info - if cache.chip.ElabelInfo != nil { - doUpdateTelegraf(fieldMap, descNPUSerialNumber, cache.chip.ElabelInfo.SerialNumber, "") - } - - } - - if fieldsMap[colcommon.GeneralDevTagKey] == nil { - fieldsMap[colcommon.GeneralDevTagKey] = make(map[string]interface{}) - } - doUpdateTelegraf(fieldsMap[colcommon.GeneralDevTagKey], machineInfoNPUDesc, len(chips), "") - return fieldsMap -} - -func updateErrorCode(chip *chipCache, fieldMap map[string]interface{}) { - if len(errorCodeDescs) == 0 { - return - } - descErrorCode := errorCodeDescs[0] - for i := 0; i < len(chip.ErrorCodes); i++ { - 
extInfo := "" - if i != 0 { - extInfo = "_" + strconv.Itoa(i) - } - doUpdateTelegrafWithValidateNum(fieldMap, descErrorCode, float64(chip.ErrorCodes[i]), extInfo) - } -} - -func updateProcessInfoForTelegraf(chip *chipCache, fieldMap map[string]interface{}) { - devProcessInfo := chip.DevProcessInfo - doUpdateTelegraf(fieldMap, descDevProcessNum, devProcessInfo.ProcNum, "") - if devProcessInfo.ProcNum == 0 { - doUpdateTelegraf(fieldMap, descDevProcessInfo, 0, "") - return - } - for i := int32(0); i < devProcessInfo.ProcNum; i++ { - procInfo := devProcessInfo.DevProcArray[i] - doUpdateTelegraf(fieldMap, descDevProcessInfo, procInfo.MemUsage, "_"+strconv.Itoa(int(procInfo.Pid))) - } -} - -func collectUtil(logicID int32, dmgr devmanager.DeviceInterface, chip *chipCache) { - util, err := dmgr.GetDeviceUtilizationRate(logicID, common.AICore) - handleErr(err, colcommon.DomainForAICoreUtilization, logicID) - chip.Utilization = int(util) - - overAllUtil, err := dmgr.GetDeviceUtilizationRate(logicID, common.Overall) - handleErr(err, colcommon.DomainForOverallUtilization, logicID) - chip.OverallUtilization = int(overAllUtil) - - vecUtil, err := dmgr.GetDeviceUtilizationRate(logicID, common.VectorCore) - handleErr(err, colcommon.DomainForVectorCoreUtilization, logicID) - chip.VectorUtilization = int(vecUtil) -} - -func setNetHealthStatus(logicID int32, dmgr devmanager.DeviceInterface, chip *chipCache) { - chip.NetHealthStatus = colcommon.Abnormal - if !dmgr.IsTrainingCard() { - return - } - - netCode, err := dmgr.GetDeviceNetWorkHealth(logicID) - logger.Debugf("chip %d network healthy code is %d", logicID, netCode) - if err != nil { - netCode = math.MaxUint32 - } - chip.NetHealthStatus = getNetworkHealthy(netCode) -} - -func getNetworkHealthy(netCode uint32) string { - if netCode == math.MaxUint32 { - return colcommon.Abnormal - } - - if netCode == common.NetworkInit || netCode == common.NetworkSuccess { - return colcommon.Healthy - } - - return colcommon.UnHealthy -} - -func 
getHealth(logicID int32, dmgr devmanager.DeviceInterface) string { - health, err := dmgr.GetDeviceHealth(logicID) - if err != nil || health != 0 { - return colcommon.UnHealthy - } - return colcommon.Healthy -} - -func getHealthCode(health string) int { - if health == colcommon.Abnormal { - return common.RetError - } - - if colcommon.Healthy == health { - return 1 - } - return 0 -} - -func setProcessInfo(logicID int32, dmgr devmanager.DeviceInterface, hwChip *chipCache) { - productTypes := dmgr.GetProductTypeArray() - info, err := dmgr.GetDevProcessInfo(logicID) - if err != nil { - if len(productTypes) == 1 && productTypes[0] == common.Atlas200ISoc { - logger.Debugf("process info is not supported on %s", common.Atlas200ISoc) - hwChip.DevProcessInfo = &common.DevProcessInfo{} - return - } - handleErr(err, colcommon.DomainForProcess, logicID) - info = &common.DevProcessInfo{} - } - hwChip.DevProcessInfo = info -} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_optical.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_optical.go deleted file mode 100644 index ca49804..0000000 --- a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_optical.go +++ /dev/null @@ -1,200 +0,0 @@ -/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package metrics for general collector -package metrics - -import ( - "time" - - "github.com/prometheus/client_golang/prometheus" - - "ascend-common/common-utils/hwlog" - "ascend-common/devmanager/common" - "ascend-common/devmanager/hccn" - colcommon "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" -) - -const ( - txPower0 = "Tx_Power0" - txPower1 = "Tx_Power1" - txPower2 = "Tx_Power2" - txPower3 = "Tx_Power3" - - rxPower0 = "Rx_Power0" - rxPower1 = "Rx_Power1" - rxPower2 = "Rx_Power2" - rxPower3 = "Rx_Power3" - - notPresent = "not present" - present = "present" - temperature = "temperature" - voltage = "Vcc" -) - -var ( - - // optical - descOpticalState = colcommon.BuildDesc("npu_chip_optical_state", "the npu interface receive optical-state") - descOpticalVcc = colcommon.BuildDesc("npu_chip_optical_vcc", "the npu interface receive optical-vcc") - descOpticalTemp = colcommon.BuildDesc("npu_chip_optical_temp", "the npu interface receive optical-temperature") - descOpticalTxPower0 = colcommon.BuildDesc("npu_chip_optical_tx_power_0", "npu interface receive optical-tx-power-0") - descOpticalTxPower1 = colcommon.BuildDesc("npu_chip_optical_tx_power_1", "npu interface receive optical-tx-power-1") - descOpticalTxPower2 = colcommon.BuildDesc("npu_chip_optical_tx_power_2", "npu interface receive optical-tx-power-2") - descOpticalTxPower3 = colcommon.BuildDesc("npu_chip_optical_tx_power_3", "npu interface receive optical-tx-power-3") - - descOpticalRxPower0 = colcommon.BuildDesc("npu_chip_optical_rx_power_0", "npu interface receive optical-rx-power-0") - descOpticalRxPower1 = colcommon.BuildDesc("npu_chip_optical_rx_power_1", "npu interface receive optical-rx-power-1") - descOpticalRxPower2 = colcommon.BuildDesc("npu_chip_optical_rx_power_2", "npu interface receive optical-rx-power-2") - descOpticalRxPower3 = colcommon.BuildDesc("npu_chip_optical_rx_power_3", "npu interface receive optical-rx-power-3") -) - -type 
opticalCache struct { - chip colcommon.HuaWeiAIChip - timestamp time.Time - // extInfo indicates the optical module information - extInfo *common.OpticalInfo -} - -// OpticalCollector collect the optical metrics -type OpticalCollector struct { - colcommon.MetricsCollectorAdapter -} - -// IsSupported judge whether the collector is supported -func (c *OpticalCollector) IsSupported(n *colcommon.NpuCollector) bool { - isSupport := n.Dmgr.IsTrainingCard() - logForUnSupportDevice(isSupport, n.Dmgr.GetDevType(), colcommon.GetCacheKey(c), - "only training card supports network related info") - return isSupport -} - -// Describe description of the metric -func (c *OpticalCollector) Describe(ch chan<- *prometheus.Desc) { - // optical - ch <- descOpticalState - ch <- descOpticalTxPower0 - ch <- descOpticalTxPower1 - ch <- descOpticalTxPower2 - ch <- descOpticalTxPower3 - ch <- descOpticalRxPower0 - ch <- descOpticalRxPower1 - ch <- descOpticalRxPower2 - ch <- descOpticalRxPower3 - ch <- descOpticalVcc - ch <- descOpticalTemp -} - -// CollectToCache collect the metric to cache -func (c *OpticalCollector) CollectToCache(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) { - for _, chip := range chipList { - opticalInfo, err := hccn.GetNPUOpticalInfo(chip.PhyId) - if err != nil { - logErrMetricsWithLimit(colcommon.DomainForOptical, chip.PhyId, err) - continue - } - hwlog.ResetErrCnt(colcommon.DomainForOptical, chip.PhyId) - info := getMainOptInfo(opticalInfo) - c.LocalCache.Store(chip.PhyId, opticalCache{chip: chip, timestamp: time.Now(), extInfo: info}) - } - colcommon.UpdateCache[opticalCache](n, colcommon.GetCacheKey(c), &c.LocalCache) -} - -// UpdatePrometheus update prometheus metrics -func (c *OpticalCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) { - - updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache opticalCache, cardLabel 
[]string) { - opticalInfo := cache.extInfo - if opticalInfo == nil { - return - } - timestamp := cache.timestamp - doUpdateMetricWithValidateNum(ch, timestamp, opticalInfo.OpticalState, cardLabel, descOpticalState) - doUpdateMetricWithValidateNum(ch, timestamp, opticalInfo.OpticalVcc, cardLabel, descOpticalVcc) - doUpdateMetricWithValidateNum(ch, timestamp, opticalInfo.OpticalTemp, cardLabel, descOpticalTemp) - - doUpdateMetricWithValidateNum(ch, timestamp, opticalInfo.OpticalTxPower0, cardLabel, descOpticalTxPower0) - doUpdateMetricWithValidateNum(ch, timestamp, opticalInfo.OpticalTxPower1, cardLabel, descOpticalTxPower1) - doUpdateMetricWithValidateNum(ch, timestamp, opticalInfo.OpticalTxPower2, cardLabel, descOpticalTxPower2) - doUpdateMetricWithValidateNum(ch, timestamp, opticalInfo.OpticalTxPower3, cardLabel, descOpticalTxPower3) - - doUpdateMetricWithValidateNum(ch, timestamp, opticalInfo.OpticalRxPower0, cardLabel, descOpticalRxPower0) - doUpdateMetricWithValidateNum(ch, timestamp, opticalInfo.OpticalRxPower1, cardLabel, descOpticalRxPower1) - doUpdateMetricWithValidateNum(ch, timestamp, opticalInfo.OpticalRxPower2, cardLabel, descOpticalRxPower2) - doUpdateMetricWithValidateNum(ch, timestamp, opticalInfo.OpticalRxPower3, cardLabel, descOpticalRxPower3) - } - - updateFrame[opticalCache](colcommon.GetCacheKey(c), n, containerMap, chips, updateSingleChip) - -} - -// UpdateTelegraf update telegraf metrics -func (c *OpticalCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} { - - caches := colcommon.GetInfoFromCache[opticalCache](n, colcommon.GetCacheKey(c)) - for _, chip := range chips { - cache, ok := caches[chip.PhyId] - if !ok { - continue - } - fieldMap := getFieldMap(fieldsMap, cache.chip.LogicID) - - extInfo := cache.extInfo - if extInfo == nil { - continue - } - 
doUpdateTelegrafWithValidateNum(fieldMap, descOpticalState, extInfo.OpticalState, "") - doUpdateTelegrafWithValidateNum(fieldMap, descOpticalVcc, extInfo.OpticalVcc, "") - doUpdateTelegrafWithValidateNum(fieldMap, descOpticalTemp, extInfo.OpticalTemp, "") - - doUpdateTelegrafWithValidateNum(fieldMap, descOpticalTxPower0, extInfo.OpticalTxPower0, "") - doUpdateTelegrafWithValidateNum(fieldMap, descOpticalTxPower1, extInfo.OpticalTxPower1, "") - doUpdateTelegrafWithValidateNum(fieldMap, descOpticalTxPower2, extInfo.OpticalTxPower2, "") - doUpdateTelegrafWithValidateNum(fieldMap, descOpticalTxPower3, extInfo.OpticalTxPower3, "") - - doUpdateTelegrafWithValidateNum(fieldMap, descOpticalRxPower0, extInfo.OpticalRxPower0, "") - doUpdateTelegrafWithValidateNum(fieldMap, descOpticalRxPower1, extInfo.OpticalRxPower1, "") - doUpdateTelegrafWithValidateNum(fieldMap, descOpticalRxPower2, extInfo.OpticalRxPower2, "") - doUpdateTelegrafWithValidateNum(fieldMap, descOpticalRxPower3, extInfo.OpticalRxPower3, "") - } - return fieldsMap -} - -func getMainOptInfo(opticalInfo map[string]string) *common.OpticalInfo { - mainOpticalInfo := common.OpticalInfo{} - mainOpticalInfo.OpticalTxPower0 = hccn.GetFloatDataFromStr(opticalInfo[txPower0], txPower0) - mainOpticalInfo.OpticalTxPower1 = hccn.GetFloatDataFromStr(opticalInfo[txPower1], txPower1) - mainOpticalInfo.OpticalTxPower2 = hccn.GetFloatDataFromStr(opticalInfo[txPower2], txPower2) - mainOpticalInfo.OpticalTxPower3 = hccn.GetFloatDataFromStr(opticalInfo[txPower3], txPower3) - mainOpticalInfo.OpticalRxPower0 = hccn.GetFloatDataFromStr(opticalInfo[rxPower0], rxPower0) - mainOpticalInfo.OpticalRxPower1 = hccn.GetFloatDataFromStr(opticalInfo[rxPower1], rxPower1) - mainOpticalInfo.OpticalRxPower2 = hccn.GetFloatDataFromStr(opticalInfo[rxPower2], rxPower2) - mainOpticalInfo.OpticalRxPower3 = hccn.GetFloatDataFromStr(opticalInfo[rxPower3], rxPower3) - mainOpticalInfo.OpticalVcc = hccn.GetFloatDataFromStr(opticalInfo[voltage], voltage) - 
mainOpticalInfo.OpticalTemp = hccn.GetFloatDataFromStr(opticalInfo[temperature], temperature) - var optState float64 - if opticalInfo[present] == present { - optState = 1.0 - } else if opticalInfo[present] == notPresent { - optState = 0.0 - } else { - optState = common.RetError - } - mainOpticalInfo.OpticalState = optState - - return &mainOpticalInfo -} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_pcie.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_pcie.go deleted file mode 100644 index f68f95b..0000000 --- a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_pcie.go +++ /dev/null @@ -1,234 +0,0 @@ -/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package metrics for general collector -package metrics - -import ( - "time" - - "github.com/prometheus/client_golang/prometheus" - - "ascend-common/api" - "ascend-common/common-utils/hwlog" - "ascend-common/devmanager/common" - colcommon "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" - "huawei.com/npu-exporter/v6/utils/logger" -) - -const ( - pcieBwType = "pcie_bw_type" - avgPcieBw = "avgPcieBw" - minPcieBw = "minPcieBw" - maxPcieBw = "maxPcieBw" - - avgPostfix = "_avgPcieBw" - minPostfix = "_minPcieBw" - maxPostfix = "_maxPcieBw" -) - -var ( - pcieBwLabel = append(colcommon.CardLabel, pcieBwType) - - descRxPBW = colcommon.BuildDescWithLabel("npu_chip_info_pcie_rx_p_bw", - "the npu write bw to remote‘s speed, unit is 'MB/ms'", pcieBwLabel) - - descRxNpBW = colcommon.BuildDescWithLabel("npu_chip_info_pcie_rx_np_bw", - "the npu read bw's speed from remote, unit is 'MB/ms'", pcieBwLabel) - - descRxCplBW = colcommon.BuildDescWithLabel("npu_chip_info_pcie_rx_cpl_bw", - "the npu reply remote read operate cpl's speed, unit is 'MB/ms'", pcieBwLabel) - - descTxPBW = colcommon.BuildDescWithLabel("npu_chip_info_pcie_tx_p_bw", - "the npu receive remote write operate's speed, unit is 'MB/ms'", pcieBwLabel) - - descTxNpBW = colcommon.BuildDescWithLabel("npu_chip_info_pcie_tx_np_bw", - "the npu receive remote read operate's speed, unit is 'MB/ms'", pcieBwLabel) - - descTxCplBW = colcommon.BuildDescWithLabel("npu_chip_info_pcie_tx_cpl_bw", - "the npu read cpl's responese bw speed from remote, unit is 'MB/ms'", pcieBwLabel) -) -var ( - supportedPcieDevices = map[string]bool{ - api.Ascend910B: true, - } -) - -type pcieCache struct { - chip colcommon.HuaWeiAIChip - timestamp time.Time - // extInfo pcie transport and receive bandwidth, have six metrics - extInfo *common.PCIEBwStat -} - -// PcieCollector collect pcie info -type PcieCollector struct { - colcommon.MetricsCollectorAdapter -} - -// IsSupported check whether the 
collector is supported -func (c *PcieCollector) IsSupported(n *colcommon.NpuCollector) bool { - // only 910A2 supports pcie info - isSupport := supportedPcieDevices[n.Dmgr.GetDevType()] - logForUnSupportDevice(isSupport, n.Dmgr.GetDevType(), colcommon.GetCacheKey(c), "") - return isSupport -} - -// Describe description of the metric -func (c *PcieCollector) Describe(ch chan<- *prometheus.Desc) { - ch <- descRxPBW - ch <- descTxPBW - ch <- descRxNpBW - ch <- descTxNpBW - ch <- descRxCplBW - ch <- descTxCplBW -} - -// CollectToCache collect the metric to cache -func (c *PcieCollector) CollectToCache(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) { - for _, chip := range chipList { - pcieBwInfo, err := n.Dmgr.GetPCIEBandwidth(chip.LogicID, common.ProfilingTime) - if err != nil { - logErrMetricsWithLimit(colcommon.DomainForPcieBandwidth, chip.LogicID, err) - continue - } - hwlog.ResetErrCnt(colcommon.DomainForPcieBandwidth, chip.LogicID) - c.LocalCache.Store(chip.PhyId, pcieCache{chip: chip, timestamp: time.Now(), extInfo: &pcieBwInfo}) - } - colcommon.UpdateCache[pcieCache](n, colcommon.GetCacheKey(c), &c.LocalCache) -} - -// UpdatePrometheus update prometheus metrics -func (c *PcieCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) { - - updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache pcieCache, cardLabel []string) { - pcieBwInfo := cache.extInfo - if pcieBwInfo == nil { - return - } - - if cache.chip.VDevActivityInfo != nil && common.IsValidVDevID(cache.chip.VDevActivityInfo.VDevID) { - logger.Debug("vnpu does not supports pcie info query") - return - } - - timestamp := cache.timestamp - - updateAvgPcieBwInfo(ch, timestamp, pcieBwInfo, cardLabel) - updateMinPcieBwInfo(ch, timestamp, pcieBwInfo, cardLabel) - updateMaxPcieBwInfo(ch, timestamp, pcieBwInfo, cardLabel) - } - - updateFrame[pcieCache](colcommon.GetCacheKey(c), 
n, containerMap, chips, updateSingleChip) - -} - -// UpdateTelegraf update telegraf metrics -func (c *PcieCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} { - - caches := colcommon.GetInfoFromCache[pcieCache](n, colcommon.GetCacheKey(c)) - for _, chip := range chips { - cache, ok := caches[chip.PhyId] - if !ok { - continue - } - fieldMap := getFieldMap(fieldsMap, cache.chip.LogicID) - - extInfo := cache.extInfo - if extInfo == nil { - continue - } - doUpdateTelegraf(fieldMap, descTxPBW, extInfo.PcieTxPBw.PcieAvgBw, avgPostfix) - doUpdateTelegraf(fieldMap, descTxNpBW, extInfo.PcieTxNPBw.PcieAvgBw, avgPostfix) - doUpdateTelegraf(fieldMap, descTxCplBW, extInfo.PcieTxCPLBw.PcieAvgBw, avgPostfix) - doUpdateTelegraf(fieldMap, descRxPBW, extInfo.PcieRxPBw.PcieAvgBw, avgPostfix) - doUpdateTelegraf(fieldMap, descRxNpBW, extInfo.PcieRxNPBw.PcieAvgBw, avgPostfix) - doUpdateTelegraf(fieldMap, descRxCplBW, extInfo.PcieRxCPLBw.PcieAvgBw, avgPostfix) - - doUpdateTelegraf(fieldMap, descTxPBW, extInfo.PcieTxPBw.PcieMinBw, minPostfix) - doUpdateTelegraf(fieldMap, descTxNpBW, extInfo.PcieTxNPBw.PcieMinBw, minPostfix) - doUpdateTelegraf(fieldMap, descTxCplBW, extInfo.PcieTxCPLBw.PcieMinBw, minPostfix) - doUpdateTelegraf(fieldMap, descRxPBW, extInfo.PcieRxPBw.PcieMinBw, minPostfix) - doUpdateTelegraf(fieldMap, descRxNpBW, extInfo.PcieRxNPBw.PcieMinBw, minPostfix) - doUpdateTelegraf(fieldMap, descRxCplBW, extInfo.PcieRxCPLBw.PcieMinBw, minPostfix) - - doUpdateTelegraf(fieldMap, descTxPBW, extInfo.PcieTxPBw.PcieMaxBw, maxPostfix) - doUpdateTelegraf(fieldMap, descTxNpBW, extInfo.PcieTxNPBw.PcieMaxBw, maxPostfix) - doUpdateTelegraf(fieldMap, descTxCplBW, extInfo.PcieTxCPLBw.PcieMaxBw, maxPostfix) - doUpdateTelegraf(fieldMap, descRxPBW, extInfo.PcieRxPBw.PcieMaxBw, maxPostfix) - doUpdateTelegraf(fieldMap, descRxNpBW, 
extInfo.PcieRxNPBw.PcieMaxBw, maxPostfix) - doUpdateTelegraf(fieldMap, descRxCplBW, extInfo.PcieRxCPLBw.PcieMaxBw, maxPostfix) - - } - return fieldsMap -} - -func pcieBwLabelVal(cardLabels []string, pcieBwType string) []string { - return append(cardLabels, pcieBwType) -} - -func metricWithPcieBw(labelsVal []string, metrics *prometheus.Desc, val float64, valType string) prometheus.Metric { - return prometheus.MustNewConstMetric(metrics, prometheus.GaugeValue, val, pcieBwLabelVal(labelsVal, valType)...) -} - -func updateAvgPcieBwInfo(ch chan<- prometheus.Metric, timestamp time.Time, pcieBwInfo *common.PCIEBwStat, - cardLabel []string) { - ch <- prometheus.NewMetricWithTimestamp(timestamp, - metricWithPcieBw(cardLabel, descTxPBW, float64(pcieBwInfo.PcieTxPBw.PcieAvgBw), avgPcieBw)) - ch <- prometheus.NewMetricWithTimestamp(timestamp, - metricWithPcieBw(cardLabel, descTxNpBW, float64(pcieBwInfo.PcieTxNPBw.PcieAvgBw), avgPcieBw)) - ch <- prometheus.NewMetricWithTimestamp(timestamp, - metricWithPcieBw(cardLabel, descTxCplBW, float64(pcieBwInfo.PcieTxCPLBw.PcieAvgBw), avgPcieBw)) - ch <- prometheus.NewMetricWithTimestamp(timestamp, - metricWithPcieBw(cardLabel, descRxPBW, float64(pcieBwInfo.PcieRxPBw.PcieAvgBw), avgPcieBw)) - ch <- prometheus.NewMetricWithTimestamp(timestamp, - metricWithPcieBw(cardLabel, descRxNpBW, float64(pcieBwInfo.PcieRxNPBw.PcieAvgBw), avgPcieBw)) - ch <- prometheus.NewMetricWithTimestamp(timestamp, - metricWithPcieBw(cardLabel, descRxCplBW, float64(pcieBwInfo.PcieRxCPLBw.PcieAvgBw), avgPcieBw)) -} - -func updateMinPcieBwInfo(ch chan<- prometheus.Metric, timestamp time.Time, pcieBwInfo *common.PCIEBwStat, - cardLabel []string) { - ch <- prometheus.NewMetricWithTimestamp(timestamp, - metricWithPcieBw(cardLabel, descTxPBW, float64(pcieBwInfo.PcieTxPBw.PcieMinBw), minPcieBw)) - ch <- prometheus.NewMetricWithTimestamp(timestamp, - metricWithPcieBw(cardLabel, descTxNpBW, float64(pcieBwInfo.PcieTxNPBw.PcieMinBw), minPcieBw)) - ch <- 
prometheus.NewMetricWithTimestamp(timestamp, - metricWithPcieBw(cardLabel, descTxCplBW, float64(pcieBwInfo.PcieTxCPLBw.PcieMinBw), minPcieBw)) - ch <- prometheus.NewMetricWithTimestamp(timestamp, - metricWithPcieBw(cardLabel, descRxPBW, float64(pcieBwInfo.PcieRxPBw.PcieMinBw), minPcieBw)) - ch <- prometheus.NewMetricWithTimestamp(timestamp, - metricWithPcieBw(cardLabel, descRxNpBW, float64(pcieBwInfo.PcieRxNPBw.PcieMinBw), minPcieBw)) - ch <- prometheus.NewMetricWithTimestamp(timestamp, - metricWithPcieBw(cardLabel, descRxCplBW, float64(pcieBwInfo.PcieRxCPLBw.PcieMinBw), minPcieBw)) -} - -func updateMaxPcieBwInfo(ch chan<- prometheus.Metric, timestamp time.Time, pcieBwInfo *common.PCIEBwStat, - cardLabel []string) { - ch <- prometheus.NewMetricWithTimestamp(timestamp, - metricWithPcieBw(cardLabel, descTxPBW, float64(pcieBwInfo.PcieTxPBw.PcieMaxBw), maxPcieBw)) - ch <- prometheus.NewMetricWithTimestamp(timestamp, - metricWithPcieBw(cardLabel, descTxNpBW, float64(pcieBwInfo.PcieTxNPBw.PcieMaxBw), maxPcieBw)) - ch <- prometheus.NewMetricWithTimestamp(timestamp, - metricWithPcieBw(cardLabel, descTxCplBW, float64(pcieBwInfo.PcieTxCPLBw.PcieMaxBw), maxPcieBw)) - ch <- prometheus.NewMetricWithTimestamp(timestamp, - metricWithPcieBw(cardLabel, descRxPBW, float64(pcieBwInfo.PcieRxPBw.PcieMaxBw), maxPcieBw)) - ch <- prometheus.NewMetricWithTimestamp(timestamp, - metricWithPcieBw(cardLabel, descRxNpBW, float64(pcieBwInfo.PcieRxNPBw.PcieMaxBw), maxPcieBw)) - ch <- prometheus.NewMetricWithTimestamp(timestamp, - metricWithPcieBw(cardLabel, descRxCplBW, float64(pcieBwInfo.PcieRxCPLBw.PcieMaxBw), maxPcieBw)) -} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_roce.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_roce.go deleted file mode 100644 index b1d307c..0000000 --- a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_roce.go +++ /dev/null @@ -1,263 +0,0 @@ -/* Copyright(C) 2025-2025. 
Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package metrics for general collector -package metrics - -import ( - "time" - - "github.com/prometheus/client_golang/prometheus" - - "ascend-common/common-utils/hwlog" - "ascend-common/devmanager/common" - "ascend-common/devmanager/hccn" - colcommon "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" -) - -const ( - macRxMacPauseNum = "mac_rx_mac_pause_num" - macTxMacPauseNum = "mac_tx_mac_pause_num" - macRxPfcPktNum = "mac_rx_pfc_pkt_num" - macTxPfcPktNum = "mac_tx_pfc_pkt_num" - macRxBadPktNum = "mac_rx_bad_pkt_num" - macTxBadPktNum = "mac_tx_bad_pkt_num" - roCERxAllPktNum = "roce_rx_all_pkt_num" - roCETxAllPktNum = "roce_tx_all_pkt_num" - roCERxErrPktNum = "roce_rx_err_pkt_num" - roCETxErrPktNum = "roce_tx_err_pkt_num" - roCERxCnpPktNum = "roce_rx_cnp_pkt_num" - roCETxCnpPktNum = "roce_tx_cnp_pkt_num" - macRxBadOctNum = "mac_rx_bad_oct_num" - macTxBadOctNum = "mac_tx_bad_oct_num" - roCEUnexpectedAckNum = "roce_unexpected_ack_num" - roCEOutOfOrderNum = "roce_out_of_order_num" - roCEVerificationErrNum = "roce_verification_err_num" - roCEQpStatusErrNum = "roce_qp_status_err_num" - roCENewPktRtyNum = "roce_new_pkt_rty_num" - roCEEcnDBNum = "roce_ecn_db_num" - macRXFcsErrPktNum = "mac_rx_fcs_err_pkt_num" -) - -var ( - // mac - descMacRxPauseNum = colcommon.BuildDesc("npu_chip_mac_rx_pause_num", "npu interface receive 
mac-rx-pause-num") - descMacTxPauseNum = colcommon.BuildDesc("npu_chip_mac_tx_pause_num", "npu interface receive mac-tx-pause-num") - descMacRxPfcPktNum = colcommon.BuildDesc("npu_chip_mac_rx_pfc_pkt_num", "npu interface receive mac-rx-pfc-pkt-num") - descMacTxPfcPktNum = colcommon.BuildDesc("npu_chip_mac_tx_pfc_pkt_num", "npu interface receive mac-tx-pfc-pkt-num") - descMacRxBadPktNum = colcommon.BuildDesc("npu_chip_mac_rx_bad_pkt_num", "npu interface receive mac-rx-bad-pkt-num") - descMacTxBadPktNum = colcommon.BuildDesc("npu_chip_mac_tx_bad_pkt_num", "npu interface receive mac-tx-bad-pkt-num") - descMacTxBadOctNum = colcommon.BuildDesc("npu_chip_mac_tx_bad_oct_num", "npu interface receive mac-tx-bad-oct-num") - descMacRxBadOctNum = colcommon.BuildDesc("npu_chip_mac_rx_bad_oct_num", "npu interface receive mac-rx-bad-oct-num") - - descRxFCSNum = colcommon.BuildDesc("npu_chip_info_rx_fcs_num", "the npu network fcs receive number") - descRxECNNum = colcommon.BuildDesc("npu_chip_info_rx_ecn_num", "the npu network ecn receive number") - - // roce - descRoceRxAllPktNum = colcommon.BuildDesc("npu_chip_roce_rx_all_pkt_num", "npu interface receive roce-rx-all-pkt-num") - descRoceTxAllPktNum = colcommon.BuildDesc("npu_chip_roce_tx_all_pkt_num", "npu interface receive roce-tx-all-pkt-num") - descRoceRxErrPktNum = colcommon.BuildDesc("npu_chip_roce_rx_err_pkt_num", "npu interface receive roce-rx-err-pkt-num") - descRoceTxErrPktNum = colcommon.BuildDesc("npu_chip_roce_tx_err_pkt_num", "npu interface receive roce-tx-err-pkt-num") - descRoceRxCnpPktNum = colcommon.BuildDesc("npu_chip_roce_rx_cnp_pkt_num", "npu interface receive roce-rx-cnp-pkt-num") - descRoceTxCnpPktNum = colcommon.BuildDesc("npu_chip_roce_tx_cnp_pkt_num", "npu interface receive roce-tx-cnp-pkt-num") - - descRoceNewPktRtyNum = colcommon.BuildDesc("npu_chip_roce_new_pkt_rty_num", - "npu interface receive roce-new-pkt-rty-num") - descRoceOutOfOrderNum = colcommon.BuildDesc("npu_chip_roce_out_of_order_num", - 
"the npu interface receive roce-out-of-order-num") - descRoceQpStatusErrNum = colcommon.BuildDesc("npu_chip_roce_qp_status_err_num", - "the npu interface receive roce-qp-status-err-num") - descRoceUnexpectedAcktNum = colcommon.BuildDesc("npu_chip_roce_unexpected_ack_num", - "the npu interface receive roce-unexpected-ack-num") - descRoceVerificationErrNum = colcommon.BuildDesc("npu_chip_roce_verification_err_num", - "the npu interface receive roce-verification-err-num") -) - -type roceCache struct { - chip colcommon.HuaWeiAIChip - timestamp time.Time - // extInfo the statistics about packets - extInfo *common.StatInfo -} - -// RoceCollector collect roce info -type RoceCollector struct { - colcommon.MetricsCollectorAdapter -} - -// IsSupported check whether the collector is supported -func (c *RoceCollector) IsSupported(n *colcommon.NpuCollector) bool { - isSupport := n.Dmgr.IsTrainingCard() - logForUnSupportDevice(isSupport, n.Dmgr.GetDevType(), colcommon.GetCacheKey(c), - "only training card supports network related info") - return isSupport -} - -// Describe description of the metric -func (c *RoceCollector) Describe(ch chan<- *prometheus.Desc) { - - // mac - ch <- descMacRxPauseNum - ch <- descMacTxPauseNum - ch <- descMacRxPfcPktNum - ch <- descMacTxPfcPktNum - ch <- descMacRxBadPktNum - ch <- descMacTxBadPktNum - ch <- descMacTxBadOctNum - ch <- descMacRxBadOctNum - ch <- descRxFCSNum - - // roce - ch <- descRoceRxAllPktNum - ch <- descRoceTxAllPktNum - ch <- descRoceRxErrPktNum - ch <- descRoceTxErrPktNum - ch <- descRoceRxCnpPktNum - ch <- descRoceTxCnpPktNum - ch <- descRoceNewPktRtyNum - ch <- descRoceUnexpectedAcktNum - ch <- descRoceOutOfOrderNum - ch <- descRoceVerificationErrNum - ch <- descRoceQpStatusErrNum - ch <- descRxECNNum - -} - -// CollectToCache collect the metric to cache -func (c *RoceCollector) CollectToCache(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) { - for _, chip := range chipList { - statInfo, err := 
hccn.GetNPUStatInfo(chip.DeviceID) - if err != nil { - logErrMetricsWithLimit(colcommon.DomainForRoce, chip.LogicID, err) - return - } - hwlog.ResetErrCnt(colcommon.DomainForRoce, chip.LogicID) - c.LocalCache.Store(chip.PhyId, roceCache{chip: chip, timestamp: time.Now(), extInfo: getMainStatInfo(statInfo)}) - } - colcommon.UpdateCache[roceCache](n, colcommon.GetCacheKey(c), &c.LocalCache) - -} - -// UpdatePrometheus update prometheus metrics -func (c *RoceCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) { - - updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache roceCache, cardLabel []string) { - statInfo := cache.extInfo - if statInfo == nil { - return - } - updateStatInfoOfMac(ch, cache.timestamp, statInfo, cardLabel) - updateStatInfoOfRoCE(ch, cache.timestamp, statInfo, cardLabel) - } - updateFrame[roceCache](colcommon.GetCacheKey(c), n, containerMap, chips, updateSingleChip) - -} - -// UpdateTelegraf update telegraf metrics -func (c *RoceCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} { - - caches := colcommon.GetInfoFromCache[roceCache](n, colcommon.GetCacheKey(c)) - for _, chip := range chips { - cache, ok := caches[chip.PhyId] - if !ok { - continue - } - fieldMap := getFieldMap(fieldsMap, cache.chip.LogicID) - - extInfo := cache.extInfo - if extInfo == nil { - continue - } - doUpdateTelegraf(fieldMap, descMacRxPauseNum, extInfo.MacRxPauseNum, "") - doUpdateTelegraf(fieldMap, descMacTxPauseNum, extInfo.MacTxPauseNum, "") - doUpdateTelegraf(fieldMap, descMacRxPfcPktNum, extInfo.MacRxPfcPktNum, "") - doUpdateTelegraf(fieldMap, descMacTxPfcPktNum, extInfo.MacTxPfcPktNum, "") - doUpdateTelegraf(fieldMap, descMacRxBadPktNum, extInfo.MacRxBadPktNum, "") - 
doUpdateTelegraf(fieldMap, descMacTxBadPktNum, extInfo.MacTxBadPktNum, "") - doUpdateTelegraf(fieldMap, descMacTxBadOctNum, extInfo.MacTxBadOctNum, "") - doUpdateTelegraf(fieldMap, descMacRxBadOctNum, extInfo.MacRxBadOctNum, "") - doUpdateTelegraf(fieldMap, descRxFCSNum, extInfo.MacRXFcsErrPktNum, "") - - doUpdateTelegraf(fieldMap, descRoceRxAllPktNum, extInfo.RoceRxAllPktNum, "") - doUpdateTelegraf(fieldMap, descRoceTxAllPktNum, extInfo.RoceTxAllPktNum, "") - doUpdateTelegraf(fieldMap, descRoceRxErrPktNum, extInfo.RoceRxErrPktNum, "") - doUpdateTelegraf(fieldMap, descRoceTxErrPktNum, extInfo.RoceTxErrPktNum, "") - doUpdateTelegraf(fieldMap, descRoceRxCnpPktNum, extInfo.RoceRxCnpPktNum, "") - doUpdateTelegraf(fieldMap, descRoceTxCnpPktNum, extInfo.RoceTxCnpPktNum, "") - doUpdateTelegraf(fieldMap, descRoceNewPktRtyNum, extInfo.RoceNewPktRtyNum, "") - doUpdateTelegraf(fieldMap, descRoceUnexpectedAcktNum, extInfo.RoceUnexpectedAckNum, "") - doUpdateTelegraf(fieldMap, descRoceOutOfOrderNum, extInfo.RoceOutOfOrderNum, "") - doUpdateTelegraf(fieldMap, descRoceVerificationErrNum, extInfo.RoceVerificationErrNum, "") - doUpdateTelegraf(fieldMap, descRoceQpStatusErrNum, extInfo.RoceQpStatusErrNum, "") - doUpdateTelegraf(fieldMap, descRxECNNum, extInfo.RoceEcnDBNum, "") - } - return fieldsMap -} -func getMainStatInfo(statInfo map[string]int) *common.StatInfo { - mainStatInfo := common.StatInfo{} - mainStatInfo.MacRxPauseNum = float64(statInfo[macRxMacPauseNum]) - mainStatInfo.MacTxPauseNum = float64(statInfo[macTxMacPauseNum]) - mainStatInfo.MacRxPfcPktNum = float64(statInfo[macRxPfcPktNum]) - mainStatInfo.MacTxPfcPktNum = float64(statInfo[macTxPfcPktNum]) - mainStatInfo.MacRxBadPktNum = float64(statInfo[macRxBadPktNum]) - mainStatInfo.MacTxBadPktNum = float64(statInfo[macTxBadPktNum]) - mainStatInfo.RoceRxAllPktNum = float64(statInfo[roCERxAllPktNum]) - mainStatInfo.RoceTxAllPktNum = float64(statInfo[roCETxAllPktNum]) - mainStatInfo.RoceRxErrPktNum = 
float64(statInfo[roCERxErrPktNum]) - mainStatInfo.RoceTxErrPktNum = float64(statInfo[roCETxErrPktNum]) - mainStatInfo.RoceRxCnpPktNum = float64(statInfo[roCERxCnpPktNum]) - mainStatInfo.RoceTxCnpPktNum = float64(statInfo[roCETxCnpPktNum]) - mainStatInfo.MacRxBadOctNum = float64(statInfo[macRxBadOctNum]) - mainStatInfo.MacTxBadOctNum = float64(statInfo[macTxBadOctNum]) - mainStatInfo.RoceUnexpectedAckNum = float64(statInfo[roCEUnexpectedAckNum]) - mainStatInfo.RoceOutOfOrderNum = float64(statInfo[roCEOutOfOrderNum]) - mainStatInfo.RoceVerificationErrNum = float64(statInfo[roCEVerificationErrNum]) - mainStatInfo.RoceQpStatusErrNum = float64(statInfo[roCEQpStatusErrNum]) - mainStatInfo.RoceNewPktRtyNum = float64(statInfo[roCENewPktRtyNum]) - mainStatInfo.RoceEcnDBNum = float64(statInfo[roCEEcnDBNum]) - mainStatInfo.MacRXFcsErrPktNum = float64(statInfo[macRXFcsErrPktNum]) - - return &mainStatInfo -} - -func updateStatInfoOfMac(ch chan<- prometheus.Metric, ts time.Time, statInfo *common.StatInfo, cardLabel []string) { - doUpdateMetric(ch, ts, statInfo.MacRxPauseNum, cardLabel, descMacRxPauseNum) - doUpdateMetric(ch, ts, statInfo.MacTxPauseNum, cardLabel, descMacTxPauseNum) - doUpdateMetric(ch, ts, statInfo.MacRxPfcPktNum, cardLabel, descMacRxPfcPktNum) - doUpdateMetric(ch, ts, statInfo.MacTxPfcPktNum, cardLabel, descMacTxPfcPktNum) - doUpdateMetric(ch, ts, statInfo.MacRxBadPktNum, cardLabel, descMacRxBadPktNum) - doUpdateMetric(ch, ts, statInfo.MacTxBadPktNum, cardLabel, descMacTxBadPktNum) - doUpdateMetric(ch, ts, statInfo.MacTxBadOctNum, cardLabel, descMacTxBadOctNum) - doUpdateMetric(ch, ts, statInfo.MacRxBadOctNum, cardLabel, descMacRxBadOctNum) - doUpdateMetric(ch, ts, statInfo.MacRXFcsErrPktNum, cardLabel, descRxFCSNum) -} - -func updateStatInfoOfRoCE(ch chan<- prometheus.Metric, ts time.Time, statInfo *common.StatInfo, cardLabel []string) { - doUpdateMetric(ch, ts, statInfo.RoceRxAllPktNum, cardLabel, descRoceRxAllPktNum) - doUpdateMetric(ch, ts, 
statInfo.RoceTxAllPktNum, cardLabel, descRoceTxAllPktNum) - doUpdateMetric(ch, ts, statInfo.RoceRxErrPktNum, cardLabel, descRoceRxErrPktNum) - doUpdateMetric(ch, ts, statInfo.RoceTxErrPktNum, cardLabel, descRoceTxErrPktNum) - doUpdateMetric(ch, ts, statInfo.RoceRxCnpPktNum, cardLabel, descRoceRxCnpPktNum) - doUpdateMetric(ch, ts, statInfo.RoceTxCnpPktNum, cardLabel, descRoceTxCnpPktNum) - doUpdateMetric(ch, ts, statInfo.RoceNewPktRtyNum, cardLabel, descRoceNewPktRtyNum) - doUpdateMetric(ch, ts, statInfo.RoceUnexpectedAckNum, cardLabel, descRoceUnexpectedAcktNum) - doUpdateMetric(ch, ts, statInfo.RoceOutOfOrderNum, cardLabel, descRoceOutOfOrderNum) - doUpdateMetric(ch, ts, statInfo.RoceVerificationErrNum, cardLabel, descRoceVerificationErrNum) - doUpdateMetric(ch, ts, statInfo.RoceQpStatusErrNum, cardLabel, descRoceQpStatusErrNum) - doUpdateMetric(ch, ts, statInfo.RoceEcnDBNum, cardLabel, descRxECNNum) - doUpdateMetric(ch, ts, statInfo.RoceEcnDBNum, cardLabel, descRxECNNum) -} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_sio.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_sio.go deleted file mode 100644 index 918469c..0000000 --- a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_sio.go +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package metrics for general collector -package metrics - -import ( - "time" - - "github.com/prometheus/client_golang/prometheus" - - "ascend-common/api" - "ascend-common/common-utils/hwlog" - "ascend-common/devmanager/common" - colcommon "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" -) - -var ( - descSioCrcTxErrCnt = colcommon.BuildDesc("npu_chip_info_sio_crc_tx_err_cnt", - "sio transmitted error count between die") - descSioCrcRxErrCnt = colcommon.BuildDesc("npu_chip_info_sio_crc_rx_err_cnt", - "sio received error count between die") -) -var ( - supportedSioDevices = map[string]bool{ - api.Ascend910A3: true, - } -) - -type sioCache struct { - chip colcommon.HuaWeiAIChip - timestamp time.Time - // extInfo sio status between dies, only support super pod - extInfo *common.SioCrcErrStatisticInfo -} - -// SioCollector collect sio info -type SioCollector struct { - colcommon.MetricsCollectorAdapter -} - -// IsSupported check whether the collector is supported -func (c *SioCollector) IsSupported(n *colcommon.NpuCollector) bool { - isSupport := supportedSioDevices[n.Dmgr.GetDevType()] - logForUnSupportDevice(isSupport, n.Dmgr.GetDevType(), colcommon.GetCacheKey(c), - "sio information cannot be queried.") - return isSupport -} - -// Describe description of the metric -func (c *SioCollector) Describe(ch chan<- *prometheus.Desc) { - ch <- descSioCrcTxErrCnt - ch <- descSioCrcRxErrCnt -} - -// CollectToCache collect the metric to cache -func (c *SioCollector) CollectToCache(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) { - for _, chip := range chipList { - logicID := chip.LogicID - sioInfo, err := n.Dmgr.GetSioInfo(logicID) - if err != nil { - logErrMetricsWithLimit(colcommon.DomainForSio, logicID, err) - continue - } - hwlog.ResetErrCnt(colcommon.DomainForSio, logicID) - - c.LocalCache.Store(chip.PhyId, sioCache{chip: chip, timestamp: time.Now(), extInfo: sioInfo}) - } - 
colcommon.UpdateCache[sioCache](n, colcommon.GetCacheKey(c), &c.LocalCache) -} - -// UpdatePrometheus update prometheus metrics -func (c *SioCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) { - - updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache sioCache, cardLabel []string) { - extInfo := cache.extInfo - if extInfo == nil { - return - } - doUpdateMetric(ch, cache.timestamp, extInfo.TxErrCnt, cardLabel, descSioCrcTxErrCnt) - doUpdateMetric(ch, cache.timestamp, extInfo.RxErrCnt, cardLabel, descSioCrcRxErrCnt) - } - updateFrame[sioCache](colcommon.GetCacheKey(c), n, containerMap, chips, updateSingleChip) -} - -// UpdateTelegraf update telegraf metrics -func (c *SioCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} { - - caches := colcommon.GetInfoFromCache[sioCache](n, colcommon.GetCacheKey(c)) - for _, chip := range chips { - cache, ok := caches[chip.PhyId] - if !ok { - continue - } - fieldMap := getFieldMap(fieldsMap, cache.chip.LogicID) - - extInfo := cache.extInfo - if extInfo == nil { - continue - } - - doUpdateTelegraf(fieldMap, descSioCrcTxErrCnt, extInfo.TxErrCnt, "") - doUpdateTelegraf(fieldMap, descSioCrcRxErrCnt, extInfo.RxErrCnt, "") - } - return fieldsMap -} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_version.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_version.go deleted file mode 100644 index 8cb32bd..0000000 --- a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_version.go +++ /dev/null @@ -1,56 +0,0 @@ -/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. 
- Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package metrics for general collector -package metrics - -import ( - "github.com/prometheus/client_golang/prometheus" - - "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" - "huawei.com/npu-exporter/v6/versions" -) - -var ( - versionInfoDesc = common.BuildDescWithLabel("npu_exporter_version_info", "exporter version with value '1'", - []string{"exporterVersion"}) -) - -// VersionCollector collect sio info -type VersionCollector struct { - common.MetricsCollectorAdapter -} - -// Describe description of the metric -func (c *VersionCollector) Describe(ch chan<- *prometheus.Desc) { - ch <- versionInfoDesc -} - -// UpdatePrometheus update prometheus metric -func (c *VersionCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *common.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []common.HuaWeiAIChip) { - ch <- prometheus.MustNewConstMetric(versionInfoDesc, prometheus.GaugeValue, 1, []string{versions.BuildVersion}...) 
-} - -// UpdateTelegraf update telegraf metric -func (c *VersionCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *common.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []common.HuaWeiAIChip) map[string]map[string]interface{} { - - if fieldsMap[common.GeneralDevTagKey] == nil { - fieldsMap[common.GeneralDevTagKey] = make(map[string]interface{}) - } - doUpdateTelegraf(fieldsMap[common.GeneralDevTagKey], versionInfoDesc, versions.BuildVersion, "") - return fieldsMap -} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_vnpu.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_vnpu.go deleted file mode 100644 index 5117ec9..0000000 --- a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_vnpu.go +++ /dev/null @@ -1,169 +0,0 @@ -/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package metrics for general collector -package metrics - -import ( - "strconv" - "time" - - "github.com/prometheus/client_golang/prometheus" - - "ascend-common/api" - "ascend-common/devmanager/common" - colcommon "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" - "huawei.com/npu-exporter/v6/utils/logger" -) - -var ( - cardLabelForVNpuName = make([]string, len(colcommon.CardLabel)) - podAiCoreUtilizationRate *prometheus.Desc = nil - podTotalMemory *prometheus.Desc = nil - podUsedMemory *prometheus.Desc = nil -) - -var ( - supportedVnpuDevices = map[string]bool{ - api.Ascend310P: true, - } -) - -const ( - vNpuUUID = "v_dev_id" - aiCoreCnt = "aicore_count" - isVirtual = "is_virtual" -) - -func init() { - cardLabelForVNpuName = append(colcommon.CardLabel, isVirtual) - cardLabelForVNpuName[2] = vNpuUUID - cardLabelForVNpuName[3] = aiCoreCnt - - podAiCoreUtilizationRate = colcommon.BuildDescWithLabel("vnpu_pod_aicore_utilization", - "the vnpu aicore utilization rate, unit is '%'", cardLabelForVNpuName) - podTotalMemory = colcommon.BuildDescWithLabel("vnpu_pod_total_memory", - "the vnpu total memory on pod, unit is 'KB'", cardLabelForVNpuName) - podUsedMemory = colcommon.BuildDescWithLabel("vnpu_pod_used_memory", - "the vnpu used memory on pod, unit is 'KB'", cardLabelForVNpuName) - -} - -// VnpuCollector collect vnpu info -type VnpuCollector struct { - colcommon.MetricsCollectorAdapter -} - -// IsSupported check whether the collector is supported -func (c *VnpuCollector) IsSupported(n *colcommon.NpuCollector) bool { - isSupport := supportedVnpuDevices[n.Dmgr.GetDevType()] - logForUnSupportDevice(isSupport, n.Dmgr.GetDevType(), colcommon.GetCacheKey(c), "") - return isSupport -} - -// Describe description of the metric -func (c *VnpuCollector) Describe(ch chan<- *prometheus.Desc) { - ch <- podAiCoreUtilizationRate - ch <- podTotalMemory - ch <- podUsedMemory -} - -// CollectToCache collect the metric to cache -func 
(c *VnpuCollector) CollectToCache(n *colcommon.NpuCollector, chipList []colcommon.HuaWeiAIChip) { - for _, chip := range chipList { - cache := &chipCache{ - chip: chip, - } - cache.timestamp = time.Now() - c.LocalCache.Store(chip.PhyId, *cache) - } - colcommon.UpdateCache[chipCache](n, colcommon.GetCacheKey(c), &c.LocalCache) -} - -// UpdatePrometheus update prometheus metrics -func (c *VnpuCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) { - - updateSingleChip := func(chipWithVnpu colcommon.HuaWeiAIChip, cache chipCache, cardLabel []string) { - if chipWithVnpu.VDevActivityInfo == nil { - return - } - vDevActivityInfo := chipWithVnpu.VDevActivityInfo - if !common.IsValidVDevID(vDevActivityInfo.VDevID) { - return - } - containerName := getContainerNameArray(containerMap[int32(vDevActivityInfo.VDevID)]) - if len(containerName) != colcommon.ContainerNameLen { - return - } - cardLabel = getPodDisplayInfo(&chipWithVnpu, containerName) - doUpdateMetric(ch, cache.timestamp, vDevActivityInfo.VDevAiCoreRate, cardLabel, podAiCoreUtilizationRate) - doUpdateMetric(ch, cache.timestamp, vDevActivityInfo.VDevTotalMem, cardLabel, podTotalMemory) - doUpdateMetric(ch, cache.timestamp, vDevActivityInfo.VDevUsedMem, cardLabel, podUsedMemory) - } - - updateFrame[chipCache](colcommon.GetCacheKey(c), n, containerMap, chips, updateSingleChip) - -} - -// UpdateTelegraf update telegraf metrics -func (c *VnpuCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *colcommon.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []colcommon.HuaWeiAIChip) map[string]map[string]interface{} { - - caches := colcommon.GetInfoFromCache[chipCache](n, colcommon.GetCacheKey(c)) - for _, chip := range chips { - cache, ok := caches[chip.PhyId] - if !ok { - continue - } - - vDevActivityInfo := chip.VDevActivityInfo - if vDevActivityInfo == nil || 
!common.IsValidVDevID(vDevActivityInfo.VDevID) { - continue - } - - devTagKey := strconv.Itoa(int(cache.chip.LogicID)) + "_" + strconv.Itoa(int(vDevActivityInfo.VDevID)) - - if fieldsMap[devTagKey] == nil { - fieldsMap[devTagKey] = make(map[string]interface{}) - } - - doUpdateTelegraf(fieldsMap[devTagKey], podAiCoreUtilizationRate, vDevActivityInfo.VDevAiCoreRate, "") - doUpdateTelegraf(fieldsMap[devTagKey], podTotalMemory, vDevActivityInfo.VDevTotalMem, "") - doUpdateTelegraf(fieldsMap[devTagKey], podUsedMemory, vDevActivityInfo.VDevUsedMem, "") - } - return fieldsMap -} - -func getPodDisplayInfo(chip *colcommon.HuaWeiAIChip, containerName []string) []string { - if len(containerName) != colcommon.ContainerNameLen { - logger.Errorf("container name length %v is not %v", len(containerName), colcommon.ContainerNameLen) - return nil - } - - chipInfo := common.DeepCopyChipInfo(chip.ChipInfo) - vDevActivityInfo := common.DeepCopyVDevActivityInfo(chip.VDevActivityInfo) - - return []string{ - strconv.Itoa(int(chip.DeviceID)), - common.GetNpuName(chipInfo), - strconv.Itoa(int(vDevActivityInfo.VDevID)), - strconv.FormatFloat(vDevActivityInfo.VDevAiCore, 'f', colcommon.DecimalPlaces, colcommon.BitSize), - containerName[colcommon.NameSpaceIdx], - containerName[colcommon.PodNameIdx], - containerName[colcommon.ConNameIdx], - strconv.FormatBool(vDevActivityInfo.IsVirtualDev), - } -} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_vnpu_test.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_for_vnpu_test.go deleted file mode 100644 index d57ade0..0000000 --- a/mind-cluster/component/npu-exporter/collector/metrics/collector_for_vnpu_test.go +++ /dev/null @@ -1,202 +0,0 @@ -/* Copyright(C) 2025-2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package metrics for general collector -package metrics - -import ( - "strconv" - "testing" - - "github.com/agiledragon/gomonkey/v2" - "github.com/prometheus/client_golang/prometheus" - "github.com/smartystreets/goconvey/convey" - - "ascend-common/api" - "ascend-common/devmanager/common" - colcommon "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" -) - -const ( - vnpuMetricNum = 3 - validVnpuID = 100 - invalidVnpuID = 1 -) - -// TestVnpuCollectorIsSupported test VnpuCollector IsSupported -func TestVnpuCollectorIsSupported(t *testing.T) { - n := mockNewNpuCollector() - cases := []testCase{ - buildTestCase("VnpuCollector: testIsSupported on Ascend310P", &VnpuCollector{}, api.Ascend310P, true), - buildTestCase("VnpuCollector: testIsSupported on other type", &VnpuCollector{}, "OTHER", false), - } - - for _, c := range cases { - patches := gomonkey.NewPatches() - convey.Convey(c.name, t, func() { - defer patches.Reset() - patches.ApplyMethodReturn(n.Dmgr, "GetDevType", c.deviceType) - isSupported := c.collectorType.IsSupported(n) - convey.So(isSupported, convey.ShouldEqual, c.expectValue) - }) - } -} - -func TestVnpuCollectorDescribe(t *testing.T) { - collector := &VnpuCollector{} - convey.Convey("TestVnpuCollectorDescribe", t, func() { - ch := make(chan *prometheus.Desc, vnpuMetricNum) - collector.Describe(ch) - convey.So(len(ch), convey.ShouldEqual, vnpuMetricNum) - close(ch) - }) -} - -func TestVnpuCollectorCollectToCache(t *testing.T) { - collector := &VnpuCollector{} - n := mockNewNpuCollector() - testChips := 
[]colcommon.HuaWeiAIChip{{PhyId: 0}} - - convey.Convey("TestVnpuCollectorCollectToCache", t, func() { - collector.CollectToCache(n, testChips) - cacheInfo := colcommon.GetInfoFromCache[chipCache](n, colcommon.GetCacheKey(collector)) - convey.So(cacheInfo, convey.ShouldNotBeNil) - }) -} - -func TestVnpuCollectorUpdatePrometheus(t *testing.T) { - collector := &VnpuCollector{} - n := mockNewNpuCollector() - containerMap := mockContainerInfo() - - testChips := []colcommon.HuaWeiAIChip{{PhyId: 0}} - collector.CollectToCache(n, testChips) - chip := createValidVnpuChip() - testCases := []struct { - name string - preHandleFunc func() - expectValue int - }{ - {name: "TestVnpuCollectorUpdatePrometheus_effective virtual device scenarios", - preHandleFunc: func() {}, - expectValue: vnpuMetricNum, - }, - {name: "TestVnpuCollectorUpdatePrometheus_there is no container info", - preHandleFunc: func() { - containerMap = map[int32]container.DevicesInfo{} - }, - expectValue: 0, - }, - {name: "TestVnpuCollectorUpdatePrometheus_the vdevid is invalid", - preHandleFunc: func() { - chip.VDevActivityInfo.VDevID = invalidVnpuID - }, - expectValue: 0, - }, - {name: "TestVnpuCollectorUpdatePrometheus_there is no vdev info", - preHandleFunc: func() { - chip.VDevActivityInfo = nil - }, - expectValue: 0, - }, - } - ch := make(chan prometheus.Metric, vnpuMetricNum) - defer close(ch) - for _, tt := range testCases { - convey.Convey(tt.name, t, func() { - tt.preHandleFunc() - collector.UpdatePrometheus(ch, n, containerMap, []colcommon.HuaWeiAIChip{chip}) - convey.So(len(ch), convey.ShouldEqual, tt.expectValue) - //clean ch - for { - if len(ch) == 0 { - break - } - <-ch - } - }) - } -} - -func mockContainerInfo() map[int32]container.DevicesInfo { - containerMap := map[int32]container.DevicesInfo{ - validVnpuID: { - Devices: []int{0}, - ID: strconv.Itoa(validVnpuID), - Name: "nsName_podName_ctrName", - }, - } - return containerMap -} - -func TestVnpuCollectorUpdateTelegraf(t *testing.T) { - collector 
:= &VnpuCollector{} - n := mockNewNpuCollector() - containerMap := mockContainerInfo() - testChips := []colcommon.HuaWeiAIChip{{PhyId: 0}} - collector.CollectToCache(n, testChips) - chip := createValidVnpuChip() - convey.Convey("TestVnpuCollectorUpdateTelegraf", t, func() { - convey.Convey("effective virtual device scenarios", func() { - chipsWithVnpu := []colcommon.HuaWeiAIChip{chip} - newFieldMaps := collector.UpdateTelegraf(make(map[string]map[string]interface{}), n, containerMap, chipsWithVnpu) - convey.So(len(newFieldMaps), convey.ShouldEqual, 1) - convey.So(len(newFieldMaps["0_100"]), convey.ShouldEqual, vnpuMetricNum) - }) - convey.Convey("there is no container info", func() { - chip.VDevActivityInfo = nil - chipsWithVnpu := []colcommon.HuaWeiAIChip{chip} - containerMap = map[int32]container.DevicesInfo{} - newFieldMaps := collector.UpdateTelegraf(make(map[string]map[string]interface{}), n, containerMap, chipsWithVnpu) - convey.So(len(newFieldMaps), convey.ShouldEqual, 0) - }) - - }) -} - -func TestGetPodDisplayInfo(t *testing.T) { - const num8 = 8 - convey.Convey("TestGetPodDisplayInfo", t, func() { - chip := createValidVnpuChip() - convey.Convey("valid container information", func() { - containerNames := []string{"namespace", "pod-name", "container-name"} - labels := getPodDisplayInfo(&chip, containerNames) - convey.Convey("should return 8 metrics", func() { - convey.So(len(labels), convey.ShouldEqual, num8) - convey.So(labels[len(labels)-1], convey.ShouldEqual, "true") - }) - }) - - convey.Convey("invalid container information", func() { - containerNames := []string{"short"} - labels := getPodDisplayInfo(&chip, containerNames) - convey.Convey("should return nil", func() { - convey.So(labels, convey.ShouldBeNil) - }) - }) - }) -} - -func createValidVnpuChip() colcommon.HuaWeiAIChip { - chip := createChip() - chip.VDevActivityInfo = &common.VDevActivityInfo{ - VDevID: validVnpuID, - VDevAiCore: 1, - VDevTotalMem: 1, - VDevUsedMem: 1, - IsVirtualDev: true, - 
} - return chip -} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/collector_test.go b/mind-cluster/component/npu-exporter/collector/metrics/collector_test.go deleted file mode 100644 index 7524c68..0000000 --- a/mind-cluster/component/npu-exporter/collector/metrics/collector_test.go +++ /dev/null @@ -1,548 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package metrics for general collector -package metrics - -import ( - "strconv" - "sync" - "testing" - "time" - - "github.com/agiledragon/gomonkey/v2" - "github.com/prometheus/client_golang/prometheus" - "github.com/smartystreets/goconvey/convey" - - "ascend-common/api" - "ascend-common/common-utils/hwlog" - "ascend-common/devmanager" - "ascend-common/devmanager/common" - "ascend-common/devmanager/hccn" - colcommon "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" - "huawei.com/npu-exporter/v6/utils/logger" -) - -const ( - maxMetricsCount = 2000 - num5 = 5 - mockContainerName = "mockContainerName" - maxChipNum int32 = 8 -) - -var ( - collectorChain []colcommon.MetricsCollector -) - -// TestDescribe test Describe -func TestDescribe(t *testing.T) { - - convey.Convey("test prometheus desc ", t, func() { - ch := make(chan *prometheus.Desc, maxMetricsCount) - for _, c := range collectorChain { - c.Describe(ch) - } - t.Logf("Describe len(ch):%v", len(ch)) - convey.So(ch, 
convey.ShouldNotBeEmpty) - }) -} - -type testCase struct { - name string - collectorType colcommon.MetricsCollector - deviceType string - expectValue bool -} - -func buildTestCase(name string, collectorType colcommon.MetricsCollector, deviceType string, - expectValue bool) testCase { - return testCase{ - name: name, - collectorType: collectorType, - deviceType: deviceType, - expectValue: expectValue, - } -} - -// testIsSupported test IsSupported -func TestIsSupported(t *testing.T) { - n := mockNewNpuCollector() - cases := []testCase{ - buildTestCase("DdrCollector: testIsSupported on Ascend310", &DdrCollector{}, api.Ascend310, true), - buildTestCase("DdrCollector: testIsSupported on Ascend310P", &DdrCollector{}, api.Ascend310P, true), - buildTestCase("DdrCollector: testIsSupported on Ascend910", &DdrCollector{}, api.Ascend910, true), - buildTestCase("DdrCollector: testIsSupported on Ascend910B", &DdrCollector{}, api.Ascend910B, false), - buildTestCase("DdrCollector: testIsSupported on Ascend910A3", &DdrCollector{}, api.Ascend910A3, false), - - buildTestCase("HccsCollector: testIsSupported on Ascend310", &HccsCollector{}, api.Ascend310, false), - buildTestCase("HccsCollector: testIsSupported on Ascend310P", &HccsCollector{}, api.Ascend310P, false), - buildTestCase("HccsCollector: testIsSupported on Ascend910", &HccsCollector{}, api.Ascend910, false), - buildTestCase("HccsCollector: testIsSupported on Ascend910B", &HccsCollector{}, api.Ascend910B, true), - buildTestCase("HccsCollector: testIsSupported on Ascend910A3", &HccsCollector{}, api.Ascend910A3, true), - - buildTestCase("SioCollector: testIsSupported on Ascend310", &SioCollector{}, api.Ascend310, false), - buildTestCase("SioCollector: testIsSupported on Ascend310P", &SioCollector{}, api.Ascend310P, false), - buildTestCase("SioCollector: testIsSupported on Ascend910", &SioCollector{}, api.Ascend910, false), - buildTestCase("SioCollector: testIsSupported on Ascend910B", &SioCollector{}, api.Ascend910B, false), - 
buildTestCase("SioCollector: testIsSupported on Ascend910A3", &SioCollector{}, api.Ascend910A3, true), - - buildTestCase("VnpuCollector: testIsSupported on Ascend310", &VnpuCollector{}, api.Ascend310, false), - buildTestCase("VnpuCollector: testIsSupported on Ascend310P", &VnpuCollector{}, api.Ascend310P, true), - buildTestCase("VnpuCollector: testIsSupported on Ascend910", &VnpuCollector{}, api.Ascend910, false), - buildTestCase("VnpuCollector: testIsSupported on Ascend910B", &VnpuCollector{}, api.Ascend910B, false), - buildTestCase("VnpuCollector: testIsSupported on Ascend910A3", &VnpuCollector{}, api.Ascend910A3, false), - } - - for _, c := range cases { - patches := gomonkey.NewPatches() - convey.Convey(c.name, t, func() { - defer patches.Reset() - patches.ApplyMethodReturn(n.Dmgr, "GetDevType", c.deviceType) - isSupported := c.collectorType.IsSupported(n) - convey.So(isSupported, convey.ShouldEqual, c.expectValue) - }) - } -} - -// TestIsSupported2 test IsSupported -func TestIsSupported2(t *testing.T) { - n := mockNewNpuCollector() - convey.Convey("TestIsSupported ", t, func() { - for _, c := range collectorChain { - c.IsSupported(n) - } - }) - -} - -// TestCollectToCache test CollectToCache -func TestCollectToCache(t *testing.T) { - n := mockNewNpuCollector() - - convey.Convey("TestCollectToCache", t, func() { - - patches := gomonkey.NewPatches() - defer patches.Reset() - patches.ApplyMethodReturn(n.Dmgr, "GetDeviceMemoryInfo", mockMemoryInfo(), nil) - patches.ApplyMethodReturn(n.Dmgr, "GetDeviceHbmInfo", mockHbmAggregateInfo().HbmInfo, nil) - patches.ApplyMethodReturn(n.Dmgr, "GetDeviceEccInfo", mockHbmAggregateInfo().ECCInfo, nil) - patches.ApplyMethodReturn(n.Dmgr, "GetHccsStatisticInfo", mockHccsStaticsInfo(), nil) - patches.ApplyMethodReturn(n.Dmgr, "GetHccsStatisticInfoInU64", mockHccsStaticsInfo(), nil) - patches.ApplyMethodReturn(n.Dmgr, "GetHccsBandwidthInfo", mockHccsBWInfo(), nil) - patches.ApplyMethodReturn(n.Dmgr, "GetPCIEBandwidth", 
mockPcieInfo(), nil) - patches.ApplyMethodReturn(n.Dmgr, "GetSioInfo", mockSioInfo(), nil) - patches.ApplyFuncReturn(hccn.GetNPULinkStatus, "UP", nil) - patches.ApplyFuncReturn(hccn.GetNPUInterfaceTraffic, float64(0), float64(0), nil) - patches.ApplyFuncReturn(hccn.GetNPULinkUpNum, 0, nil) - patches.ApplyFuncReturn(hccn.GetNPULinkSpeed, 0, nil) - patches.ApplyFuncReturn(hccn.GetNPUOpticalInfo, mockOpticalInfo(), nil) - patches.ApplyFuncReturn(hccn.GetNPUStatInfo, mockRoceInfoMap(), nil) - patches.ApplyMethodReturn(n.Dmgr, "GetDeviceFrequency", uint32(0), nil) - patches.ApplyMethodReturn(n.Dmgr, "GetDeviceTemperature", int32(0), nil) - patches.ApplyMethodReturn(n.Dmgr, "GetDeviceVoltage", float32(0), nil) - patches.ApplyMethodReturn(n.Dmgr, "GetDeviceAllErrorCode", int32(1), []int64{0}, nil) - patches.ApplyMethodReturn(n.Dmgr, "GetDeviceHealth", uint32(0), nil) - patches.ApplyMethodReturn(n.Dmgr, "GetDevicePowerInfo", float32(0), nil) - patches.ApplyMethodReturn(n.Dmgr, "GetDeviceUtilizationRate", uint32(0), nil) - patches.ApplyMethodReturn(n.Dmgr, "GetDevProcessInfo", mockProcessInfo(), nil) - - chips := mockGetNPUChipList() - for _, c := range collectorChain { - c.PreCollect(n, chips) - c.CollectToCache(n, chips) - } - - convey.So(colcommon.GetInfoFromCache[ddrCache](n, colcommon.GetCacheKey(&DdrCollector{})), - convey.ShouldNotBeEmpty) - convey.So(colcommon.GetInfoFromCache[hbmCache](n, colcommon.GetCacheKey(&HbmCollector{})), - convey.ShouldNotBeEmpty) - convey.So(colcommon.GetInfoFromCache[hccsCache](n, colcommon.GetCacheKey(&HccsCollector{})), - convey.ShouldNotBeEmpty) - convey.So(colcommon.GetInfoFromCache[netInfoCache](n, colcommon.GetCacheKey(&NetworkCollector{})), - convey.ShouldNotBeEmpty) - convey.So(colcommon.GetInfoFromCache[chipCache](n, colcommon.GetCacheKey(&BaseInfoCollector{})), - convey.ShouldNotBeEmpty) - convey.So(colcommon.GetInfoFromCache[opticalCache](n, colcommon.GetCacheKey(&OpticalCollector{})), - convey.ShouldNotBeEmpty) - 
convey.So(colcommon.GetInfoFromCache[pcieCache](n, colcommon.GetCacheKey(&PcieCollector{})), - convey.ShouldNotBeEmpty) - convey.So(colcommon.GetInfoFromCache[roceCache](n, colcommon.GetCacheKey(&RoceCollector{})), - convey.ShouldNotBeEmpty) - convey.So(colcommon.GetInfoFromCache[sioCache](n, colcommon.GetCacheKey(&SioCollector{})), - convey.ShouldNotBeEmpty) - - }) -} - -// TestUpdatePrometheus test UpdatePrometheus -func TestUpdatePrometheus(t *testing.T) { - n := mockNewNpuCollector() - - convey.Convey("TestUpdatePrometheus", t, func() { - - ch := make(chan prometheus.Metric, maxMetricsCount) - - patches := gomonkey.NewPatches() - defer patches.Reset() - containerInfos := mockGetContainerNPUInfo() - chips := mockGetNPUChipList() - - mockDdrCache(n, chips, colcommon.GetCacheKey(&DdrCollector{})) - mockHbmCache(n, chips, colcommon.GetCacheKey(&HbmCollector{})) - mockHccsCache(n, chips, colcommon.GetCacheKey(&HccsCollector{})) - mockNetInfoCache(n, chips, colcommon.GetCacheKey(&NetworkCollector{})) - mockChipCache(n, chips, colcommon.GetCacheKey(&BaseInfoCollector{})) - mockOpticalCache(n, chips, colcommon.GetCacheKey(&OpticalCollector{})) - mockPcieCache(n, chips, colcommon.GetCacheKey(&PcieCollector{})) - mockRoceCache(n, chips, colcommon.GetCacheKey(&RoceCollector{})) - mockSioCache(n, chips, colcommon.GetCacheKey(&SioCollector{})) - - for _, c := range collectorChain { - c.UpdatePrometheus(ch, n, containerInfos, chips) - } - - t.Logf("TestUpdatePrometheus len(ch):%v", len(ch)) - convey.So(ch, convey.ShouldNotBeEmpty) - }) -} - -// TestUpdateTelegraf test UpdateTelegraf -func TestUpdateTelegraf(t *testing.T) { - n := mockNewNpuCollector() - - convey.Convey("TestUpdatePrometheus", t, func() { - - patches := gomonkey.NewPatches() - defer patches.Reset() - containerInfos := mockGetContainerNPUInfo() - chips := mockGetNPUChipList() - - mockDdrCache(n, chips, colcommon.GetCacheKey(&DdrCollector{})) - mockHbmCache(n, chips, colcommon.GetCacheKey(&HbmCollector{})) - 
mockHccsCache(n, chips, colcommon.GetCacheKey(&HccsCollector{})) - mockNetInfoCache(n, chips, colcommon.GetCacheKey(&NetworkCollector{})) - mockChipCache(n, chips, colcommon.GetCacheKey(&BaseInfoCollector{})) - mockOpticalCache(n, chips, colcommon.GetCacheKey(&OpticalCollector{})) - mockPcieCache(n, chips, colcommon.GetCacheKey(&PcieCollector{})) - mockRoceCache(n, chips, colcommon.GetCacheKey(&RoceCollector{})) - mockSioCache(n, chips, colcommon.GetCacheKey(&SioCollector{})) - fieldsMap := make(map[string]map[string]interface{}) - - for _, c := range collectorChain { - c.UpdateTelegraf(fieldsMap, n, containerInfos, chips) - } - - t.Logf("fieldsMap len(ch):%v", len(fieldsMap)) - convey.So(fieldsMap, convey.ShouldNotBeEmpty) - }) -} - -func mockRoceCache(n *colcommon.NpuCollector, chips []colcommon.HuaWeiAIChip, cacheKey string) { - localCache := sync.Map{} - for _, chip := range chips { - localCache.Store(chip.PhyId, roceCache{chip: chip, timestamp: time.Now(), - extInfo: getMainStatInfo(mockRoceInfoMap())}) - } - colcommon.UpdateCache[roceCache](n, cacheKey, &localCache) -} - -func mockRoceInfoMap() map[string]int { - return map[string]int{ - macRxMacPauseNum: 0, - macTxMacPauseNum: 0, - macRxPfcPktNum: 0, - macTxPfcPktNum: 0, - macRxBadPktNum: 0, - macTxBadPktNum: 0, - roCERxAllPktNum: 0, - roCETxAllPktNum: 0, - roCERxErrPktNum: 0, - roCETxErrPktNum: 0, - roCERxCnpPktNum: 0, - roCETxCnpPktNum: 0, - macRxBadOctNum: 0, - macTxBadOctNum: 0, - roCEUnexpectedAckNum: 0, - roCEOutOfOrderNum: 0, - roCEVerificationErrNum: 0, - roCEQpStatusErrNum: 0, - roCENewPktRtyNum: 0, - roCEEcnDBNum: 0, - macRXFcsErrPktNum: 0, - } -} - -func mockDdrCache(n *colcommon.NpuCollector, chips []colcommon.HuaWeiAIChip, cacheKey string) { - localCache := sync.Map{} - for _, chip := range chips { - localCache.Store(chip.PhyId, ddrCache{chip: chip, timestamp: time.Now(), extInfo: mockMemoryInfo()}) - } - colcommon.UpdateCache[ddrCache](n, cacheKey, &localCache) -} - -func mockHccsCache(n 
*colcommon.NpuCollector, chips []colcommon.HuaWeiAIChip, cacheKey string) { - localCache := sync.Map{} - for _, chip := range chips { - localCache.Store(chip.PhyId, hccsCache{chip: chip, timestamp: time.Now(), - hccsStat: mockHccsStaticsInfo(), hccsBW: mockHccsBWInfo()}) - } - colcommon.UpdateCache[hccsCache](n, cacheKey, &localCache) -} - -func mockHccsBWInfo() *common.HccsBandwidthInfo { - return &common.HccsBandwidthInfo{ - ProfilingTime: 0, - RxBandwidth: []float64{0, 0, 0, 0, 0, 0, 0, 0}, - TxBandwidth: []float64{0, 0, 0, 0, 0, 0, 0, 0}, - TotalRxbw: 0, - TotalTxbw: 0, - } -} - -func mockHccsStaticsInfo() *common.HccsStatisticInfo { - return &common.HccsStatisticInfo{ - TxCnt: []uint64{0, 0, 0, 0, 0, 0, 0, 0}, - RxCnt: []uint64{0, 0, 0, 0, 0, 0, 0, 0}, - CrcErrCnt: []uint64{0, 0, 0, 0, 0, 0, 0, 0}, - } -} - -func mockSioCache(n *colcommon.NpuCollector, chips []colcommon.HuaWeiAIChip, cacheKey string) { - localCache := sync.Map{} - for _, chip := range chips { - localCache.Store(chip.PhyId, sioCache{chip: chip, timestamp: time.Now(), extInfo: mockSioInfo()}) - } - colcommon.UpdateCache[sioCache](n, cacheKey, &localCache) -} - -func mockSioInfo() *common.SioCrcErrStatisticInfo { - return &common.SioCrcErrStatisticInfo{ - TxErrCnt: 0, - RxErrCnt: 0, - } -} -func mockPcieCache(n *colcommon.NpuCollector, chips []colcommon.HuaWeiAIChip, cacheKey string) { - localCache := sync.Map{} - for _, chip := range chips { - pcieInfo := mockPcieInfo() - localCache.Store(chip.PhyId, pcieCache{chip: chip, timestamp: time.Now(), extInfo: &pcieInfo}) - } - colcommon.UpdateCache[pcieCache](n, cacheKey, &localCache) -} - -func mockPcieInfo() common.PCIEBwStat { - return common.PCIEBwStat{ - PcieRxPBw: common.PcieStatValue{PcieMinBw: int32(0), PcieMaxBw: int32(0), PcieAvgBw: int32(0)}, - PcieRxNPBw: common.PcieStatValue{PcieMinBw: int32(0), PcieMaxBw: int32(0), PcieAvgBw: int32(0)}, - PcieRxCPLBw: common.PcieStatValue{PcieMinBw: int32(0), PcieMaxBw: int32(0), PcieAvgBw: int32(0)}, - 
PcieTxPBw: common.PcieStatValue{PcieMinBw: int32(0), PcieMaxBw: int32(0), PcieAvgBw: int32(0)}, - PcieTxNPBw: common.PcieStatValue{PcieMinBw: int32(0), PcieMaxBw: int32(0), PcieAvgBw: int32(0)}, - PcieTxCPLBw: common.PcieStatValue{PcieMinBw: int32(0), PcieMaxBw: int32(0), PcieAvgBw: int32(0)}, - } -} - -func mockOpticalCache(n *colcommon.NpuCollector, chips []colcommon.HuaWeiAIChip, cacheKey string) { - localCache := sync.Map{} - for _, chip := range chips { - localCache.Store(chip.PhyId, opticalCache{chip: chip, timestamp: time.Now(), - extInfo: getMainOptInfo(mockOpticalInfo())}) - } - colcommon.UpdateCache[opticalCache](n, cacheKey, &localCache) -} - -func mockOpticalInfo() map[string]string { - return map[string]string{ - txPower0: "1 mW", - txPower1: "1 mW", - txPower2: "1 mW", - txPower3: "1 mW", - rxPower0: "1 mW", - rxPower1: "1 mW", - rxPower2: "1 mW", - rxPower3: "1 mW", - voltage: "1 mV", - temperature: "50 C", - present: "1.0", - } -} - -func mockHbmCache(n *colcommon.NpuCollector, chips []colcommon.HuaWeiAIChip, cacheKey string) { - localCache := sync.Map{} - for _, chip := range chips { - localCache.Store(chip.PhyId, hbmCache{chip: chip, timestamp: time.Now(), extInfo: mockHbmAggregateInfo(), - hbmUtilization: 0}, - ) - } - colcommon.UpdateCache[hbmCache](n, cacheKey, &localCache) -} - -func mockNetInfoCache(n *colcommon.NpuCollector, chips []colcommon.HuaWeiAIChip, cacheKey string) { - localCache := sync.Map{} - for _, chip := range chips { - localCache.Store(chip.PhyId, netInfoCache{chip: chip, timestamp: time.Now(), extInfo: mockNetInfo()}) - } - colcommon.UpdateCache[netInfoCache](n, cacheKey, &localCache) -} - -func mockNetInfo() *common.NpuNetInfo { - return &common.NpuNetInfo{ - LinkStatusInfo: &common.LinkStatusInfo{LinkState: "0"}, - BandwidthInfo: &common.BandwidthInfo{RxValue: 0, TxValue: 0}, - LinkStatInfo: &common.LinkStatInfo{LinkUPNum: 0}, - LinkSpeedInfo: &common.LinkSpeedInfo{Speed: 0}, - } -} - -func mockChipCache(n 
*colcommon.NpuCollector, chips []colcommon.HuaWeiAIChip, cacheKey string) { - localCache := sync.Map{} - for _, chip := range chips { - localCache.Store(chip.PhyId, chipCache{chip: chip, timestamp: time.Now(), - HealthStatus: "Healthy", - ErrorCodes: []int64{0}, - Utilization: 0, - OverallUtilization: 0, - VectorUtilization: 0, - Temperature: 0, - Power: 0, - Voltage: 0, - AICoreCurrentFreq: 0, - NetHealthStatus: "Healthy", - DevProcessInfo: mockProcessInfo(), - }) - } - colcommon.UpdateCache[chipCache](n, cacheKey, &localCache) -} - -func mockProcessInfo() *common.DevProcessInfo { - return &common.DevProcessInfo{ - ProcNum: 1, - DevProcArray: []common.DevProcInfo{{Pid: 0, MemUsage: 0}}, - } -} - -func mockMemoryInfo() *common.MemoryInfo { - return &common.MemoryInfo{ - MemorySize: 0, - MemoryAvailable: 0, - Frequency: 0, - Utilization: 0, - } -} - -func mockHbmAggregateInfo() *common.HbmAggregateInfo { - return &common.HbmAggregateInfo{ - HbmInfo: &common.HbmInfo{ - MemorySize: 1, - Frequency: 1, - Usage: 1, - Temp: 1, - BandWidthUtilRate: 1, - }, - ECCInfo: &common.ECCInfo{ - EnableFlag: 1, - }, - } -} - -func mockNewNpuCollector() *colcommon.NpuCollector { - tc := newNpuCollectorTestCase{ - cacheTime: time.Duration(num5) * time.Second, - updateTime: time.Duration(num5) * time.Second, - deviceParser: &container.DevicesParser{}, - dmgr: &devmanager.DeviceManager{}, - } - c := colcommon.NewNpuCollector(tc.cacheTime, tc.updateTime, tc.deviceParser, tc.dmgr) - return c -} - -type newNpuCollectorTestCase struct { - cacheTime time.Duration - updateTime time.Duration - deviceParser *container.DevicesParser - dmgr *devmanager.DeviceManager -} - -func mockGetNPUChipList() []colcommon.HuaWeiAIChip { - chips := make([]colcommon.HuaWeiAIChip, 0) - for id := int32(0); id < maxChipNum; id++ { - chip := colcommon.HuaWeiAIChip{ - CardId: id, - PhyId: id, - DeviceID: id, - LogicID: id, - ChipInfo: &common.ChipInfo{ - Name: api.Ascend910, - Type: "Ascend", - Version: "V1", - }, - 
} - - chips = append(chips, chip) - } - return chips -} - -func mockGetContainerNPUInfo() map[int32]container.DevicesInfo { - containsInfo := make(map[int32]container.DevicesInfo) - for id := int32(0); id < maxChipNum; id++ { - - containerInfo := container.DevicesInfo{ - ID: strconv.Itoa(int(id)), - Name: mockContainerName, - Devices: []int{int(id)}, - } - containsInfo[id] = containerInfo - } - return containsInfo -} - -func init() { - logger.HwLogConfig = &hwlog.LogConfig{ - OnlyToStdout: true, - } - logger.InitLogger("Prometheus") - - initChain() -} - -func initChain() { - collectorChain = []colcommon.MetricsCollector{ - &HccsCollector{}, - &BaseInfoCollector{}, - &SioCollector{}, - &VersionCollector{}, - &HbmCollector{}, - &DdrCollector{}, - &VnpuCollector{}, - &PcieCollector{}, - &NetworkCollector{}, - &RoceCollector{}, - &OpticalCollector{}, - } -} - -func createChip() colcommon.HuaWeiAIChip { - return colcommon.HuaWeiAIChip{ - CardId: 0, - PhyId: 0, - DeviceID: 0, - LogicID: 0, - ChipInfo: &common.ChipInfo{ - Name: api.Ascend910, - Type: "Ascend", - Version: "V1", - }, - } -} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/common_utils.go b/mind-cluster/component/npu-exporter/collector/metrics/common_utils.go deleted file mode 100644 index 7a0697d..0000000 --- a/mind-cluster/component/npu-exporter/collector/metrics/common_utils.go +++ /dev/null @@ -1,193 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package metrics offer common utils for collector -package metrics - -import ( - "math" - "reflect" - "strconv" - "strings" - "time" - - "github.com/prometheus/client_golang/prometheus" - - "ascend-common/common-utils/hwlog" - "ascend-common/devmanager/common" - colcommon "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" - "huawei.com/npu-exporter/v6/utils" - "huawei.com/npu-exporter/v6/utils/logger" -) - -func validateNum(num float64) bool { - if num == -1 || num == math.MaxUint32 || float32(num) == math.MaxUint32 { - return false - } - - return true -} - -func doUpdateTelegrafWithValidateNum(fieldMap map[string]interface{}, desc *prometheus.Desc, - value float64, extInfo string) { - if validateNum(value) { - doUpdateTelegraf(fieldMap, desc, value, extInfo) - } -} - -func doUpdateTelegraf(fieldMap map[string]interface{}, desc *prometheus.Desc, value interface{}, extInfo string) { - fieldMap[utils.GetDescName(desc)+extInfo] = value -} - -func doUpdateMetricWithValidateNum(ch chan<- prometheus.Metric, timestamp time.Time, value float64, - cardLabel []string, desc *prometheus.Desc) { - if validateNum(value) { - doUpdateMetric(ch, timestamp, value, cardLabel, desc) - } -} -func doUpdateMetric(ch chan<- prometheus.Metric, timestamp time.Time, value interface{}, - cardLabel []string, desc *prometheus.Desc) { - var finalValue float64 - - switch value.(type) { - case int: - finalValue = float64(value.(int)) - case int32: - finalValue = float64(value.(int32)) - case int64: - finalValue = float64(value.(int64)) - case uint32: - finalValue = float64(value.(uint32)) - case uint64: - finalValue = float64(value.(uint64)) - case float32: - finalValue = float64(value.(float32)) - case float64: - finalValue = value.(float64) - default: - logger.Errorf("invalid param in function doUpdateMetric,"+ - "metrics name is (%v), 
value type is (%T),value is (%v)", utils.GetDescName(desc), value, value) - } - // collect failed, set value to -1 - if finalValue == common.FailedValue { - finalValue = common.FailedMetricValue - } - ch <- prometheus.NewMetricWithTimestamp(timestamp, - prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, finalValue, cardLabel...)) -} - -func getContainerInfoWithDefault(cNameArray []string) (containerName, namespaceValue, podNameValue string) { - if len(cNameArray) == colcommon.ContainerNameLen { - namespaceValue = cNameArray[colcommon.NameSpaceIdx] - podNameValue = cNameArray[colcommon.PodNameIdx] - containerName = cNameArray[colcommon.ConNameIdx] - } - return containerName, namespaceValue, podNameValue -} - -func geenGeneralCardLabel(chip *colcommon.HuaWeiAIChip, containerMap map[int32]container.DevicesInfo) []string { - - containerInfo := geenContainerInfo(chip, containerMap) - - containerName, namespaceValue, podNameValue := getContainerInfoWithDefault(getContainerNameArray(containerInfo)) - cardLabel := collectCardLabelValue(chip, namespaceValue, podNameValue, containerName) - return cardLabel -} - -func geenContainerInfo(chip *colcommon.HuaWeiAIChip, containerMap map[int32]container.DevicesInfo) container.DevicesInfo { - deviceID := chip.DeviceID - if chip.VDevActivityInfo != nil && chip.VDevActivityInfo.IsVirtualDev { - deviceID = int32(chip.VDevActivityInfo.VDevID) - } - containerInfo, ok := containerMap[deviceID] - if !ok { - containerInfo = container.DevicesInfo{} - } - return containerInfo -} -func collectCardLabelValue(chip *colcommon.HuaWeiAIChip, namespaceValue, podNameValue, containerName string) []string { - - return []string{strconv.FormatInt(int64(chip.DeviceID), colcommon.Base), common.GetNpuName(chip.ChipInfo), chip.VDieID, - chip.PCIeBusInfo, namespaceValue, podNameValue, containerName} -} - -func getContainerNameArray(devInfo container.DevicesInfo) []string { - if devInfo.Name == "" { - return nil - } - - return 
strings.Split(devInfo.Name, "_") -} - -func getFieldMap(fieldsMap map[string]map[string]interface{}, devTagKey int32) map[string]interface{} { - devTagKeyStr := strconv.Itoa(int(devTagKey)) - if fieldsMap[devTagKeyStr] == nil { - fieldsMap[devTagKeyStr] = make(map[string]interface{}) - } - return fieldsMap[devTagKeyStr] -} - -func handleErr(err error, domain string, logicID int32) { - if err != nil { - logErrMetricsWithLimit(domain, logicID, err) - } else { - hwlog.ResetErrCnt(domain, logicID) - } -} - -func logErrMetricsWithLimit(metric string, logicID int32, err error) { - logger.LogfWithOptions(logger.ErrorLevel, logger.LogOptions{ - Domain: metric, - ID: logicID}, - "logicID(%d),%v", logicID, err) -} - -func validateNotNilForEveryElement(objs ...interface{}) bool { - for _, v := range objs { - val := reflect.ValueOf(v) - if val.Kind() != reflect.Ptr { - return false - } - if val.IsNil() { - return false - } - } - return true -} -func logForUnSupportDevice(isSupport bool, devType string, group string, extInfo string) { - if !isSupport { - logger.Infof("devType %v does not support [%v], %v", devType, group, extInfo) - } -} - -func updateFrame[T any](cacheKey string, n *colcommon.NpuCollector, containerMap map[int32]container.DevicesInfo, - chips []colcommon.HuaWeiAIChip, callBack func(chipWithVnpu colcommon.HuaWeiAIChip, cache T, cardLabel []string)) { - - caches := colcommon.GetInfoFromCache[T](n, cacheKey) - if len(caches) == 0 { - logger.Debugf("cacheKey(%v) not found", cacheKey) - return - } - for _, chip := range chips { - cardLabel := geenGeneralCardLabel(&chip, containerMap) - cache, ok := caches[chip.PhyId] - if !ok { - logger.Warnf("cacheKey(%v) not found, chip.PhyId(%v)", cacheKey, chip.PhyId) - continue - } - - callBack(chip, cache, cardLabel) - } -} diff --git a/mind-cluster/component/npu-exporter/collector/metrics/common_utils_test.go b/mind-cluster/component/npu-exporter/collector/metrics/common_utils_test.go deleted file mode 100644 index 
9cb88bd..0000000 --- a/mind-cluster/component/npu-exporter/collector/metrics/common_utils_test.go +++ /dev/null @@ -1,165 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package metrics offer common utils for collector -package metrics - -import ( - "math" - "testing" - "time" - - "github.com/prometheus/client_golang/prometheus" - "github.com/smartystreets/goconvey/convey" - - "ascend-common/devmanager/common" - colcommon "huawei.com/npu-exporter/v6/collector/common" -) - -const ( - invalidNum = -1 - num100 = 100 -) - -// TestValidateNum test numerical verification -func TestValidateNum(t *testing.T) { - convey.Convey("TestValidateNum", t, func() { - convey.Convey("return true when the num is valid", func() { - convey.So(validateNum(0), convey.ShouldBeTrue) - convey.So(validateNum(num100), convey.ShouldBeTrue) - }) - - convey.Convey("return false when the num is invalid", func() { - convey.So(validateNum(invalidNum), convey.ShouldBeFalse) - convey.So(validateNum(math.MaxUint32), convey.ShouldBeFalse) - }) - }) -} - -// TestDoUpdateTelegraf test update telegraf -func TestDoUpdateTelegraf(t *testing.T) { - convey.Convey("TestDoUpdateTelegraf", t, func() { - fieldMap := make(map[string]interface{}) - desc := prometheus.NewDesc("test_metric", "", nil, nil) - - convey.Convey("update when num is valid", func() { - doUpdateTelegrafWithValidateNum(fieldMap, desc, num100, "_suffix") - 
convey.So(fieldMap["test_metric_suffix"], convey.ShouldEqual, num100) - }) - - convey.Convey("don't update when num is invalid", func() { - doUpdateTelegrafWithValidateNum(fieldMap, desc, -1, "_suffix") - convey.So(fieldMap, convey.ShouldBeEmpty) - }) - }) -} - -// TestDoUpdateMetric test update prometheus -func TestDoUpdateMetric(t *testing.T) { - const ( - num10 = 10 - num100 = 100 - negaNum = -5 - floatNum = 3.14 - ) - convey.Convey("TestDoUpdateMetric", t, func() { - ch := make(chan prometheus.Metric, 1) - desc := prometheus.NewDesc("test_metric", "", []string{"label"}, nil) - - convey.Convey("convert the various numeric types correctly", func() { - testCases := []struct { - input interface{} - expected float64 - }{ - {int(num10), num10}, - {int32(negaNum), negaNum}, - {uint64(num100), num100}, - {float32(floatNum), floatNum}, - } - - for _, tc := range testCases { - doUpdateMetric(ch, time.Now(), tc.input, []string{"label"}, desc) - m := <-ch - convey.So(m, convey.ShouldNotBeEmpty) - } - }) - }) -} - -// TestContainerInfo test container information processing -func TestContainerInfo(t *testing.T) { - convey.Convey("TestContainerInfo", t, func() { - convey.Convey("correctly split the array of container names", func() { - testCases := []struct { - input []string - expected []string - }{ - {[]string{"ns", "pod", "container"}, []string{"container", "ns", "pod"}}, - {[]string{"short"}, []string{"", "", ""}}, - } - - for _, tc := range testCases { - c, ns, pod := getContainerInfoWithDefault(tc.input) - convey.So([]string{c, ns, pod}, convey.ShouldResemble, tc.expected) - } - }) - }) -} - -// TestCardLabel test card label generation -func TestCardLabel(t *testing.T) { - convey.Convey("TestCardLabel", t, func() { - chip := &colcommon.HuaWeiAIChip{ - DeviceID: 0, - ChipInfo: &common.ChipInfo{Name: "1", Type: "1", Version: "1"}, - VDieID: "die1", - PCIeBusInfo: "0000:00:01.0", - } - - expected := []string{ - "0", - "1-1-1", - "die1", - "0000:00:01.0", - "test-ns", - 
"test-pod", - "test-container", - } - - convey.Convey("correctly generate an array of tags", func() { - labels := collectCardLabelValue(chip, "test-ns", "test-pod", "test-container") - convey.So(labels, convey.ShouldResemble, expected) - }) - }) -} - -// TestNilValidation test null pointer validation -func TestNilValidation(t *testing.T) { - convey.Convey("TestNilValidation", t, func() { - var nilPtr *int - val := 10 - - convey.Convey("all non null pointers should return true", func() { - convey.So(validateNotNilForEveryElement(&val), convey.ShouldBeTrue) - }) - - convey.Convey("a null pointer should return false", func() { - convey.So(validateNotNilForEveryElement(nilPtr), convey.ShouldBeFalse) - }) - - convey.Convey("non pointer types should return false", func() { - convey.So(validateNotNilForEveryElement(val), convey.ShouldBeFalse) - }) - }) -} diff --git a/mind-cluster/component/npu-exporter/collector/testdata/prometheus_metrics b/mind-cluster/component/npu-exporter/collector/testdata/prometheus_metrics deleted file mode 100644 index 8f51362..0000000 --- a/mind-cluster/component/npu-exporter/collector/testdata/prometheus_metrics +++ /dev/null @@ -1,166 +0,0 @@ -# HELP machine_npu_nums Amount of npu installed on the machine. 
-# TYPE machine_npu_nums gauge -machine_npu_nums 8 -# HELP npu_chip_info_aicore_current_freq the npu ai core current frequency, unit is 'MHz' -# TYPE npu_chip_info_aicore_current_freq gauge -npu_chip_info_aicore_current_freq{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_aicore_current_freq{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_aicore_current_freq{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_aicore_current_freq{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_aicore_current_freq{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_aicore_current_freq{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_aicore_current_freq{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_aicore_current_freq{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -# HELP npu_chip_info_bandwidth_rx the npu interface receive speed, unit is 'MB/s' -# TYPE npu_chip_info_bandwidth_rx gauge -npu_chip_info_bandwidth_rx{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_bandwidth_rx{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 
-npu_chip_info_bandwidth_rx{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_bandwidth_rx{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_bandwidth_rx{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_bandwidth_rx{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_bandwidth_rx{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_bandwidth_rx{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -# HELP npu_chip_info_bandwidth_tx the npu interface transport speed, unit is 'MB/s' -# TYPE npu_chip_info_bandwidth_tx gauge -npu_chip_info_bandwidth_tx{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_bandwidth_tx{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_bandwidth_tx{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_bandwidth_tx{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_bandwidth_tx{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_bandwidth_tx{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 
-npu_chip_info_bandwidth_tx{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_bandwidth_tx{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -# HELP npu_chip_info_error_code the npu error code -# TYPE npu_chip_info_error_code gauge -npu_chip_info_error_code{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_error_code{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_error_code{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_error_code{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_error_code{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_error_code{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_error_code{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_error_code{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -# HELP npu_chip_info_hbm_total_memory the npu hbm total memory -# TYPE npu_chip_info_hbm_total_memory gauge -npu_chip_info_hbm_total_memory{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_hbm_total_memory{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 
-npu_chip_info_hbm_total_memory{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_hbm_total_memory{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_hbm_total_memory{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_hbm_total_memory{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_hbm_total_memory{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_hbm_total_memory{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -# HELP npu_chip_info_hbm_used_memory the npu hbm used memory -# TYPE npu_chip_info_hbm_used_memory gauge -npu_chip_info_hbm_used_memory{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_hbm_used_memory{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_hbm_used_memory{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_hbm_used_memory{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_hbm_used_memory{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_hbm_used_memory{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 
-npu_chip_info_hbm_used_memory{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_hbm_used_memory{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -# HELP npu_chip_info_health_status the npu health status -# TYPE npu_chip_info_health_status gauge -npu_chip_info_health_status{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 -npu_chip_info_health_status{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 -npu_chip_info_health_status{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 -npu_chip_info_health_status{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 -npu_chip_info_health_status{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 -npu_chip_info_health_status{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 -npu_chip_info_health_status{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 -npu_chip_info_health_status{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 -# HELP npu_chip_info_link_status the npu link status -# TYPE npu_chip_info_link_status gauge -npu_chip_info_link_status{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_link_status{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 
1606402000 -npu_chip_info_link_status{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_link_status{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_link_status{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_link_status{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_link_status{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_link_status{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -# HELP npu_chip_info_name the Ascend npu name with value '1' -# TYPE npu_chip_info_name gauge -npu_chip_info_name{container_name="",id="0",name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 -npu_chip_info_name{container_name="",id="1",name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 -npu_chip_info_name{container_name="",id="2",name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 -npu_chip_info_name{container_name="",id="3",name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 -npu_chip_info_name{container_name="",id="4",name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 -npu_chip_info_name{container_name="",id="5",name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 -npu_chip_info_name{container_name="",id="6",name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 
-npu_chip_info_name{container_name="",id="7",name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 1 1606402000 -# HELP npu_chip_info_network_status the npu network health status -# TYPE npu_chip_info_network_status gauge -npu_chip_info_network_status{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_network_status{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_network_status{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_network_status{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_network_status{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_network_status{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_network_status{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_network_status{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -# HELP npu_chip_info_power the npu power -# TYPE npu_chip_info_power gauge -npu_chip_info_power{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_power{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_power{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 
-npu_chip_info_power{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_power{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_power{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_power{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_power{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -# HELP npu_chip_info_temperature the npu temperature -# TYPE npu_chip_info_temperature gauge -npu_chip_info_temperature{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_temperature{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_temperature{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_temperature{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_temperature{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_temperature{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_temperature{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_temperature{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 
1606402000 -# HELP npu_chip_info_total_memory the npu total memory -# TYPE npu_chip_info_total_memory gauge -npu_chip_info_total_memory{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_total_memory{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_total_memory{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_total_memory{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_total_memory{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_total_memory{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_total_memory{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_total_memory{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -# HELP npu_chip_info_used_memory the npu used memory -# TYPE npu_chip_info_used_memory gauge -npu_chip_info_used_memory{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_used_memory{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_used_memory{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_used_memory{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 
-npu_chip_info_used_memory{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_used_memory{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_used_memory{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_used_memory{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -# HELP npu_chip_info_utilization the ai core utilization -# TYPE npu_chip_info_utilization gauge -npu_chip_info_utilization{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_utilization{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_utilization{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_utilization{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_utilization{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_utilization{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_utilization{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_utilization{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -# HELP npu_chip_info_voltage the npu voltage -# TYPE npu_chip_info_voltage gauge 
-npu_chip_info_voltage{container_name="",id="0",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_voltage{container_name="",id="1",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_voltage{container_name="",id="2",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_voltage{container_name="",id="3",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_voltage{container_name="",id="4",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_voltage{container_name="",id="5",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_voltage{container_name="",id="6",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -npu_chip_info_voltage{container_name="",id="7",model_name="910Awn-Ascend-V1",namespace="",pcie_bus_info="",pod_name="",vdie_id=""} 0 1606402000 -# HELP npu_exporter_version_info exporter version with value '1' -# TYPE npu_exporter_version_info gauge -npu_exporter_version_info{exporterVersion=""} 1 diff --git a/mind-cluster/component/npu-exporter/collector/testdata/prometheus_metrics2 b/mind-cluster/component/npu-exporter/collector/testdata/prometheus_metrics2 deleted file mode 100644 index bd501ee..0000000 --- a/mind-cluster/component/npu-exporter/collector/testdata/prometheus_metrics2 +++ /dev/null @@ -1,6 +0,0 @@ -# HELP machine_npu_nums Amount of npu installed on the machine. 
-# TYPE machine_npu_nums gauge -machine_npu_nums 0 -# HELP npu_exporter_version_info exporter version with value '1' -# TYPE npu_exporter_version_info gauge -npu_exporter_version_info{exporterVersion=""} 1 diff --git a/mind-cluster/component/npu-exporter/go.mod b/mind-cluster/component/npu-exporter/go.mod deleted file mode 100644 index 0d84960..0000000 --- a/mind-cluster/component/npu-exporter/go.mod +++ /dev/null @@ -1,63 +0,0 @@ -module huawei.com/npu-exporter/v6 - -go 1.18 - -require ( - ascend-common v0.0.0 - github.com/agiledragon/gomonkey/v2 v2.8.0 - github.com/golang/protobuf v1.5.3 - github.com/influxdata/telegraf v1.26.3 - github.com/prometheus/client_golang v1.15.0 - github.com/smartystreets/goconvey v1.6.4 - github.com/stretchr/testify v1.8.2 - google.golang.org/grpc v1.57.2 - google.golang.org/protobuf v1.30.0 - k8s.io/cri-api v0.25.13 -) - -require ( - github.com/BurntSushi/toml v1.2.1 // indirect - github.com/alecthomas/participle v0.4.1 // indirect - github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137 // indirect - github.com/awnumar/memcall v0.1.2 // indirect - github.com/awnumar/memguard v0.22.3 // indirect - github.com/benbjohnson/clock v1.3.3 // indirect - github.com/beorn7/perks v1.0.1 // indirect - github.com/blues/jsonata-go v1.5.4 // indirect - github.com/cespare/xxhash/v2 v2.2.0 // indirect - github.com/coreos/go-semver v0.3.1 // indirect - github.com/davecgh/go-spew v1.1.1 // indirect - github.com/fatih/color v1.15.0 // indirect - github.com/fsnotify/fsnotify v1.6.0 // indirect - github.com/gobwas/glob v0.2.3 // indirect - github.com/gogo/protobuf v1.3.2 // indirect - github.com/golang/snappy v0.0.4 // indirect - github.com/google/go-cmp v0.6.0 // indirect - github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 // indirect - github.com/gosnmp/gosnmp v1.35.0 // indirect - github.com/influxdata/toml v0.0.0-20190415235208-270119a8ce65 // indirect - github.com/jtolds/gls v4.20.0+incompatible // indirect - 
github.com/mattn/go-colorable v0.1.13 // indirect - github.com/mattn/go-isatty v0.0.17 // indirect - github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect - github.com/naoina/go-stringutil v0.1.0 // indirect - github.com/philhofer/fwd v1.1.2 // indirect - github.com/pmezard/go-difflib v1.0.0 // indirect - github.com/prometheus/client_model v0.3.0 // indirect - github.com/prometheus/common v0.42.0 // indirect - github.com/prometheus/procfs v0.9.0 // indirect - github.com/prometheus/prometheus v0.42.0 // indirect - github.com/rogpeppe/go-internal v1.11.0 // indirect - github.com/sleepinggenius2/gosmi v0.4.4 // indirect - github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d // indirect - github.com/tinylib/msgp v1.1.8 // indirect - golang.org/x/crypto v0.31.0 // indirect - golang.org/x/net v0.25.0 // indirect - golang.org/x/sys v0.28.0 // indirect - golang.org/x/text v0.21.0 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20230525234030-28d5490b6b19 // indirect - gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/apimachinery v0.26.2 // indirect -) - -replace ascend-common => ../ascend-common diff --git a/mind-cluster/component/npu-exporter/go.sum b/mind-cluster/component/npu-exporter/go.sum deleted file mode 100644 index d638dd1..0000000 --- a/mind-cluster/component/npu-exporter/go.sum +++ /dev/null @@ -1,561 +0,0 @@ -cloud.google.com/go v0.110.1 h1:oDJ19Fu9TX9Xs06iyCw4yifSqZ7JQ8BeuVHcTmWQlOA= -cloud.google.com/go/bigquery v1.51.1 h1:qI/8vkBbzLkv0BJmzE7ajA6uZqQC+C31MAwgb+vJe2U= -cloud.google.com/go/compute v1.19.1 h1:am86mquDUgjGNWxiGn+5PGLbmgiWXlE/yNWpIpNvuXY= -cloud.google.com/go/compute/metadata v0.2.3 h1:mg4jlk7mCAj6xXp9UJ4fjI9VUI5rubuGBW5aJ7UnBMY= -cloud.google.com/go/iam v1.0.0 h1:hlQJMovyJJwYjZcTohUH4o1L8Z8kYz+E+W/zktiLCBc= -cloud.google.com/go/monitoring v1.13.0 h1:2qsrgXGVoRXpP7otZ14eE1I568zAa92sJSDPyOJvwjM= -cloud.google.com/go/pubsub v1.30.1 h1:RdzTlwhswvROjPIoTfnSJ9tEp0LY2S5ATX90anOw7E8= 
-cloud.google.com/go/storage v1.29.0 h1:6weCgzRvMg7lzuUurI4697AqIRPU1SvzHhynwpW31jI= -code.cloudfoundry.org/clock v1.0.0 h1:kFXWQM4bxYvdBw2X8BbBeXwQNgfoWv1vqAk2ZZyBN2o= -collectd.org v0.5.0 h1:y4uFSAuOmeVhG3GCRa3/oH+ysePfO/+eGJNfd0Qa3d8= -github.com/Azure/azure-amqp-common-go/v4 v4.0.0 h1:mV5O74KYmonn0ZXtwfMjGUtZ9Z+Hv7AIFVS1s03sRvo= -github.com/Azure/azure-event-hubs-go/v3 v3.4.0 h1:LtH0nHkXivyV/GajOu5ZFC5sb/5KZ8j+9U8UsfHVTOo= -github.com/Azure/azure-kusto-go v0.8.0 h1:AeO6VBRGzB1BhmWeheSyN+WSrx+1wmhHm47vzptitdw= -github.com/Azure/azure-pipeline-go v0.2.3 h1:7U9HBg1JFK3jHl5qmo4CTZKFTVgMwdFHMVtCdfBE21U= -github.com/Azure/azure-sdk-for-go v65.0.0+incompatible h1:HzKLt3kIwMm4KeJYTdx9EbjRYTySD/t8i1Ee/W5EGXw= -github.com/Azure/azure-sdk-for-go/sdk/azcore v0.21.1 h1:qoVeMsc9/fh/yhxVaA0obYjVH/oI/ihrOoMwsLS9KSA= -github.com/Azure/azure-sdk-for-go/sdk/azidentity v0.13.2 h1:mM/yraAumqMMIYev6zX0oxHqX6hreUs5wXf76W47r38= -github.com/Azure/azure-sdk-for-go/sdk/internal v0.9.1 h1:sLZ/Y+P/5RRtsXWylBjB5lkgixYfm0MQPiwrSX//JSo= -github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/monitor/armmonitor v0.4.1 h1:P6UDRqlbywdpvhpVZeiB5p+DuhMTrVD4xfvPW55bs8M= -github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources v0.3.1 h1:EXTDtCSTfPauGawsG+Ae/W46B1PkrgzuKNrcFqy4ljM= -github.com/Azure/azure-sdk-for-go/sdk/storage/azblob v0.3.0 h1:Px2UA+2RvSSvv+RvJNuUB6n7rs5Wsel4dXLe90Um2n4= -github.com/Azure/azure-storage-blob-go v0.15.0 h1:rXtgp8tN1p29GvpGgfJetavIG0V7OgcSXPpwp3tx6qk= -github.com/Azure/azure-storage-queue-go v0.0.0-20191125232315-636801874cdd h1:b3wyxBl3vvr15tUAziPBPK354y+LSdfPCpex5oBttHo= -github.com/Azure/go-amqp v0.18.0 h1:95bTiJq0oxjK1RUlt5T3HF/THj6jWTRZpSXMPSOJLz8= -github.com/Azure/go-ansiterm v0.0.0-20210617225240-d185dfc1b5a1 h1:UQHMgLO+TxOElx5B5HZ4hJQsoJ/PvUvKRhJHDQXO8P8= -github.com/Azure/go-autorest v14.2.0+incompatible h1:V5VMDjClD3GiElqLWO7mz2MxNAK/vTfRHdAubSIPRgs= -github.com/Azure/go-autorest/autorest v0.11.28 
h1:ndAExarwr5Y+GaHE6VCaY1kyS/HwwGGyuimVhWsHOEM= -github.com/Azure/go-autorest/autorest/adal v0.9.23 h1:Yepx8CvFxwNKpH6ja7RZ+sKX+DWYNldbLiALMC3BTz8= -github.com/Azure/go-autorest/autorest/azure/auth v0.5.12 h1:wkAZRgT/pn8HhFyzfe9UnqOjJYqlembgCTi72Bm/xKk= -github.com/Azure/go-autorest/autorest/azure/cli v0.4.5 h1:0W/yGmFdTIT77fvdlGZ0LMISoLHFJ7Tx4U0yeB+uFs4= -github.com/Azure/go-autorest/autorest/date v0.3.0 h1:7gUk1U5M/CQbp9WoqinNzJar+8KY+LPI6wiWrP/myHw= -github.com/Azure/go-autorest/autorest/to v0.4.0 h1:oXVqrxakqqV1UZdSazDOPOLvOIz+XA683u8EctwboHk= -github.com/Azure/go-autorest/autorest/validation v0.3.1 h1:AgyqjAd94fwNAoTjl/WQXg4VvFeRFpO+UhNyRXqF1ac= -github.com/Azure/go-autorest/logger v0.2.1 h1:IG7i4p/mDa2Ce4TRyAO8IHnVhAVF3RFU+ZtXWSmf4Tg= -github.com/Azure/go-autorest/tracing v0.6.0 h1:TYi4+3m5t6K48TGI9AUdb+IzbnSxvnvUMfuitfgcfuo= -github.com/Azure/go-ntlmssp v0.0.0-20220621081337-cb9428e4ac1e h1:NeAW1fUYUEWhft7pkxDf6WoUvEZJ/uOKsvtpjLnn8MU= -github.com/AzureAD/microsoft-authentication-library-for-go v0.4.0 h1:WVsrXCnHlDDX8ls+tootqRE87/hL9S/g4ewig9RsD/c= -github.com/BurntSushi/toml v1.2.1 h1:9F2/+DoOYIOksmaJFPw1tGFy1eDnIJXg+UHjuD8lTak= -github.com/BurntSushi/toml v1.2.1/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ= -github.com/ClickHouse/clickhouse-go v1.5.4 h1:cKjXeYLNWVJIx2J1K6H2CqyRmfwVJVY1OV1coaaFcI0= -github.com/Masterminds/goutils v1.1.1 h1:5nUrii3FMTL5diU80unEVvNevw1nH4+ZV4DSLVJLSYI= -github.com/Masterminds/semver v1.5.0 h1:H65muMkzWKEuNDnfl9d70GUjFniHKHRbFPGBuZ3QEww= -github.com/Masterminds/sprig v2.22.0+incompatible h1:z4yfnGrZ7netVz+0EDJ0Wi+5VZCSYp4Z0m2dk6cEM60= -github.com/Mellanox/rdmamap v0.0.0-20191106181932-7c3c4763a6ee h1:atI/FFjXh6hIVlPE1Jup9m8N4B9q/OSbMUe2EBahs+w= -github.com/Microsoft/go-winio v0.6.0 h1:slsWYD/zyx7lCXoZVlvQrj0hPTM1HI4+v1sIda2yDvg= -github.com/Shopify/sarama v1.38.1 h1:lqqPUPQZ7zPqYlWpTh+LQ9bhYNu2xJL6k1SJN4WVe2A= -github.com/aerospike/aerospike-client-go/v5 v5.11.0 h1:z3ZmDSm3I10VMXXIIrsFCFq3IenwFqTCnLNyvnFVzrk= 
-github.com/agiledragon/gomonkey/v2 v2.8.0 h1:u2K2nNGyk0ippzklz1CWalllEB9ptD+DtSXeCX5O000= -github.com/agiledragon/gomonkey/v2 v2.8.0/go.mod h1:ap1AmDzcVOAz1YpeJ3TCzIgstoaWLA6jbbgxfB4w2iY= -github.com/alecthomas/go-thrift v0.0.0-20170109061633-7914173639b2/go.mod h1:CxCgO+NdpMdi9SsTlGbc0W+/UNxO3I0AabOEJZ3w61w= -github.com/alecthomas/kong v0.2.1/go.mod h1:+inYUSluD+p4L8KdviBSgzcqEjUQOfC5fQDRFuc36lI= -github.com/alecthomas/participle v0.4.1 h1:P2PJWzwrSpuCWXKnzqvw0b0phSfH1kJo4p2HvLynVsI= -github.com/alecthomas/participle v0.4.1/go.mod h1:T8u4bQOSMwrkTWOSyt8/jSFPEnRtd0FKFMjVfYBlqPs= -github.com/alecthomas/repr v0.0.0-20181024024818-d37bc2a10ba1/go.mod h1:xTS7Pm1pD1mvyM075QCDSRqH6qRLXylzS24ZTpRiSzQ= -github.com/alecthomas/repr v0.0.0-20210301060118-828286944d6a/go.mod h1:2kn6fqh/zIyPLmm3ugklbEi5hg5wS435eygvNfaDQL8= -github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= -github.com/alecthomas/units v0.0.0-20190924025748-f65c72e2690d/go.mod h1:rBZYJk541a8SKzHPHnH3zbiI+7dagKZ0cgpgrD7Fyho= -github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137 h1:s6gZFSlWYmbqAuRjVTiNNhvNRfY2Wxp9nhfyel4rklc= -github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137/go.mod h1:OMCwj8VM1Kc9e19TLln2VL61YJF0x1XFtfdL4JdbSyE= -github.com/aliyun/alibaba-cloud-sdk-go v1.62.193 h1:Cwd5cNwrQqtOzOJ1vqswYe3amU3vOz3v0wQF8WizmXI= -github.com/amir/raidman v0.0.0-20170415203553-1ccc43bfb9c9 h1:FXrPTd8Rdlc94dKccl7KPmdmIbVh/OjelJ8/vgMRzcQ= -github.com/andybalholm/brotli v1.0.5 h1:8uQZIdzKmjc/iuPu7O2ioW48L81FgatrcpfFmiq/cCs= -github.com/antchfx/jsonquery v1.3.1 h1:kh3599hMLpygvcxoENcj99eCvnS++JjRX10LjNYhK58= -github.com/antchfx/xmlquery v1.3.15 h1:aJConNMi1sMha5G8YJoAIF5P+H+qG1L73bSItWHo8Tw= -github.com/antchfx/xpath v1.2.5-0.20230505064641-588960cceeac h1:Et7H7mEPWuivbFEXi3dWa8hobnvF380TS2mq7JmgjEI= -github.com/apache/arrow/go/arrow v0.0.0-20211112161151-bc219186db40 h1:q4dksr6ICHXqG5hm0ZW5IHyeEJXoIJSOZeBLmWPNeIQ= 
-github.com/apache/arrow/go/v12 v12.0.0 h1:xtZE63VWl7qLdB0JObIXvvhGjoVNrQ9ciIHG2OK5cmc= -github.com/apache/iotdb-client-go v0.12.2-0.20220722111104-cd17da295b46 h1:28HyUQcr8ZCyCAatR0gkf9PuLr52U2T+66tx5Th0nxI= -github.com/apache/thrift v0.18.1 h1:lNhK/1nqjbwbiOPDBPFJVKxgDEGSepKuTh6OLiXW8kg= -github.com/aristanetworks/glog v0.0.0-20191112221043-67e8567f59f3 h1:Bmjk+DjIi3tTAU0wxGaFbfjGUqlxxSXARq9A96Kgoos= -github.com/aristanetworks/goarista v0.0.0-20190325233358-a123909ec740 h1:FD4/ikKOFxwP8muWDypbmBWc634+YcAs3eBrYAmRdZY= -github.com/armon/go-metrics v0.4.1 h1:hR91U9KYmb6bLBYLQjyM+3j+rcd/UhE+G78SFnF8gJA= -github.com/awnumar/memcall v0.1.2 h1:7gOfDTL+BJ6nnbtAp9+HQzUFjtP1hEseRQq8eP055QY= -github.com/awnumar/memcall v0.1.2/go.mod h1:S911igBPR9CThzd/hYQQmTc9SWNu3ZHIlCGaWsWsoJo= -github.com/awnumar/memguard v0.22.3 h1:b4sgUXtbUjhrGELPbuC62wU+BsPQy+8lkWed9Z+pj0Y= -github.com/awnumar/memguard v0.22.3/go.mod h1:mmGunnffnLHlxE5rRgQc3j+uwPZ27eYb61ccr8Clz2Y= -github.com/aws/aws-sdk-go-v2 v1.18.0 h1:882kkTpSFhdgYRKVZ/VCgf7sd0ru57p2JCxz4/oN5RY= -github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.4.10 h1:dK82zF6kkPeCo8J1e+tGx4JdvDIQzj7ygIoLg8WMuGs= -github.com/aws/aws-sdk-go-v2/config v1.18.8 h1:lDpy0WM8AHsywOnVrOHaSMfpaiV2igOw8D7svkFkXVA= -github.com/aws/aws-sdk-go-v2/credentials v1.13.20 h1:oZCEFcrMppP/CNiS8myzv9JgOzq2s0d3v3MXYil/mxQ= -github.com/aws/aws-sdk-go-v2/feature/dynamodb/attributevalue v1.2.0 h1:8kvinmbIDObqsWegKP0JjeanYPiA4GUVpAtciNWE+jw= -github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.13.2 h1:jOzQAesnBFDmz93feqKnsTHsXrlwWORNZMFHMV+WLFU= -github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.7.1 h1:p9Dys1g2YdaqMalnp6AwCA+tpMMdJNGw5YYKP/u3sUk= -github.com/aws/aws-sdk-go-v2/internal/configsources v1.1.32 h1:dpbVNUjczQ8Ae3QKHbpHBpfvaVkRdesxpTOe9pTouhU= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.4.26 h1:QH2kOS3Ht7x+u0gHCh06CXL/h6G8LQJFpZfFBYBNboo= -github.com/aws/aws-sdk-go-v2/internal/ini v1.3.28 h1:KeTxcGdNnQudb46oOl4d90f2I33DF/c6q3RnZAmvQdQ= 
-github.com/aws/aws-sdk-go-v2/service/cloudwatch v1.25.9 h1:7jgW378oM948BxuOBarXeeaKSrRaCj7didsdeSwYGGo= -github.com/aws/aws-sdk-go-v2/service/cloudwatchlogs v1.20.9 h1:sXs+JjIwgKA27t+5O8YgXl0cmZpEmctyDVO5y6cMdqA= -github.com/aws/aws-sdk-go-v2/service/dynamodb v1.17.3 h1:2oB4ikNEMLaPtu6lbNFJyTSayBILvrOfa2VfOffcuvU= -github.com/aws/aws-sdk-go-v2/service/dynamodbstreams v1.4.0 h1:QbFWJr2SAyVYvyoOHvJU6sCGLnqNT94ZbWElJMEI1JY= -github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.9.10 h1:dpiPHgmFstgkLG07KaYAewvuptq5kvo52xn7tVSrtrQ= -github.com/aws/aws-sdk-go-v2/service/internal/endpoint-discovery v1.7.23 h1:5AwQnYQT3ZX/N7hPTAx4ClWyucaiqr2esQRMNbJIby0= -github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.9.26 h1:uUt4XctZLhl9wBE1L8lobU3bVN8SNUP7T+olb0bWBO4= -github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.9.0 h1:0BOlTqnNnrEO04oYKzDxMMe68t107pmIotn18HtVonY= -github.com/aws/aws-sdk-go-v2/service/kinesis v1.17.8 h1:9Kk24woetm1Tm4cAZNoJStJW1VQAeh92lLD9XZ4176g= -github.com/aws/aws-sdk-go-v2/service/s3 v1.19.0 h1:5mRAms4TjSTOGYsqKYte5kHr1PzpMJSyLThjF3J+hw0= -github.com/aws/aws-sdk-go-v2/service/sso v1.12.8 h1:5cb3D6xb006bPTqEfCNaEA6PPEfBXxxy4NNeX/44kGk= -github.com/aws/aws-sdk-go-v2/service/ssooidc v1.14.8 h1:NZaj0ngZMzsubWZbrEFSB4rgSQRbFq38Sd6KBxHuOIU= -github.com/aws/aws-sdk-go-v2/service/sts v1.18.9 h1:Qf1aWwnsNkyAoqDqmdM3nHwN78XQjec27LjM6b9vyfI= -github.com/aws/aws-sdk-go-v2/service/timestreamwrite v1.16.0 h1:HHVOprdnZxhM6F5JgljW8nCklfwUyOlbd/wuca6vORA= -github.com/aws/smithy-go v1.13.5 h1:hgz0X/DX0dGqTYpGALqXJoRKRj5oQ7150i5FdTePzO8= -github.com/awslabs/kinesis-aggregation/go v0.0.0-20210630091500-54e17340d32f h1:Pf0BjJDga7C98f0vhw+Ip5EaiE07S3lTKpIYPNS0nMo= -github.com/benbjohnson/clock v1.3.3 h1:g+rSsSaAzhHJYcIQE78hJ3AhyjjtQvleKDjlhdBnIhc= -github.com/benbjohnson/clock v1.3.3/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= -github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= -github.com/beorn7/perks 
v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= -github.com/blues/jsonata-go v1.5.4 h1:XCsXaVVMrt4lcpKeJw6mNJHqQpWU751cnHdCFUq3xd8= -github.com/blues/jsonata-go v1.5.4/go.mod h1:uns2jymDrnI7y+UFYCqsRTEiAH22GyHnNXrkupAVFWI= -github.com/bmatcuk/doublestar/v3 v3.0.0 h1:TQtVPlDnAYwcrVNB2JiGuMc++H5qzWZd9PhkNo5WyHI= -github.com/bufbuild/protocompile v0.4.0 h1:LbFKd2XowZvQ/kajzguUp2DC9UEIQhIq77fZZlaQsNA= -github.com/caio/go-tdigest v3.1.0+incompatible h1:uoVMJ3Q5lXmVLCCqaMGHLBWnbGoN6Lpu7OAUPR60cds= -github.com/cenkalti/backoff v2.2.1+incompatible h1:tNowT99t7UNflLxfYYSlKYsBpXdEet03Pg2g16Swow4= -github.com/cenkalti/backoff/v4 v4.2.0 h1:HN5dHm3WBOgndBH6E8V0q2jIYIR3s9yglV8k/+MN3u4= -github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= -github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/cisco-ie/nx-telemetry-proto v0.0.0-20230117155933-f64c045c77df h1:GmrltUp5Qf5XhT+LmqMDizsgm/6VHTSxPWRdrq21yRo= -github.com/cloudflare/golz4 v0.0.0-20150217214814-ef862a3cdc58 h1:F1EaeKL/ta07PY/k9Os/UFtwERei2/XzGemhpGnBKNg= -github.com/containerd/containerd v1.6.18 h1:qZbsLvmyu+Vlty0/Ex5xc0z2YtKpIsb5n45mAMI+2Ns= -github.com/coocood/freecache v1.2.3 h1:lcBwpZrwBZRZyLk/8EMyQVXRiFl663cCuMOrjCALeto= -github.com/coreos/go-semver v0.3.1 h1:yi21YpKnrx1gt5R+la8n5WgS0kCrsPp33dmEyHReZr4= -github.com/coreos/go-semver v0.3.1/go.mod h1:irMmmIw/7yzSRPWryHsK7EYSg09caPQL03VsM8rvUec= -github.com/couchbase/go-couchbase v0.1.1 h1:ClFXELcKj/ojyoTYbsY34QUrrYCBi/1G749sXSCkdhk= -github.com/couchbase/gomemcached v0.1.3 h1:HIc5qMYNbuhB7zNaiEtj61DCYkquAwrQlf64q7JzdEY= -github.com/couchbase/goutils v0.1.0 h1:0WLlKJilu7IBm98T8nS9+J36lBFVLRUSIUtyD/uWpAE= -github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= -github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 
-github.com/denisenkom/go-mssqldb v0.12.3 h1:pBSGx9Tq67pBOTLmxNuirNTeB8Vjmf886Kx+8Y+8shw= -github.com/devigned/tab v0.1.1 h1:3mD6Kb1mUOYeLpJvTVSDwSg5ZsfSxfvxGRTxRsJsITA= -github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78= -github.com/digitalocean/go-libvirt v0.0.0-20220811165305-15feff002086 h1:FTREXo+EVmU9nOCaQ46PvH0hs1Rt2/diCoTAtxzDxrA= -github.com/dimchansky/utfbom v1.1.1 h1:vV6w1AhK4VMnhBno/TPVCoK9U/LP0PkLCS9tbxHdi/U= -github.com/djherbis/times v1.5.0 h1:79myA211VwPhFTqUk8xehWrsEO+zcIZj0zT8mXPVARU= -github.com/docker/distribution v2.8.2+incompatible h1:T3de5rq0dB1j30rp0sA2rER+m322EBzniBPB6ZIzuh8= -github.com/docker/docker v23.0.4+incompatible h1:Kd3Bh9V/rO+XpTP/BLqM+gx8z7+Yb0AA2Ibj+nNo4ek= -github.com/docker/go-connections v0.4.0 h1:El9xVISelRB7BuFusrZozjnkIM5YnzCViNKohAFqRJQ= -github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= -github.com/doclambda/protobufquery v0.0.0-20220727165953-0da287796ee9 h1:677nbAF3nq56BEZ2R/VMl0wROQqJo4vJ/ZWuzm+vsUU= -github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= -github.com/dynatrace-oss/dynatrace-metric-utils-go v0.5.0 h1:wHGPJSXvwKQVf/XfhjUPyrhpcPKWNy8F3ikH+eiwoBg= -github.com/eapache/go-resiliency v1.3.0 h1:RRL0nge+cWGlxXbUzJ7yMcq6w2XBEr19dCN6HECGaT0= -github.com/eapache/go-xerial-snappy v0.0.0-20230111030713-bf00bc1b83b6 h1:8yY/I9ndfrgrXUbOGObLHKBR4Fl3nZXwM2c7OYTT8hM= -github.com/eapache/queue v1.1.0 h1:YOEu7KNc61ntiQlcEeUIoDTJ2o8mQznoNvUhiigpIqc= -github.com/eclipse/paho.golang v0.10.0 h1:oUGPjRwWcZQRgDD9wVDV7y7i7yBSxts3vcvcNJo8B4Q= -github.com/eclipse/paho.mqtt.golang v1.4.2 h1:66wOzfUHSSI1zamx7jR6yMEI5EuHnT1G6rNA5PM12m4= -github.com/emicklei/go-restful/v3 v3.10.1 h1:rc42Y5YTp7Am7CS630D7JmhRjq4UlEUuEKfrDac4bSQ= -github.com/fatih/color v1.15.0 h1:kOqh6YHBtK8aywxGerMG2Eq3H6Qgoqeo13Bk2Mv/nBs= -github.com/fatih/color v1.15.0/go.mod h1:0h5ZqXfHYED7Bhv2ZJamyIOUej9KtShiJESRwBDUSsw= 
-github.com/form3tech-oss/jwt-go v3.2.5+incompatible h1:/l4kBbb4/vGSsdtB5nUe8L7B9mImVMaBPw9L/0TBHU8= -github.com/fsnotify/fsnotify v1.6.0 h1:n+5WquG0fcWoWp6xPWfHdbskMCQaFnG6PfBrh1Ky4HY= -github.com/fsnotify/fsnotify v1.6.0/go.mod h1:sl3t1tCWJFWoRz9R8WJCbQihKKwmorjAbSClcnxKAGw= -github.com/gabriel-vasile/mimetype v1.4.0 h1:Cn9dkdYsMIu56tGho+fqzh7XmvY2YyGU0FnbhiOsEro= -github.com/go-asn1-ber/asn1-ber v1.5.4 h1:vXT6d/FNDiELJnLb6hGNa309LMsrCoYFvpwHDF0+Y1A= -github.com/go-ldap/ldap/v3 v3.4.4 h1:qPjipEpt+qDa6SI/h1fzuGWoRUY+qqQ9sOZq67/PYUs= -github.com/go-logfmt/logfmt v0.6.0 h1:wGYYu3uicYdqXVgoYbvnkrPVXkuLM1p1ifugDMEdRi4= -github.com/go-logr/logr v1.2.3 h1:2DntVwHkVopvECVRSlL5PSo9eG+cAkDCuckLubN+rq0= -github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= -github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= -github.com/go-openapi/jsonpointer v0.19.6 h1:eCs3fxoIi3Wh6vtgmLTOjdhSpiqphQ+DaPn38N2ZdrE= -github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE= -github.com/go-openapi/swag v0.22.3 h1:yMBqmnQ0gyZvEb/+KzuWZOXgllrXT4SADYbvDaXHv/g= -github.com/go-redis/redis/v7 v7.4.1 h1:PASvf36gyUpr2zdOUS/9Zqc80GbM+9BDyiJSJDDOrTI= -github.com/go-redis/redis/v8 v8.11.5 h1:AcZZR7igkdvfVmQTPnu9WE37LRrO/YrBH5zWyjDC0oI= -github.com/go-sql-driver/mysql v1.6.0 h1:BCTh4TKNUYmOmMUcQ3IipzF5prigylS7XXjEkfCHuOE= -github.com/go-stack/stack v1.8.1 h1:ntEHSVwIt7PNXNpgPmVfMrNhLtgjlmnZha2kOpuRiDw= -github.com/go-stomp/stomp v2.1.4+incompatible h1:D3SheUVDOz9RsjVWkoh/1iCOwD0qWjyeTZMUZ0EXg2Y= -github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y= -github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8= -github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU= -github.com/gofrs/uuid v4.2.0+incompatible h1:yyYWMnhkhrKwwr8gAOcOCYxOOscHgDS9yZgBrnJfGa0= -github.com/gofrs/uuid/v5 v5.0.0 h1:p544++a97kEL+svbcFbCQVM9KFu0Yo25UoISXGNNH9M= 
-github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= -github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= -github.com/golang-jwt/jwt v3.2.1+incompatible h1:73Z+4BJcrTC+KczS6WvTPvRGOp1WmfEP4Q1lOd9Z/+c= -github.com/golang-jwt/jwt/v4 v4.5.0 h1:7cYmW1XlMY7h7ii7UhUyChSgS5wUJEnm9uZVTGqOWzg= -github.com/golang-sql/civil v0.0.0-20190719163853-cb61b32ac6fe h1:lXe2qZdvpiX5WZkZR4hgp4KJVfY3nMkvmwbVkpv1rVY= -github.com/golang-sql/sqlexp v0.1.0 h1:ZCD6MBpcuOVfGVqsEmY5/4FtYiKz6tSyUv9LPEDei6A= -github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= -github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= -github.com/golang/protobuf v1.3.5/go.mod h1:6O5/vntMXwX2lRkT1hjjk0nAC1IDOTvTlVgjlRvqsdk= -github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= -github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= -github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= -github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= -github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= -github.com/google/flatbuffers v23.3.3+incompatible h1:5PJI/WbJkaMTvpGxsHVKG/LurN/KnWXNyGpwSCDgen0= -github.com/google/gnostic v0.6.9 h1:ZK/5VhkoX835RikCHpSUJV9a+S3e1zLh59YnyWeBW+0= -github.com/google/gnxi v0.0.0-20221016143401-2aeceb5a2901 h1:xlsMG0I0F6Ou3a4zRWu3cThivTt2N2V1cZafIloTBTU= -github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= -github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= -github.com/google/go-github/v32 v32.1.0 h1:GWkQOdXqviCPx7Q7Fj+KyPoGm4SwHRh8rheoPhd27II= -github.com/google/go-querystring v1.1.0 h1:AnCroh3fv4ZBgVIf1Iwtovgjaw/GiKJo8M8yD/fhyJ8= 
-github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= -github.com/google/s2a-go v0.1.3 h1:FAgZmpLl/SXurPEZyCMPBIiiYeTbqfjlbdnCNTAkbGE= -github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= -github.com/googleapis/enterprise-certificate-proxy v0.2.3 h1:yk9/cqRKtT9wXZSsRH9aurXEpJX+U6FLtpYTdC3R06k= -github.com/googleapis/gax-go/v2 v2.8.0 h1:UBtEZqx1bjXtOQ5BVTkuYghXrr3N4V123VKJK67vJZc= -github.com/gopcua/opcua v0.3.7 h1:iGjLW3D+ztnjtZQPKsJ0nwibHyDw1m11NfqOU8KSFQ8= -github.com/gophercloud/gophercloud v1.2.0 h1:1oXyj4g54KBg/kFtCdMM6jtxSzeIyg8wv4z1HoGPp1E= -github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 h1:EGx4pi6eqNxGaHF6qqu48+N2wcFQ5qg5FXgOdqsJ5d8= -github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= -github.com/gorilla/mux v1.8.0 h1:i40aqfkR1h2SlN9hojwV5ZA91wcXFOvkdNIeFDP5koI= -github.com/gorilla/websocket v1.5.0 h1:PPwGk2jz7EePpoHN/+ClbZu8SPxiqlu12wZP/3sWmnc= -github.com/gosnmp/gosnmp v1.35.0 h1:EuWWNPxTCdAUx2/NbQcSa3WdNxjzpy4Phv57b4MWpJM= -github.com/gosnmp/gosnmp v1.35.0/go.mod h1:2AvKZ3n9aEl5TJEo/fFmf/FGO4Nj4cVeEc5yuk88CYc= -github.com/grid-x/modbus v0.0.0-20211113184042-7f2251c342c9 h1:Q7e9kXS3sRbTjsNDKazbcbDSGAKjFdk096M3qYbwNpE= -github.com/grid-x/serial v0.0.0-20211107191517-583c7356b3aa h1:Rsn6ARgNkXrsXJIzhkE4vQr5Gbx2LvtEMv4BJOK4LyU= -github.com/gwos/tcg/sdk v0.0.0-20220621192633-df0eac0a1a4c h1:pVr0TkSFnMP4BWSsEak/4bxD8/K+foJ9V8DGyZ6PIDE= -github.com/hailocab/go-hostpool v0.0.0-20160125115350-e80d13ce29ed h1:5upAirOpQc1Q53c0bnx2ufif5kANL7bfZWcc6VJWJd8= -github.com/harlow/kinesis-consumer v0.3.6-0.20211204214318-c2b9f79d7ab6 h1:38nI+nE+oUmLmlNjuByhvnmuBrcQVLNkOJhSSM4eJv0= -github.com/hashicorp/consul/api v1.20.0 h1:9IHTjNVSZ7MIwjlW3N3a7iGiykCMDpxZu8jsxFJh0yc= -github.com/hashicorp/errwrap v1.1.0 h1:OxrOeh75EUXMY8TBjag2fzXGZ40LB6IKw45YeGUDY2I= -github.com/hashicorp/go-cleanhttp v0.5.2 
h1:035FKYIWjmULyFRBKPs8TBQoi0x6d9G4xc9neXJWAZQ= -github.com/hashicorp/go-hclog v1.4.0 h1:ctuWFGrhFha8BnnzxqeRGidlEcQkDyL5u8J8t5eA11I= -github.com/hashicorp/go-immutable-radix v1.3.1 h1:DKHmCUm2hRBK510BaiZlwvpD40f8bJFeZnpfm2KLowc= -github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo= -github.com/hashicorp/go-rootcerts v1.0.2 h1:jzhAVGtqPKbwpyCPELlgNWhE1znq+qwJtW5Oi2viEzc= -github.com/hashicorp/go-uuid v1.0.3 h1:2gKiV6YVmrJ1i2CKKa9obLvRieoRGviZFL26PcT/Co8= -github.com/hashicorp/golang-lru v0.6.0 h1:uL2shRDx7RTrOrTCUZEGP/wJUFiUI8QT6E7z5o8jga4= -github.com/hashicorp/packer-plugin-sdk v0.3.1 h1:Gr/mnihsdUcPfGiruFL93BQkiFh3EFPwyxxTWkwvRsQ= -github.com/hashicorp/serf v0.10.1 h1:Z1H2J60yRKvfDYAOZLd2MU0ND4AH/WDz7xYHDWQsIPY= -github.com/huandu/xstrings v1.3.2 h1:L18LIDzqlW6xN2rEkpdV8+oL/IXWJ1APd+vsdYy4Wdw= -github.com/imdario/mergo v0.3.13 h1:lFzP57bqS/wsqKssCGmtLAb8A0wKjLGrve2q3PPVcBk= -github.com/influxdata/go-syslog/v3 v3.0.0 h1:jichmjSZlYK0VMmlz+k4WeOQd7z745YLsvGMqwtYt4I= -github.com/influxdata/influxdb-observability/common v0.3.3 h1:fzsgJKiV/bucNPRYggLE1F6UgpePQaYh72Lqj1rPEmI= -github.com/influxdata/influxdb-observability/influx2otel v0.3.3 h1:KWesgMC0sqRLfvPZXnCzJauCZ82XoHtKTFJVKmEk63M= -github.com/influxdata/influxdb-observability/otel2influx v0.3.3 h1:zdesvjHJYXccZ4vd6hP6vXwbd6YbAj7AGMhOjk9pt0k= -github.com/influxdata/line-protocol/v2 v2.2.1 h1:EAPkqJ9Km4uAxtMRgUubJyqAr6zgWM0dznKMLRauQRE= -github.com/influxdata/tail v1.0.1-0.20210707231403-b283181d1fa7 h1:0rQOs1VHLVFpAAOIR0mJEvVOIaMYFgYdreeVbgI9sII= -github.com/influxdata/telegraf v1.26.3 h1:wawD3VTdnPDbHnJ1RBGgCf0YB7vlxREZ70rvEepHdGs= -github.com/influxdata/telegraf v1.26.3/go.mod h1:w+VUZ4NRDzfhRmhEdBbbNZBNT7E8qRkLiL73j/pD0ug= -github.com/influxdata/toml v0.0.0-20190415235208-270119a8ce65 h1:vvyMtD5LTJc1W9sQKjDkAWdcg0478CszSdzlHtiAXCY= -github.com/influxdata/toml v0.0.0-20190415235208-270119a8ce65/go.mod h1:zApaNFpP/bTpQItGZNNUMISDMDAnTXu9UqJ4yT3ocz8= -github.com/influxdata/wlog 
v0.0.0-20160411224016-7c63b0a71ef8 h1:W2IgzRCb0L9VzMujq/QuTaZUKcH8096jWwP519mHN6Q= -github.com/intel/iaevents v1.1.0 h1:FzxMBfXk/apG2EUXUCfaq3gUQ+q+TgZ1HNMjjUILUGE= -github.com/jackc/chunkreader/v2 v2.0.1 h1:i+RDz65UE+mmpjTfyz0MoVTnzeYxroil2G82ki7MGG8= -github.com/jackc/pgconn v1.13.0 h1:3L1XMNV2Zvca/8BYhzcRFS70Lr0WlDg16Di6SFGAbys= -github.com/jackc/pgio v1.0.0 h1:g12B9UwVnzGhueNavwioyEEpAmqMe1E/BN9ES+8ovkE= -github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM= -github.com/jackc/pgproto3/v2 v2.3.1 h1:nwj7qwf0S+Q7ISFfBndqeLwSwxs+4DPsbRFjECT1Y4Y= -github.com/jackc/pgservicefile v0.0.0-20200714003250-2b9c44734f2b h1:C8S2+VttkHFdOOCXJe+YGfa4vHYwlt4Zx+IVXQ97jYg= -github.com/jackc/pgtype v1.12.0 h1:Dlq8Qvcch7kiehm8wPGIW0W3KsCCHJnRacKW0UM8n5w= -github.com/jackc/pgx/v4 v4.17.1 h1:tASdE79tX9LOQu3MMvioWT6YaZkf58ZhmLHhV4sv5WM= -github.com/jackc/puddle v1.3.0 h1:eHK/5clGOatcjX3oWGBO/MpxpbHzSwud5EWTSCI+MX0= -github.com/jaegertracing/jaeger v1.38.0 h1:rDQ36TnSxUX4gTskMQzEdpieS0BGYdfXXnUJmGnNMGw= -github.com/james4k/rcon v0.0.0-20120923215419-8fbb8268b60a h1:JxcWget6X/VfBMKxPIc28Jel37LGREut2fpV+ObkwJ0= -github.com/jcmturner/aescts/v2 v2.0.0 h1:9YKLH6ey7H4eDBXW8khjYslgyqG2xZikXP0EQFKrle8= -github.com/jcmturner/dnsutils/v2 v2.0.0 h1:lltnkeZGL0wILNvrNiVCR6Ro5PGU/SeBvVO/8c/iPbo= -github.com/jcmturner/gofork v1.7.6 h1:QH0l3hzAU1tfT3rZCnW5zXl+orbkNMMRGJfdJjHVETg= -github.com/jcmturner/gokrb5/v8 v8.4.3 h1:iTonLeSJOn7MVUtyMT+arAn5AKAPrkilzhGw8wE/Tq8= -github.com/jcmturner/rpc/v2 v2.0.3 h1:7FXXj8Ti1IaVFpSAziCZWNzbNuZmnvw/i6CqLNdWfZY= -github.com/jeremywohl/flatten/v2 v2.0.0-20211013061545-07e4a09fb8e4 h1:eA9wi6ZzpIRobvXkn/S2Lyw1hr2pc71zxzOPl7Xjs4w= -github.com/jhump/protoreflect v1.15.1 h1:HUMERORf3I3ZdX05WaQ6MIpd/NJ434hTp5YiKgfCL6c= -github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg= -github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= -github.com/josharian/native v1.0.0 
h1:Ts/E8zCSEsG17dUqv7joXJFybuMLjQfWE04tsBODTxk= -github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA= -github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= -github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo= -github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU= -github.com/karrick/godirwalk v1.16.2 h1:eY2INUWoB2ZfpF/kXasyjWJ3Ncuof6qZuNWYZFN3kAI= -github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 h1:Z9n2FFNUXsshfwJMBgNA0RU6/i7WVaAegv3PtuIHPMs= -github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= -github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= -github.com/klauspost/asmfmt v1.3.2 h1:4Ri7ox3EwapiOjCki+hw14RyKk201CN4rzyCJRFLpK4= -github.com/klauspost/compress v1.16.5 h1:IFV2oUNUzZaz+XyusxpLzpzS8Pt5rh0Z16For/djlyI= -github.com/klauspost/cpuid/v2 v2.2.4 h1:acbojRNwl3o09bUq+yDCtZFc1aiwaAAxtcn8YkZXnvk= -github.com/knadh/koanf v1.5.0 h1:q2TSd/3Pyc/5yP9ldIrSdIz26MCcyNQzW0pEAugLPNs= -github.com/kolo/xmlrpc v0.0.0-20220921171641-a4b6fa1dd06b h1:udzkj9S/zlT5X367kqJis0QP7YMxobob6zhzq6Yre00= -github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= -github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= -github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= -github.com/leodido/ragel-machinery v0.0.0-20181214104525-299bdde78165 h1:bCiVCRCs1Heq84lurVinUPy19keqGEe4jh5vtK37jcg= -github.com/linkedin/goavro/v2 v2.12.0 h1:rIQQSj8jdAUlKQh6DttK8wCRv4t4QO09g1C4aBWXslg= -github.com/logzio/azure-monitor-metrics-receiver v1.0.0 h1:TAzhIZL2ueyyc81qIw8FGg4nUbts4Hvc3oOxSobY1IA= -github.com/lufia/plan9stats v0.0.0-20220913051719-115f729f3c8c h1:VtwQ41oftZwlMnOEbMWQtSEUgU64U4s+GHk7hZK+jtY= -github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0VQdvPDY= 
-github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= -github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= -github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= -github.com/mattn/go-ieproxy v0.0.1 h1:qiyop7gCflfhwCzGyeT0gro3sF9AIg9HU98JORTkqfI= -github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= -github.com/mattn/go-isatty v0.0.17 h1:BTarxUcIeDqL27Mc+vyvdWYSL28zpIhv3RoTdsLMPng= -github.com/mattn/go-isatty v0.0.17/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= -github.com/matttproud/golang_protobuf_extensions v1.0.4 h1:mmDVorXM7PCGKw94cs5zkfA9PSy5pEvNWRP0ET0TIVo= -github.com/matttproud/golang_protobuf_extensions v1.0.4/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4= -github.com/mdlayher/apcupsd v0.0.0-20220319200143-473c7b5f3c6a h1:JOlLsLUQnokTyWWwEvOVoKH3XUl6oDMP8jisO54l6J8= -github.com/mdlayher/genetlink v1.2.0 h1:4yrIkRV5Wfk1WfpWTcoOlGmsWgQj3OtQN9ZsbrE+XtU= -github.com/mdlayher/netlink v1.6.0 h1:rOHX5yl7qnlpiVkFWoqccueppMtXzeziFjWAjLg6sz0= -github.com/mdlayher/socket v0.2.3 h1:XZA2X2TjdOwNoNPVPclRCURoX/hokBY8nkTmRZFEheM= -github.com/microsoft/ApplicationInsights-Go v0.4.4 h1:G4+H9WNs6ygSCe6sUyxRc2U81TI5Es90b2t/MwX5KqY= -github.com/miekg/dns v1.1.51 h1:0+Xg7vObnhrz/4ZCZcZh7zPXlmU0aveS2HDBd0m0qSo= -github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 h1:AMFGa4R4MiIpspGNG7Z948v4n35fFGB3RR3G/ry4FWs= -github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3 h1:+n/aFZefKZp7spd8DFdX7uMikMLXX4oubIzJF4kv/wI= -github.com/minio/highwayhash v1.0.2 h1:Aak5U0nElisjDCfPSG79Tgzkn2gl66NxOMspRrKnA/g= -github.com/mitchellh/copystructure v1.2.0 h1:vpKXTN4ewci03Vljg/q9QvCGUDttBOGBIa15WveJJGw= -github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y= -github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= -github.com/mitchellh/mapstructure v1.5.0 
h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= -github.com/mitchellh/reflectwalk v1.0.2 h1:G2LzWKi524PWgd3mLHV8Y5k7s6XUvT0Gef6zxSIeXaQ= -github.com/moby/ipvs v1.1.0 h1:ONN4pGaZQgAx+1Scz5RvWV4Q7Gb+mvfRh3NsPS+1XQQ= -github.com/moby/patternmatcher v0.5.0 h1:YCZgJOeULcxLw1Q+sVR636pmS7sPEn1Qo2iAN6M7DBo= -github.com/moby/sys/sequential v0.5.0 h1:OPvI35Lzn9K04PBbCLW0g4LcFAJgHsvXsRyewg5lXtc= -github.com/moby/term v0.0.0-20221128092401-c43b287e0e0f h1:J/7hjLaHLD7epG0m6TBMGmp4NQ+ibBYLfeyJWdAIFLA= -github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= -github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= -github.com/montanaflynn/stats v0.6.6 h1:Duep6KMIDpY4Yo11iFsvyqJDyfzLF9+sndUKT+v64GQ= -github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A= -github.com/multiplay/go-ts3 v1.1.0 h1:OWOjRxBCRds+FbpyM1JKSscRbbmYr/IIrh6V78CM5Xw= -github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= -github.com/naoina/go-stringutil v0.1.0 h1:rCUeRUHjBjGTSHl0VC00jUPLz8/F9dDzYI70Hzifhks= -github.com/naoina/go-stringutil v0.1.0/go.mod h1:XJ2SJL9jCtBh+P9q5btrd/Ylo8XwT/h1USek5+NqSA0= -github.com/nats-io/jwt/v2 v2.3.0 h1:z2mA1a7tIf5ShggOFlR1oBPgd6hGqcDYsISxZByUzdI= -github.com/nats-io/nats-server/v2 v2.9.9 h1:bmj0RhvHOc8+z5/RuhI38GqPwtkFAHQuU3e99FVA/TI= -github.com/nats-io/nats.go v1.24.0 h1:CRiD8L5GOQu/DcfkmgBcTTIQORMwizF+rPk6T0RaHVQ= -github.com/nats-io/nkeys v0.3.0 h1:cgM5tL53EvYRU+2YLXIK0G2mJtK12Ft9oeooSZMA2G8= -github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw= -github.com/netsampler/goflow2 v1.3.3 h1:uheCMgWwbaHnVdsvc2bqbdQe93E73pVF77WGu/kPE7U= -github.com/newrelic/newrelic-telemetry-sdk-go v0.8.1 h1:6OX5VXMuj2salqNBc41eXKz6K+nV6OB/hhlGnAKCbwU= -github.com/nsqio/go-nsq v1.1.0 h1:PQg+xxiUjA7V+TLdXw7nVrJ5Jbl3sN86EhGCQj4+FYE= -github.com/olivere/elastic v6.2.37+incompatible 
h1:UfSGJem5czY+x/LqxgeCBgjDn6St+z8OnsCuxwD3L0U= -github.com/open-telemetry/opentelemetry-collector-contrib/pkg/pdatautil v0.73.0 h1:b62Oq3dniQm3eg8OcnBnlZCyZ4O85iyKPFuCIeYNCKk= -github.com/openconfig/gnmi v0.9.1 h1:hVOdLTaRjdy68oCGJbkf2vrmnUoQ5xbINqBOAMix4xM= -github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= -github.com/opencontainers/image-spec v1.1.0-rc2 h1:2zx/Stx4Wc5pIPDvIxHXvXtQFW/7XWJGmnM7r3wg034= -github.com/opencontainers/runc v1.1.5 h1:L44KXEpKmfWDcS02aeGm8QNTFXTo2D+8MYGDIJ/GDEs= -github.com/opensearch-project/opensearch-go/v2 v2.2.0 h1:6RicCBiqboSVtLMjSiKgVQIsND4I3sxELg9uwWe/TKM= -github.com/opentracing/opentracing-go v1.2.1-0.20220228012449-10b1cf09e00b h1:FfH+VrHHk6Lxt9HdVS0PXzSXFyS2NbZKXv33FYPol0A= -github.com/p4lang/p4runtime v1.3.0 h1:3fUhHj0JtsGcL2Bh0uxpACdBJBDqpZyLgj93tqKzoJY= -github.com/pborman/ansi v1.0.0 h1:OqjHMhvlSuCCV5JT07yqPuJPQzQl+WXsiZ14gZsqOrQ= -github.com/philhofer/fwd v1.1.2 h1:bnDivRJ1EWPjUIRXV5KfORO897HTbpFAQddBdE8t7Gw= -github.com/philhofer/fwd v1.1.2/go.mod h1:qkPdfjR2SIEbspLqpe1tO4n5yICnr2DY7mqEx2tUTP0= -github.com/pierrec/lz4/v4 v4.1.17 h1:kV4Ip+/hUBC+8T6+2EgburRtkE9ef4nbY3f4dFhGjMc= -github.com/pion/dtls/v2 v2.2.6 h1:yXMxKr0Skd+Ub6A8UqXTRLSywskx93ooMRHsQUtd+Z4= -github.com/pion/logging v0.2.2 h1:M9+AIj/+pxNsDfAT64+MAVgJO0rsyLnoJKCqf//DoeY= -github.com/pion/transport/v2 v2.0.2 h1:St+8o+1PEzPT51O9bv+tH/KYYLMNR5Vwm5Z3Qkjsywg= -github.com/pion/udp/v2 v2.0.1 h1:xP0z6WNux1zWEjhC7onRA3EwwSliXqu1ElUZAQhUP54= -github.com/pkg/browser v0.0.0-20210911075715-681adbf594b8 h1:KoWmjvw+nsYOo29YJK9vDA65RGE3NrOnUtO7a+RF9HU= -github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= -github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= -github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/power-devops/perfstat 
v0.0.0-20220216144756-c35f1ee13d7c h1:NRoLoZvkBTKvR5gQLgA3e0hqjkY9u1wm+iOL45VN/qI= -github.com/prometheus-community/pro-bing v0.1.0 h1:zjzLGhfNPP0bP1OlzGB+SJcguOViw7df12LPg2vUJh8= -github.com/prometheus/client_golang v1.15.0 h1:5fCgGYogn0hFdhyhLbw7hEsWxufKtY9klyvdNfFlFhM= -github.com/prometheus/client_golang v1.15.0/go.mod h1:e9yaBhRPU2pPNsZwE+JdQl0KEt1N9XgF6zxWmaC0xOk= -github.com/prometheus/client_model v0.3.0 h1:UBgGFHqYdG/TPFD1B1ogZywDqEkwp3fBMvqdiQ7Xew4= -github.com/prometheus/client_model v0.3.0/go.mod h1:LDGWKZIo7rky3hgvBe+caln+Dr3dPggB5dvjtD7w9+w= -github.com/prometheus/common v0.42.0 h1:EKsfXEYo4JpWMHH5cg+KOUWeuJSov1Id8zGR8eeI1YM= -github.com/prometheus/common v0.42.0/go.mod h1:xBwqVerjNdUDjgODMpudtOMwlOwf2SaTr1yjz4b7Zbc= -github.com/prometheus/procfs v0.9.0 h1:wzCHvIvM5SxWqYvwgVL7yJY8Lz3PKn49KQtpgMYJfhI= -github.com/prometheus/procfs v0.9.0/go.mod h1:+pB4zwohETzFnmlpe6yd2lSc+0/46IYZRB/chUwxUZY= -github.com/prometheus/prometheus v0.42.0 h1:G769v8covTkOiNckXFIwLx01XE04OE6Fr0JPA0oR2nI= -github.com/prometheus/prometheus v0.42.0/go.mod h1:Pfqb/MLnnR2KK+0vchiaH39jXxvLMBk+3lnIGP4N7Vk= -github.com/rabbitmq/amqp091-go v1.8.0 h1:GBFy5PpLQ5jSVVSYv8ecHGqeX7UTLYR4ItQbDCss9MM= -github.com/rcrowley/go-metrics v0.0.0-20201227073835-cf1acfcdf475 h1:N/ElC8H3+5XpJzTSTfLsJV/mx9Q9g7kxmchpfZyxgzM= -github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE= -github.com/riemann/riemann-go-client v0.5.1-0.20211206220514-f58f10cdce16 h1:bGXoxRwUpPTCaQ86DRE+3wqE9vh3aH8W0HH5L/ygOFM= -github.com/robbiet480/go.nut v0.0.0-20220219091450-bd8f121e1fa1 h1:YmFqprZILGlF/X3tvMA4Rwn3ySxyE3hGUajBHkkaZbM= -github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= -github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA= -github.com/safchain/ethtool v0.3.0 h1:gimQJpsI6sc1yIqP/y8GYgiXn/NjgvpM0RNoWLVVmP0= -github.com/samuel/go-zookeeper v0.0.0-20200724154423-2164a8ac840e 
h1:CGjiMQ0wMH4wtNWrlj6kiTbkPt2F3rbYnhGX6TWLfco= -github.com/shirou/gopsutil/v3 v3.23.3 h1:Syt5vVZXUDXPEXpIBt5ziWsJ4LdSAAxF4l/xZeQgSEE= -github.com/shoenig/go-m1cpu v0.1.4 h1:SZPIgRM2sEF9NJy50mRHu9PKGwxyyTTJIWvCtgVbozs= -github.com/showwin/speedtest-go v1.4.2 h1:3YjBajURQTJCv/rVwJsd5UtCYlaiqCihg5NhPxJapk8= -github.com/signalfx/com_signalfx_metrics_protobuf v0.0.3 h1:32k2QLgsKhcEs55q4REPKyIadvid5FPy2+VMgvbmKJ0= -github.com/signalfx/gohistogram v0.0.0-20160107210732-1ccfd2ff5083 h1:WsShHmu12ZztYPfh9b+I+VjYD1o8iOHhB67WZCMEEE8= -github.com/signalfx/golib/v3 v3.3.50 h1:TTBpfzsO00F8ep6rhLgBmRIPUpRqBenacezjE4xCweI= -github.com/signalfx/sapm-proto v0.12.0 h1:OtOe+Jm8L61Ml8K6X8a89zc8/RlaaMRElCImeGKR/Ew= -github.com/sirupsen/logrus v1.9.0 h1:trlNQbNUG3OdDrDil03MCb1H2o9nJ1x4/5LYw7byDE0= -github.com/sleepinggenius2/gosmi v0.4.4 h1:xgu+Mt7CptuB10IPt3SVXBAA9tARToT4B9xGzjjxQX8= -github.com/sleepinggenius2/gosmi v0.4.4/go.mod h1:l8OniPmd3bJzw0MXP2/qh7AhP/e+bTY2CNivIhsnDT0= -github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d h1:zE9ykElWQ6/NYmHa3jpm/yHnI4xSofP+UP6SpjHcSeM= -github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc= -github.com/smartystreets/goconvey v1.6.4 h1:fv0U8FUIMPNf1L9lnHLvLhgicrIVChEkdzIKYqbNC9s= -github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA= -github.com/snowflakedb/gosnowflake v1.6.13 h1:r8iozak/p3P2jYfjF3EbeteqMMzPWjwmVrdENJDW6EI= -github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= -github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= -github.com/stretchr/objx v0.5.0 h1:1zr/of2m5FGMsad5YfcqgdqdWrIhu+EBEJRhR1U7z/c= -github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= -github.com/stretchr/testify v1.2.2/go.mod 
h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= -github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= -github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= -github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ8= -github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= -github.com/testcontainers/testcontainers-go v0.18.0 h1:8RXrcIQv5xX/uBOSmZd297gzvA7F0yuRA37/918o7Yg= -github.com/thomasklein94/packer-plugin-libvirt v0.3.4 h1:K+NkHFcZuiUTp4ZiDdBhWRMZiSMdsXwGuzyg4THKDAU= -github.com/tidwall/gjson v1.14.4 h1:uo0p8EbA09J7RQaflQ1aBRffTR7xedD2bcIVSYxLnkM= -github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= -github.com/tidwall/pretty v1.2.0 h1:RWIZEg2iJ8/g6fDDYzMpobmaoGh5OLl4AXtGUGPcqCs= -github.com/tinylib/msgp v1.1.8 h1:FCXC1xanKO4I8plpHGH2P7koL/RzZs12l/+r7vakfm0= -github.com/tinylib/msgp v1.1.8/go.mod h1:qkpG+2ldGg4xRFmx+jfTvZPxfGFhi64BcnL9vkCm/Tw= -github.com/tklauser/go-sysconf v0.3.11 h1:89WgdJhk5SNwJfu+GKyYveZ4IaJ7xAkecBo+KdJV0CM= -github.com/tklauser/numcpus v0.6.0 h1:kebhY2Qt+3U6RNK7UqpYNA+tJ23IBEGKkB7JQBfDYms= -github.com/uber/jaeger-client-go v2.30.0+incompatible h1:D6wyKGCecFaSRUpo8lCVbaOOb6ThwMmTEbhRwtKR97o= -github.com/uber/jaeger-lib v2.4.1+incompatible h1:td4jdvLcExb4cBISKIpHuGoVXh+dVKhn2Um6rjCsSsg= -github.com/vapourismo/knx-go v0.0.0-20220829185957-fb5458a5389d h1:BJMc7MNW/p80cCkC46JimNuowOWCnSSW5IHjtUrXzNk= -github.com/vishvananda/netlink v1.2.1-beta.2 h1:Llsql0lnQEbHj0I1OuKyp8otXp0r3q0mPkuhwHfStVs= -github.com/vishvananda/netns v0.0.4 h1:Oeaw1EM2JMxD51g9uhtC0D7erkIjgmj8+JZc26m1YX8= -github.com/vjeantet/grok v1.0.1 h1:2rhIR7J4gThTgcZ1m2JY4TrJZNgjn985U28kT2wQrJ4= -github.com/vmware/govmomi v0.28.1-0.20220921224932-b4b508abf208 h1:IDVzGQ2aczmTEfTos4hzmFw20tGQ4zZsVnel9C6VEpA= 
-github.com/wavefronthq/wavefront-sdk-go v0.13.0 h1:3s9maJmzI4orW+hiVBfCNp/SIu8ISXi6rtewmDGzheE= -github.com/wvanbergen/kafka v0.0.0-20171203153745-e2edea948ddf h1:TOV5PC6fIWwFOFra9xJfRXZcL2pLhMI8oNuDugNxg9Q= -github.com/wvanbergen/kazoo-go v0.0.0-20180202103751-f72d8611297a h1:ILoU84rj4AQ3q6cjQvtb9jBjx4xzR/Riq/zYhmDQiOk= -github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= -github.com/xdg-go/pbkdf2 v1.0.0 h1:Su7DPu48wXMwC3bs7MCNG+z4FhcyEuz5dlvchbq0B0c= -github.com/xdg-go/scram v1.1.2 h1:FHX5I5B4i4hKRVRBCFRxq1iQRej7WO3hhBuJf+UUySY= -github.com/xdg-go/stringprep v1.0.4 h1:XLI/Ng3O1Atzq0oBs3TWm+5ZVgkq2aqdlvP9JtoZ6c8= -github.com/xdg/scram v1.0.5 h1:TuS0RFmt5Is5qm9Tm2SoD89OPqe4IRiFtyFY4iwWXsw= -github.com/xdg/stringprep v1.0.3 h1:cmL5Enob4W83ti/ZHuZLuKD/xqJfus4fVPwE+/BDm+4= -github.com/youmark/pkcs8 v0.0.0-20201027041543-1326539a0a0a h1:fZHgsYlfvtyqToslyjUt3VOPF4J7aK/3MPcK7xp3PDk= -github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= -github.com/yuin/gopher-lua v0.0.0-20200816102855-ee81675732da h1:NimzV1aGyq29m5ukMK0AMWEhFaL/lrEOaephfuoiARg= -github.com/yusufpapurcu/wmi v1.2.2 h1:KBNDSne4vP5mbSWnJbO+51IMOXJB67QiYCSBrubbPRg= -github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0= -go.mongodb.org/mongo-driver v1.11.2 h1:+1v2rDQUWNcGW7/7E0Jvdz51V38XXxJfhzbV17aNHCw= -go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0= -go.opentelemetry.io/collector v0.73.0 h1:oEBFtf5WcXiIPGXcjOM5gSQ3GNh/3d6pHf0IThhGmfw= -go.opentelemetry.io/collector/component v0.73.0 h1:ka24yVJoVETCru+l5Fm85xGc2y0HwvGfYwyRe7qmjq0= -go.opentelemetry.io/collector/confmap v0.73.0 h1:tC8x8sDk7JQ3QcbosqrxLe756sYcg4iUdTXsx7Ie4CM= -go.opentelemetry.io/collector/consumer v0.73.0 h1:gy89oaG198A7KGbXIsMIdN4lWVQqqSdx6dsBCfzLujU= 
-go.opentelemetry.io/collector/featuregate v0.73.0 h1:hpHKXmRiJqMLefIzXwIuqDo9df2HcI/66IAKLo+g7nc= -go.opentelemetry.io/collector/pdata v1.0.0-rcv0011 h1:7lT0vseP89mHtUpvgmWYRvQZ0eY+SHbVsnXY20xkoMg= -go.opentelemetry.io/collector/semconv v0.73.0 h1:gF4f6z1q8YfWzzo/gPKysjFmmM4Pv4nC2bWrTPxTPaE= -go.opentelemetry.io/otel v1.14.0 h1:/79Huy8wbf5DnIPhemGB+zEPVwnN6fuQybr/SRXa6hM= -go.opentelemetry.io/otel/metric v0.37.0 h1:pHDQuLQOZwYD+Km0eb657A25NaRzy0a+eLyKfDXedEs= -go.opentelemetry.io/otel/sdk v1.14.0 h1:PDCppFRDq8A1jL9v6KMI6dYesaq+DFcDZvjsoGvxGzY= -go.opentelemetry.io/otel/sdk/metric v0.37.0 h1:haYBBtZZxiI3ROwSmkZnI+d0+AVzBWeviuYQDeBWosU= -go.opentelemetry.io/otel/trace v1.14.0 h1:wp2Mmvj41tDsyAJXiWDWpfNsOiIyd38fy85pyKcFq/M= -go.uber.org/atomic v1.10.0 h1:9qC72Qh0+3MqyJbAn8YU5xVq1frD8bn3JtD2oXtafVQ= -go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= -go.uber.org/zap v1.24.0 h1:FiJd5l1UOLj0wCgbSE0rwwXHzEdAZS6hiiSnxJN/D60= -golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= -golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= -golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/crypto v0.31.0 h1:ihbySMvVjLAeSH1IbfcRTkD/iNscyz8rGzjF/E5hV6U= -golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= -golang.org/x/exp v0.0.0-20230307190834-24139beb5833 h1:SChBja7BCQewoTAU7IgvucQKMIXrEpFxNMs0spT3/5s= -golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= -golang.org/x/mod v0.7.0/go.mod 
h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= -golang.org/x/mod v0.17.0 h1:zY54UmvipHiNd+pm+m0x9KhZ9hl1/7QNMyxXbc6ICqA= -golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= -golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= -golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= -golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= -golang.org/x/net v0.3.0/go.mod h1:MBQ8lrhLObU/6UmLb4fmbmk5OcyYmqtbGd/9yIeKjEE= -golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac= -golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= -golang.org/x/oauth2 v0.7.0 h1:qe6s0zUXlPX80/dITx3440hWZ7GwMwgDDyrSGTPJG/g= -golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ= -golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 
-golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20191001151750-bb3f8db39f24/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.3.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA= -golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= -golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= -golang.org/x/term v0.3.0/go.mod h1:q750SLmJuPmVoN1blW3UFBPREJfb1KmY3vwxfr+nFDA= -golang.org/x/term v0.27.0 h1:WP60Sv1nlK1T6SupCHbXzSaN0b9wUmsPoRS9b61A23Q= -golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= -golang.org/x/text v0.5.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= -golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo= -golang.org/x/text 
v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= -golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4= -golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= -golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= -golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= -golang.org/x/tools v0.4.0/go.mod h1:UE5sM2OK9E/d67R0ANs2xJizIymRP5gJU295PvKXxjQ= -golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d h1:vU5i/LfpvrRCpgM/VPfJLg5KjxD3E+hfT1SH+d9zLwg= -golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 h1:H2TDz8ibqkAF6YGhCdN3jS9O0/s90v0rJh3X/OLHEUk= -golang.zx2c4.com/wireguard v0.0.0-20211209221555-9c9e7e272434 h1:3zl8RkJNQ8wfPRomwv/6DBbH2Ut6dgMaWTxM0ZunWnE= -golang.zx2c4.com/wireguard/wgctrl v0.0.0-20211230205640-daad0b7ba671 h1:tJAYx7pB6b5bNqi7XatStqFT2zFAxhXcGDq1R6FqqjU= -google.golang.org/api v0.121.0 h1:8Oopoo8Vavxx6gt+sgs8s8/X60WBAtKQq6JqnkF+xow= -google.golang.org/appengine v1.6.7 h1:FZR1q0exgwxzPzp/aF+VccGrSfxfPpkBqjIIEq3ru6c= -google.golang.org/genproto v0.0.0-20230526161137-0005af68ea54 
h1:9NWlQfY2ePejTmfwUH1OWwmznFa+0kKcHGPDvcPza9M= -google.golang.org/genproto/googleapis/api v0.0.0-20230525234035-dd9d682886f9 h1:m8v1xLLLzMe1m5P+gCTF8nJB9epwZQUBERm20Oy1poQ= -google.golang.org/genproto/googleapis/rpc v0.0.0-20230525234030-28d5490b6b19 h1:0nDDozoAU19Qb2HwhXadU8OcsiO/09cnTqhUtq2MEOM= -google.golang.org/genproto/googleapis/rpc v0.0.0-20230525234030-28d5490b6b19/go.mod h1:66JfowdXAEgad5O9NnYcsNPLCPZJD++2L9X0PCMODrA= -google.golang.org/grpc v1.57.2 h1:uw37EN34aMFFXB2QPW7Tq6tdTbind1GpRxw5aOX3a5k= -google.golang.org/grpc v1.57.2/go.mod h1:Sd+9RMTACXwmub0zcNY2c4arhtrbBYD1AUHI/dt16Mo= -google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= -google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= -google.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cng= -google.golang.org/protobuf v1.30.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= -gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= -gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= -gopkg.in/fatih/pool.v2 v2.0.0 h1:xIFeWtxifuQJGk/IEPKsTduEKcKvPmhoiVDGpC40nKg= -gopkg.in/fsnotify.v1 v1.4.7 h1:xOHLXZwVvI9hhs+cLKq5+I5onOuwQLhQwiu63xxlHs4= -gopkg.in/gorethink/gorethink.v3 v3.0.5 h1:e2Uc/Xe+hpcVQFsj6MuHlYog3r0JYpnTzwDj/y2O4MU= -gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= -gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA= -gopkg.in/olivere/elastic.v5 v5.0.86 h1:xFy6qRCGAmo5Wjx96srho9BitLhZl2fcnpuidPwduXM= -gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= -gopkg.in/tomb.v2 v2.0.0-20161208151619-d5d1b5820637 h1:yiW+nvdHb9LVqSHQBXfZCieqV4fzYhNBql77zY0ykqs= -gopkg.in/yaml.v2 v2.2.2/go.mod 
h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= -gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -honnef.co/go/tools v0.2.2 h1:MNh1AVMyVX23VUHE2O27jm6lNj3vjO5DexS4A1xvnzk= -k8s.io/api v0.26.2 h1:dM3cinp3PGB6asOySalOZxEG4CZ0IAdJsrYZXE/ovGQ= -k8s.io/apimachinery v0.26.2 h1:da1u3D5wfR5u2RpLhE/ZtZS2P7QvDgLZTi9wrNZl/tQ= -k8s.io/apimachinery v0.26.2/go.mod h1:ats7nN1LExKHvJ9TmwootT00Yz05MuYqPXEXaVeOy5I= -k8s.io/client-go v0.26.2 h1:s1WkVujHX3kTp4Zn4yGNFK+dlDXy1bAAkIl+cFAiuYI= -k8s.io/cri-api v0.25.13 h1:FaVci3+y5COQuyAFWUckdfOxRpD+m0cnaW2q0OPVm1Q= -k8s.io/cri-api v0.25.13/go.mod h1:yKsLus3raCZ+WbR2m5hS+3hUs5BgSldj2CFJTWyx48M= -k8s.io/klog v1.0.0 h1:Pt+yjF5aB1xDSVbau4VsWe+dQNzA0qv1LlXdC2dF6Q8= -k8s.io/klog/v2 v2.90.1 h1:m4bYOKall2MmOiRaR1J+We67Do7vm9KiQVlT96lnHUw= -k8s.io/kube-openapi v0.0.0-20230303024457-afdc3dddf62d h1:VcFq5n7wCJB2FQMCIHfC+f+jNcGgNMar1uKd6rVlifU= -k8s.io/utils v0.0.0-20230308161112-d77c459e9343 h1:m7tbIjXGcGIAtpmQr7/NAi7RsWoW3E7Zcm4jI1HicTc= -layeh.com/radius v0.0.0-20221205141417-e7fbddd11d68 h1:2NDro2Jzkrqfngy/sA5GVnChs7fx8EzcQKFi/lI2cfg= -lukechampine.com/uint128 v1.2.0 h1:mBi/5l91vocEN8otkC5bDLhi2KdCticRiwbdB0O+rjI= -modernc.org/cc/v3 v3.40.0 h1:P3g79IUS/93SYhtoeaHW+kRCIrYaxJ27MFPv+7kaTOw= -modernc.org/ccgo/v3 v3.16.13 h1:Mkgdzl46i5F/CNR/Kj80Ri59hC8TKAhZrYSaqvkwzUw= -modernc.org/libc v1.22.3 h1:D/g6O5ftAfavceqlLOFwaZuA5KYafKwmr30A6iSqoyY= -modernc.org/mathutil v1.5.0 h1:rV0Ko/6SfM+8G+yKiyI830l3Wuz1zRutdslNoQ0kfiQ= -modernc.org/memory v1.5.0 h1:N+/8c5rE6EqugZwHii4IFsaJ7MUhoWX07J5tC/iI5Ds= -modernc.org/opt v0.1.3 h1:3XOZf2yznlhC+ibLltsDGzABUGVx8J6pnFMS3E4dcq4= -modernc.org/sqlite v1.21.0 h1:4aP4MdUf15i3R3M2mx6Q90WHKz3nZLoz96zlB6tNdow= -modernc.org/strutil v1.1.3 
h1:fNMm+oJklMGYfU9Ylcywl0CO5O6nTfaowNsh2wpPjzY= -modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y= -sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= -sigs.k8s.io/structured-merge-diff/v4 v4.2.3 h1:PRbqxJClWWYMNV1dhaG4NsibJbArud9kFxnAMREiWFE= -sigs.k8s.io/yaml v1.3.0 h1:a2VclLzOGrwOHDiV8EfBGhvjHvP46CtW5j6POvhYGGo= diff --git a/mind-cluster/component/npu-exporter/platforms/inputs/all/npu.go b/mind-cluster/component/npu-exporter/platforms/inputs/all/npu.go deleted file mode 100644 index 1318957..0000000 --- a/mind-cluster/component/npu-exporter/platforms/inputs/all/npu.go +++ /dev/null @@ -1,20 +0,0 @@ -//go:build !custom || inputs || inputs.npu - -/* Copyright(C) 2021-2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package all this for register -package all - -import _ "github.com/influxdata/telegraf/plugins/inputs/npu" // register plugin diff --git a/mind-cluster/component/npu-exporter/platforms/inputs/npu/README.md b/mind-cluster/component/npu-exporter/platforms/inputs/npu/README.md deleted file mode 100644 index 72fc73e..0000000 --- a/mind-cluster/component/npu-exporter/platforms/inputs/npu/README.md +++ /dev/null @@ -1,107 +0,0 @@ -# npu plugin of telegraf -## 使用介绍 -该插件代码可根据以下两种方法来使用(选择其一即可): - -### 1、源码集成使用(适合未安装Telegraf的情况) -对应官方文档:https://docs.influxdata.com/telegraf/v1.26/configure_plugins/external_plugins/write_external_plugin/ -#### **编译步骤:** -拉取telegraf v1.26.0分支源码 -```shell -git clone -b v1.26.0 https://github.com/influxdata/telegraf.git -``` -拉取插件源码 -```shell -git clone -b [latest_tag] https://gitcode.com/Ascend/mind-cluster.git -# [latest_tag]此tag请自行修改,建议采用仓库的最新标签,否则可能导致引用函数失效 -``` -将插件代码集成到telegraf源码中(其中路径按实际修改) -```shell -cp -r mind-cluster/component/npu-exporter/platforms/inputs/npu telegraf/plugins/inputs -``` -将插件注册到telegraf(其中路径按实际修改) -```shell -cp -r mind-cluster/component/npu-exporter/platforms/inputs/all/npu.go telegraf/plugins/inputs/all -``` -将telegraf源码中的Makefile里的“CGO_ENABLED=0”改为“CGO_ENABLED=1” -```shell -cd telegraf -sed -i s"/CGO_ENABLED=0/CGO_ENABLED=1/" Makefile -``` - -将如下内容加入到telegraf源码的go.mod的文件里 -注意:[latest_tag]请自行修改为commitID/分支名称/tag名称中的一种,建议采用仓库的最新标签,否则可能导致引用函数失效 -```go.mod -require huawei.com/npu-exporter/v6 v6.0.0-RC1 - -replace huawei.com/npu-exporter/v6 => gitcode.com/Ascend/mind-cluster.git/component/npu-exporter/v6 [latest_tag] -replace ascend-common => gitcode.com/Ascend/mind-cluster.git/component/ascend-common [latest_tag] -``` - -然后执行 -```shell -go mod tidy -``` -接着编译telegraf -```shell -make all -``` -运行前请先创建日志目录:(该日志是插件调用底层api将记录的日志) -```shell -mkdir -m 750 /var/log/mindx-dl/npu-exporter -``` -源码集成时,该日志可通过hwlog.LogConfig{}结构体来配置,该结构体的详细信息如下 -```go -type LogConfig struct { - // log file path, default 
"/var/log/mindx-dl/npu-exporter/npu-plugin.log" in npu plugin - LogFileName string - // only write to std out, default value: false - OnlyToStdout bool - // only write to file, default value: false - OnlyToFile bool - // log level, -1-debug, 0-info, 1-warning, 2-error 3-critical default value: 0 - LogLevel int - // size of a single log file (MB), default value: 2MB in npu plugin - FileMaxSize int - // MaxLineLength Max length of each log line, default value: 256 - MaxLineLength int - // maximum number of backup log files, set as 2 in npu plugin - MaxBackups int - // maximum number of days for backup log files, default value: 2 - MaxAge int - // whether backup files need to be compressed, default value: false - IsCompress bool - // expiration time for log cache, default value: 1s - ExpiredTime int - // Size of log cache space, default: 2048 - CacheSize int -} -``` -#### **使用示例:** -使用插件中提供的配置文件运行telegraf -```shell -./telegraf --config path_to_plugins/inputs/npu/sample.conf -``` - -### 2、二进制集成,使用telegraf的execd机制(适合已安装Telegraf的情况) -对应官方文档:https://docs.influxdata.com/telegraf/v1.26/configure_plugins/external_plugins/shim/ - -从[MindCluster社区](https://www.hiascend.com/developer/download/community/result?module=cluster)获取npu-exporter软件包,并从中解压出npu-exporter二进制文件 - -### 使用 -运行前请先创建日志目录:(该日志是插件调用底层api将记录的日志) -```shell -mkdir -m 750 /var/log/mindx-dl/npu-exporter -``` -先编写配置文件,如test.conf -``` -[[inputs.execd]] - command = ["path_to_npu_plugin/npu-exporter", "-platform=Telegraf"] - signal = "none" - -[[outputs.file]] - files=["stdout"] -``` -然后运行telegraf -```shell -./telegraf --config path_to_config_file/test.conf -``` \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/platforms/inputs/npu/npu.go b/mind-cluster/component/npu-exporter/platforms/inputs/npu/npu.go deleted file mode 100644 index 4c200e0..0000000 --- a/mind-cluster/component/npu-exporter/platforms/inputs/npu/npu.go +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright(C) 2021-2023. 
Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package npu this for parse and pack -package npu - -import ( - _ "embed" - "strings" - - "github.com/influxdata/telegraf" - "github.com/influxdata/telegraf/plugins/inputs" - - "ascend-common/api" - "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" - "huawei.com/npu-exporter/v6/utils/logger" -) - -//go:embed sample.conf -var sampleConfig string - -const ( - num2 = 2 -) - -// WatchNPU npu watch struct -type WatchNPU struct { - collector *common.NpuCollector -} - -// SampleConfig used to return sampleConfig -func (*WatchNPU) SampleConfig() string { - return sampleConfig -} - -// Gather used to gather information from dcmi info and hccn tool info -func (npu *WatchNPU) Gather(acc telegraf.Accumulator) error { - - fieldsMap := make(map[string]map[string]interface{}) - const devName = "ascend" - - devTagValue := "" - if cardType := npu.collector.Dmgr.GetDevType(); cardType == api.Ascend910A3 || cardType == api.Ascend910B || - cardType == api.Ascend910A { - devTagValue = strings.ToLower(api.Ascend910) - } else { - devTagValue = strings.ToLower(cardType) - } - logger.DynamicConfigure(logger.Config{Acc: acc}) - - containerMap := common.GetContainerNPUInfo(npu.collector) - chips := common.GetChipListWithVNPU(npu.collector) - - fieldsMap = npu.gatherChain(fieldsMap, common.ChainForSingleGoroutine, containerMap, chips) - fieldsMap = 
npu.gatherChain(fieldsMap, common.ChainForMultiGoroutine, containerMap, chips) - fieldsMap = npu.gatherChain(fieldsMap, common.ChainForCustomPlugin, containerMap, chips) - - generalFields := fieldsMap[common.GeneralDevTagKey] - acc.AddFields(devName, generalFields, map[string]string{"device": devTagValue}) - - // after the report is completed, deleted to avoid repeated reporting in the for loop - delete(fieldsMap, common.GeneralDevTagKey) - for key, fields := range fieldsMap { - - ids := strings.Split(key, "_") - devTag := map[string]string{"device": devTagValue + "-" + ids[0]} - if len(ids) >= num2 { - devTag["vdev_id"] = ids[1] - } - - acc.AddFields(devName, fields, devTag) - } - - return nil -} - -func (npu *WatchNPU) gatherChain(fieldsMap map[string]map[string]interface{}, chain []common.MetricsCollector, - containerMap map[int32]container.DevicesInfo, chips []common.HuaWeiAIChip) map[string]map[string]interface{} { - - for _, collector := range chain { - fieldsMap = collector.UpdateTelegraf(fieldsMap, npu.collector, containerMap, chips) - } - return fieldsMap -} - -func init() { - inputs.Add("npu", func() telegraf.Input { - return &WatchNPU{ - collector: common.Collector, - } - }) -} diff --git a/mind-cluster/component/npu-exporter/platforms/inputs/npu/npu_test.go b/mind-cluster/component/npu-exporter/platforms/inputs/npu/npu_test.go deleted file mode 100644 index c8adef4..0000000 --- a/mind-cluster/component/npu-exporter/platforms/inputs/npu/npu_test.go +++ /dev/null @@ -1,174 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package npu this for parse and pack -package npu - -import ( - "fmt" - "strings" - "testing" - "time" - - "github.com/agiledragon/gomonkey/v2" - "github.com/influxdata/telegraf" - "github.com/smartystreets/goconvey/convey" - - "ascend-common/api" - "ascend-common/common-utils/hwlog" - "ascend-common/devmanager" - "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" - "huawei.com/npu-exporter/v6/collector/metrics" - "huawei.com/npu-exporter/v6/utils/logger" -) - -const ( - num5 = 5 -) - -func init() { - logger.HwLogConfig = &hwlog.LogConfig{ - OnlyToStdout: true, - } - logger.InitLogger("Prometheus") - initChain() -} - -func initChain() { - common.ChainForSingleGoroutine = []common.MetricsCollector{ - &metrics.VersionCollector{}, - } -} - -func mockNewNpuCollector() *common.NpuCollector { - tc := newNpuCollectorTestCase{ - cacheTime: time.Duration(num5), - updateTime: time.Duration(num5), - deviceParser: &container.DevicesParser{}, - dmgr: &devmanager.DeviceManager{}, - } - c := common.NewNpuCollector(tc.cacheTime, tc.updateTime, tc.deviceParser, tc.dmgr) - return c -} - -// TestGather verifies different device type scenarios -func TestGather(t *testing.T) { - tests := []struct { - name string - deviceType string - expectedTag string - }{ - {name: api.Ascend910A3, - deviceType: api.Ascend910A3, - expectedTag: api.Ascend910, - }, - {name: api.Ascend310P, - deviceType: api.Ascend310P, - expectedTag: api.Ascend310P, - }, - } - npu := &WatchNPU{ - collector: mockNewNpuCollector(), - } - acc := &MockAccumulator{} - - 
for _, tt := range tests { - convey.Convey(tt.name, t, func() { - patches := gomonkey.NewPatches() - defer patches.Reset() - - patches.ApplyMethodReturn(npu.collector.Dmgr, "GetDevType", tt.deviceType) - patches.ApplyFuncReturn(common.GetContainerNPUInfo, nil) - patches.ApplyFuncReturn(common.GetChipListWithVNPU, nil) - patches.ApplyMethodReturn(common.ChainForSingleGoroutine[0], "UpdateTelegraf", - map[string]map[string]interface{}{ - common.GeneralDevTagKey: {"npu_exporter_version_info": "7.0.0"}, - "0": {"npu_chip_info_power": "1"}, - "1_100": {"npu_chip_info_voltage": "1"}, - }) - - err := npu.Gather(acc) - convey.So(err, convey.ShouldBeNil) - convey.So(acc.fields["ascend,device="+strings.ToLower(tt.expectedTag)], convey.ShouldNotBeEmpty) - }) - } -} - -// TestGatherChain tests the gatherChain method of WatchNPU -func TestGatherChain(t *testing.T) { - npu := &WatchNPU{} - fieldsMap := make(map[string]map[string]interface{}) - chain := []common.MetricsCollector{&metrics.VersionCollector{}} - - convey.Convey("TestGatherChain", t, func() { - result := npu.gatherChain(fieldsMap, chain, nil, nil) - logger.Infof("result:%v", result) - convey.So(len(result), convey.ShouldEqual, 1) - }) -} - -type newNpuCollectorTestCase struct { - cacheTime time.Duration - updateTime time.Duration - deviceParser *container.DevicesParser - dmgr *devmanager.DeviceManager -} - -// MockAccumulator is a mock implementation of telegraf.Accumulator -type MockAccumulator struct { - fields map[string]map[string]interface{} -} - -func (m *MockAccumulator) AddFields(measurement string, fields map[string]interface{}, tags map[string]string, - t ...time.Time) { - if m.fields == nil { - m.fields = make(map[string]map[string]interface{}) - } - pairs := make([]string, 0, len(tags)) - for k, v := range tags { - pairs = append(pairs, fmt.Sprintf("%s=%v", k, v)) - } - metricKey := measurement + "," + strings.Join(pairs, ",") - m.fields[metricKey] = fields -} - -func (m *MockAccumulator) 
AddGauge(measurement string, fields map[string]interface{}, tags map[string]string, - t ...time.Time) { -} - -func (m *MockAccumulator) AddCounter(measurement string, fields map[string]interface{}, tags map[string]string, - t ...time.Time) { -} - -func (m *MockAccumulator) AddSummary(measurement string, fields map[string]interface{}, tags map[string]string, - t ...time.Time) { -} - -func (m *MockAccumulator) AddHistogram(measurement string, fields map[string]interface{}, tags map[string]string, - t ...time.Time) { -} - -func (m *MockAccumulator) AddMetric(metric telegraf.Metric) { -} - -func (m *MockAccumulator) SetPrecision(precision time.Duration) { -} - -func (m *MockAccumulator) AddError(err error) { -} - -func (m *MockAccumulator) WithTracking(maxTracked int) telegraf.TrackingAccumulator { - return nil -} diff --git a/mind-cluster/component/npu-exporter/platforms/inputs/npu/sample.conf b/mind-cluster/component/npu-exporter/platforms/inputs/npu/sample.conf deleted file mode 100644 index 11fe998..0000000 --- a/mind-cluster/component/npu-exporter/platforms/inputs/npu/sample.conf +++ /dev/null @@ -1,9 +0,0 @@ -[agent] - interval="20s" - flush_interval="20s" - -[[inputs.npu]] - npu_log_level = 1 - -[[outputs.file]] - files=["stdout"] \ No newline at end of file diff --git a/mind-cluster/component/npu-exporter/platforms/prom/prometheus_collector.go b/mind-cluster/component/npu-exporter/platforms/prom/prometheus_collector.go deleted file mode 100644 index 088eeb9..0000000 --- a/mind-cluster/component/npu-exporter/platforms/prom/prometheus_collector.go +++ /dev/null @@ -1,103 +0,0 @@ -/* Copyright(C) 2021-2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package prometheus for prometheus collector -package prom - -import ( - "github.com/prometheus/client_golang/prometheus" - - "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" - "huawei.com/npu-exporter/v6/utils" - "huawei.com/npu-exporter/v6/utils/logger" -) - -// CollectorForPrometheus Entry point for collecting and converting -type CollectorForPrometheus struct { - collector *common.NpuCollector -} - -// NewPrometheusCollector create an instance of prometheus Collector -func NewPrometheusCollector(collector *common.NpuCollector) *CollectorForPrometheus { - promCollector := &CollectorForPrometheus{ - collector: collector, - } - return promCollector -} - -// Describe desc metrics of prometheus -func (*CollectorForPrometheus) Describe(ch chan<- *prometheus.Desc) { - if ch == nil { - logger.Error("ch is nil ") - return - } - const cacheSize = 100 - tempCh := make(chan *prometheus.Desc, cacheSize) - done := make(chan bool) - - go func() { - seenMetrics := make(map[string]struct{}) - for desc := range tempCh { - if desc == nil { - continue - } - descKey := utils.GetDescName(desc) - if _, exists := seenMetrics[descKey]; exists { - logger.Warnf("duplicate metric description detected, keeping first declaration, ignoring duplicate: %s", desc) - continue - } - seenMetrics[descKey] = struct{}{} - ch <- desc - } - // tempCh closed - done <- true - }() - - describeChain(tempCh, common.ChainForSingleGoroutine) - describeChain(tempCh, common.ChainForMultiGoroutine) - describeChain(tempCh, common.ChainForCustomPlugin) - - 
close(tempCh) - - <-done -} - -func describeChain(ch chan<- *prometheus.Desc, chain []common.MetricsCollector) { - for _, collector := range chain { - if collector != nil { - collector.Describe(ch) - } - } -} - -// Collect update metrics of prometheus -func (n *CollectorForPrometheus) Collect(ch chan<- prometheus.Metric) { - containerMap := common.GetContainerNPUInfo(n.collector) - chips := common.GetChipListWithVNPU(n.collector) - collectChain(ch, n, containerMap, chips, common.ChainForSingleGoroutine) - collectChain(ch, n, containerMap, chips, common.ChainForMultiGoroutine) - collectChain(ch, n, containerMap, chips, common.ChainForCustomPlugin) -} - -func collectChain(ch chan<- prometheus.Metric, n *CollectorForPrometheus, containerMap map[int32]container.DevicesInfo, - chips []common.HuaWeiAIChip, chain []common.MetricsCollector) { - if ch == nil { - logger.Error("ch is nil") - return - } - for _, collector := range chain { - collector.UpdatePrometheus(ch, n.collector, containerMap, chips) - } -} diff --git a/mind-cluster/component/npu-exporter/platforms/prom/prometheus_collector_test.go b/mind-cluster/component/npu-exporter/platforms/prom/prometheus_collector_test.go deleted file mode 100644 index 331ca66..0000000 --- a/mind-cluster/component/npu-exporter/platforms/prom/prometheus_collector_test.go +++ /dev/null @@ -1,159 +0,0 @@ -/* -Copyright(C) 2021-2025. Huawei Technologies Co.,Ltd. All rights reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package prometheus for prometheus collector -package prom - -import ( - "strconv" - "testing" - "time" - - "github.com/agiledragon/gomonkey/v2" - "github.com/prometheus/client_golang/prometheus" - "github.com/smartystreets/goconvey/convey" - - "ascend-common/common-utils/hwlog" - "ascend-common/devmanager" - "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" - "huawei.com/npu-exporter/v6/collector/metrics" - "huawei.com/npu-exporter/v6/utils/logger" -) - -const ( - maxMetricsCount = 2000 - num5 = 5 - mockContainerName = "mockContainerName" - maxChipNum int32 = 8 -) - -func TestDescribe(t *testing.T) { - - convey.Convey("test prometheus desc ", t, func() { - collector := NewPrometheusCollector(nil) - - convey.Convey("test prometheus desc when ch is nil", func() { - collector.Describe(nil) - }) - convey.Convey("test prometheus desc when ch is not nil", func() { - ch := make(chan *prometheus.Desc, maxMetricsCount) - collector.Describe(ch) - t.Logf("Describe len(ch):%v", len(ch)) - - convey.So(ch, convey.ShouldNotBeEmpty) - }) - - }) -} - -func TestCollect(t *testing.T) { - convey.Convey("test prometheus collect ", t, func() { - npuCollector := mockNewNpuCollector() - collector := NewPrometheusCollector(npuCollector) - - convey.Convey("test prometheus collect when ch is nil", func() { - collector.Collect(nil) - }) - convey.Convey("test prometheus collect when ch is not nil", func() { - - ch := make(chan prometheus.Metric, maxMetricsCount) - - patches := gomonkey.NewPatches() - collector.Collect(ch) - - patches.ApplyFuncReturn(common.GetChipListWithVNPU, mockGetNPUChipList()) - patches.ApplyFuncReturn(common.GetContainerNPUInfo, mockGetContainerNPUInfo()) - - t.Logf("Describe len(ch):%v", len(ch)) - convey.So(ch, convey.ShouldNotBeEmpty) - }) - }) -} - -func mockNewNpuCollector() *common.NpuCollector { - tc := newNpuCollectorTestCase{ - cacheTime: time.Duration(num5), - updateTime: time.Duration(num5), - 
deviceParser: &container.DevicesParser{}, - dmgr: &devmanager.DeviceManager{}, - } - c := common.NewNpuCollector(tc.cacheTime, tc.updateTime, tc.deviceParser, tc.dmgr) - return c -} - -type newNpuCollectorTestCase struct { - cacheTime time.Duration - updateTime time.Duration - deviceParser *container.DevicesParser - dmgr *devmanager.DeviceManager -} - -func mockGetNPUChipList() []common.HuaWeiAIChip { - chips := make([]common.HuaWeiAIChip, 0) - for id := int32(0); id < maxChipNum; id++ { - chip := common.HuaWeiAIChip{ - CardId: id, - PhyId: id, - DeviceID: id, - LogicID: id, - } - - chips = append(chips, chip) - } - return chips -} - -func mockGetContainerNPUInfo() map[int32]container.DevicesInfo { - containsInfo := make(map[int32]container.DevicesInfo) - for id := int32(0); id < maxChipNum; id++ { - - containerInfo := container.DevicesInfo{ - ID: strconv.Itoa(int(id)), - Name: mockContainerName, - Devices: []int{int(id)}, - } - containsInfo[id] = containerInfo - } - return containsInfo -} - -func init() { - logger.HwLogConfig = &hwlog.LogConfig{ - OnlyToStdout: true, - } - logger.InitLogger("Prometheus") - - initChain() -} - -func initChain() { - common.ChainForSingleGoroutine = []common.MetricsCollector{ - &metrics.HccsCollector{}, - &metrics.BaseInfoCollector{}, - &metrics.SioCollector{}, - &metrics.VersionCollector{}, - &metrics.HbmCollector{}, - &metrics.DdrCollector{}, - &metrics.VnpuCollector{}, - &metrics.PcieCollector{}, - } - common.ChainForMultiGoroutine = []common.MetricsCollector{ - &metrics.NetworkCollector{}, - &metrics.RoceCollector{}, - &metrics.OpticalCollector{}, - } -} diff --git a/mind-cluster/component/npu-exporter/plugins/README.md b/mind-cluster/component/npu-exporter/plugins/README.md deleted file mode 100644 index 5690dac..0000000 --- a/mind-cluster/component/npu-exporter/plugins/README.md +++ /dev/null @@ -1,388 +0,0 @@ -## 自定义插件开发说明 - -用户可参考提供的demo,或将代码拷贝到plugins目录下,重新编译部署,下面对demo中各文件进行说明 - -- `dcmi.go` 
、`dcmi_interface_api.h`:用户自定义NPU指标的接口声明与cgo实现,用于对接驱动dcmi接口,具体可参考demo实现,全部dcmi接口续参考驱动的dcmi接口文档。 -- `custom_metrics.go` 实现`MetricCollector`的接口,用于指标采集与上报,需要实现下面的接口,具体可参考demo实现: - - Describe:prometheus上报指标前,需要先定义指标的,该接口用于prometheus的指标定义 - - CollectToCache: 指标采集方法,每个采集周期都会执行,从外部获取数据,并传入到内部缓存中 - - UpdatePrometheus: 按照prometheus的格式,将缓存中的数据返回 - - UpdateTelagraf:按照telagraf的格式,将缓存中的数据返回。 - - IsSupporterd:检测当前环境,判断是否支持当前设备的检测。 - - PreCollect:正式开始采集前执行一次,可用于设备初始化。可以为空。 - - PostCollect:采集结束后执行一次,可用于数据的回收。可以为空。 -- `register.go`,提供插件注册函数,在npu-exporter启动时完成插件注册并完成dcmi接口初始化,**RegisterPlugin函数签名不要修改**,自定义插件通过`AddPluginCollector`接口注册,指标名称需要与`pluginConfiguration.json`中的指标组名称保持一致 - -对于插件指标组内定义的指标名称,不要与现有代码中已定义的插件指标(当前NPU指标、插件指标)重名 - -自定义插件采集时间超过10s后,npu-exporter会打印日志,提示插件采集时间过长,执行下一个插件采集。 - -### 编译部署 - -插件开发完后,执行Npu-exporter代码目录下的`build/build.sh`完成编译,需要提前准备go开发环境。 - -编译完成后,会在output目录下生成新的二进制文件与相关配置文件,根据需要打开或关闭相应开关,根据安装部署章节的安装指导,重新作镜像部署即可 - - - -`dcmi.go` - -```go -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package plugins this for dcmi interface -package plugins - -// #cgo LDFLAGS: -ldl -/* - #include - #include - #include - #include - - #include "dcmi_interface_api.h" - - static void *dcmiHandle; - #define SO_NOT_FOUND -99999 - #define FUNCTION_NOT_FOUND -99998 - #define SUCCESS 0 - #define ERROR_UNKNOWN -99997 - #define CALL_FUNC(name,...) 
if(name##_func==NULL){return FUNCTION_NOT_FOUND;}return name##_func(__VA_ARGS__); - - static int (*dcmi_get_device_health_func)(int card_id, int device_id, unsigned int *health); - int dcmi_get_device_health(int card_id, int device_id, unsigned int *health){ - CALL_FUNC(dcmi_get_device_health,card_id,device_id,health) - } - - // load .so files and functions - static int dcmiLoad_dl(const char* dcmiLibPath){ - if (dcmiLibPath == NULL) { - fprintf (stderr,"lib path is null\n"); - return SO_NOT_FOUND; - } - dcmiHandle = dlopen(dcmiLibPath,RTLD_LAZY | RTLD_GLOBAL); - if (dcmiHandle == NULL){ - fprintf (stderr,"%s\n",dlerror()); - return SO_NOT_FOUND; - } - - dcmi_get_device_health_func = dlsym(dcmiHandle,"dcmi_get_device_health"); - - return SUCCESS; - } - - static int dcmiShutDown(void){ - if (dcmiHandle == NULL) { - return SUCCESS; - } - return (dlclose(dcmiHandle) ? ERROR_UNKNOWN : SUCCESS); - } -*/ -import "C" -import ( - "fmt" - - "unsafe" - - "ascend-common/common-utils/utils" - "ascend-common/devmanager/common" -) - -const ( - dcmiLibraryName = "libdcmi.so" -) - -// DcLoad load dcmi symbol -func DcLoad() error { - dcmiLibPath, err := utils.GetDriverLibPath(dcmiLibraryName) - if err != nil { - return err - } - cDcmiTemplateName := C.CString(dcmiLibPath) - defer C.free(unsafe.Pointer(cDcmiTemplateName)) - if retCode := C.dcmiLoad_dl(cDcmiTemplateName); retCode != C.SUCCESS { - return fmt.Errorf("dcmi lib load failed, error code: %d", int32(retCode)) - } - return nil -} - -// DcShutDown clean the dynamically loaded resource -func DcShutDown() error { - if retCode := C.dcmiShutDown(); retCode != C.SUCCESS { - return fmt.Errorf("dcmi shut down failed, error code: %d", int32(retCode)) - } - - return nil -} - -// DcGetDeviceHealth get device health -func DcGetDeviceHealth(cardID, deviceID int32) (int32, error) { - if !common.IsValidCardIDAndDeviceID(cardID, deviceID) { - return common.RetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID) - } - 
var health C.uint - if retCode := C.dcmi_get_device_health(C.int(cardID), C.int(deviceID), - &health); int32(retCode) != common.Success { - return common.RetError, fmt.Errorf("get device (cardID: %d, deviceID: %d) health state failed, ret "+ - "code: %d, health code: %d", cardID, deviceID, int32(retCode), int64(health)) - } - if common.IsGreaterThanOrEqualInt32(int64(health)) { - return common.RetError, fmt.Errorf("get wrong health state , device (cardID: %d, deviceID: %d) "+ - "health: %d", cardID, deviceID, int64(health)) - } - return int32(health), nil -} - -``` - - - -`dcmi_interface_api.h` - -```c++ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#ifndef __DCMI_INTERFACE_API_H__ -#define __DCMI_INTERFACE_API_H__ - -#ifdef __cplusplus -#if __cplusplus -extern "C" { -#endif -#endif /* __cplusplus */ - -#define DCMIDLLEXPORT static - -DCMIDLLEXPORT int dcmi_get_device_health(int card_id, int device_id, unsigned int *health); - -#ifdef __cplusplus -#if __cplusplus -} -#endif -#endif /* __cplusplus */ - -#endif /* __DCMI_INTERFACE_API_H__ */ -``` - - - -`custom_metrics.go` - -```go -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package plugins for custom metrics -package plugins - -import ( - "strings" - "sync" - "time" - - "github.com/prometheus/client_golang/prometheus" - - "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/container" - "huawei.com/npu-exporter/v6/utils/logger" -) - -var ( - PluginInfoDesc = prometheus.NewDesc("plugin_info", "exporter custom plugin info", - []string{"plugin_label"}, nil) - - PluginNpuInfoDesc = prometheus.NewDesc("npu_plugin_info", "exporter custom npu plugin info", - []string{"npu_plugin_label"}, nil) -) - -const ( - pluginInfoKey = "pluginInfoKey" - pluginInfoValue = 1.11111 - pluginLabel = "pluginLabel" - npuPluginLabel = "npuPluginInfoKey" - npuPluginInfoKey = "npuPluginInfoKey" - pluginName = "MyPlugin" -) - -// PluginInfoCollector collect custom plugin info -type PluginInfoCollector struct { - common.MetricsCollectorAdapter - Cache sync.Map -} - -// Describe description of the metric -func (c *PluginInfoCollector) Describe(ch chan<- *prometheus.Desc) { - // add desc - logger.Debug("PluginInfoCollector Describe") - ch <- PluginInfoDesc - ch <- PluginNpuInfoDesc -} - -// CollectToCache collect the metric to cache -func (c *PluginInfoCollector) CollectToCache(n *common.NpuCollector, chipList []common.HuaWeiAIChip) { - // collect metric to cache - logger.Debug("PluginInfoCollector CollectToCache") - c.Cache.Store(pluginInfoKey, pluginInfoValue) - health, err := DcGetDeviceHealth(0, 0) - if err != nil { - logger.Error(err) - return - } - c.Cache.Store(npuPluginInfoKey, health) -} - -// UpdatePrometheus update 
prometheus metric -func (c *PluginInfoCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *common.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []common.HuaWeiAIChip) { - logger.Debug("PluginInfoCollector UpdatePrometheus") - // get metric from cache - pluginCache, _ := c.Cache.Load(pluginInfoKey) - npuPluginCache, _ := c.Cache.Load(npuPluginInfoKey) - // update plugin info - ch <- prometheus.NewMetricWithTimestamp(time.Now(), - prometheus.MustNewConstMetric(PluginInfoDesc, prometheus.GaugeValue, pluginCache.(float64), pluginLabel)) - // update npu plugin info - value := float64(npuPluginCache.(int32)) - ch <- prometheus.NewMetricWithTimestamp(time.Now(), - prometheus.MustNewConstMetric(PluginNpuInfoDesc, prometheus.GaugeValue, value, npuPluginLabel)) - -} - -// UpdateTelegraf update telegraf metric -func (c *PluginInfoCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *common.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []common.HuaWeiAIChip) map[string]map[string]interface{} { - logger.Debug("PluginInfoCollector UpdateTelegraf") - // get metric from cache - pluginCache, _ := c.Cache.Load(pluginInfoKey) - npuPluginCache, _ := c.Cache.Load(npuPluginInfoKey) - // update plugin info - if fieldsMap[common.GeneralDevTagKey] == nil { - fieldsMap[common.GeneralDevTagKey] = make(map[string]interface{}) - } - doUpdateTelegraf(fieldsMap[common.GeneralDevTagKey], PluginInfoDesc, pluginCache.(float64), "") - // update npu plugin info - const NpuLogicID = "1" - value := float64(npuPluginCache.(int32)) - if fieldsMap[NpuLogicID] == nil { - fieldsMap[NpuLogicID] = make(map[string]interface{}) - } - doUpdateTelegraf(fieldsMap[NpuLogicID], PluginNpuInfoDesc, value, "") - return fieldsMap -} - -// PreCollect pre handle before collect -func (c *PluginInfoCollector) PreCollect(n *common.NpuCollector, chipList []common.HuaWeiAIChip) { - logger.Debug("PluginInfoCollector PreCollect") -} - -// PostCollect post 
handle after collect -func (c *PluginInfoCollector) PostCollect(n *common.NpuCollector) { - logger.Debug("PluginInfoCollector PostCollect") -} - -// IsSupported Check whether the current hardware supports this metric -func (c *PluginInfoCollector) IsSupported(n *common.NpuCollector) bool { - logger.Debug("PluginInfoCollector IsSupported") - return true -} - -// getDescName parse metrics name from prometheus.Desc object -func getDescName(desc *prometheus.Desc) string { - str := desc.String() - startIndex := strings.Index(str, "fqName: ") + len("fqName: ") - readfqName := str[startIndex:] - - endIndex := strings.Index(readfqName, ",") - if endIndex != -1 { - readfqName = readfqName[:endIndex] - } - - readfqName = strings.Trim(readfqName, "\"") - return readfqName -} - -func doUpdateTelegraf(fieldMap map[string]interface{}, desc *prometheus.Desc, value interface{}, extInfo string) { - fieldMap[getDescName(desc)+extInfo] = value -} - - -``` - - - -`register.go` - -```go -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package plugins for custom metrics -package plugins - -import ( - "huawei.com/npu-exporter/v6/collector/config" - "huawei.com/npu-exporter/v6/utils/logger" -) - -// RegisterPlugin register plugin collector -func RegisterPlugin() { - err := config.AddPluginCollector(pluginName, &PluginInfoCollector{}) - if err != nil { - logger.Errorf("add plugin failed: %v\n", err) - } - logger.Infof("add plugin ok: %v\n", pluginName) - err = DcLoad() - if err != nil { - logger.Errorf("dcmi init failed: %v\n", err) - return - } -} - -``` - diff --git a/mind-cluster/component/npu-exporter/plugins/collector_for_text_file.go b/mind-cluster/component/npu-exporter/plugins/collector_for_text_file.go deleted file mode 100644 index db462a4..0000000 --- a/mind-cluster/component/npu-exporter/plugins/collector_for_text_file.go +++ /dev/null @@ -1,358 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package plugins for custom metrics -package plugins - -import ( - "encoding/json" - "fmt" - "os" - "sort" - "strings" - "sync" - "time" - - "github.com/prometheus/client_golang/prometheus" - - "ascend-common/common-utils/hwlog" - "ascend-common/common-utils/utils" - "huawei.com/npu-exporter/v6/collector/common" - "huawei.com/npu-exporter/v6/collector/config" - "huawei.com/npu-exporter/v6/collector/container" - npuutils "huawei.com/npu-exporter/v6/utils" - "huawei.com/npu-exporter/v6/utils/logger" -) - -var ( - metricDesc *prometheus.Desc - labelKeys []string // a list of tag keys extracted from the datalist - jsonFilePath string - isSupported bool - currentVersion versionInfo -) - -const ( - size100k = 100 * 1024 - maxLabelSize = 10 - num1000 = 1000 - maxDataListSize = 128 - maxMetricNameSize = 128 - maxDescSize = 1024 - fileMetricsDisabledMsg = "file metrics collection will be disabled" - skipCurrentCollectionMsg = "will skip current collection and report cached metrics" - excludedPermission = 0111 // file should not have any execute permission -) - -type versionInfo struct { - name string - desc string - version string -} - -// TextMetricData represents the JSON structure -type TextMetricData struct { - Version string `json:"version"` - Desc string `json:"desc"` - Name string `json:"name"` - Timestamp int64 `json:"timestamp"` - DataList []DataItem `json:"data_list"` -} - -// DataItem represents each item in data_list -type DataItem struct { - Label map[string]string `json:"label"` - Value float64 `json:"value"` -} - -// InitTextMetricsDesc init text metric -func InitTextMetricsDesc(filePath string) { - if filePath == "" { - return - } - paths := strings.Split(filePath, ",") - if len(paths) > 1 { - logger.Warnf("multiple file paths detected in filePath: %s, only the first file will be used", filePath) - jsonFilePath = strings.TrimSpace(paths[0]) - } else { - jsonFilePath = filePath - } - if utils.IsDir(jsonFilePath) { - logger.Errorf("file path %s is a 
directory, only support specify file path", filePath) - return - } - fileData, err := waitForFile(jsonFilePath, time.Minute) - if err != nil { - logger.Warnf("read json file %s failed, %s: %v", jsonFilePath, fileMetricsDisabledMsg, err) - return - } - var metricsData TextMetricData - if err := json.Unmarshal(fileData, &metricsData); err != nil { - logger.Warnf("unmarshal json file %s failed, %s: %v, "+ - "Possible causes:\n1. The file is not in JSON format\n2. File size is more than 100KB ", jsonFilePath, fileMetricsDisabledMsg, err) - return - } - - if err := isDataOk(&metricsData); err != nil { - logger.Warnf("%v, %s", err, fileMetricsDisabledMsg) - return - } - - desc := metricsData.Desc - labelKeys = make([]string, 0, len(metricsData.DataList[0].Label)) - for key := range metricsData.DataList[0].Label { - labelKeys = append(labelKeys, key) - } - sort.Strings(labelKeys) - logger.Infof("init text metric succeeded, metricName: %v, version: %v, desc: %v, labels: %v", - metricsData.Name, metricsData.Version, desc, labelKeys) - - metricDesc = prometheus.NewDesc(metricsData.Name, desc, labelKeys, nil) - isSupported = true - currentVersion = versionInfo{ - name: metricsData.Name, - desc: desc, - version: metricsData.Version, - } - err = config.AddPluginCollector("text", &TextMetricsInfoCollector{}) - if err != nil { - logger.Errorf("%v", err) - } -} - -func isDataOk(metricsData *TextMetricData) error { - if len(metricsData.DataList) == 0 { - return fmt.Errorf("dataList is empty in json file %s", jsonFilePath) - } - if len(metricsData.DataList) > maxDataListSize { - return fmt.Errorf("size of dataList(%d) is more than max allowed dataList size(%d) in json file %s", - len(metricsData.DataList), maxDataListSize, jsonFilePath) - } - if len(metricsData.DataList[0].Label) > maxLabelSize { - return fmt.Errorf("size of first item's Label(%d) is more than max allowed label size(%d) in json file %s", - len(metricsData.DataList[0].Label), maxLabelSize, jsonFilePath) - } - if 
metricsData.Name == "" { - return fmt.Errorf("name field is empty in json file %s", jsonFilePath) - } - if len(metricsData.Name) > maxMetricNameSize { - return fmt.Errorf("length of metric name should not larger than %d, but current is %d", - maxMetricNameSize, len(metricsData.Name)) - } - if metricsData.Desc == "" { - return fmt.Errorf("desc field is empty in json file %s", jsonFilePath) - } - if len(metricsData.Desc) > maxDescSize { - return fmt.Errorf("length of metric desc should not larger than %d, but current is %d", - maxDescSize, len(metricsData.Desc)) - } - if metricsData.Version == "" { - return fmt.Errorf("version field is empty in json file %s", jsonFilePath) - } - // only support 1.0 version currently - if metricsData.Version != "1.0" { - return fmt.Errorf("version should be 1.0, but current is %s", metricsData.Version) - } - if metricsData.Timestamp <= 0 { - return fmt.Errorf("timestamp field is empty or not correct in json file %s", jsonFilePath) - } - return nil -} - -// waitForFile wait for file to exist -func waitForFile(filePath string, timeout time.Duration) ([]byte, error) { - const tickerDuration = 100 - deadline := time.Now().Add(timeout) - ticker := time.NewTicker(tickerDuration * time.Millisecond) - defer ticker.Stop() - once := sync.Once{} - - for { - fileData, err := utils.ReadLimitBytes(filePath, size100k) - err2 := checkFile(filePath) - if err2 != nil { - hwlog.RunLog.Errorf("check file err, %s: %v", filePath, err2) - } - if err2 != nil && !os.IsNotExist(err2) { - return nil, err2 - } - - if err == nil && err2 == nil && len(fileData) > 0 { - logger.Infof("successfully read json file %s", filePath) - return fileData, nil - } - if os.IsNotExist(err) || len(fileData) == 0 { - once.Do(func() { - logger.Warnf("file [%v] is not exist or file is empty, will wait 1 minute", filePath) - }) - if time.Now().After(deadline) { - return nil, fmt.Errorf("file %s does not exist or file is empty after waiting %v", filePath, timeout) - } - select { - 
case <-ticker.C: - continue - } - } - return nil, err - } -} - -func checkFile(filePath string) error { - absFilePath, err := utils.CheckPath(filePath) - if err != nil { - return err - } - if err = utils.DoCheckOwnerAndPermission(absFilePath, excludedPermission, 0); err != nil { - logger.Errorf("file permission should not included %04o: %v", excludedPermission, err) - return err - } - return nil -} - -// TextMetricsInfoCollector collect custom plugin info -type TextMetricsInfoCollector struct { - common.MetricsCollectorAdapter - Cache sync.Map -} - -// Describe description of the metric -func (c *TextMetricsInfoCollector) Describe(ch chan<- *prometheus.Desc) { - // add desc - if metricDesc != nil { - ch <- metricDesc - } -} - -// CollectToCache collect the metric to cache -func (c *TextMetricsInfoCollector) CollectToCache(n *common.NpuCollector, chipList []common.HuaWeiAIChip) { - // collect metric to cache - logger.Debugf("TextMetricsInfoCollector CollectToCache") - - fileData, err := utils.ReadLimitBytes(jsonFilePath, size100k) - if err != nil { - logger.LogfWithOptions(logger.WarnLevel, logger.LogOptions{Domain: "textMetrics", ID: "readFileErr"}, - "read json file %s failed: %v", jsonFilePath, err) - return - } - hwlog.ResetErrCnt("textMetrics", "readFileErr") - - var metricsData TextMetricData - if err := json.Unmarshal(fileData, &metricsData); err != nil { - logger.LogfWithOptions(logger.WarnLevel, logger.LogOptions{Domain: "textMetrics", ID: "unmarshalFileErr"}, - "unmarshal json file %s failed: %v", jsonFilePath, err) - return - } - hwlog.ResetErrCnt("textMetrics", "unmarshalFileErr") - - if err := isDataOk(&metricsData); err != nil { - logger.LogfWithOptions(logger.WarnLevel, logger.LogOptions{Domain: "textMetrics", ID: "dataNotOk"}, - "%v, %s", err, skipCurrentCollectionMsg) - return - } - hwlog.ResetErrCnt("textMetrics", "dataNotOk") - - if versionChanged(metricsData) { - logger.LogfWithOptions(logger.ErrorLevel, logger.LogOptions{Domain: "textMetrics", 
ID: "versionChanged"}, - "json file base info changed, old: %v, new: %v", currentVersion, - versionInfo{name: metricsData.Name, desc: metricsData.Desc, version: metricsData.Version}) - return - } - hwlog.ResetErrCnt("textMetrics", "versionChanged") - - c.Cache.Store(common.GetCacheKey(c), metricsData) -} - -func versionChanged(data TextMetricData) bool { - if currentVersion.name != data.Name || currentVersion.desc != data.Desc || - currentVersion.version != data.Version { - return true - } - return false -} - -// UpdatePrometheus update prometheus metric -func (c *TextMetricsInfoCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *common.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []common.HuaWeiAIChip) { - logger.Debug("TextMetricsInfoCollector UpdatePrometheus") - if metricDesc == nil { - logger.Warnf("metricDesc is not initialized, skip UpdatePrometheus") - return - } - cacheKey := common.GetCacheKey(c) - data, ok := c.Cache.Load(cacheKey) - if !ok { - logger.Debugf("cache key %s not found", cacheKey) - return - } - - textMetricsData, ok := data.(TextMetricData) - if !ok { - logger.Warnf("cache data type mismatch for key %s", cacheKey) - return - } - - timestamp := time.Unix(0, textMetricsData.Timestamp*num1000) - - for _, item := range textMetricsData.DataList { - labelValues := make([]string, len(labelKeys)) - for i, key := range labelKeys { - if value, ok := item.Label[key]; ok { - labelValues[i] = value - } else { - labelValues[i] = "" - } - } - - ch <- prometheus.NewMetricWithTimestamp(timestamp, - prometheus.MustNewConstMetric(metricDesc, prometheus.GaugeValue, item.Value, labelValues...)) - } -} - -// UpdateTelegraf update telegraf metric -func (c *TextMetricsInfoCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *common.NpuCollector, - containerMap map[int32]container.DevicesInfo, chips []common.HuaWeiAIChip) map[string]map[string]interface{} { - logger.Debug("TextMetricsInfoCollector UpdateTelegraf") 
- - if metricDesc == nil { - logger.Warnf("metricDesc is not initialized, skip UpdateTelegraf") - return fieldsMap - } - - cacheKey := common.GetCacheKey(c) - data, ok := c.Cache.Load(cacheKey) - if !ok { - logger.Debugf("cache key %s not found", cacheKey) - return fieldsMap - } - - textMetricData, ok := data.(TextMetricData) - if !ok { - logger.Warnf("cache data type mismatch for key %s", cacheKey) - return fieldsMap - } - - for _, item := range textMetricData.DataList { - if fieldsMap[common.GeneralDevTagKey] == nil { - fieldsMap[common.GeneralDevTagKey] = make(map[string]interface{}) - } - npuutils.DoUpdateTelegraf(fieldsMap[common.GeneralDevTagKey], metricDesc, item.Value, "") - } - - return fieldsMap -} - -// IsSupported Check whether the current hardware supports this metric -func (c *TextMetricsInfoCollector) IsSupported(n *common.NpuCollector) bool { - return isSupported -} diff --git a/mind-cluster/component/npu-exporter/plugins/register.go b/mind-cluster/component/npu-exporter/plugins/register.go deleted file mode 100644 index e9b5f41..0000000 --- a/mind-cluster/component/npu-exporter/plugins/register.go +++ /dev/null @@ -1,21 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package plugins for custom metrics -package plugins - -// RegisterPlugin register plugin collector -func RegisterPlugin() { - -} diff --git a/mind-cluster/component/npu-exporter/utils/logger/general_logger.go b/mind-cluster/component/npu-exporter/utils/logger/general_logger.go deleted file mode 100644 index 3f1e19c..0000000 --- a/mind-cluster/component/npu-exporter/utils/logger/general_logger.go +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package logger for general collector -package logger - -import ( - "context" - "fmt" - - "ascend-common/common-utils/hwlog" -) - -const ( - maxLogLineLength = 1024 - defaultLogFile = "/var/log/mindx-dl/npu-exporter/npu-exporter.log" -) - -type generalLogger struct { -} - -// dynamicConfigure configures the logger -func (c *generalLogger) dynamicConfigure(Config) { -} - -// log logs with specified level -func (c *generalLogger) log(ctx context.Context, level Level, args ...interface{}) { - fn, ok := logFuncs[level] - if !ok { - hwlog.RunLog.Warnf("unknown log level: %v", level) - return - } - - fn(hwlog.DeepIncrease(ctx), args...) 
-} - -// logf logs with specified level and format -func (c *generalLogger) logf(ctx context.Context, level Level, format string, args ...interface{}) { - fn, ok := logfFuncs[level] - if !ok { - hwlog.RunLog.Warnf("unknown log level: %v", level) - return - } - - fn(hwlog.DeepIncrease(ctx), format, args...) -} - -func (c *generalLogger) logfWithOptions(ctx context.Context, level Level, opts LogOptions, format string, - args ...interface{}) { - - if opts.MaxCounts == 0 { - opts.MaxCounts = hwlog.ProblemOccurMaxNumbers - } - - if needPrint, extraErrLog := hwlog.IsNeedPrintWithSpecifiedCounts(opts.Domain, opts.ID, opts.MaxCounts); needPrint { - format = fmt.Sprintf("%s %s", format, extraErrLog) - fn, ok := logfFuncs[level] - if !ok { - hwlog.RunLog.Warnf("unknown log level: %v", level) - return - } - - fn(hwlog.DeepIncrease(ctx), format, args...) - } -} diff --git a/mind-cluster/component/npu-exporter/utils/logger/logger.go b/mind-cluster/component/npu-exporter/utils/logger/logger.go deleted file mode 100644 index 723e070..0000000 --- a/mind-cluster/component/npu-exporter/utils/logger/logger.go +++ /dev/null @@ -1,174 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package logger for general collector -package logger - -import ( - "context" - "errors" - "fmt" - - "github.com/influxdata/telegraf" - - "ascend-common/common-utils/hwlog" -) - -// the method mapping table (avoid rebuilding with every call) -var ( - logFuncs = map[Level]logFunc{} - logfFuncs = map[Level]logfFunc{} -) - -const ( - // DebugLevel Debug level - DebugLevel Level = iota - 1 - // InfoLevel Info level - InfoLevel - // WarnLevel Warn level - WarnLevel - // ErrorLevel Error level - ErrorLevel - - // PrometheusPlatform Prometheus platform - PrometheusPlatform = "Prometheus" - // TelegrafPlatform Telegraf platform - TelegrafPlatform = "Telegraf" -) - -// HwLogConfig default log file -var HwLogConfig = &hwlog.LogConfig{ - LogFileName: defaultLogFile, - ExpiredTime: hwlog.DefaultExpiredTime, - CacheSize: hwlog.DefaultCacheSize, - MaxLineLength: maxLogLineLength, -} - -// Level log level -type Level int - -// logFunc log function -type logFunc func(ctx context.Context, args ...interface{}) - -// logfFunc logf function -type logfFunc func(ctx context.Context, format string, args ...interface{}) - -var ( - // logger Unified log printer - logger UnifiedLogger -) - -// InitLogger initialize the log manager -func InitLogger(platform string) error { - - if platform == TelegrafPlatform { - logger = &telegrafLogger{} - HwLogConfig.LogFileName = defaultTelegrafLogPath - HwLogConfig.OnlyToFile = true - } else if platform == PrometheusPlatform { - logger = &generalLogger{} - } else { - return errors.New("platform is not supported:" + platform) - } - - if err := hwlog.InitRunLogger(HwLogConfig, context.Background()); err != nil { - fmt.Printf("hwlog init failed, error is %v\n", err) - return err - } - - logFuncs = map[Level]logFunc{ - DebugLevel: hwlog.RunLog.DebugWithCtx, - InfoLevel: hwlog.RunLog.InfoWithCtx, - WarnLevel: hwlog.RunLog.WarnWithCtx, - ErrorLevel: hwlog.RunLog.ErrorWithCtx, - } - - logfFuncs = map[Level]logfFunc{ - DebugLevel: 
hwlog.RunLog.DebugfWithCtx, - InfoLevel: hwlog.RunLog.InfofWithCtx, - WarnLevel: hwlog.RunLog.WarnfWithCtx, - ErrorLevel: hwlog.RunLog.ErrorfWithCtx, - } - return nil -} - -// LogOptions options for log -type LogOptions struct { - Domain string - ID interface{} - MaxCounts int -} - -// Config config for telegraf -type Config struct { - Acc telegraf.Accumulator -} - -// UnifiedLogger unified logger interface -type UnifiedLogger interface { - dynamicConfigure(Config) - log(ctx context.Context, level Level, args ...interface{}) - logf(ctx context.Context, level Level, format string, args ...interface{}) - logfWithOptions(ctx context.Context, level Level, opts LogOptions, format string, args ...interface{}) -} - -// Debug print log info with debug level -func Debug(args ...interface{}) { - logger.log(hwlog.DeepIncrease(context.Background()), DebugLevel, args...) -} - -// Info print log info with info level -func Info(args ...interface{}) { - logger.log(hwlog.DeepIncrease(context.Background()), InfoLevel, args...) -} - -// Warn print log info with warn level -func Warn(args ...interface{}) { - logger.log(hwlog.DeepIncrease(context.Background()), WarnLevel, args...) -} - -// Error print log info with error level -func Error(args ...interface{}) { - logger.log(hwlog.DeepIncrease(context.Background()), ErrorLevel, args...) -} - -// Debugf print log info with debug level -func Debugf(format string, args ...interface{}) { - logger.logf(hwlog.DeepIncrease(context.Background()), DebugLevel, format, args...) -} - -// Infof print log info with info level -func Infof(format string, args ...interface{}) { - logger.logf(hwlog.DeepIncrease(context.Background()), InfoLevel, format, args...) -} - -// Warnf print log info with warn level -func Warnf(format string, args ...interface{}) { - logger.logf(hwlog.DeepIncrease(context.Background()), WarnLevel, format, args...) 
-} - -// Errorf print log info with error level -func Errorf(format string, args ...interface{}) { - logger.logf(hwlog.DeepIncrease(context.Background()), ErrorLevel, format, args...) -} - -// LogfWithOptions print log info with error level -func LogfWithOptions(level Level, opts LogOptions, format string, args ...interface{}) { - logger.logfWithOptions(hwlog.DeepIncrease(context.Background()), level, opts, format, args...) -} - -// DynamicConfigure configure the logger -func DynamicConfigure(config Config) { - logger.dynamicConfigure(config) -} diff --git a/mind-cluster/component/npu-exporter/utils/logger/logger_test.go b/mind-cluster/component/npu-exporter/utils/logger/logger_test.go deleted file mode 100644 index a08ad4b..0000000 --- a/mind-cluster/component/npu-exporter/utils/logger/logger_test.go +++ /dev/null @@ -1,119 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package logger for general collector -package logger - -import ( - "errors" - "testing" - - "ascend-common/common-utils/hwlog" -) - -// TestInitLogger tests the InitLogger function -func TestInitLogger(t *testing.T) { - tests := []struct { - name string - platform string - expected error - }{ - { - name: "Telegraf Platform", - platform: TelegrafPlatform, - expected: nil, - }, - { - name: "Prometheus Platform", - platform: PrometheusPlatform, - expected: nil, - }, - { - name: "Unsupported Platform", - platform: "Unsupported", - expected: errors.New("platform is not supported:Unsupported"), - }, - } - - HwLogConfig.LogLevel = 0 - HwLogConfig.MaxBackups = hwlog.DefaultMaxBackups - HwLogConfig.LogFileName = defaultLogFile - HwLogConfig.MaxAge = hwlog.DefaultMinSaveAge - - var noExistLevel Level = 5 - var args = "mock" - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - err := InitLogger(tt.platform) - if tt.expected == nil && err != nil { - t.Errorf("InitLogger(%s) = %v, want %v", tt.platform, err, tt.expected) - } else if tt.expected != nil && err.Error() != tt.expected.Error() { - t.Errorf("InitLogger(%s) = %v, want %v", tt.platform, err, tt.expected) - } - - logger.log(nil, DebugLevel, args) - logger.log(nil, InfoLevel, args) - logger.log(nil, WarnLevel, args) - logger.log(nil, noExistLevel, args) - logger.logfWithOptions(nil, DebugLevel, LogOptions{}, "test logf with options %s", "arg") - - logger.logf(nil, DebugLevel, args) - logger.logf(nil, InfoLevel, args) - logger.logf(nil, WarnLevel, args) - logger.logf(nil, noExistLevel, args) - logger.logfWithOptions(nil, DebugLevel, LogOptions{}, "test logf with options %s", "arg") - - }) - } -} - -func TestLoggerMethods(t *testing.T) { - - tests := []struct { - name string - method func(...interface{}) - level Level - args []interface{} - }{ - {"test Debug", Debug, DebugLevel, []interface{}{"debug message"}}, - {"test Info", Info, InfoLevel, []interface{}{"info message"}}, - {"test Warn", Warn, 
WarnLevel, []interface{}{"warn message"}}, - {"test Error", Error, ErrorLevel, []interface{}{"error message"}}, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - test.method(test.args...) - }) - } - - testsF := []struct { - name string - method func(string, ...interface{}) - level Level - format string - args []interface{} - }{ - {"test Debugf", Debugf, DebugLevel, "debug message %d", []interface{}{1}}, - {"test Infof", Infof, InfoLevel, "info message %d", []interface{}{1}}, - {"test Warnf", Warnf, WarnLevel, "warn message %d", []interface{}{1}}, - {"test Errorf", Errorf, ErrorLevel, "error message %d", []interface{}{1}}, - } - - for _, test := range testsF { - t.Run(test.name, func(t *testing.T) { - test.method(test.format, test.args...) - }) - } -} diff --git a/mind-cluster/component/npu-exporter/utils/logger/telegraf_logger.go b/mind-cluster/component/npu-exporter/utils/logger/telegraf_logger.go deleted file mode 100644 index 56c2ac5..0000000 --- a/mind-cluster/component/npu-exporter/utils/logger/telegraf_logger.go +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -// Package logger for general collector -package logger - -import ( - "context" - "errors" - "fmt" - "strings" - - "github.com/influxdata/telegraf" - - "ascend-common/common-utils/hwlog" -) - -var defaultTelegrafLogPath = "/var/log/mindx-dl/npu-exporter/npu-plugin.log" -var dangerousChars = map[string]string{ - "\n": "\\n", - "\r": "\\r", - "\t": "\\t", -} - -type telegrafLogger struct { - acc telegraf.Accumulator -} - -// dynamicConfigure configures the logger -func (c *telegrafLogger) dynamicConfigure(config Config) { - c.acc = config.Acc -} - -// log logs with specified level -func (c *telegrafLogger) log(ctx context.Context, level Level, args ...interface{}) { - c.logf(hwlog.DeepIncrease(ctx), level, "%s", args...) -} - -// logf logs with specified level and format -func (c *telegrafLogger) logf(ctx context.Context, level Level, format string, args ...interface{}) { - sanitized := format - for char, replacement := range dangerousChars { - sanitized = strings.ReplaceAll(sanitized, char, replacement) - } - if level < InfoLevel || c.acc == nil { - fn, ok := logfFuncs[level] - if !ok { - hwlog.RunLog.Warnf("unknown log level: %v", level) - return - } - - fn(hwlog.DeepIncrease(ctx), sanitized, args...) - return - } - - c.acc.AddError(errors.New(fmt.Sprintf(sanitized, args...))) -} - -// LogfWithOptions print log info with options -func (c *telegrafLogger) logfWithOptions(ctx context.Context, level Level, opts LogOptions, format string, - args ...interface{}) { - - if opts.MaxCounts == 0 { - opts.MaxCounts = hwlog.ProblemOccurMaxNumbers - } - - if needPrint, extraErrLog := hwlog.IsNeedPrintWithSpecifiedCounts(opts.Domain, opts.ID, opts.MaxCounts); needPrint { - format = fmt.Sprintf("%s %s", format, extraErrLog) - c.logf(hwlog.DeepIncrease(ctx), level, format, args...) 
- } -} diff --git a/mind-cluster/component/npu-exporter/utils/utils.go b/mind-cluster/component/npu-exporter/utils/utils.go deleted file mode 100644 index b5da97c..0000000 --- a/mind-cluster/component/npu-exporter/utils/utils.go +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package utils for common utils -package utils - -import ( - "strings" - - "github.com/prometheus/client_golang/prometheus" -) - -// GetDescName parse metrics name from prometheus.Desc object -func GetDescName(desc *prometheus.Desc) string { - if desc == nil { - return "" - } - str := desc.String() - startIndex := strings.Index(str, "fqName: ") - if startIndex == -1 { - return "" - } - readfqName := str[startIndex+len("fqName: "):] - - endIndex := strings.Index(readfqName, ",") - if endIndex == -1 { - return "" - } - readfqName = readfqName[:endIndex] - - readfqName = strings.Trim(readfqName, "\"") - return readfqName -} - -// DoUpdateTelegraf update telegraf -func DoUpdateTelegraf(fieldMap map[string]interface{}, desc *prometheus.Desc, value interface{}, extInfo string) { - if fieldMap == nil { - return - } - fieldMap[GetDescName(desc)+extInfo] = value -} diff --git a/mind-cluster/component/npu-exporter/utils/utils_test.go b/mind-cluster/component/npu-exporter/utils/utils_test.go deleted file mode 100644 index 1a91d29..0000000 --- a/mind-cluster/component/npu-exporter/utils/utils_test.go +++ 
/dev/null @@ -1,103 +0,0 @@ -/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package utils for common utils -package utils - -import ( - "testing" - - "github.com/agiledragon/gomonkey/v2" - "github.com/prometheus/client_golang/prometheus" - "github.com/smartystreets/goconvey/convey" -) - -const ( - emptyString = "" - testMetricName = "test_metric" - testMetricName2 = "another_metric" - invalidDescStr = "invalid description" - noCommaDescStr = "fqName: test_metric" - normalDescStr = `fqName: "test_metric", help: "test help"` - normalDescStr2 = `fqName: another_metric, help: "another help"` - noQuoteDescStr = `fqName: test_metric, help: "test help"` - testHelp = "test help" -) - -func TestGetDescName(t *testing.T) { - convey.Convey("should return empty string when desc is nil", t, testGetDescNameNil) - convey.Convey("should return empty string when desc.String does not contain fqName prefix", t, - testGetDescNameNoFqName) - convey.Convey("should return empty string when desc.String does not contain comma", t, - testGetDescNameNoComma) - convey.Convey("should return metric name when desc.String contains valid format", t, - testGetDescNameValidFormat) -} - -func testGetDescNameNil() { - result := GetDescName(nil) - convey.So(result, convey.ShouldEqual, emptyString) -} - -func testGetDescNameNoFqName() { - desc := prometheus.NewDesc(testMetricName, testHelp, nil, nil) - patch := 
gomonkey.ApplyMethodReturn(desc, "String", invalidDescStr) - defer patch.Reset() - - result := GetDescName(desc) - convey.So(result, convey.ShouldEqual, emptyString) -} - -func testGetDescNameNoComma() { - desc := prometheus.NewDesc(testMetricName, testHelp, nil, nil) - patch := gomonkey.ApplyMethodReturn(desc, "String", noCommaDescStr) - defer patch.Reset() - - result := GetDescName(desc) - convey.So(result, convey.ShouldEqual, emptyString) -} - -func testGetDescNameValidFormat() { - testCases := []struct { - name string - descStr string - expected string - }{ - { - name: "should return metric name when desc.String contains normal format with quotes", - descStr: normalDescStr, - expected: testMetricName, - }, - { - name: "should return metric name when desc.String contains normal format without quotes", - descStr: noQuoteDescStr, - expected: testMetricName, - }, - { - name: "should return correct metric name when desc.String contains another metric", - descStr: normalDescStr2, - expected: testMetricName2, - }, - } - - for _, tc := range testCases { - desc := prometheus.NewDesc(testMetricName, testHelp, nil, nil) - patch := gomonkey.ApplyMethodReturn(desc, "String", tc.descStr) - - result := GetDescName(desc) - convey.So(result, convey.ShouldEqual, tc.expected) - - patch.Reset() - } -} diff --git a/mind-cluster/component/npu-exporter/versions/version.go b/mind-cluster/component/npu-exporter/versions/version.go deleted file mode 100644 index 63dba00..0000000 --- a/mind-cluster/component/npu-exporter/versions/version.go +++ /dev/null @@ -1,23 +0,0 @@ -/* Copyright(C) 2021. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package versions record the program version -package versions - -var ( - // BuildVersion record the program build version - BuildVersion string - // BuildName record the program build name - BuildName string -) From ed7cf0bc9088ec210bac469c7b2e6bcce7f039ec Mon Sep 17 00:00:00 2001 From: daniel1210 <8622091+daniel1210@user.noreply.gitee.com> Date: Wed, 21 Jan 2026 15:29:06 +0800 Subject: [PATCH 08/10] add README Prerequisites Signed-off-by: daniel1210 <8622091+daniel1210@user.noreply.gitee.com> Signed-off-by: ashergaga <1214443299@qq.com> --- README.md | 4 ++++ README_cn.md | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/README.md b/README.md index 2523e59..aea0e91 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,10 @@ Memory slicing is supported based on virtualization template, lease available te [ascend-docker-runtime](https://gitcode.com/Ascend/mind-cluster/tree/master/component/ascend-docker-runtime) +```bash +git submodule add https://gitcode.com/Ascend/mind-cluster.git +``` + ## Compile ```bash diff --git a/README_cn.md b/README_cn.md index 156ca53..a4988a4 100644 --- a/README_cn.md +++ b/README_cn.md @@ -10,6 +10,11 @@ Ascend device plugin 是用来支持在 [HAMi](https://github.com/Project-HAMi/H 部署 [ascend-docker-runtime](https://gitcode.com/Ascend/mind-cluster/tree/master/component/ascend-docker-runtime) +克隆子模块 mind-cluster +```bash +git submodule add https://gitcode.com/Ascend/mind-cluster.git +``` + ## 编译 ```bash From dae397465e904dd1fd5ece8065e1fad80cd21c17 Mon Sep 17 00:00:00 2001 From: ashergaga <1214443299@qq.com> Date: Thu, 22 Jan 2026 
03:20:59 +0000 Subject: [PATCH 09/10] add submodule mind-cluster Signed-off-by: ashergaga <1214443299@qq.com> --- .gitmodules | 3 +++ mind-cluster | 1 + 2 files changed, 4 insertions(+) create mode 100644 .gitmodules create mode 160000 mind-cluster diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..eaa5629 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "mind-cluster"] + path = mind-cluster + url = https://gitcode.com/Ascend/mind-cluster.git diff --git a/mind-cluster b/mind-cluster new file mode 160000 index 0000000..c9cf42d --- /dev/null +++ b/mind-cluster @@ -0,0 +1 @@ +Subproject commit c9cf42da06680ea6f825e4d312d0b5929923f482 From 5cb68f214fd4ed9fd7218ce6bbfa221a6eeec9e2 Mon Sep 17 00:00:00 2001 From: ashergaga <1214443299@qq.com> Date: Thu, 22 Jan 2026 18:23:52 +0800 Subject: [PATCH 10/10] fix compile error Signed-off-by: ashergaga <1214443299@qq.com> --- .github/workflows/go.yml | 1 - Dockerfile | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index 339b2d9..f64284e 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -28,7 +28,6 @@ jobs: - uses: actions/setup-go@v5 with: go-version: ${{ env.GO_VERSION }} - - run: go mod tidy - name: golangci-lint uses: golangci/golangci-lint-action@v6 with: diff --git a/Dockerfile b/Dockerfile index b99381f..634337e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,6 +14,8 @@ WORKDIR /build ADD . . RUN go mod download github.com/Project-HAMi/HAMi RUN go get github.com/Project-HAMi/ascend-device-plugin/internal/server +RUN go get huawei.com/npu-exporter +RUN go get huawei.com/npu-exporter/utils/logger@v0.0.0-00010101000000-000000000000 RUN make all FROM $BASE_IMAGE