-
Notifications
You must be signed in to change notification settings - Fork 44
监控取不到数据,代码有bug #78
Description
1、集群中有一个节点,单卡,该节点 Annotations 中的设备注册信息为:
hami.io/node-nvidia-register: '[{"id":"GPU-4dc064a4-4eb1-bd12-fb22-d08c1a52a0e7","count":10,"devmem":23028,"devcore":100,"type":"NVIDIA A10","mode":"hami-core","health":true,"devicepairscore":{}}]'
2、Nvidia Provider 获取设备信息
`
// FetchDevices returns the devices registered on node, read from the
// annotation stored under the RegisterAnnos key.
//
// When the annotation is absent a warning is logged and a nil slice is
// returned together with a nil error. Otherwise the annotation value is handed
// to util.DecodeNodeDevices and its result and error are returned unchanged.
func (n *Nvidia) FetchDevices(node *corev1.Node) ([]*util.DeviceInfo, error) {
	encoded, found := node.Annotations[RegisterAnnos]
	if !found {
		n.log.Warnf("%s node cloud not get hami.io/node-nvidia-register annotation", node.Name)
		return nil, nil
	}

	return util.DecodeNodeDevices(encoded, n.log)
}
`
3、调用DecodeNodeDevices方法
// DecodeNodeDevices decodes the node devices from a string. func DecodeNodeDevices(str string, log *log.Helper) ([]*DeviceInfo, error) { if !strings.Contains(str, OneContainerMultiDeviceSplitSymbol) { log.Warn("Node annotations not decode successfully") return []*DeviceInfo{}, errors.New("node annotations not decode successfully") } tmp := strings.Split(str, OneContainerMultiDeviceSplitSymbol) var retval []*DeviceInfo for _, val := range tmp { if strings.Contains(val, ",") { items := strings.Split(val, ",") if len(items) >= 7 || len(items) == 9 { count, _ := strconv.ParseInt(items[1], 10, 32) devmem, _ := strconv.ParseInt(items[2], 10, 32) devcore, _ := strconv.ParseInt(items[3], 10, 32) health, _ := strconv.ParseBool(items[6]) numa, _ := strconv.Atoi(items[5]) mode := "hami-core" index := 0 if len(items) == 9 { index, _ = strconv.Atoi(items[7]) mode = items[8] } i := DeviceInfo{ ID: items[0], AliasId: items[0], Count: int32(count), Devmem: int32(devmem), Devcore: int32(devcore), Type: items[4], Numa: numa, Health: health, Mode: mode, Index: uint(index), } retval = append(retval, &i) } else { return []*DeviceInfo{}, errors.New("node annotations not decode successfully") } } } return retval, nil }
4、结果解码失败,日志中报错如下:
WARN ts=2026-01-30T17:43:58+08:00 caller=data/node.go:78 msg=list devices info error: node annotations not decode successfully
不明白为什么要这样写解析代码:节点注解的值是 JSON 数组,而 DecodeNodeDevices 却按分隔符和逗号切分字符串,并要求每段至少有 7 个字段,因此返回了解码失败。