Skip to content

监控取不到数据,代码有bug #78

@achu911

Description

@achu911

1、一个节点,单卡,节点的Annotations上信息为:
hami.io/node-nvidia-register: '[{"id":"GPU-4dc064a4-4eb1-bd12-fb22-d08c1a52a0e7","count":10,"devmem":23028,"devcore":100,"type":"NVIDIA A10","mode":"hami-core","health":true,"devicepairscore":{}}]'

2、Nvidia Provider 获取设备信息
`
// FetchDevices looks up the RegisterAnnos annotation on node and hands its
// value to util.DecodeNodeDevices, returning that call's result unchanged.
//
// When the annotation is absent it logs a warning and returns a nil slice
// together with a nil error, so a node without the annotation is not treated
// as a failure.
func (n *Nvidia) FetchDevices(node *corev1.Node) ([]*util.DeviceInfo, error) {
	if encoded, found := node.Annotations[RegisterAnnos]; found {
		return util.DecodeNodeDevices(encoded, n.log)
	}

	n.log.Warnf("%s node cloud not get hami.io/node-nvidia-register annotation", node.Name)
	return nil, nil
}
`

3、调用DecodeNodeDevices方法
// DecodeNodeDevices decodes the node devices from a string. func DecodeNodeDevices(str string, log *log.Helper) ([]*DeviceInfo, error) { if !strings.Contains(str, OneContainerMultiDeviceSplitSymbol) { log.Warn("Node annotations not decode successfully") return []*DeviceInfo{}, errors.New("node annotations not decode successfully") } tmp := strings.Split(str, OneContainerMultiDeviceSplitSymbol) var retval []*DeviceInfo for _, val := range tmp { if strings.Contains(val, ",") { items := strings.Split(val, ",") if len(items) >= 7 || len(items) == 9 { count, _ := strconv.ParseInt(items[1], 10, 32) devmem, _ := strconv.ParseInt(items[2], 10, 32) devcore, _ := strconv.ParseInt(items[3], 10, 32) health, _ := strconv.ParseBool(items[6]) numa, _ := strconv.Atoi(items[5]) mode := "hami-core" index := 0 if len(items) == 9 { index, _ = strconv.Atoi(items[7]) mode = items[8] } i := DeviceInfo{ ID: items[0], AliasId: items[0], Count: int32(count), Devmem: int32(devmem), Devcore: int32(devcore), Type: items[4], Numa: numa, Health: health, Mode: mode, Index: uint(index), } retval = append(retval, &i) } else { return []*DeviceInfo{}, errors.New("node annotations not decode successfully") } } } return retval, nil }

报错!!!!!
WARN ts=2026-01-30T17:43:58+08:00 caller=data/node.go:78 msg=list devices info error: node annotations not decode successfully

不明白,为什么要这样写代码?

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions