55 "encoding/json"
66 "fmt"
77 "math"
8- "strconv"
98 "sync"
109 "sync/atomic"
1110 "time"
@@ -40,6 +39,15 @@ type IndexAllocator struct {
 	// In-use indexes from 0x01 to 0xf8; each entry records the pod currently using that index.
 	// Once a pod has completed CDI and started, or is pending image pulling, it should be removed from the queue.
 	nodeIndexQueue map[string]map[int]types.NamespacedName
+
+	// podIndexMap is the reverse lookup from a pod to the node index it occupies.
+	podIndexMap map[types.NamespacedName]indexIdentifier
+
+	// asyncCheckingMap tracks pods that already have an async assign loop running, to avoid duplicate goroutines.
+	asyncCheckingMap map[types.NamespacedName]struct{}
+}
+
+// indexIdentifier locates an occupied index by node name and index value.
+type indexIdentifier struct {
+	nodeName string
+	index    int
 }
 
 func NewIndexAllocator(ctx context.Context, client client.Client) (*IndexAllocator, error) {
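The two maps added above must stay in sync: `nodeIndexQueue` answers "which pod holds index N on this node", while `podIndexMap` answers "which index does this pod hold" so release is O(1). Below is a minimal, self-contained sketch of that invariant (`podKey`, `indexID`, and `allocator` are illustrative stand-ins, not names from this diff): both directions are always written and cleared inside the same critical section.

```go
package main

import (
	"fmt"
	"sync"
)

// podKey stands in for types.NamespacedName in this sketch.
type podKey struct{ Namespace, Name string }

// indexID mirrors the indexIdentifier added in this diff.
type indexID struct {
	nodeName string
	index    int
}

type allocator struct {
	mu        sync.RWMutex
	nodeQueue map[string]map[int]podKey // node -> index -> pod (forward)
	podIndex  map[podKey]indexID        // pod -> occupied index (reverse)
}

// occupy records both directions under one lock so the maps never diverge.
func (a *allocator) occupy(node string, idx int, pod podKey) {
	a.mu.Lock()
	defer a.mu.Unlock()
	q := a.nodeQueue[node]
	if q == nil {
		q = make(map[int]podKey)
		a.nodeQueue[node] = q
	}
	q[idx] = pod
	a.podIndex[pod] = indexID{nodeName: node, index: idx}
}

// release uses the reverse map to find what to delete without scanning nodes.
func (a *allocator) release(pod podKey) {
	a.mu.Lock()
	defer a.mu.Unlock()
	if id, ok := a.podIndex[pod]; ok {
		delete(a.nodeQueue[id.nodeName], id.index)
		delete(a.podIndex, pod)
	}
}

func main() {
	a := &allocator{
		nodeQueue: map[string]map[int]podKey{},
		podIndex:  map[podKey]indexID{},
	}
	p := podKey{"default", "worker-0"}
	a.occupy("node-a", 3, p)
	a.release(p)
	fmt.Println(len(a.nodeQueue["node-a"]), len(a.podIndex)) // 0 0
}
```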
@@ -53,6 +61,10 @@ func NewIndexAllocator(ctx context.Context, client client.Client) (*IndexAllocat
 		currentIndex:  0, // Will start from 1 on first assignment
 		ctx:           ctx,
 		initializedCh: make(chan struct{}),
+
+		nodeIndexQueue:   make(map[string]map[int]types.NamespacedName, 128),
+		podIndexMap:      make(map[types.NamespacedName]indexIdentifier, 128),
+		asyncCheckingMap: make(map[types.NamespacedName]struct{}, 128),
 	}
 
 	return allocator, nil
@@ -85,66 +97,156 @@ func (s *IndexAllocator) AssignIndex(podName string) (int, error) {
8597}
8698
8799// ReconcileLockState maintains memory state for node level index assign and release queue
88- func (s * IndexAllocator ) ReconcileLockState (pod * v1.Pod ) bool {
100+ func (s * IndexAllocator ) ReconcileLockState (pod * v1.Pod ) {
89101 if pod .Labels [constants .LabelComponent ] != constants .ComponentWorker {
90- return false
102+ return
91103 }
92104 // Check if it's TF indexed Pod by container resource limits
93105 // If isIndex But PodIndex not set, check phase, if pending, should assign index, next check
94106 if pod .Spec .NodeName == "" {
95- return false
107+ return
96108 }
97109
98- index := pod .Annotations [constants .PodIndexAnnotation ]
99- if index == "" {
100- return false
101- }
102- indexInt , err := strconv .Atoi (index )
110+ index , err := utils .ParsePodIndexResourceClaim (pod )
103111 if err != nil {
104- return false
112+ log .FromContext (s .ctx ).Error (err , "not TF indexed Pod, skip reconcile lock state" , "pod" , pod .Name )
113+ return
114+ }
115+ _ , indexAllocated := pod .Annotations [constants .PodIndexAnnotation ]
116+
117+ // Only pending pods can occupy the node level index
118+ if utils .IsPodPending (pod ) {
119+ s .storeMutex .Lock ()
120+ indexQueue := s .nodeIndexQueue [pod .Spec .NodeName ]
121+ if indexQueue == nil {
122+ indexQueue = make (map [int ]types.NamespacedName )
123+ s .nodeIndexQueue [pod .Spec .NodeName ] = indexQueue
124+ }
125+
126+ // If just started and missing in memory, should complement the index queue and pod index map
127+ if indexAllocated {
128+ // occupy the index if missing (when scheduler restarted)
129+ if _ , exists := indexQueue [index ]; ! exists {
130+ podMeta := types.NamespacedName {
131+ Namespace : pod .Namespace ,
132+ Name : pod .Name ,
133+ }
134+ indexQueue [index ] = podMeta
135+ s .podIndexMap [podMeta ] = indexIdentifier {
136+ nodeName : pod .Spec .NodeName ,
137+ index : index ,
138+ }
139+ }
140+ s .storeMutex .Unlock ()
141+ return
142+ }
143+
144+ if podMeta , exists := indexQueue [index ]; exists {
145+ // If already occupied by other Pod, check if it's the same Pod
146+ if podMeta .Namespace != pod .Namespace || podMeta .Name != pod .Name {
147+ log .FromContext (s .ctx ).Error (fmt .Errorf ("pod index conflict" ), "can not reconcile index lock, more than one pending pods occupy the same index" , "pod" , pod .Name , "index" , index )
148+ s .storeMutex .Unlock ()
149+ return
150+ }
151+ } else {
152+ // new Pod occupy the index, add to index queue
153+ indexQueue [index ] = types.NamespacedName {
154+ Namespace : pod .Namespace ,
155+ Name : pod .Name ,
156+ }
157+ s .podIndexMap [types.NamespacedName {
158+ Namespace : pod .Namespace ,
159+ Name : pod .Name ,
160+ }] = indexIdentifier {
161+ nodeName : pod .Spec .NodeName ,
162+ index : index ,
163+ }
164+ s .storeMutex .Unlock ()
165+ // Brand new pending pod, ensure the async checking loop for assigning index annotation
166+ s .AsyncCheckNodeIndexAvailableAndAssign (pod , index )
167+ }
168+ } else if utils .IsPodRunning (pod ) {
169+ s .RemoveNodeIndexQueueForPod (types.NamespacedName {
170+ Namespace : pod .Namespace ,
171+ Name : pod .Name ,
172+ })
105173 }
174+ }
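For context on how `ReconcileLockState` is meant to be driven, here is a sketch of hypothetical controller wiring, assuming a controller-runtime pod reconciler (`PodReconciler` and the `Allocator` interface are illustrative, not part of this change). Deleted pods are routed to `RemoveNodeIndexQueueForPod` so their index is released.

```go
package controller

import (
	"context"

	v1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/types"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// Allocator mirrors the two IndexAllocator methods exercised by this sketch.
type Allocator interface {
	ReconcileLockState(pod *v1.Pod)
	RemoveNodeIndexQueueForPod(nn types.NamespacedName)
}

// PodReconciler is hypothetical glue code, not part of the diff.
type PodReconciler struct {
	client.Client
	Allocator Allocator
}

func (r *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	var pod v1.Pod
	if err := r.Get(ctx, req.NamespacedName, &pod); err != nil {
		if apierrors.IsNotFound(err) {
			// A deleted pod must release its index, otherwise it stays occupied forever
			r.Allocator.RemoveNodeIndexQueueForPod(req.NamespacedName)
			return ctrl.Result{}, nil
		}
		return ctrl.Result{}, err
	}
	// Pending pods occupy an index; running pods are dequeued inside ReconcileLockState
	r.Allocator.ReconcileLockState(&pod)
	return ctrl.Result{}, nil
}
```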
 
+func (s *IndexAllocator) RemoveNodeIndexQueueForPod(namespacedName types.NamespacedName) {
 	s.storeMutex.Lock()
 	defer s.storeMutex.Unlock()
 
-	// Check Pod status
-	// TODO: call in Pod controller and gpu Allocator init stage
-
-	indexQueue := s.nodeIndexQueue[pod.Spec.NodeName]
-	if indexQueue == nil {
-		indexQueue = make(map[int]types.NamespacedName)
-		s.nodeIndexQueue[pod.Spec.NodeName] = indexQueue
+	identifier, exists := s.podIndexMap[namespacedName]
+	if !exists {
+		return
 	}
-	indexQueue[indexInt] = types.NamespacedName{
-		Namespace: pod.Namespace,
-		Name:      pod.Name,
-	}
-	return true
+	if indexQueue, exists := s.nodeIndexQueue[identifier.nodeName]; exists {
+		if val, ok := indexQueue[identifier.index]; ok && val == namespacedName {
+			delete(indexQueue, identifier.index)
+			log.FromContext(s.ctx).Info("Removed pod from node index queue after pod running/stopped/deleted", "pod", namespacedName, "index", identifier.index)
+		}
+	}
+	// Always drop the reverse mapping so released pods do not leak entries
+	delete(s.podIndexMap, namespacedName)
 }
 
-func (s *IndexAllocator) CheckNodeIndexAvailableForPod(pod *v1.Pod, index int) bool {
+// CheckNodeIndexAndTryOccupy reports whether the index is free on the pod's node
+// and, if so, occupies it. A single write lock covers both the check and the
+// occupation so that two concurrent callers can never claim the same index.
+func (s *IndexAllocator) CheckNodeIndexAndTryOccupy(pod *v1.Pod, index int) bool {
 	<-s.initializedCh
 	nodeName := pod.Spec.NodeName
 	if nodeName == "" {
 		// should not happen, unscheduled pod
 		return false
 	}
-	s.storeMutex.RLock()
-	defer s.storeMutex.RUnlock()
+	s.storeMutex.Lock()
+	defer s.storeMutex.Unlock()
 	indexQueue := s.nodeIndexQueue[nodeName]
 	if len(indexQueue) == 0 {
+		// Reconcile has not populated this node yet, treat the index as unavailable
 		return false
 	}
-	_, exists := indexQueue[index]
-	return !exists
+	if occupant, exists := indexQueue[index]; exists {
+		// An index already occupied by this same pod still counts as success
+		return occupant.Namespace == pod.Namespace && occupant.Name == pod.Name
+	}
+	// The index is free: occupy it for this pod
+	indexQueue[index] = types.NamespacedName{
+		Namespace: pod.Namespace,
+		Name:      pod.Name,
+	}
+	return true
 }
 
 func (s *IndexAllocator) SetReady() {
 	close(s.initializedCh)
 }
 
-func (s *IndexAllocator) CheckNodeIndexAvailableAndAssign(pod *v1.Pod, index int) {
+// AsyncCheckNodeIndexAvailableAndAssign starts at most one background retry loop
+// per pod that waits until the index can be assigned.
+func (s *IndexAllocator) AsyncCheckNodeIndexAvailableAndAssign(pod *v1.Pod, index int) {
+	s.storeMutex.Lock()
+	defer s.storeMutex.Unlock()
+	podMeta := types.NamespacedName{
+		Namespace: pod.Namespace,
+		Name:      pod.Name,
+	}
+	if _, exists := s.asyncCheckingMap[podMeta]; exists {
+		// A checking loop is already running for this pod, skip
+		return
+	}
+	s.asyncCheckingMap[podMeta] = struct{}{}
+
 	go func() {
+		defer func() {
+			s.storeMutex.Lock()
+			delete(s.asyncCheckingMap, podMeta)
+			s.storeMutex.Unlock()
+		}()
+
 		// Infinite backoff retry until the index is available and reconcile has started
 		_ = retry.OnError(wait.Backoff{
 			Duration: 3 * time.Second,
@@ -173,9 +275,10 @@ func (s *IndexAllocator) CheckNodeIndexAvailableAndAssign(pod *v1.Pod, index int
 						"pod", pod.Name, "node", pod.Spec.NodeName)
 					return nil
 				}
+				// Otherwise do nothing; this may be caused by duplicate reconciling
 			}
 
-			if !s.CheckNodeIndexAvailableForPod(pod, index) {
+			if !s.CheckNodeIndexAndTryOccupy(pod, index) {
 				return fmt.Errorf("index is not available")
 			}
 			// Index available, patch the annotation to transition the Pod from Pending to DeviceAllocating in hypervisor
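The loop above relies on `retry.OnError` from `k8s.io/client-go/util/retry`: it re-runs the function with the given backoff for as long as the predicate marks the returned error retriable. A self-contained demo of those semantics; only the 3-second `Duration` is visible in this diff, so the `Factor` and `Steps` values below are demo assumptions chosen to make it terminate quickly:

```go
package main

import (
	"errors"
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/util/retry"
)

func main() {
	attempts := 0
	// retry.OnError retries fn up to backoff.Steps times while the predicate
	// returns true. A very large Steps value approximates the allocator's
	// "infinite" retry; the diff uses Duration: 3 * time.Second.
	err := retry.OnError(wait.Backoff{
		Duration: 10 * time.Millisecond, // short interval so the demo is fast
		Factor:   1.0,                   // assumption: constant interval
		Steps:    5,                     // assumption: finite for the demo
	}, func(error) bool { return true }, func() error {
		attempts++
		if attempts < 3 {
			return errors.New("index is not available")
		}
		return nil
	})
	fmt.Println(attempts, err) // 3 <nil>
}
```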