55 "encoding/json"
66 "fmt"
77 "math"
8- "strconv"
98 "sync"
109 "sync/atomic"
1110 "time"
@@ -40,6 +39,15 @@ type IndexAllocator struct {
 	// In-use indexes from 0x01 to 0xf8; each entry records the pod currently using that index.
 	// Once a pod has completed CDI and started, or is pending image pulling, it should be removed from the queue.
 	nodeIndexQueue map[string]map[int]types.NamespacedName
+
+	// podIndexMap is the reverse lookup from a pod to the node index it occupies.
+	podIndexMap map[types.NamespacedName]indexIdentifier
+
+	// asyncCheckingMap tracks pods that already have an async assign loop running, to avoid duplicate goroutines.
+	asyncCheckingMap map[types.NamespacedName]struct{}
+}
+
+// indexIdentifier locates an occupied index by node name and index value.
+type indexIdentifier struct {
+	nodeName string
+	index    int
 }
 
 func NewIndexAllocator(ctx context.Context, client client.Client) (*IndexAllocator, error) {
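The two maps added above must stay in sync: `nodeIndexQueue` answers "which pod holds index N on this node", while `podIndexMap` answers "which index does this pod hold" so release is O(1). Below is a minimal, self-contained sketch of that invariant (`podKey`, `indexID`, and `allocator` are illustrative stand-ins, not names from this diff): both directions are always written and cleared inside the same critical section.

```go
package main

import (
	"fmt"
	"sync"
)

// podKey stands in for types.NamespacedName in this sketch.
type podKey struct{ Namespace, Name string }

// indexID mirrors the indexIdentifier added in this diff.
type indexID struct {
	nodeName string
	index    int
}

type allocator struct {
	mu        sync.RWMutex
	nodeQueue map[string]map[int]podKey // node -> index -> pod (forward)
	podIndex  map[podKey]indexID        // pod -> occupied index (reverse)
}

// occupy records both directions under one lock so the maps never diverge.
func (a *allocator) occupy(node string, idx int, pod podKey) {
	a.mu.Lock()
	defer a.mu.Unlock()
	q := a.nodeQueue[node]
	if q == nil {
		q = make(map[int]podKey)
		a.nodeQueue[node] = q
	}
	q[idx] = pod
	a.podIndex[pod] = indexID{nodeName: node, index: idx}
}

// release uses the reverse map to find what to delete without scanning nodes.
func (a *allocator) release(pod podKey) {
	a.mu.Lock()
	defer a.mu.Unlock()
	if id, ok := a.podIndex[pod]; ok {
		delete(a.nodeQueue[id.nodeName], id.index)
		delete(a.podIndex, pod)
	}
}

func main() {
	a := &allocator{
		nodeQueue: map[string]map[int]podKey{},
		podIndex:  map[podKey]indexID{},
	}
	p := podKey{"default", "worker-0"}
	a.occupy("node-a", 3, p)
	a.release(p)
	fmt.Println(len(a.nodeQueue["node-a"]), len(a.podIndex)) // 0 0
}
```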
@@ -53,6 +61,10 @@ func NewIndexAllocator(ctx context.Context, client client.Client) (*IndexAllocat
 		currentIndex:  0, // Will start from 1 on first assignment
 		ctx:           ctx,
 		initializedCh: make(chan struct{}),
+
+		nodeIndexQueue:   make(map[string]map[int]types.NamespacedName, 128),
+		podIndexMap:      make(map[types.NamespacedName]indexIdentifier, 128),
+		asyncCheckingMap: make(map[types.NamespacedName]struct{}, 128),
 	}
 
 	return allocator, nil
@@ -85,66 +97,156 @@ func (s *IndexAllocator) AssignIndex(podName string) (int, error) {
8597}
8698
8799// ReconcileLockState maintains memory state for node level index assign and release queue
88- func (s * IndexAllocator ) ReconcileLockState (pod * v1.Pod ) bool {
100+ func (s * IndexAllocator ) ReconcileLockState (pod * v1.Pod ) {
89101 if pod .Labels [constants .LabelComponent ] != constants .ComponentWorker {
90- return false
102+ return
91103 }
92104 // Check if it's TF indexed Pod by container resource limits
93105 // If isIndex But PodIndex not set, check phase, if pending, should assign index, next check
94106 if pod .Spec .NodeName == "" {
95- return false
107+ return
96108 }
97109
98- index := pod .Annotations [constants .PodIndexAnnotation ]
99- if index == "" {
100- return false
101- }
102- indexInt , err := strconv .Atoi (index )
110+ index , err := utils .ParsePodIndexResourceClaim (pod )
103111 if err != nil {
104- return false
112+ log .FromContext (s .ctx ).Error (err , "not TF indexed Pod, skip reconcile lock state" , "pod" , pod .Name )
113+ return
114+ }
115+ _ , indexAllocated := pod .Annotations [constants .PodIndexAnnotation ]
116+
117+ // Only pending pods can occupy the node level index
118+ if utils .IsPodPending (pod ) {
119+ s .storeMutex .Lock ()
120+ indexQueue := s .nodeIndexQueue [pod .Spec .NodeName ]
121+ if indexQueue == nil {
122+ indexQueue = make (map [int ]types.NamespacedName )
123+ s .nodeIndexQueue [pod .Spec .NodeName ] = indexQueue
124+ }
125+
126+ // If just started and missing in memory, should complement the index queue and pod index map
127+ if indexAllocated {
128+ // occupy the index if missing (when scheduler restarted)
129+ if _ , exists := indexQueue [index ]; ! exists {
130+ podMeta := types.NamespacedName {
131+ Namespace : pod .Namespace ,
132+ Name : pod .Name ,
133+ }
134+ indexQueue [index ] = podMeta
135+ s .podIndexMap [podMeta ] = indexIdentifier {
136+ nodeName : pod .Spec .NodeName ,
137+ index : index ,
138+ }
139+ }
140+ s .storeMutex .Unlock ()
141+ return
142+ }
143+
144+ if podMeta , exists := indexQueue [index ]; exists {
145+ // If already occupied by other Pod, check if it's the same Pod
146+ if podMeta .Namespace != pod .Namespace || podMeta .Name != pod .Name {
147+ log .FromContext (s .ctx ).Error (fmt .Errorf ("pod index conflict" ), "can not reconcile index lock, more than one pending pods occupy the same index" , "pod" , pod .Name , "index" , index )
148+ s .storeMutex .Unlock ()
149+ return
150+ }
151+ } else {
152+ // new Pod occupy the index, add to index queue
153+ indexQueue [index ] = types.NamespacedName {
154+ Namespace : pod .Namespace ,
155+ Name : pod .Name ,
156+ }
157+ s .podIndexMap [types.NamespacedName {
158+ Namespace : pod .Namespace ,
159+ Name : pod .Name ,
160+ }] = indexIdentifier {
161+ nodeName : pod .Spec .NodeName ,
162+ index : index ,
163+ }
164+ s .storeMutex .Unlock ()
165+ // Brand new pending pod, ensure the async checking loop for assigning index annotation
166+ s .AsyncCheckNodeIndexAvailableAndAssign (pod , index )
167+ }
168+ } else if utils .IsPodRunning (pod ) {
169+ s .RemoveNodeIndexQueueForPod (types.NamespacedName {
170+ Namespace : pod .Namespace ,
171+ Name : pod .Name ,
172+ })
105173 }
174+ }
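For context on how `ReconcileLockState` is meant to be driven, here is a sketch of hypothetical controller wiring, assuming a controller-runtime pod reconciler (`PodReconciler` and the `Allocator` interface are illustrative, not part of this change). Deleted pods are routed to `RemoveNodeIndexQueueForPod` so their index is released.

```go
package controller

import (
	"context"

	v1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/types"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// Allocator mirrors the two IndexAllocator methods exercised by this sketch.
type Allocator interface {
	ReconcileLockState(pod *v1.Pod)
	RemoveNodeIndexQueueForPod(nn types.NamespacedName)
}

// PodReconciler is hypothetical glue code, not part of the diff.
type PodReconciler struct {
	client.Client
	Allocator Allocator
}

func (r *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	var pod v1.Pod
	if err := r.Get(ctx, req.NamespacedName, &pod); err != nil {
		if apierrors.IsNotFound(err) {
			// A deleted pod must release its index, otherwise it stays occupied forever
			r.Allocator.RemoveNodeIndexQueueForPod(req.NamespacedName)
			return ctrl.Result{}, nil
		}
		return ctrl.Result{}, err
	}
	// Pending pods occupy an index; running pods are dequeued inside ReconcileLockState
	r.Allocator.ReconcileLockState(&pod)
	return ctrl.Result{}, nil
}
```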
 
+func (s *IndexAllocator) RemoveNodeIndexQueueForPod(namespacedName types.NamespacedName) {
 	s.storeMutex.Lock()
 	defer s.storeMutex.Unlock()
 
-	// Check Pod status
-	// TODO: call in Pod controller and gpu Allocator init stage
-
-	indexQueue := s.nodeIndexQueue[pod.Spec.NodeName]
-	if indexQueue == nil {
-		indexQueue = make(map[int]types.NamespacedName)
-		s.nodeIndexQueue[pod.Spec.NodeName] = indexQueue
+	identifier, exists := s.podIndexMap[namespacedName]
+	if !exists {
+		return
 	}
-	indexQueue[indexInt] = types.NamespacedName{
-		Namespace: pod.Namespace,
-		Name:      pod.Name,
-	}
-	return true
+	if indexQueue, exists := s.nodeIndexQueue[identifier.nodeName]; exists {
+		if val, ok := indexQueue[identifier.index]; ok && val == namespacedName {
+			delete(indexQueue, identifier.index)
+			log.FromContext(s.ctx).Info("Removed pod from node index queue after pod running/stopped/deleted", "pod", namespacedName, "index", identifier.index)
+		}
+	}
+	// Always drop the reverse mapping so released pods do not leak entries
+	delete(s.podIndexMap, namespacedName)
 }
 
-func (s *IndexAllocator) CheckNodeIndexAvailableForPod(pod *v1.Pod, index int) bool {
+// CheckNodeIndexAndTryOccupy reports whether the index is free on the pod's node
+// and, if so, occupies it. A single write lock covers both the check and the
+// occupation so that two concurrent callers can never claim the same index.
+func (s *IndexAllocator) CheckNodeIndexAndTryOccupy(pod *v1.Pod, index int) bool {
 	<-s.initializedCh
 	nodeName := pod.Spec.NodeName
 	if nodeName == "" {
 		// should not happen, unscheduled pod
 		return false
 	}
-	s.storeMutex.RLock()
-	defer s.storeMutex.RUnlock()
+	s.storeMutex.Lock()
+	defer s.storeMutex.Unlock()
 	indexQueue := s.nodeIndexQueue[nodeName]
 	if len(indexQueue) == 0 {
+		// Reconcile has not populated this node yet, treat the index as unavailable
 		return false
 	}
-	_, exists := indexQueue[index]
-	return !exists
+	if occupant, exists := indexQueue[index]; exists {
+		// An index already occupied by this same pod still counts as success
+		return occupant.Namespace == pod.Namespace && occupant.Name == pod.Name
+	}
+	// The index is free: occupy it for this pod
+	indexQueue[index] = types.NamespacedName{
+		Namespace: pod.Namespace,
+		Name:      pod.Name,
+	}
+	return true
 }
 
 func (s *IndexAllocator) SetReady() {
 	close(s.initializedCh)
 }
 
-func (s *IndexAllocator) CheckNodeIndexAvailableAndAssign(pod *v1.Pod, index int) {
+// AsyncCheckNodeIndexAvailableAndAssign starts at most one background retry loop
+// per pod that waits until the index can be assigned.
+func (s *IndexAllocator) AsyncCheckNodeIndexAvailableAndAssign(pod *v1.Pod, index int) {
+	s.storeMutex.Lock()
+	defer s.storeMutex.Unlock()
+	podMeta := types.NamespacedName{
+		Namespace: pod.Namespace,
+		Name:      pod.Name,
+	}
+	if _, exists := s.asyncCheckingMap[podMeta]; exists {
+		// A checking loop is already running for this pod, skip
+		return
+	}
+	s.asyncCheckingMap[podMeta] = struct{}{}
+
 	go func() {
+		defer func() {
+			s.storeMutex.Lock()
+			delete(s.asyncCheckingMap, podMeta)
+			s.storeMutex.Unlock()
+		}()
+
 		// Infinite backoff retry until the index is available and reconcile has started
 		_ = retry.OnError(wait.Backoff{
 			Duration: 3 * time.Second,
@@ -173,9 +275,10 @@ func (s *IndexAllocator) CheckNodeIndexAvailableAndAssign(pod *v1.Pod, index int
 						"pod", pod.Name, "node", pod.Spec.NodeName)
 					return nil
 				}
+				// Otherwise do nothing; this may be caused by duplicate reconciling
 			}
 
-			if !s.CheckNodeIndexAvailableForPod(pod, index) {
+			if !s.CheckNodeIndexAndTryOccupy(pod, index) {
 				return fmt.Errorf("index is not available")
 			}
 			// Index available, patch the annotation to transition the Pod from Pending to DeviceAllocating in hypervisor
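The loop above relies on `retry.OnError` from `k8s.io/client-go/util/retry`: it re-runs the function with the given backoff for as long as the predicate marks the returned error retriable. A self-contained demo of those semantics; only the 3-second `Duration` is visible in this diff, so the `Factor` and `Steps` values below are demo assumptions chosen to make it terminate quickly:

```go
package main

import (
	"errors"
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/util/retry"
)

func main() {
	attempts := 0
	// retry.OnError retries fn up to backoff.Steps times while the predicate
	// returns true. A very large Steps value approximates the allocator's
	// "infinite" retry; the diff uses Duration: 3 * time.Second.
	err := retry.OnError(wait.Backoff{
		Duration: 10 * time.Millisecond, // short interval so the demo is fast
		Factor:   1.0,                   // assumption: constant interval
		Steps:    5,                     // assumption: finite for the demo
	}, func(error) bool { return true }, func() error {
		attempts++
		if attempts < 3 {
			return errors.New("index is not available")
		}
		return nil
	})
	fmt.Println(attempts, err) // 3 <nil>
}
```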