55 "encoding/json"
66 "fmt"
77 "math"
8- "strconv"
98 "sync"
109 "sync/atomic"
1110 "time"
@@ -40,6 +39,15 @@ type IndexAllocator struct {
 	// in use index from 0x01 -> 0xf8, indicates the pod using this index
 	// When pod completed CDI and started or pending image pulling, should be removed from the queue
 	nodeIndexQueue map[string]map[int]types.NamespacedName
+
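+	// reverse mapping from a Pod to the node index it occupies, used on release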
+	podIndexMap map[types.NamespacedName]indexIdentifier
+
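+	// Pods that already have a background checking loop running, to avoid
+	// spawning duplicate loops for the same Pod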
+	asyncCheckingMap map[types.NamespacedName]struct{}
+}
+
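+// indexIdentifier locates the node-level index slot occupied by a Pod.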
+type indexIdentifier struct {
+	nodeName string
+	index    int
 }
 
 func NewIndexAllocator(ctx context.Context, client client.Client) (*IndexAllocator, error) {
@@ -53,6 +61,10 @@ func NewIndexAllocator(ctx context.Context, client client.Client) (*IndexAllocat
 		currentIndex:  0, // Will start from 1 on first assignment
 		ctx:           ctx,
 		initializedCh: make(chan struct{}),
+
+		nodeIndexQueue: make(map[string]map[int]types.NamespacedName, 128),
+
+		podIndexMap: make(map[types.NamespacedName]indexIdentifier, 128),
+		// asyncCheckingMap must be initialized as well; writing to a nil map in
+		// AsyncCheckNodeIndexAvailableAndAssign would panic
+		asyncCheckingMap: make(map[types.NamespacedName]struct{}, 128),
 	}
 
 	return allocator, nil
@@ -84,66 +96,156 @@ func (s *IndexAllocator) AssignIndex(podName string) (int, error) {
 }
 
 // ReconcileLockState maintains memory state for node level index assign and release queue
-func (s *IndexAllocator) ReconcileLockState(pod *v1.Pod) bool {
+func (s *IndexAllocator) ReconcileLockState(pod *v1.Pod) {
 	if pod.Labels[constants.LabelComponent] != constants.ComponentWorker {
-		return false
+		return
 	}
 	// Check if it's TF indexed Pod by container resource limits
 	// If isIndex But PodIndex not set, check phase, if pending, should assign index, next check
 	if pod.Spec.NodeName == "" {
-		return false
+		return
 	}
 
-	index := pod.Annotations[constants.PodIndexAnnotation]
-	if index == "" {
-		return false
-	}
-	indexInt, err := strconv.Atoi(index)
+	index, err := utils.ParsePodIndexResourceClaim(pod)
 	if err != nil {
-		return false
+		log.FromContext(s.ctx).Error(err, "not TF indexed Pod, skip reconcile lock state", "pod", pod.Name)
+		return
+	}
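+	// the index annotation is only present once an index has already been assigned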
+	_, indexAllocated := pod.Annotations[constants.PodIndexAnnotation]
+
+	// Only pending pods can occupy the node-level index
+	if utils.IsPodPending(pod) {
+		s.storeMutex.Lock()
+		indexQueue := s.nodeIndexQueue[pod.Spec.NodeName]
+		if indexQueue == nil {
+			indexQueue = make(map[int]types.NamespacedName)
+			s.nodeIndexQueue[pod.Spec.NodeName] = indexQueue
+		}
+
+		// If the allocator just started and the entry is missing in memory,
+		// repopulate the index queue and the pod index map
+		if indexAllocated {
+			// occupy the index if missing (when the scheduler restarted)
+			if _, exists := indexQueue[index]; !exists {
+				podMeta := types.NamespacedName{
+					Namespace: pod.Namespace,
+					Name:      pod.Name,
+				}
+				indexQueue[index] = podMeta
+				s.podIndexMap[podMeta] = indexIdentifier{
+					nodeName: pod.Spec.NodeName,
+					index:    index,
+				}
+			}
+			s.storeMutex.Unlock()
+			return
+		}
+
+		if podMeta, exists := indexQueue[index]; exists {
+			// If the index is already occupied, check whether it is held by the same Pod
+			if podMeta.Namespace != pod.Namespace || podMeta.Name != pod.Name {
+				log.FromContext(s.ctx).Error(fmt.Errorf("pod index conflict"), "can not reconcile index lock, more than one pending pod occupies the same index", "pod", pod.Name, "index", index)
+				s.storeMutex.Unlock()
+				return
+			}
+			// same Pod already holds the index; release the lock before falling through
+			s.storeMutex.Unlock()
+		} else {
+			// new Pod occupies the index, add it to the index queue
+			indexQueue[index] = types.NamespacedName{
+				Namespace: pod.Namespace,
+				Name:      pod.Name,
+			}
+			s.podIndexMap[types.NamespacedName{
+				Namespace: pod.Namespace,
+				Name:      pod.Name,
+			}] = indexIdentifier{
+				nodeName: pod.Spec.NodeName,
+				index:    index,
+			}
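+			// unlock before starting the async loop below; it locks storeMutex itself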
+			s.storeMutex.Unlock()
+			// Brand new pending pod: ensure the async checking loop that assigns the index annotation is running
+			s.AsyncCheckNodeIndexAvailableAndAssign(pod, index)
+		}
+	} else if utils.IsPodRunning(pod) {
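+		// a running Pod has completed CDI and started, so its index slot can be released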
+		s.RemoveNodeIndexQueueForPod(types.NamespacedName{
+			Namespace: pod.Namespace,
+			Name:      pod.Name,
+		})
 	}
+}
 
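+// RemoveNodeIndexQueueForPod releases the node index occupied by the given Pod, if any.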
+func (s *IndexAllocator) RemoveNodeIndexQueueForPod(namespacedName types.NamespacedName) {
 	s.storeMutex.Lock()
 	defer s.storeMutex.Unlock()
 
-	// Check Pod status
-	// TODO: call in Pod controller and gpu Allocator init stage
-
-	indexQueue := s.nodeIndexQueue[pod.Spec.NodeName]
-	if indexQueue == nil {
-		indexQueue = make(map[int]types.NamespacedName)
-		s.nodeIndexQueue[pod.Spec.NodeName] = indexQueue
+	indexIdentifier, exists := s.podIndexMap[namespacedName]
+	if !exists {
+		return
 	}
-	indexQueue[indexInt] = types.NamespacedName{
-		Namespace: pod.Namespace,
-		Name:      pod.Name,
+	if indexQueue, exists := s.nodeIndexQueue[indexIdentifier.nodeName]; exists {
+		if val, exists := indexQueue[indexIdentifier.index]; exists &&
+			val.Namespace == namespacedName.Namespace && val.Name == namespacedName.Name {
+			delete(indexQueue, indexIdentifier.index)
+			log.FromContext(s.ctx).Info("Removed pod from node index queue after pod running/stopped/deleted", "pod", namespacedName, "index", indexIdentifier.index)
+		}
 	}
-	return true
+	// always drop the reverse mapping, so a stale entry cannot linger when the
+	// queue slot is already gone or held by another Pod
+	delete(s.podIndexMap, namespacedName)
 }
 
-func (s *IndexAllocator) CheckNodeIndexAvailableForPod(pod *v1.Pod, index int) bool {
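+// CheckNodeIndexAndTryOccupy reports whether the given index is free on the
+// Pod's node and, if it is, occupies it for this Pod.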
+func (s *IndexAllocator) CheckNodeIndexAndTryOccupy(pod *v1.Pod, index int) bool {
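+	// wait until SetReady is called, so the queue reflects the initial reconcile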
 	<-s.initializedCh
 	nodeName := pod.Spec.NodeName
 	if nodeName == "" {
 		// should not happen, unscheduled pod
 		return false
 	}
 	s.storeMutex.RLock()
-	defer s.storeMutex.RUnlock()
 	indexQueue := s.nodeIndexQueue[nodeName]
 	if len(indexQueue) == 0 {
+		s.storeMutex.RUnlock()
 		return false
 	}
 	_, exists := indexQueue[index]
-	return !exists
+	s.storeMutex.RUnlock()
+	// Occupy the index for this node
+	if !exists {
+		s.storeMutex.Lock()
+		// re-check under the write lock: another goroutine may have occupied
+		// the index between RUnlock and Lock
+		if _, taken := indexQueue[index]; taken {
+			s.storeMutex.Unlock()
+			return false
+		}
+		indexQueue[index] = types.NamespacedName{
+			Namespace: pod.Namespace,
+			Name:      pod.Name,
+		}
+		s.storeMutex.Unlock()
+		return true
+	}
+	return false
 }
 
 func (s *IndexAllocator) SetReady() {
 	close(s.initializedCh)
 }
 
-func (s *IndexAllocator) CheckNodeIndexAvailableAndAssign(pod *v1.Pod, index int) {
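+// AsyncCheckNodeIndexAvailableAndAssign starts at most one background retry
+// loop per Pod; the loop waits until the index can be occupied and then
+// patches the Pod's index annotation.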
+func (s *IndexAllocator) AsyncCheckNodeIndexAvailableAndAssign(pod *v1.Pod, index int) {
+	s.storeMutex.Lock()
+	defer s.storeMutex.Unlock()
+	podMeta := types.NamespacedName{
+		Namespace: pod.Namespace,
+		Name:      pod.Name,
+	}
+	if _, exists := s.asyncCheckingMap[podMeta]; exists {
+		// already started checking loop, skip
+		return
+	}
+	s.asyncCheckingMap[podMeta] = struct{}{}
+
 	go func() {
+		defer func() {
+			s.storeMutex.Lock()
+			delete(s.asyncCheckingMap, podMeta)
+			s.storeMutex.Unlock()
+		}()
+
 		// Infinity backoff retry until index is available, and also reconcile started
 		_ = retry.OnError(wait.Backoff{
 			Duration: 3 * time.Second,
@@ -172,9 +274,10 @@ func (s *IndexAllocator) CheckNodeIndexAvailableAndAssign(pod *v1.Pod, index int
172274 "pod" , pod .Name , "node" , pod .Spec .NodeName )
173275 return nil
174276 }
+			// else do nothing; this may be caused by duplicated reconciling
 		}
 
-		if !s.CheckNodeIndexAvailableForPod(pod, index) {
+		if !s.CheckNodeIndexAndTryOccupy(pod, index) {
 			return fmt.Errorf("index is not available")
 		}
 		// Index available, patch annotation to transit Pod from Pending to DeviceAllocating in hypervisor