Skip to content

Commit 829a37a

Browse files
Merge pull request #1371 from jmencak/4.20-bootcmdline-conflict
NO-JIRA: Address false reports of bootcmdline conflicts
2 parents 186dee8 + e8a6d46 commit 829a37a

File tree

1 file changed

+39
-4
lines changed

1 file changed

+39
-4
lines changed

pkg/operator/controller.go

Lines changed: 39 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -731,6 +731,7 @@ func (c *Controller) syncProfile(tuned *tunedv1.Tuned, nodeName string) error {
731731
profile.Spec.Config.ProviderName = providerName
732732
profile.Spec.Profile = computed.AllProfiles
733733
profile.Status.Conditions = tunedpkg.InitializeStatusConditions()
734+
delete(c.pc.state.bootcmdline, nodeName) // bootcmdline retrieved from node annotation is potentially stale, let it resync on node update
734735

735736
klog.V(2).Infof("syncProfile(): updating Profile %s [%s]", profile.Name, computed.TunedProfileName)
736737
_, err = c.clients.Tuned.TunedV1().Profiles(ntoconfig.WatchNamespace()).Update(context.TODO(), profile, metav1.UpdateOptions{})
@@ -780,18 +781,17 @@ func (c *Controller) syncMachineConfig(labels map[string]string, profile *tunedv
780781
name := GetMachineConfigNameForPools(pools)
781782
klog.V(2).Infof("syncMachineConfig(): %v", name)
782783

783-
var bootcmdlineSet bool
784784
nodes, err := c.pc.getNodesForPool(pools[0])
785785
if err != nil {
786786
return err
787787
}
788788

789-
bootcmdline, bootcmdlineSet = c.pc.state.bootcmdline[profile.Name]
790-
if !bootcmdlineSet {
791-
klog.V(2).Infof("syncMachineConfig(): bootcmdline for %s not cached, sync canceled", profile.Name)
789+
if ok := c.allNodesHaveBootcmdlineSet(nodes); !ok {
790+
klog.V(2).Infof("syncMachineConfig(): bootcmdline for %s not cached for all nodes, sync canceled", profile.Name)
792791
return nil
793792
}
794793

794+
bootcmdline = c.pc.state.bootcmdline[profile.Name]
795795
if ok := c.allNodesAgreeOnBootcmdline(nodes); !ok {
796796
// Log an error and do not requeue, this is a configuration issue.
797797
klog.Errorf("not all %d Nodes in MCP %v agree on bootcmdline: %s", len(nodes), pools[0].Name, bootcmdline)
@@ -848,6 +848,36 @@ func (c *Controller) syncMachineConfig(labels map[string]string, profile *tunedv
848848
return nil
849849
}
850850

851+
// allNodesHaveBootcmdlineSet returns true if all Nodes in slice 'nodes' have
852+
// their bootcmdline annotation (TunedBootcmdlineAnnotationKey) value cached in
853+
// the profilecalculator's cache. The values in the cache are populated on Node
854+
// updates (regular ones or just resyncs) and removed from the cache on Profile
855+
// updates. Note this is not a bullet-proof solution to false bootcmdline
856+
// conflicts, because we can still have races, such as regular k8s object
857+
// Node resync (or other independent Node updates) right after Profile update.
858+
// While the cached bootcmdline value will be deleted after Profile update, it
859+
// can still be populated by a stale value from Node's annotation because of the
860+
// Node's resync (or other independent updates) before the proper bootcmdline
861+
// is calculated by the TuneD pod and Node's (TunedBootcmdlineAnnotationKey)
862+
// annotation updated. A bullet-proof solution would involve adding a new
863+
// annotation such as tuned.openshift.io/lastProfileObservedGen to the Node
864+
// tied to the Profile and comparing it to the Profile's generation prior to
865+
// updating a MachineConfig. However, we want to avoid putting extra load
866+
// on the API server as much as possible and this simpler solution already
867+
// significantly reduces false reports of bootcmdline conflicts.
868+
func (c *Controller) allNodesHaveBootcmdlineSet(nodes []*corev1.Node) bool {
869+
for _, node := range nodes {
870+
if v, bootcmdlineSet := c.pc.state.bootcmdline[node.Name]; !bootcmdlineSet {
871+
klog.V(3).Infof("allNodesHaveBootcmdlineSet(): bootcmdline not set for node %s", node.Name)
872+
return false
873+
} else {
874+
klog.V(3).Infof("allNodesHaveBootcmdlineSet(): bootcmdline %q set for node %s", v, node.Name)
875+
}
876+
}
877+
878+
return true
879+
}
880+
851881
// allNodesAgreeOnBootcmdline returns true if the current cached annotation 'TunedBootcmdlineAnnotationKey'
852882
// of all Nodes in slice 'nodes' has the same value.
853883
func (c *Controller) allNodesAgreeOnBootcmdline(nodes []*corev1.Node) bool {
@@ -881,6 +911,11 @@ func (c *Controller) syncMachineConfigHyperShift(nodePoolName string, profile *t
881911
return fmt.Errorf("could not fetch a list of Nodes for NodePool %s: %v", nodePoolName, err)
882912
}
883913

914+
if ok := c.allNodesHaveBootcmdlineSet(nodes); !ok {
915+
klog.V(2).Infof("syncMachineConfigHyperShift(): bootcmdline for %s not cached for all nodes, sync canceled", profile.Name)
916+
return nil
917+
}
918+
884919
bootcmdline := c.pc.state.bootcmdline[profile.Name]
885920
if ok := c.allNodesAgreeOnBootcmdline(nodes); !ok {
886921
return fmt.Errorf("not all %d Nodes in NodePool %v agree on bootcmdline: %s", len(nodes), nodePoolName, bootcmdline)

0 commit comments

Comments
 (0)