openshift
diff --git a/‎go.mod‎
Lines changed: 1 addition & 0 deletions b/‎go.mod‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎go.sum‎
Lines changed: 2 additions & 0 deletions b/‎go.sum‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎pkg/performanceprofile/profilecreator/autosize/autosize.go‎
Lines changed: 228 additions & 10 deletions b/‎pkg/performanceprofile/profilecreator/autosize/autosize.go‎
Lines changed: 228 additions & 10 deletions
@@ -28,6 +28,7 @@ require (
 	github.com/prometheus/client_golang v1.21.1
 	github.com/spf13/cobra v1.9.1
 	github.com/spf13/pflag v1.0.6
+	gonum.org/v1/gonum v0.16.0
 	gopkg.in/fsnotify.v1 v1.4.7
 	gopkg.in/ini.v1 v1.67.0
 	gopkg.in/yaml.v2 v2.4.0
 
@@ -940,6 +940,8 @@ golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8T
 golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw=
 gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY=
+gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk=
+gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E=
 google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE=
 google.golang.org/api v0.7.0/go.mod h1:WtwebWUNSVBH/HAw79HIFXZNqEvBhG+Ra+ax0hx3E3M=
 google.golang.org/api v0.8.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg=
 
@@ -1,27 +1,245 @@
 package autosize
 
-import "github.com/openshift/cluster-node-tuning-operator/pkg/performanceprofile/profilecreator"
+import (
+	"errors"
+	"fmt"
+	"log"
+	"math"
 
-// shortcut
-var Alert = profilecreator.Alert
+	"gonum.org/v1/gonum/optimize"
+
+	"github.com/openshift/cluster-node-tuning-operator/pkg/performanceprofile/profilecreator"
+)
+
+// Assumptions:
+// 1. All the machines in the node pool have identical HW specs and need identical sizing.
+// 2. We cannot distinguish between infra/OS CPU requirements and control plane CPU requirements.
+//    We will conflate the two costs in the latter.
+//
+// Definitions:
+// x_c: CPUs for the control plane - includes x_i: CPUs for OS/Infra
+// x_w: CPUs for the workload
+// Tc: Total available CPUs (includes OS/Infra)
+//
+// Hard Constraints:
+//   x_c, x_w are integers because we need to dedicate full cores
+//   x_c, x_w >= 0
+//   x_c + x_w <= Tc
+//   x_c >= req(x_w) // control plane and infra cost is a function of the expected workload
+//
+// Objective:
+// We want to maximize x_w, or, equivalently, minimize x_c
+
+// Tunables of the CPU sizing optimization:
+//   - defaultPenaltyWeight scales every squared constraint violation added to the objective.
+//   - defaultReservedRatioInitial is the share of the total CPUs used as the baseline
+//     control plane allocation.
+//   - defaultReservedRatioMax is the largest share of the total CPUs Validate accepts as reserved.
+//   - defaultControlPlaneWorkloadCoreRatio is the extra control plane CPU required per
+//     workload CPU in the linear model of controlPlaneRequirement.
+const (
+	defaultPenaltyWeight                 float64 = 100.0
+	defaultReservedRatioInitial          float64 = 0.0625 // 1/16. determined empirically. Use only as initial value.
+	defaultReservedRatioMax              float64 = 0.25   // 1/4. determined empirically. This is the practical upper bound.
+	defaultControlPlaneWorkloadCoreRatio float64 = 0.075  // TODO: how much control plane/infra power do we need to support the workload?
+)
+
+// Sentinel errors returned by Validate. Compare against them with errors.Is.
+var (
+	// ErrUnderallocatedControlPlane reports that less than one CPU is reserved.
+	ErrUnderallocatedControlPlane = errors.New("not enough CPUs for control plane")
+
+	// ErrOverallocatedControlPlane reports that the reserved CPUs exceed the accepted maximum share.
+	ErrOverallocatedControlPlane = errors.New("too many CPUs for control plane")
+
+	// ErrInconsistentAllocation reports that reserved and isolated CPUs do not add up to the total.
+	ErrInconsistentAllocation = errors.New("inconsistent CPUs allocation")
+)
+
+// Env carries the runtime dependencies of Compute.
+type Env struct {
+	// Log receives the progress and failure messages emitted by Compute.
+	Log *log.Logger
+}
+
+// DefaultEnv returns an Env that logs to the profilecreator alert sink.
+func DefaultEnv() Env {
+	sink := profilecreator.GetAlertSink()
+	return Env{Log: sink}
+}
 
+// Params holds the inputs of the sizing computation.
+//
+// OfflinedCPUCount is only reported by String in the code visible here.
+// UserLevelNetworking, when true, removes the per-workload-CPU term from
+// controlPlaneRequirement. MachineData is queried by setupMachineData for the
+// SMT state and the number of hardware threads.
 type Params struct {
-	NodePoolSize     int
-	OfflinedCPUCount int
+	OfflinedCPUCount    int
+	UserLevelNetworking bool
+	MachineData         *profilecreator.GHWHandler
+	// cached vars, filled in from MachineData by setupMachineData
+	totalCPUs    int
+	isSMTEnabled bool
+}
+
+// String renders the cached CPU count, the offlined CPU count and the SMT level
+// in a compact, single-line form suitable for logging.
+func (p Params) String() string {
+	level := p.smtLevel()
+	return fmt.Sprintf("cpus=%d offline=%v smtLevel=%v", p.totalCPUs, p.OfflinedCPUCount, level)
+}
+
+func setupMachineData(p *Params) error {
+	var err error
+
+	ok, err := p.MachineData.IsHyperthreadingEnabled()
+	if err != nil {
+		return err
+	}
+	p.isSMTEnabled = ok
+
+	cpus, err := p.MachineData.CPU()
+	if err != nil {
+		return err
+	}
+	p.totalCPUs = int(cpus.TotalHardwareThreads)
+
+	return nil
+}
+
+// TotalCPUs returns Tc, the cached total CPU count.
+// In the code visible here the cache is only written by setupMachineData.
+func (p Params) TotalCPUs() int {
+	return p.totalCPUs
+}
+
+// IsSMTEnabled reports the cached SMT (hyperthreading) state.
+// In the code visible here the cache is only written by setupMachineData.
+func (p Params) IsSMTEnabled() bool {
+	return p.isSMTEnabled
+}
+
+// smtLevel returns the number of hardware threads per core assumed by the
+// sizing: 2 when SMT is enabled, 1 otherwise.
+func (p Params) smtLevel() int {
+	if p.isSMTEnabled {
+		return 2 // TODO
+	}
+	return 1
+}
+
+// DefaultControlPlaneCores returns the baseline control plane CPU count: the
+// total CPU count scaled by defaultReservedRatioInitial and rounded to the
+// nearest integer. The baseline intentionally overallocates to be a safe start.
+func (p Params) DefaultControlPlaneCores() int {
+	baseline := float64(p.TotalCPUs()) * defaultReservedRatioInitial
+	return int(math.Round(baseline)) // TODO handle SMT
+}
+
+// DefaultAllocation returns the hardcoded (x_c, x_w) split the optimization
+// starts from: the baseline control plane CPUs are reserved and all the
+// remaining CPUs are isolated for the workload.
+func (p Params) DefaultAllocation() Values {
+	reserved := p.DefaultControlPlaneCores()
+	isolated := p.TotalCPUs() - reserved
+	return Values{ReservedCPUCount: reserved, IsolatedCPUCount: isolated}
+}
+
+// initialValue returns the default allocation as the optimizer's starting
+// point, laid out as [x_c, x_w].
+func (p Params) initialValue() []float64 {
+	start := p.DefaultAllocation()
+	x := make([]float64, 2)
+	x[0] = float64(start.ReservedCPUCount) // x_c
+	x[1] = float64(start.IsolatedCPUCount) // x_w
+	return x
+}
+
+// controlPlaneRequirement returns req(x_w): the control plane/infra CPUs needed
+// to sustain x_w workload CPUs. It is the baseline control plane allocation plus
+// a term linear in x_w; with user level networking the linear coefficient is zero.
+func (p Params) controlPlaneRequirement(x_w float64) float64 {
+	// TODO: the most obvious relationship is for kernel level networking.
+	// We start with a linear relationship because of its simplicity.
+	ratio := 0.0
+	if !p.UserLevelNetworking {
+		ratio = defaultControlPlaneWorkloadCoreRatio
+	}
+	return float64(p.DefaultControlPlaneCores()) + ratio*x_w
+}
+
+// Score is the quality of an allocation as reported by the optimizer.
+type Score struct {
+	// Cost is the value of the minimized objective function.
+	Cost float64 // the lower the better
+}
+
+// String formats the score for logging. The cost is reported with its sign
+// flipped, because positive values are easier to grasp.
+func (sc Score) String() string {
+	return fmt.Sprintf("optimization result: %.3f (higher is better)", -sc.Cost)
 }
 
+// Values is the CPU split recommended by the sizing computation:
+// ReservedCPUCount is x_c (control plane and OS/infra CPUs) and
+// IsolatedCPUCount is x_w (workload CPUs).
 type Values struct {
+	// we intentionally compute the recommended cpu count, not precise allocation, because
+	// this is better done by other packages. We may expose the precise allocation as hint
+	// or for reference purposes in the future
 	ReservedCPUCount int
+	IsolatedCPUCount int
 }
 
-type Env struct{}
+// String renders the split as "reserved=N/isolated=M".
+func (vals Values) String() string {
+	reserved, isolated := vals.ReservedCPUCount, vals.IsolatedCPUCount
+	return fmt.Sprintf("reserved=%v/isolated=%v", reserved, isolated)
+}
 
-func DefaultEnv() Env {
-	return Env{}
+// Validate checks an allocation against the hard bounds of the problem.
+// It is an explicit step because gonum doesn't support bounds yet:
+// https://github.com/gonum/gonum/issues/1725
+//
+// It returns ErrUnderallocatedControlPlane when less than one CPU is reserved,
+// ErrOverallocatedControlPlane when the reserved CPUs exceed the
+// defaultReservedRatioMax share of the total (rounded to the nearest integer),
+// ErrInconsistentAllocation when reserved and isolated CPUs do not add up to
+// the total, and nil otherwise.
+func Validate(params Params, vals Values) error {
+	total := params.TotalCPUs()
+	maxReserved := int(math.Round(float64(total) * defaultReservedRatioMax))
+	switch {
+	case vals.ReservedCPUCount < 1: // TODO handle SMT
+		return ErrUnderallocatedControlPlane
+	case vals.ReservedCPUCount > maxReserved: // works, but likely unacceptable
+		return ErrOverallocatedControlPlane
+	case vals.ReservedCPUCount+vals.IsolatedCPUCount != total:
+		return ErrInconsistentAllocation
+	}
+	return nil
 }
 
-type Score struct{}
+// objective is the cost function to minimize; the lower the better.
+// x[0] is x_c (control plane CPUs)
+// x[1] is x_w (workload CPUs)
+//
+// The cost is -x_w, so that minimizing it maximizes the workload CPUs, plus a
+// quadratic penalty scaled by defaultPenaltyWeight for each violated constraint.
+func objective(p Params, x []float64) float64 {
+	xc := x[0]
+	xw := x[1]
+
+	// Our original objective is to maximize xw, so we minimize -xw
+	target := -xw
+
+	// gonum doesn't support bounds yet so we have to use penalties:
+	// https://github.com/gonum/gonum/issues/1725
+
+	// Hard Constraints
+	var hardPenalty float64
+	// Don't exceed total CPUs
+	hardPenalty += defaultPenaltyWeight * math.Pow(math.Max(0, xc+xw-float64(p.TotalCPUs())), 2)
+
+	// Meet the control plane/infra requirement to avoid the workload to starve
+	hardPenalty += defaultPenaltyWeight * math.Pow(math.Max(0, p.controlPlaneRequirement(xw)-xc), 2)
+
+	// Must use positive CPU values (since gonum/optimize doesn't have simple bounds for all solvers)
+	hardPenalty += defaultPenaltyWeight * (math.Pow(math.Max(0, -xc), 2) + math.Pow(math.Max(0, -xw), 2))
+
+	// Allocate in multiples of SMT level (usually 2) -- TODO: should be soft?
+	// The remainder of a non-negative xc is itself non-negative, so clamping its
+	// negation at zero would never penalize anything. Penalize its magnitude
+	// instead; Abs also covers the negative remainders Go yields for negative xc.
+	smtRemainder := math.Abs(float64(int(math.Round(xc)) % p.smtLevel()))
+	hardPenalty += defaultPenaltyWeight * math.Pow(smtRemainder, 2)
+
+	return target + hardPenalty
+}
 
 func Compute(env Env, params Params) (Values, Score, error) {
-	return Values{}, Score{}, nil
+	err := setupMachineData(&params)
+	if err != nil {
+		env.Log.Printf("Optimization failed: %v", err)
+		return params.DefaultAllocation(), Score{}, err
+	}
+
+	problem := optimize.Problem{
+		Func: func(x []float64) float64 {
+			return objective(params, x)
+		},
+	}
+
+	settings := &optimize.Settings{
+		MajorIterations: 99,
+	}
+
+	env.Log.Printf("Optimization start. Default allocation: %v", params.DefaultAllocation().String())
+	env.Log.Printf("Optimization start. Params: %v", params.String())
+
+	result, err := optimize.Minimize(problem, params.initialValue(), settings, &optimize.NelderMead{})
+	if err != nil {
+		env.Log.Printf("Optimization failed: %v", err)
+		return params.DefaultAllocation(), Score{}, err
+	}
+
+	totCPUs := params.TotalCPUs()
+	score := Score{Cost: result.F}
+	x_w := int(math.Round(result.Location.X[1]))
+	vals := Values{
+		IsolatedCPUCount: x_w,
+		ReservedCPUCount: totCPUs - x_w, // we can use x_c, but we just leverage invariants
+	}
+
+	if err := Validate(params, vals); err != nil {
+		env.Log.Printf("Optimization invalid: %v", err)
+		return params.DefaultAllocation(), Score{}, err
+	}
+
+	env.Log.Printf("Optimization done. Score: %v %s totalCPUs=%d", score.String(), vals.String(), totCPUs)
+	return vals, score, nil
 }