Skip to content

Commit 6955fc6

Browse files
committed
WIP: [AA][HI]: perfprof creator: autosizing implementation
These changes were AI-assisted (hence the AA tag), then largely amended by a human (hence the HI tag - Human Intervention). The initial penalty cost structure was suggested by Google Gemini 2.5 Flash, and then amended by human intervention. Assisted-by: Google Gemini Assisted-by-model: gemini-2.5-flash Signed-off-by: Francesco Romani <fromani@redhat.com>
1 parent 58c0c5f commit 6955fc6

File tree

435 files changed

+95242
-10
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

435 files changed

+95242
-10
lines changed

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ require (
2828
github.com/prometheus/client_golang v1.21.1
2929
github.com/spf13/cobra v1.9.1
3030
github.com/spf13/pflag v1.0.6
31+
gonum.org/v1/gonum v0.16.0
3132
gopkg.in/fsnotify.v1 v1.4.7
3233
gopkg.in/ini.v1 v1.67.0
3334
gopkg.in/yaml.v2 v2.4.0

go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -940,6 +940,8 @@ golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8T
940940
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
941941
gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw=
942942
gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY=
943+
gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk=
944+
gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E=
943945
google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE=
944946
google.golang.org/api v0.7.0/go.mod h1:WtwebWUNSVBH/HAw79HIFXZNqEvBhG+Ra+ax0hx3E3M=
945947
google.golang.org/api v0.8.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg=
Lines changed: 228 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,245 @@
11
package autosize
22

3-
import "github.com/openshift/cluster-node-tuning-operator/pkg/performanceprofile/profilecreator"
3+
import (
4+
"errors"
5+
"fmt"
6+
"log"
7+
"math"
48

5-
// shortcut
6-
var Alert = profilecreator.Alert
9+
"gonum.org/v1/gonum/optimize"
10+
11+
"github.com/openshift/cluster-node-tuning-operator/pkg/performanceprofile/profilecreator"
12+
)
13+
14+
// Assumptions:
15+
// 1. All the machines in the node pool have identical HW specs and need identical sizing.
16+
// 2. We cannot distinguish between infra/OS CPU requirements and control plane CPU requirements.
17+
// We will conflate the two costs in the latter.
18+
//
19+
// Definitions:
20+
// x_c: CPUs for the control plane - includes x_i: CPUs for OS/Infra
21+
// x_w: CPUs for the workload
22+
// Tc: Total available CPUs (includes OS/Infra)
23+
//
24+
// Hard Constraints:
25+
// x_c, x_w are integers because we need to dedicate full cores
26+
// x_c, x_w >= 0
27+
// x_c + x_w <= Tc
28+
// x_c >= req(x_w) // control plane and infra cost is a function of the expected workload
29+
//
30+
// Objective:
31+
// We want to maximize x_w, or, equivalently, minimize x_c
32+
33+
// Tunables of the sizing model.
const (
	// defaultPenaltyWeight multiplies every squared constraint violation
	// that the objective function adds to the cost.
	defaultPenaltyWeight float64 = 100.0
	// defaultReservedRatioInitial is 1/16, determined empirically.
	// Use only as initial value.
	defaultReservedRatioInitial float64 = 0.0625
	// defaultReservedRatioMax is 1/4, determined empirically.
	// This is the practical upper bound.
	defaultReservedRatioMax float64 = 0.25
	// defaultControlPlaneWorkloadCoreRatio is the control plane/infra cost
	// charged per workload CPU.
	// TODO: how much control plane/infra power do we need to support the workload?
	defaultControlPlaneWorkloadCoreRatio float64 = 0.075
)
39+
40+
// Sentinel errors reported by Validate.
var (
	// ErrUnderallocatedControlPlane means fewer than one CPU is reserved.
	ErrUnderallocatedControlPlane = errors.New("not enough CPUs for control plane")
	// ErrOverallocatedControlPlane means the reserved CPUs exceed the
	// defaultReservedRatioMax share of the total CPUs.
	ErrOverallocatedControlPlane = errors.New("too many CPUs for control plane")
	// ErrInconsistentAllocation means reserved and isolated CPUs do not add
	// up to the total CPUs.
	ErrInconsistentAllocation = errors.New("inconsistent CPus allocation")
)
45+
46+
type Env struct {
47+
Log *log.Logger
48+
}
49+
50+
func DefaultEnv() Env {
51+
return Env{
52+
Log: profilecreator.GetAlertSink(),
53+
}
54+
}
755

856
type Params struct {
9-
NodePoolSize int
10-
OfflinedCPUCount int
57+
OfflinedCPUCount int
58+
UserLevelNetworking bool
59+
MachineData *profilecreator.GHWHandler
60+
// cached vars
61+
totalCPUs int
62+
isSMTEnabled bool
63+
}
64+
65+
func (p Params) String() string {
66+
return fmt.Sprintf("cpus=%d offline=%v smtLevel=%v", p.totalCPUs, p.OfflinedCPUCount, p.smtLevel())
67+
}
68+
69+
func setupMachineData(p *Params) error {
70+
var err error
71+
72+
ok, err := p.MachineData.IsHyperthreadingEnabled()
73+
if err != nil {
74+
return err
75+
}
76+
p.isSMTEnabled = ok
77+
78+
cpus, err := p.MachineData.CPU()
79+
if err != nil {
80+
return err
81+
}
82+
p.totalCPUs = int(cpus.TotalHardwareThreads)
83+
84+
return nil
85+
}
86+
87+
// Get Tc
88+
func (p Params) TotalCPUs() int {
89+
return p.totalCPUs
90+
}
91+
92+
func (p Params) IsSMTEnabled() bool {
93+
return p.isSMTEnabled
94+
}
95+
96+
func (p Params) smtLevel() int {
97+
if !p.isSMTEnabled {
98+
return 1
99+
}
100+
return 2 // TODO
101+
}
102+
103+
func (p Params) DefaultControlPlaneCores() int {
104+
// intentionally overallocate to have a safe baseline
105+
Tc := p.TotalCPUs()
106+
return int(math.Round(float64(Tc) * defaultReservedRatioInitial)) // TODO handle SMT
107+
}
108+
109+
// Get x_c, x_w as initial hardcoded value. Subject to optimization
110+
func (p Params) DefaultAllocation() Values {
111+
Tc := p.TotalCPUs()
112+
x_c := p.DefaultControlPlaneCores()
113+
return Values{
114+
ReservedCPUCount: x_c,
115+
IsolatedCPUCount: Tc - x_c,
116+
}
117+
}
118+
119+
func (p Params) initialValue() []float64 {
120+
vals := p.DefaultAllocation()
121+
return []float64{
122+
float64(vals.ReservedCPUCount), // x_c
123+
float64(vals.IsolatedCPUCount), // x_w
124+
}
125+
}
126+
127+
func (p Params) controlPlaneRequirement(x_w float64) float64 {
128+
R := defaultControlPlaneWorkloadCoreRatio
129+
if p.UserLevelNetworking {
130+
R = 0.0
131+
}
132+
// TODO: the most obvious relationship is for kernel level networking.
133+
// We start with a linear relationship because its simplicity.
134+
return float64(p.DefaultControlPlaneCores()) + R*x_w
135+
}
136+
137+
// Score tells how good a computed allocation is.
type Score struct {
	// Cost is the value of the objective function; the lower the better.
	Cost float64
}

// String formats the score with the sign of the cost flipped, because
// positive values are easier to grasp.
func (sc Score) String() string {
	return fmt.Sprintf("optimization result: %.3f (higher is better)", -sc.Cost)
}
12145

13146
// Values is the recommended CPU split.
//
// We intentionally compute the recommended cpu count, not the precise
// allocation, because this is better done by other packages. We may expose
// the precise allocation as hint or for reference purposes in the future.
type Values struct {
	// ReservedCPUCount is x_c, the CPUs for the control plane and OS/infra.
	ReservedCPUCount int
	// IsolatedCPUCount is x_w, the CPUs for the workload.
	IsolatedCPUCount int
}
16153

17-
type Env struct{}
154+
func (vals Values) String() string {
155+
return fmt.Sprintf("reserved=%v/isolated=%v", vals.ReservedCPUCount, vals.IsolatedCPUCount)
156+
}
18157

19-
func DefaultEnv() Env {
20-
return Env{}
158+
// gonum doesn't support bounds yet so we have to make this an explicit step
159+
// https://github.com/gonum/gonum/issues/1725
160+
func Validate(params Params, vals Values) error {
161+
Tc := params.TotalCPUs()
162+
if vals.ReservedCPUCount < 1 { // TODO handle SMT
163+
return ErrUnderallocatedControlPlane
164+
}
165+
if vals.ReservedCPUCount > int(math.Round((float64(Tc) * defaultReservedRatioMax))) { // works, but likely unacceptable
166+
return ErrOverallocatedControlPlane
167+
}
168+
if Tc != vals.ReservedCPUCount+vals.IsolatedCPUCount {
169+
return ErrInconsistentAllocation
170+
}
171+
return nil
21172
}
22173

23-
type Score struct{}
174+
// Objective function to minimize.
175+
// x[0] is x_c
176+
// x[1] is x_w
177+
func objective(p Params, x []float64) float64 {
178+
xc := x[0]
179+
xw := x[1]
180+
181+
// Our original objective is to maximize xw, so we minimize -xw
182+
target := -xw
183+
184+
// gonum doesn't support bounds yet so we have to use penalties:
185+
// https://github.com/gonum/gonum/issues/1725
186+
187+
// Hard Constraints
188+
var hardPenalty float64
189+
// Don't exceed total CPUs
190+
hardPenalty += defaultPenaltyWeight * math.Pow(math.Max(0, xc+xw-float64(p.TotalCPUs())), 2)
191+
192+
// Meet the control plane/infra requirement to avoid the workload to starve
193+
hardPenalty += defaultPenaltyWeight * math.Pow(math.Max(0, p.controlPlaneRequirement(xw)-xc), 2)
194+
195+
// Must use positive CPU values (since gonum/optimize doesn't have simple bounds for all solvers)
196+
hardPenalty += defaultPenaltyWeight * (math.Pow(math.Max(0, -xc), 2) + math.Pow(math.Max(0, -xw), 2))
197+
198+
// Allocate in multiples of SMT level (usually 2) -- TODO: should be soft?
199+
hardPenalty += defaultPenaltyWeight * (math.Pow(math.Max(0, -float64(int(math.Round(xc))%p.smtLevel())), 2))
200+
201+
return target + hardPenalty
202+
}
24203

25204
func Compute(env Env, params Params) (Values, Score, error) {
26-
return Values{}, Score{}, nil
205+
err := setupMachineData(&params)
206+
if err != nil {
207+
env.Log.Printf("Optimization failed: %v", err)
208+
return params.DefaultAllocation(), Score{}, err
209+
}
210+
211+
problem := optimize.Problem{
212+
Func: func(x []float64) float64 {
213+
return objective(params, x)
214+
},
215+
}
216+
217+
settings := &optimize.Settings{
218+
MajorIterations: 99,
219+
}
220+
221+
env.Log.Printf("Optimization start. Default allocation: %v", params.DefaultAllocation().String())
222+
env.Log.Printf("Optimization start. Params: %v", params.String())
223+
224+
result, err := optimize.Minimize(problem, params.initialValue(), settings, &optimize.NelderMead{})
225+
if err != nil {
226+
env.Log.Printf("Optimization failed: %v", err)
227+
return params.DefaultAllocation(), Score{}, err
228+
}
229+
230+
totCPUs := params.TotalCPUs()
231+
score := Score{Cost: result.F}
232+
x_w := int(math.Round(result.Location.X[1]))
233+
vals := Values{
234+
IsolatedCPUCount: x_w,
235+
ReservedCPUCount: totCPUs - x_w, // we can use x_c, but we just leverage invariants
236+
}
237+
238+
if err := Validate(params, vals); err != nil {
239+
env.Log.Printf("Optimization invalid: %v", err)
240+
return params.DefaultAllocation(), Score{}, err
241+
}
242+
243+
env.Log.Printf("Optimization done. Score: %v %s totalCPUs=%d", score.String(), vals.String(), totCPUs)
244+
return vals, score, nil
27245
}

0 commit comments

Comments
 (0)