
Commit 3607a10

feat: partitioned scheduling
1 parent 6f043eb commit 3607a10

File tree

22 files changed: +1746 -91 lines

.vscode/settings.json

Lines changed: 26 additions & 0 deletions
@@ -17,6 +17,9 @@
  "AWSGPU",
  "batchv",
  "Biren",
+ "bubbletea",
+ "BUILDPLATFORM",
+ "buildx",
  "burstable",
  "Cambricon",
  "CDNA",
@@ -25,6 +28,7 @@
  "certificaterequests",
  "certmanager",
  "CFLAGS",
+ "charmbracelet",
  "clientcmd",
  "clientcmdapi",
  "clientgoscheme",
@@ -52,23 +56,29 @@
  "envtest",
  "essd",
  "Eventf",
+ "eventhandlers",
  "evictable",
  "featuregate",
  "finalizer",
  "Finalizers",
  "frameworkruntime",
  "fsnotify",
  "FULLTEXT",
+ "GOBIN",
  "goconst",
  "gocyclo",
  "goerrors",
+ "golangci",
  "golint",
  "Gomega",
  "gonic",
+ "GOPATH",
  "gopsutil",
  "gorm",
  "gosec",
+ "GPGPU",
  "gpuallocator",
+ "GPUIDs",
  "gpunode",
  "gpunodeclaim",
  "gpunodeclaims",
@@ -87,10 +97,13 @@
  "Hygon",
  "iface",
  "imageutils",
+ "indexallocator",
  "influxdata",
  "Infof",
  "internalcache",
  "internalqueue",
+ "intstr",
+ "IVSHMEM",
  "jsonpatch",
  "karpenter",
  "karpv",
@@ -106,6 +119,8 @@
  "libcuda",
  "libnvidia",
  "lineprotocol",
+ "lipgloss",
+ "LOCALBIN",
  "mapstructure",
  "metav",
  "metricsserver",
@@ -121,24 +136,29 @@
  "noderesources",
  "nolint",
  "NUMA",
+ "nvdp",
  "Nvlink",
  "NVML",
  "objs",
  "omitempty",
  "onsi",
+ "pids",
  "pluginapi",
+ "podname",
  "portallocator",
  "Postable",
  "printcolumn",
  "prometheusagents",
  "prometheuses",
  "prometheusrules",
  "queuesort",
+ "Radeon",
  "RDNA",
  "readyz",
  "replicaset",
  "replicasets",
  "rolebinding",
+ "RTXA",
  "runbook",
  "runpod",
  "samber",
@@ -151,6 +171,7 @@
  "schedv",
  "serviceaccount",
  "shirou",
+ "shmem",
  "shortuuid",
  "statefulset",
  "statefulsets",
@@ -161,6 +182,7 @@
  "strategicpatch",
  "strategicpatches",
  "stretchr",
+ "strncpy",
  "subresource",
  "Tabler",
  "tensorfusion",
@@ -175,6 +197,8 @@
  "testutil",
  "tflops",
  "timberio",
+ "Timeslicing",
+ "tmpfs",
  "Tmpl",
  "tokenreviews",
  "Tolerations",
@@ -183,7 +207,9 @@
  "utilerrors",
  "utilruntime",
  "vgpu",
+ "Warningf",
  "webhookcorev",
+ "workerstate",
  "workloadprofiles",
  "workqueue",
  "Xlarge"

api/v1/gpu_types.go

Lines changed: 40 additions & 0 deletions
@@ -65,6 +65,16 @@ type GPUStatus struct {

  // +optional
  RunningApps []*RunningAppDetail `json:"runningApps,omitempty"`
+
+ // +optional
+ // PartitionTemplates contains available partition templates for this GPU (e.g., MIG profiles)
+ // Reported from discovery, each template has fixed resource allocation
+ PartitionTemplates []PartitionTemplate `json:"partitionTemplates,omitempty"`
+
+ // +optional
+ // AllocatedPartitions tracks allocated partitions on this GPU
+ // Key is partitionUUID, value contains template info and allocated resources
+ AllocatedPartitions map[string]AllocatedPartition `json:"allocatedPartitions,omitempty"`
 }

 // +kubebuilder:validation:Enum=tensor-fusion;nvidia-device-plugin
@@ -98,6 +108,36 @@ type PodGPUInfo struct {
  QoS QoSLevel `json:"qos,omitempty"`
 }

+ // PartitionTemplate represents a hardware partition template (e.g., MIG profile)
+ // Only stores template ID and name in GPU status. Detailed resource information
+ // is stored in public GPU info config.
+ type PartitionTemplate struct {
+ // TemplateID is the unique identifier for this partition template (e.g., "1g.24gb", "4g.94gb")
+ TemplateID string `json:"templateId"`
+
+ // Name is a human-readable name for this template
+ Name string `json:"name"`
+ }
+
+ // AllocatedPartition represents an allocated partition on a GPU
+ // Key in AllocatedPartitions map is podUID
+ type AllocatedPartition struct {
+ // TemplateID is the template used to create this partition
+ TemplateID string `json:"templateId"`
+
+ // PodUID is the UID of the pod using this partition (used as map key)
+ PodUID string `json:"podUid"`
+
+ // PodName is the name of the pod using this partition
+ PodName string `json:"podName"`
+
+ // Namespace is the namespace of the pod using this partition
+ Namespace string `json:"namespace"`
+
+ // AllocatedAt is when this partition was allocated
+ AllocatedAt metav1.Time `json:"allocatedAt"`
+ }
+
 // +kubebuilder:validation:Enum=Pending;Provisioning;Running;Unknown;Destroying;Migrating
 type TensorFusionGPUPhase string

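Taken together, PartitionTemplates (what the device can offer) and AllocatedPartitions (what is currently in use) give the allocator enough to reason about partition occupancy. Note the two comments disagree on the map key (partitionUUID in GPUStatus vs. podUID on AllocatedPartition); the PodUID field comment says it is the key. Below is a minimal sketch of recording an allocation against these fields; the helper name, the maxPartitions argument, and the pod-UID keying are assumptions for illustration, not code from this commit.

package gpuallocator

import (
    "fmt"

    tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" // assumed import path for the api/v1 types above
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// allocatePartition is a hypothetical helper: it checks that the requested template
// was reported by discovery, enforces a partition-count cap, and records the
// allocation in GPUStatus.AllocatedPartitions keyed by pod UID.
func allocatePartition(status *tfv1.GPUStatus, templateID string, maxPartitions int, podUID, podName, namespace string) error {
    if len(status.AllocatedPartitions) >= maxPartitions {
        return fmt.Errorf("no free partition slots on this GPU")
    }
    templateKnown := false
    for _, tpl := range status.PartitionTemplates {
        if tpl.TemplateID == templateID {
            templateKnown = true
            break
        }
    }
    if !templateKnown {
        return fmt.Errorf("partition template %q not reported by this GPU", templateID)
    }
    if status.AllocatedPartitions == nil {
        status.AllocatedPartitions = map[string]tfv1.AllocatedPartition{}
    }
    status.AllocatedPartitions[podUID] = tfv1.AllocatedPartition{
        TemplateID:  templateID,
        PodUID:      podUID,
        PodName:     podName,
        Namespace:   namespace,
        AllocatedAt: metav1.Now(),
    }
    return nil
}

Keeping only the template ID in status and the full resource profile in the GPU info config (see internal/config/gpu_info.go below) keeps the CRD status small while still letting the scheduler resolve capacity per template.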

api/v1/gpuresourcequota_types.go

Lines changed: 4 additions & 0 deletions
@@ -196,6 +196,10 @@ type AllocRequest struct {
  QoS QoSLevel

  Isolation IsolationModeType
+
+ // PartitionTemplateID is the template ID used for partitioned mode allocation
+ // This is set by the scheduler when a partition is matched, or read from pod annotation
+ PartitionTemplateID string
 }

 func (p *AllocRequest) Clone() fwk.StateData {
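
The field comment names two sources for this value. A minimal sketch of the annotation path, assuming AllocRequest lives in the api/v1 package and using the PartitionTemplateIDAnnotation constant added later in this commit; the helper itself is illustrative.

package gpuallocator

import (
    corev1 "k8s.io/api/core/v1"

    tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" // assumed import path
    "github.com/NexusGPU/tensor-fusion/internal/constants"
)

// applyPartitionTemplateFromPod is a hypothetical helper: if the pod already carries
// the partition annotation, copy it into the allocation request; otherwise the
// scheduler fills the field after it matches a template.
func applyPartitionTemplateFromPod(req *tfv1.AllocRequest, pod *corev1.Pod) {
    if id := pod.Annotations[constants.PartitionTemplateIDAnnotation]; id != "" {
        req.PartitionTemplateID = id
    }
}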

internal/autoscaler/autoscaler_test.go

Lines changed: 4 additions & 4 deletions
@@ -91,11 +91,11 @@ var _ = Describe("Autoscaler", func() {

  // create two workloads
  pool := tfEnv.GetGPUPool(0)
- // with two replias
+ // with two replicas
  workload0 := createWorkload(pool, 0, 2)
  workload0Workers := getWorkers(workload0)
  key0 := WorkloadID{workload0.Namespace, workload0.Name}
- // with one replia
+ // with one replica
  workload1 := createWorkload(pool, 1, 1)
  workload1Workers := getWorkers(workload1)
  key1 := WorkloadID{workload1.Namespace, workload1.Name}
@@ -539,8 +539,8 @@ func (f *FakeRecommender) Name() string {
  return "fake"
 }

- func (f *FakeRecommender) Recommend(ctx context.Context, workoad *workload.State) (*recommender.RecResult, error) {
- meta.SetStatusCondition(&workoad.Status.Conditions, metav1.Condition{
+ func (f *FakeRecommender) Recommend(ctx context.Context, workload *workload.State) (*recommender.RecResult, error) {
+ meta.SetStatusCondition(&workload.Status.Conditions, metav1.Condition{
  Type: constants.ConditionStatusTypeRecommendationProvided,
  Status: metav1.ConditionTrue,
  LastTransitionTime: metav1.Now(),

internal/cloudprovider/pricing/pricing.go

Lines changed: 4 additions & 0 deletions
@@ -31,6 +31,7 @@ import (
  "github.com/NexusGPU/tensor-fusion/internal/cloudprovider/types"
  "github.com/NexusGPU/tensor-fusion/internal/config"
  "github.com/NexusGPU/tensor-fusion/internal/constants"
+ "github.com/NexusGPU/tensor-fusion/internal/gpuallocator"
  "k8s.io/apimachinery/pkg/api/resource"
  "sigs.k8s.io/controller-runtime/pkg/log"
 )
@@ -104,6 +105,9 @@ func SetTflopsMapAndInitGPUPricingInfo(ctx context.Context, gpuInfos *[]config.G
  tflopsMap[gpuInfo.Model] = completeInfo
  }

+ // Load partition templates from config
+ gpuallocator.LoadPartitionTemplatesFromConfig(*gpuInfos)
+
  initOnce.Do(func() {
  globalAWSGPUInstanceData = make(map[string]GPUNodeInstanceInfoAndPrice)
  globalAzureGPUInstanceData = make(map[string]GPUNodeInstanceInfoAndPrice)
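
LoadPartitionTemplatesFromConfig itself is added elsewhere in this commit and is not shown in this excerpt; presumably it builds a per-model index of the templates carried in the GPU info config. A rough sketch of that idea, under the assumption of an in-memory map guarded for concurrent readers — the names below are illustrative stand-ins, not the real implementation.

package gpuallocator

import (
    "sync"

    "github.com/NexusGPU/tensor-fusion/internal/config"
)

var (
    partitionTemplatesMu sync.RWMutex
    // model name -> partition templates declared for that model
    partitionTemplatesByModel = map[string][]config.PartitionTemplateInfo{}
)

// indexPartitionTemplates is an illustrative stand-in for the real loader: it keeps
// only models that actually declare templates, so lookups during scheduling are cheap.
func indexPartitionTemplates(gpuInfos []config.GpuInfo) {
    partitionTemplatesMu.Lock()
    defer partitionTemplatesMu.Unlock()
    for _, info := range gpuInfos {
        if len(info.PartitionTemplates) > 0 {
            partitionTemplatesByModel[info.Model] = info.PartitionTemplates
        }
    }
}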

internal/component/component.go

Lines changed: 1 addition & 1 deletion
@@ -170,7 +170,7 @@ func calculateDesiredUpdatedDelta(total int, updatedSize int, batchPercentage in
  currentBatchIndex = newUpdateProgress / batchPercentage
  desiredSize = min((currentBatchIndex+1)*int32(batchSize), int32(total))
  delta = desiredSize - int32(updatedSize)
- // if rolling udpate policy changed or new nodes were added during update, we need to update progress
+ // if rolling update policy changed or new nodes were added during update, we need to update progress
  if delta < 0 {
  newUpdateProgress = min(newUpdateProgress+batchPercentage, 100)
  } else {

internal/config/gpu_info.go

Lines changed: 34 additions & 0 deletions
@@ -10,6 +10,40 @@ type GpuInfo struct {
  CostPerHour float64 `json:"costPerHour"`
  Fp16TFlops resource.Quantity `json:"fp16TFlops"`
  FullModelName string `json:"fullModelName"`
+
+ // PartitionTemplates contains available partition templates for this GPU (e.g., MIG profiles)
+ // Only applicable for GPUs that support hardware partitioning
+ PartitionTemplates []PartitionTemplateInfo `json:"partitionTemplates,omitempty"`
+
+ // MaxPartitions is the maximum number of partitions this GPU can support (e.g., 7 for MIG)
+ MaxPartitions uint32 `json:"maxPartitions,omitempty"`
+ }
+
+ // PartitionTemplateInfo contains detailed resource information for a partition template
+ type PartitionTemplateInfo struct {
+ // TemplateID is the unique identifier (e.g., "1g.24gb", "4g.94gb")
+ TemplateID string `json:"templateId"`
+
+ // Name is a human-readable name
+ Name string `json:"name"`
+
+ // MemoryBytes is the memory allocated to this partition in bytes
+ MemoryBytes uint64 `json:"memoryBytes"`
+
+ // ComputeUnits is the number of compute units (SMs) allocated
+ ComputeUnits uint64 `json:"computeUnits"`
+
+ // Tflops is the TFLOPS capacity of this partition
+ Tflops float64 `json:"tflops"`
+
+ // SliceCount is the number of slices (for MIG, this is the denominator, e.g., 7 for 1/7)
+ SliceCount uint32 `json:"sliceCount"`
+
+ // IsDefault indicates if this is a default template
+ IsDefault bool `json:"isDefault,omitempty"`
+
+ // Description provides additional information about this template
+ Description string `json:"description,omitempty"`
 }

 func MockGpuInfo() *[]GpuInfo {
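
For reference, a GpuInfo entry carrying partition templates could look like the sketch below. All numbers and model strings are placeholders to show the shape of the data, not real MIG specifications.

package example

import (
    "k8s.io/apimachinery/pkg/api/resource"

    "github.com/NexusGPU/tensor-fusion/internal/config"
)

// exampleGpuInfo shows how the new fields fit together; values are illustrative.
var exampleGpuInfo = config.GpuInfo{
    Model:         "ExampleGPU",
    FullModelName: "Example GPU 96GB",
    CostPerHour:   3.0,
    Fp16TFlops:    resource.MustParse("1000"),
    MaxPartitions: 7,
    PartitionTemplates: []config.PartitionTemplateInfo{
        {
            TemplateID:   "1g.24gb",
            Name:         "1/7 of the GPU, 24 GiB",
            MemoryBytes:  24 << 30, // 24 GiB
            ComputeUnits: 16,
            Tflops:       140,
            SliceCount:   7,
            IsDefault:    true,
        },
        {
            TemplateID:   "4g.94gb",
            Name:         "4/7 of the GPU, 94 GiB",
            MemoryBytes:  94 << 30, // 94 GiB
            ComputeUnits: 64,
            Tflops:       570,
            SliceCount:   7,
            Description:  "larger slice for training workloads",
        },
    },
}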

internal/constants/constants.go

Lines changed: 4 additions & 1 deletion
@@ -83,7 +83,10 @@ const (
  // GPUModelAnnotation specifies the required GPU model (e.g., "A100", "H100")
  GPUModelAnnotation = Domain + "/gpu-model"
  // GPU ID list is assigned by scheduler, should not specified by user
- GPUDeviceIDsAnnotation = Domain + "/gpu-ids"
+ GPUDeviceIDsAnnotation = Domain + "/gpu-ids"
+ // PartitionTemplateIDAnnotation is the partition UUID assigned to a pod in partitioned mode
+ // This is read by accelerator.c to mock slice GPU like MIG does
+ PartitionTemplateIDAnnotation = Domain + "/partition"
  DedicatedGPUAnnotation = Domain + "/dedicated-gpu"
  SetPendingOwnedWorkloadAnnotation = Domain + "/pending-owned-workload"
  PricingAnnotation = Domain + "/hourly-pricing"
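
The constant's comment describes the node-side consumer (accelerator.c); on the control-plane side something has to write the annotation after a partition is matched. A minimal sketch of stamping it onto the pod, assuming a mutating step in the scheduler or webhook — the package and helper names are illustrative.

package webhook

import (
    corev1 "k8s.io/api/core/v1"

    "github.com/NexusGPU/tensor-fusion/internal/constants"
)

// setPartitionAnnotation records the partition chosen for this pod so the node-side
// component can pick it up later; purely illustrative of how the constant is used.
func setPartitionAnnotation(pod *corev1.Pod, partitionID string) {
    if pod.Annotations == nil {
        pod.Annotations = map[string]string{}
    }
    pod.Annotations[constants.PartitionTemplateIDAnnotation] = partitionID
}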

internal/gpuallocator/filter/filter_test.go

Lines changed: 5 additions & 4 deletions
@@ -111,7 +111,7 @@ func TestFilters(t *testing.T) {
  filter := NewResourceFilter(tfv1.Resource{
  Tflops: resource.MustParse("8"),
  Vram: resource.MustParse("30Gi"),
- }, nil)
+ })
  result, err := filter.Filter(ctx, testPodKey, gpus)
  assert.NoError(t, err)
  assert.Len(t, result, 2)
@@ -126,7 +126,7 @@
  With(NewResourceFilter(tfv1.Resource{
  Tflops: resource.MustParse("8"),
  Vram: resource.MustParse("30Gi"),
- }, nil))
+ }))

  // Apply filters
  result, _, err := registry.Apply(ctx, testPodKey, gpus, false)
@@ -137,10 +137,11 @@

  t.Run("FilterRegistry with gpu indices filtering", func(t *testing.T) {
  registry := NewFilterRegistry().
+ With(NewGPUIndexFilter([]int32{2, 3})).
  With(NewResourceFilter(tfv1.Resource{
  Tflops: resource.MustParse("1"),
  Vram: resource.MustParse("1Gi"),
- }, []int32{2, 3}))
+ }))

  // Apply filters
  result, _, err := registry.Apply(ctx, testPodKey, gpus, false)
@@ -160,7 +161,7 @@
  With(NewResourceFilter(tfv1.Resource{
  Tflops: resource.MustParse("8"),
  Vram: resource.MustParse("30Gi"),
- }, nil))
+ }))

  // Apply base registry filters
  baseResult, _, err := baseRegistry.Apply(ctx, testPodKey, gpus, false)
