Skip to content

Commit 5073aba

Browse files
committed
fix: support partition allocation in scheduler
1 parent afd962a commit 5073aba

File tree

7 files changed

+676
-96
lines changed

7 files changed

+676
-96
lines changed

api/v1/workloadprofile_types.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,11 @@ type WorkloadProfileSpec struct {
6363
// How to isolate resources, could be `shared` or `soft` or `hard` or `partitioned`
6464
Isolation IsolationModeType `json:"isolation,omitempty"`
6565

66+
// +optional
67+
// PartitionTemplateID specifies the partition template ID for partitioned isolation mode
68+
// This is read from pod annotation tensor-fusion.ai/partition if specified
69+
PartitionTemplateID string `json:"partitionTemplateId,omitempty"`
70+
6671
// +optional
6772
// GPUModel specifies the required GPU model (e.g., "A100", "H100")
6873
GPUModel string `json:"gpuModel,omitempty"`
Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
/*
2+
Copyright 2024.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package filter
18+
19+
import (
20+
"context"
21+
"testing"
22+
23+
tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
24+
"github.com/stretchr/testify/assert"
25+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
26+
)
27+
28+
func TestPartitionTemplateFilter(t *testing.T) {
29+
testPodKey := tfv1.NameNamespace{
30+
Name: "test-pod",
31+
Namespace: "test-namespace",
32+
}
33+
34+
tests := []struct {
35+
name string
36+
isolationMode tfv1.IsolationModeType
37+
requiredTemplate string
38+
maxPartitionsMap map[string]uint32
39+
gpus []*tfv1.GPU
40+
expectedCount int
41+
expectedGPUNames []string
42+
}{
43+
{
44+
name: "non-partitioned mode should pass all GPUs",
45+
isolationMode: tfv1.IsolationModeSoft,
46+
requiredTemplate: "",
47+
maxPartitionsMap: map[string]uint32{},
48+
gpus: []*tfv1.GPU{
49+
{
50+
ObjectMeta: metav1.ObjectMeta{Name: "gpu-1"},
51+
Status: tfv1.GPUStatus{
52+
PartitionTemplates: []tfv1.PartitionTemplate{
53+
{TemplateID: "1g.24gb", Name: "1g.24gb"},
54+
},
55+
},
56+
},
57+
},
58+
expectedCount: 1,
59+
expectedGPUNames: []string{"gpu-1"},
60+
},
61+
{
62+
name: "partitioned mode - GPU without templates filtered out",
63+
isolationMode: tfv1.IsolationModePartitioned,
64+
requiredTemplate: "",
65+
maxPartitionsMap: map[string]uint32{"A100": 7},
66+
gpus: []*tfv1.GPU{
67+
{
68+
ObjectMeta: metav1.ObjectMeta{Name: "gpu-1"},
69+
Status: tfv1.GPUStatus{
70+
GPUModel: "A100",
71+
PartitionTemplates: []tfv1.PartitionTemplate{},
72+
},
73+
},
74+
{
75+
ObjectMeta: metav1.ObjectMeta{Name: "gpu-2"},
76+
Status: tfv1.GPUStatus{
77+
GPUModel: "A100",
78+
PartitionTemplates: []tfv1.PartitionTemplate{
79+
{TemplateID: "1g.24gb", Name: "1g.24gb"},
80+
},
81+
},
82+
},
83+
},
84+
expectedCount: 1,
85+
expectedGPUNames: []string{"gpu-2"},
86+
},
87+
{
88+
name: "partitioned mode - specific template required",
89+
isolationMode: tfv1.IsolationModePartitioned,
90+
requiredTemplate: "1g.24gb",
91+
maxPartitionsMap: map[string]uint32{"A100": 7},
92+
gpus: []*tfv1.GPU{
93+
{
94+
ObjectMeta: metav1.ObjectMeta{Name: "gpu-1"},
95+
Status: tfv1.GPUStatus{
96+
GPUModel: "A100",
97+
PartitionTemplates: []tfv1.PartitionTemplate{
98+
{TemplateID: "4g.94gb", Name: "4g.94gb"},
99+
},
100+
},
101+
},
102+
{
103+
ObjectMeta: metav1.ObjectMeta{Name: "gpu-2"},
104+
Status: tfv1.GPUStatus{
105+
GPUModel: "A100",
106+
PartitionTemplates: []tfv1.PartitionTemplate{
107+
{TemplateID: "1g.24gb", Name: "1g.24gb"},
108+
},
109+
},
110+
},
111+
},
112+
expectedCount: 1,
113+
expectedGPUNames: []string{"gpu-2"},
114+
},
115+
{
116+
name: "partitioned mode - max partitions reached",
117+
isolationMode: tfv1.IsolationModePartitioned,
118+
requiredTemplate: "",
119+
maxPartitionsMap: map[string]uint32{"A100": 7},
120+
gpus: []*tfv1.GPU{
121+
{
122+
ObjectMeta: metav1.ObjectMeta{Name: "gpu-1"},
123+
Status: tfv1.GPUStatus{
124+
GPUModel: "A100",
125+
PartitionTemplates: []tfv1.PartitionTemplate{
126+
{TemplateID: "1g.24gb", Name: "1g.24gb"},
127+
},
128+
AllocatedPartitions: map[string]tfv1.AllocatedPartition{
129+
"pod-1": {TemplateID: "1g.24gb", PodUID: "pod-1"},
130+
"pod-2": {TemplateID: "1g.24gb", PodUID: "pod-2"},
131+
"pod-3": {TemplateID: "1g.24gb", PodUID: "pod-3"},
132+
"pod-4": {TemplateID: "1g.24gb", PodUID: "pod-4"},
133+
"pod-5": {TemplateID: "1g.24gb", PodUID: "pod-5"},
134+
"pod-6": {TemplateID: "1g.24gb", PodUID: "pod-6"},
135+
"pod-7": {TemplateID: "1g.24gb", PodUID: "pod-7"},
136+
},
137+
},
138+
},
139+
{
140+
ObjectMeta: metav1.ObjectMeta{Name: "gpu-2"},
141+
Status: tfv1.GPUStatus{
142+
GPUModel: "A100",
143+
PartitionTemplates: []tfv1.PartitionTemplate{
144+
{TemplateID: "1g.24gb", Name: "1g.24gb"},
145+
},
146+
AllocatedPartitions: map[string]tfv1.AllocatedPartition{
147+
"pod-1": {TemplateID: "1g.24gb", PodUID: "pod-1"},
148+
},
149+
},
150+
},
151+
},
152+
expectedCount: 1,
153+
expectedGPUNames: []string{"gpu-2"},
154+
},
155+
}
156+
157+
ctx := context.Background()
158+
159+
for _, tt := range tests {
160+
t.Run(tt.name, func(t *testing.T) {
161+
filter := NewPartitionTemplateFilter(tt.isolationMode, tt.requiredTemplate, tt.maxPartitionsMap)
162+
result, err := filter.Filter(ctx, testPodKey, tt.gpus)
163+
164+
assert.NoError(t, err)
165+
assert.Len(t, result, tt.expectedCount)
166+
if len(tt.expectedGPUNames) > 0 {
167+
resultNames := make([]string, len(result))
168+
for i, gpu := range result {
169+
resultNames[i] = gpu.Name
170+
}
171+
assert.ElementsMatch(t, tt.expectedGPUNames, resultNames)
172+
}
173+
})
174+
}
175+
}
176+

0 commit comments

Comments
 (0)