Skip to content

Commit 6f043eb

Browse files
committed
fix: refactor hypervisor
1 parent 01a3dc6 commit 6f043eb

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

75 files changed

+8744
-564
lines changed

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,4 +45,6 @@ logs
4545
provider/build
4646

4747
cmd/hypervisor/hypervisor
48-
*.o
48+
*.o
49+
50+
_obj

.vscode/settings.json

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
"certgen",
2525
"certificaterequests",
2626
"certmanager",
27+
"CFLAGS",
2728
"clientcmd",
2829
"clientcmdapi",
2930
"clientgoscheme",
@@ -45,6 +46,7 @@
4546
"datanode",
4647
"deepcopy",
4748
"defaultbinder",
49+
"deviceplugin",
4850
"dylib",
4951
"eastus",
5052
"envtest",
@@ -55,6 +57,7 @@
5557
"finalizer",
5658
"Finalizers",
5759
"frameworkruntime",
60+
"fsnotify",
5861
"FULLTEXT",
5962
"goconst",
6063
"gocyclo",
@@ -99,6 +102,7 @@
99102
"kubescheduler",
100103
"kubeschedulerconfig",
101104
"kustomization",
105+
"libaccelerator",
102106
"libcuda",
103107
"libnvidia",
104108
"lineprotocol",
@@ -113,6 +117,7 @@
113117
"nindent",
114118
"nodeclaim",
115119
"nodeclassref",
120+
"nodelist",
116121
"noderesources",
117122
"nolint",
118123
"NUMA",
@@ -121,6 +126,7 @@
121126
"objs",
122127
"omitempty",
123128
"onsi",
129+
"pluginapi",
124130
"portallocator",
125131
"Postable",
126132
"printcolumn",
@@ -148,6 +154,10 @@
148154
"shortuuid",
149155
"statefulset",
150156
"statefulsets",
157+
"stdbool",
158+
"stddef",
159+
"stdint",
160+
"stdlib",
151161
"strategicpatch",
152162
"strategicpatches",
153163
"stretchr",

Makefile

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,21 @@ build: manifests generate fmt vet ## Build manager binary.
110110
run: manifests generate fmt vet ## Run a controller from your host.
111111
go run ./cmd/main.go
112112

113+
.PHONY: build-provider
114+
build-provider: ## Build accelerator stub library.
115+
$(MAKE) -C provider stub
116+
117+
.PHONY: build-hypervisor
118+
build-hypervisor: build-provider ## Build hypervisor binary with CGO enabled.
119+
@PROVIDER_DIR=$$(pwd)/provider; \
120+
CGO_ENABLED=1 \
121+
CGO_CFLAGS="-I$$PROVIDER_DIR" \
122+
go build -o bin/hypervisor ./cmd/hypervisor
123+
124+
.PHONY: clean-cache
125+
clean-cache: ## Clean Go build cache.
126+
go clean -cache -testcache
127+
113128
# If you wish to build the manager image targeting other platforms you can use the --platform flag.
114129
# (i.e. docker build --platform linux/arm64). However, you must enable docker buildKit for it.
115130
# More info: https://docs.docker.com/develop/develop-images/build_enhancements/

api/v1/gpu_types.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,10 @@ type GPUStatus struct {
3838

3939
UUID string `json:"uuid"`
4040

41+
// +optional
42+
// +kubebuilder:default=soft
43+
IsolationMode IsolationModeType `json:"isolationMode,omitempty"`
44+
4145
// +optional
4246
Index *int32 `json:"index,omitempty"`
4347

api/v1/gpupool_types.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,10 @@ type GPUPoolSpec struct {
3333
// +optional
3434
DefaultUsingLocalGPU *bool `json:"defaultUsingLocalGPU,omitempty"`
3535

36+
// +optional
37+
// +kubebuilder:default=NVIDIA
38+
Vendor string `json:"vendor,omitempty"`
39+
3640
CapacityConfig *CapacityConfig `json:"capacityConfig,omitempty"`
3741

3842
NodeManagerConfig *NodeManagerConfig `json:"nodeManagerConfig,omitempty"`
@@ -88,12 +92,23 @@ type NodeManagerConfig struct {
8892
// +kubebuilder:default="AutoSelect"
8993
ProvisioningMode ProvisioningMode `json:"provisioningMode,omitempty"`
9094

95+
// +optional
96+
// +kubebuilder:default=NVIDIA
97+
// In single AI accelerator hardware vendor mode, when a default vendor is set,
98+
// all nodes provisioned by NodeProvisioner or selected by NodeSelector will be given the vendor label.
99+
DefaultVendor string `json:"defaultVendor,omitempty"`
100+
91101
// +optional
92102
NodeProvisioner *NodeProvisioner `json:"nodeProvisioner,omitempty"`
93103

94104
// +optional
95105
NodeSelector *corev1.NodeSelector `json:"nodeSelector,omitempty"`
96106

107+
// +optional
108+
// When this field is set, the GPU pool is in multi AI accelerator vendor mode:
109+
// each GPU node's vendor name is set to the map key, e.g. { AMD: { nodeSelectorTerms }}
110+
MultiVendorNodeSelector map[string]*corev1.NodeSelector `json:"multiVendorNodeSelector,omitempty"`
111+
97112
// +optional
98113
NodeCompaction *NodeCompaction `json:"nodeCompaction,omitempty"`
99114

api/v1/gpuresourcequota_types.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,8 @@ type AllocRequest struct {
194194
PodMeta metav1.ObjectMeta
195195

196196
QoS QoSLevel
197+
198+
Isolation IsolationModeType
197199
}
198200

199201
func (p *AllocRequest) Clone() fwk.StateData {

charts/tensor-fusion/templates/controller-deployment.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ spec:
5757
fieldPath: metadata.namespace
5858
# When deploying in AutoSelect mode, GPU nodes are managed by Kubernetes rather than TensorFusion, so a label selector must be specified to generate the GPUNode custom resources
5959
- name: INITIAL_GPU_NODE_LABEL_SELECTOR
60-
value: "{{ default "nvidia.com/gpu.present=true" .Values.initialGpuNodeLabelSelector }}"
60+
value: "{{ .Values.initialGpuNodeLabelSelector }}"
6161
- name: TSDB_MYSQL_HOST
6262
value: "{{ .Values.greptime.host }}"
6363
- name: TSDB_MYSQL_PORT
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
initialGpuNodeLabelSelector: ""

cmd/hypervisor-tui/main.go

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
/*
2+
Copyright 2024.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package main
18+
19+
import (
20+
"context"
21+
"flag"
22+
"os"
23+
24+
"github.com/NexusGPU/tensor-fusion/internal/hypervisor/tui"
25+
tea "github.com/charmbracelet/bubbletea"
26+
"k8s.io/klog/v2"
27+
)
28+
29+
var (
30+
host = flag.String("host", "localhost", "Hypervisor server host")
31+
port = flag.Int("port", 8000, "Hypervisor server port")
32+
)
33+
34+
func main() {
35+
flag.Parse()
36+
klog.InitFlags(nil)
37+
defer klog.Flush()
38+
39+
ctx, cancel := context.WithCancel(context.Background())
40+
defer cancel()
41+
42+
// Create HTTP client
43+
client := tui.NewClient(*host, *port)
44+
45+
// Create TUI model
46+
model := tui.NewModel(ctx, client)
47+
48+
// Start TUI
49+
p := tea.NewProgram(model, tea.WithAltScreen())
50+
if _, err := p.Run(); err != nil {
51+
klog.Fatalf("Error running TUI: %v", err)
52+
os.Exit(1)
53+
}
54+
}

0 commit comments

Comments
 (0)