NexusGPU
diff --git a/‎.gitignore‎
Lines changed: 3 additions & 1 deletion b/‎.gitignore‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎.vscode/settings.json‎
Lines changed: 10 additions & 0 deletions b/‎.vscode/settings.json‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎Makefile‎
Lines changed: 15 additions & 0 deletions b/‎Makefile‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎api/v1/gpu_types.go‎
Lines changed: 4 additions & 0 deletions b/‎api/v1/gpu_types.go‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎api/v1/gpupool_types.go‎
Lines changed: 15 additions & 0 deletions b/‎api/v1/gpupool_types.go‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎api/v1/gpuresourcequota_types.go‎
Lines changed: 2 additions & 0 deletions b/‎api/v1/gpuresourcequota_types.go‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎charts/tensor-fusion/templates/controller-deployment.yaml‎
Lines changed: 1 addition & 1 deletion b/‎charts/tensor-fusion/templates/controller-deployment.yaml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎charts/tensor-fusion/values-multi-vendor.yaml‎
Lines changed: 1 addition & 0 deletions b/‎charts/tensor-fusion/values-multi-vendor.yaml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎cmd/hypervisor-tui/main.go‎
Lines changed: 54 additions & 0 deletions b/‎cmd/hypervisor-tui/main.go‎
Lines changed: 54 additions & 0 deletions
@@ -45,4 +45,6 @@ logs
 provider/build
 
 cmd/hypervisor/hypervisor
-*.o
+*.o
+
+_obj
@@ -24,6 +24,7 @@
         "certgen",
         "certificaterequests",
         "certmanager",
+        "CFLAGS",
         "clientcmd",
         "clientcmdapi",
         "clientgoscheme",
@@ -45,6 +46,7 @@
         "datanode",
         "deepcopy",
         "defaultbinder",
+        "deviceplugin",
         "dylib",
         "eastus",
         "envtest",
@@ -55,6 +57,7 @@
         "finalizer",
         "Finalizers",
         "frameworkruntime",
+        "fsnotify",
         "FULLTEXT",
         "goconst",
         "gocyclo",
@@ -99,6 +102,7 @@
         "kubescheduler",
         "kubeschedulerconfig",
         "kustomization",
+        "libaccelerator",
         "libcuda",
         "libnvidia",
         "lineprotocol",
@@ -113,6 +117,7 @@
         "nindent",
         "nodeclaim",
         "nodeclassref",
+        "nodelist",
         "noderesources",
         "nolint",
         "NUMA",
@@ -121,6 +126,7 @@
         "objs",
         "omitempty",
         "onsi",
+        "pluginapi",
         "portallocator",
         "Postable",
         "printcolumn",
@@ -148,6 +154,10 @@
         "shortuuid",
         "statefulset",
         "statefulsets",
+        "stdbool",
+        "stddef",
+        "stdint",
+        "stdlib",
         "strategicpatch",
         "strategicpatches",
         "stretchr",
 
@@ -110,6 +110,21 @@ build: manifests generate fmt vet ## Build manager binary.
 run: manifests generate fmt vet ## Run a controller from your host.
 	go run ./cmd/main.go
 
+.PHONY: build-provider
+build-provider: ## Build accelerator stub library.
+	$(MAKE) -C provider stub
+
+# Build the hypervisor binary after the build-provider prerequisite has run.
+# The recipe is a single shell command (lines joined with `\`), so the
+# PROVIDER_DIR assignment is still visible when CGO_CFLAGS is expanded for
+# the `go build` invocation.
+.PHONY: build-hypervisor
+build-hypervisor: build-provider ## Build hypervisor binary with CGO enabled.
+	@PROVIDER_DIR=$$(pwd)/provider; \
+	CGO_ENABLED=1 \
+	CGO_CFLAGS="-I$$PROVIDER_DIR" \
+	go build -o bin/hypervisor ./cmd/hypervisor
+
+.PHONY: clean-cache
+clean-cache: ## Clean Go build cache.
+	go clean -cache -testcache
+
 # If you wish to build the manager image targeting other platforms you can use the --platform flag.
 # (i.e. docker build --platform linux/arm64). However, you must enable docker buildKit for it.
 # More info: https://docs.docker.com/develop/develop-images/build_enhancements/
 
@@ -38,6 +38,10 @@ type GPUStatus struct {
 
 	UUID string `json:"uuid"`
 
+	// +optional
+	// +kubebuilder:default=soft
+	IsolationMode IsolationModeType `json:"isolationMode,omitempty"`
+
 	// +optional
 	Index *int32 `json:"index,omitempty"`
 
 
@@ -33,6 +33,10 @@ type GPUPoolSpec struct {
 	// +optional
 	DefaultUsingLocalGPU *bool `json:"defaultUsingLocalGPU,omitempty"`
 
+	// +optional
+	// +kubebuilder:default=NVIDIA
+	Vendor string `json:"vendor,omitempty"`
+
 	CapacityConfig *CapacityConfig `json:"capacityConfig,omitempty"`
 
 	NodeManagerConfig *NodeManagerConfig `json:"nodeManagerConfig,omitempty"`
@@ -88,12 +92,23 @@ type NodeManagerConfig struct {
 	// +kubebuilder:default="AutoSelect"
 	ProvisioningMode ProvisioningMode `json:"provisioningMode,omitempty"`
 
+	// +optional
+	// +kubebuilder:default=NVIDIA
+	// In single AI accelerator hardware vendor mode, when the default vendor is set,
+	// all nodes provisioned by NodeProvisioner or selected by NodeSelector will be set with the vendor label.
+	DefaultVendor string `json:"defaultVendor,omitempty"`
+
 	// +optional
 	NodeProvisioner *NodeProvisioner `json:"nodeProvisioner,omitempty"`
 
 	// +optional
 	NodeSelector *corev1.NodeSelector `json:"nodeSelector,omitempty"`
 
+	// +optional
+	// When this field is set, the GPU pool will be in multi AI accelerator vendor mode;
+	// each GPU node's vendor name is set to the map key, e.g. { AMD: { nodeSelectorTerms }}
+	MultiVendorNodeSelector map[string]*corev1.NodeSelector `json:"multiVendorNodeSelector,omitempty"`
+
 	// +optional
 	NodeCompaction *NodeCompaction `json:"nodeCompaction,omitempty"`
 
 
@@ -194,6 +194,8 @@ type AllocRequest struct {
 	PodMeta metav1.ObjectMeta
 
 	QoS QoSLevel
+
+	Isolation IsolationModeType
 }
 
 func (p *AllocRequest) Clone() fwk.StateData {
 
@@ -57,7 +57,7 @@ spec:
                   fieldPath: metadata.namespace
             # when deploy with AutoSelect mode, GPU node is managed by Kubernetes rather than TensorFusion, thus, need to specify the label selector to generate the GPUNode custom resource
             - name: INITIAL_GPU_NODE_LABEL_SELECTOR
-              value: "{{ default "nvidia.com/gpu.present=true" .Values.initialGpuNodeLabelSelector }}"
+              value: "{{ .Values.initialGpuNodeLabelSelector }}"
             - name: TSDB_MYSQL_HOST
               value: "{{ .Values.greptime.host }}"
             - name: TSDB_MYSQL_PORT
 
@@ -0,0 +1 @@
+initialGpuNodeLabelSelector: ""
@@ -0,0 +1,54 @@
+/*
+Copyright 2024.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package main
+
+import (
+	"context"
+	"flag"
+	"os"
+
+	"github.com/NexusGPU/tensor-fusion/internal/hypervisor/tui"
+	tea "github.com/charmbracelet/bubbletea"
+	"k8s.io/klog/v2"
+)
+
+// host is the value of the -host flag: the hypervisor server host.
+var host = flag.String("host", "localhost", "Hypervisor server host")
+
+// port is the value of the -port flag: the hypervisor server port.
+var port = flag.Int("port", 8000, "Hypervisor server port")
+
+// main parses the command-line flags, builds the client and TUI model from
+// the tui package using the -host and -port values, and runs the Bubble Tea
+// program on the alternate screen. It exits with status 1 if the program
+// returns an error.
+func main() {
+	// klog's flags must be registered before flag.Parse, otherwise they are
+	// unknown to the parser and cannot be set from the command line.
+	klog.InitFlags(nil)
+	flag.Parse()
+	defer klog.Flush()
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	// Create HTTP client
+	client := tui.NewClient(*host, *port)
+
+	// Create TUI model
+	model := tui.NewModel(ctx, client)
+
+	// Start TUI
+	p := tea.NewProgram(model, tea.WithAltScreen())
+	if _, err := p.Run(); err != nil {
+		klog.Errorf("Error running TUI: %v", err)
+		// os.Exit does not run deferred calls, so cancel the context and
+		// flush buffered log output explicitly before exiting.
+		cancel()
+		klog.Flush()
+		os.Exit(1)
+	}
+}
Original file line number	Diff line number	Diff line change
`@@ -194,6 +194,8 @@ type AllocRequest struct {`
`194`	`194`	`PodMeta metav1.ObjectMeta`
`195`	`195`
`196`	`196`	`QoS QoSLevel`
	`197`	`+`
	`198`	`+ Isolation IsolationModeType`
`197`	`199`	`}`
`198`	`200`
`199`	`201`	`func (p *AllocRequest) Clone() fwk.StateData {`