NexusGPU · wangqianqianjun · Aug 25, 2025 · Aug 27, 2025 · Aug 30, 2025 · Aug 31, 2025
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -29,7 +29,7 @@ jobs:
         uses: actions/checkout@v5
 
       - name: Setup Go
-        uses: actions/setup-go@v5
+        uses: actions/setup-go@v6
         with:
           go-version: '~1.24'
 

diff --git a/.github/workflows/test-e2e.yml b/.github/workflows/test-e2e.yml
@@ -1,5 +1,8 @@
 name: E2E Tests
 
+permissions:
+  contents: read
+
 on:
   workflow_dispatch:
 

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -28,13 +28,13 @@ jobs:
     strategy:
       matrix:
         # from https://github.com/kubernetes-sigs/controller-tools/blob/main/envtest-releases.yaml
-        envtest_k8s_version: [1.23.5, 1.33.0]
+        envtest_k8s_version: [1.23.5, 1.34.0]
     steps:
       - name: Clone the code
         uses: actions/checkout@v5
 
       - name: Setup Go
-        uses: actions/setup-go@v5
+        uses: actions/setup-go@v6
         with:
           go-version: '~1.24'
 

diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -61,7 +61,8 @@
                 "KUBECONFIG": "~/.kube/config-local-studio",
                 "ENABLE_WEBHOOKS": "false",
                 "ENABLE_SCHEDULER": "true",
-                "ENABLE_CR_CONTROLLER": "true"
+                "ENABLE_CR_CONTROLLER": "true",
+                "NVIDIA_OPERATOR_PROGRESSIVE_MIGRATION": "true"
             },
             "args": [
                 "--metrics-path", "${workspaceFolder}/logs/metrics.log",

diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -25,6 +25,7 @@
         "clientcmdapi",
         "clientgoscheme",
         "clientset",
+        "clientsetfake",
         "cloudnative",
         "cloudprovider",
         "clusterissuers",
@@ -46,6 +47,8 @@
         "envtest",
         "essd",
         "Eventf",
+        "evictable",
+        "featuregate",
         "finalizer",
         "Finalizers",
         "frameworkruntime",
@@ -78,6 +81,8 @@
         "iface",
         "imageutils",
         "influxdata",
+        "internalcache",
+        "internalqueue",
         "jsonpatch",
         "karpenter",
         "karpv",
@@ -129,6 +134,7 @@
         "schedulingconfigtemplate",
         "schedulingconfigtemplates",
         "schedulingcorev",
+        "schedv",
         "serviceaccount",
         "shirou",
         "shortuuid",

diff --git a/api/v1/gpupool_types.go b/api/v1/gpupool_types.go
@@ -238,6 +238,12 @@ type QosConfig struct {
 	Definitions []QosDefinition `json:"definitions,omitempty"`
 	DefaultQoS  QoSLevel        `json:"defaultQoS,omitempty"`
 	Pricing     []QosPricing    `json:"pricing,omitempty"`
+
+	// Eviction protection price ratio applied to cost calculation during protection period
+	// This multiplier increases pricing for protected workloads to discourage preemption
+	// +optional
+	// +kubebuilder:default="1.2"
+	EvictionProtectionPriceRatio string `json:"evictionProtectionPriceRatio,omitempty"`
 }
 
 type QosDefinition struct {

diff --git a/api/v1/gpuresourcequota_types.go b/api/v1/gpuresourcequota_types.go
@@ -19,7 +19,7 @@ package v1
 import (
 	v1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	"k8s.io/kubernetes/pkg/scheduler/framework"
+	fwk "k8s.io/kube-scheduler/framework"
 )
 
 // GPUResourceQuotaSpec defines the desired state of GPUResourceQuota
@@ -192,6 +192,12 @@ type AllocRequest struct {
 
 	// cel filter expression
 	CELFilterExpression string
+
+	QoS QoSLevel
+}
+
+func (p *AllocRequest) Clone() fwk.StateData {
+	return p // hands back the receiver pointer itself; no fields are copied
 }
 
 type GPUAllocationInfo struct {
@@ -209,7 +215,7 @@ type AdjustRequest struct {
 	NewLimit   Resource
 }
 
-func (ar *AllocRequest) Clone() framework.StateData {
+func (ar *AdjustRequest) Clone() fwk.StateData { // returns ar itself; no copy is made
 	return ar
 }
 

diff --git a/api/v1/schedulingconfigtemplate_types.go b/api/v1/schedulingconfigtemplate_types.go
@@ -39,6 +39,10 @@ type SchedulingConfigTemplateSpec struct {
 	// single GPU device multi-process queuing and fair scheduling with QoS constraint
 	// +optional
 	Hypervisor *HypervisorScheduling `json:"hypervisor,omitempty"`
+
+	// enable Dynamic Resource Allocation (DRA) for GPU resource management
+	// +optional
+	DRA *DRAConfig `json:"dra,omitempty"`
 }
 
 type PlacementConfig struct {
@@ -206,6 +210,17 @@ type MultiProcessQueuing struct {
 	QueueLevelTimeSlices []string `json:"queueLevelTimeSlices,omitempty"`
 }
 
+// DRAConfig groups the optional Dynamic Resource Allocation (DRA) settings: an enable flag and a ResourceClaim template name.
+type DRAConfig struct {
+	// Enable DRA mode for all workloads in this configuration template
+	// +optional
+	Enable *bool `json:"enable,omitempty"`
+
+	// ResourceClaimTemplateName specifies the ResourceClaim template name to use
+	// +optional
+	ResourceClaimTemplateName string `json:"resourceClaimTemplateName,omitempty"`
+}
+
 // SchedulingConfigTemplateStatus defines the observed state of SchedulingConfigTemplate.
 type SchedulingConfigTemplateStatus struct {
 	// INSERT ADDITIONAL STATUS FIELD - define observed state of cluster

diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go
diff --git a/charts/tensor-fusion/Chart.yaml b/charts/tensor-fusion/Chart.yaml
@@ -15,7 +15,7 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 1.5.5
+version: 1.5.9
 
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to

diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_gpupools.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_gpupools.yaml
@@ -562,6 +562,12 @@ spec:
                           type: integer
                       type: object
                     type: array
+                  evictionProtectionPriceRatio:
+                    default: "1.2"
+                    description: |-
+                      Eviction protection price ratio applied to cost calculation during protection period
+                      This multiplier increases pricing for protected workloads to discourage preemption
+                    type: string
                   pricing:
                     items:
                       properties:

diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml
@@ -143,6 +143,19 @@ spec:
                         type: string
                     type: object
                 type: object
+              dra:
+                description: enable Dynamic Resource Allocation (DRA) for GPU resource
+                  management
+                properties:
+                  enable:
+                    description: Enable DRA mode for all workloads in this configuration
+                      template
+                    type: boolean
+                  resourceClaimTemplateName:
+                    description: ResourceClaimTemplateName specifies the ResourceClaim
+                      template name to use
+                    type: string
+                type: object
               hypervisor:
                 description: single GPU device multi-process queuing and fair scheduling
                   with QoS constraint

diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionclusters.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionclusters.yaml
@@ -629,6 +629,12 @@ spec:
                                     type: integer
                                 type: object
                               type: array
+                            evictionProtectionPriceRatio:
+                              default: "1.2"
+                              description: |-
+                                Eviction protection price ratio applied to cost calculation during protection period
+                                This multiplier increases pricing for protected workloads to discourage preemption
+                              type: string
                             pricing:
                               items:
                                 properties:

diff --git a/charts/tensor-fusion/templates/admission-webhooks/mutating-webhook.yaml b/charts/tensor-fusion/templates/admission-webhooks/mutating-webhook.yaml
@@ -11,7 +11,7 @@ webhooks:
       namespace: {{ include "tensor-fusion.namespace" . }}
       path: /mutate-v1-pod
   failurePolicy: {{ .Values.controller.admissionWebhooks.failurePolicy }}
-  name: mpod-v1.kb.io
+  name: mpod.tensor-fusion.ai
   rules:
   - apiGroups:
     - ""

diff --git a/charts/tensor-fusion/templates/controller-deployment.yaml b/charts/tensor-fusion/templates/controller-deployment.yaml
@@ -32,6 +32,7 @@ spec:
       {{- end }}
       serviceAccountName: {{ include "tensor-fusion.serviceAccountName" . }}
       enableServiceLinks: false
+      priorityClassName: "system-cluster-critical"
       containers:
         - name: controller
           image: "{{ .Values.controller.image.repository }}:{{ .Values.controller.image.tag | default .Chart.AppVersion }}"

diff --git a/charts/tensor-fusion/templates/gpu-public-gpu-info.yaml b/charts/tensor-fusion/templates/gpu-public-gpu-info.yaml
@@ -45,6 +45,18 @@ data:
       costPerHour: 1.64
       fp16TFlops: 312
 
+    - model: A100_PCIe_40GB
+      fullModelName: "NVIDIA A100-PCIE-40GB"
+      vendor: NVIDIA
+      costPerHour: 1.64
+      fp16TFlops: 312
+
+    - model: A100_PCIe_80GB
+      fullModelName: "NVIDIA A100-PCIE-80GB"
+      vendor: NVIDIA
+      costPerHour: 1.64
+      fp16TFlops: 312
+
     - model: A100_SXM_40G
       fullModelName: "NVIDIA A100-SXM4-40GB"
       vendor: NVIDIA
@@ -70,13 +82,13 @@ data:
       fp16TFlops: 312
 
     - model: A800_PCIe_80G
-      fullModelName: "NVIDIA A800 80GB PCIe"
+      fullModelName: "NVIDIA A800-PCIE-80GB"
       vendor: NVIDIA
       costPerHour: 1.64
       fp16TFlops: 312
 
     - model: A800_PCIe_40G
-      fullModelName: "NVIDIA A800 40GB PCIe"
+      fullModelName: "NVIDIA A800-PCIE-40GB"
       vendor: NVIDIA
       costPerHour: 1.64
       fp16TFlops: 312  
@@ -95,7 +107,7 @@ data:
       fp16TFlops: 125
 
     - model: A40
-      fullModelName: "NVIDIA A40 48GB PCIe"
+      fullModelName: "NVIDIA A40-PCIE-48GB"
       vendor: NVIDIA
       costPerHour: 0.4
       fp16TFlops: 149.7

diff --git a/charts/tensor-fusion/templates/priorityclass.yaml b/charts/tensor-fusion/templates/priorityclass.yaml
@@ -0,0 +1,23 @@
+apiVersion: scheduling.k8s.io/v1
+kind: PriorityClass
+metadata:
+  name: tensor-fusion-critical
+value: 100000  # highest of the three TensorFusion classes in this file
+globalDefault: false  # not applied to pods that omit priorityClassName
+description: 'TensorFusion critical priority'
+---
+apiVersion: scheduling.k8s.io/v1
+kind: PriorityClass
+metadata:
+  name: tensor-fusion-high
+value: 10000  # between the critical (100000) and medium (0) classes
+globalDefault: false  # not applied to pods that omit priorityClassName
+description: 'TensorFusion high priority'
+---
+apiVersion: scheduling.k8s.io/v1
+kind: PriorityClass
+metadata:
+  name: tensor-fusion-medium
+value: 0  # lowest of the three TensorFusion classes in this file
+globalDefault: false  # not applied to pods that omit priorityClassName
+description: 'TensorFusion medium priority'
diff --git a/charts/tensor-fusion/values.yaml b/charts/tensor-fusion/values.yaml
@@ -31,7 +31,7 @@ controller:
   image:
     repository: tensorfusion/tensor-fusion-operator
     # Overrides the image tag whose default is the chart appVersion.
-    tag: "latest"
+    tag: "1.43.4"
   # This is for setting Kubernetes Annotations to a Pod.
   # For more information checkout: https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/ 
 
@@ -120,7 +120,7 @@ agent:
 
   image:
     repository: tensorfusion/tensor-fusion-agent
-    tag: "latest"
+    tag: "1.0.0"
 
   resources:
     requests:
@@ -169,8 +169,8 @@ schedulerConfig:
   kind: KubeSchedulerConfiguration
   clientConnection:
     kubeconfig: ""
-    qps: 50
-    burst: 100
+    qps: 1000
+    burst: 2000
   profiles:
   # Refer: https://kubernetes.io/docs/reference/scheduling/config/
   - schedulerName: tensor-fusion-scheduler