
Commit 3607a10

feat: partitioned scheduling
1 parent 6f043eb commit 3607a10

File tree

22 files changed: +1746 -91 lines

.vscode/settings.json

Lines changed: 26 additions & 0 deletions
@@ -17,6 +17,9 @@
  "AWSGPU",
  "batchv",
  "Biren",
+ "bubbletea",
+ "BUILDPLATFORM",
+ "buildx",
  "burstable",
  "Cambricon",
  "CDNA",
@@ -25,6 +28,7 @@
  "certificaterequests",
  "certmanager",
  "CFLAGS",
+ "charmbracelet",
  "clientcmd",
  "clientcmdapi",
  "clientgoscheme",
@@ -52,23 +56,29 @@
  "envtest",
  "essd",
  "Eventf",
+ "eventhandlers",
  "evictable",
  "featuregate",
  "finalizer",
  "Finalizers",
  "frameworkruntime",
  "fsnotify",
  "FULLTEXT",
+ "GOBIN",
  "goconst",
  "gocyclo",
  "goerrors",
+ "golangci",
  "golint",
  "Gomega",
  "gonic",
+ "GOPATH",
  "gopsutil",
  "gorm",
  "gosec",
+ "GPGPU",
  "gpuallocator",
+ "GPUIDs",
  "gpunode",
  "gpunodeclaim",
  "gpunodeclaims",
@@ -87,10 +97,13 @@
  "Hygon",
  "iface",
  "imageutils",
+ "indexallocator",
  "influxdata",
  "Infof",
  "internalcache",
  "internalqueue",
+ "intstr",
+ "IVSHMEM",
  "jsonpatch",
  "karpenter",
  "karpv",
@@ -106,6 +119,8 @@
  "libcuda",
  "libnvidia",
  "lineprotocol",
+ "lipgloss",
+ "LOCALBIN",
  "mapstructure",
  "metav",
  "metricsserver",
@@ -121,24 +136,29 @@
  "noderesources",
  "nolint",
  "NUMA",
+ "nvdp",
  "Nvlink",
  "NVML",
  "objs",
  "omitempty",
  "onsi",
+ "pids",
  "pluginapi",
+ "podname",
  "portallocator",
  "Postable",
  "printcolumn",
  "prometheusagents",
  "prometheuses",
  "prometheusrules",
  "queuesort",
+ "Radeon",
  "RDNA",
  "readyz",
  "replicaset",
  "replicasets",
  "rolebinding",
+ "RTXA",
  "runbook",
  "runpod",
  "samber",
@@ -151,6 +171,7 @@
  "schedv",
  "serviceaccount",
  "shirou",
+ "shmem",
  "shortuuid",
  "statefulset",
  "statefulsets",
@@ -161,6 +182,7 @@
  "strategicpatch",
  "strategicpatches",
  "stretchr",
+ "strncpy",
  "subresource",
  "Tabler",
  "tensorfusion",
@@ -175,6 +197,8 @@
  "testutil",
  "tflops",
  "timberio",
+ "Timeslicing",
+ "tmpfs",
  "Tmpl",
  "tokenreviews",
  "Tolerations",
@@ -183,7 +207,9 @@
  "utilerrors",
  "utilruntime",
  "vgpu",
+ "Warningf",
  "webhookcorev",
+ "workerstate",
  "workloadprofiles",
  "workqueue",
  "Xlarge"

api/v1/gpu_types.go

Lines changed: 40 additions & 0 deletions
@@ -65,6 +65,16 @@ type GPUStatus struct {

  // +optional
  RunningApps []*RunningAppDetail `json:"runningApps,omitempty"`
+
+ // +optional
+ // PartitionTemplates contains available partition templates for this GPU (e.g., MIG profiles)
+ // Reported from discovery, each template has fixed resource allocation
+ PartitionTemplates []PartitionTemplate `json:"partitionTemplates,omitempty"`
+
+ // +optional
+ // AllocatedPartitions tracks allocated partitions on this GPU
+ // Key is partitionUUID, value contains template info and allocated resources
+ AllocatedPartitions map[string]AllocatedPartition `json:"allocatedPartitions,omitempty"`
 }

 // +kubebuilder:validation:Enum=tensor-fusion;nvidia-device-plugin
@@ -98,6 +108,36 @@ type PodGPUInfo struct {
  QoS QoSLevel `json:"qos,omitempty"`
 }

+ // PartitionTemplate represents a hardware partition template (e.g., MIG profile)
+ // Only stores template ID and name in GPU status. Detailed resource information
+ // is stored in public GPU info config.
+ type PartitionTemplate struct {
+ // TemplateID is the unique identifier for this partition template (e.g., "1g.24gb", "4g.94gb")
+ TemplateID string `json:"templateId"`
+
+ // Name is a human-readable name for this template
+ Name string `json:"name"`
+ }
+
+ // AllocatedPartition represents an allocated partition on a GPU
+ // Key in AllocatedPartitions map is podUID
+ type AllocatedPartition struct {
+ // TemplateID is the template used to create this partition
+ TemplateID string `json:"templateId"`
+
+ // PodUID is the UID of the pod using this partition (used as map key)
+ PodUID string `json:"podUid"`
+
+ // PodName is the name of the pod using this partition
+ PodName string `json:"podName"`
+
+ // Namespace is the namespace of the pod using this partition
+ Namespace string `json:"namespace"`
+
+ // AllocatedAt is when this partition was allocated
+ AllocatedAt metav1.Time `json:"allocatedAt"`
+ }
+
 // +kubebuilder:validation:Enum=Pending;Provisioning;Running;Unknown;Destroying;Migrating
 type TensorFusionGPUPhase string

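Taken together, PartitionTemplates (what the device can offer) and AllocatedPartitions (what is currently in use) give the allocator enough to reason about partition occupancy. Note the two comments disagree on the map key (partitionUUID in GPUStatus vs. podUID on AllocatedPartition); the PodUID field comment says it is the key. Below is a minimal sketch of recording an allocation against these fields; the helper name, the maxPartitions argument, and the pod-UID keying are assumptions for illustration, not code from this commit.

package gpuallocator

import (
    "fmt"

    tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" // assumed import path for the api/v1 types above
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// allocatePartition is a hypothetical helper: it checks that the requested template
// was reported by discovery, enforces a partition-count cap, and records the
// allocation in GPUStatus.AllocatedPartitions keyed by pod UID.
func allocatePartition(status *tfv1.GPUStatus, templateID string, maxPartitions int, podUID, podName, namespace string) error {
    if len(status.AllocatedPartitions) >= maxPartitions {
        return fmt.Errorf("no free partition slots on this GPU")
    }
    templateKnown := false
    for _, tpl := range status.PartitionTemplates {
        if tpl.TemplateID == templateID {
            templateKnown = true
            break
        }
    }
    if !templateKnown {
        return fmt.Errorf("partition template %q not reported by this GPU", templateID)
    }
    if status.AllocatedPartitions == nil {
        status.AllocatedPartitions = map[string]tfv1.AllocatedPartition{}
    }
    status.AllocatedPartitions[podUID] = tfv1.AllocatedPartition{
        TemplateID:  templateID,
        PodUID:      podUID,
        PodName:     podName,
        Namespace:   namespace,
        AllocatedAt: metav1.Now(),
    }
    return nil
}

Keeping only the template ID in status and the full resource profile in the GPU info config (see internal/config/gpu_info.go below) keeps the CRD status small while still letting the scheduler resolve capacity per template.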

api/v1/gpuresourcequota_types.go

Lines changed: 4 additions & 0 deletions
@@ -196,6 +196,10 @@ type AllocRequest struct {
  QoS QoSLevel

  Isolation IsolationModeType
+
+ // PartitionTemplateID is the template ID used for partitioned mode allocation
+ // This is set by the scheduler when a partition is matched, or read from pod annotation
+ PartitionTemplateID string
 }

 func (p *AllocRequest) Clone() fwk.StateData {
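
The field comment names two sources for this value. A minimal sketch of the annotation path, assuming AllocRequest lives in the api/v1 package and using the PartitionTemplateIDAnnotation constant added later in this commit; the helper itself is illustrative.

package gpuallocator

import (
    corev1 "k8s.io/api/core/v1"

    tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" // assumed import path
    "github.com/NexusGPU/tensor-fusion/internal/constants"
)

// applyPartitionTemplateFromPod is a hypothetical helper: if the pod already carries
// the partition annotation, copy it into the allocation request; otherwise the
// scheduler fills the field after it matches a template.
func applyPartitionTemplateFromPod(req *tfv1.AllocRequest, pod *corev1.Pod) {
    if id := pod.Annotations[constants.PartitionTemplateIDAnnotation]; id != "" {
        req.PartitionTemplateID = id
    }
}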

internal/autoscaler/autoscaler_test.go

Lines changed: 4 additions & 4 deletions
@@ -91,11 +91,11 @@ var _ = Describe("Autoscaler", func() {

  // create two workloads
  pool := tfEnv.GetGPUPool(0)
- // with two replias
+ // with two replicas
  workload0 := createWorkload(pool, 0, 2)
  workload0Workers := getWorkers(workload0)
  key0 := WorkloadID{workload0.Namespace, workload0.Name}
- // with one replia
+ // with one replica
  workload1 := createWorkload(pool, 1, 1)
  workload1Workers := getWorkers(workload1)
  key1 := WorkloadID{workload1.Namespace, workload1.Name}
@@ -539,8 +539,8 @@ func (f *FakeRecommender) Name() string {
  return "fake"
 }

- func (f *FakeRecommender) Recommend(ctx context.Context, workoad *workload.State) (*recommender.RecResult, error) {
- meta.SetStatusCondition(&workoad.Status.Conditions, metav1.Condition{
+ func (f *FakeRecommender) Recommend(ctx context.Context, workload *workload.State) (*recommender.RecResult, error) {
+ meta.SetStatusCondition(&workload.Status.Conditions, metav1.Condition{
  Type: constants.ConditionStatusTypeRecommendationProvided,
  Status: metav1.ConditionTrue,
  LastTransitionTime: metav1.Now(),

internal/cloudprovider/pricing/pricing.go

Lines changed: 4 additions & 0 deletions
@@ -31,6 +31,7 @@ import (
  "github.com/NexusGPU/tensor-fusion/internal/cloudprovider/types"
  "github.com/NexusGPU/tensor-fusion/internal/config"
  "github.com/NexusGPU/tensor-fusion/internal/constants"
+ "github.com/NexusGPU/tensor-fusion/internal/gpuallocator"
  "k8s.io/apimachinery/pkg/api/resource"
  "sigs.k8s.io/controller-runtime/pkg/log"
 )
@@ -104,6 +105,9 @@ func SetTflopsMapAndInitGPUPricingInfo(ctx context.Context, gpuInfos *[]config.G
  tflopsMap[gpuInfo.Model] = completeInfo
  }

+ // Load partition templates from config
+ gpuallocator.LoadPartitionTemplatesFromConfig(*gpuInfos)
+
  initOnce.Do(func() {
  globalAWSGPUInstanceData = make(map[string]GPUNodeInstanceInfoAndPrice)
  globalAzureGPUInstanceData = make(map[string]GPUNodeInstanceInfoAndPrice)
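
LoadPartitionTemplatesFromConfig itself is added elsewhere in this commit and is not shown in this excerpt; presumably it builds a per-model index of the templates carried in the GPU info config. A rough sketch of that idea, under the assumption of an in-memory map guarded for concurrent readers — the names below are illustrative stand-ins, not the real implementation.

package gpuallocator

import (
    "sync"

    "github.com/NexusGPU/tensor-fusion/internal/config"
)

var (
    partitionTemplatesMu sync.RWMutex
    // model name -> partition templates declared for that model
    partitionTemplatesByModel = map[string][]config.PartitionTemplateInfo{}
)

// indexPartitionTemplates is an illustrative stand-in for the real loader: it keeps
// only models that actually declare templates, so lookups during scheduling are cheap.
func indexPartitionTemplates(gpuInfos []config.GpuInfo) {
    partitionTemplatesMu.Lock()
    defer partitionTemplatesMu.Unlock()
    for _, info := range gpuInfos {
        if len(info.PartitionTemplates) > 0 {
            partitionTemplatesByModel[info.Model] = info.PartitionTemplates
        }
    }
}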

internal/component/component.go

Lines changed: 1 addition & 1 deletion
@@ -170,7 +170,7 @@ func calculateDesiredUpdatedDelta(total int, updatedSize int, batchPercentage in
  currentBatchIndex = newUpdateProgress / batchPercentage
  desiredSize = min((currentBatchIndex+1)*int32(batchSize), int32(total))
  delta = desiredSize - int32(updatedSize)
- // if rolling udpate policy changed or new nodes were added during update, we need to update progress
+ // if rolling update policy changed or new nodes were added during update, we need to update progress
  if delta < 0 {
  newUpdateProgress = min(newUpdateProgress+batchPercentage, 100)
  } else {

internal/config/gpu_info.go

Lines changed: 34 additions & 0 deletions
@@ -10,6 +10,40 @@ type GpuInfo struct {
  CostPerHour float64 `json:"costPerHour"`
  Fp16TFlops resource.Quantity `json:"fp16TFlops"`
  FullModelName string `json:"fullModelName"`
+
+ // PartitionTemplates contains available partition templates for this GPU (e.g., MIG profiles)
+ // Only applicable for GPUs that support hardware partitioning
+ PartitionTemplates []PartitionTemplateInfo `json:"partitionTemplates,omitempty"`
+
+ // MaxPartitions is the maximum number of partitions this GPU can support (e.g., 7 for MIG)
+ MaxPartitions uint32 `json:"maxPartitions,omitempty"`
+ }
+
+ // PartitionTemplateInfo contains detailed resource information for a partition template
+ type PartitionTemplateInfo struct {
+ // TemplateID is the unique identifier (e.g., "1g.24gb", "4g.94gb")
+ TemplateID string `json:"templateId"`
+
+ // Name is a human-readable name
+ Name string `json:"name"`
+
+ // MemoryBytes is the memory allocated to this partition in bytes
+ MemoryBytes uint64 `json:"memoryBytes"`
+
+ // ComputeUnits is the number of compute units (SMs) allocated
+ ComputeUnits uint64 `json:"computeUnits"`
+
+ // Tflops is the TFLOPS capacity of this partition
+ Tflops float64 `json:"tflops"`
+
+ // SliceCount is the number of slices (for MIG, this is the denominator, e.g., 7 for 1/7)
+ SliceCount uint32 `json:"sliceCount"`
+
+ // IsDefault indicates if this is a default template
+ IsDefault bool `json:"isDefault,omitempty"`
+
+ // Description provides additional information about this template
+ Description string `json:"description,omitempty"`
 }

 func MockGpuInfo() *[]GpuInfo {
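
For reference, a GpuInfo entry carrying partition templates could look like the sketch below. All numbers and model strings are placeholders to show the shape of the data, not real MIG specifications.

package example

import (
    "k8s.io/apimachinery/pkg/api/resource"

    "github.com/NexusGPU/tensor-fusion/internal/config"
)

// exampleGpuInfo shows how the new fields fit together; values are illustrative.
var exampleGpuInfo = config.GpuInfo{
    Model:         "ExampleGPU",
    FullModelName: "Example GPU 96GB",
    CostPerHour:   3.0,
    Fp16TFlops:    resource.MustParse("1000"),
    MaxPartitions: 7,
    PartitionTemplates: []config.PartitionTemplateInfo{
        {
            TemplateID:   "1g.24gb",
            Name:         "1/7 of the GPU, 24 GiB",
            MemoryBytes:  24 << 30, // 24 GiB
            ComputeUnits: 16,
            Tflops:       140,
            SliceCount:   7,
            IsDefault:    true,
        },
        {
            TemplateID:   "4g.94gb",
            Name:         "4/7 of the GPU, 94 GiB",
            MemoryBytes:  94 << 30, // 94 GiB
            ComputeUnits: 64,
            Tflops:       570,
            SliceCount:   7,
            Description:  "larger slice for training workloads",
        },
    },
}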

internal/constants/constants.go

Lines changed: 4 additions & 1 deletion
@@ -83,7 +83,10 @@ const (
  // GPUModelAnnotation specifies the required GPU model (e.g., "A100", "H100")
  GPUModelAnnotation = Domain + "/gpu-model"
  // GPU ID list is assigned by scheduler, should not specified by user
- GPUDeviceIDsAnnotation = Domain + "/gpu-ids"
+ GPUDeviceIDsAnnotation = Domain + "/gpu-ids"
+ // PartitionTemplateIDAnnotation is the partition UUID assigned to a pod in partitioned mode
+ // This is read by accelerator.c to mock slice GPU like MIG does
+ PartitionTemplateIDAnnotation = Domain + "/partition"
  DedicatedGPUAnnotation = Domain + "/dedicated-gpu"
  SetPendingOwnedWorkloadAnnotation = Domain + "/pending-owned-workload"
  PricingAnnotation = Domain + "/hourly-pricing"
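
The constant's comment describes the node-side consumer (accelerator.c); on the control-plane side something has to write the annotation after a partition is matched. A minimal sketch of stamping it onto the pod, assuming a mutating step in the scheduler or webhook — the package and helper names are illustrative.

package webhook

import (
    corev1 "k8s.io/api/core/v1"

    "github.com/NexusGPU/tensor-fusion/internal/constants"
)

// setPartitionAnnotation records the partition chosen for this pod so the node-side
// component can pick it up later; purely illustrative of how the constant is used.
func setPartitionAnnotation(pod *corev1.Pod, partitionID string) {
    if pod.Annotations == nil {
        pod.Annotations = map[string]string{}
    }
    pod.Annotations[constants.PartitionTemplateIDAnnotation] = partitionID
}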

internal/gpuallocator/filter/filter_test.go

Lines changed: 5 additions & 4 deletions
@@ -111,7 +111,7 @@ func TestFilters(t *testing.T) {
  filter := NewResourceFilter(tfv1.Resource{
  Tflops: resource.MustParse("8"),
  Vram: resource.MustParse("30Gi"),
- }, nil)
+ })
  result, err := filter.Filter(ctx, testPodKey, gpus)
  assert.NoError(t, err)
  assert.Len(t, result, 2)
@@ -126,7 +126,7 @@
  With(NewResourceFilter(tfv1.Resource{
  Tflops: resource.MustParse("8"),
  Vram: resource.MustParse("30Gi"),
- }, nil))
+ }))

  // Apply filters
  result, _, err := registry.Apply(ctx, testPodKey, gpus, false)
@@ -137,10 +137,11 @@

  t.Run("FilterRegistry with gpu indices filtering", func(t *testing.T) {
  registry := NewFilterRegistry().
+ With(NewGPUIndexFilter([]int32{2, 3})).
  With(NewResourceFilter(tfv1.Resource{
  Tflops: resource.MustParse("1"),
  Vram: resource.MustParse("1Gi"),
- }, []int32{2, 3}))
+ }))

  // Apply filters
  result, _, err := registry.Apply(ctx, testPodKey, gpus, false)
@@ -160,7 +161,7 @@
  With(NewResourceFilter(tfv1.Resource{
  Tflops: resource.MustParse("8"),
  Vram: resource.MustParse("30Gi"),
- }, nil))
+ }))

  // Apply base registry filters
  baseResult, _, err := baseRegistry.Apply(ctx, testPodKey, gpus, false)
