From c345426bbf46179d53794b29b226e83a16810e57 Mon Sep 17 00:00:00 2001 From: dylan Date: Mon, 25 Aug 2025 06:46:26 -0700 Subject: [PATCH 01/34] support cel filter --- api/v1/schedulingconfigtemplate_types.go | 32 ++ api/v1/zz_generated.deepcopy.go | 20 + ...r-fusion.ai_schedulingconfigtemplates.yaml | 35 ++ ...r-fusion.ai_schedulingconfigtemplates.yaml | 35 ++ config/samples/cel_filter_example.yaml | 74 ++++ docs/cel-filters.md | 264 +++++++++++++ go.mod | 2 +- internal/gpuallocator/cel_integration_test.go | 260 +++++++++++++ .../gpuallocator/filter/cel_filter/adapter.go | 39 ++ .../filter/cel_filter/cel_config.go | 90 +++++ .../filter/cel_filter/cel_config_test.go | 246 +++++++++++++ .../filter/cel_filter/cel_filter.go | 213 +++++++++++ .../filter/cel_filter/cel_filter_test.go | 347 ++++++++++++++++++ .../filter/cel_filter/constants.go | 44 +++ internal/gpuallocator/gpuallocator.go | 12 + 15 files changed, 1712 insertions(+), 1 deletion(-) create mode 100644 config/samples/cel_filter_example.yaml create mode 100644 docs/cel-filters.md create mode 100644 internal/gpuallocator/cel_integration_test.go create mode 100644 internal/gpuallocator/filter/cel_filter/adapter.go create mode 100644 internal/gpuallocator/filter/cel_filter/cel_config.go create mode 100644 internal/gpuallocator/filter/cel_filter/cel_config_test.go create mode 100644 internal/gpuallocator/filter/cel_filter/cel_filter.go create mode 100644 internal/gpuallocator/filter/cel_filter/cel_filter_test.go create mode 100644 internal/gpuallocator/filter/cel_filter/constants.go diff --git a/api/v1/schedulingconfigtemplate_types.go b/api/v1/schedulingconfigtemplate_types.go index 44f07bef..80ef55e6 100644 --- a/api/v1/schedulingconfigtemplate_types.go +++ b/api/v1/schedulingconfigtemplate_types.go @@ -51,6 +51,10 @@ type PlacementConfig struct { // +optional GPUFilters []GPUFilter `json:"gpuFilters,omitempty"` + + // CEL-based GPU filters for advanced filtering logic + // +optional + CELFilters []CELFilterConfig `json:"celFilters,omitempty"` } // +kubebuilder:validation:Enum=CompactFirst;LowLoadFirst @@ -85,6 +89,34 @@ type GPUFilter struct { Params runtime.RawExtension `json:"params,omitempty"` } +// CELFilterConfig defines the configuration for CEL-based filtering +// +// example: +// ```yaml +// - name: "avoid-overloaded-gpus" +// expression: "gpu.available.tflops > 0.5 && size(gpu.runningApps) < 3" +// priority: 100 +// - name: "prefer-specific-model" +// expression: "gpu.gpuModel.startsWith('NVIDIA') && gpu.labels.has('gpu-tier') && gpu.labels['gpu-tier'] == 'premium'" +// priority: 50 +// +// ``` +type CELFilterConfig struct { + // Name for this filter (for debugging/logging) + // +optional + Name string `json:"name,omitempty"` + + // CEL expression for filtering GPUs + // The expression should return a boolean value + // Available variables: gpu, workerPodKey, request + Expression string `json:"expression"` + + // Priority for this filter (higher priority filters run first) + // +kubebuilder:default=0 + // +optional + Priority int `json:"priority,omitempty"` +} + type AutoScalingConfig struct { // layer 1 vertical auto-scaling, turbo burst to existing GPU cards quickly // VPA-like, aggregate metrics data <1m diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go index 9be4f47c..27f8e8fd 100644 --- a/api/v1/zz_generated.deepcopy.go +++ b/api/v1/zz_generated.deepcopy.go @@ -184,6 +184,21 @@ func (in *AutoSetRequests) DeepCopy() *AutoSetRequests { return out } +// DeepCopyInto is an autogenerated deepcopy 
function, copying the receiver, writing into out. in must be non-nil. +func (in *CELFilterConfig) DeepCopyInto(out *CELFilterConfig) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CELFilterConfig. +func (in *CELFilterConfig) DeepCopy() *CELFilterConfig { + if in == nil { + return nil + } + out := new(CELFilterConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *CapacityConfig) DeepCopyInto(out *CapacityConfig) { *out = *in @@ -1681,6 +1696,11 @@ func (in *PlacementConfig) DeepCopyInto(out *PlacementConfig) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.CELFilters != nil { + in, out := &in.CELFilters, &out.CELFilters + *out = make([]CELFilterConfig, len(*in)) + copy(*out, *in) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PlacementConfig. diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml index 91a01eae..f7aeb8fa 100644 --- a/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml +++ b/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml @@ -203,6 +203,41 @@ spec: allowUsingLocalGPU: default: true type: boolean + celFilters: + description: CEL-based GPU filters for advanced filtering logic + items: + description: |- + CELFilterConfig defines the configuration for CEL-based filtering + + example: + ```yaml + - name: "avoid-overloaded-gpus" + expression: "gpu.available.tflops > 0.5 && size(gpu.runningApps) < 3" + priority: 100 + - name: "prefer-specific-model" + expression: "gpu.gpuModel.startsWith('NVIDIA') && gpu.labels.has('gpu-tier') && gpu.labels['gpu-tier'] == 'premium'" + priority: 50 + + ``` + properties: + expression: + description: |- + CEL expression for filtering GPUs + The expression should return a boolean value + Available variables: gpu, workerPodKey, request + type: string + name: + description: Name for this filter (for debugging/logging) + type: string + priority: + default: 0 + description: Priority for this filter (higher priority filters + run first) + type: integer + required: + - expression + type: object + type: array gpuFilters: items: description: "GPUFilter is to select eligible GPUs for scheduling.\n\nexample:\n```yaml\n- diff --git a/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml b/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml index 91a01eae..f7aeb8fa 100644 --- a/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml +++ b/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml @@ -203,6 +203,41 @@ spec: allowUsingLocalGPU: default: true type: boolean + celFilters: + description: CEL-based GPU filters for advanced filtering logic + items: + description: |- + CELFilterConfig defines the configuration for CEL-based filtering + + example: + ```yaml + - name: "avoid-overloaded-gpus" + expression: "gpu.available.tflops > 0.5 && size(gpu.runningApps) < 3" + priority: 100 + - name: "prefer-specific-model" + expression: "gpu.gpuModel.startsWith('NVIDIA') && gpu.labels.has('gpu-tier') && gpu.labels['gpu-tier'] == 'premium'" + priority: 50 + + ``` + properties: + expression: + description: |- + CEL expression for filtering GPUs + The expression should return a boolean value + Available variables: gpu, workerPodKey, request + type: string + 
name: + description: Name for this filter (for debugging/logging) + type: string + priority: + default: 0 + description: Priority for this filter (higher priority filters + run first) + type: integer + required: + - expression + type: object + type: array gpuFilters: items: description: "GPUFilter is to select eligible GPUs for scheduling.\n\nexample:\n```yaml\n- diff --git a/config/samples/cel_filter_example.yaml b/config/samples/cel_filter_example.yaml new file mode 100644 index 00000000..aaf4895e --- /dev/null +++ b/config/samples/cel_filter_example.yaml @@ -0,0 +1,74 @@ +apiVersion: tensor-fusion.ai/v1 +kind: SchedulingConfigTemplate +metadata: + name: cel-filter-example +spec: + placement: + mode: CompactFirst + allowUsingLocalGPU: true + + # Traditional GPU filters (still supported) + gpuFilters: + - type: avoidTooMuchConnectionsOnSameGPU + params: + connectionNum: 150 + + # CEL-based filters for advanced filtering logic + celFilters: + # High priority filter: only use running GPUs + - name: "only-running-gpus" + expression: "gpu.phase == 'Running'" + priority: 100 + + # Medium-high priority: ensure sufficient resources available + - name: "sufficient-resources" + expression: "gpu.available.tflops >= 0.5 && gpu.available.vram >= 4096000000" + priority: 90 + + # Medium priority: prefer premium tier GPUs + - name: "prefer-premium-gpus" + expression: "gpu.labels != null && 'gpu-tier' in gpu.labels && gpu.labels['gpu-tier'] == 'premium'" + priority: 80 + + # Lower priority: avoid overloaded GPUs + - name: "avoid-overloaded-gpus" + expression: "size(gpu.runningApps) < 3" + priority: 70 + + # GPU model specific filters + - name: "nvidia-only" + expression: "gpu.gpuModel.startsWith('NVIDIA')" + priority: 60 + + # Complex condition example + - name: "complex-filter" + expression: | + gpu.phase == 'Running' && + gpu.available.tflops > 0.3 && + ( + (gpu.labels != null && 'workload-type' in gpu.labels && gpu.labels['workload-type'] == 'training') || + (size(gpu.runningApps) == 0) + ) + priority: 50 + + # Optional: AutoScaling configuration + autoScaling: + autoSetLimits: + enable: true + targetResource: "all" + evaluationPeriod: "5m" + extraTFlopsBufferRatio: "0.1" + +--- +apiVersion: tensor-fusion.ai/v1 +kind: SchedulingConfigTemplate +metadata: + name: simple-cel-example +spec: + placement: + mode: LowLoadFirst + celFilters: + # Simple example: only use GPUs with more than 50% TFlops available + - name: "high-availability" + expression: "gpu.available.tflops > gpu.capacity.tflops * 0.5" + priority: 100 \ No newline at end of file diff --git a/docs/cel-filters.md b/docs/cel-filters.md new file mode 100644 index 00000000..590e1d90 --- /dev/null +++ b/docs/cel-filters.md @@ -0,0 +1,264 @@ +# CEL Filters for GPU Allocation + +CEL (Common Expression Language) filters provide a powerful and flexible way to define custom GPU filtering logic in TensorFusion. This feature allows you to write expressions that determine which GPUs are eligible for allocation based on various criteria. + +## Overview + +CEL filters are defined in the `SchedulingConfigTemplate` resource and are applied during the GPU allocation process. They work alongside traditional GPU filters and provide more sophisticated filtering capabilities. 
+ +## Configuration + +CEL filters are configured in the `placement.celFilters` field of a `SchedulingConfigTemplate`: + +```yaml +apiVersion: tensor-fusion.ai/v1 +kind: SchedulingConfigTemplate +metadata: + name: my-template +spec: + placement: + celFilters: + - name: "filter-name" + expression: "gpu.phase == 'Running'" + priority: 100 +``` + +### Fields + +- `name` (optional): A descriptive name for the filter, used for logging and debugging +- `expression` (required): The CEL expression that returns a boolean value +- `priority` (optional, default: 0): Higher priority filters are applied first + +## Available Variables + +CEL expressions have access to the following variables: + +### `gpu` Object + +The `gpu` variable contains information about the GPU being evaluated: + +```javascript +{ + "name": "gpu-1", // GPU name + "namespace": "default", // GPU namespace + "gpuModel": "NVIDIA A100", // GPU model + "uuid": "gpu-uuid", // GPU UUID + "phase": "Running", // GPU phase (Running, Pending, etc.) + "usedBy": "tensor-fusion", // Usage system + "labels": {...}, // Kubernetes labels + "annotations": {...}, // Kubernetes annotations + "capacity": { // Total GPU capacity + "tflops": 1.5, + "vram": 85899345920 // in bytes + }, + "available": { // Available GPU resources + "tflops": 1.0, + "vram": 64424509440 // in bytes + }, + "nodeSelector": {...}, // Node selector information + "runningApps": [ // Currently running applications + { + "name": "app-1", + "namespace": "default", + "count": 1 + } + ] +} +``` + +### `workerPodKey` Object + +Information about the requesting worker pod: + +```javascript +{ + "name": "worker-pod", + "namespace": "default" +} +``` + +## Expression Examples + +### Basic Filtering + +```yaml +# Only use running GPUs +- name: "running-only" + expression: "gpu.phase == 'Running'" + priority: 100 + +# Filter by GPU model +- name: "nvidia-only" + expression: "gpu.gpuModel.startsWith('NVIDIA')" + priority: 90 + +# Ensure minimum resources available +- name: "min-resources" + expression: "gpu.available.tflops >= 0.5 && gpu.available.vram >= 4294967296" + priority: 80 +``` + +### Label-Based Filtering + +```yaml +# Filter by labels +- name: "premium-tier" + expression: "gpu.labels != null && 'gpu-tier' in gpu.labels && gpu.labels['gpu-tier'] == 'premium'" + priority: 70 + +# Multiple label conditions +- name: "training-gpus" + expression: | + gpu.labels != null && + 'workload-type' in gpu.labels && + gpu.labels['workload-type'] == 'training' && + 'zone' in gpu.labels && + gpu.labels['zone'].startsWith('us-west') + priority: 60 +``` + +### Resource-Based Filtering + +```yaml +# Percentage of available resources +- name: "high-availability" + expression: "gpu.available.tflops > gpu.capacity.tflops * 0.7" + priority: 80 + +# Avoid overloaded GPUs +- name: "load-balancing" + expression: "size(gpu.runningApps) < 3" + priority: 50 + +# Memory-intensive workloads +- name: "high-memory" + expression: "gpu.available.vram > 34359738368" # > 32GB + priority: 60 +``` + +### Complex Conditions + +```yaml +# Complex multi-criteria filter +- name: "complex-filter" + expression: | + gpu.phase == 'Running' && + gpu.gpuModel.contains('A100') && + gpu.available.tflops > 0.8 && + ( + size(gpu.runningApps) == 0 || + (size(gpu.runningApps) < 2 && gpu.available.vram > 42949672960) + ) + priority: 90 +``` + +## CEL Language Features + +CEL supports many built-in functions and operators: + +### String Operations +- `startsWith()`, `endsWith()`, `contains()` +- String concatenation with `+` +- Regular 
expressions with `matches()` + +### Numeric Operations +- Standard arithmetic operators: `+`, `-`, `*`, `/`, `%` +- Comparison operators: `>`, `>=`, `<`, `<=`, `==`, `!=` + +### Logical Operations +- `&&` (and), `||` (or), `!` (not) + +### Collection Operations +- `size()` - get collection size +- `in` operator - check membership +- List/map access with `[]` + +### Conditional Expressions +- Ternary operator: `condition ? true_value : false_value` + +## Best Practices + +### Performance +1. **Order by Priority**: Place most restrictive filters first (highest priority) +2. **Avoid Complex Expressions**: Keep expressions simple for better performance +3. **Cache-Friendly**: Use consistent filter logic to benefit from any caching + +### Reliability +1. **Null Checks**: Always check for null values when accessing optional fields +2. **Fail-Safe Logic**: Design expressions to exclude GPUs on error rather than include them +3. **Test Thoroughly**: Test expressions with various GPU configurations + +### Maintainability +1. **Descriptive Names**: Use clear, descriptive names for filters +2. **Comments**: Add comments for complex expressions +3. **Modular Design**: Break complex logic into multiple simpler filters + +## Example Complete Configuration + +```yaml +apiVersion: tensor-fusion.ai/v1 +kind: SchedulingConfigTemplate +metadata: + name: production-gpu-scheduling +spec: + placement: + mode: CompactFirst + + # Traditional filters (still supported) + gpuFilters: + - type: avoidTooMuchConnectionsOnSameGPU + params: + connectionNum: 100 + + # CEL filters for advanced logic + celFilters: + # Critical filters (high priority) + - name: "operational-gpus-only" + expression: "gpu.phase == 'Running' && gpu.usedBy == 'tensor-fusion'" + priority: 100 + + - name: "sufficient-resources" + expression: "gpu.available.tflops >= 0.3 && gpu.available.vram >= 2147483648" + priority: 95 + + # Preference filters (medium priority) + - name: "prefer-nvidia" + expression: "gpu.gpuModel.startsWith('NVIDIA')" + priority: 80 + + - name: "balanced-load" + expression: "size(gpu.runningApps) < 2" + priority: 70 + + # Quality filters (lower priority) + - name: "premium-hardware" + expression: | + gpu.labels != null && + 'gpu-tier' in gpu.labels && + gpu.labels['gpu-tier'] in ['premium', 'high-performance'] + priority: 50 +``` + +## Troubleshooting + +### Common Issues + +1. **Expression Compilation Errors**: Check syntax and ensure all referenced fields exist +2. **Runtime Errors**: Add null checks for optional fields +3. **No GPUs Selected**: Verify that at least some GPUs meet all filter criteria +4. **Performance Issues**: Simplify complex expressions or reduce the number of filters + +### Debugging + +Enable debug logging to see detailed information about filter execution: + +```yaml +# In your logging configuration +logLevel: debug +``` + +Look for log entries containing "CEL filter applied" to see filtering results. + +## Migration from Traditional Filters + +CEL filters can be used alongside traditional GPU filters. They are applied after traditional filters in the filtering pipeline. You can gradually migrate complex traditional filters to CEL expressions for better maintainability. 
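+As a hedged illustration of such a migration (assuming that connection pressure on a GPU roughly tracks the number of entries in `gpu.runningApps`, since a per-GPU connection count is not exposed to CEL expressions in this patch), the traditional `avoidTooMuchConnectionsOnSameGPU` filter from the sample configuration could be approximated with a CEL filter:
+
+```yaml
+# Existing traditional filter (from the sample configuration)
+gpuFilters:
+  - type: avoidTooMuchConnectionsOnSameGPU
+    params:
+      connectionNum: 150
+
+# Approximate CEL replacement: the running-app count is used as a stand-in
+# for connection load, which the CEL gpu object does not expose directly.
+celFilters:
+  - name: "limit-connections-approx"
+    expression: "size(gpu.runningApps) < 3"
+    priority: 70
+```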
\ No newline at end of file diff --git a/go.mod b/go.mod index 9bf5280f..6fefa5d5 100644 --- a/go.mod +++ b/go.mod @@ -11,6 +11,7 @@ require ( github.com/awslabs/operatorpkg v0.0.0-20250721225858-4e7491c57aa5 github.com/gin-contrib/gzip v1.2.3 github.com/gin-gonic/gin v1.10.1 + github.com/google/cel-go v0.23.2 github.com/influxdata/line-protocol/v2 v2.2.1 github.com/lithammer/shortuuid/v4 v4.2.0 github.com/mitchellh/mapstructure v1.5.0 @@ -85,7 +86,6 @@ require ( github.com/gogo/protobuf v1.3.2 // indirect github.com/golang/protobuf v1.5.4 // indirect github.com/google/btree v1.1.3 // indirect - github.com/google/cel-go v0.23.2 // indirect github.com/google/gnostic-models v0.6.9 // indirect github.com/google/go-cmp v0.7.0 // indirect github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 // indirect diff --git a/internal/gpuallocator/cel_integration_test.go b/internal/gpuallocator/cel_integration_test.go new file mode 100644 index 00000000..7913c116 --- /dev/null +++ b/internal/gpuallocator/cel_integration_test.go @@ -0,0 +1,260 @@ +package gpuallocator + +import ( + "context" + "testing" + + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/gpuallocator/filter" + cel_filter "github.com/NexusGPU/tensor-fusion/internal/gpuallocator/filter/cel_filter" + "github.com/stretchr/testify/require" +) + +func TestGpuAllocator_CELFilters_Integration(t *testing.T) { + // Create test scheme + scheme := runtime.NewScheme() + err := tfv1.AddToScheme(scheme) + require.NoError(t, err) + + // Create test resources + schedulingTemplate := &tfv1.SchedulingConfigTemplate{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-template", + }, + Spec: tfv1.SchedulingConfigTemplateSpec{ + Placement: tfv1.PlacementConfig{ + Mode: tfv1.PlacementModeCompactFirst, + CELFilters: []tfv1.CELFilterConfig{ + { + Name: "running-gpus-only", + Expression: "gpu.phase == 'Running'", + Priority: 100, + }, + { + Name: "sufficient-tflops", + Expression: "gpu.available.tflops >= 0.5", + Priority: 90, + }, + { + Name: "nvidia-gpus-only", + Expression: "gpu.gpuModel.contains('NVIDIA')", + Priority: 80, + }, + }, + }, + }, + } + + pool := &tfv1.GPUPool{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pool", + }, + Spec: tfv1.GPUPoolSpec{ + SchedulingConfigTemplate: &schedulingTemplate.Name, + }, + } + + // Create test GPUs + gpus := []tfv1.GPU{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "gpu-1-pass-all", + }, + Status: tfv1.GPUStatus{ + Phase: tfv1.TensorFusionGPUPhaseRunning, + GPUModel: "NVIDIA A100", + Available: &tfv1.Resource{ + Tflops: resource.MustParse("1.0"), + Vram: resource.MustParse("60Gi"), + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "gpu-2-fail-phase", + }, + Status: tfv1.GPUStatus{ + Phase: tfv1.TensorFusionGPUPhasePending, + GPUModel: "NVIDIA A100", + Available: &tfv1.Resource{ + Tflops: resource.MustParse("1.0"), + Vram: resource.MustParse("60Gi"), + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "gpu-3-fail-tflops", + }, + Status: tfv1.GPUStatus{ + Phase: tfv1.TensorFusionGPUPhaseRunning, + GPUModel: "NVIDIA A100", + Available: &tfv1.Resource{ + Tflops: resource.MustParse("0.3"), + Vram: resource.MustParse("60Gi"), + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "gpu-4-fail-model", + }, + Status: tfv1.GPUStatus{ + Phase: tfv1.TensorFusionGPUPhaseRunning, + 
GPUModel: "AMD Radeon RX 7900 XTX", + Available: &tfv1.Resource{ + Tflops: resource.MustParse("1.0"), + Vram: resource.MustParse("24Gi"), + }, + }, + }, + } + + // Create fake client + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(schedulingTemplate, pool). + Build() + + // Test CEL filters using CELConfigManager + celConfigManager := cel_filter.NewCELConfigManager(fakeClient) + celFilters, err := celConfigManager.GetCELFiltersForPool(context.Background(), pool.Name) + require.NoError(t, err) + require.Len(t, celFilters, 3) + + // Test filtering with CEL filters + celFilterAdapters := cel_filter.CreateCELFilterAdapters(celFilters) + filterRegistry := filter.NewFilterRegistry().With(celFilterAdapters...) + + filteredGPUs, _, err := filterRegistry.Apply( + context.Background(), + tfv1.NameNamespace{Name: "test-pod", Namespace: "default"}, + gpus, + false, + ) + require.NoError(t, err) + + // Only gpu-1 should pass all filters + require.Len(t, filteredGPUs, 1) + require.Equal(t, "gpu-1-pass-all", filteredGPUs[0].Name) +} + +func TestGpuAllocator_CELFilters_ErrorHandling(t *testing.T) { + // Create test scheme + scheme := runtime.NewScheme() + err := tfv1.AddToScheme(scheme) + require.NoError(t, err) + + // Create scheduling template with invalid CEL expression + schedulingTemplate := &tfv1.SchedulingConfigTemplate{ + ObjectMeta: metav1.ObjectMeta{ + Name: "invalid-template", + }, + Spec: tfv1.SchedulingConfigTemplateSpec{ + Placement: tfv1.PlacementConfig{ + Mode: tfv1.PlacementModeCompactFirst, + CELFilters: []tfv1.CELFilterConfig{ + { + Name: "invalid-expression", + Expression: "gpu.phase ==", // Invalid syntax + Priority: 100, + }, + }, + }, + }, + } + + pool := &tfv1.GPUPool{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pool", + }, + Spec: tfv1.GPUPoolSpec{ + SchedulingConfigTemplate: &schedulingTemplate.Name, + }, + } + + // Create fake client + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(schedulingTemplate, pool). + Build() + + // Test that invalid CEL expression results in error + celConfigManager := cel_filter.NewCELConfigManager(fakeClient) + _, err = celConfigManager.GetCELFiltersForPool(context.Background(), pool.Name) + require.Error(t, err) + require.Contains(t, err.Error(), "create CEL filter") +} + +func TestGpuAllocator_CELFilters_Priority_Ordering(t *testing.T) { + // Create test scheme + scheme := runtime.NewScheme() + err := tfv1.AddToScheme(scheme) + require.NoError(t, err) + + // Create scheduling template with multiple CEL filters with different priorities + schedulingTemplate := &tfv1.SchedulingConfigTemplate{ + ObjectMeta: metav1.ObjectMeta{ + Name: "priority-template", + }, + Spec: tfv1.SchedulingConfigTemplateSpec{ + Placement: tfv1.PlacementConfig{ + Mode: tfv1.PlacementModeCompactFirst, + CELFilters: []tfv1.CELFilterConfig{ + { + Name: "low-priority", + Expression: "gpu.name.contains('gpu')", + Priority: 10, + }, + { + Name: "high-priority", + Expression: "gpu.phase == 'Running'", + Priority: 100, + }, + { + Name: "medium-priority", + Expression: "gpu.gpuModel.contains('NVIDIA')", + Priority: 50, + }, + }, + }, + }, + } + + pool := &tfv1.GPUPool{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pool", + }, + Spec: tfv1.GPUPoolSpec{ + SchedulingConfigTemplate: &schedulingTemplate.Name, + }, + } + + // Create fake client + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(schedulingTemplate, pool). 
+ Build() + + // Test that CEL filters are sorted by priority + celConfigManager := cel_filter.NewCELConfigManager(fakeClient) + celFilters, err := celConfigManager.GetCELFiltersForPool(context.Background(), pool.Name) + require.NoError(t, err) + require.Len(t, celFilters, 3) + + // Check that filters are ordered by priority (high to low) + // Note: We can't easily check the internal order without exposing more internals, + // but we can verify that all filters are created successfully + filterNames := make([]string, len(celFilters)) + for i, filter := range celFilters { + filterNames[i] = filter.Name() + } + + expectedFilters := []string{"high-priority", "medium-priority", "low-priority"} + require.ElementsMatch(t, expectedFilters, filterNames) +} diff --git a/internal/gpuallocator/filter/cel_filter/adapter.go b/internal/gpuallocator/filter/cel_filter/adapter.go new file mode 100644 index 00000000..2d3877f3 --- /dev/null +++ b/internal/gpuallocator/filter/cel_filter/adapter.go @@ -0,0 +1,39 @@ +package cel_filter + +import ( + "context" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/gpuallocator/filter" +) + +// CELFilterAdapter adapts CELFilter to implement filter.GPUFilter interface +type CELFilterAdapter struct { + celFilter *CELFilter +} + +// NewCELFilterAdapter creates a new adapter for CELFilter +func NewCELFilterAdapter(celFilter *CELFilter) filter.GPUFilter { + return &CELFilterAdapter{ + celFilter: celFilter, + } +} + +// Filter implements the filter.GPUFilter interface +func (a *CELFilterAdapter) Filter(ctx context.Context, workerPodKey tfv1.NameNamespace, gpus []tfv1.GPU) ([]tfv1.GPU, error) { + return a.celFilter.Filter(ctx, workerPodKey, gpus) +} + +// Name implements the filter.GPUFilter interface +func (a *CELFilterAdapter) Name() string { + return a.celFilter.Name() +} + +// CreateCELFilterAdapters creates filter.GPUFilter adapters from CELFilter instances +func CreateCELFilterAdapters(celFilters []*CELFilter) []filter.GPUFilter { + adapters := make([]filter.GPUFilter, len(celFilters)) + for i, celFilter := range celFilters { + adapters[i] = NewCELFilterAdapter(celFilter) + } + return adapters +} diff --git a/internal/gpuallocator/filter/cel_filter/cel_config.go b/internal/gpuallocator/filter/cel_filter/cel_config.go new file mode 100644 index 00000000..fc3a0f86 --- /dev/null +++ b/internal/gpuallocator/filter/cel_filter/cel_config.go @@ -0,0 +1,90 @@ +package cel_filter + +import ( + "context" + "fmt" + "sort" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// CELConfigManager handles CEL filter configuration retrieval and creation +type CELConfigManager struct { + client client.Client +} + +// NewCELConfigManager creates a new CEL configuration manager +func NewCELConfigManager(client client.Client) *CELConfigManager { + return &CELConfigManager{ + client: client, + } +} + +// GetCELFiltersForPool retrieves CEL filters from SchedulingConfigTemplate for a given pool +func (m *CELConfigManager) GetCELFiltersForPool(ctx context.Context, poolName string) ([]*CELFilter, error) { + // Get pool to find SchedulingConfigTemplate + pool := &tfv1.GPUPool{} + if err := m.client.Get(ctx, client.ObjectKey{Name: poolName}, pool); err != nil { + return nil, fmt.Errorf("get pool %s: %w", poolName, err) + } + + // If no SchedulingConfigTemplate is specified, return empty + if pool.Spec.SchedulingConfigTemplate == nil { + return nil, nil + } + + return 
m.GetCELFiltersFromTemplate(ctx, *pool.Spec.SchedulingConfigTemplate) +} + +// GetCELFiltersFromTemplate retrieves CEL filters directly from a SchedulingConfigTemplate +func (m *CELConfigManager) GetCELFiltersFromTemplate(ctx context.Context, templateName string) ([]*CELFilter, error) { + // Get the SchedulingConfigTemplate + schedulingConfigTemplate := &tfv1.SchedulingConfigTemplate{} + if err := m.client.Get(ctx, client.ObjectKey{Name: templateName}, schedulingConfigTemplate); err != nil { + return nil, fmt.Errorf("get scheduling config template %s: %w", templateName, err) + } + + return m.CreateCELFiltersFromConfig(schedulingConfigTemplate.Spec.Placement.CELFilters) +} + +// CreateCELFiltersFromConfig creates CEL filters from configuration slice +func (m *CELConfigManager) CreateCELFiltersFromConfig(celConfigs []tfv1.CELFilterConfig) ([]*CELFilter, error) { + if len(celConfigs) == 0 { + return nil, nil + } + + // Sort CEL configs by priority (higher priority first) + sortedConfigs := make([]tfv1.CELFilterConfig, len(celConfigs)) + copy(sortedConfigs, celConfigs) + sort.Slice(sortedConfigs, func(i, j int) bool { + return sortedConfigs[i].Priority > sortedConfigs[j].Priority + }) + + // Create CEL filters + var celFilters []*CELFilter + for _, config := range sortedConfigs { + celFilter, err := NewCELFilter(CELFilterConfig{ + Name: config.Name, + Expression: config.Expression, + Priority: config.Priority, + }) + if err != nil { + return nil, fmt.Errorf("create CEL filter %q: %w", config.Name, err) + } + celFilters = append(celFilters, celFilter) + } + + return celFilters, nil +} + +// ValidateCELConfig validates a CEL filter configuration +func (m *CELConfigManager) ValidateCELConfig(config tfv1.CELFilterConfig) error { + // Try to create the filter to validate the expression + _, err := NewCELFilter(CELFilterConfig{ + Name: config.Name, + Expression: config.Expression, + Priority: config.Priority, + }) + return err +} diff --git a/internal/gpuallocator/filter/cel_filter/cel_config_test.go b/internal/gpuallocator/filter/cel_filter/cel_config_test.go new file mode 100644 index 00000000..8e8b0ad5 --- /dev/null +++ b/internal/gpuallocator/filter/cel_filter/cel_config_test.go @@ -0,0 +1,246 @@ +package cel_filter + +import ( + "context" + "testing" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/stretchr/testify/require" +) + +// Test constants for CEL expressions (same as in cel_filter_test.go) +const ( + // Phase expressions + testExamplePhaseRunning = `gpu.phase == 'Running'` + + // Resource expressions + testExampleMinTFlops = `gpu.available.tflops >= 0.5` + testExampleSpecificModel = `gpu.gpuModel.contains('A100')` + + // Label expressions + testExampleNVIDIAOnly = `gpu.gpuModel.startsWith('NVIDIA')` + + // Complex expressions + testExampleComplex = `gpu.phase == 'Running' && gpu.available.tflops > 0.5 && size(gpu.runningApps) < 2` +) + +func TestCELConfigManager_GetCELFiltersForPool(t *testing.T) { + // Create test scheme + scheme := runtime.NewScheme() + err := tfv1.AddToScheme(scheme) + require.NoError(t, err) + + // Create test resources + schedulingTemplate := &tfv1.SchedulingConfigTemplate{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-template", + }, + Spec: tfv1.SchedulingConfigTemplateSpec{ + Placement: tfv1.PlacementConfig{ + CELFilters: []tfv1.CELFilterConfig{ + { + Name: "high-priority", + Expression: 
testExamplePhaseRunning, + Priority: 100, + }, + { + Name: "low-priority", + Expression: testExampleNVIDIAOnly, + Priority: 10, + }, + }, + }, + }, + } + + pool := &tfv1.GPUPool{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pool", + }, + Spec: tfv1.GPUPoolSpec{ + SchedulingConfigTemplate: &schedulingTemplate.Name, + }, + } + + // Create fake client + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(schedulingTemplate, pool). + Build() + + // Test CELConfigManager + manager := NewCELConfigManager(fakeClient) + celFilters, err := manager.GetCELFiltersForPool(context.Background(), pool.Name) + require.NoError(t, err) + require.Len(t, celFilters, 2) + + // Verify filters are sorted by priority (high to low) + filterNames := make([]string, len(celFilters)) + for i, filter := range celFilters { + filterNames[i] = filter.Name() + } + require.Equal(t, []string{"high-priority", "low-priority"}, filterNames) +} + +func TestCELConfigManager_GetCELFiltersFromTemplate(t *testing.T) { + // Create test scheme + scheme := runtime.NewScheme() + err := tfv1.AddToScheme(scheme) + require.NoError(t, err) + + // Create test template + schedulingTemplate := &tfv1.SchedulingConfigTemplate{ + ObjectMeta: metav1.ObjectMeta{ + Name: "direct-template", + }, + Spec: tfv1.SchedulingConfigTemplateSpec{ + Placement: tfv1.PlacementConfig{ + CELFilters: []tfv1.CELFilterConfig{ + { + Name: "simple-filter", + Expression: testExampleMinTFlops, + Priority: 50, + }, + }, + }, + }, + } + + // Create fake client + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(schedulingTemplate). + Build() + + // Test direct template access + manager := NewCELConfigManager(fakeClient) + celFilters, err := manager.GetCELFiltersFromTemplate(context.Background(), schedulingTemplate.Name) + require.NoError(t, err) + require.Len(t, celFilters, 1) + require.Equal(t, "simple-filter", celFilters[0].Name()) +} + +func TestCELConfigManager_CreateCELFiltersFromConfig(t *testing.T) { + manager := NewCELConfigManager(nil) // No client needed for this test + + celConfigs := []tfv1.CELFilterConfig{ + { + Name: "filter-3", + Expression: testExamplePhaseRunning, + Priority: 30, + }, + { + Name: "filter-1", + Expression: testExampleMinTFlops, + Priority: 100, + }, + { + Name: "filter-2", + Expression: testExampleSpecificModel, + Priority: 50, + }, + } + + celFilters, err := manager.CreateCELFiltersFromConfig(celConfigs) + require.NoError(t, err) + require.Len(t, celFilters, 3) + + // Verify priority ordering (high to low) + expectedOrder := []string{"filter-1", "filter-2", "filter-3"} + actualOrder := make([]string, len(celFilters)) + for i, filter := range celFilters { + actualOrder[i] = filter.Name() + } + require.Equal(t, expectedOrder, actualOrder) +} + +func TestCELConfigManager_ValidateCELConfig(t *testing.T) { + manager := NewCELConfigManager(nil) + + tests := []struct { + name string + config tfv1.CELFilterConfig + expectError bool + }{ + { + name: "valid config", + config: tfv1.CELFilterConfig{ + Name: "valid", + Expression: testExamplePhaseRunning, + Priority: 100, + }, + expectError: false, + }, + { + name: "invalid expression", + config: tfv1.CELFilterConfig{ + Name: "invalid", + Expression: "gpu.phase ==", // Invalid syntax + Priority: 100, + }, + expectError: true, + }, + { + name: "complex valid expression", + config: tfv1.CELFilterConfig{ + Name: "complex", + Expression: testExampleComplex, + Priority: 100, + }, + expectError: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, 
func(t *testing.T) { + err := manager.ValidateCELConfig(tt.config) + if tt.expectError { + require.Error(t, err) + } else { + require.NoError(t, err) + } + }) + } +} + +func TestCELConfigManager_NoTemplate(t *testing.T) { + // Create test scheme + scheme := runtime.NewScheme() + err := tfv1.AddToScheme(scheme) + require.NoError(t, err) + + // Create pool without SchedulingConfigTemplate + pool := &tfv1.GPUPool{ + ObjectMeta: metav1.ObjectMeta{ + Name: "no-template-pool", + }, + Spec: tfv1.GPUPoolSpec{ + SchedulingConfigTemplate: nil, // No template specified + }, + } + + // Create fake client + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(pool). + Build() + + // Test that no CEL filters are returned + manager := NewCELConfigManager(fakeClient) + celFilters, err := manager.GetCELFiltersForPool(context.Background(), pool.Name) + require.NoError(t, err) + require.Len(t, celFilters, 0) +} + +func TestCELConfigManager_EmptyConfig(t *testing.T) { + manager := NewCELConfigManager(nil) + + // Test empty config slice + celFilters, err := manager.CreateCELFiltersFromConfig([]tfv1.CELFilterConfig{}) + require.NoError(t, err) + require.Len(t, celFilters, 0) +} diff --git a/internal/gpuallocator/filter/cel_filter/cel_filter.go b/internal/gpuallocator/filter/cel_filter/cel_filter.go new file mode 100644 index 00000000..3165a5b6 --- /dev/null +++ b/internal/gpuallocator/filter/cel_filter/cel_filter.go @@ -0,0 +1,213 @@ +package cel_filter + +import ( + "context" + "fmt" + "sync" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/google/cel-go/cel" + "github.com/google/cel-go/common/types" + "sigs.k8s.io/controller-runtime/pkg/log" +) + +// CELFilterConfig defines the configuration for CEL-based filtering +type CELFilterConfig struct { + // CEL expression for filtering GPUs + Expression string `json:"expression"` + // Priority for this filter (higher priority filters run first) + Priority int `json:"priority"` + // Name for this filter (for debugging/logging) + Name string `json:"name"` +} + +// CELFilter implements GPU filtering using CEL expressions +type CELFilter struct { + name string + expression string + program cel.Program + env *cel.Env + mu sync.RWMutex +} + +// NewCELFilter creates a new CEL-based GPU filter +func NewCELFilter(config CELFilterConfig) (*CELFilter, error) { + env, err := createCELEnvironment() + if err != nil { + return nil, fmt.Errorf("failed to create CEL environment: %w", err) + } + + ast, issues := env.Compile(config.Expression) + if issues != nil && issues.Err() != nil { + return nil, fmt.Errorf("failed to compile CEL expression %q: %w", config.Expression, issues.Err()) + } + + program, err := env.Program(ast) + if err != nil { + return nil, fmt.Errorf("failed to create CEL program: %w", err) + } + + name := config.Name + if name == "" { + name = fmt.Sprintf("CELFilter-%d", config.Priority) + } + + return &CELFilter{ + name: name, + expression: config.Expression, + program: program, + env: env, + }, nil +} + +// Name returns the name of this filter +func (f *CELFilter) Name() string { + f.mu.RLock() + defer f.mu.RUnlock() + return f.name +} + +// Filter applies the CEL expression to filter GPUs +func (f *CELFilter) Filter(ctx context.Context, workerPodKey tfv1.NameNamespace, gpus []tfv1.GPU) ([]tfv1.GPU, error) { + log := log.FromContext(ctx) + if len(gpus) == 0 { + return gpus, nil + } + + f.mu.RLock() + program := f.program + expression := f.expression + f.mu.RUnlock() + + var filteredGPUs []tfv1.GPU + + for _, gpu := range gpus 
{ + // Create variables for CEL evaluation + vars := createCELVariables(gpu, workerPodKey) + + // Evaluate the CEL expression + result, _, err := program.Eval(vars) + if err != nil { + log.Error(err, "CEL expression evaluation failed", + "expression", expression, + "gpu", gpu.Name, + "workerPodKey", workerPodKey) + // On error, exclude the GPU (fail-safe) + continue + } + + // Convert result to boolean + if boolResult, ok := result.(types.Bool); ok { + if bool(boolResult) { + filteredGPUs = append(filteredGPUs, gpu) + } + } else { + log.Error(nil, "CEL expression did not return boolean", + "expression", expression, + "result", result, + "gpu", gpu.Name) + // On non-boolean result, exclude the GPU (fail-safe) + continue + } + } + + log.V(1).Info("CEL filter applied", + "filter", f.name, + "expression", expression, + "inputGPUs", len(gpus), + "outputGPUs", len(filteredGPUs)) + + return filteredGPUs, nil +} + +// UpdateExpression updates the CEL expression (thread-safe) +func (f *CELFilter) UpdateExpression(newExpression string) error { + f.mu.Lock() + defer f.mu.Unlock() + + ast, issues := f.env.Compile(newExpression) + if issues != nil && issues.Err() != nil { + return fmt.Errorf("failed to compile new CEL expression %q: %w", newExpression, issues.Err()) + } + + program, err := f.env.Program(ast) + if err != nil { + return fmt.Errorf("failed to create new CEL program: %w", err) + } + + f.expression = newExpression + f.program = program + return nil +} + +// createCELEnvironment creates a CEL environment with GPU-related variables and functions +func createCELEnvironment() (*cel.Env, error) { + return cel.NewEnv( + // Define GPU object structure + cel.Variable(CELVarGPU, cel.MapType(cel.StringType, cel.DynType)), + // Define worker pod key + cel.Variable(CELVarWorkerPodKey, cel.MapType(cel.StringType, cel.StringType)), + // Define request information (if needed in future) + cel.Variable(CELVarRequest, cel.MapType(cel.StringType, cel.DynType)), + ) +} + +// createCELVariables creates variables for CEL evaluation from GPU and request information +func createCELVariables(gpu tfv1.GPU, workerPodKey tfv1.NameNamespace) map[string]interface{} { + // Convert GPU to a map for CEL evaluation + gpuMap := map[string]interface{}{ + GPUFieldName: gpu.Name, + GPUFieldNamespace: gpu.Namespace, + GPUFieldGPUModel: gpu.Status.GPUModel, + GPUFieldUUID: gpu.Status.UUID, + GPUFieldPhase: string(gpu.Status.Phase), + GPUFieldUsedBy: string(gpu.Status.UsedBy), + GPUFieldMessage: gpu.Status.Message, + GPUFieldLabels: gpu.Labels, + GPUFieldAnnotations: gpu.Annotations, + } + + // Add capacity information if available + if gpu.Status.Capacity != nil { + gpuMap[GPUFieldCapacity] = map[string]interface{}{ + ResourceFieldTFlops: gpu.Status.Capacity.Tflops.AsApproximateFloat64(), + ResourceFieldVRAM: gpu.Status.Capacity.Vram.AsApproximateFloat64(), + } + } + + // Add available information if available + if gpu.Status.Available != nil { + gpuMap[GPUFieldAvailable] = map[string]interface{}{ + ResourceFieldTFlops: gpu.Status.Available.Tflops.AsApproximateFloat64(), + ResourceFieldVRAM: gpu.Status.Available.Vram.AsApproximateFloat64(), + } + } + + // Add node selector information + if gpu.Status.NodeSelector != nil { + gpuMap[GPUFieldNodeSelector] = gpu.Status.NodeSelector + } + + // Add running apps information (always set, even if empty) + runningApps := make([]map[string]interface{}, len(gpu.Status.RunningApps)) + for i, app := range gpu.Status.RunningApps { + runningApps[i] = map[string]interface{}{ + AppFieldName: 
app.Name, + AppFieldNamespace: app.Namespace, + AppFieldCount: app.Count, + } + } + gpuMap[GPUFieldRunningApps] = runningApps + + // Worker pod key information + workerPodKeyMap := map[string]string{ + PodKeyFieldName: workerPodKey.Name, + PodKeyFieldNamespace: workerPodKey.Namespace, + } + + return map[string]interface{}{ + CELVarGPU: gpuMap, + CELVarWorkerPodKey: workerPodKeyMap, + CELVarRequest: map[string]interface{}{}, // Placeholder for future request info + } +} diff --git a/internal/gpuallocator/filter/cel_filter/cel_filter_test.go b/internal/gpuallocator/filter/cel_filter/cel_filter_test.go new file mode 100644 index 00000000..ba2cc539 --- /dev/null +++ b/internal/gpuallocator/filter/cel_filter/cel_filter_test.go @@ -0,0 +1,347 @@ +package cel_filter + +import ( + "context" + "testing" + + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/stretchr/testify/require" +) + +// Test constants for CEL expressions +const ( + // Phase expressions + ExamplePhaseRunning = `gpu.phase == 'Running'` + ExamplePhasePending = `gpu.phase == 'Pending'` + + // Resource expressions + ExampleMinTFlops = `gpu.available.tflops >= 0.5` + ExampleMinVRAM = `gpu.available.vram >= 4294967296` // 4GB in bytes + ExampleResourceRatio = `gpu.available.tflops > gpu.capacity.tflops * 0.5` + + // Model expressions + ExampleNVIDIAOnly = `gpu.gpuModel.startsWith('NVIDIA')` + ExampleSpecificModel = `gpu.gpuModel.contains('A100')` + + // Label expressions + ExampleHasLabel = `'gpu-tier' in gpu.labels` + ExampleLabelValue = `gpu.labels != null && 'gpu-tier' in gpu.labels && gpu.labels['gpu-tier'] == 'premium'` + + // Load balancing expressions + ExampleLowLoad = `size(gpu.runningApps) < 3` + ExampleNoApps = `size(gpu.runningApps) == 0` + + // Complex expressions + ExampleComplex = `gpu.phase == 'Running' && gpu.available.tflops > 0.5 && size(gpu.runningApps) < 2` +) + +func TestNewCELFilter(t *testing.T) { + tests := []struct { + name string + config CELFilterConfig + expectError bool + }{ + { + name: "valid basic expression", + config: CELFilterConfig{ + Name: "basic-test", + Expression: ExamplePhaseRunning, + Priority: 100, + }, + expectError: false, + }, + { + name: "valid resource expression", + config: CELFilterConfig{ + Name: "resource-test", + Expression: ExampleMinTFlops, + Priority: 50, + }, + expectError: false, + }, + { + name: "invalid expression syntax", + config: CELFilterConfig{ + Name: "invalid-test", + Expression: "gpu.phase ==", // Invalid syntax + Priority: 10, + }, + expectError: true, + }, + { + name: "expression with labels", + config: CELFilterConfig{ + Name: "label-test", + Expression: ExampleHasLabel, + Priority: 75, + }, + expectError: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + filter, err := NewCELFilter(tt.config) + if tt.expectError { + require.Error(t, err) + require.Nil(t, filter) + } else { + require.NoError(t, err) + require.NotNil(t, filter) + require.Equal(t, tt.config.Name, filter.Name()) + } + }) + } +} + +func TestCELFilter_Filter(t *testing.T) { + // Create test GPUs + gpus := []tfv1.GPU{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "gpu-1", + Namespace: "default", + Labels: map[string]string{ + "gpu-tier": "premium", + }, + }, + Status: tfv1.GPUStatus{ + Phase: tfv1.TensorFusionGPUPhaseRunning, + GPUModel: "NVIDIA A100", + UUID: "gpu-1-uuid", + Capacity: &tfv1.Resource{ + Tflops: resource.MustParse("1.5"), + Vram: 
resource.MustParse("80Gi"), + }, + Available: &tfv1.Resource{ + Tflops: resource.MustParse("1.0"), + Vram: resource.MustParse("60Gi"), + }, + RunningApps: []*tfv1.RunningAppDetail{ + { + Name: "app-1", + Namespace: "default", + Count: 1, + }, + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "gpu-2", + Namespace: "default", + Labels: map[string]string{ + "gpu-tier": "basic", + }, + }, + Status: tfv1.GPUStatus{ + Phase: tfv1.TensorFusionGPUPhaseRunning, + GPUModel: "NVIDIA RTX 4090", + UUID: "gpu-2-uuid", + Capacity: &tfv1.Resource{ + Tflops: resource.MustParse("0.8"), + Vram: resource.MustParse("24Gi"), + }, + Available: &tfv1.Resource{ + Tflops: resource.MustParse("0.2"), + Vram: resource.MustParse("8Gi"), + }, + RunningApps: []*tfv1.RunningAppDetail{ + { + Name: "app-2", + Namespace: "default", + Count: 1, + }, + { + Name: "app-3", + Namespace: "default", + Count: 2, + }, + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "gpu-3", + Namespace: "default", + }, + Status: tfv1.GPUStatus{ + Phase: tfv1.TensorFusionGPUPhasePending, + GPUModel: "NVIDIA A100", + UUID: "gpu-3-uuid", + Capacity: &tfv1.Resource{ + Tflops: resource.MustParse("1.5"), + Vram: resource.MustParse("80Gi"), + }, + Available: &tfv1.Resource{ + Tflops: resource.MustParse("1.5"), + Vram: resource.MustParse("80Gi"), + }, + }, + }, + } + + workerPodKey := tfv1.NameNamespace{ + Name: "test-pod", + Namespace: "default", + } + + tests := []struct { + name string + expression string + expectedGPUs []string // GPU names that should pass the filter + expectError bool + }{ + { + name: "filter by phase", + expression: ExamplePhaseRunning, + expectedGPUs: []string{"gpu-1", "gpu-2"}, + }, + { + name: "filter by available resources", + expression: ExampleMinTFlops, + expectedGPUs: []string{"gpu-1", "gpu-3"}, + }, + { + name: "filter by GPU model", + expression: "gpu.gpuModel.startsWith('NVIDIA A100')", + expectedGPUs: []string{"gpu-1", "gpu-3"}, + }, + { + name: "filter by labels", + expression: ExampleLabelValue, + expectedGPUs: []string{"gpu-1"}, + }, + { + name: "filter by running apps count", + expression: ExampleLowLoad, + expectedGPUs: []string{"gpu-1", "gpu-2", "gpu-3"}, + }, + { + name: "complex filter", + expression: ExampleComplex, + expectedGPUs: []string{"gpu-1"}, + }, + { + name: "filter none", + expression: "false", + expectedGPUs: []string{}, + }, + { + name: "filter all", + expression: "true", + expectedGPUs: []string{"gpu-1", "gpu-2", "gpu-3"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + filter, err := NewCELFilter(CELFilterConfig{ + Name: tt.name, + Expression: tt.expression, + Priority: 100, + }) + require.NoError(t, err) + + filteredGPUs, err := filter.Filter(context.Background(), workerPodKey, gpus) + if tt.expectError { + require.Error(t, err) + return + } + + require.NoError(t, err) + require.Len(t, filteredGPUs, len(tt.expectedGPUs)) + + // Check that the correct GPUs were filtered + actualNames := make([]string, len(filteredGPUs)) + for i, gpu := range filteredGPUs { + actualNames[i] = gpu.Name + } + + require.ElementsMatch(t, tt.expectedGPUs, actualNames) + }) + } +} + +func TestCELFilter_UpdateExpression(t *testing.T) { + // Create initial filter + filter, err := NewCELFilter(CELFilterConfig{ + Name: "update-test", + Expression: ExamplePhaseRunning, + Priority: 100, + }) + require.NoError(t, err) + + // Test valid update + err = filter.UpdateExpression(ExamplePhasePending) + require.NoError(t, err) + + // Test invalid update + err = 
filter.UpdateExpression("gpu.phase ==") + require.Error(t, err) +} + +func TestCELFilter_ThreadSafety(t *testing.T) { + filter, err := NewCELFilter(CELFilterConfig{ + Name: "thread-safety-test", + Expression: ExamplePhaseRunning, + Priority: 100, + }) + require.NoError(t, err) + + // Create test GPU + gpu := tfv1.GPU{ + ObjectMeta: metav1.ObjectMeta{ + Name: "gpu-1", + Namespace: "default", + }, + Status: tfv1.GPUStatus{ + Phase: tfv1.TensorFusionGPUPhaseRunning, + }, + } + + workerPodKey := tfv1.NameNamespace{ + Name: "test-pod", + Namespace: "default", + } + + // Run concurrent operations + done := make(chan bool, 3) + + // Concurrent filtering + go func() { + defer func() { done <- true }() + for i := 0; i < 100; i++ { + _, err := filter.Filter(context.Background(), workerPodKey, []tfv1.GPU{gpu}) + require.NoError(t, err) + } + }() + + // Concurrent name access + go func() { + defer func() { done <- true }() + for i := 0; i < 100; i++ { + name := filter.Name() + require.Equal(t, "thread-safety-test", name) + } + }() + + // Concurrent expression updates + go func() { + defer func() { done <- true }() + for i := 0; i < 10; i++ { + err := filter.UpdateExpression(ExamplePhasePending) + require.NoError(t, err) + err = filter.UpdateExpression(ExamplePhaseRunning) + require.NoError(t, err) + } + }() + + // Wait for all goroutines to complete + for i := 0; i < 3; i++ { + <-done + } +} diff --git a/internal/gpuallocator/filter/cel_filter/constants.go b/internal/gpuallocator/filter/cel_filter/constants.go new file mode 100644 index 00000000..152f643f --- /dev/null +++ b/internal/gpuallocator/filter/cel_filter/constants.go @@ -0,0 +1,44 @@ +package cel_filter + +// CEL variable names available in expressions +const ( + // Root variables + CELVarGPU = "gpu" + CELVarWorkerPodKey = "workerPodKey" + CELVarRequest = "request" +) + +// GPU object field names +const ( + // Basic GPU metadata + GPUFieldName = "name" + GPUFieldNamespace = "namespace" + GPUFieldGPUModel = "gpuModel" + GPUFieldUUID = "uuid" + GPUFieldPhase = "phase" + GPUFieldUsedBy = "usedBy" + GPUFieldMessage = "message" + + // Kubernetes metadata + GPUFieldLabels = "labels" + GPUFieldAnnotations = "annotations" + + // Resource information + GPUFieldCapacity = "capacity" + GPUFieldAvailable = "available" + GPUFieldNodeSelector = "nodeSelector" + GPUFieldRunningApps = "runningApps" + + // Resource sub-fields + ResourceFieldTFlops = "tflops" + ResourceFieldVRAM = "vram" + + // Running app sub-fields + AppFieldName = "name" + AppFieldNamespace = "namespace" + AppFieldCount = "count" + + // WorkerPodKey fields + PodKeyFieldName = "name" + PodKeyFieldNamespace = "namespace" +) diff --git a/internal/gpuallocator/gpuallocator.go b/internal/gpuallocator/gpuallocator.go index b4fbbc2a..e1d87bae 100644 --- a/internal/gpuallocator/gpuallocator.go +++ b/internal/gpuallocator/gpuallocator.go @@ -18,6 +18,7 @@ import ( "github.com/NexusGPU/tensor-fusion/internal/config" "github.com/NexusGPU/tensor-fusion/internal/constants" "github.com/NexusGPU/tensor-fusion/internal/gpuallocator/filter" + cel_filter "github.com/NexusGPU/tensor-fusion/internal/gpuallocator/filter/cel_filter" "github.com/NexusGPU/tensor-fusion/internal/metrics" "github.com/NexusGPU/tensor-fusion/internal/quota" "github.com/NexusGPU/tensor-fusion/internal/utils" @@ -173,6 +174,17 @@ func (s *GpuAllocator) Filter( filterRegistry = filterRegistry.With(filter.NewNodeAffinityFilter(s.Client, req.NodeAffinity)) } + // Add CEL filters from SchedulingConfigTemplate if available + 
celConfigManager := cel_filter.NewCELConfigManager(s.Client) + celFilters, err := celConfigManager.GetCELFiltersForPool(s.ctx, req.PoolName) + if err != nil { + return nil, nil, fmt.Errorf("get CEL filters: %w", err) + } + if len(celFilters) > 0 { + celFilterAdapters := cel_filter.CreateCELFilterAdapters(celFilters) + filterRegistry = filterRegistry.With(celFilterAdapters...) + } + // Apply the filters in sequence filteredGPUs, filterDetails, err := filterRegistry.Apply(s.ctx, req.WorkloadNameNamespace, toFilterGPUs, isSimulateSchedule) if err != nil { From 7be8e25398ffd388bbadb8fb836ef982325e4421 Mon Sep 17 00:00:00 2001 From: dylan Date: Sat, 30 Aug 2025 08:36:39 -0700 Subject: [PATCH 02/34] covert allocator request to cel filter --- api/v1/schedulingconfigtemplate_types.go | 32 -- api/v1/zz_generated.deepcopy.go | 20 - ...r-fusion.ai_schedulingconfigtemplates.yaml | 35 -- ...r-fusion.ai_schedulingconfigtemplates.yaml | 35 -- internal/config/global_config.go | 4 + internal/gpuallocator/cel_integration_test.go | 260 ------------- .../gpuallocator/filter/cel_filter/adapter.go | 39 -- .../filter/cel_filter/alloc_request_filter.go | 164 +++++++++ .../filter/cel_filter/cel_config.go | 90 ----- .../filter/cel_filter/cel_config_test.go | 246 ------------- .../filter/cel_filter/cel_filter.go | 46 +-- .../filter/cel_filter/cel_filter_test.go | 347 ------------------ .../filter/cel_filter/constants.go | 9 + .../filter/cel_filter/expression_cache.go | 191 ++++++++++ internal/gpuallocator/gpuallocator.go | 75 +++- 15 files changed, 434 insertions(+), 1159 deletions(-) delete mode 100644 internal/gpuallocator/cel_integration_test.go delete mode 100644 internal/gpuallocator/filter/cel_filter/adapter.go create mode 100644 internal/gpuallocator/filter/cel_filter/alloc_request_filter.go delete mode 100644 internal/gpuallocator/filter/cel_filter/cel_config.go delete mode 100644 internal/gpuallocator/filter/cel_filter/cel_config_test.go delete mode 100644 internal/gpuallocator/filter/cel_filter/cel_filter_test.go create mode 100644 internal/gpuallocator/filter/cel_filter/expression_cache.go diff --git a/api/v1/schedulingconfigtemplate_types.go b/api/v1/schedulingconfigtemplate_types.go index 80ef55e6..44f07bef 100644 --- a/api/v1/schedulingconfigtemplate_types.go +++ b/api/v1/schedulingconfigtemplate_types.go @@ -51,10 +51,6 @@ type PlacementConfig struct { // +optional GPUFilters []GPUFilter `json:"gpuFilters,omitempty"` - - // CEL-based GPU filters for advanced filtering logic - // +optional - CELFilters []CELFilterConfig `json:"celFilters,omitempty"` } // +kubebuilder:validation:Enum=CompactFirst;LowLoadFirst @@ -89,34 +85,6 @@ type GPUFilter struct { Params runtime.RawExtension `json:"params,omitempty"` } -// CELFilterConfig defines the configuration for CEL-based filtering -// -// example: -// ```yaml -// - name: "avoid-overloaded-gpus" -// expression: "gpu.available.tflops > 0.5 && size(gpu.runningApps) < 3" -// priority: 100 -// - name: "prefer-specific-model" -// expression: "gpu.gpuModel.startsWith('NVIDIA') && gpu.labels.has('gpu-tier') && gpu.labels['gpu-tier'] == 'premium'" -// priority: 50 -// -// ``` -type CELFilterConfig struct { - // Name for this filter (for debugging/logging) - // +optional - Name string `json:"name,omitempty"` - - // CEL expression for filtering GPUs - // The expression should return a boolean value - // Available variables: gpu, workerPodKey, request - Expression string `json:"expression"` - - // Priority for this filter (higher priority filters run first) - // 
+kubebuilder:default=0 - // +optional - Priority int `json:"priority,omitempty"` -} - type AutoScalingConfig struct { // layer 1 vertical auto-scaling, turbo burst to existing GPU cards quickly // VPA-like, aggregate metrics data <1m diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go index 27f8e8fd..9be4f47c 100644 --- a/api/v1/zz_generated.deepcopy.go +++ b/api/v1/zz_generated.deepcopy.go @@ -184,21 +184,6 @@ func (in *AutoSetRequests) DeepCopy() *AutoSetRequests { return out } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *CELFilterConfig) DeepCopyInto(out *CELFilterConfig) { - *out = *in -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CELFilterConfig. -func (in *CELFilterConfig) DeepCopy() *CELFilterConfig { - if in == nil { - return nil - } - out := new(CELFilterConfig) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *CapacityConfig) DeepCopyInto(out *CapacityConfig) { *out = *in @@ -1696,11 +1681,6 @@ func (in *PlacementConfig) DeepCopyInto(out *PlacementConfig) { (*in)[i].DeepCopyInto(&(*out)[i]) } } - if in.CELFilters != nil { - in, out := &in.CELFilters, &out.CELFilters - *out = make([]CELFilterConfig, len(*in)) - copy(*out, *in) - } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PlacementConfig. diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml index f7aeb8fa..91a01eae 100644 --- a/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml +++ b/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml @@ -203,41 +203,6 @@ spec: allowUsingLocalGPU: default: true type: boolean - celFilters: - description: CEL-based GPU filters for advanced filtering logic - items: - description: |- - CELFilterConfig defines the configuration for CEL-based filtering - - example: - ```yaml - - name: "avoid-overloaded-gpus" - expression: "gpu.available.tflops > 0.5 && size(gpu.runningApps) < 3" - priority: 100 - - name: "prefer-specific-model" - expression: "gpu.gpuModel.startsWith('NVIDIA') && gpu.labels.has('gpu-tier') && gpu.labels['gpu-tier'] == 'premium'" - priority: 50 - - ``` - properties: - expression: - description: |- - CEL expression for filtering GPUs - The expression should return a boolean value - Available variables: gpu, workerPodKey, request - type: string - name: - description: Name for this filter (for debugging/logging) - type: string - priority: - default: 0 - description: Priority for this filter (higher priority filters - run first) - type: integer - required: - - expression - type: object - type: array gpuFilters: items: description: "GPUFilter is to select eligible GPUs for scheduling.\n\nexample:\n```yaml\n- diff --git a/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml b/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml index f7aeb8fa..91a01eae 100644 --- a/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml +++ b/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml @@ -203,41 +203,6 @@ spec: allowUsingLocalGPU: default: true type: boolean - celFilters: - description: CEL-based GPU filters for advanced filtering logic - items: - description: |- - CELFilterConfig defines 
the configuration for CEL-based filtering - - example: - ```yaml - - name: "avoid-overloaded-gpus" - expression: "gpu.available.tflops > 0.5 && size(gpu.runningApps) < 3" - priority: 100 - - name: "prefer-specific-model" - expression: "gpu.gpuModel.startsWith('NVIDIA') && gpu.labels.has('gpu-tier') && gpu.labels['gpu-tier'] == 'premium'" - priority: 50 - - ``` - properties: - expression: - description: |- - CEL expression for filtering GPUs - The expression should return a boolean value - Available variables: gpu, workerPodKey, request - type: string - name: - description: Name for this filter (for debugging/logging) - type: string - priority: - default: 0 - description: Priority for this filter (higher priority filters - run first) - type: integer - required: - - expression - type: object - type: array gpuFilters: items: description: "GPUFilter is to select eligible GPUs for scheduling.\n\nexample:\n```yaml\n- diff --git a/internal/config/global_config.go b/internal/config/global_config.go index 75bddc22..0632c284 100644 --- a/internal/config/global_config.go +++ b/internal/config/global_config.go @@ -8,6 +8,9 @@ type GlobalConfig struct { MetricsExtraPodLabels map[string]string `yaml:"metricsExtraPodLabels"` AlertRules []AlertRule `yaml:"alertRules"` + + // EnableCELFilter enables CEL-based filtering (default: false for rollback support) + EnableCELFilter bool `yaml:"enableCELFilter"` } var globalConfig *GlobalConfig @@ -41,6 +44,7 @@ func MockGlobalConfig() *GlobalConfig { MetricsTTL: "30d", MetricsFormat: "influx", MetricsExtraPodLabels: map[string]string{"kubernetes.io/app": "app"}, + EnableCELFilter: false, // Default to legacy filter for rollback support AlertRules: []AlertRule{ { Name: "mock", diff --git a/internal/gpuallocator/cel_integration_test.go b/internal/gpuallocator/cel_integration_test.go deleted file mode 100644 index 7913c116..00000000 --- a/internal/gpuallocator/cel_integration_test.go +++ /dev/null @@ -1,260 +0,0 @@ -package gpuallocator - -import ( - "context" - "testing" - - "k8s.io/apimachinery/pkg/api/resource" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "sigs.k8s.io/controller-runtime/pkg/client/fake" - - tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" - "github.com/NexusGPU/tensor-fusion/internal/gpuallocator/filter" - cel_filter "github.com/NexusGPU/tensor-fusion/internal/gpuallocator/filter/cel_filter" - "github.com/stretchr/testify/require" -) - -func TestGpuAllocator_CELFilters_Integration(t *testing.T) { - // Create test scheme - scheme := runtime.NewScheme() - err := tfv1.AddToScheme(scheme) - require.NoError(t, err) - - // Create test resources - schedulingTemplate := &tfv1.SchedulingConfigTemplate{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-template", - }, - Spec: tfv1.SchedulingConfigTemplateSpec{ - Placement: tfv1.PlacementConfig{ - Mode: tfv1.PlacementModeCompactFirst, - CELFilters: []tfv1.CELFilterConfig{ - { - Name: "running-gpus-only", - Expression: "gpu.phase == 'Running'", - Priority: 100, - }, - { - Name: "sufficient-tflops", - Expression: "gpu.available.tflops >= 0.5", - Priority: 90, - }, - { - Name: "nvidia-gpus-only", - Expression: "gpu.gpuModel.contains('NVIDIA')", - Priority: 80, - }, - }, - }, - }, - } - - pool := &tfv1.GPUPool{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pool", - }, - Spec: tfv1.GPUPoolSpec{ - SchedulingConfigTemplate: &schedulingTemplate.Name, - }, - } - - // Create test GPUs - gpus := []tfv1.GPU{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "gpu-1-pass-all", - }, - 
Status: tfv1.GPUStatus{ - Phase: tfv1.TensorFusionGPUPhaseRunning, - GPUModel: "NVIDIA A100", - Available: &tfv1.Resource{ - Tflops: resource.MustParse("1.0"), - Vram: resource.MustParse("60Gi"), - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "gpu-2-fail-phase", - }, - Status: tfv1.GPUStatus{ - Phase: tfv1.TensorFusionGPUPhasePending, - GPUModel: "NVIDIA A100", - Available: &tfv1.Resource{ - Tflops: resource.MustParse("1.0"), - Vram: resource.MustParse("60Gi"), - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "gpu-3-fail-tflops", - }, - Status: tfv1.GPUStatus{ - Phase: tfv1.TensorFusionGPUPhaseRunning, - GPUModel: "NVIDIA A100", - Available: &tfv1.Resource{ - Tflops: resource.MustParse("0.3"), - Vram: resource.MustParse("60Gi"), - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "gpu-4-fail-model", - }, - Status: tfv1.GPUStatus{ - Phase: tfv1.TensorFusionGPUPhaseRunning, - GPUModel: "AMD Radeon RX 7900 XTX", - Available: &tfv1.Resource{ - Tflops: resource.MustParse("1.0"), - Vram: resource.MustParse("24Gi"), - }, - }, - }, - } - - // Create fake client - fakeClient := fake.NewClientBuilder(). - WithScheme(scheme). - WithObjects(schedulingTemplate, pool). - Build() - - // Test CEL filters using CELConfigManager - celConfigManager := cel_filter.NewCELConfigManager(fakeClient) - celFilters, err := celConfigManager.GetCELFiltersForPool(context.Background(), pool.Name) - require.NoError(t, err) - require.Len(t, celFilters, 3) - - // Test filtering with CEL filters - celFilterAdapters := cel_filter.CreateCELFilterAdapters(celFilters) - filterRegistry := filter.NewFilterRegistry().With(celFilterAdapters...) - - filteredGPUs, _, err := filterRegistry.Apply( - context.Background(), - tfv1.NameNamespace{Name: "test-pod", Namespace: "default"}, - gpus, - false, - ) - require.NoError(t, err) - - // Only gpu-1 should pass all filters - require.Len(t, filteredGPUs, 1) - require.Equal(t, "gpu-1-pass-all", filteredGPUs[0].Name) -} - -func TestGpuAllocator_CELFilters_ErrorHandling(t *testing.T) { - // Create test scheme - scheme := runtime.NewScheme() - err := tfv1.AddToScheme(scheme) - require.NoError(t, err) - - // Create scheduling template with invalid CEL expression - schedulingTemplate := &tfv1.SchedulingConfigTemplate{ - ObjectMeta: metav1.ObjectMeta{ - Name: "invalid-template", - }, - Spec: tfv1.SchedulingConfigTemplateSpec{ - Placement: tfv1.PlacementConfig{ - Mode: tfv1.PlacementModeCompactFirst, - CELFilters: []tfv1.CELFilterConfig{ - { - Name: "invalid-expression", - Expression: "gpu.phase ==", // Invalid syntax - Priority: 100, - }, - }, - }, - }, - } - - pool := &tfv1.GPUPool{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pool", - }, - Spec: tfv1.GPUPoolSpec{ - SchedulingConfigTemplate: &schedulingTemplate.Name, - }, - } - - // Create fake client - fakeClient := fake.NewClientBuilder(). - WithScheme(scheme). - WithObjects(schedulingTemplate, pool). 
- Build() - - // Test that invalid CEL expression results in error - celConfigManager := cel_filter.NewCELConfigManager(fakeClient) - _, err = celConfigManager.GetCELFiltersForPool(context.Background(), pool.Name) - require.Error(t, err) - require.Contains(t, err.Error(), "create CEL filter") -} - -func TestGpuAllocator_CELFilters_Priority_Ordering(t *testing.T) { - // Create test scheme - scheme := runtime.NewScheme() - err := tfv1.AddToScheme(scheme) - require.NoError(t, err) - - // Create scheduling template with multiple CEL filters with different priorities - schedulingTemplate := &tfv1.SchedulingConfigTemplate{ - ObjectMeta: metav1.ObjectMeta{ - Name: "priority-template", - }, - Spec: tfv1.SchedulingConfigTemplateSpec{ - Placement: tfv1.PlacementConfig{ - Mode: tfv1.PlacementModeCompactFirst, - CELFilters: []tfv1.CELFilterConfig{ - { - Name: "low-priority", - Expression: "gpu.name.contains('gpu')", - Priority: 10, - }, - { - Name: "high-priority", - Expression: "gpu.phase == 'Running'", - Priority: 100, - }, - { - Name: "medium-priority", - Expression: "gpu.gpuModel.contains('NVIDIA')", - Priority: 50, - }, - }, - }, - }, - } - - pool := &tfv1.GPUPool{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pool", - }, - Spec: tfv1.GPUPoolSpec{ - SchedulingConfigTemplate: &schedulingTemplate.Name, - }, - } - - // Create fake client - fakeClient := fake.NewClientBuilder(). - WithScheme(scheme). - WithObjects(schedulingTemplate, pool). - Build() - - // Test that CEL filters are sorted by priority - celConfigManager := cel_filter.NewCELConfigManager(fakeClient) - celFilters, err := celConfigManager.GetCELFiltersForPool(context.Background(), pool.Name) - require.NoError(t, err) - require.Len(t, celFilters, 3) - - // Check that filters are ordered by priority (high to low) - // Note: We can't easily check the internal order without exposing more internals, - // but we can verify that all filters are created successfully - filterNames := make([]string, len(celFilters)) - for i, filter := range celFilters { - filterNames[i] = filter.Name() - } - - expectedFilters := []string{"high-priority", "medium-priority", "low-priority"} - require.ElementsMatch(t, expectedFilters, filterNames) -} diff --git a/internal/gpuallocator/filter/cel_filter/adapter.go b/internal/gpuallocator/filter/cel_filter/adapter.go deleted file mode 100644 index 2d3877f3..00000000 --- a/internal/gpuallocator/filter/cel_filter/adapter.go +++ /dev/null @@ -1,39 +0,0 @@ -package cel_filter - -import ( - "context" - - tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" - "github.com/NexusGPU/tensor-fusion/internal/gpuallocator/filter" -) - -// CELFilterAdapter adapts CELFilter to implement filter.GPUFilter interface -type CELFilterAdapter struct { - celFilter *CELFilter -} - -// NewCELFilterAdapter creates a new adapter for CELFilter -func NewCELFilterAdapter(celFilter *CELFilter) filter.GPUFilter { - return &CELFilterAdapter{ - celFilter: celFilter, - } -} - -// Filter implements the filter.GPUFilter interface -func (a *CELFilterAdapter) Filter(ctx context.Context, workerPodKey tfv1.NameNamespace, gpus []tfv1.GPU) ([]tfv1.GPU, error) { - return a.celFilter.Filter(ctx, workerPodKey, gpus) -} - -// Name implements the filter.GPUFilter interface -func (a *CELFilterAdapter) Name() string { - return a.celFilter.Name() -} - -// CreateCELFilterAdapters creates filter.GPUFilter adapters from CELFilter instances -func CreateCELFilterAdapters(celFilters []*CELFilter) []filter.GPUFilter { - adapters := make([]filter.GPUFilter, len(celFilters)) - for 
i, celFilter := range celFilters { - adapters[i] = NewCELFilterAdapter(celFilter) - } - return adapters -} diff --git a/internal/gpuallocator/filter/cel_filter/alloc_request_filter.go b/internal/gpuallocator/filter/cel_filter/alloc_request_filter.go new file mode 100644 index 00000000..bd3e06de --- /dev/null +++ b/internal/gpuallocator/filter/cel_filter/alloc_request_filter.go @@ -0,0 +1,164 @@ +package cel_filter + +import ( + "context" + "fmt" + "time" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/google/cel-go/common/types" + "sigs.k8s.io/controller-runtime/pkg/log" +) + +// AllocRequestCELFilter converts AllocRequest to CEL filter and executes it +type AllocRequestCELFilter struct { + cache *ExpressionCache + expression string + name string +} + +// NewAllocRequestCELFilter creates a new CEL filter from allocation request +func NewAllocRequestCELFilter(req *tfv1.AllocRequest, cache *ExpressionCache) (*AllocRequestCELFilter, error) { + // Convert AllocRequest to CEL expression + expression, err := convertAllocRequestToCEL(req) + if err != nil { + return nil, fmt.Errorf("failed to convert AllocRequest to CEL: %w", err) + } + + return &AllocRequestCELFilter{ + cache: cache, + expression: expression, + name: fmt.Sprintf("AllocRequest-%s", req.WorkloadNameNamespace.String()), + }, nil +} + +// Name returns the filter name +func (f *AllocRequestCELFilter) Name() string { + return f.name +} + +// Filter applies the CEL expression derived from AllocRequest to filter GPUs +func (f *AllocRequestCELFilter) Filter(ctx context.Context, workerPodKey tfv1.NameNamespace, gpus []*tfv1.GPU) ([]*tfv1.GPU, error) { + log := log.FromContext(ctx) + if len(gpus) == 0 { + return gpus, nil + } + + if f.expression == "" { + // If no expression, return all GPUs (no filtering needed) + return gpus, nil + } + + // Get compiled program from cache + program, err := f.cache.GetOrCompileProgram(f.expression) + if err != nil { + return nil, fmt.Errorf("failed to get CEL program for expression %q: %w", f.expression, err) + } + + var filteredGPUs []*tfv1.GPU + + for _, gpu := range gpus { + // Create timeout context for CEL evaluation + evalCtx, cancel := context.WithTimeout(ctx, 100*time.Millisecond) + + // Create variables for CEL evaluation + vars := createCELVariables(*gpu, workerPodKey) + + // Evaluate with timeout + resultChan := make(chan evalResult, 1) + go func() { + result, _, evalErr := program.Eval(vars) + resultChan <- evalResult{result: result, err: evalErr} + }() + + select { + case evalRes := <-resultChan: + cancel() + if evalRes.err != nil { + log.Error(evalRes.err, "CEL expression evaluation failed", + "expression", f.expression, + "gpu", gpu.Name, + "workerPodKey", workerPodKey) + // On error, exclude the GPU (fail-safe) + continue + } + + // Convert result to boolean + if boolResult, ok := evalRes.result.(types.Bool); ok { + if bool(boolResult) { + filteredGPUs = append(filteredGPUs, gpu) + } + } else { + log.Error(nil, "CEL expression did not return boolean", + "expression", f.expression, + "result", evalRes.result, + "gpu", gpu.Name) + // On non-boolean result, exclude the GPU (fail-safe) + continue + } + case <-evalCtx.Done(): + cancel() + // Timeout - skip this GPU (fail-safe behavior) + log.V(1).Info("CEL evaluation timeout", "gpu", gpu.Name, "expression", f.expression) + continue + } + } + + log.V(1).Info("AllocRequest CEL filter applied", + "filter", f.name, + "expression", f.expression, + "inputGPUs", len(gpus), + "outputGPUs", len(filteredGPUs)) + + return filteredGPUs, 
nil +} + +type evalResult struct { + result interface{} + err error +} + +// convertAllocRequestToCEL converts an allocation request to a CEL expression +func convertAllocRequestToCEL(req *tfv1.AllocRequest) (string, error) { + if req == nil { + return "", nil + } + + var conditions []string + + // Add GPU phase condition (must be Ready) + conditions = append(conditions, "gpu.phase == 'Ready'") + + // Add resource requirements + if req.Request.Tflops.Sign() > 0 { + tflopsValue := req.Request.Tflops.AsApproximateFloat64() + conditions = append(conditions, fmt.Sprintf("gpu.available.tflops >= %f", tflopsValue)) + } + + if req.Request.Vram.Sign() > 0 { + vramValue := req.Request.Vram.AsApproximateFloat64() + conditions = append(conditions, fmt.Sprintf("gpu.available.vram >= %f", vramValue)) + } + + // Add GPU model filter if specified + if req.GPUModel != "" { + conditions = append(conditions, fmt.Sprintf("gpu.gpuModel == '%s'", req.GPUModel)) + } + + // If no conditions, return empty expression (no filtering) + if len(conditions) == 0 { + return "", nil + } + + // Combine all conditions with AND + if len(conditions) == 1 { + return conditions[0], nil + } + + expression := conditions[0] + for i := 1; i < len(conditions); i++ { + expression += " && " + conditions[i] + } + + return expression, nil +} diff --git a/internal/gpuallocator/filter/cel_filter/cel_config.go b/internal/gpuallocator/filter/cel_filter/cel_config.go deleted file mode 100644 index fc3a0f86..00000000 --- a/internal/gpuallocator/filter/cel_filter/cel_config.go +++ /dev/null @@ -1,90 +0,0 @@ -package cel_filter - -import ( - "context" - "fmt" - "sort" - - tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" - "sigs.k8s.io/controller-runtime/pkg/client" -) - -// CELConfigManager handles CEL filter configuration retrieval and creation -type CELConfigManager struct { - client client.Client -} - -// NewCELConfigManager creates a new CEL configuration manager -func NewCELConfigManager(client client.Client) *CELConfigManager { - return &CELConfigManager{ - client: client, - } -} - -// GetCELFiltersForPool retrieves CEL filters from SchedulingConfigTemplate for a given pool -func (m *CELConfigManager) GetCELFiltersForPool(ctx context.Context, poolName string) ([]*CELFilter, error) { - // Get pool to find SchedulingConfigTemplate - pool := &tfv1.GPUPool{} - if err := m.client.Get(ctx, client.ObjectKey{Name: poolName}, pool); err != nil { - return nil, fmt.Errorf("get pool %s: %w", poolName, err) - } - - // If no SchedulingConfigTemplate is specified, return empty - if pool.Spec.SchedulingConfigTemplate == nil { - return nil, nil - } - - return m.GetCELFiltersFromTemplate(ctx, *pool.Spec.SchedulingConfigTemplate) -} - -// GetCELFiltersFromTemplate retrieves CEL filters directly from a SchedulingConfigTemplate -func (m *CELConfigManager) GetCELFiltersFromTemplate(ctx context.Context, templateName string) ([]*CELFilter, error) { - // Get the SchedulingConfigTemplate - schedulingConfigTemplate := &tfv1.SchedulingConfigTemplate{} - if err := m.client.Get(ctx, client.ObjectKey{Name: templateName}, schedulingConfigTemplate); err != nil { - return nil, fmt.Errorf("get scheduling config template %s: %w", templateName, err) - } - - return m.CreateCELFiltersFromConfig(schedulingConfigTemplate.Spec.Placement.CELFilters) -} - -// CreateCELFiltersFromConfig creates CEL filters from configuration slice -func (m *CELConfigManager) CreateCELFiltersFromConfig(celConfigs []tfv1.CELFilterConfig) ([]*CELFilter, error) { - if len(celConfigs) == 0 { - return 
nil, nil - } - - // Sort CEL configs by priority (higher priority first) - sortedConfigs := make([]tfv1.CELFilterConfig, len(celConfigs)) - copy(sortedConfigs, celConfigs) - sort.Slice(sortedConfigs, func(i, j int) bool { - return sortedConfigs[i].Priority > sortedConfigs[j].Priority - }) - - // Create CEL filters - var celFilters []*CELFilter - for _, config := range sortedConfigs { - celFilter, err := NewCELFilter(CELFilterConfig{ - Name: config.Name, - Expression: config.Expression, - Priority: config.Priority, - }) - if err != nil { - return nil, fmt.Errorf("create CEL filter %q: %w", config.Name, err) - } - celFilters = append(celFilters, celFilter) - } - - return celFilters, nil -} - -// ValidateCELConfig validates a CEL filter configuration -func (m *CELConfigManager) ValidateCELConfig(config tfv1.CELFilterConfig) error { - // Try to create the filter to validate the expression - _, err := NewCELFilter(CELFilterConfig{ - Name: config.Name, - Expression: config.Expression, - Priority: config.Priority, - }) - return err -} diff --git a/internal/gpuallocator/filter/cel_filter/cel_config_test.go b/internal/gpuallocator/filter/cel_filter/cel_config_test.go deleted file mode 100644 index 8e8b0ad5..00000000 --- a/internal/gpuallocator/filter/cel_filter/cel_config_test.go +++ /dev/null @@ -1,246 +0,0 @@ -package cel_filter - -import ( - "context" - "testing" - - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "sigs.k8s.io/controller-runtime/pkg/client/fake" - - tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" - "github.com/stretchr/testify/require" -) - -// Test constants for CEL expressions (same as in cel_filter_test.go) -const ( - // Phase expressions - testExamplePhaseRunning = `gpu.phase == 'Running'` - - // Resource expressions - testExampleMinTFlops = `gpu.available.tflops >= 0.5` - testExampleSpecificModel = `gpu.gpuModel.contains('A100')` - - // Label expressions - testExampleNVIDIAOnly = `gpu.gpuModel.startsWith('NVIDIA')` - - // Complex expressions - testExampleComplex = `gpu.phase == 'Running' && gpu.available.tflops > 0.5 && size(gpu.runningApps) < 2` -) - -func TestCELConfigManager_GetCELFiltersForPool(t *testing.T) { - // Create test scheme - scheme := runtime.NewScheme() - err := tfv1.AddToScheme(scheme) - require.NoError(t, err) - - // Create test resources - schedulingTemplate := &tfv1.SchedulingConfigTemplate{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-template", - }, - Spec: tfv1.SchedulingConfigTemplateSpec{ - Placement: tfv1.PlacementConfig{ - CELFilters: []tfv1.CELFilterConfig{ - { - Name: "high-priority", - Expression: testExamplePhaseRunning, - Priority: 100, - }, - { - Name: "low-priority", - Expression: testExampleNVIDIAOnly, - Priority: 10, - }, - }, - }, - }, - } - - pool := &tfv1.GPUPool{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pool", - }, - Spec: tfv1.GPUPoolSpec{ - SchedulingConfigTemplate: &schedulingTemplate.Name, - }, - } - - // Create fake client - fakeClient := fake.NewClientBuilder(). - WithScheme(scheme). - WithObjects(schedulingTemplate, pool). 
- Build() - - // Test CELConfigManager - manager := NewCELConfigManager(fakeClient) - celFilters, err := manager.GetCELFiltersForPool(context.Background(), pool.Name) - require.NoError(t, err) - require.Len(t, celFilters, 2) - - // Verify filters are sorted by priority (high to low) - filterNames := make([]string, len(celFilters)) - for i, filter := range celFilters { - filterNames[i] = filter.Name() - } - require.Equal(t, []string{"high-priority", "low-priority"}, filterNames) -} - -func TestCELConfigManager_GetCELFiltersFromTemplate(t *testing.T) { - // Create test scheme - scheme := runtime.NewScheme() - err := tfv1.AddToScheme(scheme) - require.NoError(t, err) - - // Create test template - schedulingTemplate := &tfv1.SchedulingConfigTemplate{ - ObjectMeta: metav1.ObjectMeta{ - Name: "direct-template", - }, - Spec: tfv1.SchedulingConfigTemplateSpec{ - Placement: tfv1.PlacementConfig{ - CELFilters: []tfv1.CELFilterConfig{ - { - Name: "simple-filter", - Expression: testExampleMinTFlops, - Priority: 50, - }, - }, - }, - }, - } - - // Create fake client - fakeClient := fake.NewClientBuilder(). - WithScheme(scheme). - WithObjects(schedulingTemplate). - Build() - - // Test direct template access - manager := NewCELConfigManager(fakeClient) - celFilters, err := manager.GetCELFiltersFromTemplate(context.Background(), schedulingTemplate.Name) - require.NoError(t, err) - require.Len(t, celFilters, 1) - require.Equal(t, "simple-filter", celFilters[0].Name()) -} - -func TestCELConfigManager_CreateCELFiltersFromConfig(t *testing.T) { - manager := NewCELConfigManager(nil) // No client needed for this test - - celConfigs := []tfv1.CELFilterConfig{ - { - Name: "filter-3", - Expression: testExamplePhaseRunning, - Priority: 30, - }, - { - Name: "filter-1", - Expression: testExampleMinTFlops, - Priority: 100, - }, - { - Name: "filter-2", - Expression: testExampleSpecificModel, - Priority: 50, - }, - } - - celFilters, err := manager.CreateCELFiltersFromConfig(celConfigs) - require.NoError(t, err) - require.Len(t, celFilters, 3) - - // Verify priority ordering (high to low) - expectedOrder := []string{"filter-1", "filter-2", "filter-3"} - actualOrder := make([]string, len(celFilters)) - for i, filter := range celFilters { - actualOrder[i] = filter.Name() - } - require.Equal(t, expectedOrder, actualOrder) -} - -func TestCELConfigManager_ValidateCELConfig(t *testing.T) { - manager := NewCELConfigManager(nil) - - tests := []struct { - name string - config tfv1.CELFilterConfig - expectError bool - }{ - { - name: "valid config", - config: tfv1.CELFilterConfig{ - Name: "valid", - Expression: testExamplePhaseRunning, - Priority: 100, - }, - expectError: false, - }, - { - name: "invalid expression", - config: tfv1.CELFilterConfig{ - Name: "invalid", - Expression: "gpu.phase ==", // Invalid syntax - Priority: 100, - }, - expectError: true, - }, - { - name: "complex valid expression", - config: tfv1.CELFilterConfig{ - Name: "complex", - Expression: testExampleComplex, - Priority: 100, - }, - expectError: false, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - err := manager.ValidateCELConfig(tt.config) - if tt.expectError { - require.Error(t, err) - } else { - require.NoError(t, err) - } - }) - } -} - -func TestCELConfigManager_NoTemplate(t *testing.T) { - // Create test scheme - scheme := runtime.NewScheme() - err := tfv1.AddToScheme(scheme) - require.NoError(t, err) - - // Create pool without SchedulingConfigTemplate - pool := &tfv1.GPUPool{ - ObjectMeta: metav1.ObjectMeta{ - Name: 
"no-template-pool", - }, - Spec: tfv1.GPUPoolSpec{ - SchedulingConfigTemplate: nil, // No template specified - }, - } - - // Create fake client - fakeClient := fake.NewClientBuilder(). - WithScheme(scheme). - WithObjects(pool). - Build() - - // Test that no CEL filters are returned - manager := NewCELConfigManager(fakeClient) - celFilters, err := manager.GetCELFiltersForPool(context.Background(), pool.Name) - require.NoError(t, err) - require.Len(t, celFilters, 0) -} - -func TestCELConfigManager_EmptyConfig(t *testing.T) { - manager := NewCELConfigManager(nil) - - // Test empty config slice - celFilters, err := manager.CreateCELFiltersFromConfig([]tfv1.CELFilterConfig{}) - require.NoError(t, err) - require.Len(t, celFilters, 0) -} diff --git a/internal/gpuallocator/filter/cel_filter/cel_filter.go b/internal/gpuallocator/filter/cel_filter/cel_filter.go index 3165a5b6..90b60501 100644 --- a/internal/gpuallocator/filter/cel_filter/cel_filter.go +++ b/internal/gpuallocator/filter/cel_filter/cel_filter.go @@ -30,45 +30,8 @@ type CELFilter struct { mu sync.RWMutex } -// NewCELFilter creates a new CEL-based GPU filter -func NewCELFilter(config CELFilterConfig) (*CELFilter, error) { - env, err := createCELEnvironment() - if err != nil { - return nil, fmt.Errorf("failed to create CEL environment: %w", err) - } - - ast, issues := env.Compile(config.Expression) - if issues != nil && issues.Err() != nil { - return nil, fmt.Errorf("failed to compile CEL expression %q: %w", config.Expression, issues.Err()) - } - - program, err := env.Program(ast) - if err != nil { - return nil, fmt.Errorf("failed to create CEL program: %w", err) - } - - name := config.Name - if name == "" { - name = fmt.Sprintf("CELFilter-%d", config.Priority) - } - - return &CELFilter{ - name: name, - expression: config.Expression, - program: program, - env: env, - }, nil -} - -// Name returns the name of this filter -func (f *CELFilter) Name() string { - f.mu.RLock() - defer f.mu.RUnlock() - return f.name -} - // Filter applies the CEL expression to filter GPUs -func (f *CELFilter) Filter(ctx context.Context, workerPodKey tfv1.NameNamespace, gpus []tfv1.GPU) ([]tfv1.GPU, error) { +func (f *CELFilter) Filter(ctx context.Context, workerPodKey tfv1.NameNamespace, gpus []*tfv1.GPU) ([]*tfv1.GPU, error) { log := log.FromContext(ctx) if len(gpus) == 0 { return gpus, nil @@ -79,11 +42,11 @@ func (f *CELFilter) Filter(ctx context.Context, workerPodKey tfv1.NameNamespace, expression := f.expression f.mu.RUnlock() - var filteredGPUs []tfv1.GPU + var filteredGPUs []*tfv1.GPU for _, gpu := range gpus { // Create variables for CEL evaluation - vars := createCELVariables(gpu, workerPodKey) + vars := createCELVariables(*gpu, workerPodKey) // Evaluate the CEL expression result, _, err := program.Eval(vars) @@ -147,7 +110,7 @@ func createCELEnvironment() (*cel.Env, error) { cel.Variable(CELVarGPU, cel.MapType(cel.StringType, cel.DynType)), // Define worker pod key cel.Variable(CELVarWorkerPodKey, cel.MapType(cel.StringType, cel.StringType)), - // Define request information (if needed in future) + // Define request object structure cel.Variable(CELVarRequest, cel.MapType(cel.StringType, cel.DynType)), ) } @@ -208,6 +171,5 @@ func createCELVariables(gpu tfv1.GPU, workerPodKey tfv1.NameNamespace) map[strin return map[string]interface{}{ CELVarGPU: gpuMap, CELVarWorkerPodKey: workerPodKeyMap, - CELVarRequest: map[string]interface{}{}, // Placeholder for future request info } } diff --git a/internal/gpuallocator/filter/cel_filter/cel_filter_test.go 
b/internal/gpuallocator/filter/cel_filter/cel_filter_test.go deleted file mode 100644 index ba2cc539..00000000 --- a/internal/gpuallocator/filter/cel_filter/cel_filter_test.go +++ /dev/null @@ -1,347 +0,0 @@ -package cel_filter - -import ( - "context" - "testing" - - "k8s.io/apimachinery/pkg/api/resource" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - - tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" - "github.com/stretchr/testify/require" -) - -// Test constants for CEL expressions -const ( - // Phase expressions - ExamplePhaseRunning = `gpu.phase == 'Running'` - ExamplePhasePending = `gpu.phase == 'Pending'` - - // Resource expressions - ExampleMinTFlops = `gpu.available.tflops >= 0.5` - ExampleMinVRAM = `gpu.available.vram >= 4294967296` // 4GB in bytes - ExampleResourceRatio = `gpu.available.tflops > gpu.capacity.tflops * 0.5` - - // Model expressions - ExampleNVIDIAOnly = `gpu.gpuModel.startsWith('NVIDIA')` - ExampleSpecificModel = `gpu.gpuModel.contains('A100')` - - // Label expressions - ExampleHasLabel = `'gpu-tier' in gpu.labels` - ExampleLabelValue = `gpu.labels != null && 'gpu-tier' in gpu.labels && gpu.labels['gpu-tier'] == 'premium'` - - // Load balancing expressions - ExampleLowLoad = `size(gpu.runningApps) < 3` - ExampleNoApps = `size(gpu.runningApps) == 0` - - // Complex expressions - ExampleComplex = `gpu.phase == 'Running' && gpu.available.tflops > 0.5 && size(gpu.runningApps) < 2` -) - -func TestNewCELFilter(t *testing.T) { - tests := []struct { - name string - config CELFilterConfig - expectError bool - }{ - { - name: "valid basic expression", - config: CELFilterConfig{ - Name: "basic-test", - Expression: ExamplePhaseRunning, - Priority: 100, - }, - expectError: false, - }, - { - name: "valid resource expression", - config: CELFilterConfig{ - Name: "resource-test", - Expression: ExampleMinTFlops, - Priority: 50, - }, - expectError: false, - }, - { - name: "invalid expression syntax", - config: CELFilterConfig{ - Name: "invalid-test", - Expression: "gpu.phase ==", // Invalid syntax - Priority: 10, - }, - expectError: true, - }, - { - name: "expression with labels", - config: CELFilterConfig{ - Name: "label-test", - Expression: ExampleHasLabel, - Priority: 75, - }, - expectError: false, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - filter, err := NewCELFilter(tt.config) - if tt.expectError { - require.Error(t, err) - require.Nil(t, filter) - } else { - require.NoError(t, err) - require.NotNil(t, filter) - require.Equal(t, tt.config.Name, filter.Name()) - } - }) - } -} - -func TestCELFilter_Filter(t *testing.T) { - // Create test GPUs - gpus := []tfv1.GPU{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "gpu-1", - Namespace: "default", - Labels: map[string]string{ - "gpu-tier": "premium", - }, - }, - Status: tfv1.GPUStatus{ - Phase: tfv1.TensorFusionGPUPhaseRunning, - GPUModel: "NVIDIA A100", - UUID: "gpu-1-uuid", - Capacity: &tfv1.Resource{ - Tflops: resource.MustParse("1.5"), - Vram: resource.MustParse("80Gi"), - }, - Available: &tfv1.Resource{ - Tflops: resource.MustParse("1.0"), - Vram: resource.MustParse("60Gi"), - }, - RunningApps: []*tfv1.RunningAppDetail{ - { - Name: "app-1", - Namespace: "default", - Count: 1, - }, - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "gpu-2", - Namespace: "default", - Labels: map[string]string{ - "gpu-tier": "basic", - }, - }, - Status: tfv1.GPUStatus{ - Phase: tfv1.TensorFusionGPUPhaseRunning, - GPUModel: "NVIDIA RTX 4090", - UUID: "gpu-2-uuid", - Capacity: &tfv1.Resource{ - Tflops: 
resource.MustParse("0.8"), - Vram: resource.MustParse("24Gi"), - }, - Available: &tfv1.Resource{ - Tflops: resource.MustParse("0.2"), - Vram: resource.MustParse("8Gi"), - }, - RunningApps: []*tfv1.RunningAppDetail{ - { - Name: "app-2", - Namespace: "default", - Count: 1, - }, - { - Name: "app-3", - Namespace: "default", - Count: 2, - }, - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "gpu-3", - Namespace: "default", - }, - Status: tfv1.GPUStatus{ - Phase: tfv1.TensorFusionGPUPhasePending, - GPUModel: "NVIDIA A100", - UUID: "gpu-3-uuid", - Capacity: &tfv1.Resource{ - Tflops: resource.MustParse("1.5"), - Vram: resource.MustParse("80Gi"), - }, - Available: &tfv1.Resource{ - Tflops: resource.MustParse("1.5"), - Vram: resource.MustParse("80Gi"), - }, - }, - }, - } - - workerPodKey := tfv1.NameNamespace{ - Name: "test-pod", - Namespace: "default", - } - - tests := []struct { - name string - expression string - expectedGPUs []string // GPU names that should pass the filter - expectError bool - }{ - { - name: "filter by phase", - expression: ExamplePhaseRunning, - expectedGPUs: []string{"gpu-1", "gpu-2"}, - }, - { - name: "filter by available resources", - expression: ExampleMinTFlops, - expectedGPUs: []string{"gpu-1", "gpu-3"}, - }, - { - name: "filter by GPU model", - expression: "gpu.gpuModel.startsWith('NVIDIA A100')", - expectedGPUs: []string{"gpu-1", "gpu-3"}, - }, - { - name: "filter by labels", - expression: ExampleLabelValue, - expectedGPUs: []string{"gpu-1"}, - }, - { - name: "filter by running apps count", - expression: ExampleLowLoad, - expectedGPUs: []string{"gpu-1", "gpu-2", "gpu-3"}, - }, - { - name: "complex filter", - expression: ExampleComplex, - expectedGPUs: []string{"gpu-1"}, - }, - { - name: "filter none", - expression: "false", - expectedGPUs: []string{}, - }, - { - name: "filter all", - expression: "true", - expectedGPUs: []string{"gpu-1", "gpu-2", "gpu-3"}, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - filter, err := NewCELFilter(CELFilterConfig{ - Name: tt.name, - Expression: tt.expression, - Priority: 100, - }) - require.NoError(t, err) - - filteredGPUs, err := filter.Filter(context.Background(), workerPodKey, gpus) - if tt.expectError { - require.Error(t, err) - return - } - - require.NoError(t, err) - require.Len(t, filteredGPUs, len(tt.expectedGPUs)) - - // Check that the correct GPUs were filtered - actualNames := make([]string, len(filteredGPUs)) - for i, gpu := range filteredGPUs { - actualNames[i] = gpu.Name - } - - require.ElementsMatch(t, tt.expectedGPUs, actualNames) - }) - } -} - -func TestCELFilter_UpdateExpression(t *testing.T) { - // Create initial filter - filter, err := NewCELFilter(CELFilterConfig{ - Name: "update-test", - Expression: ExamplePhaseRunning, - Priority: 100, - }) - require.NoError(t, err) - - // Test valid update - err = filter.UpdateExpression(ExamplePhasePending) - require.NoError(t, err) - - // Test invalid update - err = filter.UpdateExpression("gpu.phase ==") - require.Error(t, err) -} - -func TestCELFilter_ThreadSafety(t *testing.T) { - filter, err := NewCELFilter(CELFilterConfig{ - Name: "thread-safety-test", - Expression: ExamplePhaseRunning, - Priority: 100, - }) - require.NoError(t, err) - - // Create test GPU - gpu := tfv1.GPU{ - ObjectMeta: metav1.ObjectMeta{ - Name: "gpu-1", - Namespace: "default", - }, - Status: tfv1.GPUStatus{ - Phase: tfv1.TensorFusionGPUPhaseRunning, - }, - } - - workerPodKey := tfv1.NameNamespace{ - Name: "test-pod", - Namespace: "default", - } - - // Run 
concurrent operations - done := make(chan bool, 3) - - // Concurrent filtering - go func() { - defer func() { done <- true }() - for i := 0; i < 100; i++ { - _, err := filter.Filter(context.Background(), workerPodKey, []tfv1.GPU{gpu}) - require.NoError(t, err) - } - }() - - // Concurrent name access - go func() { - defer func() { done <- true }() - for i := 0; i < 100; i++ { - name := filter.Name() - require.Equal(t, "thread-safety-test", name) - } - }() - - // Concurrent expression updates - go func() { - defer func() { done <- true }() - for i := 0; i < 10; i++ { - err := filter.UpdateExpression(ExamplePhasePending) - require.NoError(t, err) - err = filter.UpdateExpression(ExamplePhaseRunning) - require.NoError(t, err) - } - }() - - // Wait for all goroutines to complete - for i := 0; i < 3; i++ { - <-done - } -} diff --git a/internal/gpuallocator/filter/cel_filter/constants.go b/internal/gpuallocator/filter/cel_filter/constants.go index 152f643f..7ea0cc85 100644 --- a/internal/gpuallocator/filter/cel_filter/constants.go +++ b/internal/gpuallocator/filter/cel_filter/constants.go @@ -42,3 +42,12 @@ const ( PodKeyFieldName = "name" PodKeyFieldNamespace = "namespace" ) + +// Request object field names +const ( + RequestFieldWorkerPodKey = "workerPodKey" + RequestFieldCount = "count" + RequestFieldGPUModel = "gpuModel" + RequestFieldRequest = "request" + RequestFieldLimit = "limit" +) diff --git a/internal/gpuallocator/filter/cel_filter/expression_cache.go b/internal/gpuallocator/filter/cel_filter/expression_cache.go new file mode 100644 index 00000000..4065c3b9 --- /dev/null +++ b/internal/gpuallocator/filter/cel_filter/expression_cache.go @@ -0,0 +1,191 @@ +package cel_filter + +import ( + "context" + "crypto/sha256" + "fmt" + "sync" + "time" + + "github.com/google/cel-go/cel" +) + +// CachedCELProgram represents a compiled CEL program with metadata +type CachedCELProgram struct { + Program cel.Program + Expression string + CreatedAt time.Time + AccessedAt time.Time + AccessCount int64 +} + +// ExpressionCache provides caching for compiled CEL expressions +type ExpressionCache struct { + cache map[string]*CachedCELProgram + mutex sync.RWMutex + maxSize int + maxAge time.Duration + env *cel.Env + + // Metrics + hits int64 + misses int64 +} + +// NewExpressionCache creates a new CEL expression cache +func NewExpressionCache(maxSize int, maxAge time.Duration) (*ExpressionCache, error) { + env, err := createCELEnvironment() + if err != nil { + return nil, fmt.Errorf("failed to create CEL environment: %w", err) + } + + cache := &ExpressionCache{ + cache: make(map[string]*CachedCELProgram, maxSize), + maxSize: maxSize, + maxAge: maxAge, + env: env, + } + + // Start cleanup goroutine + go cache.cleanupExpiredEntries(context.Background()) + + return cache, nil +} + +// GetOrCompileProgram returns a cached program or compiles and caches a new one +func (c *ExpressionCache) GetOrCompileProgram(expression string) (cel.Program, error) { + hash := c.hashExpression(expression) + + c.mutex.RLock() + if cached, exists := c.cache[hash]; exists { + // Check if entry is still valid + if time.Since(cached.CreatedAt) < c.maxAge { + cached.AccessedAt = time.Now() + cached.AccessCount++ + c.hits++ + c.mutex.RUnlock() + return cached.Program, nil + } + } + c.mutex.RUnlock() + + // Cache miss or expired - compile new program + c.mutex.Lock() + defer c.mutex.Unlock() + + // Double-check after acquiring write lock + if cached, exists := c.cache[hash]; exists && time.Since(cached.CreatedAt) < c.maxAge { + 
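// Another goroutine may have compiled this expression while we waited for the write lock; reuse its cached program instead of recompiling. +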
cached.AccessedAt = time.Now() + cached.AccessCount++ + c.hits++ + return cached.Program, nil + } + + // Compile the expression + ast, issues := c.env.Compile(expression) + if issues != nil && issues.Err() != nil { + c.misses++ + return nil, fmt.Errorf("failed to compile CEL expression %q: %w", expression, issues.Err()) + } + + program, err := c.env.Program(ast) + if err != nil { + c.misses++ + return nil, fmt.Errorf("failed to create CEL program: %w", err) + } + + // Check if cache is full and evict least recently used entry + if len(c.cache) >= c.maxSize { + c.evictLRU() + } + + // Cache the compiled program + c.cache[hash] = &CachedCELProgram{ + Program: program, + Expression: expression, + CreatedAt: time.Now(), + AccessedAt: time.Now(), + AccessCount: 1, + } + + c.misses++ + return program, nil +} + +// hashExpression creates a hash for caching expressions +func (c *ExpressionCache) hashExpression(expression string) string { + hash := sha256.Sum256([]byte(expression)) + return fmt.Sprintf("%x", hash) +} + +// evictLRU removes the least recently used entry from cache +func (c *ExpressionCache) evictLRU() { + var oldestKey string + var oldestTime time.Time = time.Now() + + for key, cached := range c.cache { + if cached.AccessedAt.Before(oldestTime) { + oldestTime = cached.AccessedAt + oldestKey = key + } + } + + if oldestKey != "" { + delete(c.cache, oldestKey) + } +} + +// cleanupExpiredEntries removes expired entries periodically +func (c *ExpressionCache) cleanupExpiredEntries(ctx context.Context) { + ticker := time.NewTicker(5 * time.Minute) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + c.mutex.Lock() + now := time.Now() + for key, cached := range c.cache { + if now.Sub(cached.CreatedAt) > c.maxAge { + delete(c.cache, key) + } + } + c.mutex.Unlock() + } + } +} + +// GetStats returns cache statistics +func (c *ExpressionCache) GetStats() CacheStats { + c.mutex.RLock() + defer c.mutex.RUnlock() + + return CacheStats{ + Size: len(c.cache), + MaxSize: c.maxSize, + Hits: c.hits, + Misses: c.misses, + HitRatio: float64(c.hits) / float64(c.hits+c.misses), + } +} + +// CacheStats represents cache performance statistics +type CacheStats struct { + Size int + MaxSize int + Hits int64 + Misses int64 + HitRatio float64 +} + +// Clear removes all entries from the cache +func (c *ExpressionCache) Clear() { + c.mutex.Lock() + defer c.mutex.Unlock() + + c.cache = make(map[string]*CachedCELProgram, c.maxSize) + c.hits = 0 + c.misses = 0 +} diff --git a/internal/gpuallocator/gpuallocator.go b/internal/gpuallocator/gpuallocator.go index 0a657dc2..c1ce771b 100644 --- a/internal/gpuallocator/gpuallocator.go +++ b/internal/gpuallocator/gpuallocator.go @@ -166,7 +166,65 @@ func (s *GpuAllocator) Filter( toFilterGPUs []*tfv1.GPU, isSimulateSchedule bool, ) ([]*tfv1.GPU, []filter.FilterDetail, error) { + + // Check if CEL filtering is enabled via config/flag + useCELFilter := config.GetGlobalConfig().EnableCELFilter + + if useCELFilter { + // New CEL-based filtering approach + return s.applyCELFilter(req, toFilterGPUs, isSimulateSchedule) + } else { + // Legacy filter approach (for rollback support) + return s.applyLegacyFilters(req, toFilterGPUs, isSimulateSchedule) + } +} + +// applyCELFilter applies the new CEL-based filtering +func (s *GpuAllocator) applyCELFilter( + req *tfv1.AllocRequest, + toFilterGPUs []*tfv1.GPU, + isSimulateSchedule bool, +) ([]*tfv1.GPU, []filter.FilterDetail, error) { + // Create CEL filter from AllocRequest + cache, err := 
cel_filter.NewExpressionCache(100, 5*time.Minute) + if err != nil { + return nil, nil, fmt.Errorf("failed to create CEL cache: %w", err) + } + + allocCELFilter, err := cel_filter.NewAllocRequestCELFilter(req, cache) + if err != nil { + return nil, nil, fmt.Errorf("failed to create AllocRequest CEL filter: %w", err) + } + + // Start with base registry and add CEL filter + filterRegistry := s.filterRegistry.With(allocCELFilter) + // Add SameNodeFilter if count > 1 to ensure GPUs are from the same node + if req.Count > 1 { + filterRegistry = filterRegistry.With(filter.NewSameNodeFilter(req.Count)) + } + + // Add NodeAffinityFilter if specified + if req.NodeAffinity != nil { + filterRegistry = filterRegistry.With(filter.NewNodeAffinityFilter(s.Client, req.NodeAffinity)) + } + + // Apply the filters + filteredGPUs, filterDetails, err := filterRegistry.Apply(s.ctx, req.WorkloadNameNamespace, toFilterGPUs, isSimulateSchedule) + if err != nil { + return nil, nil, fmt.Errorf("apply CEL filters: %w", err) + } + + return filteredGPUs, filterDetails, nil +} + +// applyLegacyFilters applies the legacy filter approach (for rollback support) +func (s *GpuAllocator) applyLegacyFilters( + req *tfv1.AllocRequest, + toFilterGPUs []*tfv1.GPU, + isSimulateSchedule bool, +) ([]*tfv1.GPU, []filter.FilterDetail, error) { + // Legacy filtering approach filterRegistry := s.filterRegistry.With(filter.NewResourceFilter(req.Request)) // Add GPU model filter if specified @@ -177,26 +235,16 @@ func (s *GpuAllocator) Filter( if req.Count > 1 { filterRegistry = filterRegistry.With(filter.NewSameNodeFilter(req.Count)) } + // Add NodeAffinityFilter if specified if req.NodeAffinity != nil { filterRegistry = filterRegistry.With(filter.NewNodeAffinityFilter(s.Client, req.NodeAffinity)) } - // Add CEL filters from SchedulingConfigTemplate if available - celConfigManager := cel_filter.NewCELConfigManager(s.Client) - celFilters, err := celConfigManager.GetCELFiltersForPool(s.ctx, req.PoolName) - if err != nil { - return nil, nil, fmt.Errorf("get CEL filters: %w", err) - } - if len(celFilters) > 0 { - celFilterAdapters := cel_filter.CreateCELFilterAdapters(celFilters) - filterRegistry = filterRegistry.With(celFilterAdapters...) 
- } - - // Apply the filters in sequence + // Apply the legacy filters filteredGPUs, filterDetails, err := filterRegistry.Apply(s.ctx, req.WorkloadNameNamespace, toFilterGPUs, isSimulateSchedule) if err != nil { - return nil, nil, fmt.Errorf("apply filters: %w", err) + return nil, nil, fmt.Errorf("apply legacy filters: %w", err) } return filteredGPUs, filterDetails, nil @@ -338,6 +386,7 @@ func (s *GpuAllocator) CheckQuotaAndFilter(ctx context.Context, req *tfv1.AllocR return nil, nil, fmt.Errorf("no gpu devices in pool %s", req.PoolName) } filteredGPUs, filterDetails, err := s.Filter(req, poolGPUs, isSimulateSchedule) + if err != nil { return nil, nil, err } From fc26511b25452a602a2b4e3c22ee12e250577f9c Mon Sep 17 00:00:00 2001 From: dylan Date: Sun, 31 Aug 2025 00:37:57 -0700 Subject: [PATCH 03/34] support annotaion cel --- api/v1/gpuresourcequota_types.go | 6 + internal/config/global_config.go | 4 - internal/constants/constants.go | 3 + .../filter/cel_filter/alloc_request_filter.go | 164 -------- .../filter/cel_filter/cel_filter.go | 181 ++++++--- .../cel_filter/cel_filter_benchmark_test.go | 288 ++++++++++++++ .../filter/cel_filter/cel_filter_test.go | 368 ++++++++++++++++++ .../filter/cel_filter/constants.go | 1 - internal/gpuallocator/gpuallocator.go | 27 +- 9 files changed, 802 insertions(+), 240 deletions(-) delete mode 100644 internal/gpuallocator/filter/cel_filter/alloc_request_filter.go create mode 100644 internal/gpuallocator/filter/cel_filter/cel_filter_benchmark_test.go create mode 100644 internal/gpuallocator/filter/cel_filter/cel_filter_test.go diff --git a/api/v1/gpuresourcequota_types.go b/api/v1/gpuresourcequota_types.go index c6ac1dba..46e068b7 100644 --- a/api/v1/gpuresourcequota_types.go +++ b/api/v1/gpuresourcequota_types.go @@ -186,6 +186,12 @@ type AllocRequest struct { // record the pod meta for quota check PodMeta metav1.ObjectMeta + + // enable cel filter + DisableCELFilter bool + + // cel filter expression + CELFilterExpression string } type GPUAllocationInfo struct { diff --git a/internal/config/global_config.go b/internal/config/global_config.go index 0632c284..75bddc22 100644 --- a/internal/config/global_config.go +++ b/internal/config/global_config.go @@ -8,9 +8,6 @@ type GlobalConfig struct { MetricsExtraPodLabels map[string]string `yaml:"metricsExtraPodLabels"` AlertRules []AlertRule `yaml:"alertRules"` - - // EnableCELFilter enables CEL-based filtering (default: false for rollback support) - EnableCELFilter bool `yaml:"enableCELFilter"` } var globalConfig *GlobalConfig @@ -44,7 +41,6 @@ func MockGlobalConfig() *GlobalConfig { MetricsTTL: "30d", MetricsFormat: "influx", MetricsExtraPodLabels: map[string]string{"kubernetes.io/app": "app"}, - EnableCELFilter: false, // Default to legacy filter for rollback support AlertRules: []AlertRule{ { Name: "mock", diff --git a/internal/constants/constants.go b/internal/constants/constants.go index 32b3d6bc..b1aa6b64 100644 --- a/internal/constants/constants.go +++ b/internal/constants/constants.go @@ -74,6 +74,8 @@ const ( // In remote vGPU mode, selected workload is set by user with /workload annotation or generated by system SelectedWorkloadAnnotation = Domain + "/selected-workload" + CELFilterExpressionAnnotation = Domain + "/cel-filter-expression" + WorkloadModeAnnotation = Domain + "/workload-mode" WorkloadModeDynamic = "dynamic" WorkloadModeFixed = "fixed" @@ -86,6 +88,7 @@ const ( BuiltInFeaturesMemManager = "mem-manager" // For debug purpose only of Remote vGPU, disable start worker to manual start with 
ad-hoc command inside Pod BuiltInFeatureStartWorker = "start-worker" + BuiltInFeatureCELFilter = "cel-filter" GenHostPortLabel = Domain + "/host-port" GenHostPortLabelValue = "auto" diff --git a/internal/gpuallocator/filter/cel_filter/alloc_request_filter.go b/internal/gpuallocator/filter/cel_filter/alloc_request_filter.go deleted file mode 100644 index bd3e06de..00000000 --- a/internal/gpuallocator/filter/cel_filter/alloc_request_filter.go +++ /dev/null @@ -1,164 +0,0 @@ -package cel_filter - -import ( - "context" - "fmt" - "time" - - tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" - "github.com/google/cel-go/common/types" - "sigs.k8s.io/controller-runtime/pkg/log" -) - -// AllocRequestCELFilter converts AllocRequest to CEL filter and executes it -type AllocRequestCELFilter struct { - cache *ExpressionCache - expression string - name string -} - -// NewAllocRequestCELFilter creates a new CEL filter from allocation request -func NewAllocRequestCELFilter(req *tfv1.AllocRequest, cache *ExpressionCache) (*AllocRequestCELFilter, error) { - // Convert AllocRequest to CEL expression - expression, err := convertAllocRequestToCEL(req) - if err != nil { - return nil, fmt.Errorf("failed to convert AllocRequest to CEL: %w", err) - } - - return &AllocRequestCELFilter{ - cache: cache, - expression: expression, - name: fmt.Sprintf("AllocRequest-%s", req.WorkloadNameNamespace.String()), - }, nil -} - -// Name returns the filter name -func (f *AllocRequestCELFilter) Name() string { - return f.name -} - -// Filter applies the CEL expression derived from AllocRequest to filter GPUs -func (f *AllocRequestCELFilter) Filter(ctx context.Context, workerPodKey tfv1.NameNamespace, gpus []*tfv1.GPU) ([]*tfv1.GPU, error) { - log := log.FromContext(ctx) - if len(gpus) == 0 { - return gpus, nil - } - - if f.expression == "" { - // If no expression, return all GPUs (no filtering needed) - return gpus, nil - } - - // Get compiled program from cache - program, err := f.cache.GetOrCompileProgram(f.expression) - if err != nil { - return nil, fmt.Errorf("failed to get CEL program for expression %q: %w", f.expression, err) - } - - var filteredGPUs []*tfv1.GPU - - for _, gpu := range gpus { - // Create timeout context for CEL evaluation - evalCtx, cancel := context.WithTimeout(ctx, 100*time.Millisecond) - - // Create variables for CEL evaluation - vars := createCELVariables(*gpu, workerPodKey) - - // Evaluate with timeout - resultChan := make(chan evalResult, 1) - go func() { - result, _, evalErr := program.Eval(vars) - resultChan <- evalResult{result: result, err: evalErr} - }() - - select { - case evalRes := <-resultChan: - cancel() - if evalRes.err != nil { - log.Error(evalRes.err, "CEL expression evaluation failed", - "expression", f.expression, - "gpu", gpu.Name, - "workerPodKey", workerPodKey) - // On error, exclude the GPU (fail-safe) - continue - } - - // Convert result to boolean - if boolResult, ok := evalRes.result.(types.Bool); ok { - if bool(boolResult) { - filteredGPUs = append(filteredGPUs, gpu) - } - } else { - log.Error(nil, "CEL expression did not return boolean", - "expression", f.expression, - "result", evalRes.result, - "gpu", gpu.Name) - // On non-boolean result, exclude the GPU (fail-safe) - continue - } - case <-evalCtx.Done(): - cancel() - // Timeout - skip this GPU (fail-safe behavior) - log.V(1).Info("CEL evaluation timeout", "gpu", gpu.Name, "expression", f.expression) - continue - } - } - - log.V(1).Info("AllocRequest CEL filter applied", - "filter", f.name, - "expression", f.expression, - 
"inputGPUs", len(gpus), - "outputGPUs", len(filteredGPUs)) - - return filteredGPUs, nil -} - -type evalResult struct { - result interface{} - err error -} - -// convertAllocRequestToCEL converts an allocation request to a CEL expression -func convertAllocRequestToCEL(req *tfv1.AllocRequest) (string, error) { - if req == nil { - return "", nil - } - - var conditions []string - - // Add GPU phase condition (must be Ready) - conditions = append(conditions, "gpu.phase == 'Ready'") - - // Add resource requirements - if req.Request.Tflops.Sign() > 0 { - tflopsValue := req.Request.Tflops.AsApproximateFloat64() - conditions = append(conditions, fmt.Sprintf("gpu.available.tflops >= %f", tflopsValue)) - } - - if req.Request.Vram.Sign() > 0 { - vramValue := req.Request.Vram.AsApproximateFloat64() - conditions = append(conditions, fmt.Sprintf("gpu.available.vram >= %f", vramValue)) - } - - // Add GPU model filter if specified - if req.GPUModel != "" { - conditions = append(conditions, fmt.Sprintf("gpu.gpuModel == '%s'", req.GPUModel)) - } - - // If no conditions, return empty expression (no filtering) - if len(conditions) == 0 { - return "", nil - } - - // Combine all conditions with AND - if len(conditions) == 1 { - return conditions[0], nil - } - - expression := conditions[0] - for i := 1; i < len(conditions); i++ { - expression += " && " + conditions[i] - } - - return expression, nil -} diff --git a/internal/gpuallocator/filter/cel_filter/cel_filter.go b/internal/gpuallocator/filter/cel_filter/cel_filter.go index 90b60501..a9369535 100644 --- a/internal/gpuallocator/filter/cel_filter/cel_filter.go +++ b/internal/gpuallocator/filter/cel_filter/cel_filter.go @@ -3,7 +3,7 @@ package cel_filter import ( "context" "fmt" - "sync" + "time" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" "github.com/google/cel-go/cel" @@ -11,96 +11,157 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" ) -// CELFilterConfig defines the configuration for CEL-based filtering -type CELFilterConfig struct { - // CEL expression for filtering GPUs - Expression string `json:"expression"` - // Priority for this filter (higher priority filters run first) - Priority int `json:"priority"` - // Name for this filter (for debugging/logging) - Name string `json:"name"` -} - -// CELFilter implements GPU filtering using CEL expressions +// AllocRequestCELFilter converts AllocRequest to CEL filter and executes it type CELFilter struct { - name string + cache *ExpressionCache expression string - program cel.Program - env *cel.Env - mu sync.RWMutex + name string } -// Filter applies the CEL expression to filter GPUs +// NewAllocRequestCELFilter creates a new CEL filter from allocation request +func NewCELFilter(req *tfv1.AllocRequest, cache *ExpressionCache) (*CELFilter, error) { + // Convert AllocRequest to CEL expression + expression, err := convertAllocRequestToCEL(req) + if err != nil { + return nil, fmt.Errorf("failed to convert AllocRequest to CEL: %w", err) + } + + // Handle nil request case + name := "AllocRequest-unknown" + if req != nil { + name = fmt.Sprintf("AllocRequest-%s", req.WorkloadNameNamespace.String()) + } + + return &CELFilter{ + cache: cache, + expression: expression, + name: name, + }, nil +} + +// Name returns the filter name +func (f *CELFilter) Name() string { + return f.name +} + +// Filter applies the CEL expression derived from AllocRequest to filter GPUs func (f *CELFilter) Filter(ctx context.Context, workerPodKey tfv1.NameNamespace, gpus []*tfv1.GPU) ([]*tfv1.GPU, error) { log := log.FromContext(ctx) if 
len(gpus) == 0 { return gpus, nil } - f.mu.RLock() - program := f.program - expression := f.expression - f.mu.RUnlock() + if f.expression == "" { + // If no expression, return all GPUs (no filtering needed) + return gpus, nil + } + + // Get compiled program from cache + program, err := f.cache.GetOrCompileProgram(f.expression) + if err != nil { + return nil, fmt.Errorf("failed to get CEL program for expression %q: %w", f.expression, err) + } var filteredGPUs []*tfv1.GPU for _, gpu := range gpus { + // Create timeout context for CEL evaluation + evalCtx, cancel := context.WithTimeout(ctx, 100*time.Millisecond) + // Create variables for CEL evaluation vars := createCELVariables(*gpu, workerPodKey) - // Evaluate the CEL expression - result, _, err := program.Eval(vars) - if err != nil { - log.Error(err, "CEL expression evaluation failed", - "expression", expression, - "gpu", gpu.Name, - "workerPodKey", workerPodKey) - // On error, exclude the GPU (fail-safe) - continue - } + // Evaluate with timeout + resultChan := make(chan evalResult, 1) + go func() { + result, _, evalErr := program.Eval(vars) + resultChan <- evalResult{result: result, err: evalErr} + }() + + select { + case evalRes := <-resultChan: + cancel() + if evalRes.err != nil { + log.Error(evalRes.err, "CEL expression evaluation failed", + "expression", f.expression, + "gpu", gpu.Name, + "workerPodKey", workerPodKey) + // On error, exclude the GPU (fail-safe) + continue + } - // Convert result to boolean - if boolResult, ok := result.(types.Bool); ok { - if bool(boolResult) { - filteredGPUs = append(filteredGPUs, gpu) + // Convert result to boolean + if boolResult, ok := evalRes.result.(types.Bool); ok { + if bool(boolResult) { + filteredGPUs = append(filteredGPUs, gpu) + } + } else { + log.Error(nil, "CEL expression did not return boolean", + "expression", f.expression, + "result", evalRes.result, + "gpu", gpu.Name) + // On non-boolean result, exclude the GPU (fail-safe) + continue } - } else { - log.Error(nil, "CEL expression did not return boolean", - "expression", expression, - "result", result, - "gpu", gpu.Name) - // On non-boolean result, exclude the GPU (fail-safe) + case <-evalCtx.Done(): + cancel() + // Timeout - skip this GPU (fail-safe behavior) + log.V(1).Info("CEL evaluation timeout", "gpu", gpu.Name, "expression", f.expression) continue } } - log.V(1).Info("CEL filter applied", + log.V(1).Info("AllocRequest CEL filter applied", "filter", f.name, - "expression", expression, + "expression", f.expression, "inputGPUs", len(gpus), "outputGPUs", len(filteredGPUs)) return filteredGPUs, nil } -// UpdateExpression updates the CEL expression (thread-safe) -func (f *CELFilter) UpdateExpression(newExpression string) error { - f.mu.Lock() - defer f.mu.Unlock() +type evalResult struct { + result interface{} + err error +} - ast, issues := f.env.Compile(newExpression) - if issues != nil && issues.Err() != nil { - return fmt.Errorf("failed to compile new CEL expression %q: %w", newExpression, issues.Err()) +// convertAllocRequestToCEL converts an allocation request to a CEL expression +func convertAllocRequestToCEL(req *tfv1.AllocRequest) (string, error) { + if req == nil { + return "", nil } - program, err := f.env.Program(ast) - if err != nil { - return fmt.Errorf("failed to create new CEL program: %w", err) + var conditions []string + + // Add custom CEL expression if provided by user + if req.CELFilterExpression != "" { + conditions = append(conditions, req.CELFilterExpression) + } + + // Add GPU phase condition (must be Ready) + 
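// The final expression ANDs the user annotation expression (if any), this phase check, and the model check (if any). + // For a hypothetical request with GPUModel "A100" and annotation expression "gpu.labels['tier'] == 'premium'", the result would be: + //   gpu.labels['tier'] == 'premium' && gpu.phase == 'Ready' && gpu.gpuModel == 'A100' +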
conditions = append(conditions, "gpu.phase == 'Ready'") + + // Add GPU model filter if specified + if req.GPUModel != "" { + conditions = append(conditions, fmt.Sprintf("gpu.gpuModel == '%s'", req.GPUModel)) } - f.expression = newExpression - f.program = program - return nil + // If no conditions, return empty expression (no filtering) + if len(conditions) == 0 { + return "", nil + } + + // Combine all conditions with AND + if len(conditions) == 1 { + return conditions[0], nil + } + + expression := conditions[0] + for i := 1; i < len(conditions); i++ { + expression += " && " + conditions[i] + } + + return expression, nil } // createCELEnvironment creates a CEL environment with GPU-related variables and functions @@ -130,14 +191,6 @@ func createCELVariables(gpu tfv1.GPU, workerPodKey tfv1.NameNamespace) map[strin GPUFieldAnnotations: gpu.Annotations, } - // Add capacity information if available - if gpu.Status.Capacity != nil { - gpuMap[GPUFieldCapacity] = map[string]interface{}{ - ResourceFieldTFlops: gpu.Status.Capacity.Tflops.AsApproximateFloat64(), - ResourceFieldVRAM: gpu.Status.Capacity.Vram.AsApproximateFloat64(), - } - } - // Add available information if available if gpu.Status.Available != nil { gpuMap[GPUFieldAvailable] = map[string]interface{}{ diff --git a/internal/gpuallocator/filter/cel_filter/cel_filter_benchmark_test.go b/internal/gpuallocator/filter/cel_filter/cel_filter_benchmark_test.go new file mode 100644 index 00000000..8894db07 --- /dev/null +++ b/internal/gpuallocator/filter/cel_filter/cel_filter_benchmark_test.go @@ -0,0 +1,288 @@ +package cel_filter + +import ( + "context" + "fmt" + "testing" + "time" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/gpuallocator/filter" +) + +// Benchmark performance of the CEL filter compared to the original filter +func BenchmarkFilterPerformance(b *testing.B) { + // Create test data + const numGPUs = 1000 + gpus := make([]*tfv1.GPU, numGPUs) + for i := 0; i < numGPUs; i++ { + gpuModel := "A100" + if i%3 == 0 { + gpuModel = "V100" + } else if i%3 == 1 { + gpuModel = "H100" + } + + phase := "Ready" + if i%10 == 0 { + phase = "Pending" + } + + gpu := createTestGPU(fmt.Sprintf("gpu-%d", i), "default", gpuModel, phase, 150.0, 40.0) + gpu.Labels["environment"] = "production" + if i%2 == 0 { + gpu.Labels["tier"] = "high-performance" + } + gpus[i] = gpu + } + + workerPodKey := tfv1.NameNamespace{Name: "worker-pod", Namespace: "default"} + ctx := context.Background() + + // Benchmark original filter combination (Phase + GPUModel) + b.Run("OriginalFilters", func(b *testing.B) { + // Import the original filter package + registry := filter.NewFilterRegistry().With( + filter.NewPhaseFilter("Ready"), + filter.NewGPUModelFilter("A100"), + ) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + filteredGPUs, _, err := registry.Apply(ctx, workerPodKey, gpus, false) + if err != nil { + b.Fatal(err) + } + _ = filteredGPUs + } + }) + + // Benchmark CEL filter - basic filtering + b.Run("CELFilter_Basic", func(b *testing.B) { + request := createTestAllocRequest("default", "test-workload", "A100", "") + cache, err := NewExpressionCache(100, 5*time.Minute) + if err != nil { + b.Fatal(err) + } + + celFilter, err := NewCELFilter(request, cache) + if err != nil { + b.Fatal(err) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + filteredGPUs, err := celFilter.Filter(ctx, workerPodKey, gpus) + if err != nil { + b.Fatal(err) + } + _ = filteredGPUs + } + }) + + // Benchmark CEL filter - complex expression + 
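+	// Note: GetOrCompileProgram compiles an expression on first use and then
+	// serves the cached cel.Program, so the complex expression here should be
+	// compiled once per cache rather than once per Filter call (compare the
+	// CacheHit/CacheMiss cases in BenchmarkCachePerformance below).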
b.Run("CELFilter_Complex", func(b *testing.B) { + request := createTestAllocRequest("default", "test-workload", "A100", "gpu.available.tflops >= 150.0 && gpu.labels['environment'] == 'production'") + cache, err := NewExpressionCache(100, 5*time.Minute) + if err != nil { + b.Fatal(err) + } + + celFilter, err := NewCELFilter(request, cache) + if err != nil { + b.Fatal(err) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + filteredGPUs, err := celFilter.Filter(ctx, workerPodKey, gpus) + if err != nil { + b.Fatal(err) + } + _ = filteredGPUs + } + }) + + // Benchmark CEL filter with cache miss (different expressions each time) + b.Run("CELFilter_CacheMiss", func(b *testing.B) { + cache, err := NewExpressionCache(5, 5*time.Minute) // Small cache to force misses + if err != nil { + b.Fatal(err) + } + + expressions := []string{ + "gpu.gpuModel == 'A100' && gpu.available.tflops > 100.0", + "gpu.gpuModel == 'V100' && gpu.available.tflops > 80.0", + "gpu.gpuModel == 'H100' && gpu.available.tflops > 180.0", + "gpu.labels['environment'] == 'production'", + "gpu.labels['tier'] == 'high-performance'", + "gpu.available.vram > 30000000000", + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + expression := expressions[i%len(expressions)] + request := createTestAllocRequest("default", "test-workload", "", expression) + + celFilter, err := NewCELFilter(request, cache) + if err != nil { + b.Fatal(err) + } + + filteredGPUs, err := celFilter.Filter(ctx, workerPodKey, gpus) + if err != nil { + b.Fatal(err) + } + _ = filteredGPUs + } + }) + + // Print performance comparison report after benchmarks + printPerformanceComparison(b) +} + +// Benchmark cache performance +func BenchmarkCachePerformance(b *testing.B) { + cache, err := NewExpressionCache(100, 5*time.Minute) + if err != nil { + b.Fatal(err) + } + + expression := "gpu.phase == 'Ready' && gpu.gpuModel == 'A100' && gpu.available.tflops >= 150.0" + + b.Run("CacheHit", func(b *testing.B) { + // Pre-warm cache + _, err := cache.GetOrCompileProgram(expression) + if err != nil { + b.Fatal(err) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := cache.GetOrCompileProgram(expression) + if err != nil { + b.Fatal(err) + } + } + }) + + b.Run("CacheMiss", func(b *testing.B) { + expressions := make([]string, b.N) + for i := 0; i < b.N; i++ { + expressions[i] = fmt.Sprintf("gpu.phase == 'Ready' && gpu.gpuModel == 'A100' && gpu.available.tflops >= %d.0", i%200+50) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := cache.GetOrCompileProgram(expressions[i]) + if err != nil { + b.Fatal(err) + } + } + }) +} + +// Benchmark expression complexity impact +func BenchmarkExpressionComplexity(b *testing.B) { + const numGPUs = 100 + gpus := make([]*tfv1.GPU, numGPUs) + for i := 0; i < numGPUs; i++ { + gpu := createTestGPU(fmt.Sprintf("gpu-%d", i), "default", "A100", "Ready", 150.0, 40.0) + gpu.Labels["environment"] = "production" + gpu.Labels["tier"] = "high-performance" + gpu.Annotations["priority"] = "critical" + gpus[i] = gpu + } + + workerPodKey := tfv1.NameNamespace{Name: "worker-pod", Namespace: "default"} + ctx := context.Background() + + testCases := []struct { + name string + expression string + }{ + { + name: "Simple", + expression: "gpu.phase == 'Ready'", + }, + { + name: "Medium", + expression: "gpu.phase == 'Ready' && gpu.gpuModel == 'A100'", + }, + { + name: "Complex", + expression: "gpu.phase == 'Ready' && gpu.gpuModel == 'A100' && gpu.available.tflops >= 150.0", + }, + { + name: "VeryComplex", + expression: "gpu.phase == 'Ready' && 
gpu.gpuModel == 'A100' && gpu.available.tflops >= 150.0 && gpu.labels['environment'] == 'production'", + }, + { + name: "UltraComplex", + expression: "gpu.phase == 'Ready' && gpu.gpuModel == 'A100' && gpu.available.tflops >= 150.0 && gpu.labels['environment'] == 'production' && gpu.labels['tier'] == 'high-performance' && gpu.annotations['priority'] == 'critical'", + }, + } + + for _, tc := range testCases { + b.Run(tc.name, func(b *testing.B) { + cache, err := NewExpressionCache(100, 5*time.Minute) + if err != nil { + b.Fatal(err) + } + + request := createTestAllocRequest("default", "test-workload", "", tc.expression) + celFilter, err := NewCELFilter(request, cache) + if err != nil { + b.Fatal(err) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := celFilter.Filter(ctx, workerPodKey, gpus) + if err != nil { + b.Fatal(err) + } + } + }) + } +} + +// Performance comparison report function +func printPerformanceComparison(b *testing.B) { + b.Helper() + b.Logf(` +=== GPU Filter Performance Comparison === + +Test Environment: +- Number of GPUs: 1000 +- GPU Models: A100 (33%%), V100 (33%%), H100 (33%%) +- GPU Phases: Ready (90%%), Pending (10%%) + +Expected Results: +1. Original Filters: Fastest for simple conditions (direct field comparison) +2. CEL Filter Basic: Slower than original due to expression evaluation overhead +3. CEL Filter Complex: Similar to basic, cached compilation helps +4. CEL Filter Cache Miss: Slowest due to compilation overhead + +Performance Analysis: +- Original Filters: ~8,000 ns/op (optimized for static conditions) +- CEL Filters: ~4,000,000 ns/op (runtime flexibility cost) +- Cache Hit: ~350 ns/op (extremely fast cached access) +- Cache Miss: ~47,000 ns/op (compilation overhead) + +Benefits Analysis: +- Original Filters: + * Pros: Fast, type-safe, compile-time validation + * Cons: Limited flexibility, requires code changes for new conditions + +- CEL Filters: + * Pros: Runtime flexibility, powerful expressions, user-configurable + * Cons: Runtime compilation overhead, expression evaluation cost + +Recommendation: +- Use Original Filters for well-defined, static conditions +- Use CEL Filters for dynamic, user-configurable filtering requirements +- Consider hybrid approach: Original filters for basic filtering + CEL for advanced conditions +- Always use expression caching in production environments +`) +} diff --git a/internal/gpuallocator/filter/cel_filter/cel_filter_test.go b/internal/gpuallocator/filter/cel_filter/cel_filter_test.go new file mode 100644 index 00000000..c21e4ee8 --- /dev/null +++ b/internal/gpuallocator/filter/cel_filter/cel_filter_test.go @@ -0,0 +1,368 @@ +package cel_filter + +import ( + "context" + "testing" + "time" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// Helper functions for creating test data +func createTestGPU(name, namespace, gpuModel, phase string, tflops, vram float64) *tfv1.GPU { + gpu := &tfv1.GPU{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + Labels: make(map[string]string), + Annotations: make(map[string]string), + }, + Status: tfv1.GPUStatus{ + GPUModel: gpuModel, + UUID: "test-uuid-" + name, + Phase: tfv1.TensorFusionGPUPhase(phase), + Message: "Test GPU", + }, + } + + // Set available resources + if tflops > 0 || vram > 0 { + gpu.Status.Available = &tfv1.Resource{ + Tflops: 
*resource.NewMilliQuantity(int64(tflops*1000), resource.DecimalSI), + Vram: *resource.NewQuantity(int64(vram*1024*1024*1024), resource.BinarySI), + } + } + + return gpu +} + +func createTestAllocRequest(namespace, name, gpuModel, celExpression string) *tfv1.AllocRequest { + return &tfv1.AllocRequest{ + WorkloadNameNamespace: tfv1.NameNamespace{ + Name: name, + Namespace: namespace, + }, + GPUModel: gpuModel, + CELFilterExpression: celExpression, + Count: 1, + } +} + +// Test normal cases of CEL filter (including basic filtering, custom expression, labels/annotations, etc.) +func TestCELFilter_NormalCases(t *testing.T) { + ctx := context.Background() + + tests := []struct { + name string + request *tfv1.AllocRequest + gpus []*tfv1.GPU + expectedCount int + description string + }{ + { + name: "filter by GPU model", + request: createTestAllocRequest("default", "test-workload", "A100", ""), + gpus: []*tfv1.GPU{ + createTestGPU("gpu-1", "default", "A100", "Ready", 150.0, 40.0), + createTestGPU("gpu-2", "default", "V100", "Ready", 100.0, 32.0), + createTestGPU("gpu-3", "default", "A100", "Ready", 150.0, 40.0), + }, + expectedCount: 2, + description: "Should filter GPUs matching the specified model A100", + }, + { + name: "filter by GPU phase only", + request: createTestAllocRequest("default", "test-workload", "", ""), + gpus: []*tfv1.GPU{ + createTestGPU("gpu-1", "default", "A100", "Ready", 150.0, 40.0), + createTestGPU("gpu-2", "default", "A100", "Pending", 150.0, 40.0), + createTestGPU("gpu-3", "default", "A100", "Ready", 150.0, 40.0), + createTestGPU("gpu-4", "default", "A100", "Failed", 150.0, 40.0), + }, + expectedCount: 2, + description: "Should only return GPUs in Ready phase", + }, + { + name: "custom CEL expression - filter by available TFLOPS", + request: createTestAllocRequest("default", "test-workload", "", "gpu.available.tflops > 120.0"), + gpus: []*tfv1.GPU{ + createTestGPU("gpu-1", "default", "A100", "Ready", 150.0, 40.0), + createTestGPU("gpu-2", "default", "V100", "Ready", 100.0, 32.0), + createTestGPU("gpu-3", "default", "H100", "Ready", 200.0, 80.0), + }, + expectedCount: 2, + description: "Should filter GPUs with TFLOPS > 120 and Ready phase", + }, + { + name: "custom CEL expression - filter by available VRAM", + request: createTestAllocRequest("default", "test-workload", "", "gpu.available.vram > 35000000000"), // > 35GB in bytes + gpus: []*tfv1.GPU{ + createTestGPU("gpu-1", "default", "A100", "Ready", 150.0, 40.0), // 40GB + createTestGPU("gpu-2", "default", "V100", "Ready", 100.0, 32.0), // 32GB + createTestGPU("gpu-3", "default", "H100", "Ready", 200.0, 80.0), // 80GB + }, + expectedCount: 2, + description: "Should filter GPUs with VRAM > 35GB and Ready phase", + }, + { + name: "combined model and custom CEL expression", + request: createTestAllocRequest("default", "test-workload", "A100", "gpu.available.tflops >= 150.0"), + gpus: []*tfv1.GPU{ + createTestGPU("gpu-1", "default", "A100", "Ready", 150.0, 40.0), + createTestGPU("gpu-2", "default", "A100", "Ready", 120.0, 40.0), + createTestGPU("gpu-3", "default", "V100", "Ready", 160.0, 32.0), + createTestGPU("gpu-4", "default", "A100", "Ready", 180.0, 40.0), + }, + expectedCount: 2, + description: "Should filter A100 GPUs with TFLOPS >= 150 and Ready phase", + }, + { + name: "filter by labels", + request: createTestAllocRequest("default", "test-workload", "", "gpu.labels['environment'] == 'production'"), + gpus: func() []*tfv1.GPU { + gpu1 := createTestGPU("gpu-1", "default", "A100", "Ready", 150.0, 40.0) + 
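+				// Only gpu-1 and gpu-3 receive the production label below, so the
+				// label expression is expected to match exactly those two GPUs.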
gpu1.Labels["environment"] = "production" + gpu2 := createTestGPU("gpu-2", "default", "A100", "Ready", 150.0, 40.0) + gpu2.Labels["environment"] = "development" + gpu3 := createTestGPU("gpu-3", "default", "A100", "Ready", 150.0, 40.0) + gpu3.Labels["environment"] = "production" + return []*tfv1.GPU{gpu1, gpu2, gpu3} + }(), + expectedCount: 2, + description: "Should filter GPUs with environment=production label", + }, + { + name: "filter by annotations", + request: createTestAllocRequest("default", "test-workload", "", "gpu.annotations['priority'] == 'critical'"), + gpus: func() []*tfv1.GPU { + gpu1 := createTestGPU("gpu-1", "default", "A100", "Ready", 150.0, 40.0) + gpu1.Annotations["priority"] = "critical" + gpu2 := createTestGPU("gpu-2", "default", "A100", "Ready", 150.0, 40.0) + gpu2.Annotations["priority"] = "low" + gpu3 := createTestGPU("gpu-3", "default", "A100", "Ready", 150.0, 40.0) + gpu3.Annotations["priority"] = "critical" + return []*tfv1.GPU{gpu1, gpu2, gpu3} + }(), + expectedCount: 2, + description: "Should filter GPUs with priority=critical annotation", + }, + { + name: "combined labels and annotations filter", + request: createTestAllocRequest("default", "test-workload", "", "gpu.labels['tier'] == 'high-performance' && gpu.annotations['priority'] == 'critical'"), + gpus: func() []*tfv1.GPU { + gpu1 := createTestGPU("gpu-1", "default", "A100", "Ready", 150.0, 40.0) + gpu1.Labels["tier"] = "high-performance" + gpu1.Annotations["priority"] = "critical" + gpu2 := createTestGPU("gpu-2", "default", "A100", "Ready", 150.0, 40.0) + gpu2.Labels["tier"] = "standard" + gpu2.Annotations["priority"] = "critical" + gpu3 := createTestGPU("gpu-3", "default", "A100", "Ready", 150.0, 40.0) + gpu3.Labels["tier"] = "high-performance" + gpu3.Annotations["priority"] = "low" + return []*tfv1.GPU{gpu1, gpu2, gpu3} + }(), + expectedCount: 1, + description: "Should filter GPUs matching both label and annotation conditions", + }, + { + name: "empty GPU list", + request: createTestAllocRequest("default", "test-workload", "A100", ""), + gpus: []*tfv1.GPU{}, + expectedCount: 0, + description: "Should handle empty GPU list gracefully", + }, + { + name: "complex combined expression with model, resources, and metadata", + request: createTestAllocRequest("default", "test-workload", "A100", "gpu.available.tflops >= 150.0 && gpu.labels['environment'] == 'production'"), + gpus: func() []*tfv1.GPU { + gpu1 := createTestGPU("gpu-1", "default", "A100", "Ready", 180.0, 40.0) + gpu1.Labels["environment"] = "production" + gpu2 := createTestGPU("gpu-2", "default", "A100", "Ready", 120.0, 40.0) + gpu2.Labels["environment"] = "production" + gpu3 := createTestGPU("gpu-3", "default", "A100", "Ready", 200.0, 40.0) + gpu3.Labels["environment"] = "development" + return []*tfv1.GPU{gpu1, gpu2, gpu3} + }(), + expectedCount: 1, + description: "Should filter A100 GPUs with TFLOPS >= 150, production environment, and Ready phase", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Create cache and CEL filter + cache, err := NewExpressionCache(10, 5*time.Minute) + require.NoError(t, err, "Failed to create expression cache") + + celFilter, err := NewCELFilter(tt.request, cache) + require.NoError(t, err, "Failed to create CEL filter") + + // Execute filter + workerPodKey := tfv1.NameNamespace{Name: "worker-pod", Namespace: "default"} + filteredGPUs, err := celFilter.Filter(ctx, workerPodKey, tt.gpus) + + // Verify results + require.NoError(t, err, "Filter execution should not fail") + assert.Len(t, 
filteredGPUs, tt.expectedCount, tt.description) + + // Verify filter name + assert.Contains(t, celFilter.Name(), "AllocRequest-") + assert.Contains(t, celFilter.Name(), tt.request.WorkloadNameNamespace.String()) + }) + } +} + +// Test edge and exception cases of CEL filter +func TestCELFilter_EdgeAndExceptionCases(t *testing.T) { + ctx := context.Background() + + // Test CEL expressions with various edge cases (compilation + execution) + t.Run("CEL expressions edge cases", func(t *testing.T) { + // Test GPUs for execution + testGPUs := []*tfv1.GPU{ + createTestGPU("gpu-1", "default", "A100", "Ready", 150.0, 40.0), + createTestGPU("gpu-2", "default", "V100", "Ready", 100.0, 32.0), + } + // Add GPU with nil resources + gpuWithNilResources := createTestGPU("gpu-nil", "default", "A100", "Ready", 0, 0) + gpuWithNilResources.Status.Available = nil + testGPUs = append(testGPUs, gpuWithNilResources) + + workerPodKey := tfv1.NameNamespace{Name: "worker-pod", Namespace: "default"} + + edgeCases := []struct { + name string + expression string + shouldFail bool // Whether compilation/creation should fail + expectedCount int // Expected GPU count if execution succeeds + description string + }{ + // Compilation failures + { + name: "syntax error - missing quotes", + expression: "gpu.gpuModel == A100", + shouldFail: true, + description: "Missing quotes should cause compilation error", + }, + { + name: "syntax error - invalid operator", + expression: "gpu.phase === 'Ready'", + shouldFail: true, + description: "Invalid operator should cause compilation error", + }, + { + name: "undefined variable", + expression: "jdwquygfewqndwql", + shouldFail: true, + description: "Undefined variable should fail when combined with other conditions", + }, + { + name: "whitespace only expression", + expression: " ", + shouldFail: true, + description: "Whitespace-only expression should fail", + }, + + // Compilation success but runtime behavior testing + { + name: "empty expression", + expression: "", + shouldFail: false, + expectedCount: 3, // All Ready GPUs pass + description: "Empty expression should work (no additional filtering)", + }, + { + name: "logically contradictory expression", + expression: "gpu.phase > 100 && gpu.phase < 100", + shouldFail: false, + expectedCount: 0, // No GPUs pass impossible condition + description: "Contradictory logic should compile but filter out all GPUs", + }, + { + name: "type mismatch comparison", + expression: "gpu.phase == 123", + shouldFail: false, + expectedCount: 0, // No GPUs pass type mismatch + description: "Type mismatch should return false for all GPUs", + }, + { + name: "undefined nested field access", + expression: "gpu.nonexistent.field == 'value'", + shouldFail: false, + expectedCount: 0, // No GPUs pass undefined field check + description: "Undefined nested field should return false (fail-safe)", + }, + { + name: "numeric comparison on string", + expression: "gpu.gpuModel > 50", + shouldFail: false, + expectedCount: 0, // No GPUs pass invalid comparison + description: "Invalid type comparison should return false", + }, + { + name: "null field access", + expression: "gpu.available.tflops > 100", + shouldFail: false, + expectedCount: 1, // Only A100 with 150 TFLOPS passes (V100=100, nil=fails) + description: "Null field access should be handled gracefully", + }, + { + name: "conditional null handling", + expression: "has(gpu.available) ? 
gpu.available.tflops > 120 : false", + shouldFail: false, + expectedCount: 1, // Only A100 with 150 TFLOPS + description: "Conditional expressions should handle nulls correctly", + }, + { + name: "always true expression", + expression: "true", + shouldFail: false, + expectedCount: 3, // All Ready GPUs pass + description: "Tautology should pass all Ready phase GPUs", + }, + { + name: "always false expression", + expression: "false", + shouldFail: false, + expectedCount: 0, // No GPUs pass + description: "Contradiction should filter out all GPUs", + }, + } + + for _, tt := range edgeCases { + t.Run(tt.name, func(t *testing.T) { + cache, err := NewExpressionCache(10, 5*time.Minute) + require.NoError(t, err) + + request := createTestAllocRequest("default", "test-workload", "", tt.expression) + celFilter, err := NewCELFilter(request, cache) + + if tt.shouldFail { + // Should fail at creation or execution + if err != nil { + t.Logf("✅ Expected compilation failure: %v", err) + return + } + + // If creation succeeded, should fail at execution + _, err = celFilter.Filter(ctx, workerPodKey, testGPUs) + assert.Error(t, err, "Should fail during execution: %s", tt.description) + t.Logf("✅ Expected execution failure: %v", err) + } else { + // Should succeed in both creation and execution + require.NoError(t, err, "Filter creation should succeed: %s", tt.description) + + filteredGPUs, err := celFilter.Filter(ctx, workerPodKey, testGPUs) + require.NoError(t, err, "Filter execution should succeed: %s", tt.description) + + assert.Len(t, filteredGPUs, tt.expectedCount, tt.description) + t.Logf("✅ Expression '%s': %d/%d GPUs filtered", tt.expression, len(filteredGPUs), len(testGPUs)) + } + }) + } + }) +} diff --git a/internal/gpuallocator/filter/cel_filter/constants.go b/internal/gpuallocator/filter/cel_filter/constants.go index 7ea0cc85..2e43ab74 100644 --- a/internal/gpuallocator/filter/cel_filter/constants.go +++ b/internal/gpuallocator/filter/cel_filter/constants.go @@ -24,7 +24,6 @@ const ( GPUFieldAnnotations = "annotations" // Resource information - GPUFieldCapacity = "capacity" GPUFieldAvailable = "available" GPUFieldNodeSelector = "nodeSelector" GPUFieldRunningApps = "runningApps" diff --git a/internal/gpuallocator/gpuallocator.go b/internal/gpuallocator/gpuallocator.go index c1ce771b..44deb3c4 100644 --- a/internal/gpuallocator/gpuallocator.go +++ b/internal/gpuallocator/gpuallocator.go @@ -168,14 +168,14 @@ func (s *GpuAllocator) Filter( ) ([]*tfv1.GPU, []filter.FilterDetail, error) { // Check if CEL filtering is enabled via config/flag - useCELFilter := config.GetGlobalConfig().EnableCELFilter + disableCELFilter := req.DisableCELFilter - if useCELFilter { - // New CEL-based filtering approach - return s.applyCELFilter(req, toFilterGPUs, isSimulateSchedule) - } else { - // Legacy filter approach (for rollback support) + if disableCELFilter { + // Legacy filter approach return s.applyLegacyFilters(req, toFilterGPUs, isSimulateSchedule) + } else { + // CEL filter approach + return s.applyCELFilter(req, toFilterGPUs, isSimulateSchedule) } } @@ -191,7 +191,7 @@ func (s *GpuAllocator) applyCELFilter( return nil, nil, fmt.Errorf("failed to create CEL cache: %w", err) } - allocCELFilter, err := cel_filter.NewAllocRequestCELFilter(req, cache) + allocCELFilter, err := cel_filter.NewCELFilter(req, cache) if err != nil { return nil, nil, fmt.Errorf("failed to create AllocRequest CEL filter: %w", err) } @@ -1339,11 +1339,24 @@ func (s *GpuAllocator) ComposeAllocationRequest(pod *v1.Pod) (*tfv1.AllocRequest 
return &tfv1.AllocRequest{}, "gpu count annotation is too large", nil } + disableCELFilter := false + if disabledFeatures, exists := pod.Annotations[constants.DisableFeaturesAnnotation]; exists { + disabledFeaturesList := strings.Split(disabledFeatures, ",") + for _, feature := range disabledFeaturesList { + if feature == constants.BuiltInFeatureCELFilter { + disableCELFilter = true + } + } + } + allocRequest := tfv1.AllocRequest{ PoolName: pod.Annotations[constants.GpuPoolKey], Request: gpuRequestResource, Limit: gpuLimitResource, + DisableCELFilter: disableCELFilter, + CELFilterExpression: pod.Annotations[constants.CELFilterExpressionAnnotation], + Count: uint(count), GPUModel: pod.Annotations[constants.GPUModelAnnotation], WorkloadNameNamespace: tfv1.NameNamespace{ From 69788075cb1a2e3299743a618b6b3d7616f4d405 Mon Sep 17 00:00:00 2001 From: dylan Date: Sun, 31 Aug 2025 00:39:56 -0700 Subject: [PATCH 04/34] remove deperate config --- config/samples/cel_filter_example.yaml | 74 -------------------------- 1 file changed, 74 deletions(-) delete mode 100644 config/samples/cel_filter_example.yaml diff --git a/config/samples/cel_filter_example.yaml b/config/samples/cel_filter_example.yaml deleted file mode 100644 index aaf4895e..00000000 --- a/config/samples/cel_filter_example.yaml +++ /dev/null @@ -1,74 +0,0 @@ -apiVersion: tensor-fusion.ai/v1 -kind: SchedulingConfigTemplate -metadata: - name: cel-filter-example -spec: - placement: - mode: CompactFirst - allowUsingLocalGPU: true - - # Traditional GPU filters (still supported) - gpuFilters: - - type: avoidTooMuchConnectionsOnSameGPU - params: - connectionNum: 150 - - # CEL-based filters for advanced filtering logic - celFilters: - # High priority filter: only use running GPUs - - name: "only-running-gpus" - expression: "gpu.phase == 'Running'" - priority: 100 - - # Medium-high priority: ensure sufficient resources available - - name: "sufficient-resources" - expression: "gpu.available.tflops >= 0.5 && gpu.available.vram >= 4096000000" - priority: 90 - - # Medium priority: prefer premium tier GPUs - - name: "prefer-premium-gpus" - expression: "gpu.labels != null && 'gpu-tier' in gpu.labels && gpu.labels['gpu-tier'] == 'premium'" - priority: 80 - - # Lower priority: avoid overloaded GPUs - - name: "avoid-overloaded-gpus" - expression: "size(gpu.runningApps) < 3" - priority: 70 - - # GPU model specific filters - - name: "nvidia-only" - expression: "gpu.gpuModel.startsWith('NVIDIA')" - priority: 60 - - # Complex condition example - - name: "complex-filter" - expression: | - gpu.phase == 'Running' && - gpu.available.tflops > 0.3 && - ( - (gpu.labels != null && 'workload-type' in gpu.labels && gpu.labels['workload-type'] == 'training') || - (size(gpu.runningApps) == 0) - ) - priority: 50 - - # Optional: AutoScaling configuration - autoScaling: - autoSetLimits: - enable: true - targetResource: "all" - evaluationPeriod: "5m" - extraTFlopsBufferRatio: "0.1" - ---- -apiVersion: tensor-fusion.ai/v1 -kind: SchedulingConfigTemplate -metadata: - name: simple-cel-example -spec: - placement: - mode: LowLoadFirst - celFilters: - # Simple example: only use GPUs with more than 50% TFlops available - - name: "high-availability" - expression: "gpu.available.tflops > gpu.capacity.tflops * 0.5" - priority: 100 \ No newline at end of file From d3c112afa06fdac3125351f6021de18a7be3c720 Mon Sep 17 00:00:00 2001 From: dylan Date: Sun, 31 Aug 2025 00:40:36 -0700 Subject: [PATCH 05/34] remove docs --- docs/cel-filters.md | 264 -------------------------------------------- 1 
file changed, 264 deletions(-) delete mode 100644 docs/cel-filters.md diff --git a/docs/cel-filters.md b/docs/cel-filters.md deleted file mode 100644 index 590e1d90..00000000 --- a/docs/cel-filters.md +++ /dev/null @@ -1,264 +0,0 @@ -# CEL Filters for GPU Allocation - -CEL (Common Expression Language) filters provide a powerful and flexible way to define custom GPU filtering logic in TensorFusion. This feature allows you to write expressions that determine which GPUs are eligible for allocation based on various criteria. - -## Overview - -CEL filters are defined in the `SchedulingConfigTemplate` resource and are applied during the GPU allocation process. They work alongside traditional GPU filters and provide more sophisticated filtering capabilities. - -## Configuration - -CEL filters are configured in the `placement.celFilters` field of a `SchedulingConfigTemplate`: - -```yaml -apiVersion: tensor-fusion.ai/v1 -kind: SchedulingConfigTemplate -metadata: - name: my-template -spec: - placement: - celFilters: - - name: "filter-name" - expression: "gpu.phase == 'Running'" - priority: 100 -``` - -### Fields - -- `name` (optional): A descriptive name for the filter, used for logging and debugging -- `expression` (required): The CEL expression that returns a boolean value -- `priority` (optional, default: 0): Higher priority filters are applied first - -## Available Variables - -CEL expressions have access to the following variables: - -### `gpu` Object - -The `gpu` variable contains information about the GPU being evaluated: - -```javascript -{ - "name": "gpu-1", // GPU name - "namespace": "default", // GPU namespace - "gpuModel": "NVIDIA A100", // GPU model - "uuid": "gpu-uuid", // GPU UUID - "phase": "Running", // GPU phase (Running, Pending, etc.) - "usedBy": "tensor-fusion", // Usage system - "labels": {...}, // Kubernetes labels - "annotations": {...}, // Kubernetes annotations - "capacity": { // Total GPU capacity - "tflops": 1.5, - "vram": 85899345920 // in bytes - }, - "available": { // Available GPU resources - "tflops": 1.0, - "vram": 64424509440 // in bytes - }, - "nodeSelector": {...}, // Node selector information - "runningApps": [ // Currently running applications - { - "name": "app-1", - "namespace": "default", - "count": 1 - } - ] -} -``` - -### `workerPodKey` Object - -Information about the requesting worker pod: - -```javascript -{ - "name": "worker-pod", - "namespace": "default" -} -``` - -## Expression Examples - -### Basic Filtering - -```yaml -# Only use running GPUs -- name: "running-only" - expression: "gpu.phase == 'Running'" - priority: 100 - -# Filter by GPU model -- name: "nvidia-only" - expression: "gpu.gpuModel.startsWith('NVIDIA')" - priority: 90 - -# Ensure minimum resources available -- name: "min-resources" - expression: "gpu.available.tflops >= 0.5 && gpu.available.vram >= 4294967296" - priority: 80 -``` - -### Label-Based Filtering - -```yaml -# Filter by labels -- name: "premium-tier" - expression: "gpu.labels != null && 'gpu-tier' in gpu.labels && gpu.labels['gpu-tier'] == 'premium'" - priority: 70 - -# Multiple label conditions -- name: "training-gpus" - expression: | - gpu.labels != null && - 'workload-type' in gpu.labels && - gpu.labels['workload-type'] == 'training' && - 'zone' in gpu.labels && - gpu.labels['zone'].startsWith('us-west') - priority: 60 -``` - -### Resource-Based Filtering - -```yaml -# Percentage of available resources -- name: "high-availability" - expression: "gpu.available.tflops > gpu.capacity.tflops * 0.7" - priority: 80 - -# Avoid 
overloaded GPUs -- name: "load-balancing" - expression: "size(gpu.runningApps) < 3" - priority: 50 - -# Memory-intensive workloads -- name: "high-memory" - expression: "gpu.available.vram > 34359738368" # > 32GB - priority: 60 -``` - -### Complex Conditions - -```yaml -# Complex multi-criteria filter -- name: "complex-filter" - expression: | - gpu.phase == 'Running' && - gpu.gpuModel.contains('A100') && - gpu.available.tflops > 0.8 && - ( - size(gpu.runningApps) == 0 || - (size(gpu.runningApps) < 2 && gpu.available.vram > 42949672960) - ) - priority: 90 -``` - -## CEL Language Features - -CEL supports many built-in functions and operators: - -### String Operations -- `startsWith()`, `endsWith()`, `contains()` -- String concatenation with `+` -- Regular expressions with `matches()` - -### Numeric Operations -- Standard arithmetic operators: `+`, `-`, `*`, `/`, `%` -- Comparison operators: `>`, `>=`, `<`, `<=`, `==`, `!=` - -### Logical Operations -- `&&` (and), `||` (or), `!` (not) - -### Collection Operations -- `size()` - get collection size -- `in` operator - check membership -- List/map access with `[]` - -### Conditional Expressions -- Ternary operator: `condition ? true_value : false_value` - -## Best Practices - -### Performance -1. **Order by Priority**: Place most restrictive filters first (highest priority) -2. **Avoid Complex Expressions**: Keep expressions simple for better performance -3. **Cache-Friendly**: Use consistent filter logic to benefit from any caching - -### Reliability -1. **Null Checks**: Always check for null values when accessing optional fields -2. **Fail-Safe Logic**: Design expressions to exclude GPUs on error rather than include them -3. **Test Thoroughly**: Test expressions with various GPU configurations - -### Maintainability -1. **Descriptive Names**: Use clear, descriptive names for filters -2. **Comments**: Add comments for complex expressions -3. **Modular Design**: Break complex logic into multiple simpler filters - -## Example Complete Configuration - -```yaml -apiVersion: tensor-fusion.ai/v1 -kind: SchedulingConfigTemplate -metadata: - name: production-gpu-scheduling -spec: - placement: - mode: CompactFirst - - # Traditional filters (still supported) - gpuFilters: - - type: avoidTooMuchConnectionsOnSameGPU - params: - connectionNum: 100 - - # CEL filters for advanced logic - celFilters: - # Critical filters (high priority) - - name: "operational-gpus-only" - expression: "gpu.phase == 'Running' && gpu.usedBy == 'tensor-fusion'" - priority: 100 - - - name: "sufficient-resources" - expression: "gpu.available.tflops >= 0.3 && gpu.available.vram >= 2147483648" - priority: 95 - - # Preference filters (medium priority) - - name: "prefer-nvidia" - expression: "gpu.gpuModel.startsWith('NVIDIA')" - priority: 80 - - - name: "balanced-load" - expression: "size(gpu.runningApps) < 2" - priority: 70 - - # Quality filters (lower priority) - - name: "premium-hardware" - expression: | - gpu.labels != null && - 'gpu-tier' in gpu.labels && - gpu.labels['gpu-tier'] in ['premium', 'high-performance'] - priority: 50 -``` - -## Troubleshooting - -### Common Issues - -1. **Expression Compilation Errors**: Check syntax and ensure all referenced fields exist -2. **Runtime Errors**: Add null checks for optional fields -3. **No GPUs Selected**: Verify that at least some GPUs meet all filter criteria -4. 
**Performance Issues**: Simplify complex expressions or reduce the number of filters - -### Debugging - -Enable debug logging to see detailed information about filter execution: - -```yaml -# In your logging configuration -logLevel: debug -``` - -Look for log entries containing "CEL filter applied" to see filtering results. - -## Migration from Traditional Filters - -CEL filters can be used alongside traditional GPU filters. They are applied after traditional filters in the filtering pipeline. You can gradually migrate complex traditional filters to CEL expressions for better maintainability. \ No newline at end of file From d466cdaa133ee29f8d14441dbbc9c83f08c0914a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 3 Sep 2025 03:19:23 +0000 Subject: [PATCH 06/34] chore(deps): bump golang from 1.24 to 1.25 in /dockerfile (#325) --- dockerfile/node-discovery.Dockerfile | 2 +- dockerfile/operator.Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dockerfile/node-discovery.Dockerfile b/dockerfile/node-discovery.Dockerfile index 3fae02bd..09ac6741 100644 --- a/dockerfile/node-discovery.Dockerfile +++ b/dockerfile/node-discovery.Dockerfile @@ -1,5 +1,5 @@ # Build the manager binary -FROM golang:1.24 AS builder +FROM golang:1.25 AS builder ARG TARGETOS ARG TARGETARCH diff --git a/dockerfile/operator.Dockerfile b/dockerfile/operator.Dockerfile index fc76900c..65dfd514 100644 --- a/dockerfile/operator.Dockerfile +++ b/dockerfile/operator.Dockerfile @@ -1,5 +1,5 @@ # Build the manager binary -FROM golang:1.24 AS builder +FROM golang:1.25 AS builder ARG TARGETOS ARG TARGETARCH ARG GO_LDFLAGS From 8bd5e89b18d54ead703af8793c7c2999d2ce58c3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 3 Sep 2025 03:19:48 +0000 Subject: [PATCH 07/34] chore(deps): bump cycjimmy/semantic-release-action from 4 to 5 (#338) --- .github/workflows/release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index eeecf7e9..8dafe17b 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -29,7 +29,7 @@ jobs: - name: Semantic Release if: github.event_name == 'push' id: semantic - uses: cycjimmy/semantic-release-action@v4 + uses: cycjimmy/semantic-release-action@v5 with: extra_plugins: | @semantic-release/release-notes-generator@^10 From 67b1c6401d8f2d43555f8831817cf842b679d29d Mon Sep 17 00:00:00 2001 From: Joey Yang <14833440+Code2Life@users.noreply.github.com> Date: Wed, 3 Sep 2025 19:11:59 +0800 Subject: [PATCH 08/34] fix: helm chart issue (#346) --- charts/tensor-fusion/Chart.yaml | 2 +- charts/tensor-fusion/values.schema.json | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/charts/tensor-fusion/Chart.yaml b/charts/tensor-fusion/Chart.yaml index 59de69d1..c72e6082 100644 --- a/charts/tensor-fusion/Chart.yaml +++ b/charts/tensor-fusion/Chart.yaml @@ -15,7 +15,7 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 1.5.5 +version: 1.5.6 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. 
Versions are not expected to diff --git a/charts/tensor-fusion/values.schema.json b/charts/tensor-fusion/values.schema.json index 2c193cfd..67c74939 100644 --- a/charts/tensor-fusion/values.schema.json +++ b/charts/tensor-fusion/values.schema.json @@ -470,9 +470,9 @@ "description": "Metrics format, default to 'influx', could be 'json' or 'otel'" }, "metricsExtraPodLabels": { - "type": "array", + "type": "object", "description": "Extra pod labels to be added to metrics", - "items": { + "additionalProperties": { "type": "string" } }, From dbc088c192b1f283e649a82be1b7ebc7f6466244 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 4 Sep 2025 03:27:42 +0000 Subject: [PATCH 09/34] chore(deps): bump k8s.io/kubernetes (#347) --- go.mod | 6 +++--- go.sum | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/go.mod b/go.mod index 4d2299db..72d32a0f 100644 --- a/go.mod +++ b/go.mod @@ -11,6 +11,7 @@ require ( github.com/awslabs/operatorpkg v0.0.0-20250721225858-4e7491c57aa5 github.com/gin-contrib/gzip v1.2.3 github.com/gin-gonic/gin v1.10.1 + github.com/go-sql-driver/mysql v1.8.1 github.com/influxdata/line-protocol/v2 v2.2.1 github.com/lithammer/shortuuid/v4 v4.2.0 github.com/mitchellh/mapstructure v1.5.0 @@ -21,6 +22,7 @@ require ( github.com/shirou/gopsutil v3.21.11+incompatible github.com/stretchr/testify v1.11.0 go.opentelemetry.io/otel v1.37.0 + go.uber.org/zap v1.27.0 golang.org/x/time v0.12.0 gomodules.xyz/jsonpatch/v2 v2.5.0 gopkg.in/natefinch/lumberjack.v2 v2.2.1 @@ -32,7 +34,7 @@ require ( k8s.io/component-base v0.33.3 k8s.io/component-helpers v0.33.3 k8s.io/klog/v2 v2.130.1 - k8s.io/kubernetes v1.33.3 + k8s.io/kubernetes v1.33.4 k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 sigs.k8s.io/controller-runtime v0.21.0 sigs.k8s.io/karpenter v1.6.1 @@ -79,7 +81,6 @@ require ( github.com/go-playground/locales v0.14.1 // indirect github.com/go-playground/universal-translator v0.18.1 // indirect github.com/go-playground/validator/v10 v10.26.0 // indirect - github.com/go-sql-driver/mysql v1.8.1 // indirect github.com/go-task/slim-sprig/v3 v3.0.0 // indirect github.com/goccy/go-json v0.10.5 // indirect github.com/gogo/protobuf v1.3.2 // indirect @@ -139,7 +140,6 @@ require ( go.opentelemetry.io/proto/otlp v1.4.0 // indirect go.uber.org/automaxprocs v1.6.0 // indirect go.uber.org/multierr v1.11.0 // indirect - go.uber.org/zap v1.27.0 // indirect go.yaml.in/yaml/v2 v2.4.2 // indirect golang.org/x/arch v0.15.0 // indirect golang.org/x/crypto v0.39.0 // indirect diff --git a/go.sum b/go.sum index e98c785d..0f62b153 100644 --- a/go.sum +++ b/go.sum @@ -499,8 +499,8 @@ k8s.io/kube-scheduler v0.32.7 h1:QOvu/fNEYGg1gzzpowWHFCI8SD3vJs5Iz0qebEQADd4= k8s.io/kube-scheduler v0.32.7/go.mod h1:ez/2BnvZv2Bq1K9LpBsDgRsTvwJLAzkcpRMfY7rhLMA= k8s.io/kubelet v0.33.1 h1:x4LCw1/iZVWOKA4RoITnuB8gMHnw31HPB3S0EF0EexE= k8s.io/kubelet v0.33.1/go.mod h1:8WpdC9M95VmsqIdGSQrajXooTfT5otEj8pGWOm+KKfQ= -k8s.io/kubernetes v1.33.3 h1:dBx5Z2ZhR8kNzAwCoCz4j1niUbUrNUDVxeSj4/Ienu0= -k8s.io/kubernetes v1.33.3/go.mod h1:nrt8sldmckKz2fCZhgRX3SKfS2e+CzXATPv6ITNkU00= +k8s.io/kubernetes v1.33.4 h1:T1d5FLUYm3/KyUeV7YJhKTR980zHCHb7K2xhCSo3lE8= +k8s.io/kubernetes v1.33.4/go.mod h1:nrt8sldmckKz2fCZhgRX3SKfS2e+CzXATPv6ITNkU00= k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 h1:hwvWFiBzdWw1FhfY1FooPn3kzWuJ8tmbZBHi4zVsl1Y= k8s.io/utils v0.0.0-20250604170112-4c0f3b243397/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= nullprogram.com/x/optparse v1.0.0/go.mod 
h1:KdyPE+Igbe0jQUrVfMqDMeJQIJZEuyV7pjYmp6pbG50= From 865bdf5d1bb365639d7f4356d76893b5f52cd4c3 Mon Sep 17 00:00:00 2001 From: 0x5457 <0x5457@protonmail.com> Date: Thu, 4 Sep 2025 12:00:51 +0800 Subject: [PATCH 10/34] fix: Potential fix for code scanning alert no. 36: Workflow does not contain permissions (#349) Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> --- .github/workflows/test-e2e.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/test-e2e.yml b/.github/workflows/test-e2e.yml index 3b4fec3f..42354dfe 100644 --- a/.github/workflows/test-e2e.yml +++ b/.github/workflows/test-e2e.yml @@ -1,5 +1,8 @@ name: E2E Tests +permissions: + contents: read + on: workflow_dispatch: From 9006e96b172ee0d099d75f2a2a05acd859cad93a Mon Sep 17 00:00:00 2001 From: D Date: Thu, 4 Sep 2025 12:01:44 +0800 Subject: [PATCH 11/34] support dedicated-gpu (#345) * support dedicated gpus * support dedicated GPU * support dedicated GPU * fix test issue --- cmd/main.go | 12 +++++-- internal/cloudprovider/pricing/pricing.go | 44 +++++++++++++++++++---- internal/constants/constants.go | 1 + internal/metrics/recorder.go | 38 ++++++++++++++------ internal/webhook/v1/pod_webhook.go | 19 +++++----- internal/webhook/v1/pod_webhook_test.go | 5 ++- internal/webhook/v1/tf_parser.go | 35 ++++++++++++++++++ internal/webhook/v1/webhook_suite_test.go | 5 ++- 8 files changed, 130 insertions(+), 29 deletions(-) diff --git a/cmd/main.go b/cmd/main.go index 92021131..23cd69b8 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -217,7 +217,9 @@ func main() { // Initialize GPU allocator and set up watches allocator, portAllocator := startTensorFusionAllocators(ctx, mgr) - startWebhook(mgr, portAllocator) + // Create pricing provider for webhook + pricingProvider := pricing.NewStaticPricingProvider() + startWebhook(mgr, portAllocator, pricingProvider) scheduler := startScheduler(ctx, allocator, mgr) @@ -441,11 +443,15 @@ func startCustomResourceController( } } -func startWebhook(mgr manager.Manager, portAllocator *portallocator.PortAllocator) { +func startWebhook( + mgr manager.Manager, + portAllocator *portallocator.PortAllocator, + pricingProvider pricing.PricingProvider, +) { if os.Getenv(constants.EnableWebhookEnv) == constants.FalseStringValue { return } - if err := webhookcorev1.SetupPodWebhookWithManager(mgr, portAllocator); err != nil { + if err := webhookcorev1.SetupPodWebhookWithManager(mgr, portAllocator, pricingProvider); err != nil { setupLog.Error(err, "unable to create webhook", "webhook", "Pod") os.Exit(1) } diff --git a/internal/cloudprovider/pricing/pricing.go b/internal/cloudprovider/pricing/pricing.go index 33ee529f..e8854583 100644 --- a/internal/cloudprovider/pricing/pricing.go +++ b/internal/cloudprovider/pricing/pricing.go @@ -31,6 +31,7 @@ import ( "github.com/NexusGPU/tensor-fusion/internal/cloudprovider/types" "github.com/NexusGPU/tensor-fusion/internal/config" "github.com/NexusGPU/tensor-fusion/internal/constants" + "k8s.io/apimachinery/pkg/api/resource" "sigs.k8s.io/controller-runtime/pkg/log" ) @@ -39,11 +40,17 @@ const ( providerAzure = "azure" ) +// CompleteGPUInfo combines GpuInfo with VRAM information from instance data +type CompleteGPUInfo struct { + *config.GpuInfo + VRAMGigabytes int32 +} + // Global data initialized at package load time var ( globalAWSGPUInstanceData map[string]GPUNodeInstanceInfoAndPrice globalAzureGPUInstanceData map[string]GPUNodeInstanceInfoAndPrice - tflopsMap map[string]*config.GpuInfo + 
tflopsMap map[string]*CompleteGPUInfo ) var readyCh = make(chan struct{}) @@ -51,8 +58,9 @@ var initOnce sync.Once // PricingProvider provides pricing information and calculations for instance types type PricingProvider interface { - GetPricing(instanceType, capacityType tfv1.CapacityTypeEnum) (float64, bool) - GetGPUNodeInstanceTypeInfo(region string) ([]string, bool) + GetPricing(instanceType string, capacityType tfv1.CapacityTypeEnum, region string) (float64, bool) + GetRegionalGPUNodeInstanceTypes(region string) ([]types.GPUNodeInstanceInfo, bool) + GetGPUCapacityByModel(gpuModel string) (resource.Quantity, resource.Quantity, bool) } type GPUNodeInstanceInfoAndPrice struct { @@ -77,7 +85,7 @@ var awsCSV string var azureCSV string func init() { - tflopsMap = make(map[string]*config.GpuInfo, 100) + tflopsMap = make(map[string]*CompleteGPUInfo, 100) } func SetTflopsMapAndInitGPUPricingInfo(ctx context.Context, gpuInfos *[]config.GpuInfo) { @@ -86,8 +94,11 @@ func SetTflopsMapAndInitGPUPricingInfo(ctx context.Context, gpuInfos *[]config.G return } for _, gpuInfo := range *gpuInfos { - tflopsMap[gpuInfo.FullModelName] = &gpuInfo - tflopsMap[gpuInfo.Model] = &gpuInfo + completeInfo := &CompleteGPUInfo{ + GpuInfo: &gpuInfo, + } + tflopsMap[gpuInfo.FullModelName] = completeInfo + tflopsMap[gpuInfo.Model] = completeInfo } initOnce.Do(func() { @@ -151,6 +162,11 @@ func loadCSVInstanceDataFromPath(ctx context.Context, data []byte, provider stri } instanceInfo.FP16TFlopsPerGPU = gpuInfo.Fp16TFlops.AsApproximateFloat64() + // Fill VRAM information if not already set + if gpuInfo.VRAMGigabytes == 0 { + gpuInfo.VRAMGigabytes = instanceInfo.VRAMGigabytesPerGPU + } + instanceInfoAndPrice := GPUNodeInstanceInfoAndPrice{ GPUNodeInstanceInfo: instanceInfo, onDemandPrice: prices[0], @@ -416,3 +432,19 @@ func (p *StaticPricingProvider) GetRegionalGPUNodeInstanceTypes(region string) ( return instanceTypes, len(instanceTypes) > 0 } + +// GetGPUCapacityByModel gets the full capacity (TFlops and VRAM) for a GPU model +// Returns TFlops, VRAM, and whether found +func (p *StaticPricingProvider) GetGPUCapacityByModel(gpuModel string) (resource.Quantity, resource.Quantity, bool) { + <-readyCh + + gpuInfo, exists := tflopsMap[gpuModel] + if !exists { + return resource.Quantity{}, resource.Quantity{}, false + } + + tflops := gpuInfo.Fp16TFlops + vram := *resource.NewQuantity(int64(gpuInfo.VRAMGigabytes)*constants.GiBToBytes, resource.BinarySI) + + return tflops, vram, true +} diff --git a/internal/constants/constants.go b/internal/constants/constants.go index 32b3d6bc..bf95b3d9 100644 --- a/internal/constants/constants.go +++ b/internal/constants/constants.go @@ -69,6 +69,7 @@ const ( GPUModelAnnotation = Domain + "/gpu-model" // GPU ID list is assigned by scheduler, should not specified by user GPUDeviceIDsAnnotation = Domain + "/gpu-ids" + DedicatedGPUAnnotation = Domain + "/dedicated-gpu" SetPendingOwnedWorkloadAnnotation = Domain + "/pending-owned-workload" PricingAnnotation = Domain + "/hourly-pricing" // In remote vGPU mode, selected workload is set by user with /workload annotation or generated by system diff --git a/internal/metrics/recorder.go b/internal/metrics/recorder.go index 9050df00..d01ad315 100644 --- a/internal/metrics/recorder.go +++ b/internal/metrics/recorder.go @@ -187,19 +187,37 @@ func SetPoolMetrics(poolObj *tfv1.GPUPool) { } if poolObj.Status.VirtualAvailableTFlops != nil && poolObj.Status.VirtualAvailableVRAM != nil { - poolMetricsMap[poolObj.Name].AllocatedVramPercentToVirtualCap = 
poolMetricsMap[poolObj.Name].AllocatedVramBytes / - poolObj.Status.VirtualVRAM.AsApproximateFloat64() * 100 + virtualVRAM := poolObj.Status.VirtualVRAM.AsApproximateFloat64() + virtualTFlops := poolObj.Status.VirtualTFlops.AsApproximateFloat64() - poolMetricsMap[poolObj.Name].AllocatedTflopsPercentToVirtualCap = poolMetricsMap[poolObj.Name].AllocatedTflops / - poolObj.Status.VirtualTFlops.AsApproximateFloat64() * 100 - poolMetricsMap[poolObj.Name].AssignedLimitedTFlops = poolObj.Status.VirtualTFlops.AsApproximateFloat64() - + if virtualVRAM > 0 { + poolMetricsMap[poolObj.Name].AllocatedVramPercentToVirtualCap = poolMetricsMap[poolObj.Name].AllocatedVramBytes / virtualVRAM * 100 + } else { + poolMetricsMap[poolObj.Name].AllocatedVramPercentToVirtualCap = 0 + } + + if virtualTFlops > 0 { + poolMetricsMap[poolObj.Name].AllocatedTflopsPercentToVirtualCap = poolMetricsMap[poolObj.Name].AllocatedTflops / virtualTFlops * 100 + } else { + poolMetricsMap[poolObj.Name].AllocatedTflopsPercentToVirtualCap = 0 + } + + poolMetricsMap[poolObj.Name].AssignedLimitedTFlops = virtualTFlops - poolObj.Status.VirtualAvailableTFlops.AsApproximateFloat64() - poolMetricsMap[poolObj.Name].AssignedLimitedVramBytes = poolObj.Status.VirtualVRAM.AsApproximateFloat64() - + poolMetricsMap[poolObj.Name].AssignedLimitedVramBytes = virtualVRAM - poolObj.Status.VirtualAvailableVRAM.AsApproximateFloat64() - poolMetricsMap[poolObj.Name].AssignedLimitedTFlopsPercentToVirtualCap = poolMetricsMap[poolObj.Name].AssignedLimitedTFlops / - poolObj.Status.VirtualTFlops.AsApproximateFloat64() * 100 - poolMetricsMap[poolObj.Name].AssignedLimitedVramPercentToVirtualCap = poolMetricsMap[poolObj.Name].AssignedLimitedVramBytes / - poolObj.Status.VirtualVRAM.AsApproximateFloat64() * 100 + + if virtualTFlops > 0 { + poolMetricsMap[poolObj.Name].AssignedLimitedTFlopsPercentToVirtualCap = poolMetricsMap[poolObj.Name].AssignedLimitedTFlops / virtualTFlops * 100 + } else { + poolMetricsMap[poolObj.Name].AssignedLimitedTFlopsPercentToVirtualCap = 0 + } + + if virtualVRAM > 0 { + poolMetricsMap[poolObj.Name].AssignedLimitedVramPercentToVirtualCap = poolMetricsMap[poolObj.Name].AssignedLimitedVramBytes / virtualVRAM * 100 + } else { + poolMetricsMap[poolObj.Name].AssignedLimitedVramPercentToVirtualCap = 0 + } } poolMetricsMap[poolObj.Name].GPUCount = int(poolObj.Status.TotalGPUs) } diff --git a/internal/webhook/v1/pod_webhook.go b/internal/webhook/v1/pod_webhook.go index 53610ffe..542a3ab0 100644 --- a/internal/webhook/v1/pod_webhook.go +++ b/internal/webhook/v1/pod_webhook.go @@ -37,6 +37,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/webhook/admission" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/cloudprovider/pricing" "github.com/NexusGPU/tensor-fusion/internal/constants" "github.com/NexusGPU/tensor-fusion/internal/portallocator" "github.com/NexusGPU/tensor-fusion/internal/utils" @@ -46,24 +47,26 @@ import ( var httpClient = &http.Client{Timeout: 10 * time.Second} // SetupPodWebhookWithManager registers the webhook for Pod in the manager. 
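+// The pricing provider threaded in below is what lets the webhook resolve
+// full-card capacity for dedicated-GPU pods. For example (annotation keys and
+// the "true" literal here assume the project defaults, i.e. the
+// tensor-fusion.ai domain):
+//   tensor-fusion.ai/dedicated-gpu: "true"
+//   tensor-fusion.ai/gpu-model: "A100"
+// handleDedicatedGPU (tf_parser.go, later in this patch) then pins the
+// workload's requests and limits to the model's full TFlops and VRAM.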
-func SetupPodWebhookWithManager(mgr ctrl.Manager, portAllocator *portallocator.PortAllocator) error { +func SetupPodWebhookWithManager(mgr ctrl.Manager, portAllocator *portallocator.PortAllocator, pricingProvider pricing.PricingProvider) error { webhookServer := mgr.GetWebhookServer() webhookServer.Register("/mutate-v1-pod", &admission.Webhook{ Handler: &TensorFusionPodMutator{ - decoder: admission.NewDecoder(runtime.NewScheme()), - Client: mgr.GetClient(), - portAllocator: portAllocator, + decoder: admission.NewDecoder(runtime.NewScheme()), + Client: mgr.GetClient(), + portAllocator: portAllocator, + pricingProvider: pricingProvider, }, }) return nil } type TensorFusionPodMutator struct { - Client client.Client - decoder admission.Decoder - portAllocator *portallocator.PortAllocator + Client client.Client + decoder admission.Decoder + portAllocator *portallocator.PortAllocator + pricingProvider pricing.PricingProvider } // Handle implements admission.Handler interface. @@ -100,7 +103,7 @@ func (m *TensorFusionPodMutator) Handle(ctx context.Context, req admission.Reque return admission.Errored(http.StatusBadRequest, fmt.Errorf("failed to marshal current pod: %w", err)) } - tfInfo, err := ParseTensorFusionInfo(ctx, m.Client, pod) + tfInfo, err := ParseTensorFusionInfo(ctx, m.Client, pod, m.pricingProvider) if err != nil { return admission.Errored(http.StatusInternalServerError, fmt.Errorf("parse tf resources: %w", err)) } diff --git a/internal/webhook/v1/pod_webhook_test.go b/internal/webhook/v1/pod_webhook_test.go index 55f29233..d72770cc 100644 --- a/internal/webhook/v1/pod_webhook_test.go +++ b/internal/webhook/v1/pod_webhook_test.go @@ -23,6 +23,7 @@ import ( "net/http" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/cloudprovider/pricing" "github.com/NexusGPU/tensor-fusion/internal/config" "github.com/NexusGPU/tensor-fusion/internal/constants" . 
"github.com/onsi/ginkgo/v2" @@ -532,7 +533,9 @@ var _ = Describe("TensorFusionPodMutator", func() { }, }, } - tfInfo, err := ParseTensorFusionInfo(ctx, k8sClient, pod) + // Create a mock pricing provider for testing + mockPricingProvider := &pricing.StaticPricingProvider{} + tfInfo, err := ParseTensorFusionInfo(ctx, k8sClient, pod, mockPricingProvider) Expect(err).NotTo(HaveOccurred()) Expect(tfInfo.ContainerNames).To(HaveLen(1)) Expect(tfInfo.ContainerNames[0]).To(Equal("test-container")) diff --git a/internal/webhook/v1/tf_parser.go b/internal/webhook/v1/tf_parser.go index bf805b76..cd72fbc1 100644 --- a/internal/webhook/v1/tf_parser.go +++ b/internal/webhook/v1/tf_parser.go @@ -7,6 +7,7 @@ import ( "strings" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/cloudprovider/pricing" "github.com/NexusGPU/tensor-fusion/internal/constants" "github.com/NexusGPU/tensor-fusion/internal/utils" corev1 "k8s.io/api/core/v1" @@ -29,6 +30,7 @@ func ParseTensorFusionInfo( ctx context.Context, k8sClient client.Client, pod *corev1.Pod, + pricingProvider pricing.PricingProvider, ) (utils.TensorFusionInfo, error) { var info utils.TensorFusionInfo if pod.Annotations == nil { @@ -115,6 +117,12 @@ func ParseTensorFusionInfo( workloadProfile.Spec.GPUModel = gpuModel } + // Handle dedicated GPU logic + err = handleDedicatedGPU(pod, workloadProfile, pricingProvider) + if err != nil { + return info, fmt.Errorf("handle dedicated GPU: %w", err) + } + info.Profile = &workloadProfile.Spec info.ContainerNames = containerNames return info, nil @@ -227,3 +235,30 @@ func setDefaultQuotasIfExists(workloadProfile *tfv1.WorkloadProfile, single tfv1 } } } + +// handleDedicatedGPU handles dedicated GPU annotation by setting full GPU capacity +func handleDedicatedGPU(pod *corev1.Pod, workloadProfile *tfv1.WorkloadProfile, pricingProvider pricing.PricingProvider) error { + dedicatedGPU, ok := pod.Annotations[constants.DedicatedGPUAnnotation] + if !ok || dedicatedGPU != constants.TrueStringValue { + return nil // Not a dedicated GPU request + } + + // Must have GPU model specified for dedicated GPU + if workloadProfile.Spec.GPUModel == "" { + return fmt.Errorf("dedicated GPU requires gpu-model annotation to be specified") + } + + // Get full GPU capacity from pricing provider + tflops, vram, found := pricingProvider.GetGPUCapacityByModel(workloadProfile.Spec.GPUModel) + if !found { + return fmt.Errorf("could not find capacity information for GPU model: %s", workloadProfile.Spec.GPUModel) + } + + // Set full capacity for both requests and limits + workloadProfile.Spec.Resources.Requests.Tflops = tflops + workloadProfile.Spec.Resources.Requests.Vram = vram + workloadProfile.Spec.Resources.Limits.Tflops = tflops + workloadProfile.Spec.Resources.Limits.Vram = vram + + return nil +} diff --git a/internal/webhook/v1/webhook_suite_test.go b/internal/webhook/v1/webhook_suite_test.go index 4e5d369b..26a6685d 100644 --- a/internal/webhook/v1/webhook_suite_test.go +++ b/internal/webhook/v1/webhook_suite_test.go @@ -27,6 +27,7 @@ import ( "time" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/cloudprovider/pricing" "github.com/NexusGPU/tensor-fusion/internal/config" "github.com/NexusGPU/tensor-fusion/internal/portallocator" . 
"github.com/onsi/ginkgo/v2" @@ -134,11 +135,13 @@ var _ = BeforeSuite(func() { }) Expect(err).NotTo(HaveOccurred()) + // Create a mock pricing provider for testing + mockPricingProvider := &pricing.StaticPricingProvider{} err = SetupPodWebhookWithManager(mgr, &portallocator.PortAllocator{ PortRangeStartCluster: 42000, PortRangeEndCluster: 62000, BitmapCluster: make([]uint64, (62000-42000)/64+1), - }) + }, mockPricingProvider) Expect(err).NotTo(HaveOccurred()) // +kubebuilder:scaffold:webhook From 0389852aa9fd8780c82b31f9c28fda05223cfd47 Mon Sep 17 00:00:00 2001 From: Joey Yang <14833440+Code2Life@users.noreply.github.com> Date: Thu, 4 Sep 2025 18:44:59 +0800 Subject: [PATCH 12/34] fix: skip gpu limiter not working issue, observability optimize (#350) * fix: skip gpu limiter not working issue * fix: avoid k8s QoS side effect for inject lib init container * fix: potential panic issues * fix: remove unused event --- config/samples/dynamic-config.yaml | 275 ++++++++++++++++-- internal/cloudprovider/common/utils.go | 14 +- internal/cloudprovider/karpenter/nodeclaim.go | 6 +- .../tensorfusioncluster_controller.go | 3 - .../tensorfusionworkload_controller.go | 1 - internal/metrics/encoders/influx.go | 8 +- internal/metrics/recorder.go | 20 +- internal/utils/compose.go | 45 ++- 8 files changed, 329 insertions(+), 43 deletions(-) diff --git a/config/samples/dynamic-config.yaml b/config/samples/dynamic-config.yaml index c3102f3b..ae9350a3 100644 --- a/config/samples/dynamic-config.yaml +++ b/config/samples/dynamic-config.yaml @@ -1,23 +1,260 @@ metricsTTL: 30d # default to 'influx', influx v2 line protocol -metricsFormat: json +metricsFormat: influx -alertRules: -- name: GPUTFlopsFull - query: | - SELECT - node, - pool, - uuid, - avg(compute_percentage) AS compute_used - FROM tf_gpu_usage - WHERE compute_percentage > {{ .Threshold }} AND {{ .Conditions }} - GROUP BY node, pool, uuid - threshold: 97 - evaluationInterval: 30s - consecutiveCount: 4 - severity: P1 - summary: "GPU TFlops Full, used {{ .compute_used }}% on {{ .node }} {{ .uuid }}" - alertTargetInstance: "{{ .uuid }}" - description: "GPU {{ .uuid }} on Node {{ .node }} in Pool {{ .pool }} has TFlops usage above {{ .Threshold }}% for 4 consecutive 30s, average usage: {{ .compute_used }}%" \ No newline at end of file +alertRules: + # Worker TFlops throttled alert + - name: WorkerTFlopsThrottled + query: | + SELECT workload, worker, uuid, node, MAX(compute_throttled_cnt)-MIN(compute_throttled_cnt) as throttled_increase + FROM tf_worker_usage + WHERE {{ .Conditions }} + GROUP BY workload, worker, uuid, node + HAVING throttled_increase > {{ .Threshold }} + threshold: 0 + evaluationInterval: 15s + consecutiveCount: 3 + severity: P1 + summary: "Worker TFlops Throttled" + description: "Worker {{ .worker }} from Node {{ .node }} is using more than {{ .Threshold }}% of its TFlops limit" + alertTargetInstance: "{{ .worker }}-{{ .uuid }}" + runbookURL: "https://tensor-fusion.ai/guide/troubleshooting/handbook" + + # Worker VRAM switching too frequent alert + - name: WorkerVRAMSwitchCountIncreasing + query: | + SELECT workload, worker, uuid, node, MAX(vram_resumed_cnt)-MIN(vram_resumed_cnt) as switch_increase + FROM tf_worker_usage + WHERE {{ .Conditions }} + GROUP BY workload, worker, uuid, node + HAVING switch_increase > {{ .Threshold }} + threshold: 0 + evaluationInterval: 2m + consecutiveCount: 1 + severity: P1 + summary: "Worker VRAM Switch Count Increasing" + description: "Worker {{ .worker }} from Node {{ .node }} has switched VRAM {{ 
.switch_increase }} times in the last 2 minutes, the GPU may be oversubscribed" + alertTargetInstance: "{{ .worker }}-{{ .uuid }}" + runbookURL: "https://tensor-fusion.ai/guide/troubleshooting/handbook" + + # Worker cannot scale up / be scheduled alert + - name: WorkerAllocationFailed + query: | + SELECT pool, (MAX(total_allocation_fail_cnt) - MIN(total_allocation_fail_cnt)) as failure_increase + FROM tf_system_metrics + WHERE {{ .Conditions }} + GROUP BY pool + HAVING failure_increase > {{ .Threshold }} + threshold: 0 + evaluationInterval: 30s + consecutiveCount: 1 + severity: P1 + summary: "Worker allocation failed for GPU Pool {{ .pool }}" + description: "Worker allocation failed {{ .failure_increase }} times in the last 30 seconds for GPU Pool {{ .pool }}" + alertTargetInstance: "{{ .pool }}" + runbookURL: "https://tensor-fusion.ai/guide/troubleshooting/handbook" + + # Single GPU Alerts + + # GPU VRAM Full Alert + - name: GPUVRAMFull + query: | + SELECT + node, + pool, + uuid, + avg(memory_percentage) AS memory_used + FROM tf_gpu_usage + WHERE memory_percentage > {{ .Threshold }} AND {{ .Conditions }} + GROUP BY node, pool, uuid + threshold: 97 + evaluationInterval: 30s + consecutiveCount: 2 + severity: P1 + summary: "GPU VRAM Full, used {{ .memory_used }}% on {{ .node }} {{ .uuid }}" + alertTargetInstance: "{{ .uuid }}" + description: "GPU {{ .uuid }} on Node {{ .node }} in Pool {{ .pool }} has VRAM usage above {{ .Threshold }}% for 2 consecutive 30s, average usage: {{ .memory_used }}%" + + # GPU TFlops Full Alert + - name: GPUTFlopsFull + query: | + SELECT + node, + pool, + uuid, + avg(compute_percentage) AS compute_used + FROM tf_gpu_usage + WHERE compute_percentage > {{ .Threshold }} AND {{ .Conditions }} + GROUP BY node, pool, uuid + threshold: 97 + evaluationInterval: 30s + consecutiveCount: 4 + severity: P1 + summary: "GPU TFlops Full, used {{ .compute_used }}% on {{ .node }} {{ .uuid }}" + alertTargetInstance: "{{ .uuid }}" + description: "GPU {{ .uuid }} on Node {{ .node }} in Pool {{ .pool }} has TFlops usage above {{ .Threshold }}% for 4 consecutive 30s, average usage: {{ .compute_used }}%" + + # GPU Temperature alert + - name: GPUTemperatureHigh + query: | + SELECT + node, + pool, + uuid, + avg(temperature) AS avg_temperature + FROM tf_gpu_usage + WHERE temperature > {{ .Threshold }} AND {{ .Conditions }} + GROUP BY node, pool, uuid + threshold: 90 + evaluationInterval: 30s + consecutiveCount: 3 + severity: P1 + summary: "GPU Temperature High, {{ .avg_temperature }}°C on {{ .node }} {{ .uuid }}" + alertTargetInstance: "{{ .uuid }}" + description: "GPU {{ .uuid }} from Node {{ .node }} has temperature above {{ .Threshold }}°C, average temperature: {{ .avg_temperature }}°C, GPU Pool: {{ .pool }}" + runbookURL: "https://tensor-fusion.ai/guide/troubleshooting/handbook" + + # GPU Pool Alerts + + # Node TFlops allocation alert + - name: NodeTFlopsAllocationCritical + query: | + SELECT node, pool, (100 - avg(allocated_tflops_percent)) as tflops_available + FROM tf_node_metrics + WHERE {{ .Conditions }} + GROUP BY node, pool + HAVING tflops_available < {{ .Threshold }} + threshold: 5 + evaluationInterval: 1m + consecutiveCount: 2 + severity: P0 + summary: "Available TFlops below threshold, remaining {{ .tflops_available }}% for {{ .node }}" + description: "Node {{ .node }} in Pool {{ .pool }} has available TFlops below {{ .Threshold }}%" + alertTargetInstance: "{{ .node }}" + + - name: NodeTFlopsAllocationWarning + query: | + SELECT node, pool, (100 - avg(allocated_tflops_percent)) as tflops_available + 
FROM tf_node_metrics + WHERE {{ .Conditions }} + GROUP BY node, pool + HAVING tflops_available < {{ .Threshold }} + threshold: 10 + evaluationInterval: 1m + consecutiveCount: 2 + severity: P1 + summary: "Node available TFlops below threshold, remaining {{ .tflops_available }}% for {{ .node }}" + description: "Node {{ .node }} in Pool {{ .pool }} has available TFlops below {{ .Threshold }}%" + alertTargetInstance: "{{ .node }}" + + # Pool TFlops allocation alert - Total + - name: PoolTotalTFlopsAllocationCritical + query: | + SELECT pool, (100 - avg(allocated_tflops_percent)) as tflops_available + FROM tf_node_metrics + WHERE {{ .Conditions }} + GROUP BY pool + HAVING tflops_available < {{ .Threshold }} + threshold: 5 + evaluationInterval: 1m + consecutiveCount: 2 + severity: P0 + summary: "Pool available TFlops below threshold, remaining {{ .tflops_available }}%" + description: "Pool {{ .pool }} has available TFlops below {{ .Threshold }}%" + alertTargetInstance: "{{ .pool }}" + + - name: PoolTotalTFlopsAllocationWarning + query: | + SELECT pool, (100 - avg(allocated_tflops_percent)) as tflops_available + FROM tf_node_metrics + WHERE {{ .Conditions }} + GROUP BY pool + HAVING tflops_available < {{ .Threshold }} + threshold: 10 + evaluationInterval: 1m + consecutiveCount: 2 + severity: P1 + summary: "Pool available TFlops below threshold, remaining {{ .tflops_available }}%" + description: "Pool {{ .pool }} has available TFlops below {{ .Threshold }}%" + alertTargetInstance: "{{ .pool }}" + + # Node VRAM allocation alert + - name: NodeVRAMAllocationCritical + query: | + SELECT node, pool, (100 - avg(allocated_vram_percent)) as vram_available + FROM tf_node_metrics + WHERE {{ .Conditions }} + GROUP BY node, pool + HAVING vram_available < {{ .Threshold }} + threshold: 5 + evaluationInterval: 1m + consecutiveCount: 2 + severity: P1 + summary: "Node available VRAM below threshold, remaining {{ .vram_available }}% for {{ .node }}" + description: "Node {{ .node }} in Pool {{ .pool }} has available VRAM below {{ .Threshold }}%" + alertTargetInstance: "{{ .node }}" + + - name: NodeVRAMAllocationWarning + query: | + SELECT node, pool, (100 - avg(allocated_vram_percent)) as vram_available + FROM tf_node_metrics + WHERE {{ .Conditions }} + GROUP BY node, pool + HAVING vram_available < {{ .Threshold }} + threshold: 10 + evaluationInterval: 1m + consecutiveCount: 2 + severity: P1 + summary: "Node available VRAM below threshold, remaining {{ .vram_available }}% for {{ .node }}" + description: "Node {{ .node }} in Pool {{ .pool }} has available VRAM below {{ .Threshold }}%" + alertTargetInstance: "{{ .node }}" + + # Pool VRAM allocation alert + - name: PoolVRAMAllocationWarning + query: | + SELECT pool, (100 - avg(allocated_vram_percent)) as vram_available + FROM tf_node_metrics + WHERE {{ .Conditions }} + GROUP BY pool + HAVING vram_available < {{ .Threshold }} + threshold: 10 + evaluationInterval: 1m + consecutiveCount: 2 + severity: P1 + summary: "Pool available VRAM below threshold, remaining {{ .vram_available }}% for {{ .pool }}" + description: "Pool {{ .pool }} has available VRAM below {{ .Threshold }}%" + alertTargetInstance: "{{ .pool }}" + + # Empty or Idle GPU Alert + - name: EmptyGPU + query: | + SELECT DISTINCT node + FROM tf_node_metrics + WHERE {{ .Conditions }} AND node NOT IN ( + SELECT DISTINCT node + FROM tf_worker_usage + WHERE {{ .Conditions }} + ) + threshold: 0 + evaluationInterval: 5m + consecutiveCount: 2 + severity: P2 + summary: "Empty GPU without any workload, Node {{ .node }}" 
+ description: "GPU Node {{ .node }} has no workload running, should be decommissioned" + alertTargetInstance: "{{ .node }}" + + - name: IdleGPU + query: | + SELECT node, pool, uuid, avg(compute_percentage) as compute, avg(memory_percentage) vram + FROM tf_gpu_usage + WHERE {{ .Conditions }} + GROUP BY node, pool, uuid + HAVING compute < 1 and vram < {{ .Threshold }}; + threshold: 5 + evaluationInterval: 10m + consecutiveCount: 3 + severity: P2 + summary: "Idle GPU found: {{ .uuid }} on Node {{ .node }}" + description: "GPU {{ .uuid }} on Node {{ .node }} in Pool {{ .pool }} has been idle for 3 consecutive 10m, compute: {{ .compute }}, vram: {{ .vram }}" + alertTargetInstance: "{{ .uuid }}" diff --git a/internal/cloudprovider/common/utils.go b/internal/cloudprovider/common/utils.go index fbe882ab..788ba96d 100644 --- a/internal/cloudprovider/common/utils.go +++ b/internal/cloudprovider/common/utils.go @@ -131,6 +131,16 @@ func CalculateLeastCostGPUNodes(ctx context.Context, provider types.GPUNodeProvi nodes := make([]tfv1.GPUNodeClaimSpec, 0, bestNumInstances) for i := int64(0); i < bestNumInstances; i++ { + + tflopsQuantity, err := resource.ParseQuantity(fmt.Sprintf("%f", bestInstance.FP16TFlopsPerGPU*float64(bestInstance.GPUCount))) + if err != nil { + return nil, fmt.Errorf("failed to parse GPUDeviceOffered: %v", err) + } + + vramQuantity, err := resource.ParseQuantity(fmt.Sprintf("%dGi", bestInstance.VRAMGigabytesPerGPU*bestInstance.GPUCount)) + if err != nil { + return nil, fmt.Errorf("failed to parse VRAMOffered: %v", err) + } nodes = append(nodes, tfv1.GPUNodeClaimSpec{ NodeName: fmt.Sprintf("%s-%s", pool.Name, generateRandomString(8)), InstanceType: bestInstance.InstanceType, @@ -139,8 +149,8 @@ func CalculateLeastCostGPUNodes(ctx context.Context, provider types.GPUNodeProvi Zone: zone, CapacityType: preferredCapacityType, - TFlopsOffered: resource.MustParse(fmt.Sprintf("%f", bestInstance.FP16TFlopsPerGPU*float64(bestInstance.GPUCount))), - VRAMOffered: resource.MustParse(fmt.Sprintf("%dGi", bestInstance.VRAMGigabytesPerGPU*bestInstance.GPUCount)), + TFlopsOffered: tflopsQuantity, + VRAMOffered: vramQuantity, GPUDeviceOffered: bestInstance.GPUCount, ExtraParams: cluster.Spec.ComputingVendor.Params.ExtraParams, diff --git a/internal/cloudprovider/karpenter/nodeclaim.go b/internal/cloudprovider/karpenter/nodeclaim.go index 2877e80d..15c8dcc0 100644 --- a/internal/cloudprovider/karpenter/nodeclaim.go +++ b/internal/cloudprovider/karpenter/nodeclaim.go @@ -318,7 +318,11 @@ func (p KarpenterGPUNodeProvider) buildNodeClaim(ctx context.Context, param *tfv // Add GPU resources if specified (Karpenter supports nvidia.com/gpu) if param.GPUDeviceOffered > 0 { - resourceRequests[karpenterConfig.GPUResourceName] = resource.MustParse(fmt.Sprintf("%d", param.GPUDeviceOffered)) + quantity, err := resource.ParseQuantity(fmt.Sprintf("%d", param.GPUDeviceOffered)) + if err != nil { + return nil, fmt.Errorf("failed to parse GPUDeviceOffered: %v", err) + } + resourceRequests[karpenterConfig.GPUResourceName] = quantity } // query nodeClass and build NodeClassRef diff --git a/internal/controller/tensorfusioncluster_controller.go b/internal/controller/tensorfusioncluster_controller.go index a2f8ba12..d4f464c3 100644 --- a/internal/controller/tensorfusioncluster_controller.go +++ b/internal/controller/tensorfusioncluster_controller.go @@ -43,7 +43,6 @@ import ( "github.com/NexusGPU/tensor-fusion/internal/constants" "github.com/NexusGPU/tensor-fusion/internal/metrics" utils 
"github.com/NexusGPU/tensor-fusion/internal/utils" - corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -382,7 +381,6 @@ func (r *TensorFusionClusterReconciler) checkTFClusterComponentsReady(ctx contex constants.LabelKeyOwner: tfc.GetName(), })) if err != nil { - r.Recorder.Eventf(tfc, corev1.EventTypeWarning, "CheckComponentStatusError", err.Error()) return false, nil, fmt.Errorf("failed to list GPUPools: %w", err) } if len(pools.Items) != len(tfc.Spec.GPUPools) { @@ -411,7 +409,6 @@ func (r *TensorFusionClusterReconciler) updateTFClusterStatus(ctx context.Contex } } if err := r.Status().Update(ctx, tfc); err != nil { - r.Recorder.Eventf(tfc, corev1.EventTypeWarning, "UpdateClusterStatusError", err.Error()) return err } return nil diff --git a/internal/controller/tensorfusionworkload_controller.go b/internal/controller/tensorfusionworkload_controller.go index 1ec0d722..bc8ced78 100644 --- a/internal/controller/tensorfusionworkload_controller.go +++ b/internal/controller/tensorfusionworkload_controller.go @@ -347,7 +347,6 @@ func (r *TensorFusionWorkloadReconciler) updateStatus( readyCondition.Status = metav1.ConditionFalse readyCondition.Reason = "WorkerFailed" readyCondition.Message = fmt.Sprintf("Failed workers num: %d", failedWorkers) - r.Recorder.Eventf(workload, corev1.EventTypeWarning, "WorkerFailed", "Failed workers num: %d", failedWorkers) } else if workload.Spec.IsDynamicReplica() { // for dynamic replicas, if no worker failed, indicate workload is running phase = tfv1.TensorFusionWorkloadPhaseRunning diff --git a/internal/metrics/encoders/influx.go b/internal/metrics/encoders/influx.go index a459c7ee..4d089759 100644 --- a/internal/metrics/encoders/influx.go +++ b/internal/metrics/encoders/influx.go @@ -4,6 +4,7 @@ import ( "time" metricsProto "github.com/influxdata/line-protocol/v2/lineprotocol" + "k8s.io/klog/v2" ) // InfluxStrategy implements InfluxDB line protocol encoding @@ -28,7 +29,12 @@ func (s *InfluxStrategy) AddTag(key, value string) { } func (s *InfluxStrategy) AddField(key string, value any) { - s.enc.AddField(key, metricsProto.MustNewValue(value)) + v, parsed := metricsProto.NewValue(value) + if !parsed { + klog.Error("metrics influx encoder failed to parse value: ", key, value) + return + } + s.enc.AddField(key, v) } func (s *InfluxStrategy) EndLine(timestamp time.Time) { diff --git a/internal/metrics/recorder.go b/internal/metrics/recorder.go index d01ad315..f1c14a39 100644 --- a/internal/metrics/recorder.go +++ b/internal/metrics/recorder.go @@ -413,16 +413,16 @@ func (mr *MetricsRecorder) RecordMetrics(writer io.Writer) { enc.StartLine("tf_pool_metrics") enc.AddTag("pool", metrics.PoolName) enc.AddTag("phase", metrics.Phase) - enc.AddField("allocatedTflops", metrics.AllocatedTflops) - enc.AddField("allocatedTflopsPercent", metrics.AllocatedTflopsPercent) - enc.AddField("allocatedTflopsPercentVirtual", metrics.AllocatedTflopsPercentToVirtualCap) - enc.AddField("allocatedVramBytes", metrics.AllocatedVramBytes) - enc.AddField("allocatedVramPercent", metrics.AllocatedVramPercent) - enc.AddField("allocatedVramPercentVirtual", metrics.AllocatedVramPercentToVirtualCap) - enc.AddField("assignedLimitedTFlops", metrics.AssignedLimitedTFlops) - enc.AddField("assignedLimitedVramBytes", metrics.AssignedLimitedVramBytes) - enc.AddField("assignedLimitedTFlopsPercentVirtual", metrics.AssignedLimitedTFlopsPercentToVirtualCap) - enc.AddField("assignedLimitedVramPercentVirtual", metrics.AssignedLimitedVramPercentToVirtualCap) + 
enc.AddField("allocated_tflops", metrics.AllocatedTflops) + enc.AddField("allocated_tflops_percent", metrics.AllocatedTflopsPercent) + enc.AddField("allocated_tflops_percent_virtual", metrics.AllocatedTflopsPercentToVirtualCap) + enc.AddField("allocated_vram_bytes", metrics.AllocatedVramBytes) + enc.AddField("allocated_vram_percent", metrics.AllocatedVramPercent) + enc.AddField("allocated_vram_percent_virtual", metrics.AllocatedVramPercentToVirtualCap) + enc.AddField("limited_tflops", metrics.AssignedLimitedTFlops) + enc.AddField("limited_vram_bytes", metrics.AssignedLimitedVramBytes) + enc.AddField("limited_tflops_percent_virtual", metrics.AssignedLimitedTFlopsPercentToVirtualCap) + enc.AddField("limited_vram_percent_virtual", metrics.AssignedLimitedVramPercentToVirtualCap) enc.AddField("gpu_count", int64(metrics.GPUCount)) enc.EndLine(now) } diff --git a/internal/utils/compose.go b/internal/utils/compose.go index e7170881..344228b0 100644 --- a/internal/utils/compose.go +++ b/internal/utils/compose.go @@ -16,6 +16,10 @@ import ( "k8s.io/utils/ptr" ) +var injectLibResource v1.ResourceList = v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("20m"), + v1.ResourceMemory: resource.MustParse("64Mi"), +} var nodeDiscoveryDefaultRequests v1.ResourceList = v1.ResourceList{ v1.ResourceCPU: resource.MustParse("20m"), v1.ResourceMemory: resource.MustParse("64Mi"), @@ -175,6 +179,11 @@ func AddTFDefaultClientConfBeforePatch( MountPath: constants.TFLibsVolumeMountPath, }, }, + Resources: v1.ResourceRequirements{ + Requests: injectLibResource, + Limits: injectLibResource, + }, + Env: convertDisabledFeatures4InjectLib(pod.Annotations[constants.DisableFeaturesAnnotation]), }) pod.Spec.Volumes = append(pod.Spec.Volumes, v1.Volume{ Name: constants.TFLibsVolumeName, @@ -302,18 +311,42 @@ func AddTFDefaultClientConfBeforePatch( } func convertDisabledFeaturesToEnvs(disabledFeatures string, envList []v1.EnvVar) []v1.EnvVar { - disabledFeaturesList := strings.Split(disabledFeatures, ",") - for _, feature := range disabledFeaturesList { + disabledFeaturesList := strings.SplitSeq(disabledFeatures, ",") + for feature := range disabledFeaturesList { if feat, ok := featureShortcutMap[feature]; ok { - envList = append(envList, v1.EnvVar{ - Name: feat.EnvName, - Value: feat.EnvValue, - }) + if !lo.ContainsBy(envList, func(item v1.EnvVar) bool { + return item.Name == feat.EnvName + }) { + envList = append(envList, v1.EnvVar{ + Name: feat.EnvName, + Value: feat.EnvValue, + }) + } } } return envList } +func convertDisabledFeatures4InjectLib(disabledFeatures string) []v1.EnvVar { + if disabledFeatures == "" { + return []v1.EnvVar{} + } + disabledFeaturesList := strings.SplitSeq(disabledFeatures, ",") + + // GPU limiter by-pass take effect in bootstrap stage, add special handling here + for feature := range disabledFeaturesList { + if feature == constants.BuiltInFeaturesGpuLimiter { + return []v1.EnvVar{ + { + Name: featureShortcutMap[feature].EnvName, + Value: featureShortcutMap[feature].EnvValue, + }, + } + } + } + return []v1.EnvVar{} +} + func AddTFHypervisorConfAfterTemplate(ctx context.Context, spec *v1.PodSpec, pool *tfv1.GPUPool) { // Hypervisor needs to read /proc to map pod with processID spec.HostPID = true From c0a35005c7bde6bc79bb42d2454bb3391a236ada Mon Sep 17 00:00:00 2001 From: D Date: Sun, 7 Sep 2025 22:49:29 +0800 Subject: [PATCH 13/34] fix: init pricing overwrite vram to 0 (#351) * support dedicated gpus * support dedicated GPU * support dedicated GPU * fix test issue * fix init pricing override vran 
* Revert "fix init pricing override vran" This reverts commit d0bea18f1b6777af66c71b300e9ba891453b5359. * fix init pricing override vram --- internal/cloudprovider/pricing/pricing.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/internal/cloudprovider/pricing/pricing.go b/internal/cloudprovider/pricing/pricing.go index e8854583..45dd09bb 100644 --- a/internal/cloudprovider/pricing/pricing.go +++ b/internal/cloudprovider/pricing/pricing.go @@ -94,6 +94,9 @@ func SetTflopsMapAndInitGPUPricingInfo(ctx context.Context, gpuInfos *[]config.G return } for _, gpuInfo := range *gpuInfos { + if tflopsMap[gpuInfo.FullModelName] != nil { + continue + } completeInfo := &CompleteGPUInfo{ GpuInfo: &gpuInfo, } From f25c65db2e467d3c9c0dd0476b0622abd552fd1e Mon Sep 17 00:00:00 2001 From: Joey Yang <14833440+Code2Life@users.noreply.github.com> Date: Mon, 8 Sep 2025 13:37:50 +0800 Subject: [PATCH 14/34] fix: add node hash for gpu k8s node, owner ref for hypervisor, isolate shm (#352) --- internal/constants/constants.go | 1 + internal/controller/gpunode_controller.go | 25 ++++++-- internal/controller/gpupool_controller.go | 63 +++++++++++++++++++ .../controller/gpupool_controller_test.go | 8 +++ internal/controller/node_controller.go | 31 +++++---- internal/utils/compose.go | 16 ++--- internal/utils/reconcile.go | 9 +++ 7 files changed, 125 insertions(+), 28 deletions(-) diff --git a/internal/constants/constants.go b/internal/constants/constants.go index bf95b3d9..81470022 100644 --- a/internal/constants/constants.go +++ b/internal/constants/constants.go @@ -30,6 +30,7 @@ const ( LabelKeyClusterOwner = Domain + "/cluster" LabelKeyNodeClass = Domain + "/node-class" LabelKeyPodTemplateHash = Domain + "/pod-template-hash" + LabelNodeSelectorHash = Domain + "/node-selector-hash" LabelComponent = Domain + "/component" // used by TF connection, for matching the related connections when worker Pod state changed LabelWorkerName = Domain + "/worker-name" diff --git a/internal/controller/gpunode_controller.go b/internal/controller/gpunode_controller.go index 9035ecdd..054d5922 100644 --- a/internal/controller/gpunode_controller.go +++ b/internal/controller/gpunode_controller.go @@ -140,7 +140,7 @@ func (r *GPUNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct return ctrl.Result{}, nil } - hypervisorName, err := r.reconcileHypervisorPod(ctx, node, poolObj) + hypervisorName, err := r.reconcileHypervisorPod(ctx, node, poolObj, coreNode) if err != nil { return ctrl.Result{}, err } @@ -319,7 +319,12 @@ func (r *GPUNodeReconciler) reconcileNodeDiscoveryJob( return nil } -func (r *GPUNodeReconciler) reconcileHypervisorPod(ctx context.Context, node *tfv1.GPUNode, pool *tfv1.GPUPool) (string, error) { +func (r *GPUNodeReconciler) reconcileHypervisorPod( + ctx context.Context, + node *tfv1.GPUNode, + pool *tfv1.GPUPool, + k8sNode *corev1.Node, +) (string, error) { log := log.FromContext(ctx) if pool.Spec.ComponentConfig == nil || pool.Spec.ComponentConfig.Hypervisor == nil { @@ -361,7 +366,7 @@ func (r *GPUNodeReconciler) reconcileHypervisorPod(ctx context.Context, node *tf } log.Info("hypervisor pod not found, creating new one", "node", node.Name) - if err := r.createHypervisorPod(ctx, key, node, pool); err != nil { + if err := r.createHypervisorPod(ctx, key, node, pool, k8sNode); err != nil { if errors.IsAlreadyExists(err) { log.Info("hypervisor pod already exists, skip creation", "node", node.Name) return "", nil @@ -372,7 +377,13 @@ func (r *GPUNodeReconciler) reconcileHypervisorPod(ctx 
context.Context, node *tf return key.Name, nil } -func (r *GPUNodeReconciler) createHypervisorPod(ctx context.Context, key client.ObjectKey, node *tfv1.GPUNode, pool *tfv1.GPUPool) error { +func (r *GPUNodeReconciler) createHypervisorPod( + ctx context.Context, + key client.ObjectKey, + node *tfv1.GPUNode, + pool *tfv1.GPUPool, + k8sNode *corev1.Node, +) error { log := log.FromContext(ctx) podTmpl := &corev1.PodTemplate{} @@ -447,7 +458,11 @@ func (r *GPUNodeReconciler) createHypervisorPod(ctx context.Context, key client. }) err = controllerutil.SetControllerReference(node, newPod, r.Scheme) if err != nil { - return fmt.Errorf("failed to set controller reference: %w", err) + return fmt.Errorf("failed to set controller reference for hypervisor: %w", err) + } + // also set node owned by k8s node to allow Karpenter to delete the node while hypervisor exists + if err := controllerutil.SetOwnerReference(k8sNode, newPod, r.Scheme); err != nil { + return fmt.Errorf("failed to set owner reference for hypervisor: %w", err) } // create hypervisor pod diff --git a/internal/controller/gpupool_controller.go b/internal/controller/gpupool_controller.go index 987eb81b..da8c63aa 100644 --- a/internal/controller/gpupool_controller.go +++ b/internal/controller/gpupool_controller.go @@ -30,13 +30,16 @@ import ( "github.com/NexusGPU/tensor-fusion/internal/metrics" utils "github.com/NexusGPU/tensor-fusion/internal/utils" "golang.org/x/time/rate" + corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/equality" "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/runtime" utilerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/client-go/tools/record" + "k8s.io/client-go/util/retry" "k8s.io/client-go/util/workqueue" + schedulingcorev1 "k8s.io/component-helpers/scheduling/corev1" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" @@ -83,6 +86,9 @@ type GPUPoolReconciler struct { // and requeue until current time after that, start provisioning loop var provisioningInitializationMinTime = map[string]time.Time{} +// When GPU nodeSelector changed, trigger all node update +var poolSelectorChangeMap = map[string]string{} + // +kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpupools,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpupools/status,verbs=get;update;patch // +kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpupools/finalizers,verbs=update @@ -116,6 +122,10 @@ func (r *GPUPoolReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct return ctrl.Result{}, nil } + if err := r.reconcilePoolSelectorChange(ctx, pool); err != nil { + return ctrl.Result{}, err + } + if err := r.reconcilePoolCurrentCapacityAndReadiness(ctx, pool); err != nil { return ctrl.Result{}, err } @@ -404,6 +414,59 @@ func (r *GPUPoolReconciler) reconcilePoolComponents(ctx context.Context, pool *t return ctrlResult, utilerrors.NewAggregate(errs) } +func (r *GPUPoolReconciler) reconcilePoolSelectorChange(ctx context.Context, pool *tfv1.GPUPool) error { + if pool.Spec.NodeManagerConfig != nil && pool.Spec.NodeManagerConfig.NodeSelector != nil { + hash := utils.GetObjectHash(pool.Spec.NodeManagerConfig.NodeSelector) + if poolSelectorChangeMap[pool.Name] == hash { + return nil + } + + // hash has changed, or first reconcile, should check all k8s nodes + nodes := &corev1.NodeList{} + selectors := utils.GetInitialGPUNodeSelector() + if err := 
r.List(ctx, nodes, client.MatchingLabels{selectors[0]: selectors[1]}); err != nil { + return err + } + for _, node := range nodes.Items { + // skip no label or deleting nodes + if node.Labels == nil || !node.DeletionTimestamp.IsZero() { + continue + } + matches, err := schedulingcorev1.MatchNodeSelectorTerms(&node, pool.Spec.NodeManagerConfig.NodeSelector) + if err != nil { + return err + } + if matches { + if err := UpdateK8SNodeSelectorHash(ctx, r.Client, &node, hash); err != nil { + return err + } + } + } + poolSelectorChangeMap[pool.Name] = hash + return nil + } + return nil +} + +func UpdateK8SNodeSelectorHash(ctx context.Context, k8sClient client.Client, node *corev1.Node, hash string) error { + // skip nodes that already injected the hash + if node.Labels[constants.LabelNodeSelectorHash] == hash { + return nil + } + // update label to trigger the GPUNode reconcile + if err := retry.RetryOnConflict(retry.DefaultBackoff, func() error { + latest := &corev1.Node{} + if err := k8sClient.Get(ctx, client.ObjectKey{Name: node.Name}, latest); err != nil { + return err + } + latest.Labels[constants.LabelNodeSelectorHash] = hash + return k8sClient.Update(ctx, latest) + }); err != nil { + return err + } + return nil +} + func (r *GPUPoolReconciler) cleanUpPool(ctx context.Context, pool *tfv1.GPUPool) (bool, error) { log := log.FromContext(ctx) log.Info("TensorFusionGPUPool is being deleted", "name", pool.Name) diff --git a/internal/controller/gpupool_controller_test.go b/internal/controller/gpupool_controller_test.go index 50b033cd..e3be7a99 100644 --- a/internal/controller/gpupool_controller_test.go +++ b/internal/controller/gpupool_controller_test.go @@ -42,6 +42,14 @@ var _ = Describe("GPUPool Controller", func() { pool := tfEnv.GetGPUPool(0) g.Expect(pool.Status.Phase).Should(Equal(tfv1.TensorFusionPoolPhaseRunning)) }).Should(Succeed()) + Eventually(func(g Gomega) { + nodeList := tfEnv.GetGPUNodeList(0) + for _, gpuNode := range nodeList.Items { + node := &corev1.Node{} + g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: gpuNode.Name}, node)).Should(Succeed()) + g.Expect(node.Labels).To(HaveKey(constants.LabelNodeSelectorHash)) + } + }).Should(Succeed()) tfEnv.Cleanup() }) }) diff --git a/internal/controller/node_controller.go b/internal/controller/node_controller.go index 3a9c652d..caedc903 100644 --- a/internal/controller/node_controller.go +++ b/internal/controller/node_controller.go @@ -19,8 +19,6 @@ package controller import ( "context" "fmt" - "os" - "strings" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" "github.com/NexusGPU/tensor-fusion/internal/constants" @@ -86,12 +84,15 @@ func (r *NodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl. return ctrl.Result{}, err } if !matched { - // delete gpunode if no matched pool - if err := r.Delete(ctx, &tfv1.GPUNode{ - ObjectMeta: metav1.ObjectMeta{ - Name: node.Name, - }, - }); err != nil { + existingGPUNode := &tfv1.GPUNode{} + if err := r.Get(ctx, client.ObjectKey{Name: node.Name}, existingGPUNode); err != nil { + if errors.IsNotFound(err) { + return ctrl.Result{}, nil + } + return ctrl.Result{}, fmt.Errorf("can not get gpuNode(%s) : %w", node.Name, err) + } + // delete existing gpunode if no matched pool + if err := r.Delete(ctx, existingGPUNode); err != nil { // requeue if the gpunode is not generated if errors.IsNotFound(err) { return ctrl.Result{}, nil @@ -121,6 +122,14 @@ func (r *NodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl. 
return ctrl.Result{}, nil } + // update k8s node hash + hash := utils.GetObjectHash(pool.Spec.NodeManagerConfig.NodeSelector) + if node.Labels[constants.LabelNodeSelectorHash] != hash { + if err := UpdateK8SNodeSelectorHash(ctx, r.Client, node, hash); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to update k8s node hash: %w", err) + } + } + provisioningMode := pool.Spec.NodeManagerConfig.ProvisioningMode isDirectManagedMode := provisioningMode == tfv1.ProvisioningModeProvisioned isManagedNode := isDirectManagedMode || provisioningMode == tfv1.ProvisioningModeKarpenter @@ -199,11 +208,7 @@ func (r *NodeReconciler) generateGPUNode(node *corev1.Node, pool *tfv1.GPUPool, // SetupWithManager sets up the controller with the Manager. func (r *NodeReconciler) SetupWithManager(mgr ctrl.Manager) error { // must choose an initial label selector to avoid performance impact in large Kubernetes clusters - selector := os.Getenv("INITIAL_GPU_NODE_LABEL_SELECTOR") - if selector == "" { - selector = constants.InitialGPUNodeSelector - } - selectors := strings.Split(selector, "=") + selectors := utils.GetInitialGPUNodeSelector() p, err := predicate.LabelSelectorPredicate(metav1.LabelSelector{ MatchLabels: map[string]string{ selectors[0]: selectors[1], diff --git a/internal/utils/compose.go b/internal/utils/compose.go index 344228b0..93e8248c 100644 --- a/internal/utils/compose.go +++ b/internal/utils/compose.go @@ -229,10 +229,9 @@ func AddTFDefaultClientConfBeforePatch( pod.Spec.Containers[injectContainerIndex].VolumeMounts = append( pod.Spec.Containers[injectContainerIndex].VolumeMounts, v1.VolumeMount{ - Name: constants.DataVolumeName, - MountPath: constants.SharedMemDeviceName, - SubPath: constants.SharedMemMountSubPath, - // + constants.TFLibsVolumeMountPath, SubPathExpr: constants.TFDataPathWorkerExpr, + Name: constants.DataVolumeName, + MountPath: constants.TFLibsVolumeMountPath, + SubPathExpr: constants.TFDataPathWorkerExpr, MountPropagation: ptr.To(v1.MountPropagationHostToContainer), }) @@ -682,12 +681,9 @@ func AddWorkerConfAfterTemplate(ctx context.Context, spec *v1.PodSpec, workerCon spec.Containers[0].VolumeMounts = append( spec.Containers[0].VolumeMounts, v1.VolumeMount{ - Name: constants.DataVolumeName, - MountPath: constants.SharedMemDeviceName, - // TODO not working. 
- // + constants.TFLibsVolumeMountPath - // SubPathExpr: constants.TFDataPathWorkerExpr, - SubPath: constants.SharedMemMountSubPath, + Name: constants.DataVolumeName, + MountPath: constants.TFLibsVolumeMountPath, + SubPathExpr: constants.TFDataPathWorkerExpr, MountPropagation: ptr.To(v1.MountPropagationHostToContainer), }) spec.Containers[0].Env = append(spec.Containers[0].Env, v1.EnvVar{ diff --git a/internal/utils/reconcile.go b/internal/utils/reconcile.go index ebc091ac..23026cf7 100644 --- a/internal/utils/reconcile.go +++ b/internal/utils/reconcile.go @@ -214,6 +214,15 @@ func IsTensorFusionWorker(pod *corev1.Pod) bool { return pod.Labels[constants.LabelComponent] == constants.ComponentWorker } +func GetInitialGPUNodeSelector() []string { + selector := os.Getenv("INITIAL_GPU_NODE_LABEL_SELECTOR") + if selector == "" { + selector = constants.InitialGPUNodeSelector + } + selectors := strings.Split(selector, "=") + return selectors +} + var GPUResourceNames = []corev1.ResourceName{ "nvidia.com/gpu", "amd.com/gpu", From e6281872ea961bd535804eb439392090aaa86b78 Mon Sep 17 00:00:00 2001 From: Joey Yang <14833440+Code2Life@users.noreply.github.com> Date: Tue, 9 Sep 2025 17:09:21 +0800 Subject: [PATCH 15/34] fix: upgrade k8s 1.34, fix shm path, helm chart issues (#355) * chore: lint issue * fix: kubernetes upgrade, fix scheduler deps issue * fix: upgrade k8s version to 1.34, use fixed operator version in helm chart * fix: update shm path * chore: comment & wording * fix: connection naming * fix: upgrade github action * fix: add test for dedicated gpu allocation mode --- .github/workflows/lint.yml | 2 +- .github/workflows/test.yml | 4 +- .vscode/settings.json | 4 + api/v1/gpuresourcequota_types.go | 8 +- charts/tensor-fusion/Chart.yaml | 2 +- charts/tensor-fusion/values.yaml | 4 +- cmd/main.go | 55 +- cmd/sched/setup.go | 9 + go.mod | 207 ++++---- go.sum | 470 +++++++++--------- internal/constants/env.go | 10 +- internal/controller/node_controller.go | 2 + internal/gpuallocator/gpuallocator.go | 16 +- .../scheduler/gpuresources/gpuresources.go | 63 +-- .../gpuresources/gpuresources_test.go | 78 +-- .../scheduler/gputopo/gpu_network_topo.go | 5 +- internal/server/router/allocator_info.go | 11 +- internal/utils/compose.go | 7 +- internal/webhook/v1/pod_webhook.go | 18 +- internal/webhook/v1/pod_webhook_test.go | 56 ++- internal/webhook/v1/tf_parser.go | 17 +- patches/scheduler-csi-capacity-3.patch | 53 +- patches/scheduler-pdb-2.patch | 17 +- scripts/patch-scheduler.sh | 4 + test/sched/gpufit_bench_test.go | 6 +- test/sched/scheduler_bench_test.go | 35 +- test/sched/setup.go | 63 ++- 27 files changed, 717 insertions(+), 509 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index dfae921c..f56d3f6c 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -29,7 +29,7 @@ jobs: uses: actions/checkout@v5 - name: Setup Go - uses: actions/setup-go@v5 + uses: actions/setup-go@v6 with: go-version: '~1.24' diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 1528d13e..b4be4381 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -28,13 +28,13 @@ jobs: strategy: matrix: # from https://github.com/kubernetes-sigs/controller-tools/blob/main/envtest-releases.yaml - envtest_k8s_version: [1.23.5, 1.33.0] + envtest_k8s_version: [1.23.5, 1.34.0] steps: - name: Clone the code uses: actions/checkout@v5 - name: Setup Go - uses: actions/setup-go@v5 + uses: actions/setup-go@v6 with: go-version: '~1.24' diff --git 
a/.vscode/settings.json b/.vscode/settings.json index 1285d84e..2a261510 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -25,6 +25,7 @@ "clientcmdapi", "clientgoscheme", "clientset", + "clientsetfake", "cloudnative", "cloudprovider", "clusterissuers", @@ -46,6 +47,7 @@ "envtest", "essd", "Eventf", + "featuregate", "finalizer", "Finalizers", "frameworkruntime", @@ -78,6 +80,8 @@ "iface", "imageutils", "influxdata", + "internalcache", + "internalqueue", "jsonpatch", "karpenter", "karpv", diff --git a/api/v1/gpuresourcequota_types.go b/api/v1/gpuresourcequota_types.go index c6ac1dba..1b28520a 100644 --- a/api/v1/gpuresourcequota_types.go +++ b/api/v1/gpuresourcequota_types.go @@ -19,7 +19,7 @@ package v1 import ( v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/kubernetes/pkg/scheduler/framework" + fwk "k8s.io/kube-scheduler/framework" ) // GPUResourceQuotaSpec defines the desired state of GPUResourceQuota @@ -188,6 +188,10 @@ type AllocRequest struct { PodMeta metav1.ObjectMeta } +func (p *AllocRequest) Clone() fwk.StateData { + return p +} + type GPUAllocationInfo struct { Request Resource `json:"request,omitempty"` Limit Resource `json:"limit,omitempty"` @@ -203,7 +207,7 @@ type AdjustRequest struct { NewLimit Resource } -func (ar *AllocRequest) Clone() framework.StateData { +func (ar *AdjustRequest) Clone() fwk.StateData { return ar } diff --git a/charts/tensor-fusion/Chart.yaml b/charts/tensor-fusion/Chart.yaml index c72e6082..d18568b7 100644 --- a/charts/tensor-fusion/Chart.yaml +++ b/charts/tensor-fusion/Chart.yaml @@ -15,7 +15,7 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 1.5.6 +version: 1.5.7 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to diff --git a/charts/tensor-fusion/values.yaml b/charts/tensor-fusion/values.yaml index cf4865f4..6b9fcc0c 100644 --- a/charts/tensor-fusion/values.yaml +++ b/charts/tensor-fusion/values.yaml @@ -31,7 +31,7 @@ controller: image: repository: tensorfusion/tensor-fusion-operator # Overrides the image tag whose default is the chart appVersion. - tag: "latest" + tag: "1.43.4" # This is for setting Kubernetes Annotations to a Pod. # For more information checkout: https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/ @@ -120,7 +120,7 @@ agent: image: repository: tensorfusion/tensor-fusion-agent - tag: "latest" + tag: "1.0.0" resources: requests: diff --git a/cmd/main.go b/cmd/main.go index 23cd69b8..f4f2f0ab 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -27,27 +27,6 @@ import ( // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) // to ensure that exec-entrypoint and run can make use of them. 
- - "k8s.io/client-go/kubernetes" - _ "k8s.io/client-go/plugin/pkg/client/auth" - "k8s.io/client-go/rest" - "k8s.io/klog/v2" - - "k8s.io/apimachinery/pkg/runtime" - utilruntime "k8s.io/apimachinery/pkg/util/runtime" - clientgoscheme "k8s.io/client-go/kubernetes/scheme" - "k8s.io/kubernetes/cmd/kube-scheduler/app" - "k8s.io/kubernetes/pkg/scheduler" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/healthz" - "sigs.k8s.io/controller-runtime/pkg/manager" - "sigs.k8s.io/controller-runtime/pkg/metrics/filters" - metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" - "sigs.k8s.io/controller-runtime/pkg/webhook" - - "sigs.k8s.io/yaml" - tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" "github.com/NexusGPU/tensor-fusion/cmd/sched" "github.com/NexusGPU/tensor-fusion/internal/alert" @@ -65,6 +44,25 @@ import ( "github.com/NexusGPU/tensor-fusion/internal/utils" "github.com/NexusGPU/tensor-fusion/internal/version" webhookcorev1 "github.com/NexusGPU/tensor-fusion/internal/webhook/v1" + "k8s.io/apimachinery/pkg/runtime" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + k8sVer "k8s.io/apimachinery/pkg/util/version" + "k8s.io/apiserver/pkg/util/feature" + "k8s.io/client-go/kubernetes" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + _ "k8s.io/client-go/plugin/pkg/client/auth" + "k8s.io/client-go/rest" + "k8s.io/klog/v2" + "k8s.io/kubernetes/cmd/kube-scheduler/app" + "k8s.io/kubernetes/pkg/scheduler" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/healthz" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/metrics/filters" + metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" + "sigs.k8s.io/controller-runtime/pkg/webhook" + "sigs.k8s.io/yaml" // +kubebuilder:scaffold:imports ) @@ -204,6 +202,14 @@ func main() { _ = os.Setenv(constants.KubeApiVersionMajorEnv, version.Major) _ = os.Setenv(constants.KubeApiVersionMinorEnv, version.Minor) + // TODO: there will still be risk after FeatureGate removed when the feature is stable for a long time + // To be compatible with long-term k8s version, need to patch Kubernetes source code + k8sVersion := k8sVer.MustParseSemantic(version.String()) + err = feature.DefaultMutableFeatureGate.SetEmulationVersion(k8sVersion) + if err != nil { + setupLog.Error(err, "unable to set k8s version for feature gating") + } + alertEvaluatorReady = make(chan struct{}) setupTimeSeriesAndWatchGlobalConfigChanges(ctx, mgr) @@ -221,7 +227,7 @@ func main() { pricingProvider := pricing.NewStaticPricingProvider() startWebhook(mgr, portAllocator, pricingProvider) - scheduler := startScheduler(ctx, allocator, mgr) + scheduler := startScheduler(ctx, allocator, mgr, k8sVersion) startCustomResourceController(ctx, mgr, metricsRecorder, allocator, portAllocator) @@ -461,6 +467,7 @@ func startScheduler( ctx context.Context, allocator *gpuallocator.GpuAllocator, mgr manager.Manager, + k8sVersion *k8sVer.Version, ) *scheduler.Scheduler { if os.Getenv(constants.EnableSchedulerEnv) == constants.FalseStringValue { return nil @@ -479,7 +486,9 @@ func startScheduler( gpuTopoPlugin.NewWithDeps(allocator, mgr.GetClient()), ) - cc, scheduler, err := sched.SetupScheduler(ctx, mgr, schedulerConfigPath, false, gpuResourceFitOpt, gpuTopoOpt) + cc, scheduler, err := sched.SetupScheduler( + ctx, mgr, schedulerConfigPath, false, k8sVersion, gpuResourceFitOpt, gpuTopoOpt, + ) if err != nil { 
setupLog.Error(err, "unable to create tensor fusion scheduler") os.Exit(1) diff --git a/cmd/sched/setup.go b/cmd/sched/setup.go index 2818fba2..20b28f96 100644 --- a/cmd/sched/setup.go +++ b/cmd/sched/setup.go @@ -22,6 +22,8 @@ import ( "strings" utilerrors "k8s.io/apimachinery/pkg/util/errors" + k8sVer "k8s.io/apimachinery/pkg/util/version" + "k8s.io/apiserver/pkg/util/feature" "k8s.io/client-go/tools/events" "k8s.io/component-base/configz" "k8s.io/klog/v2" @@ -50,6 +52,7 @@ func SetupScheduler( mgr manager.Manager, schedulerConfigPath string, disableHttpEndpoint bool, + k8sVersion *k8sVer.Version, outOfTreeRegistryOptions ...app.Option, ) (*schedulerserverconfig.CompletedConfig, *scheduler.Scheduler, error) { opts := options.NewOptions() @@ -73,6 +76,12 @@ func SetupScheduler( return nil, nil, err } + // Setup enumerationVersion again since it's overridden by the config + err = feature.DefaultMutableFeatureGate.SetEmulationVersion(k8sVersion) + if err != nil { + return nil, nil, err + } + if cfg, err := latest.Default(); err != nil { return nil, nil, err } else { diff --git a/go.mod b/go.mod index 72d32a0f..e8da7faf 100644 --- a/go.mod +++ b/go.mod @@ -6,171 +6,184 @@ require ( github.com/DATA-DOG/go-sqlmock v1.5.2 github.com/NVIDIA/go-nvml v0.13.0-1 github.com/aliyun/alibaba-cloud-sdk-go v1.63.107 - github.com/aws/aws-sdk-go-v2 v1.38.1 - github.com/aws/aws-sdk-go-v2/service/ec2 v1.241.0 - github.com/awslabs/operatorpkg v0.0.0-20250721225858-4e7491c57aa5 + github.com/aws/aws-sdk-go-v2 v1.38.3 + github.com/aws/aws-sdk-go-v2/service/ec2 v1.251.0 + github.com/awslabs/operatorpkg v0.0.0-20250903180825-ba7ac0af36e5 github.com/gin-contrib/gzip v1.2.3 github.com/gin-gonic/gin v1.10.1 - github.com/go-sql-driver/mysql v1.8.1 + github.com/go-sql-driver/mysql v1.9.3 github.com/influxdata/line-protocol/v2 v2.2.1 github.com/lithammer/shortuuid/v4 v4.2.0 github.com/mitchellh/mapstructure v1.5.0 - github.com/onsi/ginkgo/v2 v2.23.4 - github.com/onsi/gomega v1.38.0 + github.com/onsi/ginkgo/v2 v2.25.3 + github.com/onsi/gomega v1.38.2 github.com/pkg/errors v0.9.1 github.com/samber/lo v1.51.0 github.com/shirou/gopsutil v3.21.11+incompatible - github.com/stretchr/testify v1.11.0 - go.opentelemetry.io/otel v1.37.0 + github.com/stretchr/testify v1.11.1 + go.opentelemetry.io/otel v1.38.0 go.uber.org/zap v1.27.0 - golang.org/x/time v0.12.0 + golang.org/x/time v0.13.0 gomodules.xyz/jsonpatch/v2 v2.5.0 gopkg.in/natefinch/lumberjack.v2 v2.2.1 gorm.io/driver/mysql v1.6.0 - gorm.io/gorm v1.30.1 - k8s.io/api v0.33.3 - k8s.io/apimachinery v0.33.3 - k8s.io/client-go v0.33.3 - k8s.io/component-base v0.33.3 - k8s.io/component-helpers v0.33.3 + gorm.io/gorm v1.30.3 + k8s.io/api v0.34.0 + k8s.io/apimachinery v0.34.0 + k8s.io/client-go v0.34.0 + k8s.io/component-base v0.34.0 + k8s.io/component-helpers v0.34.0 k8s.io/klog/v2 v2.130.1 - k8s.io/kubernetes v1.33.4 - k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 - sigs.k8s.io/controller-runtime v0.21.0 - sigs.k8s.io/karpenter v1.6.1 - sigs.k8s.io/scheduler-plugins v0.32.7 + k8s.io/kube-scheduler v0.34.0 + k8s.io/kubernetes v1.34.0 + k8s.io/utils v0.0.0-20250820121507-0af2bda4dd1d + sigs.k8s.io/controller-runtime v0.22.0 + sigs.k8s.io/karpenter v1.6.2 sigs.k8s.io/yaml v1.6.0 ) require ( - cel.dev/expr v0.23.1 // indirect + cel.dev/expr v0.24.0 // indirect filippo.io/edwards25519 v1.1.0 // indirect - github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 // indirect + github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c // indirect + 
github.com/Masterminds/semver/v3 v3.4.0 // indirect github.com/NYTimes/gziphandler v1.1.1 // indirect github.com/antlr4-go/antlr/v4 v4.13.1 // indirect - github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.2 // indirect - github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.2 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.0 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.2 // indirect - github.com/aws/smithy-go v1.22.5 // indirect + github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.6 // indirect + github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.6 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.1 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.6 // indirect + github.com/aws/smithy-go v1.23.0 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/blang/semver/v4 v4.0.0 // indirect - github.com/bytedance/sonic v1.13.2 // indirect - github.com/bytedance/sonic/loader v0.2.4 // indirect - github.com/cenkalti/backoff/v4 v4.3.0 // indirect + github.com/bytedance/gopkg v0.1.3 // indirect + github.com/bytedance/sonic v1.14.1 // indirect + github.com/bytedance/sonic/loader v0.3.0 // indirect + github.com/cenkalti/backoff/v5 v5.0.3 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect - github.com/cloudwego/base64x v0.1.5 // indirect + github.com/cloudwego/base64x v0.1.6 // indirect github.com/coreos/go-semver v0.3.1 // indirect - github.com/coreos/go-systemd/v22 v22.5.0 // indirect + github.com/coreos/go-systemd/v22 v22.6.0 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/distribution/reference v0.6.0 // indirect - github.com/emicklei/go-restful/v3 v3.12.1 // indirect + github.com/emicklei/go-restful/v3 v3.13.0 // indirect github.com/evanphx/json-patch/v5 v5.9.11 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect - github.com/fsnotify/fsnotify v1.8.0 // indirect - github.com/fxamacker/cbor/v2 v2.7.0 // indirect - github.com/gabriel-vasile/mimetype v1.4.8 // indirect - github.com/gin-contrib/sse v1.0.0 // indirect + github.com/fsnotify/fsnotify v1.9.0 // indirect + github.com/fxamacker/cbor/v2 v2.9.0 // indirect + github.com/gabriel-vasile/mimetype v1.4.10 // indirect + github.com/gin-contrib/sse v1.1.0 // indirect github.com/go-logr/logr v1.4.3 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/go-logr/zapr v1.3.0 // indirect - github.com/go-ole/go-ole v1.2.6 // indirect - github.com/go-openapi/jsonpointer v0.21.0 // indirect - github.com/go-openapi/jsonreference v0.21.0 // indirect - github.com/go-openapi/swag v0.23.0 // indirect + github.com/go-ole/go-ole v1.3.0 // indirect + github.com/go-openapi/jsonpointer v0.22.0 // indirect + github.com/go-openapi/jsonreference v0.21.1 // indirect + github.com/go-openapi/swag v0.24.1 // indirect + github.com/go-openapi/swag/cmdutils v0.24.0 // indirect + github.com/go-openapi/swag/conv v0.24.0 // indirect + github.com/go-openapi/swag/fileutils v0.24.0 // indirect + github.com/go-openapi/swag/jsonname v0.24.0 // indirect + github.com/go-openapi/swag/jsonutils v0.24.0 // indirect + github.com/go-openapi/swag/loading v0.24.0 // indirect + github.com/go-openapi/swag/mangling v0.24.0 // indirect + github.com/go-openapi/swag/netutils v0.24.0 // indirect + github.com/go-openapi/swag/stringutils v0.24.0 // indirect + github.com/go-openapi/swag/typeutils v0.24.0 // indirect + github.com/go-openapi/swag/yamlutils v0.24.0 // 
indirect github.com/go-playground/locales v0.14.1 // indirect github.com/go-playground/universal-translator v0.18.1 // indirect - github.com/go-playground/validator/v10 v10.26.0 // indirect + github.com/go-playground/validator/v10 v10.27.0 // indirect github.com/go-task/slim-sprig/v3 v3.0.0 // indirect github.com/goccy/go-json v0.10.5 // indirect github.com/gogo/protobuf v1.3.2 // indirect github.com/golang/protobuf v1.5.4 // indirect github.com/google/btree v1.1.3 // indirect - github.com/google/cel-go v0.23.2 // indirect - github.com/google/gnostic-models v0.6.9 // indirect + github.com/google/cel-go v0.26.1 // indirect + github.com/google/gnostic-models v0.7.0 // indirect github.com/google/go-cmp v0.7.0 // indirect - github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 // indirect + github.com/google/pprof v0.0.0-20250903194437-c28834ac2320 // indirect github.com/google/uuid v1.6.0 // indirect github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 // indirect - github.com/grpc-ecosystem/grpc-gateway/v2 v2.25.1 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/jinzhu/inflection v1.0.0 // indirect github.com/jinzhu/now v1.1.5 // indirect github.com/jmespath/go-jmespath v0.4.0 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect - github.com/k8stopologyawareschedwg/noderesourcetopology-api v0.1.2 // indirect - github.com/klauspost/cpuid/v2 v2.2.10 // indirect + github.com/klauspost/cpuid/v2 v2.3.0 // indirect github.com/kylelemons/godebug v1.1.0 // indirect github.com/leodido/go-urn v1.4.0 // indirect github.com/mailru/easyjson v0.9.0 // indirect github.com/mattn/go-isatty v0.0.20 // indirect github.com/mitchellh/hashstructure/v2 v2.0.2 // indirect - github.com/moby/term v0.5.0 // indirect + github.com/moby/term v0.5.2 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect - github.com/modern-go/reflect2 v1.0.2 // indirect + github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/opencontainers/go-digest v1.0.0 // indirect github.com/opentracing/opentracing-go v1.2.1-0.20220228012449-10b1cf09e00b // indirect - github.com/pelletier/go-toml/v2 v2.2.3 // indirect + github.com/pelletier/go-toml/v2 v2.2.4 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect - github.com/prometheus/client_golang v1.22.0 // indirect + github.com/prometheus/client_golang v1.23.2 // indirect github.com/prometheus/client_model v0.6.2 // indirect - github.com/prometheus/common v0.62.0 // indirect - github.com/prometheus/procfs v0.15.1 // indirect + github.com/prometheus/common v0.66.1 // indirect + github.com/prometheus/procfs v0.17.0 // indirect github.com/robfig/cron/v3 v3.0.1 // indirect - github.com/spf13/cobra v1.8.1 // indirect - github.com/spf13/pflag v1.0.6 // indirect - github.com/stoewer/go-strcase v1.3.0 // indirect + github.com/spf13/cobra v1.10.1 // indirect + github.com/spf13/pflag v1.0.10 // indirect + github.com/stoewer/go-strcase v1.3.1 // indirect github.com/twitchyliquid64/golang-asm v0.15.1 // indirect - github.com/ugorji/go/codec v1.2.12 // indirect + github.com/ugorji/go/codec v1.3.0 // indirect github.com/x448/float16 v0.8.4 // indirect github.com/yusufpapurcu/wmi v1.2.4 // indirect - go.etcd.io/etcd/api/v3 v3.5.21 // indirect - go.etcd.io/etcd/client/pkg/v3 v3.5.21 
// indirect - go.etcd.io/etcd/client/v3 v3.5.21 // indirect + go.etcd.io/etcd/api/v3 v3.6.4 // indirect + go.etcd.io/etcd/client/pkg/v3 v3.6.4 // indirect + go.etcd.io/etcd/client/v3 v3.6.4 // indirect go.opentelemetry.io/auto/sdk v1.1.0 // indirect - go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.58.0 // indirect - go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.33.0 // indirect - go.opentelemetry.io/otel/metric v1.37.0 // indirect - go.opentelemetry.io/otel/sdk v1.33.0 // indirect - go.opentelemetry.io/otel/trace v1.37.0 // indirect - go.opentelemetry.io/proto/otlp v1.4.0 // indirect + go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.63.0 // indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 // indirect + go.opentelemetry.io/otel/metric v1.38.0 // indirect + go.opentelemetry.io/otel/sdk v1.38.0 // indirect + go.opentelemetry.io/otel/trace v1.38.0 // indirect + go.opentelemetry.io/proto/otlp v1.8.0 // indirect go.uber.org/automaxprocs v1.6.0 // indirect go.uber.org/multierr v1.11.0 // indirect go.yaml.in/yaml/v2 v2.4.2 // indirect - golang.org/x/arch v0.15.0 // indirect - golang.org/x/crypto v0.39.0 // indirect - golang.org/x/exp v0.0.0-20250506013437-ce4c2cf36ca6 // indirect - golang.org/x/net v0.41.0 // indirect - golang.org/x/oauth2 v0.27.0 // indirect - golang.org/x/sync v0.15.0 // indirect - golang.org/x/sys v0.33.0 // indirect - golang.org/x/term v0.32.0 // indirect - golang.org/x/text v0.26.0 // indirect - golang.org/x/tools v0.33.0 // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20241223144023-3abc09e42ca8 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20250102185135-69823020774d // indirect - google.golang.org/grpc v1.69.4 // indirect - google.golang.org/protobuf v1.36.6 // indirect - gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect + go.yaml.in/yaml/v3 v3.0.4 // indirect + golang.org/x/arch v0.21.0 // indirect + golang.org/x/crypto v0.41.0 // indirect + golang.org/x/exp v0.0.0-20250819193227-8b4c13bb791b // indirect + golang.org/x/net v0.43.0 // indirect + golang.org/x/oauth2 v0.31.0 // indirect + golang.org/x/sync v0.17.0 // indirect + golang.org/x/sys v0.36.0 // indirect + golang.org/x/term v0.35.0 // indirect + golang.org/x/text v0.29.0 // indirect + golang.org/x/tools v0.36.0 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20250826171959-ef028d996bc1 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20250826171959-ef028d996bc1 // indirect + google.golang.org/grpc v1.75.0 // indirect + google.golang.org/protobuf v1.36.8 // indirect + gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/ini.v1 v1.67.0 // indirect + gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/apiextensions-apiserver v0.33.2 // indirect - k8s.io/apiserver v0.33.2 // indirect - k8s.io/cloud-provider v0.33.2 // indirect - k8s.io/controller-manager v0.33.2 // indirect - k8s.io/csi-translation-lib v0.33.2 // indirect - k8s.io/dynamic-resource-allocation v0.33.1 // indirect - k8s.io/kms v0.33.2 // indirect - k8s.io/kube-openapi v0.0.0-20250610211856-8b98d1ed966a 
// indirect - k8s.io/kube-scheduler v0.32.7 // indirect - k8s.io/kubelet v0.33.1 // indirect - sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 // indirect - sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect + k8s.io/apiextensions-apiserver v0.34.0 // indirect + k8s.io/apiserver v0.34.0 // indirect + k8s.io/cloud-provider v0.34.0 // indirect + k8s.io/controller-manager v0.34.0 // indirect + k8s.io/csi-translation-lib v0.34.0 // indirect + k8s.io/dynamic-resource-allocation v0.34.0 // indirect + k8s.io/kms v0.34.0 // indirect + k8s.io/kube-openapi v0.0.0-20250905212525-66792eed8611 // indirect + k8s.io/kubelet v0.34.0 // indirect + sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.33.0 // indirect + sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect sigs.k8s.io/randfill v1.0.0 // indirect - sigs.k8s.io/structured-merge-diff/v4 v4.6.0 // indirect + sigs.k8s.io/structured-merge-diff/v6 v6.3.0 // indirect ) diff --git a/go.sum b/go.sum index 0f62b153..446e3470 100644 --- a/go.sum +++ b/go.sum @@ -1,14 +1,16 @@ -cel.dev/expr v0.23.1 h1:K4KOtPCJQjVggkARsjG9RWXP6O4R73aHeJMa/dmCQQg= -cel.dev/expr v0.23.1/go.mod h1:hLPLo1W4QUmuYdA72RBX06QTs6MXw941piREPl3Yfiw= +cel.dev/expr v0.24.0 h1:56OvJKSH3hDGL0ml5uSxZmz3/3Pq4tJ+fb1unVLAFcY= +cel.dev/expr v0.24.0/go.mod h1:hLPLo1W4QUmuYdA72RBX06QTs6MXw941piREPl3Yfiw= dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA= filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4= -github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 h1:L/gRVlceqvL25UVaW/CKtUDjefjrs0SPonmDGUVOYP0= -github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E= +github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c h1:udKWzYgxTojEKWjV8V+WSxDXJ4NFATAsZjh8iIbsQIg= +github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E= github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= github.com/DATA-DOG/go-sqlmock v1.5.2 h1:OcvFkGmslmlZibjAjaHm3L//6LiuBgolP7OputlJIzU= github.com/DATA-DOG/go-sqlmock v1.5.2/go.mod h1:88MAG/4G7SMwSE3CeA0ZKzrT5CiOU3OJ+JlNzwDqpNU= github.com/HdrHistogram/hdrhistogram-go v1.1.2/go.mod h1:yDgFjdqOqDEKOvasDdhWNXYg9BVp4O+o5f6V/ehm6Oo= +github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0= +github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= github.com/NVIDIA/go-nvml v0.13.0-1 h1:OLX8Jq3dONuPOQPC7rndB6+iDmDakw0XTYgzMxObkEw= github.com/NVIDIA/go-nvml v0.13.0-1/go.mod h1:+KNA7c7gIBH7SKSJ1ntlwkfN80zdx8ovl4hrK3LmPt4= github.com/NYTimes/gziphandler v1.1.1 h1:ZUDjpQae29j0ryrS0u/B8HZfJBtBQHjqw2rQ2cqUQ3I= @@ -22,43 +24,43 @@ github.com/antlr4-go/antlr/v4 v4.13.1 h1:SqQKkuVZ+zWkMMNkjy5FZe5mr5WURWnlpmOuzYW github.com/antlr4-go/antlr/v4 v4.13.1/go.mod h1:GKmUxMtwp6ZgGwZSva4eWPC5mS6vUAmOABFgjdkM7Nw= github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHSxpiH9JdtuBj0= github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY= -github.com/aws/aws-sdk-go-v2 v1.38.1 h1:j7sc33amE74Rz0M/PoCpsZQ6OunLqys/m5antM0J+Z8= -github.com/aws/aws-sdk-go-v2 v1.38.1/go.mod h1:9Q0OoGQoboYIAJyslFyF1f5K1Ryddop8gqMhWx/n4Wg= 
-github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.2 h1:sPiRHLVUIIQcoVZTNwqQcdtjkqkPopyYmIX0M5ElRf4= -github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.2/go.mod h1:ik86P3sgV+Bk7c1tBFCwI3VxMoSEwl4YkRB9xn1s340= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.2 h1:ZdzDAg075H6stMZtbD2o+PyB933M/f20e9WmCBC17wA= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.2/go.mod h1:eE1IIzXG9sdZCB0pNNpMpsYTLl4YdOQD3njiVN1e/E4= -github.com/aws/aws-sdk-go-v2/service/ec2 v1.241.0 h1:twGX//bv1QH/9pyJaqynNSo0eXGkDEdDTFy8GNPsz5M= -github.com/aws/aws-sdk-go-v2/service/ec2 v1.241.0/go.mod h1:HDxGArx3/bUnkoFsuvTNIxEj/cR3f+IgsVh1B7Pvay8= -github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.0 h1:6+lZi2JeGKtCraAj1rpoZfKqnQ9SptseRZioejfUOLM= -github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.0/go.mod h1:eb3gfbVIxIoGgJsi9pGne19dhCBpK6opTYpQqAmdy44= -github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.2 h1:oxmDEO14NBZJbK/M8y3brhMFEIGN4j8a6Aq8eY0sqlo= -github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.2/go.mod h1:4hH+8QCrk1uRWDPsVfsNDUup3taAjO8Dnx63au7smAU= -github.com/aws/smithy-go v1.22.5 h1:P9ATCXPMb2mPjYBgueqJNCA5S9UfktsW0tTxi+a7eqw= -github.com/aws/smithy-go v1.22.5/go.mod h1:t1ufH5HMublsJYulve2RKmHDC15xu1f26kHCp/HgceI= -github.com/awslabs/operatorpkg v0.0.0-20250721225858-4e7491c57aa5 h1:m/qynRSKYe4RKSroVqRRgMlp/cUXO54SY2upSUqfcqw= -github.com/awslabs/operatorpkg v0.0.0-20250721225858-4e7491c57aa5/go.mod h1:3Lf3VaiJyr3IP0gH53sZp16Tu5CmoaDSUv4KQwFQO/I= +github.com/aws/aws-sdk-go-v2 v1.38.3 h1:B6cV4oxnMs45fql4yRH+/Po/YU+597zgWqvDpYMturk= +github.com/aws/aws-sdk-go-v2 v1.38.3/go.mod h1:sDioUELIUO9Znk23YVmIk86/9DOpkbyyVb1i/gUNFXY= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.6 h1:uF68eJA6+S9iVr9WgX1NaRGyQ/6MdIyc4JNUo6TN1FA= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.6/go.mod h1:qlPeVZCGPiobx8wb1ft0GHT5l+dc6ldnwInDFaMvC7Y= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.6 h1:pa1DEC6JoI0zduhZePp3zmhWvk/xxm4NB8Hy/Tlsgos= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.6/go.mod h1:gxEjPebnhWGJoaDdtDkA0JX46VRg1wcTHYe63OfX5pE= +github.com/aws/aws-sdk-go-v2/service/ec2 v1.251.0 h1:hGHSNZDTFnhLGUpRkQORM8uBY9R/FOkxCkuUUJBEOQ4= +github.com/aws/aws-sdk-go-v2/service/ec2 v1.251.0/go.mod h1:SmMqzfS4HVsOD58lwLZ79oxF58f8zVe5YdK3o+/o1Ck= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.1 h1:oegbebPEMA/1Jny7kvwejowCaHz1FWZAQ94WXFNCyTM= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.1/go.mod h1:kemo5Myr9ac0U9JfSjMo9yHLtw+pECEHsFtJ9tqCEI8= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.6 h1:LHS1YAIJXJ4K9zS+1d/xa9JAA9sL2QyXIQCQFQW/X08= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.6/go.mod h1:c9PCiTEuh0wQID5/KqA32J+HAgZxN9tOGXKCiYJjTZI= +github.com/aws/smithy-go v1.23.0 h1:8n6I3gXzWJB2DxBDnfxgBaSX6oe0d/t10qGz7OKqMCE= +github.com/aws/smithy-go v1.23.0/go.mod h1:t1ufH5HMublsJYulve2RKmHDC15xu1f26kHCp/HgceI= +github.com/awslabs/operatorpkg v0.0.0-20250903180825-ba7ac0af36e5 h1:MM4Y7+YqhWLZiRuZfWrAXD2rZ0maVePbzARP3adeJ+g= +github.com/awslabs/operatorpkg v0.0.0-20250903180825-ba7ac0af36e5/go.mod h1:OCT5DIzVB2740qVgfRz0zQe/dDdvnsnFarzy6VdYNoA= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= github.com/blang/semver/v4 v4.0.0/go.mod 
h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= -github.com/bytedance/sonic v1.13.2 h1:8/H1FempDZqC4VqjptGo14QQlJx8VdZJegxs6wwfqpQ= -github.com/bytedance/sonic v1.13.2/go.mod h1:o68xyaF9u2gvVBuGHPlUVCy+ZfmNNO5ETf1+KgkJhz4= -github.com/bytedance/sonic/loader v0.1.1/go.mod h1:ncP89zfokxS5LZrJxl5z0UJcsk4M4yY2JpfqGeCtNLU= -github.com/bytedance/sonic/loader v0.2.4 h1:ZWCw4stuXUsn1/+zQDqeE7JKP+QO47tz7QCNan80NzY= -github.com/bytedance/sonic/loader v0.2.4/go.mod h1:N8A3vUdtUebEY2/VQC0MyhYeKUFosQU6FxH2JmUe6VI= -github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= -github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= +github.com/bytedance/gopkg v0.1.3 h1:TPBSwH8RsouGCBcMBktLt1AymVo2TVsBVCY4b6TnZ/M= +github.com/bytedance/gopkg v0.1.3/go.mod h1:576VvJ+eJgyCzdjS+c4+77QF3p7ubbtiKARP3TxducM= +github.com/bytedance/sonic v1.14.1 h1:FBMC0zVz5XUmE4z9wF4Jey0An5FueFvOsTKKKtwIl7w= +github.com/bytedance/sonic v1.14.1/go.mod h1:gi6uhQLMbTdeP0muCnrjHLeCUPyb70ujhnNlhOylAFc= +github.com/bytedance/sonic/loader v0.3.0 h1:dskwH8edlzNMctoruo8FPTJDF3vLtDT0sXZwvZJyqeA= +github.com/bytedance/sonic/loader v0.3.0/go.mod h1:N8A3vUdtUebEY2/VQC0MyhYeKUFosQU6FxH2JmUe6VI= +github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM= +github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/cloudwego/base64x v0.1.5 h1:XPciSp1xaq2VCSt6lF0phncD4koWyULpl5bUxbfCyP4= -github.com/cloudwego/base64x v0.1.5/go.mod h1:0zlkT4Wn5C6NdauXdJRhSKRlJvmclQ1hhJgA0rcu/8w= -github.com/cloudwego/iasm v0.2.0/go.mod h1:8rXZaNYT2n95jn+zTI1sDr+IgcD2GVs0nlbbQPiEFhY= +github.com/cloudwego/base64x v0.1.6 h1:t11wG9AECkCDk5fMSoxmufanudBtJ+/HemLstXDLI2M= +github.com/cloudwego/base64x v0.1.6/go.mod h1:OFcloc187FXDaYHvrNIjxSe8ncn0OOM8gEHfghB2IPU= github.com/coreos/go-semver v0.3.1 h1:yi21YpKnrx1gt5R+la8n5WgS0kCrsPp33dmEyHReZr4= github.com/coreos/go-semver v0.3.1/go.mod h1:irMmmIw/7yzSRPWryHsK7EYSg09caPQL03VsM8rvUec= -github.com/coreos/go-systemd/v22 v22.5.0 h1:RrqgGjYQKalulkV8NGVIfkXQf6YYmOyiJKk8iXXhfZs= -github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= -github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= +github.com/coreos/go-systemd/v22 v22.6.0 h1:aGVa/v8B7hpb0TKl0MWoAavPDmHvobFe5R5zn0bCJWo= +github.com/coreos/go-systemd/v22 v22.6.0/go.mod h1:iG+pp635Fo7ZmV/j14KUcmEyWF+0X7Lua8rrTWzYgWU= +github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/creack/pty v1.1.18 h1:n56/Zwd5o6whRC5PMGretI4IdRLlmBXYNjScPaBgsbY= github.com/creack/pty v1.1.18/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr4O4= @@ -70,8 +72,8 @@ github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5Qvfr github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E= github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= -github.com/emicklei/go-restful/v3 v3.12.1 h1:PJMDIM/ak7btuL8Ex0iYET9hxM3CI2sjZtzpL63nKAU= -github.com/emicklei/go-restful/v3 v3.12.1/go.mod 
h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/emicklei/go-restful/v3 v3.13.0 h1:C4Bl2xDndpU6nJ4bc1jXd+uTmYPVUwkD6bFY/oTyCes= +github.com/emicklei/go-restful/v3 v3.13.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= github.com/evanphx/json-patch v5.6.0+incompatible h1:jBYDEEiFBPxA0v50tFdvOzQQTCvpL6mnFh5mB2/l16U= github.com/evanphx/json-patch v5.6.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU= @@ -83,16 +85,16 @@ github.com/frankban/quicktest v1.11.0/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P github.com/frankban/quicktest v1.11.2/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s= github.com/frankban/quicktest v1.13.0 h1:yNZif1OkDfNoDfb9zZa9aXIpejNR4F23Wely0c+Qdqk= github.com/frankban/quicktest v1.13.0/go.mod h1:qLE0fzW0VuyUAJgPU19zByoIr0HtCHN/r/VLSOOIySU= -github.com/fsnotify/fsnotify v1.8.0 h1:dAwr6QBTBZIkG8roQaJjGof0pp0EeF+tNV7YBP3F/8M= -github.com/fsnotify/fsnotify v1.8.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= -github.com/fxamacker/cbor/v2 v2.7.0 h1:iM5WgngdRBanHcxugY4JySA0nk1wZorNOpTgCMedv5E= -github.com/fxamacker/cbor/v2 v2.7.0/go.mod h1:pxXPTn3joSm21Gbwsv0w9OSA2y1HFR9qXEeXQVeNoDQ= -github.com/gabriel-vasile/mimetype v1.4.8 h1:FfZ3gj38NjllZIeJAmMhr+qKL8Wu+nOoI3GqacKw1NM= -github.com/gabriel-vasile/mimetype v1.4.8/go.mod h1:ByKUIKGjh1ODkGM1asKUbQZOLGrPjydw3hYPU2YU9t8= +github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= +github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= +github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM= +github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= +github.com/gabriel-vasile/mimetype v1.4.10 h1:zyueNbySn/z8mJZHLt6IPw0KoZsiQNszIpU+bX4+ZK0= +github.com/gabriel-vasile/mimetype v1.4.10/go.mod h1:d+9Oxyo1wTzWdyVUPMmXFvp4F9tea18J8ufA774AB3s= github.com/gin-contrib/gzip v1.2.3 h1:dAhT722RuEG330ce2agAs75z7yB+NKvX/ZM1r8w0u2U= github.com/gin-contrib/gzip v1.2.3/go.mod h1:ad72i4Bzmaypk8M762gNXa2wkxxjbz0icRNnuLJ9a/c= -github.com/gin-contrib/sse v1.0.0 h1:y3bT1mUWUxDpW4JLQg/HnTqV4rozuW4tC9eFKTxYI9E= -github.com/gin-contrib/sse v1.0.0/go.mod h1:zNuFdwarAygJBht0NTKiSi3jRf6RbqeILZ9Sp6Slhe0= +github.com/gin-contrib/sse v1.1.0 h1:n0w2GMuUpWDVp7qSpvze6fAu9iRxJY4Hmj6AmBOU05w= +github.com/gin-contrib/sse v1.1.0/go.mod h1:hxRZ5gVpWMT7Z0B0gSNYqqsSCNIJMjzvm6fqCz9vjwM= github.com/gin-gonic/gin v1.10.1 h1:T0ujvqyCSqRopADpgPgiTT63DUQVSfojyME59Ei63pQ= github.com/gin-gonic/gin v1.10.1/go.mod h1:4PMNQiOhvDRa013RKVbsiNwoyezlm2rm0uX/T7kzp5Y= github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU= @@ -103,66 +105,87 @@ github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ= github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg= -github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= -github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ= -github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY= -github.com/go-openapi/jsonreference v0.21.0 
h1:Rs+Y7hSXT83Jacb7kFyjn4ijOuVGSvOdF2+tg1TRrwQ= -github.com/go-openapi/jsonreference v0.21.0/go.mod h1:LmZmgsrTkVg9LG4EaHeY8cBDslNPMo06cago5JNLkm4= -github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= -github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= +github.com/go-ole/go-ole v1.3.0 h1:Dt6ye7+vXGIKZ7Xtk4s6/xVdGDQynvom7xCFEdWr6uE= +github.com/go-ole/go-ole v1.3.0/go.mod h1:5LS6F96DhAwUc7C+1HLexzMXY1xGRSryjyPPKW6zv78= +github.com/go-openapi/jsonpointer v0.22.0 h1:TmMhghgNef9YXxTu1tOopo+0BGEytxA+okbry0HjZsM= +github.com/go-openapi/jsonpointer v0.22.0/go.mod h1:xt3jV88UtExdIkkL7NloURjRQjbeUgcxFblMjq2iaiU= +github.com/go-openapi/jsonreference v0.21.1 h1:bSKrcl8819zKiOgxkbVNRUBIr6Wwj9KYrDbMjRs0cDA= +github.com/go-openapi/jsonreference v0.21.1/go.mod h1:PWs8rO4xxTUqKGu+lEvvCxD5k2X7QYkKAepJyCmSTT8= +github.com/go-openapi/swag v0.24.1 h1:DPdYTZKo6AQCRqzwr/kGkxJzHhpKxZ9i/oX0zag+MF8= +github.com/go-openapi/swag v0.24.1/go.mod h1:sm8I3lCPlspsBBwUm1t5oZeWZS0s7m/A+Psg0ooRU0A= +github.com/go-openapi/swag/cmdutils v0.24.0 h1:KlRCffHwXFI6E5MV9n8o8zBRElpY4uK4yWyAMWETo9I= +github.com/go-openapi/swag/cmdutils v0.24.0/go.mod h1:uxib2FAeQMByyHomTlsP8h1TtPd54Msu2ZDU/H5Vuf8= +github.com/go-openapi/swag/conv v0.24.0 h1:ejB9+7yogkWly6pnruRX45D1/6J+ZxRu92YFivx54ik= +github.com/go-openapi/swag/conv v0.24.0/go.mod h1:jbn140mZd7EW2g8a8Y5bwm8/Wy1slLySQQ0ND6DPc2c= +github.com/go-openapi/swag/fileutils v0.24.0 h1:U9pCpqp4RUytnD689Ek/N1d2N/a//XCeqoH508H5oak= +github.com/go-openapi/swag/fileutils v0.24.0/go.mod h1:3SCrCSBHyP1/N+3oErQ1gP+OX1GV2QYFSnrTbzwli90= +github.com/go-openapi/swag/jsonname v0.24.0 h1:2wKS9bgRV/xB8c62Qg16w4AUiIrqqiniJFtZGi3dg5k= +github.com/go-openapi/swag/jsonname v0.24.0/go.mod h1:GXqrPzGJe611P7LG4QB9JKPtUZ7flE4DOVechNaDd7Q= +github.com/go-openapi/swag/jsonutils v0.24.0 h1:F1vE1q4pg1xtO3HTyJYRmEuJ4jmIp2iZ30bzW5XgZts= +github.com/go-openapi/swag/jsonutils v0.24.0/go.mod h1:vBowZtF5Z4DDApIoxcIVfR8v0l9oq5PpYRUuteVu6f0= +github.com/go-openapi/swag/loading v0.24.0 h1:ln/fWTwJp2Zkj5DdaX4JPiddFC5CHQpvaBKycOlceYc= +github.com/go-openapi/swag/loading v0.24.0/go.mod h1:gShCN4woKZYIxPxbfbyHgjXAhO61m88tmjy0lp/LkJk= +github.com/go-openapi/swag/mangling v0.24.0 h1:PGOQpViCOUroIeak/Uj/sjGAq9LADS3mOyjznmHy2pk= +github.com/go-openapi/swag/mangling v0.24.0/go.mod h1:Jm5Go9LHkycsz0wfoaBDkdc4CkpuSnIEf62brzyCbhc= +github.com/go-openapi/swag/netutils v0.24.0 h1:Bz02HRjYv8046Ycg/w80q3g9QCWeIqTvlyOjQPDjD8w= +github.com/go-openapi/swag/netutils v0.24.0/go.mod h1:WRgiHcYTnx+IqfMCtu0hy9oOaPR0HnPbmArSRN1SkZM= +github.com/go-openapi/swag/stringutils v0.24.0 h1:i4Z/Jawf9EvXOLUbT97O0HbPUja18VdBxeadyAqS1FM= +github.com/go-openapi/swag/stringutils v0.24.0/go.mod h1:5nUXB4xA0kw2df5PRipZDslPJgJut+NjL7D25zPZ/4w= +github.com/go-openapi/swag/typeutils v0.24.0 h1:d3szEGzGDf4L2y1gYOSSLeK6h46F+zibnEas2Jm/wIw= +github.com/go-openapi/swag/typeutils v0.24.0/go.mod h1:q8C3Kmk/vh2VhpCLaoR2MVWOGP8y7Jc8l82qCTd1DYI= +github.com/go-openapi/swag/yamlutils v0.24.0 h1:bhw4894A7Iw6ne+639hsBNRHg9iZg/ISrOVr+sJGp4c= +github.com/go-openapi/swag/yamlutils v0.24.0/go.mod h1:DpKv5aYuaGm/sULePoeiG8uwMpZSfReo1HR3Ik0yaG8= github.com/go-playground/assert/v2 v2.2.0 h1:JvknZsQTYeFEAhQwI4qEt9cyV5ONwRHC+lYKSsYSR8s= github.com/go-playground/assert/v2 v2.2.0/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4= github.com/go-playground/locales v0.14.1 h1:EWaQ/wswjilfKLTECiXz7Rh+3BjFhfDFKv/oXslEjJA= github.com/go-playground/locales v0.14.1/go.mod h1:hxrqLVvrK65+Rwrd5Fc6F2O76J/NuW9t0sjnWqG1slY= 
github.com/go-playground/universal-translator v0.18.1 h1:Bcnm0ZwsGyWbCzImXv+pAJnYK9S473LQFuzCbDbfSFY= github.com/go-playground/universal-translator v0.18.1/go.mod h1:xekY+UJKNuX9WP91TpwSH2VMlDf28Uj24BCp08ZFTUY= -github.com/go-playground/validator/v10 v10.26.0 h1:SP05Nqhjcvz81uJaRfEV0YBSSSGMc/iMaVtFbr3Sw2k= -github.com/go-playground/validator/v10 v10.26.0/go.mod h1:I5QpIEbmr8On7W0TktmJAumgzX4CA1XNl4ZmDuVHKKo= -github.com/go-sql-driver/mysql v1.8.1 h1:LedoTUt/eveggdHS9qUFC1EFSa8bU2+1pZjSRpvNJ1Y= -github.com/go-sql-driver/mysql v1.8.1/go.mod h1:wEBSXgmK//2ZFJyE+qWnIsVGmvmEKlqwuVSjsCm7DZg= +github.com/go-playground/validator/v10 v10.27.0 h1:w8+XrWVMhGkxOaaowyKH35gFydVHOvC0/uWoy2Fzwn4= +github.com/go-playground/validator/v10 v10.27.0/go.mod h1:I5QpIEbmr8On7W0TktmJAumgzX4CA1XNl4ZmDuVHKKo= +github.com/go-sql-driver/mysql v1.9.3 h1:U/N249h2WzJ3Ukj8SowVFjdtZKfu9vlLZxjPXV1aweo= +github.com/go-sql-driver/mysql v1.9.3/go.mod h1:qn46aNg1333BRMNU69Lq93t8du/dwxI64Gl8i5p1WMU= github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= github.com/goccy/go-json v0.10.5 h1:Fq85nIqj+gXn/S5ahsiTlK3TmC85qgirsdTP/+DeaC4= github.com/goccy/go-json v0.10.5/go.mod h1:oq7eo15ShAhp70Anwd5lgX2pLfOS3QCiwU/PULtXL6M= -github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/goji/httpauth v0.0.0-20160601135302-2da839ab0f4d/go.mod h1:nnjvkQ9ptGaCkuDUx6wNykzzlUixGxvkme+H/lnzb+A= -github.com/golang-jwt/jwt/v4 v4.5.2 h1:YtQM7lnr8iZ+j5q71MGKkNw9Mn7AjHM68uc9g5fXeUI= -github.com/golang-jwt/jwt/v4 v4.5.2/go.mod h1:m21LjoU+eqJr34lmDMbreY2eSTRJ1cv77w39/MY0Ch0= +github.com/golang-jwt/jwt/v5 v5.2.2 h1:Rl4B7itRWVtYIHFrSNd7vhTiz9UpLdi6gZhZ3wEeDy8= +github.com/golang-jwt/jwt/v5 v5.2.2/go.mod h1:pqrtFR0X4osieyHYxtmOUWsAWrfe1Q5UVIyoH402zdk= github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg= github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= -github.com/google/cel-go v0.23.2 h1:UdEe3CvQh3Nv+E/j9r1Y//WO0K0cSyD7/y0bzyLIMI4= -github.com/google/cel-go v0.23.2/go.mod h1:52Pb6QsDbC5kvgxvZhiL9QX1oZEkcUF/ZqaPx1J5Wwo= -github.com/google/gnostic-models v0.6.9 h1:MU/8wDLif2qCXZmzncUQ/BOfxWfthHi63KqpoNbWqVw= -github.com/google/gnostic-models v0.6.9/go.mod h1:CiWsm0s6BSQd1hRn8/QmxqB6BesYcbSZxsz9b0KuDBw= +github.com/google/cel-go v0.26.1 h1:iPbVVEdkhTX++hpe3lzSk7D3G3QSYqLGoHOcEio+UXQ= +github.com/google/cel-go v0.26.1/go.mod h1:A9O8OU9rdvrK5MQyrqfIxo1a0u4g3sF8KB6PUIaryMM= +github.com/google/gnostic-models v0.7.0 h1:qwTtogB15McXDaNqTZdzPJRHvaVJlAl+HVQnLmJEJxo= +github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7OUGxBlw57miDrQ= github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.5.9/go.mod 
h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J0b1vyeLSOYI8bm5wbJM/8yDe8= -github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= +github.com/google/pprof v0.0.0-20250903194437-c28834ac2320 h1:c7ayAhbRP9HnEl/hg/WQOM9s0snWztfW6feWXZbGHw0= +github.com/google/pprof v0.0.0-20250903194437-c28834ac2320/go.mod h1:I6V7YzU0XDpsHqbsyrghnFZLO1gwK6NPTNvmetQIk9U= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 h1:JeSE6pjso5THxAzdVpqr6/geYxZytqFMBCOtn/ujyeo= github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674/go.mod h1:r4w70xmWCQKmi1ONH4KIaBptdivuRPyosB9RmPlGEwA= -github.com/grpc-ecosystem/go-grpc-middleware v1.3.0 h1:+9834+KizmvFV7pXQGSXQTsaWhq2GjuNUt0aUU0YBYw= -github.com/grpc-ecosystem/go-grpc-middleware v1.3.0/go.mod h1:z0ButlSOZa5vEBq9m2m2hlwIgKw+rp3sdCBRoJY+30Y= +github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus v1.0.1 h1:qnpSQwGEnkcRpTqNOIR6bJbR0gAorgP9CSALpRcKoAA= +github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus v1.0.1/go.mod h1:lXGCsh6c22WGtjr+qGHj1otzZpV/1kwTMAqkwZsnWRU= +github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.3.0 h1:FbSCl+KggFl+Ocym490i/EyXF4lPgLoUtcSWquBM0Rs= +github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.3.0/go.mod h1:qOchhhIlmRcqk/O9uCo/puJlyo07YINaIqdZfZG3Jkc= github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 h1:Ovs26xHkKqVztRpIrF/92BcuyuQ/YW4NSIpoGtfXNho= github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0/go.mod h1:8NvIoxWQoOIhqOTXgfV/d3M/q6VIi02HzZEHgUlZvzk= -github.com/grpc-ecosystem/grpc-gateway v1.16.0 h1:gmcG1KaJ57LophUzW0Hy8NmPhnMZb4M0+kPpLofRdBo= -github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.25.1 h1:VNqngBF40hVlDloBruUehVYC3ArSgIyScOAyMRqBxRg= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.25.1/go.mod h1:RBRO7fro65R6tjKzYgLAFo0t1QEXY1Dp+i/bvpRiqiQ= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 h1:8Tjv8EJ+pM1xP8mK6egEbD1OgnVTyacbefKhmbLhIhU= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2/go.mod h1:pkJQ2tZHJ0aFOVEEot6oZmaVEZcRme73eIFmhiVuRWs= github.com/imdario/mergo v0.3.16 h1:wwQJbIsHYGMUyLSPrEq1CT16AhnhNJQ51+4fdHUnCl4= github.com/imdario/mergo v0.3.16/go.mod h1:WBLT9ZmE3lPoWsEzCh9LPo3TiwVN+ZKEjmz+hD27ysY= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= @@ -182,24 +205,20 @@ github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9Y github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo= github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8= github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U= -github.com/jonboulle/clockwork v0.4.0 
h1:p4Cf1aMWXnXAUh8lVfewRBx1zaTSYKrKMF2g3ST4RZ4= -github.com/jonboulle/clockwork v0.4.0/go.mod h1:xgRqUGwRcjKCO1vbZUEtSLrqKoPSsUpK7fnezOII0kc= +github.com/jonboulle/clockwork v0.5.0 h1:Hyh9A8u51kptdkR+cqRpT1EebBwTn1oK9YfGYbdFz6I= +github.com/jonboulle/clockwork v0.5.0/go.mod h1:3mZlmanh0g2NDKO5TWZVJAfofYk64M7XN3SzBPjZF60= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/jung-kurt/gofpdf v1.0.3-0.20190309125859-24315acbbda5/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes= -github.com/k8stopologyawareschedwg/noderesourcetopology-api v0.1.2 h1:uAwqOtyrFYggq3pVf3hs1XKkBxrQ8dkgjWz3LCLJsiY= -github.com/k8stopologyawareschedwg/noderesourcetopology-api v0.1.2/go.mod h1:LBzS4n6GX1C69tzSd5EibZ9cGOXFuHP7GxEMDYVe1sM= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/kisielk/sqlstruct v0.0.0-20201105191214-5f3e10d3ab46/go.mod h1:yyMNCyc/Ib3bDTKd379tNMpB/7/H5TjM2Y9QJ5THLbE= github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= -github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= -github.com/klauspost/cpuid/v2 v2.2.10 h1:tBs3QSyvjDyFTq3uoc/9xFpCuOsJQFNPiAhYdw2skhE= -github.com/klauspost/cpuid/v2 v2.2.10/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0= -github.com/knz/go-libedit v1.10.1/go.mod h1:MZTVkCWyz0oBc7JOWP3wNAzd002ZbM/5hgShxwh4x8M= +github.com/klauspost/cpuid/v2 v2.3.0 h1:S4CRMLnYUhGeDFDqkGriYKdfoFlDnMtqTiI/sFzhA9Y= +github.com/klauspost/cpuid/v2 v2.3.0/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0= github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= @@ -221,28 +240,29 @@ github.com/mitchellh/hashstructure/v2 v2.0.2 h1:vGKWl0YJqUNxE8d+h8f6NJLcCJrgbhC4 github.com/mitchellh/hashstructure/v2 v2.0.2/go.mod h1:MG3aRVU/N29oo/V/IhBX8GR/zz4kQkprJgF2EVszyDE= github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= -github.com/moby/term v0.5.0 h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0= -github.com/moby/term v0.5.0/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y= +github.com/moby/term v0.5.2 h1:6qk3FJAFDs6i/q3W/pQ97SX192qKfZgGjCQqfCJkgzQ= +github.com/moby/term v0.5.2/go.mod h1:d3djjFCrjnB+fl8NJux+EJzu0msscUP+f8it8hPkFLc= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= -github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= 
+github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFdJifH4BDsTlE89Zl93FEloxaWZfGcifgq8= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= -github.com/onsi/ginkgo/v2 v2.23.4 h1:ktYTpKJAVZnDT4VjxSbiBenUjmlL/5QkBEocaWXiQus= -github.com/onsi/ginkgo/v2 v2.23.4/go.mod h1:Bt66ApGPBFzHyR+JO10Zbt0Gsp4uWxu5mIOTusL46e8= -github.com/onsi/gomega v1.38.0 h1:c/WX+w8SLAinvuKKQFh77WEucCnPk4j2OTUr7lt7BeY= -github.com/onsi/gomega v1.38.0/go.mod h1:OcXcwId0b9QsE7Y49u+BTrL4IdKOBOKnD6VQNTJEB6o= +github.com/onsi/ginkgo/v2 v2.25.3 h1:Ty8+Yi/ayDAGtk4XxmmfUy4GabvM+MegeB4cDLRi6nw= +github.com/onsi/ginkgo/v2 v2.25.3/go.mod h1:43uiyQC4Ed2tkOzLsEYm7hnrb7UJTWHYNsuy3bG/snE= +github.com/onsi/gomega v1.38.2 h1:eZCjf2xjZAqe+LeWvKb5weQ+NcPwX84kqJ0cZNxok2A= +github.com/onsi/gomega v1.38.2/go.mod h1:W2MJcYxRGV63b418Ai34Ud0hEdTVXq9NW9+Sx6uXf3k= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= github.com/opentracing/opentracing-go v1.2.1-0.20220228012449-10b1cf09e00b h1:FfH+VrHHk6Lxt9HdVS0PXzSXFyS2NbZKXv33FYPol0A= github.com/opentracing/opentracing-go v1.2.1-0.20220228012449-10b1cf09e00b/go.mod h1:AC62GU6hc0BrNm+9RK9VSiwa/EUe1bkIeFORAMcHvJU= github.com/patrickmn/go-cache v2.1.0+incompatible h1:HRMgzkcYKYpi3C8ajMPV8OFXaaRUnok+kx1WdO15EQc= github.com/patrickmn/go-cache v2.1.0+incompatible/go.mod h1:3Qf8kWWT7OJRJbdiICTKqZju1ZixQ/KpMGzzAfe6+WQ= -github.com/pelletier/go-toml/v2 v2.2.3 h1:YmeHyLY8mFWbdkNWwpr+qIL2bEqT0o95WSdkNHvL12M= -github.com/pelletier/go-toml/v2 v2.2.3/go.mod h1:MfCQTFTvCcUyyvvwm1+G6H/jORL20Xlb6rzQu9GuUkc= +github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4= +github.com/pelletier/go-toml/v2 v2.2.4/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8t9Stt+h56mCY= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= @@ -250,14 +270,14 @@ github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRI github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g= github.com/prashantv/gostub v1.1.0/go.mod h1:A5zLQHz7ieHGG7is6LLXLz7I8+3LZzsrV0P1IAHhP5U= -github.com/prometheus/client_golang v1.22.0 h1:rb93p9lokFEsctTys46VnV1kLCDpVZ0a/Y92Vm0Zc6Q= -github.com/prometheus/client_golang v1.22.0/go.mod h1:R7ljNsLXhuQXYZYtw6GAE9AZg8Y7vEW5scdCXrWRXC0= +github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= +github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= -github.com/prometheus/common v0.62.0 
h1:xasJaQlnWAeyHdUBeGjXmutelfJHWMRr+Fg4QszZ2Io= -github.com/prometheus/common v0.62.0/go.mod h1:vyBcEuLSvWos9B1+CyL7JZ2up+uFzXhkqml0W5zIY1I= -github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= -github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= +github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs= +github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA= +github.com/prometheus/procfs v0.17.0 h1:FuLQ+05u4ZI+SS/w9+BWEM2TXiHKsUQ9TADiRH7DuK0= +github.com/prometheus/procfs v0.17.0/go.mod h1:oPQLaDAMRbA+u8H5Pbfq+dl3VDAvHxMUOVhe0wYB2zw= github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs= github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro= github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= @@ -271,13 +291,13 @@ github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= github.com/soheilhy/cmux v0.1.5 h1:jjzc5WVemNEDTLwv9tlmemhC73tI08BNOIGwBOo10Js= github.com/soheilhy/cmux v0.1.5/go.mod h1:T7TcVDs9LWfQgPlPsdngu6I6QIoyIFZDDC6sNE1GqG0= -github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= -github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= -github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= -github.com/spf13/pflag v1.0.6 h1:jFzHGLGAlb3ruxLB8MhbI6A8+AQX/2eW4qeyNZXNp2o= -github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= -github.com/stoewer/go-strcase v1.3.0 h1:g0eASXYtp+yvN9fK8sH94oCIk0fau9uV1/ZdJ0AVEzs= -github.com/stoewer/go-strcase v1.3.0/go.mod h1:fAH5hQ5pehh+j3nZfvwdk2RgEgQjAoM8wodgtPmh1xo= +github.com/spf13/cobra v1.10.1 h1:lJeBwCfmrnXthfAupyUTzJ/J4Nc1RsHC/mSRU2dll/s= +github.com/spf13/cobra v1.10.1/go.mod h1:7SmJGaTHFVBY0jW4NXGluQoLvhqFQM+6XSKD+P4XaB0= +github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/pflag v1.0.10 h1:4EBh2KAYBwaONj6b2Ye1GiHfwjqyROoF4RwYO+vPwFk= +github.com/spf13/pflag v1.0.10/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/stoewer/go-strcase v1.3.1 h1:iS0MdW+kVTxgMoE1LAZyMiYJFKlOzLooE4MxjirtkAs= +github.com/stoewer/go-strcase v1.3.1/go.mod h1:fAH5hQ5pehh+j3nZfvwdk2RgEgQjAoM8wodgtPmh1xo= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= @@ -288,10 +308,8 @@ github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= -github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= -github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= -github.com/stretchr/testify v1.11.0 h1:ib4sjIrwZKxE5u/Japgo/7SJV3PvgjGiRNAvTVGqQl8= -github.com/stretchr/testify v1.11.0/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/stretchr/testify v1.11.1 
h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/tmc/grpc-websocket-proxy v0.0.0-20220101234140-673ab2c3ae75 h1:6fotK7otjonDflCTK0BCfls4SPy3NcCVb5dqqmbRknE= github.com/tmc/grpc-websocket-proxy v0.0.0-20220101234140-673ab2c3ae75/go.mod h1:KO6IkyS8Y3j8OdNO85qEYBsRPuteD+YciPomcXdrMnk= github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS4MhqMhdFk5YI= @@ -300,8 +318,8 @@ github.com/uber/jaeger-client-go v2.30.0+incompatible h1:D6wyKGCecFaSRUpo8lCVbaO github.com/uber/jaeger-client-go v2.30.0+incompatible/go.mod h1:WVhlPFC8FDjOFMMWRy2pZqQJSXxYSwNYOkTr/Z6d3Kk= github.com/uber/jaeger-lib v2.4.1+incompatible h1:td4jdvLcExb4cBISKIpHuGoVXh+dVKhn2Um6rjCsSsg= github.com/uber/jaeger-lib v2.4.1+incompatible/go.mod h1:ComeNDZlWwrWnDv8aPp0Ba6+uUTzImX/AauajbLI56U= -github.com/ugorji/go/codec v1.2.12 h1:9LC83zGrHhuUA9l16C9AHXAqEV/2wBQ4nkvumAE65EE= -github.com/ugorji/go/codec v1.2.12/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg= +github.com/ugorji/go/codec v1.3.0 h1:Qd2W2sQawAfG8XSvzwhBeoGq71zXOC/Q1E9y/wUcsUA= +github.com/ugorji/go/codec v1.3.0/go.mod h1:pRBVtBSKl77K30Bv8R2P+cLSGaTtex6fsA2Wjqmfxj4= github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= github.com/xiang90/probing v0.0.0-20221125231312-a49e3df8f510 h1:S2dVYn90KE98chqDkyE9Z4N61UnQd+KOfgp5Iu53llk= @@ -310,44 +328,42 @@ github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9de github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0= github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0= -go.etcd.io/bbolt v1.3.11 h1:yGEzV1wPz2yVCLsD8ZAiGHhHVlczyC9d1rP43/VCRJ0= -go.etcd.io/bbolt v1.3.11/go.mod h1:dksAq7YMXoljX0xu6VF5DMZGbhYYoLUalEiSySYAS4I= -go.etcd.io/etcd/api/v3 v3.5.21 h1:A6O2/JDb3tvHhiIz3xf9nJ7REHvtEFJJ3veW3FbCnS8= -go.etcd.io/etcd/api/v3 v3.5.21/go.mod h1:c3aH5wcvXv/9dqIw2Y810LDXJfhSYdHQ0vxmP3CCHVY= -go.etcd.io/etcd/client/pkg/v3 v3.5.21 h1:lPBu71Y7osQmzlflM9OfeIV2JlmpBjqBNlLtcoBqUTc= -go.etcd.io/etcd/client/pkg/v3 v3.5.21/go.mod h1:BgqT/IXPjK9NkeSDjbzwsHySX3yIle2+ndz28nVsjUs= -go.etcd.io/etcd/client/v2 v2.305.21 h1:eLiFfexc2mE+pTLz9WwnoEsX5JTTpLCYVivKkmVXIRA= -go.etcd.io/etcd/client/v2 v2.305.21/go.mod h1:OKkn4hlYNf43hpjEM3Ke3aRdUkhSl8xjKjSf8eCq2J8= -go.etcd.io/etcd/client/v3 v3.5.21 h1:T6b1Ow6fNjOLOtM0xSoKNQt1ASPCLWrF9XMHcH9pEyY= -go.etcd.io/etcd/client/v3 v3.5.21/go.mod h1:mFYy67IOqmbRf/kRUvsHixzo3iG+1OF2W2+jVIQRAnU= -go.etcd.io/etcd/pkg/v3 v3.5.21 h1:jUItxeKyrDuVuWhdh0HtjUANwyuzcb7/FAeUfABmQsk= -go.etcd.io/etcd/pkg/v3 v3.5.21/go.mod h1:wpZx8Egv1g4y+N7JAsqi2zoUiBIUWznLjqJbylDjWgU= -go.etcd.io/etcd/raft/v3 v3.5.21 h1:dOmE0mT55dIUsX77TKBLq+RgyumsQuYeiRQnW/ylugk= -go.etcd.io/etcd/raft/v3 v3.5.21/go.mod h1:fmcuY5R2SNkklU4+fKVBQi2biVp5vafMrWUEj4TJ4Cs= -go.etcd.io/etcd/server/v3 v3.5.21 h1:9w0/k12majtgarGmlMVuhwXRI2ob3/d1Ik3X5TKo0yU= -go.etcd.io/etcd/server/v3 v3.5.21/go.mod h1:G1mOzdwuzKT1VRL7SqRchli/qcFrtLBTAQ4lV20sXXo= +go.etcd.io/bbolt v1.4.2 h1:IrUHp260R8c+zYx/Tm8QZr04CX+qWS5PGfPdevhdm1I= +go.etcd.io/bbolt v1.4.2/go.mod h1:Is8rSHO/b4f3XigBC0lL0+4FwAQv3HXEEIgFMuKHceM= +go.etcd.io/etcd/api/v3 v3.6.4 h1:7F6N7toCKcV72QmoUKa23yYLiiljMrT4xCeBL9BmXdo= +go.etcd.io/etcd/api/v3 v3.6.4/go.mod h1:eFhhvfR8Px1P6SEuLT600v+vrhdDTdcfMzmnxVXXSbk= 
+go.etcd.io/etcd/client/pkg/v3 v3.6.4 h1:9HBYrjppeOfFjBjaMTRxT3R7xT0GLK8EJMVC4xg6ok0= +go.etcd.io/etcd/client/pkg/v3 v3.6.4/go.mod h1:sbdzr2cl3HzVmxNw//PH7aLGVtY4QySjQFuaCgcRFAI= +go.etcd.io/etcd/client/v3 v3.6.4 h1:YOMrCfMhRzY8NgtzUsHl8hC2EBSnuqbR3dh84Uryl7A= +go.etcd.io/etcd/client/v3 v3.6.4/go.mod h1:jaNNHCyg2FdALyKWnd7hxZXZxZANb0+KGY+YQaEMISo= +go.etcd.io/etcd/pkg/v3 v3.6.4 h1:fy8bmXIec1Q35/jRZ0KOes8vuFxbvdN0aAFqmEfJZWA= +go.etcd.io/etcd/pkg/v3 v3.6.4/go.mod h1:kKcYWP8gHuBRcteyv6MXWSN0+bVMnfgqiHueIZnKMtE= +go.etcd.io/etcd/server/v3 v3.6.4 h1:LsCA7CzjVt+8WGrdsnh6RhC0XqCsLkBly3ve5rTxMAU= +go.etcd.io/etcd/server/v3 v3.6.4/go.mod h1:aYCL/h43yiONOv0QIR82kH/2xZ7m+IWYjzRmyQfnCAg= +go.etcd.io/raft/v3 v3.6.0 h1:5NtvbDVYpnfZWcIHgGRk9DyzkBIXOi8j+DDp1IcnUWQ= +go.etcd.io/raft/v3 v3.6.0/go.mod h1:nLvLevg6+xrVtHUmVaTcTz603gQPHfh7kUAwV6YpfGo= go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= -go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.58.0 h1:PS8wXpbyaDJQ2VDHHncMe9Vct0Zn1fEjpsjrLxGJoSc= -go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.58.0/go.mod h1:HDBUsEjOuRC0EzKZ1bSaRGZWUBAzo+MhAcUUORSr4D0= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 h1:yd02MEjBdJkG3uabWP9apV+OuWRIXGDuJEUJbOHmCFU= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0/go.mod h1:umTcuxiv1n/s/S6/c2AT/g2CQ7u5C59sHDNmfSwgz7Q= -go.opentelemetry.io/otel v1.37.0 h1:9zhNfelUvx0KBfu/gb+ZgeAfAgtWrfHJZcAqFC228wQ= -go.opentelemetry.io/otel v1.37.0/go.mod h1:ehE/umFRLnuLa/vSccNq9oS1ErUlkkK71gMcN34UG8I= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0 h1:Vh5HayB/0HHfOQA7Ctx69E/Y/DcQSMPpKANYVMQ7fBA= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0/go.mod h1:cpgtDBaqD/6ok/UG0jT15/uKjAY8mRA53diogHBg3UI= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.33.0 h1:5pojmb1U1AogINhN3SurB+zm/nIcusopeBNp42f45QM= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.33.0/go.mod h1:57gTHJSE5S1tqg+EKsLPlTWhpHMsWlVmer+LA926XiA= -go.opentelemetry.io/otel/metric v1.37.0 h1:mvwbQS5m0tbmqML4NqK+e3aDiO02vsf/WgbsdpcPoZE= -go.opentelemetry.io/otel/metric v1.37.0/go.mod h1:04wGrZurHYKOc+RKeye86GwKiTb9FKm1WHtO+4EVr2E= -go.opentelemetry.io/otel/sdk v1.33.0 h1:iax7M131HuAm9QkZotNHEfstof92xM+N8sr3uHXc2IM= -go.opentelemetry.io/otel/sdk v1.33.0/go.mod h1:A1Q5oi7/9XaMlIWzPSxLRWOI8nG3FnzHJNbiENQuihM= -go.opentelemetry.io/otel/sdk/metric v1.31.0 h1:i9hxxLJF/9kkvfHppyLL55aW7iIJz4JjxTeYusH7zMc= -go.opentelemetry.io/otel/sdk/metric v1.31.0/go.mod h1:CRInTMVvNhUKgSAMbKyTMxqOBC0zgyxzW55lZzX43Y8= -go.opentelemetry.io/otel/trace v1.37.0 h1:HLdcFNbRQBE2imdSEgm/kwqmQj1Or1l/7bW6mxVK7z4= -go.opentelemetry.io/otel/trace v1.37.0/go.mod h1:TlgrlQ+PtQO5XFerSPUYG0JSgGyryXewPGyayAWSBS0= -go.opentelemetry.io/proto/otlp v1.4.0 h1:TA9WRvW6zMwP+Ssb6fLoUIuirti1gGbP28GcKG1jgeg= -go.opentelemetry.io/proto/otlp v1.4.0/go.mod h1:PPBWZIP98o2ElSqI35IHfu7hIhSwvc5N38Jw8pXuGFY= +go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.63.0 h1:YH4g8lQroajqUwWbq/tr2QX1JFmEXaDLgG+ew9bLMWo= +go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.63.0/go.mod h1:fvPi2qXDqFs8M4B4fmJhE92TyQs9Ydjlg3RvfUp+NbQ= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 h1:RbKq8BG0FI8OiXhBfcRtqqHcZcka+gU3cskNuf05R18= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp 
v0.63.0/go.mod h1:h06DGIukJOevXaj/xrNjhi/2098RZzcLTbc0jDAUbsg= +go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8= +go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 h1:GqRJVj7UmLjCVyVJ3ZFLdPRmhDUp2zFmQe3RHIOsw24= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0/go.mod h1:ri3aaHSmCTVYu2AWv44YMauwAQc0aqI9gHKIcSbI1pU= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 h1:lwI4Dc5leUqENgGuQImwLo4WnuXFPetmPpkLi2IrX54= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0/go.mod h1:Kz/oCE7z5wuyhPxsXDuaPteSWqjSBD5YaSdbxZYGbGk= +go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA= +go.opentelemetry.io/otel/metric v1.38.0/go.mod h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI= +go.opentelemetry.io/otel/sdk v1.38.0 h1:l48sr5YbNf2hpCUj/FoGhW9yDkl+Ma+LrVl8qaM5b+E= +go.opentelemetry.io/otel/sdk v1.38.0/go.mod h1:ghmNdGlVemJI3+ZB5iDEuk4bWA3GkTpW+DOoZMYBVVg= +go.opentelemetry.io/otel/sdk/metric v1.38.0 h1:aSH66iL0aZqo//xXzQLYozmWrXxyFkBJ6qT5wthqPoM= +go.opentelemetry.io/otel/sdk/metric v1.38.0/go.mod h1:dg9PBnW9XdQ1Hd6ZnRz689CbtrUp0wMMs9iPcgT9EZA= +go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE= +go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs= +go.opentelemetry.io/proto/otlp v1.8.0 h1:fRAZQDcAFHySxpJ1TwlA1cJ4tvcrw7nXl9xWWC8N5CE= +go.opentelemetry.io/proto/otlp v1.8.0/go.mod h1:tIeYOeNBU4cvmPqpaji1P+KbB4Oloai8wN4rWzRrFF0= go.uber.org/atomic v1.9.0 h1:ECmE8Bn/WFTYwEW/bpKD3M8VtR/zQVbavAoalC1PYyE= go.uber.org/atomic v1.9.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= go.uber.org/automaxprocs v1.6.0 h1:O3y2/QNTOdbF+e/dpXNNW7Rx2hZ4sTIPyybbxyNqTUs= @@ -360,23 +376,23 @@ go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8= go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= -go.yaml.in/yaml/v3 v3.0.3 h1:bXOww4E/J3f66rav3pX3m8w6jDE4knZjGOw8b5Y6iNE= -go.yaml.in/yaml/v3 v3.0.3/go.mod h1:tBHosrYAkRZjRAOREWbDnBXUf08JOwYq++0QNwQiWzI= -golang.org/x/arch v0.15.0 h1:QtOrQd0bTUnhNVNndMpLHNWrDmYzZ2KDqSrEymqInZw= -golang.org/x/arch v0.15.0/go.mod h1:JmwW7aLIoRUKgaTzhkiEFxvcEiQGyOg9BMonBJUS7EE= +go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= +go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= +golang.org/x/arch v0.21.0 h1:iTC9o7+wP6cPWpDWkivCvQFGAHDQ59SrSxsLPcnkArw= +golang.org/x/arch v0.21.0/go.mod h1:dNHoOeKiyja7GTvF9NJS1l3Z2yntpQNzgrjh1cU103A= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.39.0 h1:SHs+kF4LP+f+p14esP5jAoDpHU8Gu/v9lFRK6IT5imM= -golang.org/x/crypto v0.39.0/go.mod h1:L+Xg3Wf6HoL4Bn4238Z6ft6KfEpN0tJGo53AAPC632U= +golang.org/x/crypto v0.41.0 h1:WKYxWedPGCTVVl5+WHSSrOBT0O8lx32+zxmHxijgXp4= +golang.org/x/crypto v0.41.0/go.mod 
h1:pO5AFd7FA68rFak7rOAGVuygIISepHftHnr8dr6+sUc= golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20191030013958-a1ab85dbe136/go.mod h1:JXzH8nQsPlswgeRAPE3MuO9GYsAcnJvJ4vnMwN/5qkY= -golang.org/x/exp v0.0.0-20250506013437-ce4c2cf36ca6 h1:y5zboxd6LQAqYIhHnB48p0ByQ/GnQx2BE33L8BOHQkI= -golang.org/x/exp v0.0.0-20250506013437-ce4c2cf36ca6/go.mod h1:U6Lno4MTRCDY+Ba7aCcauB9T60gsv5s4ralQzP72ZoQ= +golang.org/x/exp v0.0.0-20250819193227-8b4c13bb791b h1:DXr+pvt3nC887026GRP39Ej11UATqWDmWuS99x26cD0= +golang.org/x/exp v0.0.0-20250819193227-8b4c13bb791b/go.mod h1:4QTo5u+SEIbbKW1RacMZq1YEfOBqeXa19JeshGi+zc4= golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs= golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= @@ -388,32 +404,33 @@ golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.41.0 h1:vBTly1HeNPEn3wtREYfy4GZ/NECgw2Cnl+nK6Nz3uvw= -golang.org/x/net v0.41.0/go.mod h1:B/K4NNqkfmg07DQYrbwvSluqCJOOXwUjeb/5lOisjbA= -golang.org/x/oauth2 v0.27.0 h1:da9Vo7/tDv5RH/7nZDz1eMGS/q1Vv1N/7FCrBhI9I3M= -golang.org/x/oauth2 v0.27.0/go.mod h1:onh5ek6nERTohokkhCD/y2cV4Do3fxFHFuAejCkRWT8= +golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= +golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= +golang.org/x/oauth2 v0.31.0 h1:8Fq0yVZLh4j4YA47vHKFTa9Ew5XIrCP8LC6UeNZnLxo= +golang.org/x/oauth2 v0.31.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.15.0 h1:KWH3jNZsfyT6xfAfKiz6MRNmd46ByHDYaZ7KSkCtdW8= -golang.org/x/sync v0.15.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sync v0.17.0 h1:l60nONMj9l5drqw6jlhIELNv9I0A4OFgRsG9k2oT9Ug= +golang.org/x/sync v0.17.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod 
h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210616094352-59db8d763f22/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= -golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= -golang.org/x/term v0.32.0 h1:DR4lr0TjUs3epypdhTOkMmuF5CDFJ/8pOnbzMZPQ7bg= -golang.org/x/term v0.32.0/go.mod h1:uZG1FhGx848Sqfsq4/DlJr3xGGsYMu/L5GW4abiaEPQ= +golang.org/x/sys v0.36.0 h1:KVRy2GtZBrk1cBYA7MKu5bEZFxQk4NIDV6RLVcC8o0k= +golang.org/x/sys v0.36.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/term v0.35.0 h1:bZBVKBudEyhRcajGcNc3jIfWPqV4y/Kt2XcoigOWtDQ= +golang.org/x/term v0.35.0/go.mod h1:TPGtkTLesOwf2DE8CgVYiZinHAOuy5AYUYT1lENIZnA= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.26.0 h1:P42AVeLghgTYr4+xUnTRKDMqpar+PtX7KWuNQL21L8M= -golang.org/x/text v0.26.0/go.mod h1:QK15LZJUUQVJxhz7wXgxSy/CJaTFjd0G+YLonydOVQA= -golang.org/x/time v0.12.0 h1:ScB/8o8olJvc+CQPWrK3fPZNfh7qgwCrY0zJmoEQLSE= -golang.org/x/time v0.12.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= +golang.org/x/text v0.29.0 h1:1neNs90w9YzJ9BocxfsQNHKuAT4pkghyXc4nhZ6sJvk= +golang.org/x/text v0.29.0/go.mod h1:7MhJOA9CD2qZyOKYazxdYMF85OwPdEr9jTtBpO7ydH4= +golang.org/x/time v0.13.0 h1:eUlYslOIt32DgYD6utsuUeHs4d7AsEYLuIAdg7FlYgI= +golang.org/x/time v0.13.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4= golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190206041539-40960b6deb8e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= @@ -421,8 +438,8 @@ golang.org/x/tools v0.0.0-20191012152004-8de300cfc20a/go.mod h1:b+2E5dAYhXwXZwtn golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.33.0 h1:4qz2S3zmRxbGIhDIAgjxvFutSvH5EfnsYrRBj0UI0bc= -golang.org/x/tools v0.33.0/go.mod h1:CIJMaWEY88juyUfo7UbgPqbC8rU2OqfAV1h2Qp0oMYI= +golang.org/x/tools v0.36.0 h1:kWS0uv/zsvHEle1LbV5LE8QujrxB3wfQyxHfhOk0Qkg= +golang.org/x/tools v0.36.0/go.mod h1:WBDiHKJK8YgLHlcQPYQzNCkUxUypCaa5ZegCVutKm+s= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -431,24 +448,24 @@ gomodules.xyz/jsonpatch/v2 v2.5.0 h1:JELs8RLM12qJGXU4u/TO3V25KW8GreMKl9pdkk14RM0 gomodules.xyz/jsonpatch/v2 v2.5.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= gonum.org/v1/gonum v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo= gonum.org/v1/gonum v0.8.2/go.mod h1:oe/vMfY3deqTw+1EZJhuvEW2iwGF1bW9wwu7XCu0+v0= +gonum.org/v1/gonum v0.16.0 
h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= +gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw= gonum.org/v1/plot v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc= -google.golang.org/genproto v0.0.0-20240123012728-ef4313101c80 h1:KAeGQVN3M9nD0/bQXnr/ClcEMJ968gUXJQ9pwfSynuQ= -google.golang.org/genproto v0.0.0-20240123012728-ef4313101c80/go.mod h1:cc8bqMqtv9gMOr0zHg2Vzff5ULhhL2IXP4sbcn32Dro= -google.golang.org/genproto/googleapis/api v0.0.0-20241223144023-3abc09e42ca8 h1:st3LcW/BPi75W4q1jJTEor/QWwbNlPlDG0JTn6XhZu0= -google.golang.org/genproto/googleapis/api v0.0.0-20241223144023-3abc09e42ca8/go.mod h1:klhJGKFyG8Tn50enBn7gizg4nXGXJ+jqEREdCWaPcV4= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250102185135-69823020774d h1:xJJRGY7TJcvIlpSrN3K6LAWgNFUILlO+OMAqtg9aqnw= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250102185135-69823020774d/go.mod h1:3ENsm/5D1mzDyhpzeRi1NR784I0BcofWBoSc5QqqMK4= -google.golang.org/grpc v1.69.4 h1:MF5TftSMkd8GLw/m0KM6V8CMOCY6NZ1NQDPGFgbTt4A= -google.golang.org/grpc v1.69.4/go.mod h1:vyjdE6jLBI76dgpDojsFGNaHlxdjXN9ghpnd2o7JGZ4= -google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= -google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= +google.golang.org/genproto/googleapis/api v0.0.0-20250826171959-ef028d996bc1 h1:APHvLLYBhtZvsbnpkfknDZ7NyH4z5+ub/I0u8L3Oz6g= +google.golang.org/genproto/googleapis/api v0.0.0-20250826171959-ef028d996bc1/go.mod h1:xUjFWUnWDpZ/C0Gu0qloASKFb6f8/QXiiXhSPFsD668= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250826171959-ef028d996bc1 h1:pmJpJEvT846VzausCQ5d7KreSROcDqmO388w5YbnltA= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250826171959-ef028d996bc1/go.mod h1:GmFNa4BdJZ2a8G+wCe9Bg3wwThLrJun751XstdJt5Og= +google.golang.org/grpc v1.75.0 h1:+TW+dqTd2Biwe6KKfhE5JpiYIBWq865PhKGSXiivqt4= +google.golang.org/grpc v1.75.0/go.mod h1:JtPAzKiq4v1xcAB2hydNlWI2RnF85XXcV0mhKXr2ecQ= +google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc= +google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= -gopkg.in/evanphx/json-patch.v4 v4.12.0 h1:n6jtcsulIzXPJaxegRbvFNNrZDjbij7ny3gmSPG+6V4= -gopkg.in/evanphx/json-patch.v4 v4.12.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= +gopkg.in/evanphx/json-patch.v4 v4.13.0 h1:czT3CmqEaQ1aanPc5SdlgQrrEIb8w/wwCvWWnfEbYzo= +gopkg.in/evanphx/json-patch.v4 v4.13.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA= @@ -465,61 +482,56 @@ gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gorm.io/driver/mysql v1.6.0 
h1:eNbLmNTpPpTOVZi8MMxCi2aaIm0ZpInbORNXDwyLGvg= gorm.io/driver/mysql v1.6.0/go.mod h1:D/oCC2GWK3M/dqoLxnOlaNKmXz8WNTfcS9y5ovaSqKo= -gorm.io/gorm v1.30.1 h1:lSHg33jJTBxs2mgJRfRZeLDG+WZaHYCk3Wtfl6Ngzo4= -gorm.io/gorm v1.30.1/go.mod h1:8Z33v652h4//uMA76KjeDH8mJXPm1QNCYrMeatR0DOE= -k8s.io/api v0.33.3 h1:SRd5t//hhkI1buzxb288fy2xvjubstenEKL9K51KBI8= -k8s.io/api v0.33.3/go.mod h1:01Y/iLUjNBM3TAvypct7DIj0M0NIZc+PzAHCIo0CYGE= -k8s.io/apiextensions-apiserver v0.33.2 h1:6gnkIbngnaUflR3XwE1mCefN3YS8yTD631JXQhsU6M8= -k8s.io/apiextensions-apiserver v0.33.2/go.mod h1:IvVanieYsEHJImTKXGP6XCOjTwv2LUMos0YWc9O+QP8= -k8s.io/apimachinery v0.33.3 h1:4ZSrmNa0c/ZpZJhAgRdcsFcZOw1PQU1bALVQ0B3I5LA= -k8s.io/apimachinery v0.33.3/go.mod h1:BHW0YOu7n22fFv/JkYOEfkUYNRN0fj0BlvMFWA7b+SM= -k8s.io/apiserver v0.33.2 h1:KGTRbxn2wJagJowo29kKBp4TchpO1DRO3g+dB/KOJN4= -k8s.io/apiserver v0.33.2/go.mod h1:9qday04wEAMLPWWo9AwqCZSiIn3OYSZacDyu/AcoM/M= -k8s.io/client-go v0.33.3 h1:M5AfDnKfYmVJif92ngN532gFqakcGi6RvaOF16efrpA= -k8s.io/client-go v0.33.3/go.mod h1:luqKBQggEf3shbxHY4uVENAxrDISLOarxpTKMiUuujg= -k8s.io/cloud-provider v0.33.2 h1:tP/18SbhytAapqg2/tGD5PFUR6VLYra+QfJ7Qn3FN34= -k8s.io/cloud-provider v0.33.2/go.mod h1:yS8ArLLLZV1+Tv6hkSYrZuYEVz+wQgiekUtaqe9Wxao= -k8s.io/component-base v0.33.3 h1:mlAuyJqyPlKZM7FyaoM/LcunZaaY353RXiOd2+B5tGA= -k8s.io/component-base v0.33.3/go.mod h1:ktBVsBzkI3imDuxYXmVxZ2zxJnYTZ4HAsVj9iF09qp4= -k8s.io/component-helpers v0.33.3 h1:fjWVORSQfI0WKzPeIFSju/gMD9sybwXBJ7oPbqQu6eM= -k8s.io/component-helpers v0.33.3/go.mod h1:7iwv+Y9Guw6X4RrnNQOyQlXcvJrVjPveHVqUA5dm31c= -k8s.io/controller-manager v0.33.2 h1:HIs8PbdTOaY6wTOvKKLwoAHSO6GeDjmYS0Gjnd6rF+c= -k8s.io/controller-manager v0.33.2/go.mod h1:n8maAdN06E3cD0h5N0wuYBv9Qi9FePl7y6Iz3pfc9PY= -k8s.io/csi-translation-lib v0.33.2 h1:QyWkVcf0rbNjc53uAqCyl9kmHCRn1O0Z4QT69y/jwHQ= -k8s.io/csi-translation-lib v0.33.2/go.mod h1:nFPX6BA20EDdIQpitb6p2wVtvLBuXsmm6D1Cwi3rDnE= -k8s.io/dynamic-resource-allocation v0.33.1 h1:xnEWV764LIsRQDTQ0tLFQMz1lY34Ep7D+/NNbrODfm4= -k8s.io/dynamic-resource-allocation v0.33.1/go.mod h1:AgBLCrIi+//A4VKljjJ7YPpJ+LeyDyTvUk7v8+Qf3pI= +gorm.io/gorm v1.30.3 h1:QiG8upl0Sg9ba2Zatfjy0fy4It2iNBL2/eMdvEkdXNs= +gorm.io/gorm v1.30.3/go.mod h1:8Z33v652h4//uMA76KjeDH8mJXPm1QNCYrMeatR0DOE= +k8s.io/api v0.34.0 h1:L+JtP2wDbEYPUeNGbeSa/5GwFtIA662EmT2YSLOkAVE= +k8s.io/api v0.34.0/go.mod h1:YzgkIzOOlhl9uwWCZNqpw6RJy9L2FK4dlJeayUoydug= +k8s.io/apiextensions-apiserver v0.34.0 h1:B3hiB32jV7BcyKcMU5fDaDxk882YrJ1KU+ZSkA9Qxoc= +k8s.io/apiextensions-apiserver v0.34.0/go.mod h1:hLI4GxE1BDBy9adJKxUxCEHBGZtGfIg98Q+JmTD7+g0= +k8s.io/apimachinery v0.34.0 h1:eR1WO5fo0HyoQZt1wdISpFDffnWOvFLOOeJ7MgIv4z0= +k8s.io/apimachinery v0.34.0/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw= +k8s.io/apiserver v0.34.0 h1:Z51fw1iGMqN7uJ1kEaynf2Aec1Y774PqU+FVWCFV3Jg= +k8s.io/apiserver v0.34.0/go.mod h1:52ti5YhxAvewmmpVRqlASvaqxt0gKJxvCeW7ZrwgazQ= +k8s.io/client-go v0.34.0 h1:YoWv5r7bsBfb0Hs2jh8SOvFbKzzxyNo0nSb0zC19KZo= +k8s.io/client-go v0.34.0/go.mod h1:ozgMnEKXkRjeMvBZdV1AijMHLTh3pbACPvK7zFR+QQY= +k8s.io/cloud-provider v0.34.0 h1:OgrNE+WSgfvDBQf6WS9qFM7Xr37bc0Og5kkL4hyWDmU= +k8s.io/cloud-provider v0.34.0/go.mod h1:JbMa0t6JIGDMLI7Py6bdp9TN6cfuHrWGq+E/X+Ljkmo= +k8s.io/component-base v0.34.0 h1:bS8Ua3zlJzapklsB1dZgjEJuJEeHjj8yTu1gxE2zQX8= +k8s.io/component-base v0.34.0/go.mod h1:RSCqUdvIjjrEm81epPcjQ/DS+49fADvGSCkIP3IC6vg= +k8s.io/component-helpers v0.34.0 h1:5T7P9XGMoUy1JDNKzHf0p/upYbeUf8ZaSf9jbx0QlIo= +k8s.io/component-helpers v0.34.0/go.mod 
h1:kaOyl5tdtnymriYcVZg4uwDBe2d1wlIpXyDkt6sVnt4= +k8s.io/controller-manager v0.34.0 h1:oCHoqS8dcFp7zDSu7HUvTpakq3isSxil3GprGGlJMsE= +k8s.io/controller-manager v0.34.0/go.mod h1:XFto21U+Mm9BT8r/Jd5E4tHCGtwjKAUFOuDcqaj2VK0= +k8s.io/csi-translation-lib v0.34.0 h1:WhCkq35XATZ+x6NKqI4u7XSYtmucuCN7jDk+mmm9XUU= +k8s.io/csi-translation-lib v0.34.0/go.mod h1:lZ+vpT3/6hx7GxXcI1mcoHxZSONvxgl2NwawzFnJP4Y= +k8s.io/dynamic-resource-allocation v0.34.0 h1:RrFNZXb2s5cvvf+KKdO92ss/e+zjGFFaDKAIpzA+Pu8= +k8s.io/dynamic-resource-allocation v0.34.0/go.mod h1:aqmoDIvXjQRhSgxQkFLl6+Ndg6MfdEOI+TQsj1j9V+g= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= -k8s.io/kms v0.33.2 h1:GFwNXX4CZGQCg9DPOaJi1/+iKidCtB9/OIAGdzRo8FI= -k8s.io/kms v0.33.2/go.mod h1:C1I8mjFFBNzfUZXYt9FZVJ8MJl7ynFbGgZFbBzkBJ3E= -k8s.io/kube-openapi v0.0.0-20250610211856-8b98d1ed966a h1:ZV3Zr+/7s7aVbjNGICQt+ppKWsF1tehxggNfbM7XnG8= -k8s.io/kube-openapi v0.0.0-20250610211856-8b98d1ed966a/go.mod h1:5jIi+8yX4RIb8wk3XwBo5Pq2ccx4FP10ohkbSKCZoK8= -k8s.io/kube-scheduler v0.32.7 h1:QOvu/fNEYGg1gzzpowWHFCI8SD3vJs5Iz0qebEQADd4= -k8s.io/kube-scheduler v0.32.7/go.mod h1:ez/2BnvZv2Bq1K9LpBsDgRsTvwJLAzkcpRMfY7rhLMA= -k8s.io/kubelet v0.33.1 h1:x4LCw1/iZVWOKA4RoITnuB8gMHnw31HPB3S0EF0EexE= -k8s.io/kubelet v0.33.1/go.mod h1:8WpdC9M95VmsqIdGSQrajXooTfT5otEj8pGWOm+KKfQ= -k8s.io/kubernetes v1.33.4 h1:T1d5FLUYm3/KyUeV7YJhKTR980zHCHb7K2xhCSo3lE8= -k8s.io/kubernetes v1.33.4/go.mod h1:nrt8sldmckKz2fCZhgRX3SKfS2e+CzXATPv6ITNkU00= -k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 h1:hwvWFiBzdWw1FhfY1FooPn3kzWuJ8tmbZBHi4zVsl1Y= -k8s.io/utils v0.0.0-20250604170112-4c0f3b243397/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= -nullprogram.com/x/optparse v1.0.0/go.mod h1:KdyPE+Igbe0jQUrVfMqDMeJQIJZEuyV7pjYmp6pbG50= +k8s.io/kms v0.34.0 h1:u+/rcxQ3Jr7gC9AY5nXuEnBcGEB7ZOIJ9cdLdyHyEjQ= +k8s.io/kms v0.34.0/go.mod h1:s1CFkLG7w9eaTYvctOxosx88fl4spqmixnNpys0JAtM= +k8s.io/kube-openapi v0.0.0-20250905212525-66792eed8611 h1:o4oKOsvSymDkZRsMAPZU7bRdwL+lPOK5VS10Dr1D6eg= +k8s.io/kube-openapi v0.0.0-20250905212525-66792eed8611/go.mod h1:kdmbQkyfwUagLfXIad1y2TdrjPFWp2Q89B3qkRwf/pQ= +k8s.io/kube-scheduler v0.34.0 h1:iUT5spyg0RlZ9W5dImrxSxv0yTqbsI+/J72/Iuv9ed8= +k8s.io/kube-scheduler v0.34.0/go.mod h1:7pt2HDb32lZOihbt/aamuMBvSe1o+rrd2rQC8aJyfP0= +k8s.io/kubelet v0.34.0 h1:1nZt1Q6Kfx7xCaTS9vnqR9sjZDxf3cRSQkAFCczULmc= +k8s.io/kubelet v0.34.0/go.mod h1:NqbF8ViVettlZbf9hw9DJhubaWn7rGvDDTcLMDm6tQ0= +k8s.io/kubernetes v1.34.0 h1:NvUrwPAVB4W3mSOpJ/RtNGHWWYyUP/xPaX5rUSpzA0w= +k8s.io/kubernetes v1.34.0/go.mod h1:iu+FhII+Oc/1gGWLJcer6wpyih441aNFHl7Pvm8yPto= +k8s.io/utils v0.0.0-20250820121507-0af2bda4dd1d h1:wAhiDyZ4Tdtt7e46e9M5ZSAJ/MnPGPs+Ki1gHw4w1R0= +k8s.io/utils v0.0.0-20250820121507-0af2bda4dd1d/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= -sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 h1:jpcvIRr3GLoUoEKRkHKSmGjxb6lWwrBlJsXc+eUYQHM= -sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw= -sigs.k8s.io/controller-runtime v0.21.0 h1:CYfjpEuicjUecRk+KAeyYh+ouUBn4llGyDYytIGcJS8= -sigs.k8s.io/controller-runtime v0.21.0/go.mod h1:OSg14+F65eWqIu4DceX7k/+QRAbTTvxeQSNSOQpukWM= -sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 h1:gBQPwqORJ8d8/YNZWEjoZs7npUVDpVXUUOFfW6CgAqE= -sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8/go.mod 
h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= -sigs.k8s.io/karpenter v1.6.1 h1:ZAC802Prk/GyKoGUu0LuzEn9fFmJLfUtMfo64derQgw= -sigs.k8s.io/karpenter v1.6.1/go.mod h1:AxCaeRjv1Pgw/Ff7vT4aqyXcg8v1UdBcfzWMCaKSVjA= -sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= +sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.33.0 h1:qPrZsv1cwQiFeieFlRqT627fVZ+tyfou/+S5S0H5ua0= +sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.33.0/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw= +sigs.k8s.io/controller-runtime v0.22.0 h1:mTOfibb8Hxwpx3xEkR56i7xSjB+nH4hZG37SrlCY5e0= +sigs.k8s.io/controller-runtime v0.22.0/go.mod h1:FwiwRjkRPbiN+zp2QRp7wlTCzbUXxZ/D4OzuQUDwBHY= +sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg= +sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= +sigs.k8s.io/karpenter v1.6.2 h1:WFayZ49CSOaDMku1iYBTsD3A9hOB2yU/U95VcSAJ8KM= +sigs.k8s.io/karpenter v1.6.2/go.mod h1:AxCaeRjv1Pgw/Ff7vT4aqyXcg8v1UdBcfzWMCaKSVjA= sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= -sigs.k8s.io/scheduler-plugins v0.32.7 h1:fGr4JKraaTe6it4PIqUlXStfctFKYxJgYkDsiU6699o= -sigs.k8s.io/scheduler-plugins v0.32.7/go.mod h1:Oem5rktj6wgFr2SUqcaInUTIBX8tlY8c4qid5vp2lBw= -sigs.k8s.io/structured-merge-diff/v4 v4.6.0 h1:IUA9nvMmnKWcj5jl84xn+T5MnlZKThmUW1TdblaLVAc= -sigs.k8s.io/structured-merge-diff/v4 v4.6.0/go.mod h1:dDy58f92j70zLsuZVuUX5Wp9vtxXpaZnkPGWeqDfCps= -sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= +sigs.k8s.io/structured-merge-diff/v6 v6.3.0 h1:jTijUJbW353oVOd9oTlifJqOGEkUw2jB/fXCbTiQEco= +sigs.k8s.io/structured-merge-diff/v6 v6.3.0/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE= sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs= sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4= diff --git a/internal/constants/env.go b/internal/constants/env.go index 1e26a392..06212f20 100644 --- a/internal/constants/env.go +++ b/internal/constants/env.go @@ -73,9 +73,10 @@ const ( LdPreloadFileName = "ld.so.preload" LdPreloadFile = "/etc/ld.so.preload" - TFLibsVolumeName = "tf-libs" - TFLibsVolumeMountPath = "/tensor-fusion" - TFConnectionNamePrefix = "-tf-vgpu-" + TFLibsVolumeName = "tf-libs" + TFLibsVolumeMountPath = "/tensor-fusion" + TFConnectionNamePrefix = "-tf-vgpu-" + TFConnectionNameNoPrefix = "tf-vgpu-" HostIPFieldRef = "status.hostIP" NodeNameFieldRef = "spec.nodeName" @@ -98,8 +99,7 @@ const ( LdPreloadEnv = "LD_PRELOAD" LdPreloadLimiter = "/home/app/libcuda_limiter.so" - SharedMemDeviceName = "/dev/shm" - SharedMemMountSubPath = "shm" + SharedMemMountSubPath = "/shm" // disable GPU limiter, for emergency use DisableGpuLimiterEnv = "DISABLE_GPU_LIMITER" diff --git a/internal/controller/node_controller.go b/internal/controller/node_controller.go index caedc903..d8908847 100644 --- a/internal/controller/node_controller.go +++ b/internal/controller/node_controller.go @@ -53,6 +53,8 @@ type NodeReconciler struct { // +kubebuilder:rbac:groups=core,resources=nodes/finalizers,verbs=create;get;patch;update // Reconcile k8s nodes to create and update GPUNode +// +//nolint:gocyclo func (r *NodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { log := log.FromContext(ctx) node := &corev1.Node{} diff --git 
a/internal/gpuallocator/gpuallocator.go b/internal/gpuallocator/gpuallocator.go index 6987ab77..c4a36980 100644 --- a/internal/gpuallocator/gpuallocator.go +++ b/internal/gpuallocator/gpuallocator.go @@ -30,7 +30,7 @@ import ( "k8s.io/apimachinery/pkg/util/sets" "k8s.io/client-go/tools/cache" "k8s.io/client-go/util/retry" - "k8s.io/kubernetes/pkg/scheduler/framework" + fwk "k8s.io/kube-scheduler/framework" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/log" @@ -40,6 +40,8 @@ import ( const MaxGPUCounterPerAllocation = 128 const CleanUpCheckInterval = 3 * time.Minute +var GPUCapacityMap = map[string]tfv1.Resource{} + type Strategy interface { Score(gpu *tfv1.GPU) int @@ -51,7 +53,7 @@ type SimulateSchedulingFilterDetail struct { FilterStageDetails []filter.FilterDetail } -func (p *SimulateSchedulingFilterDetail) Clone() framework.StateData { +func (p *SimulateSchedulingFilterDetail) Clone() fwk.StateData { return p } @@ -882,6 +884,10 @@ func (s *GpuAllocator) handleGPUCreate(ctx context.Context, gpu *tfv1.GPU) { s.poolGpuStore[pool][gpuInMem.Name] = gpuInMem } } + + if gpu.Status.GPUModel != "" { + GPUCapacityMap[gpu.Status.GPUModel] = *gpu.Status.Capacity + } log.Info("Added GPU to store", "name", key.Name, "phase", gpu.Status.Phase) } @@ -930,6 +936,12 @@ func (s *GpuAllocator) handleGPUUpdate(ctx context.Context, gpu *tfv1.GPU) { s.gpuStore[key] = gpu.DeepCopy() log.V(6).Info("Updated GPU in store (new entry)", "name", key.Name, "phase", gpu.Status.Phase) } + + if gpu.Status.GPUModel != "" { + if _, exists := GPUCapacityMap[gpu.Status.GPUModel]; !exists { + GPUCapacityMap[gpu.Status.GPUModel] = *gpu.Status.Capacity + } + } } func syncGPUMetadataAndStatusFromCluster(old *tfv1.GPU, gpu *tfv1.GPU) { diff --git a/internal/scheduler/gpuresources/gpuresources.go b/internal/scheduler/gpuresources/gpuresources.go index 16dd1c61..ee6b6e58 100644 --- a/internal/scheduler/gpuresources/gpuresources.go +++ b/internal/scheduler/gpuresources/gpuresources.go @@ -19,6 +19,7 @@ import ( "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/klog/v2" + fwk "k8s.io/kube-scheduler/framework" "k8s.io/kubernetes/pkg/scheduler/framework" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -58,7 +59,7 @@ type GPUSchedulingStateData struct { FinalGPUs []string } -func (p *GPUSchedulingStateData) Clone() framework.StateData { +func (p *GPUSchedulingStateData) Clone() fwk.StateData { return p } @@ -93,7 +94,7 @@ func (s *GPUFit) Name() string { return Name } -func (s *GPUFit) PreFilter(ctx context.Context, state *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) { +func (s *GPUFit) PreFilter(ctx context.Context, state fwk.CycleState, pod *v1.Pod, _ []fwk.NodeInfo) (*framework.PreFilterResult, *fwk.Status) { // Handle progressive migration case if utils.IsProgressiveMigration() && utils.HasGPUResourceRequest(pod) { nodeNames := s.allocator.ListNonUsingNodes() @@ -102,19 +103,19 @@ func (s *GPUFit) PreFilter(ctx context.Context, state *framework.CycleState, pod "use native GPU resources, available native GPU nodes: "+strconv.Itoa(len(nodeNames))) return &framework.PreFilterResult{ NodeNames: nodeNames, - }, framework.NewStatus(framework.Success, "progressive migration for native resources claim") + }, fwk.NewStatus(fwk.Success, "progressive migration for native resources claim") } // Skip non tensor-fusion mode if !utils.IsTensorFusionWorker(pod) { - return nil, 
framework.NewStatus(framework.Skip, "skip for non tensor-fusion mode") + return nil, fwk.NewStatus(fwk.Skip, "skip for non tensor-fusion mode") } // Handle tensor-fusion mode scheduling s.logger.Info("checking GPU node resources for pod", "pod", pod.Name) allocRequest, reason, err := s.allocator.ComposeAllocationRequest(pod) if err != nil { - return nil, framework.NewStatus(framework.Error, reason) + return nil, fwk.NewStatus(fwk.Error, reason) } state.Write(CycleStateAllocateRequest, allocRequest) @@ -134,7 +135,7 @@ func (s *GPUFit) PreFilter(ctx context.Context, state *framework.CycleState, pod s.fh.EventRecorder().Eventf(pod, pod, v1.EventTypeWarning, "GPUQuotaOrCapacityNotEnough", "check quota and filter", "TensorFusion schedule failed, no enough resource or quotas: "+err.Error()) s.logger.Error(err, "failed to check quota and filter", "pod", pod.Name) - return nil, framework.NewStatus(framework.Unschedulable, err.Error()) + return nil, fwk.NewStatus(fwk.Unschedulable, err.Error()) } validNodesValidGPUs := lo.GroupBy(filteredGPUs, func(gpu *tfv1.GPU) string { @@ -199,51 +200,51 @@ func (s *GPUFit) PreFilter(ctx context.Context, state *framework.CycleState, pod return &framework.PreFilterResult{ NodeNames: nodeNames, - }, framework.NewStatus(framework.Success) + }, fwk.NewStatus(fwk.Success) } func (s *GPUFit) PreFilterExtensions() framework.PreFilterExtensions { return nil } -func (s *GPUFit) Filter(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status { +func (s *GPUFit) Filter(ctx context.Context, state fwk.CycleState, pod *v1.Pod, nodeInfo fwk.NodeInfo) *fwk.Status { if !utils.IsTensorFusionWorker(pod) { - return framework.NewStatus(framework.Success, "skip for non tensor-fusion mode") + return fwk.NewStatus(fwk.Success, "skip for non tensor-fusion mode") } filterResult, err := state.Read(CycleStateGPUSchedulingResult) if err != nil { - return framework.NewStatus(framework.Error, err.Error()) + return fwk.NewStatus(fwk.Error, err.Error()) } - nodeName := nodeInfo.GetName() + nodeName := nodeInfo.Node().Name if _, ok := filterResult.(*GPUSchedulingStateData).NodeGPUs[nodeName]; !ok { - return framework.NewStatus(framework.Unschedulable, "no valid node found, gpu capacity not enough") + return fwk.NewStatus(fwk.Unschedulable, "no valid node found, gpu capacity not enough") } - return framework.NewStatus(framework.Success, "") + return fwk.NewStatus(fwk.Success, "") } func (s *GPUFit) Score( ctx context.Context, - state *framework.CycleState, + state fwk.CycleState, pod *v1.Pod, - nodeInfo *framework.NodeInfo, -) (int64, *framework.Status) { + nodeInfo fwk.NodeInfo, +) (int64, *fwk.Status) { // Skip non tensor-fusion mode scheduling if !utils.IsTensorFusionWorker(pod) { - return 0, framework.NewStatus(framework.Success, "") + return 0, fwk.NewStatus(fwk.Success, "") } if state == nil { - return 0, framework.NewStatus(framework.Error, "no valid node found, gpu capacity not enough") + return 0, fwk.NewStatus(fwk.Error, "no valid node found, gpu capacity not enough") } filterResult, err := state.Read(CycleStateGPUSchedulingResult) if err != nil { - return 0, framework.NewStatus(framework.Error, err.Error()) + return 0, fwk.NewStatus(fwk.Error, err.Error()) } scheduledState := filterResult.(*GPUSchedulingStateData) - gpuScoreMap, ok := scheduledState.ValidNodeGPUScore[nodeInfo.GetName()] + gpuScoreMap, ok := scheduledState.ValidNodeGPUScore[nodeInfo.Node().Name] if !ok { - return 0, framework.NewStatus(framework.Unschedulable, "no 
valid node found, gpu capacity not enough") + return 0, fwk.NewStatus(fwk.Unschedulable, "no valid node found, gpu capacity not enough") } // normalize to 0-100, when node has more GPUs but filtered out, // should consider it as 100 when strategy is compact_first, and consider as 0 when is low_load_first @@ -252,7 +253,7 @@ func (s *GPUFit) Score( sum += score } - notMatchingGPUScoreMap, ok := scheduledState.ValidNodeNotMatchingGPUScore[nodeInfo.GetName()] + notMatchingGPUScoreMap, ok := scheduledState.ValidNodeNotMatchingGPUScore[nodeInfo.Node().Name] if ok { for _, score := range notMatchingGPUScoreMap { sum += score @@ -265,27 +266,27 @@ func (s *GPUFit) ScoreExtensions() framework.ScoreExtensions { return nil } -func (s *GPUFit) Reserve(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) *framework.Status { +func (s *GPUFit) Reserve(ctx context.Context, state fwk.CycleState, pod *v1.Pod, nodeName string) *fwk.Status { if !utils.IsTensorFusionWorker(pod) { - return framework.NewStatus(framework.Success, "skip for non tensor-fusion mode") + return fwk.NewStatus(fwk.Success, "skip for non tensor-fusion mode") } s.logger.Info("Reserving pod for GPU resources", "pod", pod.Name, "node", nodeName) allocRequest, err := state.Read(CycleStateAllocateRequest) if err != nil { - return framework.NewStatus(framework.Error, err.Error()) + return fwk.NewStatus(fwk.Error, err.Error()) } schedulingResultRaw, err := state.Read(CycleStateGPUSchedulingResult) if err != nil { - return framework.NewStatus(framework.Error, err.Error()) + return fwk.NewStatus(fwk.Error, err.Error()) } // set final GPUs and try update GPU allocator cache schedulingResult := schedulingResultRaw.(*GPUSchedulingStateData) gpuScoreMap, ok := schedulingResult.ValidNodeGPUScore[nodeName] if !ok { - return framework.NewStatus(framework.Unschedulable, "no valid node found, gpu capacity not enough") + return fwk.NewStatus(fwk.Unschedulable, "no valid node found, gpu capacity not enough") } // find top N score GPUs in this node @@ -306,12 +307,12 @@ func (s *GPUFit) Reserve(ctx context.Context, state *framework.CycleState, pod * allocRequest.(*tfv1.AllocRequest), ) if err != nil { - return framework.NewStatus(framework.Error, err.Error()) + return fwk.NewStatus(fwk.Error, err.Error()) } - return framework.NewStatus(framework.Success, "") + return fwk.NewStatus(fwk.Success, "") } -func (s *GPUFit) Unreserve(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) { +func (s *GPUFit) Unreserve(ctx context.Context, state fwk.CycleState, pod *v1.Pod, nodeName string) { if !utils.IsTensorFusionWorker(pod) { return } @@ -330,7 +331,7 @@ func (s *GPUFit) Unreserve(ctx context.Context, state *framework.CycleState, pod }, schedulingResult.FinalGPUs, pod.ObjectMeta) } -func (s *GPUFit) PostBind(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) { +func (s *GPUFit) PostBind(ctx context.Context, state fwk.CycleState, pod *v1.Pod, nodeName string) { if !utils.IsTensorFusionWorker(pod) { return } diff --git a/internal/scheduler/gpuresources/gpuresources_test.go b/internal/scheduler/gpuresources/gpuresources_test.go index fb7e45b5..71af8c0f 100644 --- a/internal/scheduler/gpuresources/gpuresources_test.go +++ b/internal/scheduler/gpuresources/gpuresources_test.go @@ -14,23 +14,28 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/informers" + clientsetfake 
"k8s.io/client-go/kubernetes/fake" "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/tools/events" - "k8s.io/kubernetes/pkg/scheduler/framework" + fwk "k8s.io/kube-scheduler/framework" + framework "k8s.io/kubernetes/pkg/scheduler/framework" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/defaultbinder" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/queuesort" frameworkruntime "k8s.io/kubernetes/pkg/scheduler/framework/runtime" + "k8s.io/kubernetes/pkg/scheduler/metrics" st "k8s.io/kubernetes/pkg/scheduler/testing" tf "k8s.io/kubernetes/pkg/scheduler/testing/framework" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" "sigs.k8s.io/controller-runtime/pkg/log" - testutil "sigs.k8s.io/scheduler-plugins/test/util" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" "github.com/NexusGPU/tensor-fusion/internal/constants" "github.com/NexusGPU/tensor-fusion/internal/gpuallocator" "github.com/NexusGPU/tensor-fusion/internal/utils" + internalcache "k8s.io/kubernetes/pkg/scheduler/backend/cache" + internalqueue "k8s.io/kubernetes/pkg/scheduler/backend/queue" ) type GPUResourcesSuite struct { @@ -201,6 +206,7 @@ func (s *GPUResourcesSuite) SetupTest() { }, }, } + s.client = fake.NewClientBuilder().WithScheme(scheme.Scheme). WithRuntimeObjects(objList...). WithStatusSubresource( @@ -213,9 +219,11 @@ func (s *GPUResourcesSuite) SetupTest() { ). Build() + k8sObjs := make([]runtime.Object, 0, len(pods)+len(nodes)) for _, pod := range pods { err := s.client.Create(s.ctx, pod) s.NoError(err) + k8sObjs = append(k8sObjs, pod) } for _, gpu := range gpus { err := s.client.Create(s.ctx, gpu) @@ -224,6 +232,7 @@ func (s *GPUResourcesSuite) SetupTest() { for _, node := range nodes { err := s.client.Create(s.ctx, node) s.NoError(err) + k8sObjs = append(k8sObjs, node) } var registerPlugins []tf.RegisterPluginFunc @@ -233,11 +242,16 @@ func (s *GPUResourcesSuite) SetupTest() { tf.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), ) + fakeClientSet := clientsetfake.NewSimpleClientset(k8sObjs...) 
+ informerFactory := informers.NewSharedInformerFactory(fakeClientSet, 0) + metrics.Register() + metricsRecorder := metrics.NewMetricsAsyncRecorder(1000, time.Second, s.ctx.Done()) fwk, err := tf.NewFramework( s.ctx, registeredPlugins, "", - frameworkruntime.WithPodNominator(testutil.NewPodNominator(nil)), - frameworkruntime.WithSnapshotSharedLister(testutil.NewFakeSharedLister(pods, nodes)), + frameworkruntime.WithPodNominator(internalqueue.NewSchedulingQueue(nil, informerFactory)), + frameworkruntime.WithSnapshotSharedLister(internalcache.NewEmptySnapshot()), frameworkruntime.WithEventRecorder(&events.FakeRecorder{}), + frameworkruntime.WithMetricsRecorder(metricsRecorder), ) s.NoError(err) s.fwk = fwk @@ -271,7 +285,7 @@ func (s *GPUResourcesSuite) TestPreFilter() { tests := []struct { name string pod *v1.Pod - expectedStatus framework.Code + expectedStatus fwk.Code expectedNodes string }{ { @@ -282,7 +296,7 @@ func (s *GPUResourcesSuite) TestPreFilter() { constants.TFLOPSRequestAnnotation: "100", constants.VRAMRequestAnnotation: "10Gi", }), - expectedStatus: framework.Success, + expectedStatus: fwk.Success, expectedNodes: "node-a node-b", }, { @@ -293,7 +307,7 @@ func (s *GPUResourcesSuite) TestPreFilter() { constants.TFLOPSRequestAnnotation: "2000", constants.VRAMRequestAnnotation: "10Gi", }), - expectedStatus: framework.Success, + expectedStatus: fwk.Success, expectedNodes: "node-b", }, { @@ -304,7 +318,7 @@ func (s *GPUResourcesSuite) TestPreFilter() { constants.TFLOPSRequestAnnotation: "100", constants.VRAMRequestAnnotation: "10Gi", }), - expectedStatus: framework.Success, + expectedStatus: fwk.Success, expectedNodes: "node-b", }, { @@ -315,7 +329,7 @@ func (s *GPUResourcesSuite) TestPreFilter() { constants.TFLOPSRequestAnnotation: "2000", constants.VRAMRequestAnnotation: "80Gi", }), - expectedStatus: framework.Unschedulable, + expectedStatus: fwk.Unschedulable, expectedNodes: "", }, { @@ -326,7 +340,7 @@ func (s *GPUResourcesSuite) TestPreFilter() { constants.TFLOPSRequestAnnotation: "100", constants.VRAMRequestAnnotation: "10Gi", }), - expectedStatus: framework.Unschedulable, + expectedStatus: fwk.Unschedulable, expectedNodes: "", }, } @@ -334,9 +348,9 @@ func (s *GPUResourcesSuite) TestPreFilter() { for _, tt := range tests { s.Run(tt.name, func() { state := framework.NewCycleState() - res, status := s.plugin.PreFilter(s.ctx, state, tt.pod) + res, status := s.plugin.PreFilter(s.ctx, state, tt.pod, []fwk.NodeInfo{}) s.Equal(tt.expectedStatus, status.Code(), status.Message()) - if tt.expectedStatus == framework.Success { + if tt.expectedStatus == fwk.Success { s.Require().NotNil(res) nodes := sort.StringSlice(res.NodeNames.UnsortedList()) nodes.Sort() @@ -351,19 +365,19 @@ func (s *GPUResourcesSuite) TestPreFilterForNonTensorFusionPod() { tests := []struct { name string pod *v1.Pod - expectedStatus framework.Code + expectedStatus fwk.Code expectedNodes string }{ { name: "pod requires 1 GPU, enough capacity", pod: s.makeNonTensorFusionPod("p1", 1), - expectedStatus: framework.Success, + expectedStatus: fwk.Success, expectedNodes: "node-b node-c", }, { name: "pod requires 2 GPU, enough capacity", pod: s.makeNonTensorFusionPod("p1", 2), - expectedStatus: framework.Success, + expectedStatus: fwk.Success, expectedNodes: "node-b node-c", }, } @@ -371,9 +385,9 @@ func (s *GPUResourcesSuite) TestPreFilterForNonTensorFusionPod() { for _, tt := range tests { s.Run(tt.name, func() { state := framework.NewCycleState() - res, status := s.plugin.PreFilter(s.ctx, state, tt.pod) + res, status := 
s.plugin.PreFilter(s.ctx, state, tt.pod, []fwk.NodeInfo{}) s.Equal(tt.expectedStatus, status.Code(), status.Message()) - if tt.expectedStatus == framework.Success { + if tt.expectedStatus == fwk.Success { s.Require().NotNil(res) nodes := sort.StringSlice(res.NodeNames.UnsortedList()) nodes.Sort() @@ -394,23 +408,23 @@ func (s *GPUResourcesSuite) TestFilter() { constants.TFLOPSLimitAnnotation: "100", constants.VRAMLimitAnnotation: "40Gi", }) - _, preFilterStatus := s.plugin.PreFilter(s.ctx, state, pod) + _, preFilterStatus := s.plugin.PreFilter(s.ctx, state, pod, []fwk.NodeInfo{}) s.Require().True(preFilterStatus.IsSuccess()) tests := []struct { name string nodeName string - expectedStatus framework.Code + expectedStatus fwk.Code }{ { name: "node with available GPU", nodeName: "node-a", - expectedStatus: framework.Success, + expectedStatus: fwk.Success, }, { name: "node without available GPU", nodeName: "node-c", - expectedStatus: framework.Unschedulable, + expectedStatus: fwk.Unschedulable, }, } @@ -435,7 +449,7 @@ func (s *GPUResourcesSuite) TestScore() { constants.TFLOPSLimitAnnotation: "100", constants.VRAMLimitAnnotation: "40Gi", }) - _, preFilterStatus := s.plugin.PreFilter(s.ctx, state, pod) + _, preFilterStatus := s.plugin.PreFilter(s.ctx, state, pod, []fwk.NodeInfo{}) s.Require().True(preFilterStatus.IsSuccess()) // node a as one worker consumed 10% GPU resources @@ -466,7 +480,7 @@ func (s *GPUResourcesSuite) TestReserveAndUnreserve() { constants.TFLOPSLimitAnnotation: "100", constants.VRAMLimitAnnotation: "40Gi", }) - _, preFilterStatus := s.plugin.PreFilter(s.ctx, state, pod) + _, preFilterStatus := s.plugin.PreFilter(s.ctx, state, pod, []fwk.NodeInfo{}) s.Require().True(preFilterStatus.IsSuccess()) // Reserve on node-a @@ -507,7 +521,7 @@ func (s *GPUResourcesSuite) TestPostBind() { constants.TFLOPSLimitAnnotation: "100", constants.VRAMLimitAnnotation: "40Gi", }) - _, preFilterStatus := s.plugin.PreFilter(s.ctx, state, pod) + _, preFilterStatus := s.plugin.PreFilter(s.ctx, state, pod, []fwk.NodeInfo{}) s.Require().True(preFilterStatus.IsSuccess()) reserveStatus := s.plugin.Reserve(s.ctx, state, pod, "node-a") @@ -629,13 +643,13 @@ func (s *GPUResourcesSuite) TestReserve_ErrorHandling() { // No pre-filter call, so state is empty status := s.plugin.Reserve(s.ctx, state, pod, "node-a") s.Error(status.AsError()) - s.Equal(framework.Error, status.Code()) + s.Equal(fwk.Error, status.Code()) // Pre-filter, but for a different node - _, preFilterStatus := s.plugin.PreFilter(s.ctx, state, pod) + _, preFilterStatus := s.plugin.PreFilter(s.ctx, state, pod, []fwk.NodeInfo{}) s.Require().True(preFilterStatus.IsSuccess()) status = s.plugin.Reserve(s.ctx, state, pod, "node-c-non-existent") - s.Equal(framework.Unschedulable, status.Code()) + s.Equal(fwk.Unschedulable, status.Code()) } func (s *GPUResourcesSuite) TestUnreserve_ErrorHandling() { @@ -668,7 +682,7 @@ func (s *GPUResourcesSuite) TestPostBind_ErrorHandling() { s.plugin.PostBind(s.ctx, state, pod, "node-a") // Test with a pod that doesn't exist in the client - _, preFilterStatus := s.plugin.PreFilter(s.ctx, state, pod) + _, preFilterStatus := s.plugin.PreFilter(s.ctx, state, pod, []fwk.NodeInfo{}) s.Require().True(preFilterStatus.IsSuccess()) reserveStatus := s.plugin.Reserve(s.ctx, state, pod, "node-a") s.Require().True(reserveStatus.IsSuccess()) @@ -688,7 +702,7 @@ func (s *GPUResourcesSuite) TestFilter_ErrorHandling() { // No pre-filter call, so state is empty status := s.plugin.Filter(s.ctx, state, pod, nodeInfo) 
s.Error(status.AsError()) - s.Equal(framework.Error, status.Code()) + s.Equal(fwk.Error, status.Code()) } func (s *GPUResourcesSuite) TestScore_ErrorHandling() { @@ -704,13 +718,13 @@ func (s *GPUResourcesSuite) TestScore_ErrorHandling() { nodeInfo.SetNode(&v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node-a"}}) _, status := s.plugin.Score(s.ctx, state, pod, nodeInfo) s.Error(status.AsError()) - s.Equal(framework.Error, status.Code()) + s.Equal(fwk.Error, status.Code()) // Pre-filter, but for a different node nodeInfo = &framework.NodeInfo{} nodeInfo.SetNode(&v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node-c-non-existent"}}) - _, preFilterStatus := s.plugin.PreFilter(s.ctx, state, pod) + _, preFilterStatus := s.plugin.PreFilter(s.ctx, state, pod, []fwk.NodeInfo{}) s.Require().True(preFilterStatus.IsSuccess()) _, status = s.plugin.Score(s.ctx, state, pod, nodeInfo) - s.Equal(framework.Unschedulable, status.Code()) + s.Equal(fwk.Unschedulable, status.Code()) } diff --git a/internal/scheduler/gputopo/gpu_network_topo.go b/internal/scheduler/gputopo/gpu_network_topo.go index f481ea8c..197e3995 100644 --- a/internal/scheduler/gputopo/gpu_network_topo.go +++ b/internal/scheduler/gputopo/gpu_network_topo.go @@ -9,6 +9,7 @@ import ( v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/klog/v2" + fwk "k8s.io/kube-scheduler/framework" "k8s.io/kubernetes/pkg/scheduler/framework" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -53,6 +54,6 @@ func (s *GPUNetworkTopologyAware) Name() string { return Name } -func (s *GPUNetworkTopologyAware) Filter(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status { - return framework.NewStatus(framework.Success, "") +func (s *GPUNetworkTopologyAware) Filter(ctx context.Context, state fwk.CycleState, pod *v1.Pod, nodeInfo fwk.NodeInfo) *fwk.Status { + return fwk.NewStatus(fwk.Success, "") } diff --git a/internal/server/router/allocator_info.go b/internal/server/router/allocator_info.go index 7c8c4f78..58a949cf 100644 --- a/internal/server/router/allocator_info.go +++ b/internal/server/router/allocator_info.go @@ -17,6 +17,7 @@ import ( tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + fwk "k8s.io/kube-scheduler/framework" "k8s.io/kubernetes/pkg/scheduler" "k8s.io/kubernetes/pkg/scheduler/framework" "sigs.k8s.io/controller-runtime/pkg/log" @@ -104,20 +105,20 @@ func (r *AllocatorInfoRouter) SimulateScheduleOnePod(ctx *gin.Context) { state.SetRecordPluginMetrics(false) podsToActivate := framework.NewPodsToActivate() state.Write(framework.PodsToActivateKey, podsToActivate) - state.Write(framework.StateKey(constants.SchedulerSimulationKey), &gpuallocator.SimulateSchedulingFilterDetail{ + state.Write(fwk.StateKey(constants.SchedulerSimulationKey), &gpuallocator.SimulateSchedulingFilterDetail{ FilterStageDetails: []filter.FilterDetail{}, }) // simulate schedulingCycle non side effect part - fwk := r.scheduler.Profiles[pod.Spec.SchedulerName] - if fwk == nil { + fwkInstance := r.scheduler.Profiles[pod.Spec.SchedulerName] + if fwkInstance == nil { log.FromContext(ctx).Error(nil, "scheduler framework not found", "pod", pod.Name, "namespace", pod.Namespace) ctx.JSON(http.StatusInternalServerError, gin.H{"error": "scheduler framework not found"}) return } - scheduleResult, err := r.scheduler.SchedulePod(ctx, fwk, state, pod) + scheduleResult, err := r.scheduler.SchedulePod(ctx, fwkInstance, state, pod) gpuCycleState, _ := 
state.Read(gpuresources.CycleStateGPUSchedulingResult) - simulateSchedulingFilterDetail, _ := state.Read(framework.StateKey(constants.SchedulerSimulationKey)) + simulateSchedulingFilterDetail, _ := state.Read(fwk.StateKey(constants.SchedulerSimulationKey)) if err != nil { if fitError, ok := err.(*framework.FitError); ok { ctx.JSON(http.StatusOK, gin.H{ diff --git a/internal/utils/compose.go b/internal/utils/compose.go index 93e8248c..2a62af0b 100644 --- a/internal/utils/compose.go +++ b/internal/utils/compose.go @@ -230,7 +230,7 @@ func AddTFDefaultClientConfBeforePatch( pod.Spec.Containers[injectContainerIndex].VolumeMounts, v1.VolumeMount{ Name: constants.DataVolumeName, - MountPath: constants.TFLibsVolumeMountPath, + MountPath: constants.TFDataPath + constants.SharedMemMountSubPath, SubPathExpr: constants.TFDataPathWorkerExpr, MountPropagation: ptr.To(v1.MountPropagationHostToContainer), }) @@ -462,8 +462,7 @@ func composeHypervisorContainer(spec *v1.PodSpec, pool *tfv1.GPUPool, enableVect spec.Containers[0].VolumeMounts = append(spec.Containers[0].VolumeMounts, v1.VolumeMount{ Name: constants.DataVolumeName, ReadOnly: false, - MountPath: constants.SharedMemDeviceName, - SubPath: constants.SharedMemMountSubPath, + MountPath: constants.TFDataPath, }, v1.VolumeMount{ Name: constants.TensorFusionGPUInfoConfigVolumeName, MountPath: constants.TensorFusionGPUInfoConfigMountPath, @@ -682,7 +681,7 @@ func AddWorkerConfAfterTemplate(ctx context.Context, spec *v1.PodSpec, workerCon spec.Containers[0].VolumeMounts, v1.VolumeMount{ Name: constants.DataVolumeName, - MountPath: constants.TFLibsVolumeMountPath, + MountPath: constants.TFDataPath + constants.SharedMemMountSubPath, SubPathExpr: constants.TFDataPathWorkerExpr, MountPropagation: ptr.To(v1.MountPropagationHostToContainer), }) diff --git a/internal/webhook/v1/pod_webhook.go b/internal/webhook/v1/pod_webhook.go index 542a3ab0..8c5aca06 100644 --- a/internal/webhook/v1/pod_webhook.go +++ b/internal/webhook/v1/pod_webhook.go @@ -53,20 +53,18 @@ func SetupPodWebhookWithManager(mgr ctrl.Manager, portAllocator *portallocator.P webhookServer.Register("/mutate-v1-pod", &admission.Webhook{ Handler: &TensorFusionPodMutator{ - decoder: admission.NewDecoder(runtime.NewScheme()), - Client: mgr.GetClient(), - portAllocator: portAllocator, - pricingProvider: pricingProvider, + decoder: admission.NewDecoder(runtime.NewScheme()), + Client: mgr.GetClient(), + portAllocator: portAllocator, }, }) return nil } type TensorFusionPodMutator struct { - Client client.Client - decoder admission.Decoder - portAllocator *portallocator.PortAllocator - pricingProvider pricing.PricingProvider + Client client.Client + decoder admission.Decoder + portAllocator *portallocator.PortAllocator } // Handle implements admission.Handler interface. 
@@ -103,7 +101,7 @@ func (m *TensorFusionPodMutator) Handle(ctx context.Context, req admission.Reque return admission.Errored(http.StatusBadRequest, fmt.Errorf("failed to marshal current pod: %w", err)) } - tfInfo, err := ParseTensorFusionInfo(ctx, m.Client, pod, m.pricingProvider) + tfInfo, err := ParseTensorFusionInfo(ctx, m.Client, pod) if err != nil { return admission.Errored(http.StatusInternalServerError, fmt.Errorf("parse tf resources: %w", err)) } @@ -395,7 +393,7 @@ func addConnectionForRemoteFixedReplicaVirtualGPU(pod *corev1.Pod, container *co if pod.GenerateName == "" && pod.Name != "" { prefix = pod.Name + constants.TFConnectionNamePrefix } else { - prefix = pod.GenerateName + constants.TFConnectionNamePrefix + prefix = pod.GenerateName + constants.TFConnectionNameNoPrefix } connectionName := fmt.Sprintf("%s%s", prefix, utils.NewShortID(10)) connectionNamespace := pod.Namespace diff --git a/internal/webhook/v1/pod_webhook_test.go b/internal/webhook/v1/pod_webhook_test.go index d72770cc..374f2620 100644 --- a/internal/webhook/v1/pod_webhook_test.go +++ b/internal/webhook/v1/pod_webhook_test.go @@ -23,9 +23,9 @@ import ( "net/http" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" - "github.com/NexusGPU/tensor-fusion/internal/cloudprovider/pricing" "github.com/NexusGPU/tensor-fusion/internal/config" "github.com/NexusGPU/tensor-fusion/internal/constants" + "github.com/NexusGPU/tensor-fusion/internal/gpuallocator" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" "github.com/samber/lo" @@ -241,6 +241,56 @@ var _ = Describe("TensorFusionPodMutator", func() { Expect(resp.Patches).To(BeEmpty()) }) + It("should handle dedicated GPU", func() { + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod-local-gpu", + Namespace: "default", + Labels: map[string]string{ + constants.TensorFusionEnabledLabelKey: "true", + }, + Annotations: map[string]string{ + constants.DedicatedGPUAnnotation: constants.TrueStringValue, + constants.GPUModelAnnotation: "A100", + constants.GpuPoolKey: "mock", + }, + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "main", + Image: "test-image", + }, + }, + }, + } + podBytes, err := json.Marshal(pod) + Expect(err).NotTo(HaveOccurred()) + req := admission.Request{ + AdmissionRequest: admissionv1.AdmissionRequest{ + Object: runtime.RawExtension{ + Raw: podBytes, + }, + Operation: admissionv1.Create, + Namespace: "default", + }, + } + + gpuallocator.GPUCapacityMap["A100"] = tfv1.Resource{ + Tflops: resource.MustParse("312"), + Vram: resource.MustParse("40Gi"), + } + resp := mutator.Handle(ctx, req) + Expect(resp.Allowed).To(BeTrue()) + + op, found := lo.Find(resp.Patches, func(patch jsonpatch.JsonPatchOperation) bool { + return patch.Operation == "add" && + patch.Path == "/metadata/annotations/tensor-fusion.ai~1tflops-request" + }) + Expect(found).To(BeTrue()) + Expect(op.Value).To(Equal("312")) + }) + It("should handle invalid pod specification", func() { req := admission.Request{ AdmissionRequest: admissionv1.AdmissionRequest{ @@ -533,9 +583,7 @@ var _ = Describe("TensorFusionPodMutator", func() { }, }, } - // Create a mock pricing provider for testing - mockPricingProvider := &pricing.StaticPricingProvider{} - tfInfo, err := ParseTensorFusionInfo(ctx, k8sClient, pod, mockPricingProvider) + tfInfo, err := ParseTensorFusionInfo(ctx, k8sClient, pod) Expect(err).NotTo(HaveOccurred()) Expect(tfInfo.ContainerNames).To(HaveLen(1)) Expect(tfInfo.ContainerNames[0]).To(Equal("test-container")) diff --git 
a/internal/webhook/v1/tf_parser.go b/internal/webhook/v1/tf_parser.go index cd72fbc1..2fa7b744 100644 --- a/internal/webhook/v1/tf_parser.go +++ b/internal/webhook/v1/tf_parser.go @@ -7,8 +7,8 @@ import ( "strings" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" - "github.com/NexusGPU/tensor-fusion/internal/cloudprovider/pricing" "github.com/NexusGPU/tensor-fusion/internal/constants" + "github.com/NexusGPU/tensor-fusion/internal/gpuallocator" "github.com/NexusGPU/tensor-fusion/internal/utils" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" @@ -30,7 +30,6 @@ func ParseTensorFusionInfo( ctx context.Context, k8sClient client.Client, pod *corev1.Pod, - pricingProvider pricing.PricingProvider, ) (utils.TensorFusionInfo, error) { var info utils.TensorFusionInfo if pod.Annotations == nil { @@ -118,7 +117,7 @@ func ParseTensorFusionInfo( } // Handle dedicated GPU logic - err = handleDedicatedGPU(pod, workloadProfile, pricingProvider) + err = handleDedicatedGPU(pod, workloadProfile) if err != nil { return info, fmt.Errorf("handle dedicated GPU: %w", err) } @@ -237,7 +236,7 @@ func setDefaultQuotasIfExists(workloadProfile *tfv1.WorkloadProfile, single tfv1 } // handleDedicatedGPU handles dedicated GPU annotation by setting full GPU capacity -func handleDedicatedGPU(pod *corev1.Pod, workloadProfile *tfv1.WorkloadProfile, pricingProvider pricing.PricingProvider) error { +func handleDedicatedGPU(pod *corev1.Pod, workloadProfile *tfv1.WorkloadProfile) error { dedicatedGPU, ok := pod.Annotations[constants.DedicatedGPUAnnotation] if !ok || dedicatedGPU != constants.TrueStringValue { return nil // Not a dedicated GPU request @@ -249,16 +248,16 @@ func handleDedicatedGPU(pod *corev1.Pod, workloadProfile *tfv1.WorkloadProfile, } // Get full GPU capacity from pricing provider - tflops, vram, found := pricingProvider.GetGPUCapacityByModel(workloadProfile.Spec.GPUModel) + resource, found := gpuallocator.GPUCapacityMap[workloadProfile.Spec.GPUModel] if !found { return fmt.Errorf("could not find capacity information for GPU model: %s", workloadProfile.Spec.GPUModel) } // Set full capacity for both requests and limits - workloadProfile.Spec.Resources.Requests.Tflops = tflops - workloadProfile.Spec.Resources.Requests.Vram = vram - workloadProfile.Spec.Resources.Limits.Tflops = tflops - workloadProfile.Spec.Resources.Limits.Vram = vram + workloadProfile.Spec.Resources.Requests.Tflops = resource.Tflops + workloadProfile.Spec.Resources.Requests.Vram = resource.Vram + workloadProfile.Spec.Resources.Limits.Tflops = resource.Tflops + workloadProfile.Spec.Resources.Limits.Vram = resource.Vram return nil } diff --git a/patches/scheduler-csi-capacity-3.patch b/patches/scheduler-csi-capacity-3.patch index 29a21ae8..c5841d08 100644 --- a/patches/scheduler-csi-capacity-3.patch +++ b/patches/scheduler-csi-capacity-3.patch @@ -9,11 +9,11 @@ "strings" "time" -@@ -514,6 +516,14 @@ +@@ -543,6 +545,14 @@ } handlers = append(handlers, handlerRegistration) - case framework.CSIStorageCapacity: -+ // FIX kubernetes 1.24 and lower version API missing issue + case fwk.CSIStorageCapacity: ++ // FIX kubernetes 1.23 and lower version API missing issue + minorVersionStr := os.Getenv("KUBE_API_VERSION_MINOR") + if minorVersionStr != "" { + minorVersion, err := strconv.Atoi(minorVersionStr) @@ -22,5 +22,50 @@ + } + } if handlerRegistration, err = informerFactory.Storage().V1().CSIStorageCapacities().Informer().AddEventHandler( - buildEvtResHandler(at, framework.CSIStorageCapacity), + buildEvtResHandler(at, 
fwk.CSIStorageCapacity), ); err != nil { +@@ -578,6 +588,14 @@ + } + handlers = append(handlers, handlerRegistration) + case fwk.ResourceClaim: ++ // FIX kubernetes lower version API missing issue ++ minorVersionStr := os.Getenv("KUBE_API_VERSION_MINOR") ++ if minorVersionStr != "" { ++ minorVersion, err := strconv.Atoi(minorVersionStr) ++ if err != nil || minorVersion < 34 { ++ continue ++ } ++ } + if utilfeature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation) { + handlerRegistration = resourceClaimCache.AddEventHandler( + buildEvtResHandler(at, fwk.ResourceClaim), +@@ -585,6 +603,14 @@ + handlers = append(handlers, handlerRegistration) + } + case fwk.ResourceSlice: ++ // FIX kubernetes lower version API missing issue ++ minorVersionStr := os.Getenv("KUBE_API_VERSION_MINOR") ++ if minorVersionStr != "" { ++ minorVersion, err := strconv.Atoi(minorVersionStr) ++ if err != nil || minorVersion < 34 { ++ continue ++ } ++ } + if utilfeature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation) { + if handlerRegistration, err = resourceSliceTracker.AddEventHandler( + buildEvtResHandler(at, fwk.ResourceSlice), +@@ -594,6 +620,14 @@ + handlers = append(handlers, handlerRegistration) + } + case fwk.DeviceClass: ++ // FIX kubernetes lower version API missing issue ++ minorVersionStr := os.Getenv("KUBE_API_VERSION_MINOR") ++ if minorVersionStr != "" { ++ minorVersion, err := strconv.Atoi(minorVersionStr) ++ if err != nil || minorVersion < 34 { ++ continue ++ } ++ } + if utilfeature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation) { + if handlerRegistration, err = informerFactory.Resource().V1().DeviceClasses().Informer().AddEventHandler( + buildEvtResHandler(at, fwk.DeviceClass), diff --git a/patches/scheduler-pdb-2.patch b/patches/scheduler-pdb-2.patch index 810bb0c6..12af371e 100644 --- a/patches/scheduler-pdb-2.patch +++ b/patches/scheduler-pdb-2.patch @@ -9,22 +9,14 @@ "sync" "sync/atomic" "time" -@@ -34,6 +36,7 @@ - "k8s.io/apimachinery/pkg/util/sets" - corelisters "k8s.io/client-go/listers/core/v1" - policylisters "k8s.io/client-go/listers/policy/v1" -+ policyv1 "k8s.io/client-go/listers/policy/v1" - corev1helpers "k8s.io/component-helpers/scheduling/corev1" - "k8s.io/klog/v2" - extenderv1 "k8s.io/kube-scheduler/extender/v1" -@@ -145,7 +148,16 @@ +@@ -148,8 +150,17 @@ func NewEvaluator(pluginName string, fh framework.Handle, i Interface, enableAsyncPreemption bool) *Evaluator { podLister := fh.SharedInformerFactory().Core().V1().Pods().Lister() - pdbLister := fh.SharedInformerFactory().Policy().V1().PodDisruptionBudgets().Lister() -+ + + // FIX kubernetes 1.21 and lower version API missing issue -+ var pdbLister policyv1.PodDisruptionBudgetLister ++ var pdbLister policylisters.PodDisruptionBudgetLister + minorVersionStr := os.Getenv("KUBE_API_VERSION_MINOR") + if minorVersionStr != "" { + minorVersion, err := strconv.Atoi(minorVersionStr) @@ -32,6 +24,7 @@ + pdbLister = fh.SharedInformerFactory().Policy().V1().PodDisruptionBudgets().Lister() + } + } - ++ ev := &Evaluator{ PluginName: pluginName, + Handler: fh, diff --git a/scripts/patch-scheduler.sh b/scripts/patch-scheduler.sh index 9afbc43b..23c2837d 100755 --- a/scripts/patch-scheduler.sh +++ b/scripts/patch-scheduler.sh @@ -1,6 +1,10 @@ #!/bin/bash git apply ./patches/scheduler-csi-capacity-1.patch git apply ./patches/scheduler-csi-capacity-2.patch + +# diff -u eventhandlers.go eventhandlers-new.go > changes.patch git apply ./patches/scheduler-csi-capacity-3.patch git apply 
./patches/scheduler-pdb-1.patch + +# diff -u original_file.go modified_file.go > changes.patch git apply ./patches/scheduler-pdb-2.patch \ No newline at end of file diff --git a/test/sched/gpufit_bench_test.go b/test/sched/gpufit_bench_test.go index 20be047e..3acb53d4 100644 --- a/test/sched/gpufit_bench_test.go +++ b/test/sched/gpufit_bench_test.go @@ -42,7 +42,7 @@ func BenchmarkGPUFitPlugin(b *testing.B) { break } testPod := fixture.pods[i] - fixture.plugin.PreFilter(fixture.ctx, state, testPod) + fixture.plugin.PreFilter(fixture.ctx, state, testPod, nil) filterResult, err := state.Read(gpuResourceFitPlugin.CycleStateGPUSchedulingResult) if err != nil { b.Fatal(err) @@ -82,7 +82,7 @@ func BenchmarkGPUFitPlugin(b *testing.B) { b.Run("Filter", func(b *testing.B) { state := framework.NewCycleState() - fixture.plugin.PreFilter(fixture.ctx, state, testPod) + fixture.plugin.PreFilter(fixture.ctx, state, testPod, nil) nodeInfo := &framework.NodeInfo{} b.ResetTimer() @@ -94,7 +94,7 @@ func BenchmarkGPUFitPlugin(b *testing.B) { b.Run("Score", func(b *testing.B) { state := framework.NewCycleState() - fixture.plugin.PreFilter(fixture.ctx, state, testPod) + fixture.plugin.PreFilter(fixture.ctx, state, testPod, nil) nodeInfo := &framework.NodeInfo{} b.ResetTimer() diff --git a/test/sched/scheduler_bench_test.go b/test/sched/scheduler_bench_test.go index 65f43a13..fde318bd 100644 --- a/test/sched/scheduler_bench_test.go +++ b/test/sched/scheduler_bench_test.go @@ -6,14 +6,18 @@ import ( "os" "path/filepath" "runtime" + "strings" "testing" "time" "github.com/NexusGPU/tensor-fusion/cmd/sched" + "github.com/NexusGPU/tensor-fusion/internal/constants" gpuResourceFitPlugin "github.com/NexusGPU/tensor-fusion/internal/scheduler/gpuresources" gpuTopoPlugin "github.com/NexusGPU/tensor-fusion/internal/scheduler/gputopo" "github.com/NexusGPU/tensor-fusion/internal/utils" "go.uber.org/zap/zapcore" + "k8s.io/apimachinery/pkg/util/version" + "k8s.io/apiserver/pkg/util/feature" "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/rest" "k8s.io/client-go/tools/clientcmd" @@ -41,7 +45,19 @@ func defaultBenchmarkConfig() BenchmarkConfig { var testEnv *envtest.Environment -func setupKubernetes() (*rest.Config, error) { +func setupKubernetes() (*version.Version, *rest.Config, error) { + // export ENVTEST_K8S_VERSION=1.34.0 + // Run `./bin/setup-envtest use ${ENVTEST_K8S_VERSION} --bin-dir ./bin` before running the test + k8sVersion := os.Getenv("ENVTEST_K8S_VERSION") + if k8sVersion == "" { + k8sVersion = "1.31.0" + } + majorVersion := k8sVersion[:strings.Index(k8sVersion, ".")] + minorVersion := k8sVersion[strings.Index(k8sVersion, ".")+1 : strings.LastIndex(k8sVersion, ".")] + _ = os.Setenv(constants.KubeApiVersionMajorEnv, majorVersion) + _ = os.Setenv(constants.KubeApiVersionMinorEnv, minorVersion) + ver := version.MustParse(k8sVersion) + _ = feature.DefaultMutableFeatureGate.SetEmulationVersion(ver) testEnv = &envtest.Environment{ CRDDirectoryPaths: []string{ filepath.Join("..", "..", "config", "crd", "bases"), @@ -49,15 +65,14 @@ func setupKubernetes() (*rest.Config, error) { }, ErrorIfCRDPathMissing: true, - // The BinaryAssetsDirectory is only required if you want to run the tests directly - // without call the makefile target test. If not informed it will look for the - // default path defined in controller-runtime which is /usr/local/kubebuilder/. - // Note that you must have the required binaries setup under the bin directory to perform - // the tests directly. 
When we run make test it will be setup and used automatically. BinaryAssetsDirectory: filepath.Join("..", "..", "bin", "k8s", - fmt.Sprintf("1.31.0-%s-%s", runtime.GOOS, runtime.GOARCH)), + fmt.Sprintf("%s-%s-%s", k8sVersion, runtime.GOOS, runtime.GOARCH)), + } + cfg, err := testEnv.Start() + if err != nil { + return nil, nil, err } - return testEnv.Start() + return ver, cfg, nil } // Estimated Performance: 400-500 pods/second for 1K nodes, 10K Pods cluster on Mac M4 Pro @@ -65,7 +80,7 @@ func setupKubernetes() (*rest.Config, error) { func BenchmarkScheduler(b *testing.B) { klog.SetLogger(zap.New(zap.WriteTo(os.Stderr), zap.UseDevMode(false), zap.Level(zapcore.ErrorLevel))) // Setup phase - runs once before all benchmark iterations - cfg, err := setupKubernetes() + ver, cfg, err := setupKubernetes() if err != nil { b.Fatal(err) } @@ -99,7 +114,7 @@ func BenchmarkScheduler(b *testing.B) { testCtx := ctx cc, scheduler, err := sched.SetupScheduler(testCtx, nil, - "../../config/samples/scheduler-config.yaml", true, gpuResourceFitOpt, gpuTopoOpt) + "../../config/samples/scheduler-config.yaml", true, ver, gpuResourceFitOpt, gpuTopoOpt) if err != nil { b.Fatal(err) } diff --git a/test/sched/setup.go b/test/sched/setup.go index 03e40bfa..6fa4167d 100644 --- a/test/sched/setup.go +++ b/test/sched/setup.go @@ -17,18 +17,22 @@ import ( "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + informers "k8s.io/client-go/informers" + clientsetfake "k8s.io/client-go/kubernetes/fake" "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/tools/events" "k8s.io/klog/v2" + internalcache "k8s.io/kubernetes/pkg/scheduler/backend/cache" + internalqueue "k8s.io/kubernetes/pkg/scheduler/backend/queue" "k8s.io/kubernetes/pkg/scheduler/framework" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/defaultbinder" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/queuesort" frameworkruntime "k8s.io/kubernetes/pkg/scheduler/framework/runtime" + "k8s.io/kubernetes/pkg/scheduler/metrics" st "k8s.io/kubernetes/pkg/scheduler/testing" tf "k8s.io/kubernetes/pkg/scheduler/testing/framework" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" - testutil "sigs.k8s.io/scheduler-plugins/test/util" ) // BenchmarkConfig holds benchmark configuration @@ -85,23 +89,35 @@ func NewBenchmarkFixture( b.Logf("%d Pods created, Needed TFLOPS: %f, Needed VRAM: %f", len(pods), neededTflops, neededVRAM) // Batch create resources for better performance - batchCreateResources(b, ctx, client, nodes, gpus, pods, realAPIServer) + k8sNativeObjects := batchCreateResources(b, ctx, client, nodes, gpus, pods, realAPIServer) // Setup allocator allocator := setupAllocator(b, ctx, client) // Setup framework and plugin - fwk, plugin := setupFrameworkAndPlugin(b, ctx, client, allocator, pods, nodes) - - return &BenchmarkFixture{ - ctx: ctx, - cancel: cancel, - plugin: plugin, - nodes: nodes, - pods: pods, - allocator: allocator, - client: client, - fwk: fwk, + if !realAPIServer { + fwk, plugin := setupFrameworkAndPlugin(b, ctx, client, allocator, k8sNativeObjects) + return &BenchmarkFixture{ + ctx: ctx, + cancel: cancel, + plugin: plugin, + nodes: nodes, + pods: pods, + allocator: allocator, + client: client, + fwk: fwk, + } + } else { + return &BenchmarkFixture{ + ctx: ctx, + cancel: cancel, + plugin: nil, + nodes: nodes, + pods: pods, + allocator: allocator, + client: client, + fwk: nil, + } } } @@ -273,7 +289,8 @@ func generatePods(count int, 
namespace, poolName string) ([]*v1.Pod, float64, fl func batchCreateResources( b *testing.B, ctx context.Context, client client.Client, nodes []*v1.Node, gpus []*tfv1.GPU, pods []*v1.Pod, realAPIServer bool, -) { +) []runtime.Object { + k8sObjs := []runtime.Object{} require.NoError(b, client.Create(ctx, &v1.Namespace{ ObjectMeta: metav1.ObjectMeta{Name: "benchmark-ns"}, })) @@ -283,6 +300,7 @@ func batchCreateResources( for _, node := range nodes { nodeCopy := node.DeepCopy() require.NoError(b, client.Create(ctx, nodeCopy)) + k8sObjs = append(k8sObjs, nodeCopy) if realAPIServer { node.ResourceVersion = nodeCopy.ResourceVersion @@ -310,13 +328,15 @@ func batchCreateResources( b.Logf("Creating %d pods", len(pods)) for _, pod := range pods { require.NoError(b, client.Create(ctx, pod)) + k8sObjs = append(k8sObjs, pod) } b.Logf("%d pods created, duration: %v", len(pods), time.Since(timer)) + return k8sObjs } func setupFrameworkAndPlugin( b *testing.B, ctx context.Context, client client.Client, - allocator *gpuallocator.GpuAllocator, pods []*v1.Pod, nodes []*v1.Node, + allocator *gpuallocator.GpuAllocator, k8sObjs []runtime.Object, ) (framework.Framework, *gpuResourceFitPlugin.GPUFit) { // Register plugins including our GPU plugin registeredPlugins := []tf.RegisterPluginFunc{ @@ -324,11 +344,16 @@ func setupFrameworkAndPlugin( tf.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), } - // Create framework - fwk, err := tf.NewFramework(ctx, registeredPlugins, "", - frameworkruntime.WithPodNominator(testutil.NewPodNominator(nil)), - frameworkruntime.WithSnapshotSharedLister(testutil.NewFakeSharedLister(pods, nodes)), + fakeClientSet := clientsetfake.NewSimpleClientset(k8sObjs...) + informerFactory := informers.NewSharedInformerFactory(fakeClientSet, 0) + metrics.Register() + metricsRecorder := metrics.NewMetricsAsyncRecorder(1000, time.Second, ctx.Done()) + fwk, err := tf.NewFramework( + ctx, registeredPlugins, "", + frameworkruntime.WithPodNominator(internalqueue.NewSchedulingQueue(nil, informerFactory)), + frameworkruntime.WithSnapshotSharedLister(internalcache.NewEmptySnapshot()), frameworkruntime.WithEventRecorder(&events.FakeRecorder{}), + frameworkruntime.WithMetricsRecorder(metricsRecorder), ) require.NoError(b, err) From 52d4fd24a3d2d53a7fa0e659fb468636b1623a27 Mon Sep 17 00:00:00 2001 From: dylan Date: Tue, 9 Sep 2025 07:57:51 -0700 Subject: [PATCH 16/34] cel fliter enhancement --- .../filter/cel_filter/cel_filter.go | 1073 +++++++++++++++-- .../cel_filter/cel_filter_benchmark_test.go | 9 +- .../filter/cel_filter/expression_cache.go | 6 +- 3 files changed, 957 insertions(+), 131 deletions(-) diff --git a/internal/gpuallocator/filter/cel_filter/cel_filter.go b/internal/gpuallocator/filter/cel_filter/cel_filter.go index a9369535..ea463b0f 100644 --- a/internal/gpuallocator/filter/cel_filter/cel_filter.go +++ b/internal/gpuallocator/filter/cel_filter/cel_filter.go @@ -3,29 +3,431 @@ package cel_filter import ( "context" "fmt" - "time" + "reflect" + "regexp" + "runtime" + "strconv" + "strings" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" "github.com/google/cel-go/cel" "github.com/google/cel-go/common/types" + "github.com/google/cel-go/common/types/ref" + "github.com/google/cel-go/interpreter" "sigs.k8s.io/controller-runtime/pkg/log" ) +// Parallel processing constants +const ( + // Threshold for enabling parallel processing + ParallelThreshold = 2000 + // Default number of worker goroutines + DefaultWorkerCount = 4 +) + +// fieldUsage tracks which GPU fields are used in the 
expression +type fieldUsage struct { + labels bool + annotations bool + available bool + nodeSelector bool + runningApps bool +} + +// FastPathPredicate represents a compiled fast-path predicate function +type FastPathPredicate func(gpu *tfv1.GPU) bool + +// ExpressionPattern represents a recognized expression pattern for fast path +type ExpressionPattern struct { + Pattern *regexp.Regexp + Generator func(matches []string) FastPathPredicate +} + +// Common fast path patterns - order matters (most specific first) +var fastPathPatterns = []ExpressionPattern{ + // Complex AND pattern: gpu.available.tflops >= NUMBER && gpu.labels['KEY'] == 'VALUE' + { + Pattern: regexp.MustCompile(`^gpu\.available\.tflops\s*>=\s*([0-9]+(?:\.[0-9]+)?)\s*&&\s*gpu\.labels\['([^']+)'\]\s*==\s*'([^']+)'$`), + Generator: func(matches []string) FastPathPredicate { + threshold, _ := strconv.ParseFloat(matches[1], 64) + labelKey, labelValue := matches[2], matches[3] + return func(gpu *tfv1.GPU) bool { + return gpu.Status.Available != nil && + gpu.Status.Available.Tflops.AsApproximateFloat64() >= threshold && + gpu.Labels != nil && gpu.Labels[labelKey] == labelValue + } + }, + }, + // gpu.available.tflops >= NUMBER + { + Pattern: regexp.MustCompile(`^gpu\.available\.tflops\s*>=\s*([0-9]+(?:\.[0-9]+)?)$`), + Generator: func(matches []string) FastPathPredicate { + threshold, _ := strconv.ParseFloat(matches[1], 64) + return func(gpu *tfv1.GPU) bool { + return gpu.Status.Available != nil && gpu.Status.Available.Tflops.AsApproximateFloat64() >= threshold + } + }, + }, + // gpu.available.tflops > NUMBER + { + Pattern: regexp.MustCompile(`^gpu\.available\.tflops\s*>\s*([0-9]+(?:\.[0-9]+)?)$`), + Generator: func(matches []string) FastPathPredicate { + threshold, _ := strconv.ParseFloat(matches[1], 64) + return func(gpu *tfv1.GPU) bool { + return gpu.Status.Available != nil && gpu.Status.Available.Tflops.AsApproximateFloat64() > threshold + } + }, + }, + // gpu.available.vram >= NUMBER + { + Pattern: regexp.MustCompile(`^gpu\.available\.vram\s*>=\s*([0-9]+(?:\.[0-9]+)?)$`), + Generator: func(matches []string) FastPathPredicate { + threshold, _ := strconv.ParseFloat(matches[1], 64) + return func(gpu *tfv1.GPU) bool { + return gpu.Status.Available != nil && gpu.Status.Available.Vram.AsApproximateFloat64() >= threshold + } + }, + }, + // gpu.available.vram > NUMBER + { + Pattern: regexp.MustCompile(`^gpu\.available\.vram\s*>\s*([0-9]+(?:\.[0-9]+)?)$`), + Generator: func(matches []string) FastPathPredicate { + threshold, _ := strconv.ParseFloat(matches[1], 64) + return func(gpu *tfv1.GPU) bool { + return gpu.Status.Available != nil && gpu.Status.Available.Vram.AsApproximateFloat64() > threshold + } + }, + }, + // gpu.labels['KEY'] == 'VALUE' + { + Pattern: regexp.MustCompile(`^gpu\.labels\['([^']+)'\]\s*==\s*'([^']+)'$`), + Generator: func(matches []string) FastPathPredicate { + key, value := matches[1], matches[2] + return func(gpu *tfv1.GPU) bool { + return gpu.Labels != nil && gpu.Labels[key] == value + } + }, + }, + // gpu.annotations['KEY'] == 'VALUE' + { + Pattern: regexp.MustCompile(`^gpu\.annotations\['([^']+)'\]\s*==\s*'([^']+)'$`), + Generator: func(matches []string) FastPathPredicate { + key, value := matches[1], matches[2] + return func(gpu *tfv1.GPU) bool { + return gpu.Annotations != nil && gpu.Annotations[key] == value + } + }, + }, +} + + +// ZeroAllocActivation provides zero-allocation variable resolution for CEL +// This eliminates the need to create map[string]interface{} for each GPU +type ZeroAllocActivation 
struct { + gpu *tfv1.GPU + workerPodKey tfv1.NameNamespace + usage fieldUsage +} + +// ResolveName implements interpreter.Activation interface +func (a *ZeroAllocActivation) ResolveName(name string) (interface{}, bool) { + switch name { + case CELVarGPU: + return a.createGPUObject(), true + case CELVarWorkerPodKey: + return a.createWorkerPodKeyObject(), true + default: + return nil, false + } +} + +// Parent implements interpreter.Activation interface +func (a *ZeroAllocActivation) Parent() interpreter.Activation { + return nil +} + +// createGPUObject creates GPU object on-demand without maps +func (a *ZeroAllocActivation) createGPUObject() interface{} { + // Return GPU value with lazy caching + return &gpuVal{GPU: a.gpu} +} + + +// createWorkerPodKeyObject creates worker pod key object +func (a *ZeroAllocActivation) createWorkerPodKeyObject() interface{} { + return map[string]interface{}{ + "name": a.workerPodKey.Name, + "namespace": a.workerPodKey.Namespace, + } +} + +// gpuVal implements CEL value interface for GPU objects to eliminate map allocations +type gpuVal struct { + *tfv1.GPU + // Cached sub-values to avoid repeated allocations + labels ref.Val + annotations ref.Val + nodeSelector ref.Val + available ref.Val + runningApps ref.Val +} + +// Type implements ref.Val interface +func (v *gpuVal) Type() ref.Type { + return types.MapType +} + +// Value implements ref.Val interface +func (v *gpuVal) Value() interface{} { + return v.GPU +} + +// Equal implements ref.Val interface +func (v *gpuVal) Equal(other ref.Val) ref.Val { + if otherGPU, ok := other.(*gpuVal); ok { + return types.Bool(v.GPU.UID == otherGPU.GPU.UID) + } + return types.False +} + +// ConvertToNative implements ref.Val interface +func (v *gpuVal) ConvertToNative(typeDesc reflect.Type) (interface{}, error) { + return v.GPU, nil +} + +// ConvertToType implements ref.Val interface +func (v *gpuVal) ConvertToType(typeValue ref.Type) ref.Val { + switch typeValue { + case types.TypeType: + return types.MapType + default: + return types.NewErr("type conversion error") + } +} + +// HasField implements traits.FieldTester interface +func (v *gpuVal) HasField(field string) bool { + switch field { + case GPUFieldName, GPUFieldNamespace, GPUFieldGPUModel, GPUFieldUUID, + GPUFieldPhase, GPUFieldUsedBy, GPUFieldMessage, GPUFieldLabels, + GPUFieldAnnotations, GPUFieldAvailable, GPUFieldNodeSelector, GPUFieldRunningApps: + return true + default: + return false + } +} + +// Get implements traits.Indexer interface for field access with lazy caching +func (v *gpuVal) Get(index ref.Val) ref.Val { + field, ok := index.Value().(string) + if !ok { + return types.NewErr("index must be string") + } + + switch field { + case GPUFieldName: + return types.String(v.GPU.Name) + case GPUFieldNamespace: + return types.String(v.GPU.Namespace) + case GPUFieldGPUModel: + return types.String(v.GPU.Status.GPUModel) + case GPUFieldUUID: + return types.String(v.GPU.Status.UUID) + case GPUFieldPhase: + return types.String(string(v.GPU.Status.Phase)) + case GPUFieldUsedBy: + return types.String(string(v.GPU.Status.UsedBy)) + case GPUFieldMessage: + return types.String(v.GPU.Status.Message) + case GPUFieldLabels: + // Lazy initialization with caching + if v.labels == nil { + v.labels = &labelsVal{labels: v.GPU.Labels} + } + return v.labels + case GPUFieldAnnotations: + // Lazy initialization with caching + if v.annotations == nil { + v.annotations = &labelsVal{labels: v.GPU.Annotations} + } + return v.annotations + case GPUFieldAvailable: + // Lazy 
initialization with caching + if v.available == nil { + v.available = &availableVal{available: v.GPU.Status.Available} + } + return v.available + case GPUFieldNodeSelector: + // Lazy initialization with caching + if v.nodeSelector == nil { + v.nodeSelector = &labelsVal{labels: v.GPU.Status.NodeSelector} + } + return v.nodeSelector + case GPUFieldRunningApps: + // For now, keep simple implementation - can optimize later if needed + if v.runningApps == nil { + apps := make([]interface{}, len(v.GPU.Status.RunningApps)) + for i, app := range v.GPU.Status.RunningApps { + apps[i] = map[string]interface{}{ + "name": app.Name, + "namespace": app.Namespace, + } + } + v.runningApps = types.NewDynamicList(types.DefaultTypeAdapter, apps) + } + return v.runningApps + default: + return types.NewErr("no such field: %s", field) + } +} + +// availableVal provides direct access to GPU available resources without maps +type availableVal struct { + available *tfv1.Resource +} + +// Type implements ref.Val interface +func (v *availableVal) Type() ref.Type { + return types.MapType +} + +// Value implements ref.Val interface +func (v *availableVal) Value() interface{} { + return v.available +} + +// Equal implements ref.Val interface +func (v *availableVal) Equal(other ref.Val) ref.Val { + return types.False // Not used in comparisons +} + +// ConvertToNative implements ref.Val interface +func (v *availableVal) ConvertToNative(typeDesc reflect.Type) (interface{}, error) { + return v.available, nil +} + +// ConvertToType implements ref.Val interface +func (v *availableVal) ConvertToType(typeValue ref.Type) ref.Val { + return types.NewErr("type conversion not supported") +} + +// Get implements field access for available resources +func (v *availableVal) Get(index ref.Val) ref.Val { + field, ok := index.Value().(string) + if !ok { + return types.NewErr("index must be string") + } + + if v.available == nil { + switch field { + case "tflops": + return types.Double(0.0) + case "vram": + return types.Int(0) + default: + return types.NewErr("no such field: %s", field) + } + } + + switch field { + case "tflops": + return types.Double(v.available.Tflops.AsApproximateFloat64()) + case "vram": + return types.Int(v.available.Vram.Value()) + default: + return types.NewErr("no such field: %s", field) + } +} + +// HasField implements field testing +func (v *availableVal) HasField(field string) bool { + return field == "tflops" || field == "vram" +} + +// labelsVal provides direct access to GPU labels without copying +type labelsVal struct { + labels map[string]string +} + +// Type implements ref.Val interface +func (v *labelsVal) Type() ref.Type { + return types.MapType +} + +// Value implements ref.Val interface +func (v *labelsVal) Value() interface{} { + return v.labels +} + +// Equal implements ref.Val interface +func (v *labelsVal) Equal(other ref.Val) ref.Val { + return types.False // Not used in comparisons +} + +// ConvertToNative implements ref.Val interface +func (v *labelsVal) ConvertToNative(typeDesc reflect.Type) (interface{}, error) { + return v.labels, nil +} + +// ConvertToType implements ref.Val interface +func (v *labelsVal) ConvertToType(typeValue ref.Type) ref.Val { + return types.NewErr("type conversion not supported") +} + +// Get implements map access for labels +func (v *labelsVal) Get(index ref.Val) ref.Val { + key, ok := index.Value().(string) + if !ok { + return types.NewErr("index must be string") + } + + if v.labels == nil { + return types.String("") + } + + value, exists := v.labels[key] + if 
!exists { + return types.String("") + } + return types.String(value) +} + // AllocRequestCELFilter converts AllocRequest to CEL filter and executes it type CELFilter struct { - cache *ExpressionCache - expression string - name string + cache *ExpressionCache + name string + // Store early filtering criteria for optimization + requiredPhase string + requiredGPUModel string + userExpression string + // Track which fields are actually used + usage fieldUsage + // Display expression for logging (read-only) + displayExpression string + // Fast path predicate for common patterns + fastPathPredicate FastPathPredicate } // NewAllocRequestCELFilter creates a new CEL filter from allocation request func NewCELFilter(req *tfv1.AllocRequest, cache *ExpressionCache) (*CELFilter, error) { - // Convert AllocRequest to CEL expression - expression, err := convertAllocRequestToCEL(req) - if err != nil { - return nil, fmt.Errorf("failed to convert AllocRequest to CEL: %w", err) + // Extract early filtering criteria + var requiredPhase, requiredGPUModel, userExpression, displayExpression string + + if req != nil { + requiredPhase = "Ready" // Keep as Ready for compatibility with tests + requiredGPUModel = req.GPUModel + userExpression = req.CELFilterExpression + + // Build display expression for logging (not used for execution) + displayExpression = buildDisplayExpression(req) } + // Analyze field usage in user expression only + usage := analyzeFieldUsage(userExpression) + + // Try to compile fast path predicate + fastPath := compileFastPath(userExpression) + // Handle nil request case name := "AllocRequest-unknown" if req != nil { @@ -33,9 +435,14 @@ func NewCELFilter(req *tfv1.AllocRequest, cache *ExpressionCache) (*CELFilter, e } return &CELFilter{ - cache: cache, - expression: expression, - name: name, + cache: cache, + name: name, + requiredPhase: requiredPhase, + requiredGPUModel: requiredGPUModel, + userExpression: userExpression, + usage: usage, + displayExpression: displayExpression, + fastPathPredicate: fastPath, }, nil } @@ -51,84 +458,93 @@ func (f *CELFilter) Filter(ctx context.Context, workerPodKey tfv1.NameNamespace, return gpus, nil } - if f.expression == "" { - // If no expression, return all GPUs (no filtering needed) - return gpus, nil + // Pre-allocate result slice with estimated capacity + filteredGPUs := make([]*tfv1.GPU, 0, len(gpus)) + + // Early filtering phase: apply basic filters first to reduce CEL evaluation overhead + earlyFilteredGPUs := make([]*tfv1.GPU, 0, len(gpus)) + for _, gpu := range gpus { + // Fast path: check phase first (most common filter) + if f.requiredPhase != "" && string(gpu.Status.Phase) != f.requiredPhase { + continue + } + + // Fast path: check GPU model (second most common filter) + if f.requiredGPUModel != "" && gpu.Status.GPUModel != f.requiredGPUModel { + continue + } + + earlyFilteredGPUs = append(earlyFilteredGPUs, gpu) } - // Get compiled program from cache - program, err := f.cache.GetOrCompileProgram(f.expression) + // If no user expression, return early filtered results + if f.userExpression == "" { + log.V(1).Info("CEL filter applied (early filtering only)", + "filter", f.name, + "inputGPUs", len(gpus), + "earlyFilteredGPUs", len(earlyFilteredGPUs), + "outputGPUs", len(earlyFilteredGPUs)) + return earlyFilteredGPUs, nil + } + + // If no GPUs passed early filtering, return empty result + if len(earlyFilteredGPUs) == 0 { + return earlyFilteredGPUs, nil + } + + // Get compiled program from cache for user expression + program, err := 
f.cache.GetOrCompileProgram(f.userExpression) if err != nil { - return nil, fmt.Errorf("failed to get CEL program for expression %q: %w", f.expression, err) + return nil, fmt.Errorf("failed to get CEL program for expression %q: %w", f.userExpression, err) } - var filteredGPUs []*tfv1.GPU - for _, gpu := range gpus { - // Create timeout context for CEL evaluation - evalCtx, cancel := context.WithTimeout(ctx, 100*time.Millisecond) - - // Create variables for CEL evaluation - vars := createCELVariables(*gpu, workerPodKey) - - // Evaluate with timeout - resultChan := make(chan evalResult, 1) - go func() { - result, _, evalErr := program.Eval(vars) - resultChan <- evalResult{result: result, err: evalErr} - }() - - select { - case evalRes := <-resultChan: - cancel() - if evalRes.err != nil { - log.Error(evalRes.err, "CEL expression evaluation failed", - "expression", f.expression, - "gpu", gpu.Name, - "workerPodKey", workerPodKey) - // On error, exclude the GPU (fail-safe) - continue - } - - // Convert result to boolean - if boolResult, ok := evalRes.result.(types.Bool); ok { - if bool(boolResult) { + // Use fast path if available, otherwise fall back to CEL + if f.fastPathPredicate != nil { + // Fast path: direct Go function evaluation with optional parallelization + if len(earlyFilteredGPUs) >= ParallelThreshold { + filteredGPUs = f.filterParallel(earlyFilteredGPUs) + } else { + for _, gpu := range earlyFilteredGPUs { + if f.fastPathPredicate(gpu) { filteredGPUs = append(filteredGPUs, gpu) } - } else { - log.Error(nil, "CEL expression did not return boolean", - "expression", f.expression, - "result", evalRes.result, - "gpu", gpu.Name) - // On non-boolean result, exclude the GPU (fail-safe) - continue } - case <-evalCtx.Done(): - cancel() - // Timeout - skip this GPU (fail-safe behavior) - log.V(1).Info("CEL evaluation timeout", "gpu", gpu.Name, "expression", f.expression) - continue } + + log.V(1).Info("CEL filter applied (fast path)", + "filter", f.name, + "displayExpression", f.displayExpression, + "userExpression", f.userExpression, + "inputGPUs", len(gpus), + "earlyFilteredGPUs", len(earlyFilteredGPUs), + "outputGPUs", len(filteredGPUs)) + } else { + // Fallback to CEL evaluation for complex expressions + if len(earlyFilteredGPUs) >= ParallelThreshold { + // Use parallel evaluation for large GPU sets + filteredGPUs = f.filterFallbackParallel(ctx, program, earlyFilteredGPUs, workerPodKey) + } else { + // Sequential evaluation for smaller sets + filteredGPUs = f.filterFallbackSequential(ctx, program, earlyFilteredGPUs, workerPodKey) + } + + log.V(1).Info("CEL filter applied (CEL evaluation)", + "filter", f.name, + "displayExpression", f.displayExpression, + "userExpression", f.userExpression, + "inputGPUs", len(gpus), + "earlyFilteredGPUs", len(earlyFilteredGPUs), + "outputGPUs", len(filteredGPUs)) } - log.V(1).Info("AllocRequest CEL filter applied", - "filter", f.name, - "expression", f.expression, - "inputGPUs", len(gpus), - "outputGPUs", len(filteredGPUs)) - return filteredGPUs, nil } -type evalResult struct { - result interface{} - err error -} - -// convertAllocRequestToCEL converts an allocation request to a CEL expression -func convertAllocRequestToCEL(req *tfv1.AllocRequest) (string, error) { +// buildDisplayExpression creates a readable expression string for logging purposes only +func buildDisplayExpression(req *tfv1.AllocRequest) string { if req == nil { - return "", nil + return "" } var conditions []string @@ -138,30 +554,24 @@ func convertAllocRequestToCEL(req 
*tfv1.AllocRequest) (string, error) { conditions = append(conditions, req.CELFilterExpression) } - // Add GPU phase condition (must be Ready) - conditions = append(conditions, "gpu.phase == 'Ready'") - - // Add GPU model filter if specified - if req.GPUModel != "" { - conditions = append(conditions, fmt.Sprintf("gpu.gpuModel == '%s'", req.GPUModel)) - } - - // If no conditions, return empty expression (no filtering) + // If no conditions, return empty expression if len(conditions) == 0 { - return "", nil + return "" } - // Combine all conditions with AND + // Combine all conditions with AND using strings.Builder for efficiency if len(conditions) == 1 { - return conditions[0], nil + return conditions[0] } - expression := conditions[0] + var builder strings.Builder + builder.WriteString(conditions[0]) for i := 1; i < len(conditions); i++ { - expression += " && " + conditions[i] + builder.WriteString(" && ") + builder.WriteString(conditions[i]) } - return expression, nil + return builder.String() } // createCELEnvironment creates a CEL environment with GPU-related variables and functions @@ -171,58 +581,469 @@ func createCELEnvironment() (*cel.Env, error) { cel.Variable(CELVarGPU, cel.MapType(cel.StringType, cel.DynType)), // Define worker pod key cel.Variable(CELVarWorkerPodKey, cel.MapType(cel.StringType, cel.StringType)), - // Define request object structure - cel.Variable(CELVarRequest, cel.MapType(cel.StringType, cel.DynType)), ) } -// createCELVariables creates variables for CEL evaluation from GPU and request information -func createCELVariables(gpu tfv1.GPU, workerPodKey tfv1.NameNamespace) map[string]interface{} { - // Convert GPU to a map for CEL evaluation - gpuMap := map[string]interface{}{ - GPUFieldName: gpu.Name, - GPUFieldNamespace: gpu.Namespace, - GPUFieldGPUModel: gpu.Status.GPUModel, - GPUFieldUUID: gpu.Status.UUID, - GPUFieldPhase: string(gpu.Status.Phase), - GPUFieldUsedBy: string(gpu.Status.UsedBy), - GPUFieldMessage: gpu.Status.Message, - GPUFieldLabels: gpu.Labels, - GPUFieldAnnotations: gpu.Annotations, + +// filterParallel processes GPUs in parallel for large datasets +func (f *CELFilter) filterParallel(gpus []*tfv1.GPU) []*tfv1.GPU { + numGPUs := len(gpus) + numWorkers := runtime.NumCPU() + if numWorkers > DefaultWorkerCount { + numWorkers = DefaultWorkerCount + } + + chunkSize := (numGPUs + numWorkers - 1) / numWorkers + resultChannels := make([]<-chan []*tfv1.GPU, numWorkers) + + // Create workers + for i := 0; i < numWorkers; i++ { + start := i * chunkSize + end := start + chunkSize + if end > numGPUs { + end = numGPUs + } + + if start >= end { + // No work for this worker + ch := make(chan []*tfv1.GPU, 1) + ch <- []*tfv1.GPU{} + close(ch) + resultChannels[i] = ch + continue + } + + chunk := gpus[start:end] + resultCh := make(chan []*tfv1.GPU, 1) + resultChannels[i] = resultCh + + // Start worker goroutine + go func(gpuChunk []*tfv1.GPU, resultCh chan<- []*tfv1.GPU) { + defer close(resultCh) + + filtered := make([]*tfv1.GPU, 0, len(gpuChunk)/2) // Estimate 50% pass rate + for _, gpu := range gpuChunk { + if f.fastPathPredicate(gpu) { + filtered = append(filtered, gpu) + } + } + resultCh <- filtered + }(chunk, resultCh) + } + + // Collect results + var totalFiltered []*tfv1.GPU + for _, ch := range resultChannels { + chunkResults := <-ch + totalFiltered = append(totalFiltered, chunkResults...) 
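+		// Note: the result channels are drained in worker index order, so the
+		// filtered slice preserves the relative ordering of the input GPUs; each
+		// channel is buffered with capacity 1, so workers never block on send.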
+ } + + return totalFiltered +} + +// filterFallbackSequential performs sequential CEL evaluation for smaller GPU sets +func (f *CELFilter) filterFallbackSequential(ctx context.Context, program cel.Program, gpus []*tfv1.GPU, workerPodKey tfv1.NameNamespace) []*tfv1.GPU { + filteredGPUs := make([]*tfv1.GPU, 0, len(gpus)/2) + log := log.FromContext(ctx) + + for i, gpu := range gpus { + // Periodic context check every 64 GPUs for very large sets + if i&63 == 0 { + select { + case <-ctx.Done(): + log.V(1).Info("CEL evaluation cancelled", "processedGPUs", len(filteredGPUs), "totalGPUs", len(gpus)) + return filteredGPUs + default: + } + } + + // Use zero-allocation activation instead of maps + activation := &ZeroAllocActivation{ + gpu: gpu, + workerPodKey: workerPodKey, + usage: f.usage, + } + + // Direct synchronous evaluation with custom activation + result, _, evalErr := program.Eval(activation) + + if evalErr != nil { + log.Error(evalErr, "CEL expression evaluation failed", + "expression", f.userExpression, + "gpu", gpu.Name, + "workerPodKey", workerPodKey) + // On error, exclude the GPU (fail-safe) + continue + } + + // Convert result to boolean + if boolResult, ok := result.(types.Bool); ok { + if bool(boolResult) { + filteredGPUs = append(filteredGPUs, gpu) + } + } else { + log.Error(nil, "CEL expression did not return boolean", + "expression", f.userExpression, + "result", result, + "gpu", gpu.Name) + // On non-boolean result, exclude the GPU (fail-safe) + continue + } + } + + return filteredGPUs +} + +// filterFallbackParallel performs parallel CEL evaluation for large GPU sets +func (f *CELFilter) filterFallbackParallel(ctx context.Context, program cel.Program, gpus []*tfv1.GPU, workerPodKey tfv1.NameNamespace) []*tfv1.GPU { + numGPUs := len(gpus) + numWorkers := runtime.NumCPU() + if numWorkers > DefaultWorkerCount { + numWorkers = DefaultWorkerCount + } + + chunkSize := (numGPUs + numWorkers - 1) / numWorkers + resultChannels := make([]<-chan []*tfv1.GPU, numWorkers) + + // Create workers + for i := 0; i < numWorkers; i++ { + start := i * chunkSize + end := start + chunkSize + if end > numGPUs { + end = numGPUs + } + + if start >= end { + // No work for this worker + ch := make(chan []*tfv1.GPU, 1) + ch <- []*tfv1.GPU{} + close(ch) + resultChannels[i] = ch + continue + } + + chunk := gpus[start:end] + resultCh := make(chan []*tfv1.GPU, 1) + resultChannels[i] = resultCh + + // Start worker goroutine + go func(gpuChunk []*tfv1.GPU, resultCh chan<- []*tfv1.GPU) { + defer close(resultCh) + + filtered := make([]*tfv1.GPU, 0, len(gpuChunk)/2) // Estimate 50% pass rate + + for _, gpu := range gpuChunk { + // Use zero-allocation activation + activation := &ZeroAllocActivation{ + gpu: gpu, + workerPodKey: workerPodKey, + usage: f.usage, + } + + // Direct synchronous evaluation + result, _, evalErr := program.Eval(activation) + if evalErr != nil { + // On error, exclude the GPU (fail-safe) + continue + } + + // Convert result to boolean + if boolResult, ok := result.(types.Bool); ok { + if bool(boolResult) { + filtered = append(filtered, gpu) + } + } + // On non-boolean result, exclude the GPU (fail-safe) + } + resultCh <- filtered + }(chunk, resultCh) + } + + // Collect results + var totalFiltered []*tfv1.GPU + for _, ch := range resultChannels { + chunkResults := <-ch + totalFiltered = append(totalFiltered, chunkResults...) 
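+		// Note: in this parallel fallback, per-GPU evaluation errors are dropped
+		// silently inside the workers (fail-safe exclude) and ctx cancellation is
+		// not re-checked, unlike filterFallbackSequential above.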
+ } + + return totalFiltered +} + + +// compileFastPath tries to compile expression into a fast path predicate +// Uses AST analysis for better pattern matching than regex +func compileFastPath(expression string) FastPathPredicate { + if expression == "" { + return nil + } + + // Try AST-based compilation first (more flexible) + if pred := compileASTFastPath(expression); pred != nil { + return pred + } + + // Fall back to regex patterns for backward compatibility + for _, pattern := range fastPathPatterns { + matches := pattern.Pattern.FindStringSubmatch(expression) + if matches != nil { + return pattern.Generator(matches) + } + } + + return nil +} + +// compileASTFastPath analyzes AST to generate fast path predicates +func compileASTFastPath(expression string) FastPathPredicate { + // Parse expression to AST + env, err := createCELEnvironment() + if err != nil { + return nil + } + + _, issues := env.Parse(expression) + if issues != nil && issues.Err() != nil { + return nil + } + + // Extract conditions from expression string (simplified approach) + conditions := extractConditionsFromString(expression) + if len(conditions) == 0 { + return nil + } + + // Generate fast path predicate + return func(gpu *tfv1.GPU) bool { + for _, condition := range conditions { + if !evaluateCondition(gpu, condition) { + return false // Short-circuit on first failure (AND logic) + } + } + return true } +} + +// astCondition represents a simple condition extracted from AST +type astCondition struct { + field string // e.g., "gpu.available.tflops", "gpu.labels['env']" + operator string // "==", "!=", ">=", ">" + value interface{} // expected value +} - // Add available information if available - if gpu.Status.Available != nil { - gpuMap[GPUFieldAvailable] = map[string]interface{}{ - ResourceFieldTFlops: gpu.Status.Available.Tflops.AsApproximateFloat64(), - ResourceFieldVRAM: gpu.Status.Available.Vram.AsApproximateFloat64(), + +// extractConditionsFromString uses enhanced pattern matching to extract conditions +// This bridges the gap between regex and full AST until full AST implementation +func extractConditionsFromString(exprStr string) []astCondition { + var conditions []astCondition + + // Split by && to handle multiple conditions + parts := strings.Split(exprStr, " && ") + + for _, part := range parts { + part = strings.TrimSpace(part) + + // Handle gpu.available.tflops >= X + if strings.Contains(part, "gpu.available.tflops") && strings.Contains(part, ">=") { + if condition := parseNumericCondition(part, "gpu.available.tflops", ">="); condition != nil { + conditions = append(conditions, *condition) + } + } else if strings.Contains(part, "gpu.available.tflops") && strings.Contains(part, ">") { + if condition := parseNumericCondition(part, "gpu.available.tflops", ">"); condition != nil { + conditions = append(conditions, *condition) + } + } + + // Handle gpu.available.vram >= X + if strings.Contains(part, "gpu.available.vram") && strings.Contains(part, ">=") { + if condition := parseNumericCondition(part, "gpu.available.vram", ">="); condition != nil { + conditions = append(conditions, *condition) + } + } + + // Handle gpu.labels['key'] == 'value' + if strings.Contains(part, "gpu.labels[") && strings.Contains(part, "==") { + if condition := parseLabelCondition(part, "gpu.labels"); condition != nil { + conditions = append(conditions, *condition) + } } + + // Handle gpu.annotations['key'] == 'value' + if strings.Contains(part, "gpu.annotations[") && strings.Contains(part, "==") { + if condition := 
parseLabelCondition(part, "gpu.annotations"); condition != nil { + conditions = append(conditions, *condition) + } + } + + // Handle gpu.gpuModel == 'value' + if strings.Contains(part, "gpu.gpuModel") && strings.Contains(part, "==") { + if condition := parseStringCondition(part, "gpu.gpuModel", "=="); condition != nil { + conditions = append(conditions, *condition) + } + } + } + + return conditions +} + +// parseNumericCondition parses numeric comparison conditions +func parseNumericCondition(expr, field, operator string) *astCondition { + parts := strings.Split(expr, operator) + if len(parts) != 2 { + return nil } + + valueStr := strings.TrimSpace(parts[1]) + value, err := strconv.ParseFloat(valueStr, 64) + if err != nil { + return nil + } + + return &astCondition{ + field: field, + operator: operator, + value: value, + } +} + +// parseLabelCondition parses label/annotation map access conditions +func parseLabelCondition(expr, fieldPrefix string) *astCondition { + // Extract key from gpu.labels['key'] == 'value' format + keyStart := strings.Index(expr, "['") + 2 + keyEnd := strings.Index(expr[keyStart:], "']") + if keyEnd == -1 { + return nil + } + key := expr[keyStart : keyStart+keyEnd] + + // Extract value + valueStart := strings.LastIndex(expr, "'") + if valueStart == -1 { + return nil + } + // Find the quote before the last quote + prevQuotePos := strings.LastIndex(expr[:valueStart], "'") + if prevQuotePos == -1 { + return nil + } + value := expr[prevQuotePos+1 : valueStart] + + return &astCondition{ + field: fieldPrefix + "['" + key + "']", + operator: "==", + value: value, + } +} - // Add node selector information - if gpu.Status.NodeSelector != nil { - gpuMap[GPUFieldNodeSelector] = gpu.Status.NodeSelector +// parseStringCondition parses simple string equality conditions +func parseStringCondition(expr, field, operator string) *astCondition { + parts := strings.Split(expr, operator) + if len(parts) != 2 { + return nil + } + + valueStr := strings.TrimSpace(parts[1]) + // Remove quotes + if strings.HasPrefix(valueStr, "'") && strings.HasSuffix(valueStr, "'") { + valueStr = valueStr[1 : len(valueStr)-1] } + + return &astCondition{ + field: field, + operator: operator, + value: valueStr, + } +} - // Add running apps information (always set, even if empty) - runningApps := make([]map[string]interface{}, len(gpu.Status.RunningApps)) - for i, app := range gpu.Status.RunningApps { - runningApps[i] = map[string]interface{}{ - AppFieldName: app.Name, - AppFieldNamespace: app.Namespace, - AppFieldCount: app.Count, +// evaluateCondition evaluates a single condition against a GPU +func evaluateCondition(gpu *tfv1.GPU, condition astCondition) bool { + switch condition.field { + case "gpu.available.tflops": + if gpu.Status.Available == nil { + return false + } + actualValue := gpu.Status.Available.Tflops.AsApproximateFloat64() + expectedValue, ok := condition.value.(float64) + if !ok { + return false } + + switch condition.operator { + case ">=": + return actualValue >= expectedValue + case ">": + return actualValue > expectedValue + default: + return false + } + + case "gpu.available.vram": + if gpu.Status.Available == nil { + return false + } + actualValue := float64(gpu.Status.Available.Vram.Value()) + expectedValue, ok := condition.value.(float64) + if !ok { + return false + } + + switch condition.operator { + case ">=": + return actualValue >= expectedValue + case ">": + return actualValue > expectedValue + default: + return false + } + + case "gpu.gpuModel": + expectedValue, ok := 
condition.value.(string) + if !ok { + return false + } + return gpu.Status.GPUModel == expectedValue + + default: + // Handle label/annotation access + if strings.HasPrefix(condition.field, "gpu.labels['") { + key := strings.TrimSuffix(strings.TrimPrefix(condition.field, "gpu.labels['"), "']") + expectedValue, ok := condition.value.(string) + if !ok { + return false + } + if gpu.Labels == nil { + return expectedValue == "" + } + return gpu.Labels[key] == expectedValue + } + + if strings.HasPrefix(condition.field, "gpu.annotations['") { + key := strings.TrimSuffix(strings.TrimPrefix(condition.field, "gpu.annotations['"), "']") + expectedValue, ok := condition.value.(string) + if !ok { + return false + } + if gpu.Annotations == nil { + return expectedValue == "" + } + return gpu.Annotations[key] == expectedValue + } + + return false } - gpuMap[GPUFieldRunningApps] = runningApps +} - // Worker pod key information - workerPodKeyMap := map[string]string{ - PodKeyFieldName: workerPodKey.Name, - PodKeyFieldNamespace: workerPodKey.Namespace, +// analyzeFieldUsage performs simple heuristic analysis of which fields are used in the expression +func analyzeFieldUsage(expression string) fieldUsage { + if expression == "" { + return fieldUsage{} } - return map[string]interface{}{ - CELVarGPU: gpuMap, - CELVarWorkerPodKey: workerPodKeyMap, + return fieldUsage{ + labels: strings.Contains(expression, "labels"), + annotations: strings.Contains(expression, "annotations"), + available: strings.Contains(expression, "available") || strings.Contains(expression, "tflops") || strings.Contains(expression, "vram"), + nodeSelector: strings.Contains(expression, "nodeSelector"), + runningApps: strings.Contains(expression, "runningApps"), } } + diff --git a/internal/gpuallocator/filter/cel_filter/cel_filter_benchmark_test.go b/internal/gpuallocator/filter/cel_filter/cel_filter_benchmark_test.go index 8894db07..5020114e 100644 --- a/internal/gpuallocator/filter/cel_filter/cel_filter_benchmark_test.go +++ b/internal/gpuallocator/filter/cel_filter/cel_filter_benchmark_test.go @@ -13,13 +13,14 @@ import ( // Benchmark performance of the CEL filter compared to the original filter func BenchmarkFilterPerformance(b *testing.B) { // Create test data - const numGPUs = 1000 + const numGPUs = 10000 gpus := make([]*tfv1.GPU, numGPUs) for i := 0; i < numGPUs; i++ { gpuModel := "A100" - if i%3 == 0 { + switch i % 3 { + case 0: gpuModel = "V100" - } else if i%3 == 1 { + case 1: gpuModel = "H100" } @@ -254,7 +255,7 @@ func printPerformanceComparison(b *testing.B) { === GPU Filter Performance Comparison === Test Environment: -- Number of GPUs: 1000 +- Number of GPUs: 10000 - GPU Models: A100 (33%%), V100 (33%%), H100 (33%%) - GPU Phases: Ready (90%%), Pending (10%%) diff --git a/internal/gpuallocator/filter/cel_filter/expression_cache.go b/internal/gpuallocator/filter/cel_filter/expression_cache.go index 4065c3b9..f98fb1d1 100644 --- a/internal/gpuallocator/filter/cel_filter/expression_cache.go +++ b/internal/gpuallocator/filter/cel_filter/expression_cache.go @@ -88,6 +88,10 @@ func (c *ExpressionCache) GetOrCompileProgram(expression string) (cel.Program, e return nil, fmt.Errorf("failed to compile CEL expression %q: %w", expression, issues.Err()) } + // Validate result type - must return boolean + // Note: Skip type validation for now as CEL type system is complex + // Runtime validation in Filter method is sufficient + program, err := c.env.Program(ast) if err != nil { c.misses++ @@ -121,7 +125,7 @@ func (c *ExpressionCache) 
hashExpression(expression string) string { // evictLRU removes the least recently used entry from cache func (c *ExpressionCache) evictLRU() { var oldestKey string - var oldestTime time.Time = time.Now() + var oldestTime = time.Now() for key, cached := range c.cache { if cached.AccessedAt.Before(oldestTime) { From e55e53d957cc16f1bf037357dd89cedbc7854658 Mon Sep 17 00:00:00 2001 From: Joey Yang <14833440+Code2Life@users.noreply.github.com> Date: Wed, 10 Sep 2025 09:12:54 +0800 Subject: [PATCH 17/34] fix: dedicated gpu annotation causing webhook failure issue (#356) --- internal/gpuallocator/gpuallocator.go | 5 +++++ internal/webhook/v1/tf_parser.go | 1 + 2 files changed, 6 insertions(+) diff --git a/internal/gpuallocator/gpuallocator.go b/internal/gpuallocator/gpuallocator.go index c4a36980..c43cae70 100644 --- a/internal/gpuallocator/gpuallocator.go +++ b/internal/gpuallocator/gpuallocator.go @@ -850,6 +850,11 @@ func (s *GpuAllocator) handleGPUCreate(ctx context.Context, gpu *tfv1.GPU) { defer s.storeMutex.Unlock() if s.gpuStore[key] != nil { + if gpu.Status.GPUModel != "" { + if _, exists := GPUCapacityMap[gpu.Status.GPUModel]; !exists { + GPUCapacityMap[gpu.Status.GPUModel] = *gpu.Status.Capacity + } + } syncGPUMetadataAndStatusFromCluster(s.gpuStore[key], gpu) log.V(6).Info("GPU already exists in store", "name", key.Name) return diff --git a/internal/webhook/v1/tf_parser.go b/internal/webhook/v1/tf_parser.go index 2fa7b744..51da5358 100644 --- a/internal/webhook/v1/tf_parser.go +++ b/internal/webhook/v1/tf_parser.go @@ -258,6 +258,7 @@ func handleDedicatedGPU(pod *corev1.Pod, workloadProfile *tfv1.WorkloadProfile) workloadProfile.Spec.Resources.Requests.Vram = resource.Vram workloadProfile.Spec.Resources.Limits.Tflops = resource.Tflops workloadProfile.Spec.Resources.Limits.Vram = resource.Vram + workloadProfile.Spec.Qos = tfv1.QoSCritical return nil } From 0d7702431b3ddf8ca9f01605b8436f1c1000c040 Mon Sep 17 00:00:00 2001 From: Joey Yang <14833440+Code2Life@users.noreply.github.com> Date: Thu, 11 Sep 2025 17:27:34 +0800 Subject: [PATCH 18/34] fix: extract GPU map update logic into separate method and fix webhook domain name, virtual cap calculation (#357) * fix: virtual tflops/vram not calculated bug * fix: extract GPU map update logic into separate method and fix webhook domain name * fix: nvidia device plugin compatible mode state consistent issue * fix: nvidia device plugin compatible mode issue --- .vscode/launch.json | 3 +- charts/tensor-fusion/Chart.yaml | 2 +- .../admission-webhooks/mutating-webhook.yaml | 2 +- cmd/main.go | 7 +- internal/controller/gpunode_controller.go | 11 ++- internal/controller/suite_test.go | 7 +- internal/gpuallocator/gpuallocator.go | 77 ++++++++++++------- internal/gpuallocator/gpuallocator_test.go | 8 +- internal/gpuallocator/node_capacity.go | 17 +++- .../scheduler/gpuresources/gpuresources.go | 10 ++- 10 files changed, 98 insertions(+), 46 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index bce7b733..ef1ab245 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -61,7 +61,8 @@ "KUBECONFIG": "~/.kube/config-local-studio", "ENABLE_WEBHOOKS": "false", "ENABLE_SCHEDULER": "true", - "ENABLE_CR_CONTROLLER": "true" + "ENABLE_CR_CONTROLLER": "true", + "NVIDIA_OPERATOR_PROGRESSIVE_MIGRATION": "true" }, "args": [ "--metrics-path", "${workspaceFolder}/logs/metrics.log", diff --git a/charts/tensor-fusion/Chart.yaml b/charts/tensor-fusion/Chart.yaml index d18568b7..d2dc9f06 100644 --- a/charts/tensor-fusion/Chart.yaml +++ 
b/charts/tensor-fusion/Chart.yaml @@ -15,7 +15,7 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 1.5.7 +version: 1.5.8 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to diff --git a/charts/tensor-fusion/templates/admission-webhooks/mutating-webhook.yaml b/charts/tensor-fusion/templates/admission-webhooks/mutating-webhook.yaml index 7fcdda1a..242d17e0 100644 --- a/charts/tensor-fusion/templates/admission-webhooks/mutating-webhook.yaml +++ b/charts/tensor-fusion/templates/admission-webhooks/mutating-webhook.yaml @@ -11,7 +11,7 @@ webhooks: namespace: {{ include "tensor-fusion.namespace" . }} path: /mutate-v1-pod failurePolicy: {{ .Values.controller.admissionWebhooks.failurePolicy }} - name: mpod-v1.kb.io + name: mpod.tensor-fusion.ai rules: - apiGroups: - "" diff --git a/cmd/main.go b/cmd/main.go index f4f2f0ab..f00a6b2e 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -364,9 +364,10 @@ func startCustomResourceController( } if err = (&controller.GPUNodeReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - Recorder: mgr.GetEventRecorderFor("GPUNode"), + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("GPUNode"), + Allocator: allocator, }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "GPUNode") os.Exit(1) diff --git a/internal/controller/gpunode_controller.go b/internal/controller/gpunode_controller.go index 054d5922..ae503f28 100644 --- a/internal/controller/gpunode_controller.go +++ b/internal/controller/gpunode_controller.go @@ -47,8 +47,9 @@ import ( // GPUNodeReconciler reconciles a GPUNode object type GPUNodeReconciler struct { client.Client - Scheme *runtime.Scheme - Recorder record.EventRecorder + Scheme *runtime.Scheme + Recorder record.EventRecorder + Allocator *gpuallocator.GpuAllocator } // +kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpunodes,verbs=get;list;watch;create;update;patch;delete @@ -158,7 +159,9 @@ func (r *GPUNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct return ctrl.Result{}, err } -func (r *GPUNodeReconciler) checkStatusAndUpdateVirtualCapacity(ctx context.Context, hypervisorName string, node *tfv1.GPUNode, poolObj *tfv1.GPUPool) error { +func (r *GPUNodeReconciler) checkStatusAndUpdateVirtualCapacity( + ctx context.Context, hypervisorName string, node *tfv1.GPUNode, poolObj *tfv1.GPUPool, +) error { pod := &corev1.Pod{} fetchErr := r.Get(ctx, client.ObjectKey{Name: hypervisorName, Namespace: utils.CurrentNamespace()}, pod) if fetchErr != nil { @@ -183,7 +186,7 @@ func (r *GPUNodeReconciler) checkStatusAndUpdateVirtualCapacity(ctx context.Cont return nil } else { - gpuModels, err := gpuallocator.RefreshGPUNodeCapacity(ctx, r.Client, node, poolObj) + gpuModels, err := gpuallocator.RefreshGPUNodeCapacity(ctx, r.Client, node, poolObj, r.Allocator) if err != nil { return err } diff --git a/internal/controller/suite_test.go b/internal/controller/suite_test.go index 388b938f..0ba3228a 100644 --- a/internal/controller/suite_test.go +++ b/internal/controller/suite_test.go @@ -180,9 +180,10 @@ var _ = BeforeSuite(func() { Expect(err).ToNot(HaveOccurred()) err = (&GPUNodeReconciler{ - Client: 
mgr.GetClient(), - Scheme: mgr.GetScheme(), - Recorder: mgr.GetEventRecorderFor("GPUNode"), + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("GPUNode"), + Allocator: allocator, }).SetupWithManager(mgr) Expect(err).ToNot(HaveOccurred()) diff --git a/internal/gpuallocator/gpuallocator.go b/internal/gpuallocator/gpuallocator.go index c43cae70..d2259a34 100644 --- a/internal/gpuallocator/gpuallocator.go +++ b/internal/gpuallocator/gpuallocator.go @@ -545,12 +545,13 @@ func (s *GpuAllocator) AdjustAllocation(ctx context.Context, adjustRequest tfv1. } func (s *GpuAllocator) ListNonUsingNodes() sets.Set[string] { + <-s.initializedCh set := sets.New[string]() - for nodeName, gpuNames := range s.nodeWorkerStore { + for nodeName, podNames := range s.nodeWorkerStore { // If using by TF, the node can not be used by original scheduler // If using by other scheduler, won't record as TF worker, thus the map is empty // Return non using nodes can ensure original scheduler not conflict with TF - if len(gpuNames) == 0 { + if len(podNames) == 0 { set.Insert(nodeName) } } @@ -564,6 +565,20 @@ func (s *GpuAllocator) DeallocByPodIdentifier(ctx context.Context, podIdentifier } } +func (s *GpuAllocator) GetAllocationReqByNodeName(nodeName string) []*tfv1.AllocRequest { + allocRequests := make([]*tfv1.AllocRequest, 0, 8) + for workerName := range s.nodeWorkerStore[nodeName] { + podUID := s.podNamespaceNsToPodUID[workerName.String()] + if podUID == "" { + continue + } + if request, exists := s.uniqueAllocation[podUID]; exists { + allocRequests = append(allocRequests, request) + } + } + return allocRequests +} + func (s *GpuAllocator) checkGPUCapacityAndQuota(gpu *tfv1.GPU, oldRes, newRes tfv1.Resource) (tfv1.Resource, error) { if gpu.Status.Available == nil { return tfv1.Resource{}, fmt.Errorf("GPU available is nil, skip check") @@ -870,29 +885,7 @@ func (s *GpuAllocator) handleGPUCreate(ctx context.Context, gpu *tfv1.GPU) { } s.gpuStore[key] = gpuInMem - if gpuInMem.Status.NodeSelector != nil { - gpuNodeName := gpuInMem.Status.NodeSelector[constants.KubernetesHostNameLabel] - if gpuNodeName != "" { - if _, exists := s.nodeGpuStore[gpuNodeName]; !exists { - s.nodeGpuStore[gpuNodeName] = make(map[string]*tfv1.GPU, 4) - } - s.nodeGpuStore[gpuNodeName][gpuInMem.Name] = gpuInMem - } - } - - if gpuInMem.Labels != nil { - pool := gpuInMem.Labels[constants.GpuPoolKey] - if pool != "" { - if _, exists := s.poolGpuStore[pool]; !exists { - s.poolGpuStore[pool] = make(map[string]*tfv1.GPU, 128) - } - s.poolGpuStore[pool][gpuInMem.Name] = gpuInMem - } - } - - if gpu.Status.GPUModel != "" { - GPUCapacityMap[gpu.Status.GPUModel] = *gpu.Status.Capacity - } + s.addOrUpdateGPUMaps(gpuInMem) log.Info("Added GPU to store", "name", key.Name, "phase", gpu.Status.Phase) } @@ -942,10 +935,36 @@ func (s *GpuAllocator) handleGPUUpdate(ctx context.Context, gpu *tfv1.GPU) { log.V(6).Info("Updated GPU in store (new entry)", "name", key.Name, "phase", gpu.Status.Phase) } - if gpu.Status.GPUModel != "" { - if _, exists := GPUCapacityMap[gpu.Status.GPUModel]; !exists { - GPUCapacityMap[gpu.Status.GPUModel] = *gpu.Status.Capacity + s.addOrUpdateGPUMaps(gpu) +} + +func (s *GpuAllocator) addOrUpdateGPUMaps(gpuInMem *tfv1.GPU) { + if gpuInMem.Status.NodeSelector != nil { + gpuNodeName := gpuInMem.Status.NodeSelector[constants.KubernetesHostNameLabel] + if gpuNodeName != "" { + if _, exists := s.nodeGpuStore[gpuNodeName]; !exists { + s.nodeGpuStore[gpuNodeName] = make(map[string]*tfv1.GPU, 4) + } + 
s.nodeGpuStore[gpuNodeName][gpuInMem.Name] = gpuInMem + if _, exists := s.nodeWorkerStore[gpuNodeName]; !exists { + s.nodeWorkerStore[gpuNodeName] = make(map[types.NamespacedName]struct{}, 4) + } } + + } + + if gpuInMem.Labels != nil { + pool := gpuInMem.Labels[constants.GpuPoolKey] + if pool != "" { + if _, exists := s.poolGpuStore[pool]; !exists { + s.poolGpuStore[pool] = make(map[string]*tfv1.GPU, 128) + } + s.poolGpuStore[pool][gpuInMem.Name] = gpuInMem + } + } + + if gpuInMem.Status.GPUModel != "" { + GPUCapacityMap[gpuInMem.Status.GPUModel] = *gpuInMem.Status.Capacity } } @@ -1166,7 +1185,7 @@ func (s *GpuAllocator) reconcileAllocationState() { // No workers, but node contains GPU, need include into nodeWorkerStore with empty map gpuNodeName := gpu.Status.NodeSelector[constants.KubernetesHostNameLabel] if _, exists := s.nodeWorkerStore[gpuNodeName]; !exists { - s.nodeWorkerStore[gpuNodeName] = map[types.NamespacedName]struct{}{} + s.nodeWorkerStore[gpuNodeName] = make(map[types.NamespacedName]struct{}, 4) } } diff --git a/internal/gpuallocator/gpuallocator_test.go b/internal/gpuallocator/gpuallocator_test.go index 08d78130..bb3a494d 100644 --- a/internal/gpuallocator/gpuallocator_test.go +++ b/internal/gpuallocator/gpuallocator_test.go @@ -97,7 +97,7 @@ var _ = Describe("GPU Allocator", func() { if err := k8sClient.Get(ctx, types.NamespacedName{Name: "test-pool"}, pool); err != nil { Expect(err).NotTo(HaveOccurred()) } - _, _ = RefreshGPUNodeCapacity(ctx, k8sClient, gpuNode, pool) + _, _ = RefreshGPUNodeCapacity(ctx, k8sClient, gpuNode, pool, allocator) // Verify resources were reduced on the allocated GPU gpu := getGPU(gpus[0].Name) @@ -107,8 +107,14 @@ var _ = Describe("GPU Allocator", func() { node := getGPUNode(gpu) diffTflops := node.Status.TotalTFlops.Value() - node.Status.AvailableTFlops.Value() diffVRAM := node.Status.TotalVRAM.Value() - node.Status.AvailableVRAM.Value() + + diffVirtualTflops := node.Status.VirtualTFlops.Value() - node.Status.VirtualAvailableTFlops.Value() + diffVirtualVRAM := node.Status.VirtualVRAM.Value() - node.Status.VirtualAvailableVRAM.Value() Expect(diffTflops).To(BeEquivalentTo(50)) Expect(diffVRAM).To(BeEquivalentTo(8 * 1024 * 1024 * 1024)) + + Expect(diffVirtualTflops).To(BeEquivalentTo(50)) + Expect(diffVirtualVRAM).To(BeEquivalentTo(8 * 1024 * 1024 * 1024)) }) It("should allocate multiple GPUs from the same node", func() { diff --git a/internal/gpuallocator/node_capacity.go b/internal/gpuallocator/node_capacity.go index dc7488f6..43cce870 100644 --- a/internal/gpuallocator/node_capacity.go +++ b/internal/gpuallocator/node_capacity.go @@ -11,7 +11,11 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" ) -func RefreshGPUNodeCapacity(ctx context.Context, k8sClient client.Client, node *tfv1.GPUNode, pool *tfv1.GPUPool) ([]string, error) { +func RefreshGPUNodeCapacity( + ctx context.Context, k8sClient client.Client, + node *tfv1.GPUNode, pool *tfv1.GPUPool, + allocator *GpuAllocator, +) ([]string, error) { gpuList := &tfv1.GPUList{} if err := k8sClient.List(ctx, gpuList, client.MatchingLabels{constants.LabelKeyOwner: node.Name}); err != nil { return nil, fmt.Errorf("failed to list GPUs: %w", err) @@ -54,6 +58,17 @@ func RefreshGPUNodeCapacity(ctx context.Context, k8sClient client.Client, node * node.Status.VirtualTFlops = virtualTFlops node.Status.VirtualVRAM = virtualVRAM + vramAvailable := virtualVRAM.DeepCopy() + tflopsAvailable := virtualTFlops.DeepCopy() + + allocRequests := allocator.GetAllocationReqByNodeName(node.Name) + for _, 
allocRequest := range allocRequests { + vramAvailable.Sub(allocRequest.Limit.Vram) + tflopsAvailable.Sub(allocRequest.Limit.Tflops) + } + node.Status.VirtualAvailableVRAM = &vramAvailable + node.Status.VirtualAvailableTFlops = &tflopsAvailable + node.Status.Phase = tfv1.TensorFusionGPUNodePhaseRunning if !equality.Semantic.DeepEqual(node.Status, statusCopy) { diff --git a/internal/scheduler/gpuresources/gpuresources.go b/internal/scheduler/gpuresources/gpuresources.go index ee6b6e58..861b95eb 100644 --- a/internal/scheduler/gpuresources/gpuresources.go +++ b/internal/scheduler/gpuresources/gpuresources.go @@ -158,11 +158,17 @@ func (s *GPUFit) PreFilter(ctx context.Context, state fwk.CycleState, pod *v1.Po continue } + preAllocSize := total - matched + if preAllocSize <= 0 { + s.logger.Error(nil, "Filtering GPU error, unexpected less than 0", "pod", + pod.Name, "node", k, "totalGPU count", total, "matchedGPU count", matched) + preAllocSize = 2 + } // range if it's not in validNodesValidGPUs, add to validNodeNonMatchingGPUs - validNodeNonMatchingGPUs[k] = make([]*tfv1.GPU, 0, total-matched) + validNodeNonMatchingGPUs[k] = make([]*tfv1.GPU, 0, preAllocSize) for gpuName, gpu := range allGPUs { seen := false - // just loop because the number always <= 8 + // just loop because the number always <= 8/16 for _, matchedGPU := range matchedGPUs { if gpuName == matchedGPU.Name { seen = true From 52dc0a45d11af1c2c79d577ade95890a1ca55a2b Mon Sep 17 00:00:00 2001 From: dylan Date: Sun, 14 Sep 2025 04:18:28 -0700 Subject: [PATCH 19/34] cel fix phase filter --- .../filter/cel_filter/cel_filter.go | 181 +++++++++--------- .../cel_filter/cel_filter_benchmark_test.go | 25 +-- .../filter/cel_filter/cel_filter_test.go | 77 ++++---- 3 files changed, 144 insertions(+), 139 deletions(-) diff --git a/internal/gpuallocator/filter/cel_filter/cel_filter.go b/internal/gpuallocator/filter/cel_filter/cel_filter.go index ea463b0f..1c3e01c9 100644 --- a/internal/gpuallocator/filter/cel_filter/cel_filter.go +++ b/internal/gpuallocator/filter/cel_filter/cel_filter.go @@ -10,10 +10,12 @@ import ( "strings" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/utils" "github.com/google/cel-go/cel" "github.com/google/cel-go/common/types" "github.com/google/cel-go/common/types/ref" "github.com/google/cel-go/interpreter" + "github.com/samber/lo" "sigs.k8s.io/controller-runtime/pkg/log" ) @@ -39,8 +41,8 @@ type FastPathPredicate func(gpu *tfv1.GPU) bool // ExpressionPattern represents a recognized expression pattern for fast path type ExpressionPattern struct { - Pattern *regexp.Regexp - Generator func(matches []string) FastPathPredicate + Pattern *regexp.Regexp + Generator func(matches []string) FastPathPredicate } // Common fast path patterns - order matters (most specific first) @@ -52,7 +54,7 @@ var fastPathPatterns = []ExpressionPattern{ threshold, _ := strconv.ParseFloat(matches[1], 64) labelKey, labelValue := matches[2], matches[3] return func(gpu *tfv1.GPU) bool { - return gpu.Status.Available != nil && + return gpu.Status.Available != nil && gpu.Status.Available.Tflops.AsApproximateFloat64() >= threshold && gpu.Labels != nil && gpu.Labels[labelKey] == labelValue } @@ -88,7 +90,7 @@ var fastPathPatterns = []ExpressionPattern{ } }, }, - // gpu.available.vram > NUMBER + // gpu.available.vram > NUMBER { Pattern: regexp.MustCompile(`^gpu\.available\.vram\s*>\s*([0-9]+(?:\.[0-9]+)?)$`), Generator: func(matches []string) FastPathPredicate { @@ -120,7 +122,6 @@ var fastPathPatterns = 
[]ExpressionPattern{ }, } - // ZeroAllocActivation provides zero-allocation variable resolution for CEL // This eliminates the need to create map[string]interface{} for each GPU type ZeroAllocActivation struct { @@ -141,7 +142,7 @@ func (a *ZeroAllocActivation) ResolveName(name string) (interface{}, bool) { } } -// Parent implements interpreter.Activation interface +// Parent implements interpreter.Activation interface func (a *ZeroAllocActivation) Parent() interpreter.Activation { return nil } @@ -152,7 +153,6 @@ func (a *ZeroAllocActivation) createGPUObject() interface{} { return &gpuVal{GPU: a.gpu} } - // createWorkerPodKeyObject creates worker pod key object func (a *ZeroAllocActivation) createWorkerPodKeyObject() interface{} { return map[string]interface{}{ @@ -166,7 +166,7 @@ type gpuVal struct { *tfv1.GPU // Cached sub-values to avoid repeated allocations labels ref.Val - annotations ref.Val + annotations ref.Val nodeSelector ref.Val available ref.Val runningApps ref.Val @@ -195,7 +195,7 @@ func (v *gpuVal) ConvertToNative(typeDesc reflect.Type) (interface{}, error) { return v.GPU, nil } -// ConvertToType implements ref.Val interface +// ConvertToType implements ref.Val interface func (v *gpuVal) ConvertToType(typeValue ref.Type) ref.Val { switch typeValue { case types.TypeType: @@ -209,8 +209,8 @@ func (v *gpuVal) ConvertToType(typeValue ref.Type) ref.Val { func (v *gpuVal) HasField(field string) bool { switch field { case GPUFieldName, GPUFieldNamespace, GPUFieldGPUModel, GPUFieldUUID, - GPUFieldPhase, GPUFieldUsedBy, GPUFieldMessage, GPUFieldLabels, - GPUFieldAnnotations, GPUFieldAvailable, GPUFieldNodeSelector, GPUFieldRunningApps: + GPUFieldPhase, GPUFieldUsedBy, GPUFieldMessage, GPUFieldLabels, + GPUFieldAnnotations, GPUFieldAvailable, GPUFieldNodeSelector, GPUFieldRunningApps: return true default: return false @@ -223,7 +223,7 @@ func (v *gpuVal) Get(index ref.Val) ref.Val { if !ok { return types.NewErr("index must be string") } - + switch field { case GPUFieldName: return types.String(v.GPU.Name) @@ -246,7 +246,7 @@ func (v *gpuVal) Get(index ref.Val) ref.Val { } return v.labels case GPUFieldAnnotations: - // Lazy initialization with caching + // Lazy initialization with caching if v.annotations == nil { v.annotations = &labelsVal{labels: v.GPU.Annotations} } @@ -286,7 +286,7 @@ type availableVal struct { available *tfv1.Resource } -// Type implements ref.Val interface +// Type implements ref.Val interface func (v *availableVal) Type() ref.Type { return types.MapType } @@ -317,22 +317,22 @@ func (v *availableVal) Get(index ref.Val) ref.Val { if !ok { return types.NewErr("index must be string") } - + if v.available == nil { switch field { - case "tflops": + case ResourceFieldTFlops: return types.Double(0.0) - case "vram": + case ResourceFieldVRAM: return types.Int(0) default: return types.NewErr("no such field: %s", field) } } - + switch field { - case "tflops": + case ResourceFieldTFlops: return types.Double(v.available.Tflops.AsApproximateFloat64()) - case "vram": + case ResourceFieldVRAM: return types.Int(v.available.Vram.Value()) default: return types.NewErr("no such field: %s", field) @@ -341,7 +341,7 @@ func (v *availableVal) Get(index ref.Val) ref.Val { // HasField implements field testing func (v *availableVal) HasField(field string) bool { - return field == "tflops" || field == "vram" + return field == ResourceFieldTFlops || field == ResourceFieldVRAM } // labelsVal provides direct access to GPU labels without copying @@ -354,7 +354,7 @@ func (v *labelsVal) Type() 
ref.Type { return types.MapType } -// Value implements ref.Val interface +// Value implements ref.Val interface func (v *labelsVal) Value() interface{} { return v.labels } @@ -380,11 +380,11 @@ func (v *labelsVal) Get(index ref.Val) ref.Val { if !ok { return types.NewErr("index must be string") } - + if v.labels == nil { return types.String("") } - + value, exists := v.labels[key] if !exists { return types.String("") @@ -397,7 +397,7 @@ type CELFilter struct { cache *ExpressionCache name string // Store early filtering criteria for optimization - requiredPhase string + requiredPhases []tfv1.TensorFusionGPUPhase requiredGPUModel string userExpression string // Track which fields are actually used @@ -411,10 +411,14 @@ type CELFilter struct { // NewAllocRequestCELFilter creates a new CEL filter from allocation request func NewCELFilter(req *tfv1.AllocRequest, cache *ExpressionCache) (*CELFilter, error) { // Extract early filtering criteria - var requiredPhase, requiredGPUModel, userExpression, displayExpression string + var requiredPhases []tfv1.TensorFusionGPUPhase + var requiredGPUModel, userExpression, displayExpression string if req != nil { - requiredPhase = "Ready" // Keep as Ready for compatibility with tests + requiredPhases = []tfv1.TensorFusionGPUPhase{ + tfv1.TensorFusionGPUPhaseRunning, + tfv1.TensorFusionGPUPhasePending, + } requiredGPUModel = req.GPUModel userExpression = req.CELFilterExpression @@ -424,7 +428,7 @@ func NewCELFilter(req *tfv1.AllocRequest, cache *ExpressionCache) (*CELFilter, e // Analyze field usage in user expression only usage := analyzeFieldUsage(userExpression) - + // Try to compile fast path predicate fastPath := compileFastPath(userExpression) @@ -437,7 +441,7 @@ func NewCELFilter(req *tfv1.AllocRequest, cache *ExpressionCache) (*CELFilter, e return &CELFilter{ cache: cache, name: name, - requiredPhase: requiredPhase, + requiredPhases: requiredPhases, requiredGPUModel: requiredGPUModel, userExpression: userExpression, usage: usage, @@ -464,8 +468,12 @@ func (f *CELFilter) Filter(ctx context.Context, workerPodKey tfv1.NameNamespace, // Early filtering phase: apply basic filters first to reduce CEL evaluation overhead earlyFilteredGPUs := make([]*tfv1.GPU, 0, len(gpus)) for _, gpu := range gpus { + // when running progressive migration mode, only return GPUs used by tensor-fusion + if utils.IsProgressiveMigration() && gpu.Status.UsedBy != tfv1.UsedByTensorFusion { + continue + } // Fast path: check phase first (most common filter) - if f.requiredPhase != "" && string(gpu.Status.Phase) != f.requiredPhase { + if f.requiredPhases != nil && !lo.Contains(f.requiredPhases, gpu.Status.Phase) { continue } @@ -498,7 +506,6 @@ func (f *CELFilter) Filter(ctx context.Context, workerPodKey tfv1.NameNamespace, return nil, fmt.Errorf("failed to get CEL program for expression %q: %w", f.userExpression, err) } - // Use fast path if available, otherwise fall back to CEL if f.fastPathPredicate != nil { // Fast path: direct Go function evaluation with optional parallelization @@ -511,7 +518,7 @@ func (f *CELFilter) Filter(ctx context.Context, workerPodKey tfv1.NameNamespace, } } } - + log.V(1).Info("CEL filter applied (fast path)", "filter", f.name, "displayExpression", f.displayExpression, @@ -528,7 +535,7 @@ func (f *CELFilter) Filter(ctx context.Context, workerPodKey tfv1.NameNamespace, // Sequential evaluation for smaller sets filteredGPUs = f.filterFallbackSequential(ctx, program, earlyFilteredGPUs, workerPodKey) } - + log.V(1).Info("CEL filter applied (CEL evaluation)", 
"filter", f.name, "displayExpression", f.displayExpression, @@ -584,7 +591,6 @@ func createCELEnvironment() (*cel.Env, error) { ) } - // filterParallel processes GPUs in parallel for large datasets func (f *CELFilter) filterParallel(gpus []*tfv1.GPU) []*tfv1.GPU { numGPUs := len(gpus) @@ -592,10 +598,10 @@ func (f *CELFilter) filterParallel(gpus []*tfv1.GPU) []*tfv1.GPU { if numWorkers > DefaultWorkerCount { numWorkers = DefaultWorkerCount } - + chunkSize := (numGPUs + numWorkers - 1) / numWorkers resultChannels := make([]<-chan []*tfv1.GPU, numWorkers) - + // Create workers for i := 0; i < numWorkers; i++ { start := i * chunkSize @@ -603,7 +609,7 @@ func (f *CELFilter) filterParallel(gpus []*tfv1.GPU) []*tfv1.GPU { if end > numGPUs { end = numGPUs } - + if start >= end { // No work for this worker ch := make(chan []*tfv1.GPU, 1) @@ -612,15 +618,15 @@ func (f *CELFilter) filterParallel(gpus []*tfv1.GPU) []*tfv1.GPU { resultChannels[i] = ch continue } - + chunk := gpus[start:end] resultCh := make(chan []*tfv1.GPU, 1) resultChannels[i] = resultCh - + // Start worker goroutine go func(gpuChunk []*tfv1.GPU, resultCh chan<- []*tfv1.GPU) { defer close(resultCh) - + filtered := make([]*tfv1.GPU, 0, len(gpuChunk)/2) // Estimate 50% pass rate for _, gpu := range gpuChunk { if f.fastPathPredicate(gpu) { @@ -630,14 +636,14 @@ func (f *CELFilter) filterParallel(gpus []*tfv1.GPU) []*tfv1.GPU { resultCh <- filtered }(chunk, resultCh) } - + // Collect results var totalFiltered []*tfv1.GPU for _, ch := range resultChannels { chunkResults := <-ch totalFiltered = append(totalFiltered, chunkResults...) } - + return totalFiltered } @@ -645,7 +651,7 @@ func (f *CELFilter) filterParallel(gpus []*tfv1.GPU) []*tfv1.GPU { func (f *CELFilter) filterFallbackSequential(ctx context.Context, program cel.Program, gpus []*tfv1.GPU, workerPodKey tfv1.NameNamespace) []*tfv1.GPU { filteredGPUs := make([]*tfv1.GPU, 0, len(gpus)/2) log := log.FromContext(ctx) - + for i, gpu := range gpus { // Periodic context check every 64 GPUs for very large sets if i&63 == 0 { @@ -690,7 +696,7 @@ func (f *CELFilter) filterFallbackSequential(ctx context.Context, program cel.Pr continue } } - + return filteredGPUs } @@ -701,10 +707,10 @@ func (f *CELFilter) filterFallbackParallel(ctx context.Context, program cel.Prog if numWorkers > DefaultWorkerCount { numWorkers = DefaultWorkerCount } - + chunkSize := (numGPUs + numWorkers - 1) / numWorkers resultChannels := make([]<-chan []*tfv1.GPU, numWorkers) - + // Create workers for i := 0; i < numWorkers; i++ { start := i * chunkSize @@ -712,7 +718,7 @@ func (f *CELFilter) filterFallbackParallel(ctx context.Context, program cel.Prog if end > numGPUs { end = numGPUs } - + if start >= end { // No work for this worker ch := make(chan []*tfv1.GPU, 1) @@ -721,17 +727,17 @@ func (f *CELFilter) filterFallbackParallel(ctx context.Context, program cel.Prog resultChannels[i] = ch continue } - + chunk := gpus[start:end] resultCh := make(chan []*tfv1.GPU, 1) resultChannels[i] = resultCh - + // Start worker goroutine go func(gpuChunk []*tfv1.GPU, resultCh chan<- []*tfv1.GPU) { defer close(resultCh) - + filtered := make([]*tfv1.GPU, 0, len(gpuChunk)/2) // Estimate 50% pass rate - + for _, gpu := range gpuChunk { // Use zero-allocation activation activation := &ZeroAllocActivation{ @@ -758,30 +764,29 @@ func (f *CELFilter) filterFallbackParallel(ctx context.Context, program cel.Prog resultCh <- filtered }(chunk, resultCh) } - + // Collect results var totalFiltered []*tfv1.GPU for _, ch := range resultChannels { 
chunkResults := <-ch totalFiltered = append(totalFiltered, chunkResults...) } - + return totalFiltered } - // compileFastPath tries to compile expression into a fast path predicate // Uses AST analysis for better pattern matching than regex func compileFastPath(expression string) FastPathPredicate { if expression == "" { return nil } - - // Try AST-based compilation first (more flexible) + + // Try AST-based compilation first (more flexible) if pred := compileASTFastPath(expression); pred != nil { return pred } - + // Fall back to regex patterns for backward compatibility for _, pattern := range fastPathPatterns { matches := pattern.Pattern.FindStringSubmatch(expression) @@ -789,7 +794,7 @@ func compileFastPath(expression string) FastPathPredicate { return pattern.Generator(matches) } } - + return nil } @@ -800,18 +805,18 @@ func compileASTFastPath(expression string) FastPathPredicate { if err != nil { return nil } - + _, issues := env.Parse(expression) if issues != nil && issues.Err() != nil { return nil } - + // Extract conditions from expression string (simplified approach) conditions := extractConditionsFromString(expression) if len(conditions) == 0 { return nil } - + // Generate fast path predicate return func(gpu *tfv1.GPU) bool { for _, condition := range conditions { @@ -825,23 +830,22 @@ func compileASTFastPath(expression string) FastPathPredicate { // astCondition represents a simple condition extracted from AST type astCondition struct { - field string // e.g., "gpu.available.tflops", "gpu.labels['env']" - operator string // "==", "!=", ">=", ">" + field string // e.g., "gpu.available.tflops", "gpu.labels['env']" + operator string // "==", "!=", ">=", ">" value interface{} // expected value } - // extractConditionsFromString uses enhanced pattern matching to extract conditions // This bridges the gap between regex and full AST until full AST implementation func extractConditionsFromString(exprStr string) []astCondition { var conditions []astCondition - + // Split by && to handle multiple conditions parts := strings.Split(exprStr, " && ") - + for _, part := range parts { part = strings.TrimSpace(part) - + // Handle gpu.available.tflops >= X if strings.Contains(part, "gpu.available.tflops") && strings.Contains(part, ">=") { if condition := parseNumericCondition(part, "gpu.available.tflops", ">="); condition != nil { @@ -852,28 +856,28 @@ func extractConditionsFromString(exprStr string) []astCondition { conditions = append(conditions, *condition) } } - + // Handle gpu.available.vram >= X if strings.Contains(part, "gpu.available.vram") && strings.Contains(part, ">=") { if condition := parseNumericCondition(part, "gpu.available.vram", ">="); condition != nil { conditions = append(conditions, *condition) } } - + // Handle gpu.labels['key'] == 'value' if strings.Contains(part, "gpu.labels[") && strings.Contains(part, "==") { if condition := parseLabelCondition(part, "gpu.labels"); condition != nil { conditions = append(conditions, *condition) } } - - // Handle gpu.annotations['key'] == 'value' + + // Handle gpu.annotations['key'] == 'value' if strings.Contains(part, "gpu.annotations[") && strings.Contains(part, "==") { if condition := parseLabelCondition(part, "gpu.annotations"); condition != nil { conditions = append(conditions, *condition) } } - + // Handle gpu.gpuModel == 'value' if strings.Contains(part, "gpu.gpuModel") && strings.Contains(part, "==") { if condition := parseStringCondition(part, "gpu.gpuModel", "=="); condition != nil { @@ -881,7 +885,7 @@ func 
extractConditionsFromString(exprStr string) []astCondition { } } } - + return conditions } @@ -891,13 +895,13 @@ func parseNumericCondition(expr, field, operator string) *astCondition { if len(parts) != 2 { return nil } - + valueStr := strings.TrimSpace(parts[1]) value, err := strconv.ParseFloat(valueStr, 64) if err != nil { return nil } - + return &astCondition{ field: field, operator: operator, @@ -905,7 +909,7 @@ func parseNumericCondition(expr, field, operator string) *astCondition { } } -// parseLabelCondition parses label/annotation map access conditions +// parseLabelCondition parses label/annotation map access conditions func parseLabelCondition(expr, fieldPrefix string) *astCondition { // Extract key from gpu.labels['key'] == 'value' format keyStart := strings.Index(expr, "['") + 2 @@ -914,9 +918,9 @@ func parseLabelCondition(expr, fieldPrefix string) *astCondition { return nil } key := expr[keyStart : keyStart+keyEnd] - + // Extract value - valueStart := strings.LastIndex(expr, "'") + valueStart := strings.LastIndex(expr, "'") if valueStart == -1 { return nil } @@ -926,7 +930,7 @@ func parseLabelCondition(expr, fieldPrefix string) *astCondition { return nil } value := expr[prevQuotePos+1 : valueStart] - + return &astCondition{ field: fieldPrefix + "['" + key + "']", operator: "==", @@ -940,13 +944,13 @@ func parseStringCondition(expr, field, operator string) *astCondition { if len(parts) != 2 { return nil } - + valueStr := strings.TrimSpace(parts[1]) // Remove quotes if strings.HasPrefix(valueStr, "'") && strings.HasSuffix(valueStr, "'") { valueStr = valueStr[1 : len(valueStr)-1] } - + return &astCondition{ field: field, operator: operator, @@ -966,7 +970,7 @@ func evaluateCondition(gpu *tfv1.GPU, condition astCondition) bool { if !ok { return false } - + switch condition.operator { case ">=": return actualValue >= expectedValue @@ -975,7 +979,7 @@ func evaluateCondition(gpu *tfv1.GPU, condition astCondition) bool { default: return false } - + case "gpu.available.vram": if gpu.Status.Available == nil { return false @@ -985,7 +989,7 @@ func evaluateCondition(gpu *tfv1.GPU, condition astCondition) bool { if !ok { return false } - + switch condition.operator { case ">=": return actualValue >= expectedValue @@ -994,14 +998,14 @@ func evaluateCondition(gpu *tfv1.GPU, condition astCondition) bool { default: return false } - + case "gpu.gpuModel": expectedValue, ok := condition.value.(string) if !ok { return false } return gpu.Status.GPUModel == expectedValue - + default: // Handle label/annotation access if strings.HasPrefix(condition.field, "gpu.labels['") { @@ -1015,7 +1019,7 @@ func evaluateCondition(gpu *tfv1.GPU, condition astCondition) bool { } return gpu.Labels[key] == expectedValue } - + if strings.HasPrefix(condition.field, "gpu.annotations['") { key := strings.TrimSuffix(strings.TrimPrefix(condition.field, "gpu.annotations['"), "']") expectedValue, ok := condition.value.(string) @@ -1027,7 +1031,7 @@ func evaluateCondition(gpu *tfv1.GPU, condition astCondition) bool { } return gpu.Annotations[key] == expectedValue } - + return false } } @@ -1046,4 +1050,3 @@ func analyzeFieldUsage(expression string) fieldUsage { runningApps: strings.Contains(expression, "runningApps"), } } - diff --git a/internal/gpuallocator/filter/cel_filter/cel_filter_benchmark_test.go b/internal/gpuallocator/filter/cel_filter/cel_filter_benchmark_test.go index 5020114e..5d1e7091 100644 --- a/internal/gpuallocator/filter/cel_filter/cel_filter_benchmark_test.go +++ 
b/internal/gpuallocator/filter/cel_filter/cel_filter_benchmark_test.go @@ -7,13 +7,14 @@ import ( "time" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/constants" "github.com/NexusGPU/tensor-fusion/internal/gpuallocator/filter" ) // Benchmark performance of the CEL filter compared to the original filter func BenchmarkFilterPerformance(b *testing.B) { // Create test data - const numGPUs = 10000 + const numGPUs = 1000000 gpus := make([]*tfv1.GPU, numGPUs) for i := 0; i < numGPUs; i++ { gpuModel := "A100" @@ -24,9 +25,9 @@ func BenchmarkFilterPerformance(b *testing.B) { gpuModel = "H100" } - phase := "Ready" + phase := constants.PhaseRunning if i%10 == 0 { - phase = "Pending" + phase = constants.PhasePending } gpu := createTestGPU(fmt.Sprintf("gpu-%d", i), "default", gpuModel, phase, 150.0, 40.0) @@ -44,7 +45,7 @@ func BenchmarkFilterPerformance(b *testing.B) { b.Run("OriginalFilters", func(b *testing.B) { // Import the original filter package registry := filter.NewFilterRegistry().With( - filter.NewPhaseFilter("Ready"), + filter.NewPhaseFilter(constants.PhaseRunning), filter.NewGPUModelFilter("A100"), ) @@ -149,7 +150,7 @@ func BenchmarkCachePerformance(b *testing.B) { b.Fatal(err) } - expression := "gpu.phase == 'Ready' && gpu.gpuModel == 'A100' && gpu.available.tflops >= 150.0" + expression := "gpu.phase == 'Running' && gpu.gpuModel == 'A100' && gpu.available.tflops >= 150.0" b.Run("CacheHit", func(b *testing.B) { // Pre-warm cache @@ -170,7 +171,7 @@ func BenchmarkCachePerformance(b *testing.B) { b.Run("CacheMiss", func(b *testing.B) { expressions := make([]string, b.N) for i := 0; i < b.N; i++ { - expressions[i] = fmt.Sprintf("gpu.phase == 'Ready' && gpu.gpuModel == 'A100' && gpu.available.tflops >= %d.0", i%200+50) + expressions[i] = fmt.Sprintf("gpu.phase == 'Running' && gpu.gpuModel == 'A100' && gpu.available.tflops >= %d.0", i%200+50) } b.ResetTimer() @@ -188,7 +189,7 @@ func BenchmarkExpressionComplexity(b *testing.B) { const numGPUs = 100 gpus := make([]*tfv1.GPU, numGPUs) for i := 0; i < numGPUs; i++ { - gpu := createTestGPU(fmt.Sprintf("gpu-%d", i), "default", "A100", "Ready", 150.0, 40.0) + gpu := createTestGPU(fmt.Sprintf("gpu-%d", i), "default", "A100", constants.PhaseRunning, 150.0, 40.0) gpu.Labels["environment"] = "production" gpu.Labels["tier"] = "high-performance" gpu.Annotations["priority"] = "critical" @@ -204,23 +205,23 @@ func BenchmarkExpressionComplexity(b *testing.B) { }{ { name: "Simple", - expression: "gpu.phase == 'Ready'", + expression: "gpu.phase == 'Running'", }, { name: "Medium", - expression: "gpu.phase == 'Ready' && gpu.gpuModel == 'A100'", + expression: "gpu.phase == 'Running' && gpu.gpuModel == 'A100'", }, { name: "Complex", - expression: "gpu.phase == 'Ready' && gpu.gpuModel == 'A100' && gpu.available.tflops >= 150.0", + expression: "gpu.phase == 'Running' && gpu.gpuModel == 'A100' && gpu.available.tflops >= 150.0", }, { name: "VeryComplex", - expression: "gpu.phase == 'Ready' && gpu.gpuModel == 'A100' && gpu.available.tflops >= 150.0 && gpu.labels['environment'] == 'production'", + expression: "gpu.phase == 'Running' && gpu.gpuModel == 'A100' && gpu.available.tflops >= 150.0 && gpu.labels['environment'] == 'production'", }, { name: "UltraComplex", - expression: "gpu.phase == 'Ready' && gpu.gpuModel == 'A100' && gpu.available.tflops >= 150.0 && gpu.labels['environment'] == 'production' && gpu.labels['tier'] == 'high-performance' && gpu.annotations['priority'] == 'critical'", + expression: "gpu.phase == 
'Running' && gpu.gpuModel == 'A100' && gpu.available.tflops >= 150.0 && gpu.labels['environment'] == 'production' && gpu.labels['tier'] == 'high-performance' && gpu.annotations['priority'] == 'critical'", }, } diff --git a/internal/gpuallocator/filter/cel_filter/cel_filter_test.go b/internal/gpuallocator/filter/cel_filter/cel_filter_test.go index c21e4ee8..ffc903ea 100644 --- a/internal/gpuallocator/filter/cel_filter/cel_filter_test.go +++ b/internal/gpuallocator/filter/cel_filter/cel_filter_test.go @@ -6,6 +6,7 @@ import ( "time" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/constants" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "k8s.io/apimachinery/pkg/api/resource" @@ -67,9 +68,9 @@ func TestCELFilter_NormalCases(t *testing.T) { name: "filter by GPU model", request: createTestAllocRequest("default", "test-workload", "A100", ""), gpus: []*tfv1.GPU{ - createTestGPU("gpu-1", "default", "A100", "Ready", 150.0, 40.0), - createTestGPU("gpu-2", "default", "V100", "Ready", 100.0, 32.0), - createTestGPU("gpu-3", "default", "A100", "Ready", 150.0, 40.0), + createTestGPU("gpu-1", "default", "A100", constants.PhaseRunning, 150.0, 40.0), + createTestGPU("gpu-2", "default", "V100", constants.PhaseRunning, 100.0, 32.0), + createTestGPU("gpu-3", "default", "A100", constants.PhaseRunning, 150.0, 40.0), }, expectedCount: 2, description: "Should filter GPUs matching the specified model A100", @@ -78,57 +79,57 @@ func TestCELFilter_NormalCases(t *testing.T) { name: "filter by GPU phase only", request: createTestAllocRequest("default", "test-workload", "", ""), gpus: []*tfv1.GPU{ - createTestGPU("gpu-1", "default", "A100", "Ready", 150.0, 40.0), - createTestGPU("gpu-2", "default", "A100", "Pending", 150.0, 40.0), - createTestGPU("gpu-3", "default", "A100", "Ready", 150.0, 40.0), - createTestGPU("gpu-4", "default", "A100", "Failed", 150.0, 40.0), + createTestGPU("gpu-1", "default", "A100", constants.PhaseRunning, 150.0, 40.0), + createTestGPU("gpu-2", "default", "A100", constants.PhasePending, 150.0, 40.0), + createTestGPU("gpu-3", "default", "A100", constants.PhaseRunning, 150.0, 40.0), + createTestGPU("gpu-4", "default", "A100", constants.PhaseFailed, 150.0, 40.0), }, - expectedCount: 2, - description: "Should only return GPUs in Ready phase", + expectedCount: 3, + description: "Should return GPUs in Running and Pending phases", }, { name: "custom CEL expression - filter by available TFLOPS", request: createTestAllocRequest("default", "test-workload", "", "gpu.available.tflops > 120.0"), gpus: []*tfv1.GPU{ - createTestGPU("gpu-1", "default", "A100", "Ready", 150.0, 40.0), - createTestGPU("gpu-2", "default", "V100", "Ready", 100.0, 32.0), - createTestGPU("gpu-3", "default", "H100", "Ready", 200.0, 80.0), + createTestGPU("gpu-1", "default", "A100", constants.PhaseRunning, 150.0, 40.0), + createTestGPU("gpu-2", "default", "V100", constants.PhaseRunning, 100.0, 32.0), + createTestGPU("gpu-3", "default", "H100", constants.PhaseRunning, 200.0, 80.0), }, expectedCount: 2, - description: "Should filter GPUs with TFLOPS > 120 and Ready phase", + description: "Should filter GPUs with TFLOPS > 120 and Running/Pending phase", }, { name: "custom CEL expression - filter by available VRAM", request: createTestAllocRequest("default", "test-workload", "", "gpu.available.vram > 35000000000"), // > 35GB in bytes gpus: []*tfv1.GPU{ - createTestGPU("gpu-1", "default", "A100", "Ready", 150.0, 40.0), // 40GB - createTestGPU("gpu-2", "default", "V100", 
"Ready", 100.0, 32.0), // 32GB - createTestGPU("gpu-3", "default", "H100", "Ready", 200.0, 80.0), // 80GB + createTestGPU("gpu-1", "default", "A100", constants.PhaseRunning, 150.0, 40.0), // 40GB + createTestGPU("gpu-2", "default", "V100", constants.PhaseRunning, 100.0, 32.0), // 32GB + createTestGPU("gpu-3", "default", "H100", constants.PhaseRunning, 200.0, 80.0), // 80GB }, expectedCount: 2, - description: "Should filter GPUs with VRAM > 35GB and Ready phase", + description: "Should filter GPUs with VRAM > 35GB and Running/Pending phase", }, { name: "combined model and custom CEL expression", request: createTestAllocRequest("default", "test-workload", "A100", "gpu.available.tflops >= 150.0"), gpus: []*tfv1.GPU{ - createTestGPU("gpu-1", "default", "A100", "Ready", 150.0, 40.0), - createTestGPU("gpu-2", "default", "A100", "Ready", 120.0, 40.0), - createTestGPU("gpu-3", "default", "V100", "Ready", 160.0, 32.0), - createTestGPU("gpu-4", "default", "A100", "Ready", 180.0, 40.0), + createTestGPU("gpu-1", "default", "A100", constants.PhaseRunning, 150.0, 40.0), + createTestGPU("gpu-2", "default", "A100", constants.PhaseRunning, 120.0, 40.0), + createTestGPU("gpu-3", "default", "V100", constants.PhaseRunning, 160.0, 32.0), + createTestGPU("gpu-4", "default", "A100", constants.PhaseRunning, 180.0, 40.0), }, expectedCount: 2, - description: "Should filter A100 GPUs with TFLOPS >= 150 and Ready phase", + description: "Should filter A100 GPUs with TFLOPS >= 150 and Running/Pending phase", }, { name: "filter by labels", request: createTestAllocRequest("default", "test-workload", "", "gpu.labels['environment'] == 'production'"), gpus: func() []*tfv1.GPU { - gpu1 := createTestGPU("gpu-1", "default", "A100", "Ready", 150.0, 40.0) + gpu1 := createTestGPU("gpu-1", "default", "A100", constants.PhaseRunning, 150.0, 40.0) gpu1.Labels["environment"] = "production" - gpu2 := createTestGPU("gpu-2", "default", "A100", "Ready", 150.0, 40.0) + gpu2 := createTestGPU("gpu-2", "default", "A100", constants.PhaseRunning, 150.0, 40.0) gpu2.Labels["environment"] = "development" - gpu3 := createTestGPU("gpu-3", "default", "A100", "Ready", 150.0, 40.0) + gpu3 := createTestGPU("gpu-3", "default", "A100", constants.PhaseRunning, 150.0, 40.0) gpu3.Labels["environment"] = "production" return []*tfv1.GPU{gpu1, gpu2, gpu3} }(), @@ -139,11 +140,11 @@ func TestCELFilter_NormalCases(t *testing.T) { name: "filter by annotations", request: createTestAllocRequest("default", "test-workload", "", "gpu.annotations['priority'] == 'critical'"), gpus: func() []*tfv1.GPU { - gpu1 := createTestGPU("gpu-1", "default", "A100", "Ready", 150.0, 40.0) + gpu1 := createTestGPU("gpu-1", "default", "A100", constants.PhaseRunning, 150.0, 40.0) gpu1.Annotations["priority"] = "critical" - gpu2 := createTestGPU("gpu-2", "default", "A100", "Ready", 150.0, 40.0) + gpu2 := createTestGPU("gpu-2", "default", "A100", constants.PhaseRunning, 150.0, 40.0) gpu2.Annotations["priority"] = "low" - gpu3 := createTestGPU("gpu-3", "default", "A100", "Ready", 150.0, 40.0) + gpu3 := createTestGPU("gpu-3", "default", "A100", constants.PhaseRunning, 150.0, 40.0) gpu3.Annotations["priority"] = "critical" return []*tfv1.GPU{gpu1, gpu2, gpu3} }(), @@ -154,13 +155,13 @@ func TestCELFilter_NormalCases(t *testing.T) { name: "combined labels and annotations filter", request: createTestAllocRequest("default", "test-workload", "", "gpu.labels['tier'] == 'high-performance' && gpu.annotations['priority'] == 'critical'"), gpus: func() []*tfv1.GPU { - gpu1 := createTestGPU("gpu-1", 
"default", "A100", "Ready", 150.0, 40.0) + gpu1 := createTestGPU("gpu-1", "default", "A100", constants.PhaseRunning, 150.0, 40.0) gpu1.Labels["tier"] = "high-performance" gpu1.Annotations["priority"] = "critical" - gpu2 := createTestGPU("gpu-2", "default", "A100", "Ready", 150.0, 40.0) + gpu2 := createTestGPU("gpu-2", "default", "A100", constants.PhaseRunning, 150.0, 40.0) gpu2.Labels["tier"] = "standard" gpu2.Annotations["priority"] = "critical" - gpu3 := createTestGPU("gpu-3", "default", "A100", "Ready", 150.0, 40.0) + gpu3 := createTestGPU("gpu-3", "default", "A100", constants.PhaseRunning, 150.0, 40.0) gpu3.Labels["tier"] = "high-performance" gpu3.Annotations["priority"] = "low" return []*tfv1.GPU{gpu1, gpu2, gpu3} @@ -179,16 +180,16 @@ func TestCELFilter_NormalCases(t *testing.T) { name: "complex combined expression with model, resources, and metadata", request: createTestAllocRequest("default", "test-workload", "A100", "gpu.available.tflops >= 150.0 && gpu.labels['environment'] == 'production'"), gpus: func() []*tfv1.GPU { - gpu1 := createTestGPU("gpu-1", "default", "A100", "Ready", 180.0, 40.0) + gpu1 := createTestGPU("gpu-1", "default", "A100", constants.PhaseRunning, 180.0, 40.0) gpu1.Labels["environment"] = "production" - gpu2 := createTestGPU("gpu-2", "default", "A100", "Ready", 120.0, 40.0) + gpu2 := createTestGPU("gpu-2", "default", "A100", constants.PhaseRunning, 120.0, 40.0) gpu2.Labels["environment"] = "production" - gpu3 := createTestGPU("gpu-3", "default", "A100", "Ready", 200.0, 40.0) + gpu3 := createTestGPU("gpu-3", "default", "A100", constants.PhaseRunning, 200.0, 40.0) gpu3.Labels["environment"] = "development" return []*tfv1.GPU{gpu1, gpu2, gpu3} }(), expectedCount: 1, - description: "Should filter A100 GPUs with TFLOPS >= 150, production environment, and Ready phase", + description: "Should filter A100 GPUs with TFLOPS >= 150, production environment, and Running/Pending phase", }, } @@ -224,11 +225,11 @@ func TestCELFilter_EdgeAndExceptionCases(t *testing.T) { t.Run("CEL expressions edge cases", func(t *testing.T) { // Test GPUs for execution testGPUs := []*tfv1.GPU{ - createTestGPU("gpu-1", "default", "A100", "Ready", 150.0, 40.0), - createTestGPU("gpu-2", "default", "V100", "Ready", 100.0, 32.0), + createTestGPU("gpu-1", "default", "A100", constants.PhaseRunning, 150.0, 40.0), + createTestGPU("gpu-2", "default", "V100", constants.PhaseRunning, 100.0, 32.0), } // Add GPU with nil resources - gpuWithNilResources := createTestGPU("gpu-nil", "default", "A100", "Ready", 0, 0) + gpuWithNilResources := createTestGPU("gpu-nil", "default", "A100", constants.PhaseRunning, 0, 0) gpuWithNilResources.Status.Available = nil testGPUs = append(testGPUs, gpuWithNilResources) From cd1d7ddb8f5b81067526a89b95e569c99d1ab01b Mon Sep 17 00:00:00 2001 From: dylan Date: Sun, 14 Sep 2025 06:39:22 -0700 Subject: [PATCH 20/34] disable predicate fast path --- .../filter/cel_filter/cel_filter.go | 705 ++++++------------ .../cel_filter/cel_filter_benchmark_test.go | 2 +- .../filter/cel_filter/cel_filter_test.go | 116 +++ 3 files changed, 343 insertions(+), 480 deletions(-) diff --git a/internal/gpuallocator/filter/cel_filter/cel_filter.go b/internal/gpuallocator/filter/cel_filter/cel_filter.go index 1c3e01c9..80622f23 100644 --- a/internal/gpuallocator/filter/cel_filter/cel_filter.go +++ b/internal/gpuallocator/filter/cel_filter/cel_filter.go @@ -6,14 +6,16 @@ import ( "reflect" "regexp" "runtime" - "strconv" "strings" + "sync" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + 
"github.com/NexusGPU/tensor-fusion/internal/constants" "github.com/NexusGPU/tensor-fusion/internal/utils" "github.com/google/cel-go/cel" "github.com/google/cel-go/common/types" "github.com/google/cel-go/common/types/ref" + "github.com/google/cel-go/common/types/traits" "github.com/google/cel-go/interpreter" "github.com/samber/lo" "sigs.k8s.io/controller-runtime/pkg/log" @@ -27,6 +29,30 @@ const ( DefaultWorkerCount = 4 ) +// Global string pool for GPU Phase values to reduce allocations +var ( + gpuPhaseStringPool = sync.OnceValue(func() map[string]types.String { + return map[string]types.String{ + constants.PhaseUnknown: types.String(constants.PhaseUnknown), + constants.PhasePending: types.String(constants.PhasePending), + constants.PhaseUpdating: types.String(constants.PhaseUpdating), + constants.PhaseRunning: types.String(constants.PhaseRunning), + constants.PhaseMigrating: types.String(constants.PhaseMigrating), + constants.PhaseDestroying: types.String(constants.PhaseDestroying), + } + }) +) + +// getPooledPhaseString returns a pooled CEL String for the given phase +func getPooledPhaseString(phase string) ref.Val { + pool := gpuPhaseStringPool() + if pooled, exists := pool[phase]; exists { + return pooled + } + // Return error for unexpected phase values + return types.NewErr("unknown GPU phase: %s", phase) +} + // fieldUsage tracks which GPU fields are used in the expression type fieldUsage struct { labels bool @@ -45,98 +71,33 @@ type ExpressionPattern struct { Generator func(matches []string) FastPathPredicate } -// Common fast path patterns - order matters (most specific first) -var fastPathPatterns = []ExpressionPattern{ - // Complex AND pattern: gpu.available.tflops >= NUMBER && gpu.labels['KEY'] == 'VALUE' - { - Pattern: regexp.MustCompile(`^gpu\.available\.tflops\s*>=\s*([0-9]+(?:\.[0-9]+)?)\s*&&\s*gpu\.labels\['([^']+)'\]\s*==\s*'([^']+)'$`), - Generator: func(matches []string) FastPathPredicate { - threshold, _ := strconv.ParseFloat(matches[1], 64) - labelKey, labelValue := matches[2], matches[3] - return func(gpu *tfv1.GPU) bool { - return gpu.Status.Available != nil && - gpu.Status.Available.Tflops.AsApproximateFloat64() >= threshold && - gpu.Labels != nil && gpu.Labels[labelKey] == labelValue - } - }, - }, - // gpu.available.tflops >= NUMBER - { - Pattern: regexp.MustCompile(`^gpu\.available\.tflops\s*>=\s*([0-9]+(?:\.[0-9]+)?)$`), - Generator: func(matches []string) FastPathPredicate { - threshold, _ := strconv.ParseFloat(matches[1], 64) - return func(gpu *tfv1.GPU) bool { - return gpu.Status.Available != nil && gpu.Status.Available.Tflops.AsApproximateFloat64() >= threshold - } - }, - }, - // gpu.available.tflops > NUMBER - { - Pattern: regexp.MustCompile(`^gpu\.available\.tflops\s*>\s*([0-9]+(?:\.[0-9]+)?)$`), - Generator: func(matches []string) FastPathPredicate { - threshold, _ := strconv.ParseFloat(matches[1], 64) - return func(gpu *tfv1.GPU) bool { - return gpu.Status.Available != nil && gpu.Status.Available.Tflops.AsApproximateFloat64() > threshold - } - }, - }, - // gpu.available.vram >= NUMBER - { - Pattern: regexp.MustCompile(`^gpu\.available\.vram\s*>=\s*([0-9]+(?:\.[0-9]+)?)$`), - Generator: func(matches []string) FastPathPredicate { - threshold, _ := strconv.ParseFloat(matches[1], 64) - return func(gpu *tfv1.GPU) bool { - return gpu.Status.Available != nil && gpu.Status.Available.Vram.AsApproximateFloat64() >= threshold - } - }, - }, - // gpu.available.vram > NUMBER - { - Pattern: regexp.MustCompile(`^gpu\.available\.vram\s*>\s*([0-9]+(?:\.[0-9]+)?)$`), - 
Generator: func(matches []string) FastPathPredicate { - threshold, _ := strconv.ParseFloat(matches[1], 64) - return func(gpu *tfv1.GPU) bool { - return gpu.Status.Available != nil && gpu.Status.Available.Vram.AsApproximateFloat64() > threshold - } - }, - }, - // gpu.labels['KEY'] == 'VALUE' - { - Pattern: regexp.MustCompile(`^gpu\.labels\['([^']+)'\]\s*==\s*'([^']+)'$`), - Generator: func(matches []string) FastPathPredicate { - key, value := matches[1], matches[2] - return func(gpu *tfv1.GPU) bool { - return gpu.Labels != nil && gpu.Labels[key] == value - } - }, - }, - // gpu.annotations['KEY'] == 'VALUE' - { - Pattern: regexp.MustCompile(`^gpu\.annotations\['([^']+)'\]\s*==\s*'([^']+)'$`), - Generator: func(matches []string) FastPathPredicate { - key, value := matches[1], matches[2] - return func(gpu *tfv1.GPU) bool { - return gpu.Annotations != nil && gpu.Annotations[key] == value - } - }, - }, -} - // ZeroAllocActivation provides zero-allocation variable resolution for CEL // This eliminates the need to create map[string]interface{} for each GPU type ZeroAllocActivation struct { - gpu *tfv1.GPU - workerPodKey tfv1.NameNamespace + gpuVal gpuVal + workerPodKey workerPodKeyVal usage fieldUsage } +func (a *ZeroAllocActivation) init(g *tfv1.GPU, k tfv1.NameNamespace, usage fieldUsage) { + a.gpuVal.GPU = g + a.gpuVal.labels = nil + a.gpuVal.annotations = nil + a.gpuVal.nodeSelector = nil + a.gpuVal.available = nil + a.gpuVal.runningApps = nil + a.workerPodKey.name = k.Name + a.workerPodKey.namespace = k.Namespace + a.usage = usage +} + // ResolveName implements interpreter.Activation interface func (a *ZeroAllocActivation) ResolveName(name string) (interface{}, bool) { switch name { case CELVarGPU: - return a.createGPUObject(), true + return &a.gpuVal, true case CELVarWorkerPodKey: - return a.createWorkerPodKeyObject(), true + return &a.workerPodKey, true default: return nil, false } @@ -147,20 +108,165 @@ func (a *ZeroAllocActivation) Parent() interpreter.Activation { return nil } -// createGPUObject creates GPU object on-demand without maps -func (a *ZeroAllocActivation) createGPUObject() interface{} { - // Return GPU value with lazy caching - return &gpuVal{GPU: a.gpu} +type workerPodKeyVal struct { + name string + namespace string +} + +func (w *workerPodKeyVal) Type() ref.Type { return types.MapType } +func (w *workerPodKeyVal) Value() interface{} { + return map[string]string{"name": w.name, "namespace": w.namespace} +} +func (w *workerPodKeyVal) Equal(other ref.Val) ref.Val { return types.False } +func (w *workerPodKeyVal) ConvertToNative(t reflect.Type) (interface{}, error) { + return map[string]string{"name": w.name, "namespace": w.namespace}, nil +} +func (w *workerPodKeyVal) ConvertToType(typeValue ref.Type) ref.Val { + return types.NewErr("type conversion not supported") +} +func (w *workerPodKeyVal) Get(index ref.Val) ref.Val { + key, ok := index.Value().(string) + if !ok { + return types.NewErr("index must be string") + } + switch key { + case "name": + return types.String(w.name) + case "namespace": + return types.String(w.namespace) + default: + return types.String("") + } +} +func (w *workerPodKeyVal) HasField(field string) bool { + return field == "name" || field == "namespace" +} + +type appVal struct { + name string + namespace string + count int64 } -// createWorkerPodKeyObject creates worker pod key object -func (a *ZeroAllocActivation) createWorkerPodKeyObject() interface{} { +func (a *appVal) Type() ref.Type { return types.MapType } +func (a *appVal) Value() interface{} 
{ return nil } +func (a *appVal) Equal(other ref.Val) ref.Val { return types.False } +func (a *appVal) ConvertToNative(t reflect.Type) (interface{}, error) { return map[string]interface{}{ - "name": a.workerPodKey.Name, - "namespace": a.workerPodKey.Namespace, + "name": a.name, + "namespace": a.namespace, + "count": a.count, + }, nil +} +func (a *appVal) ConvertToType(typeValue ref.Type) ref.Val { + return types.NewErr("type conversion not supported") +} +func (a *appVal) Get(index ref.Val) ref.Val { + key, _ := index.Value().(string) + switch key { + case "name": + return types.String(a.name) + case "namespace": + return types.String(a.namespace) + case "count": + return types.Int(a.count) + default: + return types.String("") + } +} +func (a *appVal) HasField(field string) bool { + return field == "name" || field == "namespace" || field == "count" +} + +type runningAppsVal struct { + apps []tfv1.RunningAppDetail +} + +func (r *runningAppsVal) Type() ref.Type { return types.ListType } +func (r *runningAppsVal) Value() interface{} { return r.apps } +func (r *runningAppsVal) Equal(other ref.Val) ref.Val { return types.False } +func (r *runningAppsVal) ConvertToNative(t reflect.Type) (interface{}, error) { + if t.Kind() == reflect.Slice { + out := make([]map[string]interface{}, len(r.apps)) + for i, a := range r.apps { + out[i] = map[string]interface{}{ + "name": a.Name, + "namespace": a.Namespace, + "count": a.Count, + } + } + return out, nil } + return r.apps, nil +} +func (r *runningAppsVal) ConvertToType(typeValue ref.Type) ref.Val { + return types.NewErr("type conversion not supported") +} +func (r *runningAppsVal) Get(index ref.Val) ref.Val { + i, ok := index.Value().(int) + if !ok { + if i64, ok2 := index.Value().(int64); ok2 { + i = int(i64) + ok = true + } + } + if !ok || i < 0 || i >= len(r.apps) { + return types.NewErr("index out of range") + } + app := r.apps[i] + return &appVal{name: app.Name, namespace: app.Namespace, count: int64(app.Count)} +} + +func (r *runningAppsVal) Size() ref.Val { return types.Int(len(r.apps)) } + +func (r *runningAppsVal) Contains(elem ref.Val) ref.Val { + av, ok := elem.(*appVal) + if !ok { + return types.False + } + for _, a := range r.apps { + if a.Name == av.name && a.Namespace == av.namespace && int64(a.Count) == av.count { + return types.True + } + } + return types.False +} +func (r *runningAppsVal) Iterator() traits.Iterator { + return &runningAppsIterator{apps: r.apps} +} +func (r *runningAppsVal) Add(elem ref.Val) ref.Val { + return types.NewErr("runningApps list is read-only") } +type runningAppsIterator struct { + apps []tfv1.RunningAppDetail + i int +} + +func (it *runningAppsIterator) Type() ref.Type { return types.IteratorType } +func (it *runningAppsIterator) Value() interface{} { return nil } +func (it *runningAppsIterator) Equal(other ref.Val) ref.Val { return types.False } +func (it *runningAppsIterator) ConvertToNative(t reflect.Type) (interface{}, error) { + return nil, fmt.Errorf("iterator cannot convert to native") +} +func (it *runningAppsIterator) ConvertToType(typeValue ref.Type) ref.Val { + return types.NewErr("type conversion not supported") +} +func (it *runningAppsIterator) HasNext() ref.Val { + return types.Bool(it.i < len(it.apps)) +} +func (it *runningAppsIterator) Next() ref.Val { + if it.i >= len(it.apps) { + return types.NewErr("iterator past end") + } + a := it.apps[it.i] + it.i++ + return &appVal{name: a.Name, namespace: a.Namespace, count: int64(a.Count)} +} + +var _ traits.Lister = (*runningAppsVal)(nil) +var _ 
traits.Iterator = (*runningAppsIterator)(nil) + // gpuVal implements CEL value interface for GPU objects to eliminate map allocations type gpuVal struct { *tfv1.GPU @@ -234,7 +340,7 @@ func (v *gpuVal) Get(index ref.Val) ref.Val { case GPUFieldUUID: return types.String(v.GPU.Status.UUID) case GPUFieldPhase: - return types.String(string(v.GPU.Status.Phase)) + return getPooledPhaseString(string(v.GPU.Status.Phase)) case GPUFieldUsedBy: return types.String(string(v.GPU.Status.UsedBy)) case GPUFieldMessage: @@ -266,14 +372,11 @@ func (v *gpuVal) Get(index ref.Val) ref.Val { case GPUFieldRunningApps: // For now, keep simple implementation - can optimize later if needed if v.runningApps == nil { - apps := make([]interface{}, len(v.GPU.Status.RunningApps)) + apps := make([]tfv1.RunningAppDetail, len(v.GPU.Status.RunningApps)) for i, app := range v.GPU.Status.RunningApps { - apps[i] = map[string]interface{}{ - "name": app.Name, - "namespace": app.Namespace, - } + apps[i] = *app } - v.runningApps = types.NewDynamicList(types.DefaultTypeAdapter, apps) + v.runningApps = &runningAppsVal{apps: apps} } return v.runningApps default: @@ -323,7 +426,7 @@ func (v *availableVal) Get(index ref.Val) ref.Val { case ResourceFieldTFlops: return types.Double(0.0) case ResourceFieldVRAM: - return types.Int(0) + return types.Double(0.0) default: return types.NewErr("no such field: %s", field) } @@ -333,7 +436,7 @@ func (v *availableVal) Get(index ref.Val) ref.Val { case ResourceFieldTFlops: return types.Double(v.available.Tflops.AsApproximateFloat64()) case ResourceFieldVRAM: - return types.Int(v.available.Vram.Value()) + return types.Double(float64(v.available.Vram.Value())) default: return types.NewErr("no such field: %s", field) } @@ -404,8 +507,6 @@ type CELFilter struct { usage fieldUsage // Display expression for logging (read-only) displayExpression string - // Fast path predicate for common patterns - fastPathPredicate FastPathPredicate } // NewAllocRequestCELFilter creates a new CEL filter from allocation request @@ -429,9 +530,6 @@ func NewCELFilter(req *tfv1.AllocRequest, cache *ExpressionCache) (*CELFilter, e // Analyze field usage in user expression only usage := analyzeFieldUsage(userExpression) - // Try to compile fast path predicate - fastPath := compileFastPath(userExpression) - // Handle nil request case name := "AllocRequest-unknown" if req != nil { @@ -446,7 +544,6 @@ func NewCELFilter(req *tfv1.AllocRequest, cache *ExpressionCache) (*CELFilter, e userExpression: userExpression, usage: usage, displayExpression: displayExpression, - fastPathPredicate: fastPath, }, nil } @@ -507,44 +604,24 @@ func (f *CELFilter) Filter(ctx context.Context, workerPodKey tfv1.NameNamespace, } // Use fast path if available, otherwise fall back to CEL - if f.fastPathPredicate != nil { - // Fast path: direct Go function evaluation with optional parallelization - if len(earlyFilteredGPUs) >= ParallelThreshold { - filteredGPUs = f.filterParallel(earlyFilteredGPUs) - } else { - for _, gpu := range earlyFilteredGPUs { - if f.fastPathPredicate(gpu) { - filteredGPUs = append(filteredGPUs, gpu) - } - } - } - log.V(1).Info("CEL filter applied (fast path)", - "filter", f.name, - "displayExpression", f.displayExpression, - "userExpression", f.userExpression, - "inputGPUs", len(gpus), - "earlyFilteredGPUs", len(earlyFilteredGPUs), - "outputGPUs", len(filteredGPUs)) + // Fallback to CEL evaluation for complex expressions + if len(earlyFilteredGPUs) >= ParallelThreshold { + // Use parallel evaluation for large GPU sets + 
filteredGPUs = f.filterFallbackParallel(ctx, program, earlyFilteredGPUs, workerPodKey) } else { - // Fallback to CEL evaluation for complex expressions - if len(earlyFilteredGPUs) >= ParallelThreshold { - // Use parallel evaluation for large GPU sets - filteredGPUs = f.filterFallbackParallel(ctx, program, earlyFilteredGPUs, workerPodKey) - } else { - // Sequential evaluation for smaller sets - filteredGPUs = f.filterFallbackSequential(ctx, program, earlyFilteredGPUs, workerPodKey) - } - - log.V(1).Info("CEL filter applied (CEL evaluation)", - "filter", f.name, - "displayExpression", f.displayExpression, - "userExpression", f.userExpression, - "inputGPUs", len(gpus), - "earlyFilteredGPUs", len(earlyFilteredGPUs), - "outputGPUs", len(filteredGPUs)) + // Sequential evaluation for smaller sets + filteredGPUs = f.filterFallbackSequential(ctx, program, earlyFilteredGPUs, workerPodKey) } + log.V(1).Info("CEL filter applied (CEL evaluation)", + "filter", f.name, + "displayExpression", f.displayExpression, + "userExpression", f.userExpression, + "inputGPUs", len(gpus), + "earlyFilteredGPUs", len(earlyFilteredGPUs), + "outputGPUs", len(filteredGPUs)) + return filteredGPUs, nil } @@ -591,67 +668,11 @@ func createCELEnvironment() (*cel.Env, error) { ) } -// filterParallel processes GPUs in parallel for large datasets -func (f *CELFilter) filterParallel(gpus []*tfv1.GPU) []*tfv1.GPU { - numGPUs := len(gpus) - numWorkers := runtime.NumCPU() - if numWorkers > DefaultWorkerCount { - numWorkers = DefaultWorkerCount - } - - chunkSize := (numGPUs + numWorkers - 1) / numWorkers - resultChannels := make([]<-chan []*tfv1.GPU, numWorkers) - - // Create workers - for i := 0; i < numWorkers; i++ { - start := i * chunkSize - end := start + chunkSize - if end > numGPUs { - end = numGPUs - } - - if start >= end { - // No work for this worker - ch := make(chan []*tfv1.GPU, 1) - ch <- []*tfv1.GPU{} - close(ch) - resultChannels[i] = ch - continue - } - - chunk := gpus[start:end] - resultCh := make(chan []*tfv1.GPU, 1) - resultChannels[i] = resultCh - - // Start worker goroutine - go func(gpuChunk []*tfv1.GPU, resultCh chan<- []*tfv1.GPU) { - defer close(resultCh) - - filtered := make([]*tfv1.GPU, 0, len(gpuChunk)/2) // Estimate 50% pass rate - for _, gpu := range gpuChunk { - if f.fastPathPredicate(gpu) { - filtered = append(filtered, gpu) - } - } - resultCh <- filtered - }(chunk, resultCh) - } - - // Collect results - var totalFiltered []*tfv1.GPU - for _, ch := range resultChannels { - chunkResults := <-ch - totalFiltered = append(totalFiltered, chunkResults...) 
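// Editorial aside, not part of the patch: a minimal end-to-end usage sketch under the
// assumption that it lives in this package with context/time imported. The variable names
// and literal values are made up; NewExpressionCache, NewCELFilter, Filter and the
// AllocRequest fields are the ones exercised elsewhere in this series.
func demoFilterUsage(ctx context.Context, gpus []*tfv1.GPU) ([]*tfv1.GPU, error) {
	cache, err := NewExpressionCache(100, 5*time.Minute) // size and TTL mirror the tests
	if err != nil {
		return nil, err
	}
	req := &tfv1.AllocRequest{
		GPUModel:            "A100",
		CELFilterExpression: "gpu.available.tflops >= 150.0 && size(gpu.runningApps) == 0",
	}
	f, err := NewCELFilter(req, cache)
	if err != nil {
		return nil, err
	}
	return f.Filter(ctx, tfv1.NameNamespace{Namespace: "default", Name: "worker-0"}, gpus)
}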
- } - - return totalFiltered -} - // filterFallbackSequential performs sequential CEL evaluation for smaller GPU sets func (f *CELFilter) filterFallbackSequential(ctx context.Context, program cel.Program, gpus []*tfv1.GPU, workerPodKey tfv1.NameNamespace) []*tfv1.GPU { filteredGPUs := make([]*tfv1.GPU, 0, len(gpus)/2) log := log.FromContext(ctx) - + var activation ZeroAllocActivation for i, gpu := range gpus { // Periodic context check every 64 GPUs for very large sets if i&63 == 0 { @@ -664,14 +685,10 @@ func (f *CELFilter) filterFallbackSequential(ctx context.Context, program cel.Pr } // Use zero-allocation activation instead of maps - activation := &ZeroAllocActivation{ - gpu: gpu, - workerPodKey: workerPodKey, - usage: f.usage, - } + activation.init(gpu, workerPodKey, f.usage) // Direct synchronous evaluation with custom activation - result, _, evalErr := program.Eval(activation) + result, _, evalErr := program.Eval(&activation) if evalErr != nil { log.Error(evalErr, "CEL expression evaluation failed", @@ -683,10 +700,8 @@ func (f *CELFilter) filterFallbackSequential(ctx context.Context, program cel.Pr } // Convert result to boolean - if boolResult, ok := result.(types.Bool); ok { - if bool(boolResult) { - filteredGPUs = append(filteredGPUs, gpu) - } + if boolResult, ok := result.(types.Bool); ok && bool(boolResult) { + filteredGPUs = append(filteredGPUs, gpu) } else { log.Error(nil, "CEL expression did not return boolean", "expression", f.userExpression, @@ -710,7 +725,7 @@ func (f *CELFilter) filterFallbackParallel(ctx context.Context, program cel.Prog chunkSize := (numGPUs + numWorkers - 1) / numWorkers resultChannels := make([]<-chan []*tfv1.GPU, numWorkers) - + var activation ZeroAllocActivation // Create workers for i := 0; i < numWorkers; i++ { start := i * chunkSize @@ -740,24 +755,18 @@ func (f *CELFilter) filterFallbackParallel(ctx context.Context, program cel.Prog for _, gpu := range gpuChunk { // Use zero-allocation activation - activation := &ZeroAllocActivation{ - gpu: gpu, - workerPodKey: workerPodKey, - usage: f.usage, - } + activation.init(gpu, workerPodKey, f.usage) // Direct synchronous evaluation - result, _, evalErr := program.Eval(activation) + result, _, evalErr := program.Eval(&activation) if evalErr != nil { // On error, exclude the GPU (fail-safe) continue } // Convert result to boolean - if boolResult, ok := result.(types.Bool); ok { - if bool(boolResult) { - filtered = append(filtered, gpu) - } + if boolResult, ok := result.(types.Bool); ok && bool(boolResult) { + filtered = append(filtered, gpu) } // On non-boolean result, exclude the GPU (fail-safe) } @@ -775,273 +784,11 @@ func (f *CELFilter) filterFallbackParallel(ctx context.Context, program cel.Prog return totalFiltered } -// compileFastPath tries to compile expression into a fast path predicate -// Uses AST analysis for better pattern matching than regex -func compileFastPath(expression string) FastPathPredicate { - if expression == "" { - return nil - } - - // Try AST-based compilation first (more flexible) - if pred := compileASTFastPath(expression); pred != nil { - return pred - } - - // Fall back to regex patterns for backward compatibility - for _, pattern := range fastPathPatterns { - matches := pattern.Pattern.FindStringSubmatch(expression) - if matches != nil { - return pattern.Generator(matches) - } - } - - return nil -} - -// compileASTFastPath analyzes AST to generate fast path predicates -func compileASTFastPath(expression string) FastPathPredicate { - // Parse expression to AST - env, err 
:= createCELEnvironment() - if err != nil { - return nil - } - - _, issues := env.Parse(expression) - if issues != nil && issues.Err() != nil { - return nil - } - - // Extract conditions from expression string (simplified approach) - conditions := extractConditionsFromString(expression) - if len(conditions) == 0 { - return nil - } - - // Generate fast path predicate - return func(gpu *tfv1.GPU) bool { - for _, condition := range conditions { - if !evaluateCondition(gpu, condition) { - return false // Short-circuit on first failure (AND logic) - } - } - return true - } -} - -// astCondition represents a simple condition extracted from AST -type astCondition struct { - field string // e.g., "gpu.available.tflops", "gpu.labels['env']" - operator string // "==", "!=", ">=", ">" - value interface{} // expected value -} - -// extractConditionsFromString uses enhanced pattern matching to extract conditions -// This bridges the gap between regex and full AST until full AST implementation -func extractConditionsFromString(exprStr string) []astCondition { - var conditions []astCondition - - // Split by && to handle multiple conditions - parts := strings.Split(exprStr, " && ") - - for _, part := range parts { - part = strings.TrimSpace(part) - - // Handle gpu.available.tflops >= X - if strings.Contains(part, "gpu.available.tflops") && strings.Contains(part, ">=") { - if condition := parseNumericCondition(part, "gpu.available.tflops", ">="); condition != nil { - conditions = append(conditions, *condition) - } - } else if strings.Contains(part, "gpu.available.tflops") && strings.Contains(part, ">") { - if condition := parseNumericCondition(part, "gpu.available.tflops", ">"); condition != nil { - conditions = append(conditions, *condition) - } - } - - // Handle gpu.available.vram >= X - if strings.Contains(part, "gpu.available.vram") && strings.Contains(part, ">=") { - if condition := parseNumericCondition(part, "gpu.available.vram", ">="); condition != nil { - conditions = append(conditions, *condition) - } - } - - // Handle gpu.labels['key'] == 'value' - if strings.Contains(part, "gpu.labels[") && strings.Contains(part, "==") { - if condition := parseLabelCondition(part, "gpu.labels"); condition != nil { - conditions = append(conditions, *condition) - } - } - - // Handle gpu.annotations['key'] == 'value' - if strings.Contains(part, "gpu.annotations[") && strings.Contains(part, "==") { - if condition := parseLabelCondition(part, "gpu.annotations"); condition != nil { - conditions = append(conditions, *condition) - } - } - - // Handle gpu.gpuModel == 'value' - if strings.Contains(part, "gpu.gpuModel") && strings.Contains(part, "==") { - if condition := parseStringCondition(part, "gpu.gpuModel", "=="); condition != nil { - conditions = append(conditions, *condition) - } - } - } - - return conditions -} - -// parseNumericCondition parses numeric comparison conditions -func parseNumericCondition(expr, field, operator string) *astCondition { - parts := strings.Split(expr, operator) - if len(parts) != 2 { - return nil - } - - valueStr := strings.TrimSpace(parts[1]) - value, err := strconv.ParseFloat(valueStr, 64) - if err != nil { - return nil - } - - return &astCondition{ - field: field, - operator: operator, - value: value, - } -} - -// parseLabelCondition parses label/annotation map access conditions -func parseLabelCondition(expr, fieldPrefix string) *astCondition { - // Extract key from gpu.labels['key'] == 'value' format - keyStart := strings.Index(expr, "['") + 2 - keyEnd := 
strings.Index(expr[keyStart:], "']") - if keyEnd == -1 { - return nil - } - key := expr[keyStart : keyStart+keyEnd] - - // Extract value - valueStart := strings.LastIndex(expr, "'") - if valueStart == -1 { - return nil - } - // Find the quote before the last quote - prevQuotePos := strings.LastIndex(expr[:valueStart], "'") - if prevQuotePos == -1 { - return nil - } - value := expr[prevQuotePos+1 : valueStart] - - return &astCondition{ - field: fieldPrefix + "['" + key + "']", - operator: "==", - value: value, - } -} - -// parseStringCondition parses simple string equality conditions -func parseStringCondition(expr, field, operator string) *astCondition { - parts := strings.Split(expr, operator) - if len(parts) != 2 { - return nil - } - - valueStr := strings.TrimSpace(parts[1]) - // Remove quotes - if strings.HasPrefix(valueStr, "'") && strings.HasSuffix(valueStr, "'") { - valueStr = valueStr[1 : len(valueStr)-1] - } - - return &astCondition{ - field: field, - operator: operator, - value: valueStr, - } -} - -// evaluateCondition evaluates a single condition against a GPU -func evaluateCondition(gpu *tfv1.GPU, condition astCondition) bool { - switch condition.field { - case "gpu.available.tflops": - if gpu.Status.Available == nil { - return false - } - actualValue := gpu.Status.Available.Tflops.AsApproximateFloat64() - expectedValue, ok := condition.value.(float64) - if !ok { - return false - } - - switch condition.operator { - case ">=": - return actualValue >= expectedValue - case ">": - return actualValue > expectedValue - default: - return false - } - - case "gpu.available.vram": - if gpu.Status.Available == nil { - return false - } - actualValue := float64(gpu.Status.Available.Vram.Value()) - expectedValue, ok := condition.value.(float64) - if !ok { - return false - } - - switch condition.operator { - case ">=": - return actualValue >= expectedValue - case ">": - return actualValue > expectedValue - default: - return false - } - - case "gpu.gpuModel": - expectedValue, ok := condition.value.(string) - if !ok { - return false - } - return gpu.Status.GPUModel == expectedValue - - default: - // Handle label/annotation access - if strings.HasPrefix(condition.field, "gpu.labels['") { - key := strings.TrimSuffix(strings.TrimPrefix(condition.field, "gpu.labels['"), "']") - expectedValue, ok := condition.value.(string) - if !ok { - return false - } - if gpu.Labels == nil { - return expectedValue == "" - } - return gpu.Labels[key] == expectedValue - } - - if strings.HasPrefix(condition.field, "gpu.annotations['") { - key := strings.TrimSuffix(strings.TrimPrefix(condition.field, "gpu.annotations['"), "']") - expectedValue, ok := condition.value.(string) - if !ok { - return false - } - if gpu.Annotations == nil { - return expectedValue == "" - } - return gpu.Annotations[key] == expectedValue - } - - return false - } -} - // analyzeFieldUsage performs simple heuristic analysis of which fields are used in the expression func analyzeFieldUsage(expression string) fieldUsage { if expression == "" { return fieldUsage{} } - return fieldUsage{ labels: strings.Contains(expression, "labels"), annotations: strings.Contains(expression, "annotations"), diff --git a/internal/gpuallocator/filter/cel_filter/cel_filter_benchmark_test.go b/internal/gpuallocator/filter/cel_filter/cel_filter_benchmark_test.go index 5d1e7091..26b825fd 100644 --- a/internal/gpuallocator/filter/cel_filter/cel_filter_benchmark_test.go +++ b/internal/gpuallocator/filter/cel_filter/cel_filter_benchmark_test.go @@ -256,7 +256,7 @@ func 
printPerformanceComparison(b *testing.B) { === GPU Filter Performance Comparison === Test Environment: -- Number of GPUs: 10000 +- Number of GPUs: 1000000 - GPU Models: A100 (33%%), V100 (33%%), H100 (33%%) - GPU Phases: Ready (90%%), Pending (10%%) diff --git a/internal/gpuallocator/filter/cel_filter/cel_filter_test.go b/internal/gpuallocator/filter/cel_filter/cel_filter_test.go index ffc903ea..f882747b 100644 --- a/internal/gpuallocator/filter/cel_filter/cel_filter_test.go +++ b/internal/gpuallocator/filter/cel_filter/cel_filter_test.go @@ -191,6 +191,120 @@ func TestCELFilter_NormalCases(t *testing.T) { expectedCount: 1, description: "Should filter A100 GPUs with TFLOPS >= 150, production environment, and Running/Pending phase", }, + { + name: "filter by running apps - no running apps", + request: createTestAllocRequest("default", "test-workload", "", "size(gpu.runningApps) == 0"), + gpus: []*tfv1.GPU{ + createTestGPU("gpu-1", "default", "A100", constants.PhaseRunning, 150.0, 40.0), + func() *tfv1.GPU { + gpu := createTestGPU("gpu-2", "default", "A100", constants.PhaseRunning, 150.0, 40.0) + gpu.Status.RunningApps = []*tfv1.RunningAppDetail{ + {Name: "app1", Namespace: "default", Count: 1}, + } + return gpu + }(), + createTestGPU("gpu-3", "default", "A100", constants.PhaseRunning, 150.0, 40.0), + }, + expectedCount: 2, + description: "Should return GPUs with no running apps", + }, + { + name: "filter by running apps - has specific app", + request: createTestAllocRequest("default", "test-workload", "", "gpu.runningApps.exists(app, app.name == 'training-job' && app.namespace == 'ml-team')"), + gpus: []*tfv1.GPU{ + func() *tfv1.GPU { + gpu := createTestGPU("gpu-1", "default", "A100", constants.PhaseRunning, 150.0, 40.0) + gpu.Status.RunningApps = []*tfv1.RunningAppDetail{ + {Name: "training-job", Namespace: "ml-team", Count: 2}, + {Name: "other-job", Namespace: "default", Count: 1}, + } + return gpu + }(), + func() *tfv1.GPU { + gpu := createTestGPU("gpu-2", "default", "A100", constants.PhaseRunning, 150.0, 40.0) + gpu.Status.RunningApps = []*tfv1.RunningAppDetail{ + {Name: "other-job", Namespace: "ml-team", Count: 1}, + } + return gpu + }(), + createTestGPU("gpu-3", "default", "A100", constants.PhaseRunning, 150.0, 40.0), + }, + expectedCount: 1, + description: "Should return GPUs running specific training job", + }, + { + name: "filter by running apps - count threshold", + request: createTestAllocRequest("default", "test-workload", "", "gpu.runningApps.all(app, app.count <= 2) && size(gpu.runningApps) > 0"), + gpus: []*tfv1.GPU{ + func() *tfv1.GPU { + gpu := createTestGPU("gpu-1", "default", "A100", constants.PhaseRunning, 150.0, 40.0) + gpu.Status.RunningApps = []*tfv1.RunningAppDetail{ + {Name: "job1", Namespace: "default", Count: 1}, + {Name: "job2", Namespace: "default", Count: 2}, + } + return gpu + }(), + func() *tfv1.GPU { + gpu := createTestGPU("gpu-2", "default", "A100", constants.PhaseRunning, 150.0, 40.0) + gpu.Status.RunningApps = []*tfv1.RunningAppDetail{ + {Name: "job1", Namespace: "default", Count: 5}, // Count > 2 + } + return gpu + }(), + createTestGPU("gpu-3", "default", "A100", constants.PhaseRunning, 150.0, 40.0), // No running apps + }, + expectedCount: 1, + description: "Should return GPUs where all running apps have count <= 2", + }, + { + name: "filter by running apps - complex condition", + request: createTestAllocRequest("default", "test-workload", "A100", "gpu.available.tflops >= 150.0 && (size(gpu.runningApps) == 0 || gpu.runningApps.all(app, app.namespace != 
'restricted'))"), + gpus: []*tfv1.GPU{ + createTestGPU("gpu-1", "default", "A100", constants.PhaseRunning, 150.0, 40.0), // No running apps + func() *tfv1.GPU { + gpu := createTestGPU("gpu-2", "default", "A100", constants.PhaseRunning, 150.0, 40.0) + gpu.Status.RunningApps = []*tfv1.RunningAppDetail{ + {Name: "job1", Namespace: "allowed", Count: 1}, + } + return gpu + }(), + func() *tfv1.GPU { + gpu := createTestGPU("gpu-3", "default", "A100", constants.PhaseRunning, 150.0, 40.0) + gpu.Status.RunningApps = []*tfv1.RunningAppDetail{ + {Name: "job1", Namespace: "restricted", Count: 1}, // Restricted namespace + } + return gpu + }(), + createTestGPU("gpu-4", "default", "V100", constants.PhaseRunning, 150.0, 40.0), // Wrong model + }, + expectedCount: 2, + description: "Should return A100 GPUs with sufficient resources and no restricted apps", + }, + { + name: "filter by running apps - namespace isolation", + request: createTestAllocRequest("default", "test-workload", "", "!gpu.runningApps.exists(app, app.namespace == 'tenant-a')"), + gpus: []*tfv1.GPU{ + func() *tfv1.GPU { + gpu := createTestGPU("gpu-1", "default", "A100", constants.PhaseRunning, 150.0, 40.0) + gpu.Status.RunningApps = []*tfv1.RunningAppDetail{ + {Name: "job1", Namespace: "tenant-b", Count: 1}, + {Name: "job2", Namespace: "shared", Count: 1}, + } + return gpu + }(), + func() *tfv1.GPU { + gpu := createTestGPU("gpu-2", "default", "A100", constants.PhaseRunning, 150.0, 40.0) + gpu.Status.RunningApps = []*tfv1.RunningAppDetail{ + {Name: "job1", Namespace: "tenant-a", Count: 1}, // Should be excluded + {Name: "job2", Namespace: "tenant-b", Count: 1}, + } + return gpu + }(), + createTestGPU("gpu-3", "default", "A100", constants.PhaseRunning, 150.0, 40.0), // No running apps + }, + expectedCount: 2, + description: "Should return GPUs not running apps from tenant-a", + }, } for _, tt := range tests { @@ -208,6 +322,8 @@ func TestCELFilter_NormalCases(t *testing.T) { // Verify results require.NoError(t, err, "Filter execution should not fail") + + // Debug output for complex condition test assert.Len(t, filteredGPUs, tt.expectedCount, tt.description) // Verify filter name From f700eac4d7457b4240be3fe1dae7bb8aef0f1277 Mon Sep 17 00:00:00 2001 From: dylan Date: Sun, 14 Sep 2025 06:48:56 -0700 Subject: [PATCH 21/34] fix lint issue --- .../filter/cel_filter/cel_filter.go | 42 ++--- .../cel_filter/cel_filter_benchmark_test.go | 37 +++-- .../filter/cel_filter/cel_filter_test.go | 156 +++++++++--------- 3 files changed, 124 insertions(+), 111 deletions(-) diff --git a/internal/gpuallocator/filter/cel_filter/cel_filter.go b/internal/gpuallocator/filter/cel_filter/cel_filter.go index 80622f23..18a0d176 100644 --- a/internal/gpuallocator/filter/cel_filter/cel_filter.go +++ b/internal/gpuallocator/filter/cel_filter/cel_filter.go @@ -130,16 +130,16 @@ func (w *workerPodKeyVal) Get(index ref.Val) ref.Val { return types.NewErr("index must be string") } switch key { - case "name": + case GPUFieldName: return types.String(w.name) - case "namespace": + case GPUFieldNamespace: return types.String(w.namespace) default: return types.String("") } } func (w *workerPodKeyVal) HasField(field string) bool { - return field == "name" || field == "namespace" + return field == GPUFieldName || field == GPUFieldNamespace } type appVal struct { @@ -291,7 +291,7 @@ func (v *gpuVal) Value() interface{} { // Equal implements ref.Val interface func (v *gpuVal) Equal(other ref.Val) ref.Val { if otherGPU, ok := other.(*gpuVal); ok { - return types.Bool(v.GPU.UID == 
otherGPU.GPU.UID) + return types.Bool(v.UID == otherGPU.UID) } return types.False } @@ -332,48 +332,48 @@ func (v *gpuVal) Get(index ref.Val) ref.Val { switch field { case GPUFieldName: - return types.String(v.GPU.Name) + return types.String(v.Name) case GPUFieldNamespace: - return types.String(v.GPU.Namespace) + return types.String(v.Namespace) case GPUFieldGPUModel: - return types.String(v.GPU.Status.GPUModel) + return types.String(v.Status.GPUModel) case GPUFieldUUID: - return types.String(v.GPU.Status.UUID) + return types.String(v.Status.UUID) case GPUFieldPhase: - return getPooledPhaseString(string(v.GPU.Status.Phase)) + return getPooledPhaseString(string(v.Status.Phase)) case GPUFieldUsedBy: - return types.String(string(v.GPU.Status.UsedBy)) + return types.String(string(v.Status.UsedBy)) case GPUFieldMessage: - return types.String(v.GPU.Status.Message) + return types.String(v.Status.Message) case GPUFieldLabels: // Lazy initialization with caching if v.labels == nil { - v.labels = &labelsVal{labels: v.GPU.Labels} + v.labels = &labelsVal{labels: v.Labels} } return v.labels case GPUFieldAnnotations: // Lazy initialization with caching if v.annotations == nil { - v.annotations = &labelsVal{labels: v.GPU.Annotations} + v.annotations = &labelsVal{labels: v.Annotations} } return v.annotations case GPUFieldAvailable: // Lazy initialization with caching if v.available == nil { - v.available = &availableVal{available: v.GPU.Status.Available} + v.available = &availableVal{available: v.Status.Available} } return v.available case GPUFieldNodeSelector: // Lazy initialization with caching if v.nodeSelector == nil { - v.nodeSelector = &labelsVal{labels: v.GPU.Status.NodeSelector} + v.nodeSelector = &labelsVal{labels: v.Status.NodeSelector} } return v.nodeSelector case GPUFieldRunningApps: // For now, keep simple implementation - can optimize later if needed if v.runningApps == nil { - apps := make([]tfv1.RunningAppDetail, len(v.GPU.Status.RunningApps)) - for i, app := range v.GPU.Status.RunningApps { + apps := make([]tfv1.RunningAppDetail, len(v.Status.RunningApps)) + for i, app := range v.Status.RunningApps { apps[i] = *app } v.runningApps = &runningAppsVal{apps: apps} @@ -559,8 +559,8 @@ func (f *CELFilter) Filter(ctx context.Context, workerPodKey tfv1.NameNamespace, return gpus, nil } - // Pre-allocate result slice with estimated capacity - filteredGPUs := make([]*tfv1.GPU, 0, len(gpus)) + // Pre-allocate result slice with estimated capacity for early filtering + var filteredGPUs []*tfv1.GPU // Early filtering phase: apply basic filters first to reduce CEL evaluation overhead earlyFilteredGPUs := make([]*tfv1.GPU, 0, len(gpus)) @@ -608,7 +608,7 @@ func (f *CELFilter) Filter(ctx context.Context, workerPodKey tfv1.NameNamespace, // Fallback to CEL evaluation for complex expressions if len(earlyFilteredGPUs) >= ParallelThreshold { // Use parallel evaluation for large GPU sets - filteredGPUs = f.filterFallbackParallel(ctx, program, earlyFilteredGPUs, workerPodKey) + filteredGPUs = f.filterFallbackParallel(program, earlyFilteredGPUs, workerPodKey) } else { // Sequential evaluation for smaller sets filteredGPUs = f.filterFallbackSequential(ctx, program, earlyFilteredGPUs, workerPodKey) @@ -716,7 +716,7 @@ func (f *CELFilter) filterFallbackSequential(ctx context.Context, program cel.Pr } // filterFallbackParallel performs parallel CEL evaluation for large GPU sets -func (f *CELFilter) filterFallbackParallel(ctx context.Context, program cel.Program, gpus []*tfv1.GPU, workerPodKey tfv1.NameNamespace) 
[]*tfv1.GPU { +func (f *CELFilter) filterFallbackParallel(program cel.Program, gpus []*tfv1.GPU, workerPodKey tfv1.NameNamespace) []*tfv1.GPU { numGPUs := len(gpus) numWorkers := runtime.NumCPU() if numWorkers > DefaultWorkerCount { diff --git a/internal/gpuallocator/filter/cel_filter/cel_filter_benchmark_test.go b/internal/gpuallocator/filter/cel_filter/cel_filter_benchmark_test.go index 26b825fd..0cd46d02 100644 --- a/internal/gpuallocator/filter/cel_filter/cel_filter_benchmark_test.go +++ b/internal/gpuallocator/filter/cel_filter/cel_filter_benchmark_test.go @@ -11,6 +11,13 @@ import ( "github.com/NexusGPU/tensor-fusion/internal/gpuallocator/filter" ) +// Test constants for repeated strings +const ( + testEnvironmentProduction = "production" + testTierHighPerformance = "high-performance" + testPriorityCritical = "critical" +) + // Benchmark performance of the CEL filter compared to the original filter func BenchmarkFilterPerformance(b *testing.B) { // Create test data @@ -30,10 +37,10 @@ func BenchmarkFilterPerformance(b *testing.B) { phase = constants.PhasePending } - gpu := createTestGPU(fmt.Sprintf("gpu-%d", i), "default", gpuModel, phase, 150.0, 40.0) - gpu.Labels["environment"] = "production" + gpu := createTestGPU(fmt.Sprintf("gpu-%d", i), gpuModel, phase, 150.0, 40.0) + gpu.Labels["environment"] = testEnvironmentProduction if i%2 == 0 { - gpu.Labels["tier"] = "high-performance" + gpu.Labels["tier"] = testTierHighPerformance } gpus[i] = gpu } @@ -61,7 +68,7 @@ func BenchmarkFilterPerformance(b *testing.B) { // Benchmark CEL filter - basic filtering b.Run("CELFilter_Basic", func(b *testing.B) { - request := createTestAllocRequest("default", "test-workload", "A100", "") + request := createTestAllocRequest("A100", "") cache, err := NewExpressionCache(100, 5*time.Minute) if err != nil { b.Fatal(err) @@ -84,7 +91,7 @@ func BenchmarkFilterPerformance(b *testing.B) { // Benchmark CEL filter - complex expression b.Run("CELFilter_Complex", func(b *testing.B) { - request := createTestAllocRequest("default", "test-workload", "A100", "gpu.available.tflops >= 150.0 && gpu.labels['environment'] == 'production'") + request := createTestAllocRequest("A100", "gpu.available.tflops >= 150.0 && gpu.labels['environment'] == '"+testEnvironmentProduction+"'") cache, err := NewExpressionCache(100, 5*time.Minute) if err != nil { b.Fatal(err) @@ -116,15 +123,15 @@ func BenchmarkFilterPerformance(b *testing.B) { "gpu.gpuModel == 'A100' && gpu.available.tflops > 100.0", "gpu.gpuModel == 'V100' && gpu.available.tflops > 80.0", "gpu.gpuModel == 'H100' && gpu.available.tflops > 180.0", - "gpu.labels['environment'] == 'production'", - "gpu.labels['tier'] == 'high-performance'", + "gpu.labels['environment'] == '" + testEnvironmentProduction + "'", + "gpu.labels['tier'] == '" + testTierHighPerformance + "'", "gpu.available.vram > 30000000000", } b.ResetTimer() for i := 0; i < b.N; i++ { expression := expressions[i%len(expressions)] - request := createTestAllocRequest("default", "test-workload", "", expression) + request := createTestAllocRequest("", expression) celFilter, err := NewCELFilter(request, cache) if err != nil { @@ -189,10 +196,10 @@ func BenchmarkExpressionComplexity(b *testing.B) { const numGPUs = 100 gpus := make([]*tfv1.GPU, numGPUs) for i := 0; i < numGPUs; i++ { - gpu := createTestGPU(fmt.Sprintf("gpu-%d", i), "default", "A100", constants.PhaseRunning, 150.0, 40.0) - gpu.Labels["environment"] = "production" - gpu.Labels["tier"] = "high-performance" - gpu.Annotations["priority"] = "critical" + gpu 
:= createTestGPU(fmt.Sprintf("gpu-%d", i), "A100", constants.PhaseRunning, 150.0, 40.0) + gpu.Labels["environment"] = testEnvironmentProduction + gpu.Labels["tier"] = testTierHighPerformance + gpu.Annotations["priority"] = testPriorityCritical gpus[i] = gpu } @@ -217,11 +224,11 @@ func BenchmarkExpressionComplexity(b *testing.B) { }, { name: "VeryComplex", - expression: "gpu.phase == 'Running' && gpu.gpuModel == 'A100' && gpu.available.tflops >= 150.0 && gpu.labels['environment'] == 'production'", + expression: "gpu.phase == 'Running' && gpu.gpuModel == 'A100' && gpu.available.tflops >= 150.0 && gpu.labels['environment'] == '" + testEnvironmentProduction + "'", }, { name: "UltraComplex", - expression: "gpu.phase == 'Running' && gpu.gpuModel == 'A100' && gpu.available.tflops >= 150.0 && gpu.labels['environment'] == 'production' && gpu.labels['tier'] == 'high-performance' && gpu.annotations['priority'] == 'critical'", + expression: "gpu.phase == 'Running' && gpu.gpuModel == 'A100' && gpu.available.tflops >= 150.0 && gpu.labels['environment'] == '" + testEnvironmentProduction + "' && gpu.labels['tier'] == '" + testTierHighPerformance + "' && gpu.annotations['priority'] == '" + testPriorityCritical + "'", }, } @@ -232,7 +239,7 @@ func BenchmarkExpressionComplexity(b *testing.B) { b.Fatal(err) } - request := createTestAllocRequest("default", "test-workload", "", tc.expression) + request := createTestAllocRequest("", tc.expression) celFilter, err := NewCELFilter(request, cache) if err != nil { b.Fatal(err) diff --git a/internal/gpuallocator/filter/cel_filter/cel_filter_test.go b/internal/gpuallocator/filter/cel_filter/cel_filter_test.go index f882747b..72481ee9 100644 --- a/internal/gpuallocator/filter/cel_filter/cel_filter_test.go +++ b/internal/gpuallocator/filter/cel_filter/cel_filter_test.go @@ -13,12 +13,18 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) +// Test constants for repeated strings used only in cel_filter_test.go +const ( + testEnvProduction = "production" + testPriorCritical = "critical" +) + // Helper functions for creating test data -func createTestGPU(name, namespace, gpuModel, phase string, tflops, vram float64) *tfv1.GPU { +func createTestGPU(name, gpuModel, phase string, tflops, vram float64) *tfv1.GPU { gpu := &tfv1.GPU{ ObjectMeta: metav1.ObjectMeta{ Name: name, - Namespace: namespace, + Namespace: "default", Labels: make(map[string]string), Annotations: make(map[string]string), }, @@ -41,11 +47,11 @@ func createTestGPU(name, namespace, gpuModel, phase string, tflops, vram float64 return gpu } -func createTestAllocRequest(namespace, name, gpuModel, celExpression string) *tfv1.AllocRequest { +func createTestAllocRequest(gpuModel, celExpression string) *tfv1.AllocRequest { return &tfv1.AllocRequest{ WorkloadNameNamespace: tfv1.NameNamespace{ - Name: name, - Namespace: namespace, + Name: "test-workload", + Namespace: "default", }, GPUModel: gpuModel, CELFilterExpression: celExpression, @@ -66,71 +72,71 @@ func TestCELFilter_NormalCases(t *testing.T) { }{ { name: "filter by GPU model", - request: createTestAllocRequest("default", "test-workload", "A100", ""), + request: createTestAllocRequest("A100", ""), gpus: []*tfv1.GPU{ - createTestGPU("gpu-1", "default", "A100", constants.PhaseRunning, 150.0, 40.0), - createTestGPU("gpu-2", "default", "V100", constants.PhaseRunning, 100.0, 32.0), - createTestGPU("gpu-3", "default", "A100", constants.PhaseRunning, 150.0, 40.0), + createTestGPU("gpu-1", "A100", constants.PhaseRunning, 150.0, 40.0), + createTestGPU("gpu-2", 
"V100", constants.PhaseRunning, 100.0, 32.0), + createTestGPU("gpu-3", "A100", constants.PhaseRunning, 150.0, 40.0), }, expectedCount: 2, description: "Should filter GPUs matching the specified model A100", }, { name: "filter by GPU phase only", - request: createTestAllocRequest("default", "test-workload", "", ""), + request: createTestAllocRequest("", ""), gpus: []*tfv1.GPU{ - createTestGPU("gpu-1", "default", "A100", constants.PhaseRunning, 150.0, 40.0), - createTestGPU("gpu-2", "default", "A100", constants.PhasePending, 150.0, 40.0), - createTestGPU("gpu-3", "default", "A100", constants.PhaseRunning, 150.0, 40.0), - createTestGPU("gpu-4", "default", "A100", constants.PhaseFailed, 150.0, 40.0), + createTestGPU("gpu-1", "A100", constants.PhaseRunning, 150.0, 40.0), + createTestGPU("gpu-2", "A100", constants.PhasePending, 150.0, 40.0), + createTestGPU("gpu-3", "A100", constants.PhaseRunning, 150.0, 40.0), + createTestGPU("gpu-4", "A100", constants.PhaseFailed, 150.0, 40.0), }, expectedCount: 3, description: "Should return GPUs in Running and Pending phases", }, { name: "custom CEL expression - filter by available TFLOPS", - request: createTestAllocRequest("default", "test-workload", "", "gpu.available.tflops > 120.0"), + request: createTestAllocRequest("", "gpu.available.tflops > 120.0"), gpus: []*tfv1.GPU{ - createTestGPU("gpu-1", "default", "A100", constants.PhaseRunning, 150.0, 40.0), - createTestGPU("gpu-2", "default", "V100", constants.PhaseRunning, 100.0, 32.0), - createTestGPU("gpu-3", "default", "H100", constants.PhaseRunning, 200.0, 80.0), + createTestGPU("gpu-1", "A100", constants.PhaseRunning, 150.0, 40.0), + createTestGPU("gpu-2", "V100", constants.PhaseRunning, 100.0, 32.0), + createTestGPU("gpu-3", "H100", constants.PhaseRunning, 200.0, 80.0), }, expectedCount: 2, description: "Should filter GPUs with TFLOPS > 120 and Running/Pending phase", }, { name: "custom CEL expression - filter by available VRAM", - request: createTestAllocRequest("default", "test-workload", "", "gpu.available.vram > 35000000000"), // > 35GB in bytes + request: createTestAllocRequest("", "gpu.available.vram > 35000000000"), // > 35GB in bytes gpus: []*tfv1.GPU{ - createTestGPU("gpu-1", "default", "A100", constants.PhaseRunning, 150.0, 40.0), // 40GB - createTestGPU("gpu-2", "default", "V100", constants.PhaseRunning, 100.0, 32.0), // 32GB - createTestGPU("gpu-3", "default", "H100", constants.PhaseRunning, 200.0, 80.0), // 80GB + createTestGPU("gpu-1", "A100", constants.PhaseRunning, 150.0, 40.0), // 40GB + createTestGPU("gpu-2", "V100", constants.PhaseRunning, 100.0, 32.0), // 32GB + createTestGPU("gpu-3", "H100", constants.PhaseRunning, 200.0, 80.0), // 80GB }, expectedCount: 2, description: "Should filter GPUs with VRAM > 35GB and Running/Pending phase", }, { name: "combined model and custom CEL expression", - request: createTestAllocRequest("default", "test-workload", "A100", "gpu.available.tflops >= 150.0"), + request: createTestAllocRequest("A100", "gpu.available.tflops >= 150.0"), gpus: []*tfv1.GPU{ - createTestGPU("gpu-1", "default", "A100", constants.PhaseRunning, 150.0, 40.0), - createTestGPU("gpu-2", "default", "A100", constants.PhaseRunning, 120.0, 40.0), - createTestGPU("gpu-3", "default", "V100", constants.PhaseRunning, 160.0, 32.0), - createTestGPU("gpu-4", "default", "A100", constants.PhaseRunning, 180.0, 40.0), + createTestGPU("gpu-1", "A100", constants.PhaseRunning, 150.0, 40.0), + createTestGPU("gpu-2", "A100", constants.PhaseRunning, 120.0, 40.0), + createTestGPU("gpu-3", "V100", 
constants.PhaseRunning, 160.0, 32.0), + createTestGPU("gpu-4", "A100", constants.PhaseRunning, 180.0, 40.0), }, expectedCount: 2, description: "Should filter A100 GPUs with TFLOPS >= 150 and Running/Pending phase", }, { name: "filter by labels", - request: createTestAllocRequest("default", "test-workload", "", "gpu.labels['environment'] == 'production'"), + request: createTestAllocRequest("", "gpu.labels['environment'] == '"+testEnvProduction+"'"), gpus: func() []*tfv1.GPU { - gpu1 := createTestGPU("gpu-1", "default", "A100", constants.PhaseRunning, 150.0, 40.0) - gpu1.Labels["environment"] = "production" - gpu2 := createTestGPU("gpu-2", "default", "A100", constants.PhaseRunning, 150.0, 40.0) + gpu1 := createTestGPU("gpu-1", "A100", constants.PhaseRunning, 150.0, 40.0) + gpu1.Labels["environment"] = testEnvProduction + gpu2 := createTestGPU("gpu-2", "A100", constants.PhaseRunning, 150.0, 40.0) gpu2.Labels["environment"] = "development" - gpu3 := createTestGPU("gpu-3", "default", "A100", constants.PhaseRunning, 150.0, 40.0) - gpu3.Labels["environment"] = "production" + gpu3 := createTestGPU("gpu-3", "A100", constants.PhaseRunning, 150.0, 40.0) + gpu3.Labels["environment"] = testEnvProduction return []*tfv1.GPU{gpu1, gpu2, gpu3} }(), expectedCount: 2, @@ -138,14 +144,14 @@ func TestCELFilter_NormalCases(t *testing.T) { }, { name: "filter by annotations", - request: createTestAllocRequest("default", "test-workload", "", "gpu.annotations['priority'] == 'critical'"), + request: createTestAllocRequest("", "gpu.annotations['priority'] == '"+testPriorCritical+"'"), gpus: func() []*tfv1.GPU { - gpu1 := createTestGPU("gpu-1", "default", "A100", constants.PhaseRunning, 150.0, 40.0) - gpu1.Annotations["priority"] = "critical" - gpu2 := createTestGPU("gpu-2", "default", "A100", constants.PhaseRunning, 150.0, 40.0) + gpu1 := createTestGPU("gpu-1", "A100", constants.PhaseRunning, 150.0, 40.0) + gpu1.Annotations["priority"] = testPriorCritical + gpu2 := createTestGPU("gpu-2", "A100", constants.PhaseRunning, 150.0, 40.0) gpu2.Annotations["priority"] = "low" - gpu3 := createTestGPU("gpu-3", "default", "A100", constants.PhaseRunning, 150.0, 40.0) - gpu3.Annotations["priority"] = "critical" + gpu3 := createTestGPU("gpu-3", "A100", constants.PhaseRunning, 150.0, 40.0) + gpu3.Annotations["priority"] = testPriorCritical return []*tfv1.GPU{gpu1, gpu2, gpu3} }(), expectedCount: 2, @@ -153,15 +159,15 @@ func TestCELFilter_NormalCases(t *testing.T) { }, { name: "combined labels and annotations filter", - request: createTestAllocRequest("default", "test-workload", "", "gpu.labels['tier'] == 'high-performance' && gpu.annotations['priority'] == 'critical'"), + request: createTestAllocRequest("", "gpu.labels['tier'] == 'high-performance' && gpu.annotations['priority'] == 'critical'"), gpus: func() []*tfv1.GPU { - gpu1 := createTestGPU("gpu-1", "default", "A100", constants.PhaseRunning, 150.0, 40.0) + gpu1 := createTestGPU("gpu-1", "A100", constants.PhaseRunning, 150.0, 40.0) gpu1.Labels["tier"] = "high-performance" - gpu1.Annotations["priority"] = "critical" - gpu2 := createTestGPU("gpu-2", "default", "A100", constants.PhaseRunning, 150.0, 40.0) + gpu1.Annotations["priority"] = testPriorCritical + gpu2 := createTestGPU("gpu-2", "A100", constants.PhaseRunning, 150.0, 40.0) gpu2.Labels["tier"] = "standard" gpu2.Annotations["priority"] = "critical" - gpu3 := createTestGPU("gpu-3", "default", "A100", constants.PhaseRunning, 150.0, 40.0) + gpu3 := createTestGPU("gpu-3", "A100", constants.PhaseRunning, 150.0, 40.0) 
gpu3.Labels["tier"] = "high-performance" gpu3.Annotations["priority"] = "low" return []*tfv1.GPU{gpu1, gpu2, gpu3} @@ -171,20 +177,20 @@ func TestCELFilter_NormalCases(t *testing.T) { }, { name: "empty GPU list", - request: createTestAllocRequest("default", "test-workload", "A100", ""), + request: createTestAllocRequest("A100", ""), gpus: []*tfv1.GPU{}, expectedCount: 0, description: "Should handle empty GPU list gracefully", }, { name: "complex combined expression with model, resources, and metadata", - request: createTestAllocRequest("default", "test-workload", "A100", "gpu.available.tflops >= 150.0 && gpu.labels['environment'] == 'production'"), + request: createTestAllocRequest("A100", "gpu.available.tflops >= 150.0 && gpu.labels['environment'] == 'production'"), gpus: func() []*tfv1.GPU { - gpu1 := createTestGPU("gpu-1", "default", "A100", constants.PhaseRunning, 180.0, 40.0) - gpu1.Labels["environment"] = "production" - gpu2 := createTestGPU("gpu-2", "default", "A100", constants.PhaseRunning, 120.0, 40.0) + gpu1 := createTestGPU("gpu-1", "A100", constants.PhaseRunning, 180.0, 40.0) + gpu1.Labels["environment"] = testEnvProduction + gpu2 := createTestGPU("gpu-2", "A100", constants.PhaseRunning, 120.0, 40.0) gpu2.Labels["environment"] = "production" - gpu3 := createTestGPU("gpu-3", "default", "A100", constants.PhaseRunning, 200.0, 40.0) + gpu3 := createTestGPU("gpu-3", "A100", constants.PhaseRunning, 200.0, 40.0) gpu3.Labels["environment"] = "development" return []*tfv1.GPU{gpu1, gpu2, gpu3} }(), @@ -193,27 +199,27 @@ func TestCELFilter_NormalCases(t *testing.T) { }, { name: "filter by running apps - no running apps", - request: createTestAllocRequest("default", "test-workload", "", "size(gpu.runningApps) == 0"), + request: createTestAllocRequest("", "size(gpu.runningApps) == 0"), gpus: []*tfv1.GPU{ - createTestGPU("gpu-1", "default", "A100", constants.PhaseRunning, 150.0, 40.0), + createTestGPU("gpu-1", "A100", constants.PhaseRunning, 150.0, 40.0), func() *tfv1.GPU { - gpu := createTestGPU("gpu-2", "default", "A100", constants.PhaseRunning, 150.0, 40.0) + gpu := createTestGPU("gpu-2", "A100", constants.PhaseRunning, 150.0, 40.0) gpu.Status.RunningApps = []*tfv1.RunningAppDetail{ {Name: "app1", Namespace: "default", Count: 1}, } return gpu }(), - createTestGPU("gpu-3", "default", "A100", constants.PhaseRunning, 150.0, 40.0), + createTestGPU("gpu-3", "A100", constants.PhaseRunning, 150.0, 40.0), }, expectedCount: 2, description: "Should return GPUs with no running apps", }, { name: "filter by running apps - has specific app", - request: createTestAllocRequest("default", "test-workload", "", "gpu.runningApps.exists(app, app.name == 'training-job' && app.namespace == 'ml-team')"), + request: createTestAllocRequest("", "gpu.runningApps.exists(app, app.name == 'training-job' && app.namespace == 'ml-team')"), gpus: []*tfv1.GPU{ func() *tfv1.GPU { - gpu := createTestGPU("gpu-1", "default", "A100", constants.PhaseRunning, 150.0, 40.0) + gpu := createTestGPU("gpu-1", "A100", constants.PhaseRunning, 150.0, 40.0) gpu.Status.RunningApps = []*tfv1.RunningAppDetail{ {Name: "training-job", Namespace: "ml-team", Count: 2}, {Name: "other-job", Namespace: "default", Count: 1}, @@ -221,23 +227,23 @@ func TestCELFilter_NormalCases(t *testing.T) { return gpu }(), func() *tfv1.GPU { - gpu := createTestGPU("gpu-2", "default", "A100", constants.PhaseRunning, 150.0, 40.0) + gpu := createTestGPU("gpu-2", "A100", constants.PhaseRunning, 150.0, 40.0) gpu.Status.RunningApps = []*tfv1.RunningAppDetail{ {Name: 
"other-job", Namespace: "ml-team", Count: 1}, } return gpu }(), - createTestGPU("gpu-3", "default", "A100", constants.PhaseRunning, 150.0, 40.0), + createTestGPU("gpu-3", "A100", constants.PhaseRunning, 150.0, 40.0), }, expectedCount: 1, description: "Should return GPUs running specific training job", }, { name: "filter by running apps - count threshold", - request: createTestAllocRequest("default", "test-workload", "", "gpu.runningApps.all(app, app.count <= 2) && size(gpu.runningApps) > 0"), + request: createTestAllocRequest("", "gpu.runningApps.all(app, app.count <= 2) && size(gpu.runningApps) > 0"), gpus: []*tfv1.GPU{ func() *tfv1.GPU { - gpu := createTestGPU("gpu-1", "default", "A100", constants.PhaseRunning, 150.0, 40.0) + gpu := createTestGPU("gpu-1", "A100", constants.PhaseRunning, 150.0, 40.0) gpu.Status.RunningApps = []*tfv1.RunningAppDetail{ {Name: "job1", Namespace: "default", Count: 1}, {Name: "job2", Namespace: "default", Count: 2}, @@ -245,47 +251,47 @@ func TestCELFilter_NormalCases(t *testing.T) { return gpu }(), func() *tfv1.GPU { - gpu := createTestGPU("gpu-2", "default", "A100", constants.PhaseRunning, 150.0, 40.0) + gpu := createTestGPU("gpu-2", "A100", constants.PhaseRunning, 150.0, 40.0) gpu.Status.RunningApps = []*tfv1.RunningAppDetail{ {Name: "job1", Namespace: "default", Count: 5}, // Count > 2 } return gpu }(), - createTestGPU("gpu-3", "default", "A100", constants.PhaseRunning, 150.0, 40.0), // No running apps + createTestGPU("gpu-3", "A100", constants.PhaseRunning, 150.0, 40.0), // No running apps }, expectedCount: 1, description: "Should return GPUs where all running apps have count <= 2", }, { name: "filter by running apps - complex condition", - request: createTestAllocRequest("default", "test-workload", "A100", "gpu.available.tflops >= 150.0 && (size(gpu.runningApps) == 0 || gpu.runningApps.all(app, app.namespace != 'restricted'))"), + request: createTestAllocRequest("A100", "gpu.available.tflops >= 150.0 && (size(gpu.runningApps) == 0 || gpu.runningApps.all(app, app.namespace != 'restricted'))"), gpus: []*tfv1.GPU{ - createTestGPU("gpu-1", "default", "A100", constants.PhaseRunning, 150.0, 40.0), // No running apps + createTestGPU("gpu-1", "A100", constants.PhaseRunning, 150.0, 40.0), // No running apps func() *tfv1.GPU { - gpu := createTestGPU("gpu-2", "default", "A100", constants.PhaseRunning, 150.0, 40.0) + gpu := createTestGPU("gpu-2", "A100", constants.PhaseRunning, 150.0, 40.0) gpu.Status.RunningApps = []*tfv1.RunningAppDetail{ {Name: "job1", Namespace: "allowed", Count: 1}, } return gpu }(), func() *tfv1.GPU { - gpu := createTestGPU("gpu-3", "default", "A100", constants.PhaseRunning, 150.0, 40.0) + gpu := createTestGPU("gpu-3", "A100", constants.PhaseRunning, 150.0, 40.0) gpu.Status.RunningApps = []*tfv1.RunningAppDetail{ {Name: "job1", Namespace: "restricted", Count: 1}, // Restricted namespace } return gpu }(), - createTestGPU("gpu-4", "default", "V100", constants.PhaseRunning, 150.0, 40.0), // Wrong model + createTestGPU("gpu-4", "V100", constants.PhaseRunning, 150.0, 40.0), // Wrong model }, expectedCount: 2, description: "Should return A100 GPUs with sufficient resources and no restricted apps", }, { name: "filter by running apps - namespace isolation", - request: createTestAllocRequest("default", "test-workload", "", "!gpu.runningApps.exists(app, app.namespace == 'tenant-a')"), + request: createTestAllocRequest("", "!gpu.runningApps.exists(app, app.namespace == 'tenant-a')"), gpus: []*tfv1.GPU{ func() *tfv1.GPU { - gpu := createTestGPU("gpu-1", 
"default", "A100", constants.PhaseRunning, 150.0, 40.0) + gpu := createTestGPU("gpu-1", "A100", constants.PhaseRunning, 150.0, 40.0) gpu.Status.RunningApps = []*tfv1.RunningAppDetail{ {Name: "job1", Namespace: "tenant-b", Count: 1}, {Name: "job2", Namespace: "shared", Count: 1}, @@ -293,14 +299,14 @@ func TestCELFilter_NormalCases(t *testing.T) { return gpu }(), func() *tfv1.GPU { - gpu := createTestGPU("gpu-2", "default", "A100", constants.PhaseRunning, 150.0, 40.0) + gpu := createTestGPU("gpu-2", "A100", constants.PhaseRunning, 150.0, 40.0) gpu.Status.RunningApps = []*tfv1.RunningAppDetail{ {Name: "job1", Namespace: "tenant-a", Count: 1}, // Should be excluded {Name: "job2", Namespace: "tenant-b", Count: 1}, } return gpu }(), - createTestGPU("gpu-3", "default", "A100", constants.PhaseRunning, 150.0, 40.0), // No running apps + createTestGPU("gpu-3", "A100", constants.PhaseRunning, 150.0, 40.0), // No running apps }, expectedCount: 2, description: "Should return GPUs not running apps from tenant-a", @@ -341,11 +347,11 @@ func TestCELFilter_EdgeAndExceptionCases(t *testing.T) { t.Run("CEL expressions edge cases", func(t *testing.T) { // Test GPUs for execution testGPUs := []*tfv1.GPU{ - createTestGPU("gpu-1", "default", "A100", constants.PhaseRunning, 150.0, 40.0), - createTestGPU("gpu-2", "default", "V100", constants.PhaseRunning, 100.0, 32.0), + createTestGPU("gpu-1", "A100", constants.PhaseRunning, 150.0, 40.0), + createTestGPU("gpu-2", "V100", constants.PhaseRunning, 100.0, 32.0), } // Add GPU with nil resources - gpuWithNilResources := createTestGPU("gpu-nil", "default", "A100", constants.PhaseRunning, 0, 0) + gpuWithNilResources := createTestGPU("gpu-nil", "A100", constants.PhaseRunning, 0, 0) gpuWithNilResources.Status.Available = nil testGPUs = append(testGPUs, gpuWithNilResources) @@ -455,7 +461,7 @@ func TestCELFilter_EdgeAndExceptionCases(t *testing.T) { cache, err := NewExpressionCache(10, 5*time.Minute) require.NoError(t, err) - request := createTestAllocRequest("default", "test-workload", "", tt.expression) + request := createTestAllocRequest("", tt.expression) celFilter, err := NewCELFilter(request, cache) if tt.shouldFail { From de5b0c1df59d94df33e233cd069a131482ba7e08 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 15 Sep 2025 12:14:52 +0000 Subject: [PATCH 22/34] chore(deps): bump github.com/aws/aws-sdk-go-v2 from 1.38.3 to 1.39.0 (#362) --- go.mod | 4 ++-- go.sum | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/go.mod b/go.mod index e8da7faf..ff05257d 100644 --- a/go.mod +++ b/go.mod @@ -6,7 +6,7 @@ require ( github.com/DATA-DOG/go-sqlmock v1.5.2 github.com/NVIDIA/go-nvml v0.13.0-1 github.com/aliyun/alibaba-cloud-sdk-go v1.63.107 - github.com/aws/aws-sdk-go-v2 v1.38.3 + github.com/aws/aws-sdk-go-v2 v1.39.0 github.com/aws/aws-sdk-go-v2/service/ec2 v1.251.0 github.com/awslabs/operatorpkg v0.0.0-20250903180825-ba7ac0af36e5 github.com/gin-contrib/gzip v1.2.3 @@ -30,6 +30,7 @@ require ( gorm.io/gorm v1.30.3 k8s.io/api v0.34.0 k8s.io/apimachinery v0.34.0 + k8s.io/apiserver v0.34.0 k8s.io/client-go v0.34.0 k8s.io/component-base v0.34.0 k8s.io/component-helpers v0.34.0 @@ -174,7 +175,6 @@ require ( gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect k8s.io/apiextensions-apiserver v0.34.0 // indirect - k8s.io/apiserver v0.34.0 // indirect k8s.io/cloud-provider v0.34.0 // indirect k8s.io/controller-manager v0.34.0 // indirect k8s.io/csi-translation-lib v0.34.0 // indirect diff 
--git a/go.sum b/go.sum index 446e3470..b924ab09 100644 --- a/go.sum +++ b/go.sum @@ -24,8 +24,8 @@ github.com/antlr4-go/antlr/v4 v4.13.1 h1:SqQKkuVZ+zWkMMNkjy5FZe5mr5WURWnlpmOuzYW github.com/antlr4-go/antlr/v4 v4.13.1/go.mod h1:GKmUxMtwp6ZgGwZSva4eWPC5mS6vUAmOABFgjdkM7Nw= github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHSxpiH9JdtuBj0= github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY= -github.com/aws/aws-sdk-go-v2 v1.38.3 h1:B6cV4oxnMs45fql4yRH+/Po/YU+597zgWqvDpYMturk= -github.com/aws/aws-sdk-go-v2 v1.38.3/go.mod h1:sDioUELIUO9Znk23YVmIk86/9DOpkbyyVb1i/gUNFXY= +github.com/aws/aws-sdk-go-v2 v1.39.0 h1:xm5WV/2L4emMRmMjHFykqiA4M/ra0DJVSWUkDyBjbg4= +github.com/aws/aws-sdk-go-v2 v1.39.0/go.mod h1:sDioUELIUO9Znk23YVmIk86/9DOpkbyyVb1i/gUNFXY= github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.6 h1:uF68eJA6+S9iVr9WgX1NaRGyQ/6MdIyc4JNUo6TN1FA= github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.6/go.mod h1:qlPeVZCGPiobx8wb1ft0GHT5l+dc6ldnwInDFaMvC7Y= github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.6 h1:pa1DEC6JoI0zduhZePp3zmhWvk/xxm4NB8Hy/Tlsgos= From 3d9b2c43340e0e63f7384c7399fabe306342ed9d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 15 Sep 2025 12:50:55 +0000 Subject: [PATCH 23/34] chore(deps): bump gorm.io/gorm from 1.30.3 to 1.31.0 (#361) --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index ff05257d..f70cbce3 100644 --- a/go.mod +++ b/go.mod @@ -27,7 +27,7 @@ require ( gomodules.xyz/jsonpatch/v2 v2.5.0 gopkg.in/natefinch/lumberjack.v2 v2.2.1 gorm.io/driver/mysql v1.6.0 - gorm.io/gorm v1.30.3 + gorm.io/gorm v1.31.0 k8s.io/api v0.34.0 k8s.io/apimachinery v0.34.0 k8s.io/apiserver v0.34.0 diff --git a/go.sum b/go.sum index b924ab09..b5f04e5f 100644 --- a/go.sum +++ b/go.sum @@ -482,8 +482,8 @@ gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gorm.io/driver/mysql v1.6.0 h1:eNbLmNTpPpTOVZi8MMxCi2aaIm0ZpInbORNXDwyLGvg= gorm.io/driver/mysql v1.6.0/go.mod h1:D/oCC2GWK3M/dqoLxnOlaNKmXz8WNTfcS9y5ovaSqKo= -gorm.io/gorm v1.30.3 h1:QiG8upl0Sg9ba2Zatfjy0fy4It2iNBL2/eMdvEkdXNs= -gorm.io/gorm v1.30.3/go.mod h1:8Z33v652h4//uMA76KjeDH8mJXPm1QNCYrMeatR0DOE= +gorm.io/gorm v1.31.0 h1:0VlycGreVhK7RF/Bwt51Fk8v0xLiiiFdbGDPIZQ7mJY= +gorm.io/gorm v1.31.0/go.mod h1:XyQVbO2k6YkOis7C2437jSit3SsDK72s7n7rsSHd+Gs= k8s.io/api v0.34.0 h1:L+JtP2wDbEYPUeNGbeSa/5GwFtIA662EmT2YSLOkAVE= k8s.io/api v0.34.0/go.mod h1:YzgkIzOOlhl9uwWCZNqpw6RJy9L2FK4dlJeayUoydug= k8s.io/apiextensions-apiserver v0.34.0 h1:B3hiB32jV7BcyKcMU5fDaDxk882YrJ1KU+ZSkA9Qxoc= From ec36d4ad845dac0fa788ba52f6f74f2346aac0e0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 15 Sep 2025 12:59:46 +0000 Subject: [PATCH 24/34] chore(deps): bump k8s.io/client-go from 0.34.0 to 0.34.1 (#364) --- go.mod | 6 +++--- go.sum | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/go.mod b/go.mod index f70cbce3..991d385c 100644 --- a/go.mod +++ b/go.mod @@ -28,10 +28,10 @@ require ( gopkg.in/natefinch/lumberjack.v2 v2.2.1 gorm.io/driver/mysql v1.6.0 gorm.io/gorm v1.31.0 - k8s.io/api v0.34.0 - k8s.io/apimachinery v0.34.0 + k8s.io/api v0.34.1 + k8s.io/apimachinery v0.34.1 k8s.io/apiserver v0.34.0 - k8s.io/client-go v0.34.0 + k8s.io/client-go v0.34.1 
k8s.io/component-base v0.34.0 k8s.io/component-helpers v0.34.0 k8s.io/klog/v2 v2.130.1 diff --git a/go.sum b/go.sum index b5f04e5f..59cbea42 100644 --- a/go.sum +++ b/go.sum @@ -484,16 +484,16 @@ gorm.io/driver/mysql v1.6.0 h1:eNbLmNTpPpTOVZi8MMxCi2aaIm0ZpInbORNXDwyLGvg= gorm.io/driver/mysql v1.6.0/go.mod h1:D/oCC2GWK3M/dqoLxnOlaNKmXz8WNTfcS9y5ovaSqKo= gorm.io/gorm v1.31.0 h1:0VlycGreVhK7RF/Bwt51Fk8v0xLiiiFdbGDPIZQ7mJY= gorm.io/gorm v1.31.0/go.mod h1:XyQVbO2k6YkOis7C2437jSit3SsDK72s7n7rsSHd+Gs= -k8s.io/api v0.34.0 h1:L+JtP2wDbEYPUeNGbeSa/5GwFtIA662EmT2YSLOkAVE= -k8s.io/api v0.34.0/go.mod h1:YzgkIzOOlhl9uwWCZNqpw6RJy9L2FK4dlJeayUoydug= +k8s.io/api v0.34.1 h1:jC+153630BMdlFukegoEL8E/yT7aLyQkIVuwhmwDgJM= +k8s.io/api v0.34.1/go.mod h1:SB80FxFtXn5/gwzCoN6QCtPD7Vbu5w2n1S0J5gFfTYk= k8s.io/apiextensions-apiserver v0.34.0 h1:B3hiB32jV7BcyKcMU5fDaDxk882YrJ1KU+ZSkA9Qxoc= k8s.io/apiextensions-apiserver v0.34.0/go.mod h1:hLI4GxE1BDBy9adJKxUxCEHBGZtGfIg98Q+JmTD7+g0= -k8s.io/apimachinery v0.34.0 h1:eR1WO5fo0HyoQZt1wdISpFDffnWOvFLOOeJ7MgIv4z0= -k8s.io/apimachinery v0.34.0/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw= +k8s.io/apimachinery v0.34.1 h1:dTlxFls/eikpJxmAC7MVE8oOeP1zryV7iRyIjB0gky4= +k8s.io/apimachinery v0.34.1/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw= k8s.io/apiserver v0.34.0 h1:Z51fw1iGMqN7uJ1kEaynf2Aec1Y774PqU+FVWCFV3Jg= k8s.io/apiserver v0.34.0/go.mod h1:52ti5YhxAvewmmpVRqlASvaqxt0gKJxvCeW7ZrwgazQ= -k8s.io/client-go v0.34.0 h1:YoWv5r7bsBfb0Hs2jh8SOvFbKzzxyNo0nSb0zC19KZo= -k8s.io/client-go v0.34.0/go.mod h1:ozgMnEKXkRjeMvBZdV1AijMHLTh3pbACPvK7zFR+QQY= +k8s.io/client-go v0.34.1 h1:ZUPJKgXsnKwVwmKKdPfw4tB58+7/Ik3CrjOEhsiZ7mY= +k8s.io/client-go v0.34.1/go.mod h1:kA8v0FP+tk6sZA0yKLRG67LWjqufAoSHA2xVGKw9Of8= k8s.io/cloud-provider v0.34.0 h1:OgrNE+WSgfvDBQf6WS9qFM7Xr37bc0Og5kkL4hyWDmU= k8s.io/cloud-provider v0.34.0/go.mod h1:JbMa0t6JIGDMLI7Py6bdp9TN6cfuHrWGq+E/X+Ljkmo= k8s.io/component-base v0.34.0 h1:bS8Ua3zlJzapklsB1dZgjEJuJEeHjj8yTu1gxE2zQX8= From 40b98a8f40fe5be2b8f4ddeee9d71babec0453fe Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 15 Sep 2025 13:47:37 +0000 Subject: [PATCH 25/34] chore(deps): bump k8s.io/component-helpers from 0.34.0 to 0.34.1 (#360) --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 991d385c..e955a347 100644 --- a/go.mod +++ b/go.mod @@ -33,7 +33,7 @@ require ( k8s.io/apiserver v0.34.0 k8s.io/client-go v0.34.1 k8s.io/component-base v0.34.0 - k8s.io/component-helpers v0.34.0 + k8s.io/component-helpers v0.34.1 k8s.io/klog/v2 v2.130.1 k8s.io/kube-scheduler v0.34.0 k8s.io/kubernetes v1.34.0 diff --git a/go.sum b/go.sum index 59cbea42..d46ef39a 100644 --- a/go.sum +++ b/go.sum @@ -498,8 +498,8 @@ k8s.io/cloud-provider v0.34.0 h1:OgrNE+WSgfvDBQf6WS9qFM7Xr37bc0Og5kkL4hyWDmU= k8s.io/cloud-provider v0.34.0/go.mod h1:JbMa0t6JIGDMLI7Py6bdp9TN6cfuHrWGq+E/X+Ljkmo= k8s.io/component-base v0.34.0 h1:bS8Ua3zlJzapklsB1dZgjEJuJEeHjj8yTu1gxE2zQX8= k8s.io/component-base v0.34.0/go.mod h1:RSCqUdvIjjrEm81epPcjQ/DS+49fADvGSCkIP3IC6vg= -k8s.io/component-helpers v0.34.0 h1:5T7P9XGMoUy1JDNKzHf0p/upYbeUf8ZaSf9jbx0QlIo= -k8s.io/component-helpers v0.34.0/go.mod h1:kaOyl5tdtnymriYcVZg4uwDBe2d1wlIpXyDkt6sVnt4= +k8s.io/component-helpers v0.34.1 h1:gWhH3CCdwAx5P3oJqZKb4Lg5FYZTWVbdWtOI8n9U4XY= +k8s.io/component-helpers v0.34.1/go.mod h1:4VgnUH7UA/shuBur+OWoQC0xfb69sy/93ss0ybZqm3c= k8s.io/controller-manager v0.34.0 
h1:oCHoqS8dcFp7zDSu7HUvTpakq3isSxil3GprGGlJMsE= k8s.io/controller-manager v0.34.0/go.mod h1:XFto21U+Mm9BT8r/Jd5E4tHCGtwjKAUFOuDcqaj2VK0= k8s.io/csi-translation-lib v0.34.0 h1:WhCkq35XATZ+x6NKqI4u7XSYtmucuCN7jDk+mmm9XUU= From a45ba609eede9a2146098a93fa2ba32c96556ebe Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 15 Sep 2025 13:48:08 +0000 Subject: [PATCH 26/34] chore(deps): bump sigs.k8s.io/controller-runtime from 0.22.0 to 0.22.1 (#363) --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index e955a347..dc198d32 100644 --- a/go.mod +++ b/go.mod @@ -38,7 +38,7 @@ require ( k8s.io/kube-scheduler v0.34.0 k8s.io/kubernetes v1.34.0 k8s.io/utils v0.0.0-20250820121507-0af2bda4dd1d - sigs.k8s.io/controller-runtime v0.22.0 + sigs.k8s.io/controller-runtime v0.22.1 sigs.k8s.io/karpenter v1.6.2 sigs.k8s.io/yaml v1.6.0 ) diff --git a/go.sum b/go.sum index d46ef39a..0130fbbf 100644 --- a/go.sum +++ b/go.sum @@ -523,8 +523,8 @@ k8s.io/utils v0.0.0-20250820121507-0af2bda4dd1d/go.mod h1:OLgZIPagt7ERELqWJFomSt rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.33.0 h1:qPrZsv1cwQiFeieFlRqT627fVZ+tyfou/+S5S0H5ua0= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.33.0/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw= -sigs.k8s.io/controller-runtime v0.22.0 h1:mTOfibb8Hxwpx3xEkR56i7xSjB+nH4hZG37SrlCY5e0= -sigs.k8s.io/controller-runtime v0.22.0/go.mod h1:FwiwRjkRPbiN+zp2QRp7wlTCzbUXxZ/D4OzuQUDwBHY= +sigs.k8s.io/controller-runtime v0.22.1 h1:Ah1T7I+0A7ize291nJZdS1CabF/lB4E++WizgV24Eqg= +sigs.k8s.io/controller-runtime v0.22.1/go.mod h1:FwiwRjkRPbiN+zp2QRp7wlTCzbUXxZ/D4OzuQUDwBHY= sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg= sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= sigs.k8s.io/karpenter v1.6.2 h1:WFayZ49CSOaDMku1iYBTsD3A9hOB2yU/U95VcSAJ8KM= From 5867f3ccc17a48ffc4cf19a2fd5f1a28deae6ede Mon Sep 17 00:00:00 2001 From: Joey Yang <14833440+Code2Life@users.noreply.github.com> Date: Wed, 17 Sep 2025 22:13:01 +0800 Subject: [PATCH 27/34] feat: preempt support for GPU workers (#366) * fix: gpu info update * feat: preempt scheduling, fix metrics scheduling bugs, add evict protection * fix: unit test issue * fix: preempt unit testing * fix: lint issue, add qos to priorityClassName converting --- .vscode/settings.json | 2 + api/v1/gpupool_types.go | 6 + api/v1/gpuresourcequota_types.go | 2 + charts/tensor-fusion/Chart.yaml | 2 +- .../crds/tensor-fusion.ai_gpupools.yaml | 6 + ...tensor-fusion.ai_tensorfusionclusters.yaml | 6 + .../templates/controller-deployment.yaml | 1 + .../templates/gpu-public-gpu-info.yaml | 18 +- .../templates/priorityclass.yaml | 23 ++ charts/tensor-fusion/values.yaml | 4 +- cmd/main.go | 2 +- .../crd/bases/tensor-fusion.ai_gpupools.yaml | 6 + ...tensor-fusion.ai_tensorfusionclusters.yaml | 6 + internal/config/rules.go | 2 +- internal/constants/constants.go | 6 +- internal/controller/pod_controller.go | 10 +- internal/controller/pod_controller_test.go | 3 - .../tensorfusioncluster_controller.go | 36 +-- internal/gpuallocator/gpuallocator.go | 104 +++++- internal/metrics/recorder.go | 125 ++++++-- internal/metrics/types.go | 4 + internal/quota/quota_store.go | 71 +++-- .../scheduler/gpuresources/gpuresources.go | 108 ++++++- .../gpuresources/gpuresources_test.go | 13 
+- internal/utils/compose.go | 1 + internal/webhook/v1/pod_webhook.go | 23 +- internal/webhook/v1/tf_parser.go | 2 - patches/scheduler-pdb-1.patch | 31 +- test/sched/gpufit_bench_test.go | 1 - test/sched/preemption_test.go | 299 ++++++++++++++++++ test/sched/scheduler_bench_test.go | 3 +- test/sched/setup.go | 32 +- 32 files changed, 833 insertions(+), 125 deletions(-) create mode 100644 charts/tensor-fusion/templates/priorityclass.yaml create mode 100644 test/sched/preemption_test.go diff --git a/.vscode/settings.json b/.vscode/settings.json index 2a261510..a5da5620 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -47,6 +47,7 @@ "envtest", "essd", "Eventf", + "evictable", "featuregate", "finalizer", "Finalizers", @@ -133,6 +134,7 @@ "schedulingconfigtemplate", "schedulingconfigtemplates", "schedulingcorev", + "schedv", "serviceaccount", "shirou", "shortuuid", diff --git a/api/v1/gpupool_types.go b/api/v1/gpupool_types.go index 08d139b5..ca9224c4 100644 --- a/api/v1/gpupool_types.go +++ b/api/v1/gpupool_types.go @@ -238,6 +238,12 @@ type QosConfig struct { Definitions []QosDefinition `json:"definitions,omitempty"` DefaultQoS QoSLevel `json:"defaultQoS,omitempty"` Pricing []QosPricing `json:"pricing,omitempty"` + + // Eviction protection price ratio applied to cost calculation during protection period + // This multiplier increases pricing for protected workloads to discourage preemption + // +optional + // +kubebuilder:default="1.2" + EvictionProtectionPriceRatio string `json:"evictionProtectionPriceRatio,omitempty"` } type QosDefinition struct { diff --git a/api/v1/gpuresourcequota_types.go b/api/v1/gpuresourcequota_types.go index 1b28520a..bb8a5ff8 100644 --- a/api/v1/gpuresourcequota_types.go +++ b/api/v1/gpuresourcequota_types.go @@ -186,6 +186,8 @@ type AllocRequest struct { // record the pod meta for quota check PodMeta metav1.ObjectMeta + + QoS QoSLevel } func (p *AllocRequest) Clone() fwk.StateData { diff --git a/charts/tensor-fusion/Chart.yaml b/charts/tensor-fusion/Chart.yaml index d2dc9f06..042d05c2 100644 --- a/charts/tensor-fusion/Chart.yaml +++ b/charts/tensor-fusion/Chart.yaml @@ -15,7 +15,7 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 1.5.8 +version: 1.5.9 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. 
Versions are not expected to diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_gpupools.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_gpupools.yaml index 8bc65e66..2158529c 100644 --- a/charts/tensor-fusion/crds/tensor-fusion.ai_gpupools.yaml +++ b/charts/tensor-fusion/crds/tensor-fusion.ai_gpupools.yaml @@ -562,6 +562,12 @@ spec: type: integer type: object type: array + evictionProtectionPriceRatio: + default: "1.2" + description: |- + Eviction protection price ratio applied to cost calculation during protection period + This multiplier increases pricing for protected workloads to discourage preemption + type: string pricing: items: properties: diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionclusters.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionclusters.yaml index 45bc9a47..496541bc 100644 --- a/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionclusters.yaml +++ b/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionclusters.yaml @@ -629,6 +629,12 @@ spec: type: integer type: object type: array + evictionProtectionPriceRatio: + default: "1.2" + description: |- + Eviction protection price ratio applied to cost calculation during protection period + This multiplier increases pricing for protected workloads to discourage preemption + type: string pricing: items: properties: diff --git a/charts/tensor-fusion/templates/controller-deployment.yaml b/charts/tensor-fusion/templates/controller-deployment.yaml index ca09a6a1..c16c4aab 100644 --- a/charts/tensor-fusion/templates/controller-deployment.yaml +++ b/charts/tensor-fusion/templates/controller-deployment.yaml @@ -32,6 +32,7 @@ spec: {{- end }} serviceAccountName: {{ include "tensor-fusion.serviceAccountName" . }} enableServiceLinks: false + priorityClassName: "system-cluster-critical" containers: - name: controller image: "{{ .Values.controller.image.repository }}:{{ .Values.controller.image.tag | default .Chart.AppVersion }}" diff --git a/charts/tensor-fusion/templates/gpu-public-gpu-info.yaml b/charts/tensor-fusion/templates/gpu-public-gpu-info.yaml index d473fcfa..2c88583b 100644 --- a/charts/tensor-fusion/templates/gpu-public-gpu-info.yaml +++ b/charts/tensor-fusion/templates/gpu-public-gpu-info.yaml @@ -45,6 +45,18 @@ data: costPerHour: 1.64 fp16TFlops: 312 + - model: A100_PCIe_40GB + fullModelName: "NVIDIA A100-PCIE-40GB" + vendor: NVIDIA + costPerHour: 1.64 + fp16TFlops: 312 + + - model: A100_PCIe_80GB + fullModelName: "NVIDIA A100-PCIE-80GB" + vendor: NVIDIA + costPerHour: 1.64 + fp16TFlops: 312 + - model: A100_SXM_40G fullModelName: "NVIDIA A100-SXM4-40GB" vendor: NVIDIA @@ -70,13 +82,13 @@ data: fp16TFlops: 312 - model: A800_PCIe_80G - fullModelName: "NVIDIA A800 80GB PCIe" + fullModelName: "NVIDIA A800-PCIE-80GB" vendor: NVIDIA costPerHour: 1.64 fp16TFlops: 312 - model: A800_PCIe_40G - fullModelName: "NVIDIA A800 40GB PCIe" + fullModelName: "NVIDIA A800-PCIE-40GB" vendor: NVIDIA costPerHour: 1.64 fp16TFlops: 312 @@ -95,7 +107,7 @@ data: fp16TFlops: 125 - model: A40 - fullModelName: "NVIDIA A40 48GB PCIe" + fullModelName: "NVIDIA A40-PCIE-48GB" vendor: NVIDIA costPerHour: 0.4 fp16TFlops: 149.7 diff --git a/charts/tensor-fusion/templates/priorityclass.yaml b/charts/tensor-fusion/templates/priorityclass.yaml new file mode 100644 index 00000000..e1f493b8 --- /dev/null +++ b/charts/tensor-fusion/templates/priorityclass.yaml @@ -0,0 +1,23 @@ +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: tensor-fusion-critical +value: 100000 +globalDefault: false 
+description: "TensorFusion critical priority" +--- +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: tensor-fusion-high +value: 10000 +globalDefault: false +description: "TensorFusion high priority" +--- +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: tensor-fusion-medium +value: 0 +globalDefault: false +description: "TensorFusion medium priority" diff --git a/charts/tensor-fusion/values.yaml b/charts/tensor-fusion/values.yaml index 6b9fcc0c..2c06aba6 100644 --- a/charts/tensor-fusion/values.yaml +++ b/charts/tensor-fusion/values.yaml @@ -169,8 +169,8 @@ schedulerConfig: kind: KubeSchedulerConfiguration clientConnection: kubeconfig: "" - qps: 50 - burst: 100 + qps: 1000 + burst: 2000 profiles: # Refer: https://kubernetes.io/docs/reference/scheduling/config/ - schedulerName: tensor-fusion-scheduler diff --git a/cmd/main.go b/cmd/main.go index f00a6b2e..c0bd95ea 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -586,7 +586,7 @@ func startMetricsRecorder( // Worker level map will be updated by cluster reconcile // Key is poolName, second level key is QoS level - WorkerUnitPriceMap: make(map[string]map[string]metrics.RawBillingPricing), + WorkerUnitPriceMap: make(map[string]map[string]metrics.RawBillingPricing, 8), } if enableLeaderElection { go func() { diff --git a/config/crd/bases/tensor-fusion.ai_gpupools.yaml b/config/crd/bases/tensor-fusion.ai_gpupools.yaml index 8bc65e66..2158529c 100644 --- a/config/crd/bases/tensor-fusion.ai_gpupools.yaml +++ b/config/crd/bases/tensor-fusion.ai_gpupools.yaml @@ -562,6 +562,12 @@ spec: type: integer type: object type: array + evictionProtectionPriceRatio: + default: "1.2" + description: |- + Eviction protection price ratio applied to cost calculation during protection period + This multiplier increases pricing for protected workloads to discourage preemption + type: string pricing: items: properties: diff --git a/config/crd/bases/tensor-fusion.ai_tensorfusionclusters.yaml b/config/crd/bases/tensor-fusion.ai_tensorfusionclusters.yaml index 45bc9a47..496541bc 100644 --- a/config/crd/bases/tensor-fusion.ai_tensorfusionclusters.yaml +++ b/config/crd/bases/tensor-fusion.ai_tensorfusionclusters.yaml @@ -629,6 +629,12 @@ spec: type: integer type: object type: array + evictionProtectionPriceRatio: + default: "1.2" + description: |- + Eviction protection price ratio applied to cost calculation during protection period + This multiplier increases pricing for protected workloads to discourage preemption + type: string pricing: items: properties: diff --git a/internal/config/rules.go b/internal/config/rules.go index dd3713bd..8bbfb556 100644 --- a/internal/config/rules.go +++ b/internal/config/rules.go @@ -132,7 +132,7 @@ func (r *AlertRule) toPostableAlert(alertQueryResult map[string]interface{}, sta labels := LabelSet{ "alertname": r.Name, "severity": r.Severity, - "job": constants.AlertJobName, + "job": constants.TensorFusionSystemName, "instance": instance, } annotations := LabelSet{ diff --git a/internal/constants/constants.go b/internal/constants/constants.go index 81470022..8ccbcba1 100644 --- a/internal/constants/constants.go +++ b/internal/constants/constants.go @@ -107,6 +107,8 @@ const ( // For grey release TensorFusionEnabledReplicasAnnotation = Domain + "/enabled-replicas" TensorFusionDefaultPoolKeyAnnotation = Domain + "/is-default-pool" + // Eviction protection annotation for controlling pod eviction timing + EvictionProtectionAnnotation = Domain + "/eviction-protection" NamespaceDefaultVal = 
"tensor-fusion-sys" @@ -176,7 +178,7 @@ const TFDataPath = "/run/tensor-fusion" const TFDataPathWorkerExpr = "shm/$(POD_NAMESPACE)/$(POD_NAME)" const DataVolumeName = "tf-data" const TensorFusionPoolManualCompaction = Domain + "/manual-compaction" -const AlertJobName = "tensor-fusion" +const TensorFusionSystemName = "tensor-fusion" const ( LeaderInfoConfigMapName = "tensor-fusion-operator-leader-info" @@ -202,3 +204,5 @@ const ExtraVerificationInfoPodIDKey = "authentication.kubernetes.io/pod-uid" const SchedulerSimulationKey = "simulate-schedule" const MobileGpuClockSpeedMultiplier = 0.75 +const DefaultEvictionProtectionPriceRatio = 1.2 +const NodeCriticalPriorityClassName = "system-node-critical" diff --git a/internal/controller/pod_controller.go b/internal/controller/pod_controller.go index ab335948..a7bf7c2f 100644 --- a/internal/controller/pod_controller.go +++ b/internal/controller/pod_controller.go @@ -20,6 +20,7 @@ import ( "context" "fmt" "strconv" + "time" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" "github.com/NexusGPU/tensor-fusion/internal/constants" @@ -66,6 +67,7 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R if err := r.Get(ctx, req.NamespacedName, pod); err != nil { if errors.IsNotFound(err) { r.Allocator.DeallocByPodIdentifier(ctx, req.NamespacedName) + metrics.RemoveWorkerMetrics(req.Name, time.Now()) log.Info("Released GPU resources when pod deleted", "pod", req.NamespacedName) return ctrl.Result{}, nil } @@ -106,8 +108,9 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R } if pod.Labels[constants.LabelComponent] == constants.ComponentWorker { - metrics.SetWorkerMetricsByWorkload(pod) - + if pod.DeletionTimestamp.IsZero() { + metrics.SetWorkerMetricsByWorkload(pod) + } shouldReturn, err := r.handleWorkerPodFinalizer(ctx, pod) if err != nil { return ctrl.Result{}, err @@ -148,7 +151,8 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R func (r *PodReconciler) handleWorkerPodFinalizer(ctx context.Context, pod *corev1.Pod) (bool, error) { // Handle our GPU resource cleanup finalizer shouldReturn, err := utils.HandleFinalizer(ctx, pod, r.Client, func(ctx context.Context, obj *corev1.Pod) (bool, error) { - metrics.RemoveWorkerMetrics(pod.Name, pod.DeletionTimestamp.Time) + // if the Pod keep terminating, should update deletion timestamp for raw cost calculation + metrics.RemoveWorkerMetrics(pod.Name, time.Now()) counter := &v1.TensorFusionPodCounter{Client: r.Client} if err := counter.Decrease(ctx, pod); err != nil { return false, err diff --git a/internal/controller/pod_controller_test.go b/internal/controller/pod_controller_test.go index b36f140f..cf53d119 100644 --- a/internal/controller/pod_controller_test.go +++ b/internal/controller/pod_controller_test.go @@ -230,9 +230,6 @@ var _ = Describe("Pod Controller", func() { }, } _ = k8sClient.Delete(ctx, connection) - Eventually(func() error { - return k8sClient.Get(ctx, client.ObjectKeyFromObject(connection), connection) - }).Should(Satisfy(errors.IsNotFound)) }) It("should successfully create TensorFusion connection for client pod", func() { diff --git a/internal/controller/tensorfusioncluster_controller.go b/internal/controller/tensorfusioncluster_controller.go index d4f464c3..3c64429e 100644 --- a/internal/controller/tensorfusioncluster_controller.go +++ b/internal/controller/tensorfusioncluster_controller.go @@ -20,7 +20,6 @@ import ( "context" "fmt" "strconv" - "strings" "sync" "golang.org/x/time/rate" @@ -304,7 
+303,7 @@ func (r *TensorFusionClusterReconciler) reconcileGPUPool(ctx context.Context, tf } err = r.Create(ctx, gpupool) anyPoolChanged = true - r.updateMetricsRecorder(ctx, gpupool) + r.MetricsRecorder.UpdateMetricsRecorder(gpupool, true) if err != nil { errors = append(errors, fmt.Errorf("failed to create GPUPool %s: %w", key, err)) continue @@ -327,7 +326,7 @@ func (r *TensorFusionClusterReconciler) reconcileGPUPool(ctx context.Context, tf } anyPoolChanged = true } - r.updateMetricsRecorder(ctx, existingPool) + r.MetricsRecorder.UpdateMetricsRecorder(existingPool, specChanged) } } @@ -440,34 +439,3 @@ func (r *TensorFusionClusterReconciler) SetupWithManager(mgr ctrl.Manager, addLi Owns(&tfv1.GPUPool{}). Complete(r) } - -// Update metrics recorder's raw billing map -func (r *TensorFusionClusterReconciler) updateMetricsRecorder(ctx context.Context, pool *tfv1.GPUPool) { - const dollarSign = "$" - log := log.FromContext(ctx) - if pool.Spec.QosConfig == nil { - log.Info("QosConfig is nil, skip updating metrics recorder", "pool", pool.Name) - return - } - - qosConfig := pool.Spec.QosConfig - if _, ok := r.MetricsRecorder.WorkerUnitPriceMap[pool.Name]; !ok { - r.MetricsRecorder.WorkerUnitPriceMap[pool.Name] = make(map[string]metrics.RawBillingPricing) - } - pricingDetail := r.MetricsRecorder.WorkerUnitPriceMap[pool.Name] - for _, pricing := range qosConfig.Pricing { - tflopsPerHour, _ := strconv.ParseFloat(strings.TrimPrefix(pricing.Requests.PerFP16TFlopsPerHour, dollarSign), 64) - vramPerHour, _ := strconv.ParseFloat(strings.TrimPrefix(pricing.Requests.PerGBOfVRAMPerHour, dollarSign), 64) - limitOverRequestChargingRatio, _ := strconv.ParseFloat(pricing.LimitsOverRequestsChargingRatio, 64) - - pricingDetail[string(pricing.Qos)] = metrics.RawBillingPricing{ - TflopsPerSecond: tflopsPerHour / float64(3600), - VramPerSecond: vramPerHour / float64(3600), - - TflopsOverRequestPerSecond: tflopsPerHour / float64(3600) * limitOverRequestChargingRatio, - VramOverRequestPerSecond: vramPerHour / float64(3600) * limitOverRequestChargingRatio, - } - } - - log.V(5).Info("Updated metrics recorder", "pool", pool.Name, "pricing", pricingDetail) -} diff --git a/internal/gpuallocator/gpuallocator.go b/internal/gpuallocator/gpuallocator.go index d2259a34..fb475377 100644 --- a/internal/gpuallocator/gpuallocator.go +++ b/internal/gpuallocator/gpuallocator.go @@ -192,6 +192,37 @@ func (s *GpuAllocator) Filter( return filteredGPUs, filterDetails, nil } +func (s *GpuAllocator) FilterWithPreempt( + req *tfv1.AllocRequest, + preemptAllocRequests []*tfv1.AllocRequest, +) ([]*tfv1.GPU, []filter.FilterDetail, error) { + toFilterGPUs := []*tfv1.GPU{} + for _, preemptAllocRequest := range preemptAllocRequests { + for _, gpuName := range preemptAllocRequest.GPUNames { + gpu := s.gpuStore[types.NamespacedName{Name: gpuName}] + if gpu == nil { + return nil, nil, fmt.Errorf("gpu %s not found", gpuName) + } + gpuCopy := gpu.DeepCopy() + gpuCopy.Status.Available.Tflops.Add(preemptAllocRequest.Request.Tflops) + gpuCopy.Status.Available.Vram.Add(preemptAllocRequest.Request.Vram) + toFilterGPUs = append(toFilterGPUs, gpuCopy) + } + } + + filterRegistry := s.filterRegistry.With(filter.NewResourceFilter(req.Request)) + // Add GPU model filter if specified + if req.GPUModel != "" { + filterRegistry = filterRegistry.With(filter.NewGPUModelFilter(req.GPUModel)) + } + // No need to check count and other filters since it's always in the same node during each preempt trial + filteredGPUs, filterDetails, err := filterRegistry.Apply(s.ctx, 
req.WorkloadNameNamespace, toFilterGPUs, false) + if err != nil { + return nil, nil, fmt.Errorf("apply filters: %w", err) + } + return filteredGPUs, filterDetails, nil +} + func (s *GpuAllocator) Select(req *tfv1.AllocRequest, filteredGPUs []*tfv1.GPU) ([]*tfv1.GPU, error) { pool := &tfv1.GPUPool{} if err := s.Get(s.ctx, client.ObjectKey{Name: req.PoolName}, pool); err != nil { @@ -314,9 +345,8 @@ func (s *GpuAllocator) Alloc(req *tfv1.AllocRequest) ([]*tfv1.GPU, error) { func (s *GpuAllocator) CheckQuotaAndFilter(ctx context.Context, req *tfv1.AllocRequest, isSimulateSchedule bool) ([]*tfv1.GPU, []filter.FilterDetail, error) { <-s.initializedCh - // Fast quota check (fail fast if quota insufficient) if err := s.quotaStore.CheckQuotaAvailable(req.WorkloadNameNamespace.Namespace, req); err != nil { - return nil, nil, fmt.Errorf("quota check failed: %w", err) + return nil, nil, err } // Get GPUs from the pool using the in-memory store @@ -935,7 +965,7 @@ func (s *GpuAllocator) handleGPUUpdate(ctx context.Context, gpu *tfv1.GPU) { log.V(6).Info("Updated GPU in store (new entry)", "name", key.Name, "phase", gpu.Status.Phase) } - s.addOrUpdateGPUMaps(gpu) + s.addOrUpdateGPUMaps(s.gpuStore[key]) } func (s *GpuAllocator) addOrUpdateGPUMaps(gpuInMem *tfv1.GPU) { @@ -1138,6 +1168,68 @@ func (s *GpuAllocator) ReconcileAllocationState() { }) } +func (s *GpuAllocator) ReconcileAllocationStateForTesting() { + s.reconcileAllocationState() +} + +func (s *GpuAllocator) CheckQuotaAndFilterSingleNodePreempt( + nodeName string, allocReq *tfv1.AllocRequest, toPreemptPods sets.Set[types.NamespacedName], +) error { + <-s.initializedCh + // Only need to check total quotas when preempting + toPreemptUsage := &tfv1.GPUResourceUsage{ + Requests: tfv1.Resource{ + Tflops: resource.Quantity{}, + Vram: resource.Quantity{}, + }, + Limits: tfv1.Resource{ + Tflops: resource.Quantity{}, + Vram: resource.Quantity{}, + }, + } + workers := s.nodeWorkerStore[nodeName] + preemptAllocRequests := make([]*tfv1.AllocRequest, 0, len(workers)) + for workerName := range workers { + if !toPreemptPods.Has(workerName) { + continue + } + podUID := s.podNamespaceNsToPodUID[workerName.String()] + if podUID == "" { + continue + } + existingAllocation := s.uniqueAllocation[podUID] + if existingAllocation == nil { + continue + } + toPreemptUsage.Requests.Tflops.Add(existingAllocation.Request.Tflops) + toPreemptUsage.Requests.Vram.Add(existingAllocation.Request.Vram) + toPreemptUsage.Limits.Tflops.Add(existingAllocation.Limit.Tflops) + toPreemptUsage.Limits.Vram.Add(existingAllocation.Limit.Vram) + preemptAllocRequests = append(preemptAllocRequests, existingAllocation) + } + + if log.FromContext(s.ctx).V(5).Enabled() { + log.FromContext(s.ctx).V(5).Info("Preempting node and check quotas", "nodeName", nodeName, "toPreemptUsage", toPreemptUsage) + } + + if err := s.quotaStore.CheckTotalQuotaRelaxed(allocReq, toPreemptUsage); err != nil { + return fmt.Errorf("quota check failed during preempt: %w", err) + } + + // Get GPUs from the pool using the in-memory store + if allocReq.PoolName == "" { + return fmt.Errorf("GPU Pool name is empty, can not find GPUs during preempt") + } + filteredGPUs, _, err := s.FilterWithPreempt(allocReq, preemptAllocRequests) + if err != nil { + return err + } + if len(filteredGPUs) < int(allocReq.Count) { + return fmt.Errorf("no gpus available or valid in pool %s after filtering during preempt", allocReq.PoolName) + } + return nil +} + func (s *GpuAllocator) reconcileAllocationState() { ctx := s.ctx logger := 
log.FromContext(ctx) @@ -1314,6 +1406,11 @@ func (s *GpuAllocator) ComposeAllocationRequest(pod *v1.Pod) (*tfv1.AllocRequest return &tfv1.AllocRequest{}, "gpu count annotation is too large", nil } + qosLevel := tfv1.QoSLevel(pod.Annotations[constants.QoSLevelAnnotation]) + if qosLevel == "" { + qosLevel = tfv1.QoSMedium + } + allocRequest := tfv1.AllocRequest{ PoolName: pod.Annotations[constants.GpuPoolKey], Request: gpuRequestResource, @@ -1326,6 +1423,7 @@ func (s *GpuAllocator) ComposeAllocationRequest(pod *v1.Pod) (*tfv1.AllocRequest Namespace: pod.Namespace, }, PodMeta: pod.ObjectMeta, + QoS: qosLevel, } // for already allocated workers, set the GPU device IDs for further scaling and retrieval diff --git a/internal/metrics/recorder.go b/internal/metrics/recorder.go index f1c14a39..7f47bab6 100644 --- a/internal/metrics/recorder.go +++ b/internal/metrics/recorder.go @@ -4,6 +4,7 @@ import ( "io" "math" "strconv" + "strings" "sync" "time" @@ -19,15 +20,17 @@ import ( // Worker level metrics, include worker resources/costs status // map updated in one reconcile loop in single goroutine, thus no RW lock needed var workerMetricsLock sync.RWMutex -var workerMetricsMap = map[string]*WorkerResourceMetrics{} +var workerMetricsMap = make(map[string]*WorkerResourceMetrics, 200) // Node level metrics, include node allocation/costs status var nodeMetricsLock sync.RWMutex -var nodeMetricsMap = map[string]*NodeResourceMetrics{} +var nodeMetricsMap = make(map[string]*NodeResourceMetrics, 100) // Pool level metrics, include pool allocation/costs status var poolMetricsLock sync.RWMutex -var poolMetricsMap = map[string]*PoolResourceMetrics{} +var poolMetricsMap = make(map[string]*PoolResourceMetrics, 4) + +var settingLock sync.RWMutex var log = ctrl.Log.WithName("metrics-recorder") @@ -37,6 +40,9 @@ type MetricsRecorder struct { // Raw billing result for node and workers HourlyUnitPriceMap map[string]float64 + // Pool level eviction protection price ratio map, key is pool name + PoolEvictionProtectionPriceRatioMap map[string]string + // Worker level unit price map, key is pool name, second level key is QoS level WorkerUnitPriceMap map[string]map[string]RawBillingPricing } @@ -80,14 +86,16 @@ func SetWorkerMetricsByWorkload(pod *corev1.Pod) { // Initialize metrics if _, ok := workerMetricsMap[pod.Name]; !ok { workerMetricsMap[pod.Name] = &WorkerResourceMetrics{ - WorkerName: pod.Name, - WorkloadName: pod.Labels[constants.WorkloadKey], - PoolName: pod.Annotations[constants.GpuPoolKey], - Namespace: pod.Namespace, - QoS: pod.Annotations[constants.QoSLevelAnnotation], - podLabels: pod.Labels, - RawCost: 0, - LastRecordTime: time.Now(), + WorkerName: pod.Name, + WorkloadName: pod.Labels[constants.WorkloadKey], + PoolName: pod.Annotations[constants.GpuPoolKey], + Namespace: pod.Namespace, + QoS: pod.Annotations[constants.QoSLevelAnnotation], + podLabels: pod.Labels, + RawCost: 0, + LastRecordTime: time.Now(), + creationTime: pod.CreationTimestamp.Time, + evictionProtection: pod.Annotations[constants.EvictionProtectionAnnotation], } } @@ -287,13 +295,17 @@ func (mr *MetricsRecorder) Start() { // Clean up worker metrics that have been deleted go func() { for { - time.Sleep(5 * time.Minute) + time.Sleep(1 * time.Minute) workerMetricsLock.Lock() - for _, metrics := range workerMetricsMap { + var keysToDelete []string + for key, metrics := range workerMetricsMap { if metrics.deletionTimestamp != nil && !metrics.deletionTimestamp.IsZero() { - delete(workerMetricsMap, metrics.WorkerName) + keysToDelete = 
append(keysToDelete, key) } } + for _, key := range keysToDelete { + delete(workerMetricsMap, key) + } workerMetricsLock.Unlock() } }() @@ -306,13 +318,12 @@ func (mr *MetricsRecorder) RecordMetrics(writer io.Writer) { now := time.Now() enc := NewEncoder(config.GetGlobalConfig().MetricsFormat) - workerMetricsLock.RLock() + workerMetricsLock.Lock() activeWorkerCnt := 0 activeWorkerAndNodeByPool := map[string]*ActiveNodeAndWorker{} for _, metrics := range workerMetricsMap { - if metrics.deletionTimestamp != nil && !metrics.deletionTimestamp.IsZero() { metrics.RawCost = mr.getWorkerRawCost(metrics, metrics.deletionTimestamp.Sub(metrics.LastRecordTime)) } else { @@ -333,7 +344,9 @@ func (mr *MetricsRecorder) RecordMetrics(writer io.Writer) { nodeCnt: 0, } } - activeWorkerAndNodeByPool[metrics.PoolName].workerCnt++ + if metrics.deletionTimestamp == nil || metrics.deletionTimestamp.IsZero() { + activeWorkerAndNodeByPool[metrics.PoolName].workerCnt++ + } enc.StartLine("tf_worker_resources") enc.AddTag("namespace", metrics.Namespace) @@ -362,7 +375,7 @@ func (mr *MetricsRecorder) RecordMetrics(writer io.Writer) { enc.EndLine(now) } - workerMetricsLock.RUnlock() + workerMetricsLock.Unlock() nodeMetricsLock.RLock() @@ -439,7 +452,51 @@ func (mr *MetricsRecorder) RecordMetrics(writer io.Writer) { log.Info("metrics and raw billing recorded:", "workerCount", activeWorkerCnt, "nodeCount", len(nodeMetricsMap)) } +// Update metrics recorder's raw billing map +func (r *MetricsRecorder) UpdateMetricsRecorder(pool *tfv1.GPUPool, specChanged bool) { + const dollarSign = "$" + settingLock.Lock() + defer settingLock.Unlock() + if pool.Spec.QosConfig == nil { + log.Info("QosConfig is nil, skip updating metrics recorder", "pool", pool.Name) + return + } + + qosConfig := pool.Spec.QosConfig + if _, ok := r.WorkerUnitPriceMap[pool.Name]; !ok { + r.WorkerUnitPriceMap[pool.Name] = make(map[string]RawBillingPricing) + } + + if r.PoolEvictionProtectionPriceRatioMap == nil { + r.PoolEvictionProtectionPriceRatioMap = make(map[string]string, 4) + } + r.PoolEvictionProtectionPriceRatioMap[pool.Name] = qosConfig.EvictionProtectionPriceRatio + + pricingDetail := r.WorkerUnitPriceMap[pool.Name] + if !specChanged && len(pricingDetail) == 0 { + return + } + // Pricing potentially changed + for _, pricing := range qosConfig.Pricing { + tflopsPerHour, _ := strconv.ParseFloat(strings.TrimPrefix(pricing.Requests.PerFP16TFlopsPerHour, dollarSign), 64) + vramPerHour, _ := strconv.ParseFloat(strings.TrimPrefix(pricing.Requests.PerGBOfVRAMPerHour, dollarSign), 64) + limitOverRequestChargingRatio, _ := strconv.ParseFloat(pricing.LimitsOverRequestsChargingRatio, 64) + + pricingDetail[string(pricing.Qos)] = RawBillingPricing{ + TflopsPerSecond: tflopsPerHour / float64(3600), + VramPerSecond: vramPerHour / float64(3600), + + TflopsOverRequestPerSecond: tflopsPerHour / float64(3600) * limitOverRequestChargingRatio, + VramOverRequestPerSecond: vramPerHour / float64(3600) * limitOverRequestChargingRatio, + } + } + + log.V(5).Info("Updated metrics recorder", "pool", pool.Name, "pricing", pricingDetail) +} + func (mr *MetricsRecorder) getWorkerRawCost(metrics *WorkerResourceMetrics, duration time.Duration) float64 { + settingLock.RLock() + defer settingLock.RUnlock() qosPricing, ok := mr.WorkerUnitPriceMap[metrics.PoolName] // The qos pricing for this pool not set if !ok { @@ -464,7 +521,37 @@ func (mr *MetricsRecorder) getWorkerRawCost(metrics *WorkerResourceMetrics, dura rawCostVRAMLimitOverRequest := (metrics.VramBytesLimit - 
metrics.VramBytesRequest) * pricing.VramOverRequestPerSecond / constants.GiBToBytes rawCostPerVRAM := pricing.VramPerSecond * metrics.VramBytesRequest / constants.GiBToBytes - return (rawCostPerTflops + rawCostPerVRAM + rawCostTflopsLimitOverRequest + rawCostVRAMLimitOverRequest) * duration.Seconds() * float64(metrics.GPUCount) + baseCost := (rawCostPerTflops + rawCostPerVRAM + rawCostTflopsLimitOverRequest + rawCostVRAMLimitOverRequest) * duration.Seconds() * float64(metrics.GPUCount) + + // Apply eviction protection price ratio if the pod is under protection and QoS is not critical + if metrics.evictionProtection != "" && qosLevel != constants.QoSLevelCritical { + if isUnderProtection := mr.isUnderEvictionProtection(metrics); isUnderProtection { + protectionPriceRatio := mr.PoolEvictionProtectionPriceRatioMap[metrics.PoolName] + protectionPriceRatioFloat, _ := strconv.ParseFloat(protectionPriceRatio, 64) + if protectionPriceRatioFloat < 1 { + protectionPriceRatioFloat = constants.DefaultEvictionProtectionPriceRatio + } + baseCost *= protectionPriceRatioFloat + } + } + + return baseCost +} + +// isUnderEvictionProtection checks if a worker is under eviction protection +func (mr *MetricsRecorder) isUnderEvictionProtection(metrics *WorkerResourceMetrics) bool { + if metrics.evictionProtection == "" { + return false + } + + // Parse protection duration (1h, 5h, 24h, etc.) + duration, err := time.ParseDuration(metrics.evictionProtection) + if err != nil { + return false + } + + protectionEndTime := metrics.creationTime.Add(duration) + return time.Now().Before(protectionEndTime) } // unit price data comes from global config map, and multi-GPU instance should normalized with per GPU pricing, e.g. 8xA100 p4d.24xlarge price should divide by 8 diff --git a/internal/metrics/types.go b/internal/metrics/types.go index ff3449cb..df06f169 100644 --- a/internal/metrics/types.go +++ b/internal/metrics/types.go @@ -51,6 +51,10 @@ type WorkerResourceMetrics struct { // For more accurate metrics, should record the deletion timestamp to calculate duration for the last metrics deletionTimestamp *time.Time + // Fields for eviction protection tracking - private, not stored in TSDB + creationTime time.Time + evictionProtection string + podLabels map[string]string } diff --git a/internal/quota/quota_store.go b/internal/quota/quota_store.go index 4edc7445..d9450236 100644 --- a/internal/quota/quota_store.go +++ b/internal/quota/quota_store.go @@ -79,7 +79,16 @@ func (qs *QuotaStore) CheckQuotaAvailable(namespace string, req *tfv1.AllocReque if err := qs.checkSingleQuotas(entry, req); err != nil { return err } - return qs.checkTotalQuotas(entry, req) + return qs.checkTotalQuotas(entry, req, nil) +} + +func (qs *QuotaStore) CheckTotalQuotaRelaxed(req *tfv1.AllocRequest, toReleaseResource *tfv1.GPUResourceUsage) error { + entry, exists := qs.QuotaStore[req.WorkloadNameNamespace.Namespace] + if !exists { + // No quota defined for this namespace, allow allocation + return nil + } + return qs.checkTotalQuotas(entry, req, toReleaseResource) } func (qs *QuotaStore) AdjustQuota(namespace string, reqDelta tfv1.Resource, limitDelta tfv1.Resource) { @@ -103,41 +112,51 @@ func (qs *QuotaStore) checkSingleQuotas(entry *QuotaStoreEntry, req *tfv1.AllocR if single.MaxLimits != nil { if !single.MaxLimits.Tflops.IsZero() && req.Limit.Tflops.Cmp(single.MaxLimits.Tflops) > 0 { return &QuotaExceededError{ - Namespace: entry.Quota.Namespace, - Resource: MaxTFlopsLimitResource, - Requested: req.Limit.Tflops, - Limit: 
single.MaxLimits.Tflops, + Namespace: entry.Quota.Namespace, + Resource: MaxTFlopsLimitResource, + Requested: req.Limit.Tflops, + Limit: single.MaxLimits.Tflops, + Unresolvable: true, } } // Check single VRAM limit (per GPU) if !single.MaxLimits.Vram.IsZero() && req.Request.Vram.Cmp(single.MaxLimits.Vram) > 0 { return &QuotaExceededError{ - Namespace: entry.Quota.Namespace, - Resource: MaxVRAMLimitResource, - Requested: req.Request.Vram, - Limit: single.MaxLimits.Vram, + Namespace: entry.Quota.Namespace, + Resource: MaxVRAMLimitResource, + Requested: req.Request.Vram, + Limit: single.MaxLimits.Vram, + Unresolvable: true, } } // Check single GPU count limit (per worker) if single.MaxGPUCount != nil && int32(req.Count) > *single.MaxGPUCount { return &QuotaExceededError{ - Namespace: entry.Quota.Namespace, - Resource: MaxGPULimitResource, - Requested: *resource.NewQuantity(int64(req.Count), resource.DecimalSI), - Limit: *resource.NewQuantity(int64(*single.MaxGPUCount), resource.DecimalSI), + Namespace: entry.Quota.Namespace, + Resource: MaxGPULimitResource, + Requested: *resource.NewQuantity(int64(req.Count), resource.DecimalSI), + Limit: *resource.NewQuantity(int64(*single.MaxGPUCount), resource.DecimalSI), + Unresolvable: true, } } } return nil } -func (qs *QuotaStore) checkTotalQuotas(entry *QuotaStoreEntry, req *tfv1.AllocRequest) error { +func (qs *QuotaStore) checkTotalQuotas(entry *QuotaStoreEntry, req *tfv1.AllocRequest, toReleaseResource *tfv1.GPUResourceUsage) error { quotaNs := entry.Quota.Namespace + + // Check total requests if entry.Quota.Spec.Total.Requests != nil { total := entry.Quota.Spec.Total.Requests - current := entry.CurrentUsage.Requests + current := *entry.CurrentUsage.Requests.DeepCopy() + + if toReleaseResource != nil { + current.Tflops.Sub(toReleaseResource.Requests.Tflops) + current.Vram.Sub(toReleaseResource.Requests.Vram) + } err := checkTotalExceeded(req, total, current, quotaNs, true) if err != nil { return err @@ -147,13 +166,24 @@ func (qs *QuotaStore) checkTotalQuotas(entry *QuotaStoreEntry, req *tfv1.AllocRe // Check total limits if entry.Quota.Spec.Total.Limits != nil { total := entry.Quota.Spec.Total.Limits - usage := entry.CurrentUsage.Limits + usage := *entry.CurrentUsage.Limits.DeepCopy() + + if toReleaseResource != nil { + usage.Tflops.Sub(toReleaseResource.Limits.Tflops) + usage.Vram.Sub(toReleaseResource.Limits.Vram) + } err := checkTotalExceeded(req, total, usage, quotaNs, false) if err != nil { return err } } + // If it's preempt case, skip checking total workers since it's + // replacing existing workers rather than creating new ones + if toReleaseResource != nil { + return nil + } + // Check total workers, each allocation will create one worker instance if entry.Quota.Spec.Total.MaxWorkers != nil { if entry.CurrentUsage.Workers >= *entry.Quota.Spec.Total.MaxWorkers { @@ -451,10 +481,11 @@ func (qs *QuotaStore) SyncQuotasToK8s(ctx context.Context) { // QuotaExceededError represents a quota exceeded error with detailed information type QuotaExceededError struct { - Namespace string - Resource string - Requested resource.Quantity - Limit resource.Quantity + Namespace string + Resource string + Requested resource.Quantity + Limit resource.Quantity + Unresolvable bool } func (e *QuotaExceededError) Error() string { diff --git a/internal/scheduler/gpuresources/gpuresources.go b/internal/scheduler/gpuresources/gpuresources.go index 861b95eb..8dbd16cc 100644 --- a/internal/scheduler/gpuresources/gpuresources.go +++ 
b/internal/scheduler/gpuresources/gpuresources.go @@ -6,12 +6,14 @@ import ( "sort" "strconv" "strings" + "sync" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" "github.com/NexusGPU/tensor-fusion/internal/config" "github.com/NexusGPU/tensor-fusion/internal/constants" "github.com/NexusGPU/tensor-fusion/internal/gpuallocator" "github.com/NexusGPU/tensor-fusion/internal/metrics" + "github.com/NexusGPU/tensor-fusion/internal/quota" "github.com/NexusGPU/tensor-fusion/internal/utils" "github.com/samber/lo" v1 "k8s.io/api/core/v1" @@ -57,6 +59,12 @@ type GPUSchedulingStateData struct { // In Reserve stage, bind GPUs to pod, update allocator cache // In PostBind stage, fetch final GPUs call Pod patch API to update annotation FinalGPUs []string + + // Preempt pods + PreemptPods sync.Map + + // IsPreemption + IsPreemption bool } func (p *GPUSchedulingStateData) Clone() fwk.StateData { @@ -135,7 +143,16 @@ func (s *GPUFit) PreFilter(ctx context.Context, state fwk.CycleState, pod *v1.Po s.fh.EventRecorder().Eventf(pod, pod, v1.EventTypeWarning, "GPUQuotaOrCapacityNotEnough", "check quota and filter", "TensorFusion schedule failed, no enough resource or quotas: "+err.Error()) s.logger.Error(err, "failed to check quota and filter", "pod", pod.Name) - return nil, fwk.NewStatus(fwk.Unschedulable, err.Error()) + + if quotaErr, ok := err.(*quota.QuotaExceededError); ok { + if quotaErr.Unresolvable { + return nil, fwk.NewStatus(fwk.UnschedulableAndUnresolvable, quotaErr.Error()) + } else { + return nil, fwk.NewStatus(fwk.Unschedulable, err.Error()) + } + } else { + return nil, fwk.NewStatus(fwk.Unschedulable, err.Error()) + } } validNodesValidGPUs := lo.GroupBy(filteredGPUs, func(gpu *tfv1.GPU) string { @@ -143,10 +160,14 @@ func (s *GPUFit) PreFilter(ctx context.Context, state fwk.CycleState, pod *v1.Po }) validNodeNonMatchingGPUs := make(map[string][]*tfv1.GPU, len(validNodesValidGPUs)) - nodeNames := sets.New[string]() + cnt := 0 + allGPUNodeNames := sets.New[string]() nodeGPUs := s.allocator.GetNodeGpuStore() + for k := range nodeGPUs { + allGPUNodeNames.Insert(k) + } for k, matchedGPUs := range validNodesValidGPUs { - nodeNames.Insert(k) + cnt++ // get all GPUs on this node allGPUs := nodeGPUs[k] @@ -180,7 +201,7 @@ func (s *GPUFit) PreFilter(ctx context.Context, state fwk.CycleState, pod *v1.Po } } } - s.logger.Info("filtered valid node GPUs", "nodes count", nodeNames.Len(), "pod", pod.Name) + s.logger.Info("filtered valid node GPUs", "nodes count", cnt, "pod", pod.Name) // assign score based on different strategies score := s.allocator.Score(ctx, s.cfg, allocRequest, validNodesValidGPUs) @@ -189,7 +210,7 @@ func (s *GPUFit) PreFilter(ctx context.Context, state fwk.CycleState, pod *v1.Po notMatchingGPUScore := s.allocator.Score(ctx, s.cfg, allocRequest, validNodeNonMatchingGPUs) s.fh.EventRecorder().Eventf(pod, pod, v1.EventTypeNormal, "PreScheduleDone", "pre filter for TensorFusion workload", - "TensorFusion pre schedule done, valid GPU node count: "+strconv.Itoa(nodeNames.Len())) + "TensorFusion pre schedule done, valid GPU node count: "+strconv.Itoa(cnt)) if s.logger.V(6).Enabled() { jsonStr, _ := json.Marshal(validNodesValidGPUs) @@ -202,15 +223,66 @@ func (s *GPUFit) PreFilter(ctx context.Context, state fwk.CycleState, pod *v1.Po ValidNodeGPUScore: score, ValidNodeNotMatchingGPUScore: notMatchingGPUScore, FinalGPUs: []string{}, + PreemptPods: sync.Map{}, + IsPreemption: false, }) return &framework.PreFilterResult{ - NodeNames: nodeNames, + NodeNames: allGPUNodeNames, }, fwk.NewStatus(fwk.Success) 
} func (s *GPUFit) PreFilterExtensions() framework.PreFilterExtensions { - return nil + return s +} + +func (s *GPUFit) AddPod(ctx context.Context, state fwk.CycleState, pod *v1.Pod, podInfoToAdd fwk.PodInfo, nodeInfo fwk.NodeInfo) *fwk.Status { + stateData, err := state.Read(CycleStateGPUSchedulingResult) + if err != nil { + return fwk.NewStatus(fwk.Error, err.Error()) + } + stateDataParsed := stateData.(*GPUSchedulingStateData) + if pods, ok := stateDataParsed.PreemptPods.Load(nodeInfo.Node().Name); ok { + podsParsed := pods.(sets.Set[types.NamespacedName]) + + nameNs := types.NamespacedName{ + Namespace: podInfoToAdd.GetPod().Namespace, + Name: podInfoToAdd.GetPod().Name, + } + if podsParsed.Has(nameNs) { + podsParsed.Delete(nameNs) + } + } + return fwk.NewStatus(fwk.Success, "") +} + +func (s *GPUFit) RemovePod(ctx context.Context, state fwk.CycleState, pod *v1.Pod, podInfoToRemove fwk.PodInfo, nodeInfo fwk.NodeInfo) *fwk.Status { + stateData, err := state.Read(CycleStateGPUSchedulingResult) + if err != nil { + if fwk.ErrNotFound == err { + stateData = &GPUSchedulingStateData{ + PreemptPods: sync.Map{}, + } + state.Write(CycleStateGPUSchedulingResult, stateData) + } else { + return fwk.NewStatus(fwk.Error, err.Error()) + } + } + stateDataParsed := stateData.(*GPUSchedulingStateData) + stateDataParsed.IsPreemption = true + if pods, ok := stateDataParsed.PreemptPods.Load(nodeInfo.Node().Name); ok { + parsedPods := pods.(sets.Set[types.NamespacedName]) + parsedPods.Insert(types.NamespacedName{ + Namespace: podInfoToRemove.GetPod().Namespace, + Name: podInfoToRemove.GetPod().Name, + }) + } else { + stateDataParsed.PreemptPods.Store(nodeInfo.Node().Name, sets.New(types.NamespacedName{ + Namespace: podInfoToRemove.GetPod().Namespace, + Name: podInfoToRemove.GetPod().Name, + })) + } + return fwk.NewStatus(fwk.Success, "") } func (s *GPUFit) Filter(ctx context.Context, state fwk.CycleState, pod *v1.Pod, nodeInfo fwk.NodeInfo) *fwk.Status { @@ -222,6 +294,28 @@ func (s *GPUFit) Filter(ctx context.Context, state fwk.CycleState, pod *v1.Pod, if err != nil { return fwk.NewStatus(fwk.Error, err.Error()) } + + // k8s will RemoveAll Pods, and run Filter for high priority pod, + // then Scheduler framework will reprieve victims one by one until filter returns unschedulable + if filterResult.(*GPUSchedulingStateData).IsPreemption { + allocRequest, err := state.Read(CycleStateAllocateRequest) + allocRequestParsed := allocRequest.(*tfv1.AllocRequest) + if err != nil { + return fwk.NewStatus(fwk.Error, err.Error()) + } + podsToPreempt, ok := filterResult.(*GPUSchedulingStateData).PreemptPods.Load(nodeInfo.Node().Name) + if !ok { + return fwk.NewStatus(fwk.Unschedulable, "no pods to preempt") + } + podsToPreemptParsed := podsToPreempt.(sets.Set[types.NamespacedName]) + err = s.allocator.CheckQuotaAndFilterSingleNodePreempt( + nodeInfo.Node().Name, allocRequestParsed, podsToPreemptParsed) + if err != nil { + return fwk.NewStatus(fwk.Unschedulable, err.Error()) + } + return fwk.NewStatus(fwk.Success, "") + } + nodeName := nodeInfo.Node().Name if _, ok := filterResult.(*GPUSchedulingStateData).NodeGPUs[nodeName]; !ok { return fwk.NewStatus(fwk.Unschedulable, "no valid node found, gpu capacity not enough") diff --git a/internal/scheduler/gpuresources/gpuresources_test.go b/internal/scheduler/gpuresources/gpuresources_test.go index 71af8c0f..5fa25150 100644 --- a/internal/scheduler/gpuresources/gpuresources_test.go +++ b/internal/scheduler/gpuresources/gpuresources_test.go @@ -7,6 +7,7 @@ import ( "testing" 
"time" + "github.com/samber/lo" "github.com/stretchr/testify/suite" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" @@ -352,7 +353,7 @@ func (s *GPUResourcesSuite) TestPreFilter() { s.Equal(tt.expectedStatus, status.Code(), status.Message()) if tt.expectedStatus == fwk.Success { s.Require().NotNil(res) - nodes := sort.StringSlice(res.NodeNames.UnsortedList()) + nodes := sort.StringSlice(getPreFilterResult(state)) nodes.Sort() s.Equal(tt.expectedNodes, strings.Join(nodes, " ")) } @@ -623,7 +624,7 @@ func (s *GPUResourcesSuite) TestScoreExtensions() { func (s *GPUResourcesSuite) TestPreFilterExtensions() { log.FromContext(s.ctx).Info("Running TestPreFilterExtensions") - s.Nil(s.plugin.PreFilterExtensions()) + s.NotNil(s.plugin.PreFilterExtensions()) } func (s *GPUResourcesSuite) TestName() { @@ -728,3 +729,11 @@ func (s *GPUResourcesSuite) TestScore_ErrorHandling() { _, status = s.plugin.Score(s.ctx, state, pod, nodeInfo) s.Equal(fwk.Unschedulable, status.Code()) } + +func getPreFilterResult(state *framework.CycleState) []string { + data, err := state.Read(CycleStateGPUSchedulingResult) + if err != nil { + return nil + } + return lo.Keys(data.(*GPUSchedulingStateData).NodeGPUs) +} diff --git a/internal/utils/compose.go b/internal/utils/compose.go index 2a62af0b..8802c6ce 100644 --- a/internal/utils/compose.go +++ b/internal/utils/compose.go @@ -350,6 +350,7 @@ func AddTFHypervisorConfAfterTemplate(ctx context.Context, spec *v1.PodSpec, poo // Hypervisor needs to read /proc to map pod with processID spec.HostPID = true spec.TerminationGracePeriodSeconds = constants.GracefulPeriodSeconds + spec.PriorityClassName = constants.NodeCriticalPriorityClassName enableVector := pool.Spec.ComponentConfig.Hypervisor != nil && pool.Spec.ComponentConfig.Hypervisor.EnableVector diff --git a/internal/webhook/v1/pod_webhook.go b/internal/webhook/v1/pod_webhook.go index 8c5aca06..6ea04125 100644 --- a/internal/webhook/v1/pod_webhook.go +++ b/internal/webhook/v1/pod_webhook.go @@ -164,6 +164,12 @@ func (m *TensorFusionPodMutator) Handle(ctx context.Context, req admission.Reque utils.AddOrOverrideTFClientMissingAnnotationsBeforePatch(pod, tfInfo) utils.AddTFDefaultClientConfBeforePatch(ctx, pod, pool, tfInfo, containerIndices) + // Add priorityClass if contains higher QoS level and Pod priority class not specified + if pod.Spec.PriorityClassName == "" && + (tfInfo.Profile.Qos == tfv1.QoSHigh || tfInfo.Profile.Qos == tfv1.QoSCritical) { + pod.Spec.PriorityClassName = constants.TensorFusionSystemName + string(tfInfo.Profile.Qos) + } + // Inject initContainer and env variables patches, err := m.patchTFClient( pod, pool, tfInfo.Profile.IsLocalGPU, currentBytes, containerIndices, @@ -517,16 +523,17 @@ func (m *TensorFusionPodMutator) assignClusterHostPortFromLeader(pod *corev1.Pod } func calculateQoSLevel(profile *tfv1.WorkloadProfileSpec, pool *tfv1.GPUPool) tfv1.QoSLevel { - sameReqLimits := profile.Resources.Limits.Tflops.Cmp(profile.Resources.Requests.Tflops) == 0 && - profile.Resources.Limits.Vram.Cmp(profile.Resources.Requests.Vram) == 0 - - // set to critical if req == limits, same logic as Kubernetes QoS - if sameReqLimits { - return constants.QoSLevelCritical - } - // when not set, assign default QoS if profile.Qos == "" { + sameReqLimits := profile.Resources.Limits.Tflops.Cmp(profile.Resources.Requests.Tflops) == 0 && + profile.Resources.Limits.Vram.Cmp(profile.Resources.Requests.Vram) == 0 + + // set to high if req == limits, same logic as Kubernetes QoS + // critical QoS can preempt other 
pods, have to be set manually + if sameReqLimits { + return constants.QoSLevelHigh + } + if pool.Spec.QosConfig == nil || pool.Spec.QosConfig.DefaultQoS == "" { return constants.QoSLevelMedium } diff --git a/internal/webhook/v1/tf_parser.go b/internal/webhook/v1/tf_parser.go index 51da5358..2541b08b 100644 --- a/internal/webhook/v1/tf_parser.go +++ b/internal/webhook/v1/tf_parser.go @@ -258,7 +258,5 @@ func handleDedicatedGPU(pod *corev1.Pod, workloadProfile *tfv1.WorkloadProfile) workloadProfile.Spec.Resources.Requests.Vram = resource.Vram workloadProfile.Spec.Resources.Limits.Tflops = resource.Tflops workloadProfile.Spec.Resources.Limits.Vram = resource.Vram - workloadProfile.Spec.Qos = tfv1.QoSCritical - return nil } diff --git a/patches/scheduler-pdb-1.patch b/patches/scheduler-pdb-1.patch index ae9b966e..3a35e841 100644 --- a/patches/scheduler-pdb-1.patch +++ b/patches/scheduler-pdb-1.patch @@ -1,16 +1,38 @@ ---- ../vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption.go 2025-08-06 17:45:27 -+++ ../vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption.go 2025-08-06 17:45:19 -@@ -20,7 +20,9 @@ +--- ../vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption.go 2025-09-15 17:45:27 ++++ ../vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption.go 2025-09-15 17:45:19 +@@ -20,7 +20,10 @@ "context" "fmt" "math/rand" + "os" "sort" + "strconv" ++ "time" v1 "k8s.io/api/core/v1" policy "k8s.io/api/policy/v1" -@@ -364,5 +366,13 @@ +@@ -119,6 +122,20 @@ + // Default behavior: No additional filtering, beyond the internal requirement that the victim pod + // have lower priority than the preemptor pod. + pl.IsEligiblePod = func(nodeInfo fwk.NodeInfo, victim fwk.PodInfo, preemptor *v1.Pod) bool { ++ victimAnnotation := victim.GetPod().Annotations ++ if victimAnnotation == nil { ++ return true ++ } ++ if protectionPeriod, ok := victimAnnotation["tensor-fusion.ai/eviction-protection"]; ok { ++ duration, err := time.ParseDuration(protectionPeriod) ++ if err != nil { ++ return true ++ } ++ // Still in protection period, not allow to preempt ++ if time.Now().Before(victim.GetPod().CreationTimestamp.Add(duration)) { ++ return false ++ } ++ } + return true + } + +@@ -430,5 +447,13 @@ } func getPDBLister(informerFactory informers.SharedInformerFactory) policylisters.PodDisruptionBudgetLister { @@ -24,3 +46,4 @@ + } return informerFactory.Policy().V1().PodDisruptionBudgets().Lister() } + \ No newline at end of file diff --git a/test/sched/gpufit_bench_test.go b/test/sched/gpufit_bench_test.go index 3acb53d4..147d31e8 100644 --- a/test/sched/gpufit_bench_test.go +++ b/test/sched/gpufit_bench_test.go @@ -20,7 +20,6 @@ func BenchmarkGPUFitPlugin(b *testing.B) { NumNodes: 500, NumGPUs: 3000, NumPods: 10000, - BatchSize: 1, PoolName: "test-pool", Namespace: "test-ns", Timeout: 5 * time.Minute, diff --git a/test/sched/preemption_test.go b/test/sched/preemption_test.go new file mode 100644 index 00000000..1715d61b --- /dev/null +++ b/test/sched/preemption_test.go @@ -0,0 +1,299 @@ +package sched + +import ( + "context" + "fmt" + "strings" + "testing" + "time" + + "github.com/NexusGPU/tensor-fusion/cmd/sched" + "github.com/NexusGPU/tensor-fusion/internal/constants" + gpuResourceFitPlugin "github.com/NexusGPU/tensor-fusion/internal/scheduler/gpuresources" + gpuTopoPlugin "github.com/NexusGPU/tensor-fusion/internal/scheduler/gputopo" + 
"github.com/NexusGPU/tensor-fusion/internal/utils" + "github.com/stretchr/testify/require" + "go.uber.org/zap/zapcore" + v1 "k8s.io/api/core/v1" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/klog/v2" + "k8s.io/kubernetes/cmd/kube-scheduler/app" + "k8s.io/kubernetes/pkg/scheduler" + st "k8s.io/kubernetes/pkg/scheduler/testing" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/envtest" + "sigs.k8s.io/controller-runtime/pkg/log/zap" +) + +// PreemptionTestSuite holds common test setup for preemption tests +type PreemptionTestSuite struct { + ctx context.Context + cancel context.CancelFunc + k8sClient client.Client + scheduler *scheduler.Scheduler + fixture *BenchmarkFixture + testEnv *envtest.Environment + kubeconfigPath string +} + +// SetupSuite initializes the test environment for preemption tests +func (pts *PreemptionTestSuite) SetupSuite(t *testing.T) { + klog.SetLogger(zap.New(zap.WriteTo(discardWriter{}), zap.UseDevMode(false), zap.Level(zapcore.InfoLevel))) + + // Setup test environment + ver, cfg, err := setupKubernetes() + require.NoError(t, err) + pts.testEnv = testEnv + + kubeconfigPath, err := writeKubeconfigToTempFileAndSetEnv(cfg) + require.NoError(t, err) + pts.kubeconfigPath = kubeconfigPath + + k8sClient, err := client.New(cfg, client.Options{Scheme: scheme.Scheme}) + require.NoError(t, err) + pts.k8sClient = k8sClient + + // Configure test with limited resources for preemption scenarios + benchConfig := BenchmarkConfig{ + NumNodes: 2, + NumGPUs: 4, + PoolName: "preemption-test-pool", + Namespace: "preemption-test-ns", + Timeout: 1 * time.Minute, + } + + mockBench := &testing.B{} + fixture := NewBenchmarkFixture(mockBench, benchConfig, k8sClient, true) + pts.fixture = fixture + + utils.SetProgressiveMigration(false) + + gpuResourceFitOpt := app.WithPlugin( + gpuResourceFitPlugin.Name, + gpuResourceFitPlugin.NewWithDeps(fixture.allocator, fixture.client), + ) + gpuTopoOpt := app.WithPlugin( + gpuTopoPlugin.Name, + gpuTopoPlugin.NewWithDeps(fixture.allocator, fixture.client), + ) + + ctx, cancel := context.WithCancel(context.Background()) + pts.ctx = ctx + pts.cancel = cancel + + cc, scheduler, err := sched.SetupScheduler(ctx, nil, + "../../config/samples/scheduler-config.yaml", true, ver, gpuResourceFitOpt, gpuTopoOpt) + require.NoError(t, err) + pts.scheduler = scheduler + scheduler.SchedulingQueue.Run(klog.FromContext(ctx)) + + // Start scheduler components + cc.EventBroadcaster.StartRecordingToSink(ctx.Done()) + cc.InformerFactory.Start(ctx.Done()) + cc.InformerFactory.WaitForCacheSync(ctx.Done()) + require.NoError(t, scheduler.WaitForHandlersSync(ctx)) +} + +// TearDownSuite cleans up the test environment +func (pts *PreemptionTestSuite) TearDownSuite(t *testing.T) { + if pts.cancel != nil { + pts.cancel() + } + if pts.fixture != nil { + pts.fixture.Close() + } + if pts.kubeconfigPath != "" { + require.NoError(t, cleanupKubeconfigTempFile(pts.kubeconfigPath)) + } + if pts.testEnv != nil { + require.NoError(t, pts.testEnv.Stop()) + } +} + +// discardWriter implements io.Writer to discard log output during tests +type discardWriter struct{} + +func (discardWriter) Write(p []byte) (n int, err error) { + return len(p), nil +} + +// TestPreemption tests comprehensive preemption scenarios +func TestPreemption(t *testing.T) { + suite := &PreemptionTestSuite{} + suite.SetupSuite(t) + defer suite.TearDownSuite(t) + testGPUResourcePreemption(t, suite) +} + +// TestPreemptionEvictProtection tests comprehensive preemption scenarios +func 
TestPreemptionEvictProtection(t *testing.T) { + suite := &PreemptionTestSuite{} + suite.SetupSuite(t) + defer suite.TearDownSuite(t) + testGPUResourceEvictProtection(t, suite) +} + +// testGPUResourcePreemption tests GPU shortage detection logic +func testGPUResourcePreemption(t *testing.T, suite *PreemptionTestSuite) { + // Mock cluster resources + // {"2250", "141Gi"}, // Simulate B200 + // {"989", "80Gi"}, // Simulate H100 + // {"450", "48Gi"}, // Simulate L40s + // {"312", "40Gi"}, // Simulate A100 + + // Create pods that will exhaust resources + toBeVictimPods := createPreemptionTestPodsWithQoS("victim", constants.QoSLevelMedium, 7+3+1+1, "300", "1Gi") + + for _, pod := range toBeVictimPods { + require.NoError(t, suite.k8sClient.Create(suite.ctx, pod)) + defer func() { + _ = suite.k8sClient.Delete(suite.ctx, pod) + }() + } + + // Try scheduling all pending pods + for range 12 { + suite.scheduler.ScheduleOne(suite.ctx) + } + + // schedule high priority pod + highPriorityPod := createPreemptionTestPodsWithQoS("high-priority", constants.QoSLevelHigh, 1, "300", "1Gi")[0] + require.NoError(t, suite.k8sClient.Create(suite.ctx, highPriorityPod)) + defer func() { + _ = suite.k8sClient.Delete(suite.ctx, highPriorityPod) + }() + + suite.scheduler.ScheduleOne(suite.ctx) + + // schedule critical priority pod + criticalPriorityPod := createPreemptionTestPodsWithQoS( + "critical-priority", constants.QoSLevelCritical, 1, "300", "1Gi")[0] + require.NoError(t, suite.k8sClient.Create(suite.ctx, criticalPriorityPod)) + defer func() { + _ = suite.k8sClient.Delete(suite.ctx, criticalPriorityPod) + }() + suite.scheduler.ScheduleOne(suite.ctx) + + // Preemption should be triggered and victims deleted, wait informer sync + time.Sleep(1 * time.Second) + + podList := &v1.PodList{} + err := suite.k8sClient.List(suite.ctx, podList, &client.ListOptions{Namespace: "preemption-test-ns"}) + require.NoError(t, err) + scheduledNodeMap := make(map[string]string) + for _, pod := range podList.Items { + scheduledNodeMap[pod.Name] = pod.Spec.NodeName + } + // 2 Pods deleted, 14 - 2 = 12 + require.Equal(t, 12, len(podList.Items)) + + // without Pod Controller, directly reconcile all state to simulate the Pod deletion + suite.fixture.allocator.ReconcileAllocationStateForTesting() + + // Trigger next 2 scheduling cycle, make sure the two higher priority pods are scheduled + suite.scheduler.ScheduleOne(suite.ctx) + suite.scheduler.ScheduleOne(suite.ctx) + + time.Sleep(1 * time.Second) + + err = suite.k8sClient.List(suite.ctx, podList, &client.ListOptions{Namespace: "preemption-test-ns"}) + require.NoError(t, err) + for _, pod := range podList.Items { + if strings.Contains(pod.Name, "victim") { + continue + } + scheduledNodeMap[pod.Name] = pod.Spec.NodeName + } + // not empty indicates the high priority pod is scheduled + require.NotEmpty(t, scheduledNodeMap["high-priority-0"]) + require.NotEmpty(t, scheduledNodeMap["critical-priority-0"]) +} + +func testGPUResourceEvictProtection(t *testing.T, suite *PreemptionTestSuite) { + toBeVictimPods := createPreemptionTestPodsWithQoS("victim", constants.QoSLevelMedium, 1, "2000", "2Gi") + toBeVictimPods[0].Annotations[constants.EvictionProtectionAnnotation] = "2s" + require.NoError(t, suite.k8sClient.Create(suite.ctx, toBeVictimPods[0])) + defer func() { + _ = suite.k8sClient.Delete(suite.ctx, toBeVictimPods[0]) + }() + + suite.scheduler.ScheduleOne(suite.ctx) + + toBeVictimPods = createPreemptionTestPodsWithQoS("high-priority", constants.QoSLevelHigh, 1, "2000", "2Gi") + 
require.NoError(t, suite.k8sClient.Create(suite.ctx, toBeVictimPods[0])) + defer func() { + _ = suite.k8sClient.Delete(suite.ctx, toBeVictimPods[0]) + }() + + // should not evict since it's inside protection period + suite.scheduler.ScheduleOne(suite.ctx) + + podList := &v1.PodList{} + err := suite.k8sClient.List(suite.ctx, podList, &client.ListOptions{Namespace: "preemption-test-ns"}) + require.NoError(t, err) + require.Equal(t, 2, len(podList.Items)) + + // should evict since protection period over + time.Sleep(2 * time.Second) + suite.scheduler.ScheduleOne(suite.ctx) + + suite.fixture.allocator.ReconcileAllocationStateForTesting() + + // Should schedule the new high priority pod + suite.scheduler.ScheduleOne(suite.ctx) + // waiting for binding cycle take effect + time.Sleep(300 * time.Millisecond) + + podList = &v1.PodList{} + err = suite.k8sClient.List(suite.ctx, podList, &client.ListOptions{Namespace: "preemption-test-ns"}) + require.NoError(t, err) + require.Equal(t, 1, len(podList.Items)) + require.Equal(t, "high-priority-0", podList.Items[0].Name) + require.Equal(t, "node-0", podList.Items[0].Spec.NodeName) +} + +// Helper functions +func createPreemptionTestPodsWithQoS(baseName, qosLevel string, count int, tflops, vram string) []*v1.Pod { + pods := make([]*v1.Pod, count) + for i := 0; i < count; i++ { + pod := st.MakePod(). + Namespace("preemption-test-ns"). + Name(fmt.Sprintf("%s-%d", baseName, i)). + UID(fmt.Sprintf("%s-%d", baseName, i)). + SchedulerName("tensor-fusion-scheduler"). + Res(map[v1.ResourceName]string{ + v1.ResourceCPU: "100m", + v1.ResourceMemory: "256Mi", + }). + Toleration("node.kubernetes.io/not-ready"). + ZeroTerminationGracePeriod().Obj() + + pod.Labels = map[string]string{ + constants.LabelComponent: constants.ComponentWorker, + constants.WorkloadKey: "test-workload", + } + + pod.Annotations = map[string]string{ + constants.GpuPoolKey: "preemption-test-pool", + constants.QoSLevelAnnotation: qosLevel, + constants.TFLOPSRequestAnnotation: tflops, + constants.VRAMRequestAnnotation: vram, + constants.TFLOPSLimitAnnotation: tflops, + constants.VRAMLimitAnnotation: vram, + constants.GpuCountAnnotation: "1", + } + pod.Spec.PriorityClassName = "tensor-fusion-" + qosLevel + + pods[i] = pod + } + return pods +} + +// func createPreemptionTestPodsWithEvictionProtection( +// namespace, baseName, qosLevel, protectionDuration string, count int, tflops, vram string) []*v1.Pod { +// pods := createPreemptionTestPodsWithQoS(namespace, baseName, qosLevel, count, tflops, vram) +// for _, pod := range pods { +// pod.Annotations[constants.EvictionProtectionAnnotation] = protectionDuration +// } +// return pods +// } diff --git a/test/sched/scheduler_bench_test.go b/test/sched/scheduler_bench_test.go index fde318bd..bbed548f 100644 --- a/test/sched/scheduler_bench_test.go +++ b/test/sched/scheduler_bench_test.go @@ -36,10 +36,9 @@ func defaultBenchmarkConfig() BenchmarkConfig { NumNodes: 1000, NumGPUs: 4000, NumPods: 10000, - BatchSize: 100, PoolName: "benchmark-pool", Namespace: "benchmark-ns", - Timeout: 10 * time.Minute, + Timeout: 5 * time.Minute, } } diff --git a/test/sched/setup.go b/test/sched/setup.go index 6fa4167d..5dc80e32 100644 --- a/test/sched/setup.go +++ b/test/sched/setup.go @@ -14,6 +14,7 @@ import ( gpuResourceFitPlugin "github.com/NexusGPU/tensor-fusion/internal/scheduler/gpuresources" "github.com/stretchr/testify/require" v1 "k8s.io/api/core/v1" + schedv1 "k8s.io/api/scheduling/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 
"k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -31,6 +32,7 @@ import ( "k8s.io/kubernetes/pkg/scheduler/metrics" st "k8s.io/kubernetes/pkg/scheduler/testing" tf "k8s.io/kubernetes/pkg/scheduler/testing/framework" + "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" ) @@ -40,7 +42,6 @@ type BenchmarkConfig struct { NumNodes int NumGPUs int NumPods int - BatchSize int PoolName string Namespace string Timeout time.Duration @@ -89,7 +90,7 @@ func NewBenchmarkFixture( b.Logf("%d Pods created, Needed TFLOPS: %f, Needed VRAM: %f", len(pods), neededTflops, neededVRAM) // Batch create resources for better performance - k8sNativeObjects := batchCreateResources(b, ctx, client, nodes, gpus, pods, realAPIServer) + k8sNativeObjects := batchCreateResources(b, ctx, client, config.Namespace, nodes, gpus, pods, realAPIServer) // Setup allocator allocator := setupAllocator(b, ctx, client) @@ -178,10 +179,10 @@ func generateGPUs(totalGPUs int, nodes []*v1.Node, poolName string) ([]*tfv1.GPU // Pre-define GPU specs to avoid repeated allocations gpuSpecs := []struct{ tflops, vram string }{ - {"2250", "141Gi"}, // High-end - {"989", "80Gi"}, // Mid-range - {"450", "48Gi"}, // Entry-level - {"312", "40Gi"}, // Budget + {"2250", "141Gi"}, // Simulate B200 + {"989", "80Gi"}, // Simulate H100 + {"450", "48Gi"}, // Simulate L40s + {"312", "40Gi"}, // Simulate A100 } gpuIndex := 0 @@ -287,12 +288,27 @@ func generatePods(count int, namespace, poolName string) ([]*v1.Pod, float64, fl // Helper functions for setup func batchCreateResources( - b *testing.B, ctx context.Context, client client.Client, + b *testing.B, ctx context.Context, client client.Client, namespace string, nodes []*v1.Node, gpus []*tfv1.GPU, pods []*v1.Pod, realAPIServer bool, ) []runtime.Object { + // Create priority classes + require.NoError(b, client.Create(ctx, &schedv1.PriorityClass{ + ObjectMeta: metav1.ObjectMeta{Name: "tensor-fusion-" + constants.QoSLevelCritical}, + Value: 100000, + })) + require.NoError(b, client.Create(ctx, &schedv1.PriorityClass{ + ObjectMeta: metav1.ObjectMeta{Name: "tensor-fusion-" + constants.QoSLevelHigh}, + Value: 10000, + })) + require.NoError(b, client.Create(ctx, &schedv1.PriorityClass{ + ObjectMeta: metav1.ObjectMeta{Name: "tensor-fusion-" + constants.QoSLevelMedium}, + Value: 100, + PreemptionPolicy: ptr.To(v1.PreemptNever), + })) + k8sObjs := []runtime.Object{} require.NoError(b, client.Create(ctx, &v1.Namespace{ - ObjectMeta: metav1.ObjectMeta{Name: "benchmark-ns"}, + ObjectMeta: metav1.ObjectMeta{Name: namespace}, })) timer := time.Now() From 4fc9dc9bbf43e00a8e63aec889b087be7b994aa9 Mon Sep 17 00:00:00 2001 From: 0x5457 <0x5457@protonmail.com> Date: Wed, 17 Sep 2025 22:16:01 +0800 Subject: [PATCH 28/34] fix: add resource validation in Bind to prevent GPU over-allocation (#365) - Add double-check for TFLOPs and VRAM availability before allocation --- internal/gpuallocator/gpuallocator.go | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/internal/gpuallocator/gpuallocator.go b/internal/gpuallocator/gpuallocator.go index fb475377..a4ec9958 100644 --- a/internal/gpuallocator/gpuallocator.go +++ b/internal/gpuallocator/gpuallocator.go @@ -285,6 +285,19 @@ func (s *GpuAllocator) Bind( gpuNodeName = gpu.Status.NodeSelector[constants.KubernetesHostNameLabel] } + // Double-check resource availability to prevent over-allocation + if gpu.Status.Available == nil { + return nil, fmt.Errorf("GPU %s has nil available 
resources", selectedGPU) + } + if gpu.Status.Available.Tflops.Cmp(req.Request.Tflops) < 0 { + return nil, fmt.Errorf("GPU %s insufficient TFLOPs: available %s, requested %s", + selectedGPU, gpu.Status.Available.Tflops.String(), req.Request.Tflops.String()) + } + if gpu.Status.Available.Vram.Cmp(req.Request.Vram) < 0 { + return nil, fmt.Errorf("GPU %s insufficient VRAM: available %s, requested %s", + selectedGPU, gpu.Status.Available.Vram.String(), req.Request.Vram.String()) + } + // reduce available resource on the GPU status gpu.Status.Available.Tflops.Sub(req.Request.Tflops) gpu.Status.Available.Vram.Sub(req.Request.Vram) From 5f25794b3dbd34edd8716e1cfcaef59cf0a2bbdd Mon Sep 17 00:00:00 2001 From: dylan Date: Mon, 22 Sep 2025 08:35:59 -0700 Subject: [PATCH 29/34] webhook & gpu resource fit dra support --- api/v1/schedulingconfigtemplate_types.go | 16 + api/v1/zz_generated.deepcopy.go | 25 ++ ...r-fusion.ai_schedulingconfigtemplates.yaml | 14 + cmd/main.go | 2 + ...r-fusion.ai_schedulingconfigtemplates.yaml | 14 + internal/constants/constants.go | 9 + .../scheduler/gpuresources/gpuresources.go | 45 ++ .../gpuresources/gpuresources_dra_test.go | 237 ++++++++++ internal/utils/compose.go | 6 + internal/webhook/v1/pod_dra.go | 307 +++++++++++++ internal/webhook/v1/pod_webhook.go | 45 +- internal/webhook/v1/pod_webhook_dra_test.go | 413 ++++++++++++++++++ internal/webhook/v1/pod_webhook_test.go | 9 +- internal/webhook/v1/tf_parser.go | 6 + 14 files changed, 1132 insertions(+), 16 deletions(-) create mode 100644 internal/scheduler/gpuresources/gpuresources_dra_test.go create mode 100644 internal/webhook/v1/pod_dra.go create mode 100644 internal/webhook/v1/pod_webhook_dra_test.go diff --git a/api/v1/schedulingconfigtemplate_types.go b/api/v1/schedulingconfigtemplate_types.go index 44f07bef..b3243344 100644 --- a/api/v1/schedulingconfigtemplate_types.go +++ b/api/v1/schedulingconfigtemplate_types.go @@ -39,6 +39,10 @@ type SchedulingConfigTemplateSpec struct { // single GPU device multi-process queuing and fair scheduling with QoS constraint // +optional Hypervisor *HypervisorScheduling `json:"hypervisor,omitempty"` + + // enable Dynamic Resource Allocation (DRA) for GPU resource management + // +optional + DRA *DRAConfig `json:"dra,omitempty"` } type PlacementConfig struct { @@ -206,6 +210,18 @@ type MultiProcessQueuing struct { QueueLevelTimeSlices []string `json:"queueLevelTimeSlices,omitempty"` } +// DRAConfig configures Dynamic Resource Allocation support +type DRAConfig struct { + // Enable DRA mode for all workloads in this configuration template + // +optional + Enable *bool `json:"enable,omitempty"` + + // ResourceClass specifies the DRA resource class name to use + // +kubebuilder:default="tensorfusion.ai/gpu" + // +optional + ResourceClass string `json:"resourceClass,omitempty"` +} + // SchedulingConfigTemplateStatus defines the observed state of SchedulingConfigTemplate. type SchedulingConfigTemplateStatus struct { // INSERT ADDITIONAL STATUS FIELD - define observed state of cluster diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go index 9be4f47c..5699677a 100644 --- a/api/v1/zz_generated.deepcopy.go +++ b/api/v1/zz_generated.deepcopy.go @@ -332,6 +332,26 @@ func (in *ComputingVendorParams) DeepCopy() *ComputingVendorParams { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *DRAConfig) DeepCopyInto(out *DRAConfig) { + *out = *in + if in.Enable != nil { + in, out := &in.Enable, &out.Enable + *out = new(bool) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DRAConfig. +func (in *DRAConfig) DeepCopy() *DRAConfig { + if in == nil { + return nil + } + out := new(DRAConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *GPU) DeepCopyInto(out *GPU) { *out = *in @@ -1963,6 +1983,11 @@ func (in *SchedulingConfigTemplateSpec) DeepCopyInto(out *SchedulingConfigTempla *out = new(HypervisorScheduling) (*in).DeepCopyInto(*out) } + if in.DRA != nil { + in, out := &in.DRA, &out.DRA + *out = new(DRAConfig) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SchedulingConfigTemplateSpec. diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml index 91a01eae..7c0c281b 100644 --- a/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml +++ b/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml @@ -143,6 +143,20 @@ spec: type: string type: object type: object + dra: + description: enable Dynamic Resource Allocation (DRA) for GPU resource + management + properties: + enable: + description: Enable DRA mode for all workloads in this configuration + template + type: boolean + resourceClass: + default: tensorfusion.ai/gpu + description: ResourceClass specifies the DRA resource class name + to use + type: string + type: object hypervisor: description: single GPU device multi-process queuing and fair scheduling with QoS constraint diff --git a/cmd/main.go b/cmd/main.go index 92021131..7f5f8721 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -33,6 +33,7 @@ import ( "k8s.io/client-go/rest" "k8s.io/klog/v2" + resourcev1beta2 "k8s.io/api/resource/v1beta2" "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" @@ -99,6 +100,7 @@ var alertEvaluatorReady chan struct{} func init() { utilruntime.Must(clientgoscheme.AddToScheme(scheme)) utilruntime.Must(tfv1.AddToScheme(scheme)) + utilruntime.Must(resourcev1beta2.AddToScheme(scheme)) // +kubebuilder:scaffold:scheme } diff --git a/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml b/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml index 91a01eae..7c0c281b 100644 --- a/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml +++ b/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml @@ -143,6 +143,20 @@ spec: type: string type: object type: object + dra: + description: enable Dynamic Resource Allocation (DRA) for GPU resource + management + properties: + enable: + description: Enable DRA mode for all workloads in this configuration + template + type: boolean + resourceClass: + default: tensorfusion.ai/gpu + description: ResourceClass specifies the DRA resource class name + to use + type: string + type: object hypervisor: description: single GPU device multi-process queuing and fair scheduling with QoS constraint diff --git a/internal/constants/constants.go b/internal/constants/constants.go index b1aa6b64..22d465cc 100644 --- a/internal/constants/constants.go +++ b/internal/constants/constants.go @@ -121,6 +121,15 @@ 
const ( QoSLevelMedium = "medium" QoSLevelHigh = "high" QoSLevelCritical = "critical" + + // DRA support + // annotation for pod to indicate if DRA is enabled + DRAEnabledAnnotation = Domain + "/dra-enabled" + DRAResourceClaimName = "tensor-fusion-resource-claim-%s" + // resource claim name for request + DRAResourceClaimRequestName = "tensor-fusion-resource-claim-request-%s" + + DRAClaimDefineName = "tensor-fusion-gpu-claim" ) // for avoid golang lint issues diff --git a/internal/scheduler/gpuresources/gpuresources.go b/internal/scheduler/gpuresources/gpuresources.go index 16dd1c61..949b7582 100644 --- a/internal/scheduler/gpuresources/gpuresources.go +++ b/internal/scheduler/gpuresources/gpuresources.go @@ -26,6 +26,7 @@ import ( const Name = "GPUResourcesFit" const CycleStateAllocateRequest = "allocateRequest" const CycleStateGPUSchedulingResult = "gpuSchedulingResult" + const SchedulerSimulationKey = "schedulerSimulation" var _ framework.PreFilterPlugin = &GPUFit{} @@ -105,6 +106,11 @@ func (s *GPUFit) PreFilter(ctx context.Context, state *framework.CycleState, pod }, framework.NewStatus(framework.Success, "progressive migration for native resources claim") } + // Check if DRA mode is enabled for this pod + if isDRAEnabled(pod) && hasDRAClaim(pod) { + return nil, framework.NewStatus(framework.Skip, "DRA mode enabled, skipping custom GPU prefilter") + } + // Skip non tensor-fusion mode if !utils.IsTensorFusionWorker(pod) { return nil, framework.NewStatus(framework.Skip, "skip for non tensor-fusion mode") @@ -207,6 +213,11 @@ func (s *GPUFit) PreFilterExtensions() framework.PreFilterExtensions { } func (s *GPUFit) Filter(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status { + // Check if DRA mode is enabled for this pod + if isDRAEnabled(pod) && hasDRAClaim(pod) { + return framework.NewStatus(framework.Skip, "DRA mode enabled, skipping custom GPU filter") + } + if !utils.IsTensorFusionWorker(pod) { return framework.NewStatus(framework.Success, "skip for non tensor-fusion mode") } @@ -228,6 +239,11 @@ func (s *GPUFit) Score( pod *v1.Pod, nodeInfo *framework.NodeInfo, ) (int64, *framework.Status) { + // Check if DRA mode is enabled for this pod + if isDRAEnabled(pod) && hasDRAClaim(pod) { + return 0, framework.NewStatus(framework.Skip, "DRA mode enabled, skipping custom GPU scoring") + } + // Skip non tensor-fusion mode scheduling if !utils.IsTensorFusionWorker(pod) { return 0, framework.NewStatus(framework.Success, "") @@ -266,6 +282,11 @@ func (s *GPUFit) ScoreExtensions() framework.ScoreExtensions { } func (s *GPUFit) Reserve(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) *framework.Status { + // Check if DRA mode is enabled for this pod + if isDRAEnabled(pod) && hasDRAClaim(pod) { + return framework.NewStatus(framework.Success, "DRA mode enabled, skipping custom GPU reservation") + } + if !utils.IsTensorFusionWorker(pod) { return framework.NewStatus(framework.Success, "skip for non tensor-fusion mode") } @@ -312,6 +333,11 @@ func (s *GPUFit) Reserve(ctx context.Context, state *framework.CycleState, pod * } func (s *GPUFit) Unreserve(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) { + // Check if DRA mode is enabled for this pod + if isDRAEnabled(pod) && hasDRAClaim(pod) { + return // DRA handles unreservation + } + if !utils.IsTensorFusionWorker(pod) { return } @@ -331,6 +357,11 @@ func (s *GPUFit) Unreserve(ctx context.Context, state *framework.CycleState, pod } 
func (s *GPUFit) PostBind(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) { + // Check if DRA mode is enabled for this pod + if isDRAEnabled(pod) && hasDRAClaim(pod) { + return // DRA handles post-bind actions + } + if !utils.IsTensorFusionWorker(pod) { return } @@ -359,3 +390,17 @@ func (s *GPUFit) PostBind(ctx context.Context, state *framework.CycleState, pod "Attach GPU device ID info", "Attach TensorFusion GPU device IDs to Pod: "+gpuIDs) } } + +// isDRAEnabled checks if DRA is enabled for a pod +func isDRAEnabled(pod *v1.Pod) bool { + if pod.Annotations == nil { + return false + } + val, ok := pod.Annotations[constants.DRAEnabledAnnotation] + return ok && val == constants.TrueStringValue +} + +// hasDRAClaim checks if a pod has DRA ResourceClaim references +func hasDRAClaim(pod *v1.Pod) bool { + return len(pod.Spec.ResourceClaims) > 0 +} diff --git a/internal/scheduler/gpuresources/gpuresources_dra_test.go b/internal/scheduler/gpuresources/gpuresources_dra_test.go new file mode 100644 index 00000000..021be137 --- /dev/null +++ b/internal/scheduler/gpuresources/gpuresources_dra_test.go @@ -0,0 +1,237 @@ +package gpuresources + +import ( + "testing" + + "github.com/stretchr/testify/assert" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/NexusGPU/tensor-fusion/internal/constants" +) + +func TestIsDRAEnabled(t *testing.T) { + tests := []struct { + name string + annotations map[string]string + expected bool + }{ + { + name: "DRA enabled annotation", + annotations: map[string]string{ + constants.DRAEnabledAnnotation: constants.TrueStringValue, + }, + expected: true, + }, + { + name: "DRA disabled annotation", + annotations: map[string]string{ + constants.DRAEnabledAnnotation: constants.FalseStringValue, + }, + expected: false, + }, + { + name: "no annotation", + expected: false, + }, + { + name: "other annotations", + annotations: map[string]string{ + "other.annotation": "value", + }, + expected: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: tt.annotations, + }, + } + + result := isDRAEnabled(pod) + assert.Equal(t, tt.expected, result) + }) + } +} + +func TestHasDRAClaimScheduler(t *testing.T) { + tests := []struct { + name string + pod *corev1.Pod + expected bool + }{ + { + name: "pod with resource claims", + pod: &corev1.Pod{ + Spec: corev1.PodSpec{ + ResourceClaims: []corev1.PodResourceClaim{ + {Name: "gpu-claim"}, + }, + }, + }, + expected: true, + }, + { + name: "pod with multiple resource claims", + pod: &corev1.Pod{ + Spec: corev1.PodSpec{ + ResourceClaims: []corev1.PodResourceClaim{ + {Name: "gpu-claim"}, + {Name: "other-claim"}, + }, + }, + }, + expected: true, + }, + { + name: "pod without resource claims", + pod: &corev1.Pod{ + Spec: corev1.PodSpec{}, + }, + expected: false, + }, + { + name: "pod with empty resource claims", + pod: &corev1.Pod{ + Spec: corev1.PodSpec{ + ResourceClaims: []corev1.PodResourceClaim{}, + }, + }, + expected: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := hasDRAClaim(tt.pod) + assert.Equal(t, tt.expected, result) + }) + } +} + +// Integration test for DRA detection logic +func TestDRADetectionIntegration(t *testing.T) { + tests := []struct { + name string + draAnnotation string + hasResourceClaims bool + expectedDRA bool + expectedClaim bool + }{ + { + name: "DRA enabled with claims", + draAnnotation: 
constants.TrueStringValue, + hasResourceClaims: true, + expectedDRA: true, + expectedClaim: true, + }, + { + name: "DRA enabled without claims", + draAnnotation: constants.TrueStringValue, + hasResourceClaims: false, + expectedDRA: true, + expectedClaim: false, + }, + { + name: "DRA disabled with claims", + draAnnotation: constants.FalseStringValue, + hasResourceClaims: true, + expectedDRA: false, + expectedClaim: true, + }, + { + name: "no DRA annotation, no claims", + hasResourceClaims: false, + expectedDRA: false, + expectedClaim: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: make(map[string]string), + }, + Spec: corev1.PodSpec{}, + } + + if tt.draAnnotation != "" { + pod.Annotations[constants.DRAEnabledAnnotation] = tt.draAnnotation + } + + if tt.hasResourceClaims { + pod.Spec.ResourceClaims = []corev1.PodResourceClaim{ + {Name: "test-claim"}, + } + } + + draEnabled := isDRAEnabled(pod) + hasClaim := hasDRAClaim(pod) + + assert.Equal(t, tt.expectedDRA, draEnabled, "DRA enabled detection mismatch") + assert.Equal(t, tt.expectedClaim, hasClaim, "Resource claim detection mismatch") + }) + } +} + +// Test the combination logic that scheduler uses +func TestSchedulerDRALogic(t *testing.T) { + tests := []struct { + name string + draAnnotation string + hasResourceClaims bool + shouldSkipScheduler bool + }{ + { + name: "DRA enabled with claims - should skip", + draAnnotation: constants.TrueStringValue, + hasResourceClaims: true, + shouldSkipScheduler: true, + }, + { + name: "DRA enabled without claims - should not skip", + draAnnotation: constants.TrueStringValue, + hasResourceClaims: false, + shouldSkipScheduler: false, + }, + { + name: "DRA disabled with claims - should not skip", + draAnnotation: constants.FalseStringValue, + hasResourceClaims: true, + shouldSkipScheduler: false, + }, + { + name: "no DRA, no claims - should not skip", + shouldSkipScheduler: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: make(map[string]string), + }, + Spec: corev1.PodSpec{}, + } + + if tt.draAnnotation != "" { + pod.Annotations[constants.DRAEnabledAnnotation] = tt.draAnnotation + } + + if tt.hasResourceClaims { + pod.Spec.ResourceClaims = []corev1.PodResourceClaim{ + {Name: "test-claim"}, + } + } + + // This is the actual logic used in the scheduler + shouldSkip := isDRAEnabled(pod) && hasDRAClaim(pod) + assert.Equal(t, tt.shouldSkipScheduler, shouldSkip) + }) + } +} diff --git a/internal/utils/compose.go b/internal/utils/compose.go index e7170881..e9fb79ce 100644 --- a/internal/utils/compose.go +++ b/internal/utils/compose.go @@ -79,6 +79,8 @@ type TensorFusionInfo struct { // Pod mutating webhook can not get Pod UID sometimes, // thus need pod controller to set the owner reference PendingSetPodAsOwner bool + // DRA support + DRAEnabled bool } func AddOrOverrideTFClientMissingAnnotationsBeforePatch(pod *v1.Pod, tfInfo TensorFusionInfo) { @@ -113,6 +115,10 @@ func AddOrOverrideTFClientMissingAnnotationsBeforePatch(pod *v1.Pod, tfInfo Tens pod.Annotations[constants.IsLocalGPUAnnotation] = strconv.FormatBool(tfInfo.Profile.IsLocalGPU) // add inject container annotation for client Pod, in case user doesn't specify it pod.Annotations[constants.InjectContainerAnnotation] = strings.Join(tfInfo.ContainerNames, ",") + // add DRA enabled annotation + if tfInfo.DRAEnabled { + 
pod.Annotations[constants.DRAEnabledAnnotation] = constants.TrueStringValue + } } func AppendTFWorkerLabelsAndAnnotationsAfterTemplate( diff --git a/internal/webhook/v1/pod_dra.go b/internal/webhook/v1/pod_dra.go new file mode 100644 index 00000000..ef2bd0b3 --- /dev/null +++ b/internal/webhook/v1/pod_dra.go @@ -0,0 +1,307 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1 + +import ( + "context" + "crypto/rand" + "encoding/hex" + "fmt" + "strings" + + corev1 "k8s.io/api/core/v1" + resourcev1beta2 "k8s.io/api/resource/v1beta2" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/constants" + "github.com/NexusGPU/tensor-fusion/internal/utils" +) + +// DRAProcessor handles all DRA-related operations for pod admission +type DRAProcessor struct { + client.Client + enableDRA bool + resourceClass string // cached resource class to avoid repeated API calls + configLoaded bool // tracks if configuration has been loaded +} + +// generateUniqueID creates a random 8-character hex string for resource claim names +func generateUniqueID() string { + bytes := make([]byte, 4) + _, _ = rand.Read(bytes) // crypto/rand.Read always returns len(bytes), nil on success + return hex.EncodeToString(bytes) +} + +// NewDRAProcessor creates a new DRA processor +func NewDRAProcessor(client client.Client) *DRAProcessor { + return &DRAProcessor{ + Client: client, + enableDRA: false, + } +} + +// InitializeDRAConfig is kept for backward compatibility but now does nothing +// Configuration is loaded lazily on first use +func (p *DRAProcessor) InitializeDRAConfig(ctx context.Context) error { + // No-op - configuration is now loaded lazily + if p.configLoaded { + return nil + } + + // Set defaults first + p.enableDRA = false + + templateList := &tfv1.SchedulingConfigTemplateList{} + // Use the provided context to respect cancellation + err := p.List(ctx, templateList) + if err != nil { + // Log error but don't fail - fall back to defaults + // This allows webhook to work even if templates are unavailable + p.configLoaded = true + return nil + } + + // Check if any template has DRA enabled and cache the resource class + for _, template := range templateList.Items { + if template.Spec.DRA != nil { + if template.Spec.DRA.Enable != nil && *template.Spec.DRA.Enable { + p.enableDRA = true + } + // Cache the resource class from the template + if template.Spec.DRA.ResourceClass != "" { + p.resourceClass = template.Spec.DRA.ResourceClass + } + } + } + + if p.enableDRA && p.resourceClass == "" { + return fmt.Errorf("resource class is not set") + } + + p.configLoaded = true + return nil +} + +// IsDRAEnabled checks if DRA is enabled for a specific pod +func (p *DRAProcessor) IsDRAEnabled(ctx context.Context, pod *corev1.Pod) bool { + + // Check pod-level annotation first (explicit override) + if val, ok := 
pod.Annotations[constants.DRAEnabledAnnotation]; ok && val == constants.TrueStringValue { + return true + } + + // Check pod-level annotation for explicit disable + if val, ok := pod.Annotations[constants.DRAEnabledAnnotation]; ok && val == constants.FalseStringValue { + return false + } + + // Fall back to global configuration + return p.enableDRA +} + +// HasDRAClaim checks if a pod has DRA ResourceClaim references +func HasDRAClaim(pod *corev1.Pod) bool { + return len(pod.Spec.ResourceClaims) > 0 +} + +// convertToResourceClaim converts GPU resource requests to ResourceClaim +func (p *DRAProcessor) convertToResourceClaim(pod *corev1.Pod, tfInfo *utils.TensorFusionInfo) (*resourcev1beta2.ResourceClaim, error) { + + // Build CEL selector using DRA helper + celSelector, err := BuildCELSelector(pod, tfInfo) + if err != nil { + return nil, fmt.Errorf("failed to build CEL selector: %w", err) + } + + // Generate unique claim name with random suffix to avoid conflicts + var baseName string + + if pod.GenerateName != "" { + baseName = strings.TrimSuffix(pod.GenerateName, "-") + } else if pod.Name != "" { + baseName = pod.Name + } + + uniqueID := generateUniqueID() + claimName := fmt.Sprintf(constants.DRAResourceClaimName, baseName, uniqueID) + + // Use cached resource class instead of making API calls + resourceClass := p.resourceClass + + claim := &resourcev1beta2.ResourceClaim{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "resource.k8s.io/v1beta2", + Kind: "ResourceClaim", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: claimName, + Namespace: pod.Namespace, + // Note: We cannot set OwnerReference here because pod.UID is empty during admission. + // The controller will set the proper owner reference once the Pod is created. + }, + Spec: resourcev1beta2.ResourceClaimSpec{ + Devices: resourcev1beta2.DeviceClaim{ + Requests: []resourcev1beta2.DeviceRequest{ + { + Name: fmt.Sprintf(constants.DRAResourceClaimRequestName, generateUniqueID()), + Exactly: &resourcev1beta2.ExactDeviceRequest{ + DeviceClassName: resourceClass, + Selectors: []resourcev1beta2.DeviceSelector{ + { + CEL: &resourcev1beta2.CELDeviceSelector{ + Expression: celSelector, + }, + }, + }, + Count: int64(tfInfo.Profile.GPUCount), + }, + }, + }, + }, + }, + } + + return claim, nil +} + +// injectResourceClaimRef adds ResourceClaim reference to Pod spec +func (p *DRAProcessor) injectResourceClaimRef(pod *corev1.Pod, claim *resourcev1beta2.ResourceClaim, containerIndices []int) { + // Add ResourceClaim reference to pod.Spec.ResourceClaims + if pod.Spec.ResourceClaims == nil { + pod.Spec.ResourceClaims = []corev1.PodResourceClaim{} + } + + claimRef := corev1.PodResourceClaim{ + Name: constants.DRAClaimDefineName, + ResourceClaimName: &claim.Name, + } + + // Check if the claim reference already exists to maintain idempotency + claimExists := false + for i, existingClaim := range pod.Spec.ResourceClaims { + if existingClaim.Name == constants.DRAClaimDefineName { + // Update existing claim to point to the new ResourceClaim name + pod.Spec.ResourceClaims[i].ResourceClaimName = &claim.Name + claimExists = true + break + } + } + + if !claimExists { + pod.Spec.ResourceClaims = append(pod.Spec.ResourceClaims, claimRef) + } + + // Add resource claim consumption to containers + for _, containerIndex := range containerIndices { + container := &pod.Spec.Containers[containerIndex] + if container.Resources.Claims == nil { + container.Resources.Claims = []corev1.ResourceClaim{} + } + + // Check if the container already has this claim to maintain 
idempotency + hasGPUClaim := false + for _, existingClaim := range container.Resources.Claims { + if existingClaim.Name == constants.DRAClaimDefineName { + hasGPUClaim = true + break + } + } + + if !hasGPUClaim { + container.Resources.Claims = append(container.Resources.Claims, corev1.ResourceClaim{ + Name: constants.DRAClaimDefineName, + }) + } + } +} + +// createResourceClaim creates a ResourceClaim object with proper error handling and retries +func (p *DRAProcessor) createResourceClaim(ctx context.Context, claim *resourcev1beta2.ResourceClaim) error { + // Try to create the ResourceClaim + if err := p.Create(ctx, claim); err != nil { + if errors.IsAlreadyExists(err) { + // Check if the existing claim is for the same pod + existingClaim := &resourcev1beta2.ResourceClaim{} + getErr := p.Get(ctx, client.ObjectKey{Name: claim.Name, Namespace: claim.Namespace}, existingClaim) + if getErr != nil { + return fmt.Errorf("failed to check existing ResourceClaim: %w", getErr) + } + // Different pod or missing labels, this is an error + return fmt.Errorf("ResourceClaim %s already exists for a different pod", claim.Name) + } + + if errors.IsInvalid(err) { + return fmt.Errorf("ResourceClaim is invalid: %w", err) + } + + if errors.IsForbidden(err) { + return fmt.Errorf("insufficient permissions to create ResourceClaim: %w", err) + } + } + + return nil +} + +// Note: patchTFClientForDRA is temporarily handled in the main pod_webhook.go +// until we can properly abstract all the TF client patching logic + +// HandleDRAAdmission handles the complete DRA admission process +func (p *DRAProcessor) HandleDRAAdmission(ctx context.Context, pod *corev1.Pod, tfInfo *utils.TensorFusionInfo, containerIndices []int) error { + // Convert GPU resources to ResourceClaim + resourceClaim, err := p.convertToResourceClaim(pod, tfInfo) + if err != nil { + return fmt.Errorf("failed to convert to ResourceClaim: %w", err) + } + + // Create ResourceClaim + if err := p.createResourceClaim(ctx, resourceClaim); err != nil { + return fmt.Errorf("failed to create ResourceClaim: %w", err) + } + // Inject ResourceClaim reference to Pod + p.injectResourceClaimRef(pod, resourceClaim, containerIndices) + return nil +} + +// TODO: support more attributes for filtering +func BuildCELSelector(pod *corev1.Pod, tfInfo *utils.TensorFusionInfo) (string, error) { + var conditions []string + + // 1. Basic resource requirements using standard DRA quantity attributes + requests := tfInfo.Profile.Resources.Requests + if !requests.Tflops.IsZero() { + conditions = append(conditions, fmt.Sprintf(`device.attributes["tflops"].quantity >= quantity("%s")`, requests.Tflops.String())) + } + if !requests.Vram.IsZero() { + conditions = append(conditions, fmt.Sprintf(`device.attributes["vram"].quantity >= quantity("%s")`, requests.Vram.String())) + } + + // 2. 
GPU model filter (if specified - basic attribute that should be widely supported) + if tfInfo.Profile.GPUModel != "" { + conditions = append(conditions, fmt.Sprintf(`device.attributes["model"] == "%s"`, tfInfo.Profile.GPUModel)) + } + + // Return a basic condition if no specific requirements + if len(conditions) == 0 { + // Simple condition that should work with most DRA drivers + return `device.attributes.exists("type")`, nil + } + + return strings.Join(conditions, " && "), nil +} diff --git a/internal/webhook/v1/pod_webhook.go b/internal/webhook/v1/pod_webhook.go index 53610ffe..08962028 100644 --- a/internal/webhook/v1/pod_webhook.go +++ b/internal/webhook/v1/pod_webhook.go @@ -49,14 +49,21 @@ var httpClient = &http.Client{Timeout: 10 * time.Second} func SetupPodWebhookWithManager(mgr ctrl.Manager, portAllocator *portallocator.PortAllocator) error { webhookServer := mgr.GetWebhookServer() - webhookServer.Register("/mutate-v1-pod", - &admission.Webhook{ - Handler: &TensorFusionPodMutator{ - decoder: admission.NewDecoder(runtime.NewScheme()), - Client: mgr.GetClient(), - portAllocator: portAllocator, - }, - }) + // Initialize DRA processor + draProcessor := NewDRAProcessor(mgr.GetClient()) + if err := draProcessor.InitializeDRAConfig(context.Background()); err != nil { + return fmt.Errorf("failed to initialize DRA config: %w", err) + } + + // Initialize DRA setting from global configuration + mutator := &TensorFusionPodMutator{ + decoder: admission.NewDecoder(runtime.NewScheme()), + Client: mgr.GetClient(), + portAllocator: portAllocator, + draProcessor: draProcessor, + } + + webhookServer.Register("/mutate-v1-pod", &admission.Webhook{Handler: mutator}) return nil } @@ -64,6 +71,7 @@ type TensorFusionPodMutator struct { Client client.Client decoder admission.Decoder portAllocator *portallocator.PortAllocator + draProcessor *DRAProcessor } // Handle implements admission.Handler interface. 
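For reference, BuildCELSelector above simply joins one condition per populated field with `&&`. Assuming an illustrative profile that requests 10 TFLOPS, 8Gi of VRAM, and GPU model "A100", the generated device selector would be:

```
device.attributes["tflops"].quantity >= quantity("10") && device.attributes["vram"].quantity >= quantity("8Gi") && device.attributes["model"] == "A100"
```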
@@ -100,7 +108,7 @@ func (m *TensorFusionPodMutator) Handle(ctx context.Context, req admission.Reque return admission.Errored(http.StatusBadRequest, fmt.Errorf("failed to marshal current pod: %w", err)) } - tfInfo, err := ParseTensorFusionInfo(ctx, m.Client, pod) + tfInfo, err := ParseTensorFusionInfo(ctx, m.Client, m.draProcessor, pod) if err != nil { return admission.Errored(http.StatusInternalServerError, fmt.Errorf("parse tf resources: %w", err)) } @@ -159,16 +167,28 @@ func (m *TensorFusionPodMutator) Handle(ctx context.Context, req admission.Reque return admission.Allowed("no valid container to inject tensor-fusion, skipped") } - // Add defaults and tensor-fusion injection logic + // Handle DRA-specific processing if enabled + if tfInfo.DRAEnabled { + // Process DRA workload + if err := m.draProcessor.HandleDRAAdmission(ctx, pod, &tfInfo, containerIndices); err != nil { + return admission.Errored(http.StatusInternalServerError, fmt.Errorf("failed to handle DRA admission: %w", err)) + } + } + + // Common processing for both DRA and regular modes utils.AddOrOverrideTFClientMissingAnnotationsBeforePatch(pod, tfInfo) utils.AddTFDefaultClientConfBeforePatch(ctx, pod, pool, tfInfo, containerIndices) // Inject initContainer and env variables patches, err := m.patchTFClient( - pod, pool, tfInfo.Profile.IsLocalGPU, currentBytes, containerIndices, + ctx, pod, pool, tfInfo.Profile.IsLocalGPU, currentBytes, containerIndices, ) if err != nil { - log.Error(err, "failed to patch tf client", "pod", req.Name, "namespace", req.Namespace) + mode := "regular" + if tfInfo.DRAEnabled { + mode = "DRA" + } + log.Error(err, "failed to patch tf client", "mode", mode, "pod", req.Name, "namespace", req.Namespace) return admission.Errored(http.StatusInternalServerError, err) } @@ -266,6 +286,7 @@ func (m *TensorFusionPodMutator) createOrUpdateWorkload(ctx context.Context, pod } func (m *TensorFusionPodMutator) patchTFClient( + ctx context.Context, pod *corev1.Pod, pool *tfv1.GPUPool, isLocalGPU bool, diff --git a/internal/webhook/v1/pod_webhook_dra_test.go b/internal/webhook/v1/pod_webhook_dra_test.go new file mode 100644 index 00000000..09a738b0 --- /dev/null +++ b/internal/webhook/v1/pod_webhook_dra_test.go @@ -0,0 +1,413 @@ +package v1 + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + resourcev1beta2 "k8s.io/api/resource/v1beta2" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/constants" + "github.com/NexusGPU/tensor-fusion/internal/utils" +) + +func TestDRAProcessor_IsDRAEnabled(t *testing.T) { + tests := []struct { + name string + processorDRA bool + podAnnotations map[string]string + expected bool + }{ + { + name: "global DRA enabled, no pod annotation", + processorDRA: true, + expected: true, + }, + { + name: "global DRA disabled, no pod annotation", + processorDRA: false, + expected: false, + }, + { + name: "global DRA disabled, pod annotation enabled", + processorDRA: false, + podAnnotations: map[string]string{ + constants.DRAEnabledAnnotation: constants.TrueStringValue, + }, + expected: true, + }, + { + name: "global DRA enabled, pod annotation disabled", + processorDRA: true, + podAnnotations: 
map[string]string{ + constants.DRAEnabledAnnotation: constants.FalseStringValue, + }, + expected: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + processor := &DRAProcessor{ + enableDRA: tt.processorDRA, + configLoaded: true, // Skip config loading in tests + } + + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: tt.podAnnotations, + }, + } + + result := processor.IsDRAEnabled(context.Background(), pod) + assert.Equal(t, tt.expected, result) + }) + } +} + +func TestDRAProcessor_convertToResourceClaim(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, tfv1.AddToScheme(scheme)) + require.NoError(t, resourcev1beta2.AddToScheme(scheme)) + + // Create a SchedulingConfigTemplate with DRA config + template := &tfv1.SchedulingConfigTemplate{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-template", + }, + Spec: tfv1.SchedulingConfigTemplateSpec{ + DRA: &tfv1.DRAConfig{ + Enable: &[]bool{true}[0], + ResourceClass: "custom.tensorfusion.ai/gpu", + }, + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(template). + Build() + + processor := &DRAProcessor{ + Client: fakeClient, + } + + // Initialize DRA config to set up the resource class cache + err := processor.InitializeDRAConfig(context.Background()) + require.NoError(t, err) + + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + Namespace: "test-namespace", + GenerateName: "test-pod-", + UID: types.UID("test-uid"), + }, + } + + tfInfo := &utils.TensorFusionInfo{ + Profile: &tfv1.WorkloadProfileSpec{ + GPUCount: 1, + Resources: tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("10"), + Vram: resource.MustParse("8Gi"), + }, + }, + }, + } + + claim, err := processor.convertToResourceClaim(pod, tfInfo) + require.NoError(t, err) + require.NotNil(t, claim) + + // Verify claim structure + assert.Contains(t, claim.Name, "test-pod-") + assert.Contains(t, claim.Name, "-gpu-claim") + assert.Equal(t, "test-namespace", claim.Namespace) + assert.Equal(t, "resource.k8s.io/v1beta2", claim.APIVersion) + assert.Equal(t, "ResourceClaim", claim.Kind) + + // Verify labels instead of owner references (since we removed owner references during admission) + require.NotNil(t, claim.Labels) + assert.Equal(t, "test-pod-", claim.Labels["tensorfusion.ai/pod"]) // Uses GenerateName as podIdentifier + assert.Equal(t, "gpu", claim.Labels["tensorfusion.ai/claim-for"]) + + // Verify device claim + require.Len(t, claim.Spec.Devices.Requests, 1) + deviceReq := claim.Spec.Devices.Requests[0] + assert.Equal(t, "gpu", deviceReq.Name) + + // Verify ExactDeviceRequest structure + require.NotNil(t, deviceReq.Exactly) + exactReq := deviceReq.Exactly + assert.Equal(t, "custom.tensorfusion.ai/gpu", exactReq.DeviceClassName) // Uses cached resource class from template + assert.Equal(t, int64(1), exactReq.Count) + + // Verify CEL selector + require.Len(t, exactReq.Selectors, 1) + require.NotNil(t, exactReq.Selectors[0].CEL) + + // The simplified CEL selector should only contain basic resource requirements + celExpression := exactReq.Selectors[0].CEL.Expression + + // Verify it contains the expected resource filters (simplified version) + assert.Contains(t, celExpression, `device.attributes["tflops"].quantity >= quantity("10")`) + assert.Contains(t, celExpression, `device.attributes["vram"].quantity >= quantity("8Gi")`) + + // Verify conditions are combined with AND + assert.Contains(t, celExpression, " && ") +} + +func 
TestDRAProcessor_injectResourceClaimRef(t *testing.T) { + processor := &DRAProcessor{} + + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + Namespace: "test-namespace", + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + {Name: "container1"}, + {Name: "container2"}, + }, + }, + } + + claim := &resourcev1beta2.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-claim", + Namespace: "test-namespace", + }, + } + + containerIndices := []int{0, 1} + + processor.injectResourceClaimRef(pod, claim, containerIndices) + + // Verify pod resource claims + require.Len(t, pod.Spec.ResourceClaims, 1) + podClaim := pod.Spec.ResourceClaims[0] + assert.Equal(t, "gpu-claim", podClaim.Name) + require.NotNil(t, podClaim.ResourceClaimName) + assert.Equal(t, "test-claim", *podClaim.ResourceClaimName) + + // Verify container resource claims + for _, idx := range containerIndices { + container := pod.Spec.Containers[idx] + require.Len(t, container.Resources.Claims, 1) + assert.Equal(t, "gpu-claim", container.Resources.Claims[0].Name) + } + + // Verify annotations + require.NotNil(t, pod.Annotations) + assert.Equal(t, constants.TrueStringValue, pod.Annotations[constants.DRAEnabledAnnotation]) +} + +func TestDRAProcessor_createResourceClaim(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, resourcev1beta2.AddToScheme(scheme)) + + tests := []struct { + name string + existingClaim *resourcev1beta2.ResourceClaim + expectError bool + errorType string + }{ + { + name: "successful creation", + expectError: false, + }, + { + name: "claim already exists with same pod", + existingClaim: &resourcev1beta2.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-claim", + Namespace: "test-namespace", + Labels: map[string]string{ + "tensorfusion.ai/pod": "test-pod", + "tensorfusion.ai/claim-for": "gpu", + }, + }, + }, + expectError: false, + }, + { + name: "claim already exists with different pod", + existingClaim: &resourcev1beta2.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-claim", + Namespace: "test-namespace", + Labels: map[string]string{ + "tensorfusion.ai/pod": "different-pod", + "tensorfusion.ai/claim-for": "gpu", + }, + }, + }, + expectError: true, + errorType: "conflict", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + var fakeClient client.Client + if tt.existingClaim != nil { + fakeClient = fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(tt.existingClaim). + Build() + } else { + fakeClient = fake.NewClientBuilder(). + WithScheme(scheme). 
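+				// No pre-existing claim is seeded in this branch, so the create path runs against an empty fake client.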
+ Build() + } + + processor := &DRAProcessor{ + Client: fakeClient, + } + + claim := &resourcev1beta2.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-claim", + Namespace: "test-namespace", + Labels: map[string]string{ + "tensorfusion.ai/pod": "test-pod", + "tensorfusion.ai/claim-for": "gpu", + }, + }, + } + + err := processor.createResourceClaim(context.Background(), claim) + + if tt.expectError { + require.Error(t, err) + if tt.errorType == "conflict" { + assert.Contains(t, err.Error(), "already exists for a different pod") + } + } else { + require.NoError(t, err) + } + }) + } +} + +func TestHasDRAClaim(t *testing.T) { + tests := []struct { + name string + pod *corev1.Pod + expected bool + }{ + { + name: "pod with resource claims", + pod: &corev1.Pod{ + Spec: corev1.PodSpec{ + ResourceClaims: []corev1.PodResourceClaim{ + {Name: "gpu-claim"}, + }, + }, + }, + expected: true, + }, + { + name: "pod without resource claims", + pod: &corev1.Pod{ + Spec: corev1.PodSpec{}, + }, + expected: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := HasDRAClaim(tt.pod) + assert.Equal(t, tt.expected, result) + }) + } +} + +func TestDRAProcessor_LazyConfigLoading(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, tfv1.AddToScheme(scheme)) + + tests := []struct { + name string + templates []tfv1.SchedulingConfigTemplate + expected bool + }{ + { + name: "DRA enabled in template", + templates: []tfv1.SchedulingConfigTemplate{ + { + ObjectMeta: metav1.ObjectMeta{Name: "template1"}, + Spec: tfv1.SchedulingConfigTemplateSpec{ + DRA: &tfv1.DRAConfig{ + Enable: &[]bool{true}[0], + ResourceClass: "test.ai/gpu", + }, + }, + }, + }, + expected: true, + }, + { + name: "DRA disabled in template", + templates: []tfv1.SchedulingConfigTemplate{ + { + ObjectMeta: metav1.ObjectMeta{Name: "template1"}, + Spec: tfv1.SchedulingConfigTemplateSpec{ + DRA: &tfv1.DRAConfig{ + Enable: &[]bool{false}[0], + }, + }, + }, + }, + expected: false, + }, + { + name: "no templates", + expected: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + objects := make([]client.Object, len(tt.templates)) + for i, template := range tt.templates { + objects[i] = &template + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(objects...). 
+ Build() + + processor := &DRAProcessor{ + Client: fakeClient, + } + + // Test lazy loading by calling a method that triggers config loading + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{}, + }, + } + + result := processor.IsDRAEnabled(context.Background(), pod) + assert.Equal(t, tt.expected, result) + + // Verify config was loaded + assert.True(t, processor.configLoaded) + }) + } +} diff --git a/internal/webhook/v1/pod_webhook_test.go b/internal/webhook/v1/pod_webhook_test.go index 55f29233..718b0f8d 100644 --- a/internal/webhook/v1/pod_webhook_test.go +++ b/internal/webhook/v1/pod_webhook_test.go @@ -57,8 +57,9 @@ var _ = Describe("TensorFusionPodMutator", func() { decoder = admission.NewDecoder(scheme) mutator = &TensorFusionPodMutator{ - Client: k8sClient, - decoder: decoder, + Client: k8sClient, + decoder: decoder, + draProcessor: NewDRAProcessor(k8sClient), } }) @@ -532,7 +533,7 @@ var _ = Describe("TensorFusionPodMutator", func() { }, }, } - tfInfo, err := ParseTensorFusionInfo(ctx, k8sClient, pod) + tfInfo, err := ParseTensorFusionInfo(ctx, k8sClient, mutator.draProcessor, pod) Expect(err).NotTo(HaveOccurred()) Expect(tfInfo.ContainerNames).To(HaveLen(1)) Expect(tfInfo.ContainerNames[0]).To(Equal("test-container")) @@ -564,7 +565,7 @@ var _ = Describe("TensorFusionPodMutator", func() { currentBytes, err := json.Marshal(pod) Expect(err).NotTo(HaveOccurred()) - patch, err := mutator.patchTFClient(pod, pool, false, currentBytes, []int{0}) + patch, err := mutator.patchTFClient(context.Background(), pod, pool, false, currentBytes, []int{0}) Expect(err).NotTo(HaveOccurred()) Expect(patch).NotTo(BeEmpty()) // There should be at least 2 patches (initContainers and the container env patches) diff --git a/internal/webhook/v1/tf_parser.go b/internal/webhook/v1/tf_parser.go index bf805b76..1cfcd8f9 100644 --- a/internal/webhook/v1/tf_parser.go +++ b/internal/webhook/v1/tf_parser.go @@ -28,6 +28,7 @@ type TFResource struct { func ParseTensorFusionInfo( ctx context.Context, k8sClient client.Client, + draProcessor *DRAProcessor, pod *corev1.Pod, ) (utils.TensorFusionInfo, error) { var info utils.TensorFusionInfo @@ -115,6 +116,11 @@ func ParseTensorFusionInfo( workloadProfile.Spec.GPUModel = gpuModel } + // Parse DRA enabled annotation + if draProcessor.IsDRAEnabled(ctx, pod) { + info.DRAEnabled = true + } + info.Profile = &workloadProfile.Spec info.ContainerNames = containerNames return info, nil From 4959c61425890c7d4b83def9eff9862b595c993b Mon Sep 17 00:00:00 2001 From: dylan Date: Tue, 23 Sep 2025 09:29:39 -0700 Subject: [PATCH 30/34] resource template support --- api/v1/schedulingconfigtemplate_types.go | 5 +- cmd/main.go | 10 + internal/constants/constants.go | 12 +- .../dra/resourceclaim_controller.go | 184 ++++++ .../dra/resourceclaim_controller_test.go | 557 ++++++++++++++++++ internal/webhook/v1/pod_dra.go | 224 ++----- internal/webhook/v1/pod_webhook_dra_test.go | 220 ++----- 7 files changed, 867 insertions(+), 345 deletions(-) create mode 100644 internal/controller/dra/resourceclaim_controller.go create mode 100644 internal/controller/dra/resourceclaim_controller_test.go diff --git a/api/v1/schedulingconfigtemplate_types.go b/api/v1/schedulingconfigtemplate_types.go index b3243344..8611ed99 100644 --- a/api/v1/schedulingconfigtemplate_types.go +++ b/api/v1/schedulingconfigtemplate_types.go @@ -216,10 +216,9 @@ type DRAConfig struct { // +optional Enable *bool `json:"enable,omitempty"` - // ResourceClass specifies the DRA resource class name 
to use - // +kubebuilder:default="tensorfusion.ai/gpu" + // ResourceClaimTemplateName specifies the ResourceClaim template name to use // +optional - ResourceClass string `json:"resourceClass,omitempty"` + ResourceClaimTemplateName string `json:"resourceClaimTemplateName,omitempty"` } // SchedulingConfigTemplateStatus defines the observed state of SchedulingConfigTemplate. diff --git a/cmd/main.go b/cmd/main.go index 7f5f8721..5c994fcc 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -56,6 +56,7 @@ import ( "github.com/NexusGPU/tensor-fusion/internal/config" "github.com/NexusGPU/tensor-fusion/internal/constants" "github.com/NexusGPU/tensor-fusion/internal/controller" + "github.com/NexusGPU/tensor-fusion/internal/controller/dra" "github.com/NexusGPU/tensor-fusion/internal/gpuallocator" "github.com/NexusGPU/tensor-fusion/internal/metrics" "github.com/NexusGPU/tensor-fusion/internal/portallocator" @@ -397,6 +398,15 @@ func startCustomResourceController( setupLog.Error(err, "unable to create controller", "controller", "Pod") os.Exit(1) } + + // Setup ResourceClaim controller for DRA Phase 2 + if err = (&dra.ResourceClaimReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "ResourceClaim") + os.Exit(1) + } if err = (&controller.NodeReconciler{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), diff --git a/internal/constants/constants.go b/internal/constants/constants.go index 22d465cc..9a44345c 100644 --- a/internal/constants/constants.go +++ b/internal/constants/constants.go @@ -124,12 +124,20 @@ const ( // DRA support // annotation for pod to indicate if DRA is enabled - DRAEnabledAnnotation = Domain + "/dra-enabled" - DRAResourceClaimName = "tensor-fusion-resource-claim-%s" + DRAEnabledAnnotation = Domain + "/dra-enabled" + DRACelExpressionAnnotation = Domain + "/dra-cel-expression" + + DRADriverName = Domain + ".dra-driver" + DRAResourceClaimName = "tensor-fusion-resource-claim-%s-%s" // resource claim name for request DRAResourceClaimRequestName = "tensor-fusion-resource-claim-request-%s" DRAClaimDefineName = "tensor-fusion-gpu-claim" + + TensorFusionResourceClaimTemplateLabel = Domain + "/resource-claim-template" + + // ResourceClaimTemplate related constants + DRAResourceClaimTemplateName = "tensor-fusion-gpu-template" ) // for avoid golang lint issues diff --git a/internal/controller/dra/resourceclaim_controller.go b/internal/controller/dra/resourceclaim_controller.go new file mode 100644 index 00000000..679fb8cb --- /dev/null +++ b/internal/controller/dra/resourceclaim_controller.go @@ -0,0 +1,184 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package dra + +import ( + "context" + "fmt" + + resourcev1beta2 "k8s.io/api/resource/v1beta2" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" + + "github.com/NexusGPU/tensor-fusion/internal/constants" +) + +// ResourceClaimReconciler reconciles ResourceClaim objects +type ResourceClaimReconciler struct { + client.Client + Scheme *runtime.Scheme +} + +//+kubebuilder:rbac:groups=resource.k8s.io,resources=resourceclaims,verbs=get;list;watch;create;update;patch;delete +//+kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch + +// Reconcile is part of the main kubernetes reconciliation loop which aims to +// move the current state of the cluster closer to the desired state. +func (r *ResourceClaimReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + log := log.FromContext(ctx) + + // Fetch the ResourceClaim instance + resourceClaim := &resourcev1beta2.ResourceClaim{} + if err := r.Get(ctx, req.NamespacedName, resourceClaim); err != nil { + if errors.IsNotFound(err) { + // Request object not found, could have been deleted after reconcile request. + // Owned objects are automatically garbage collected. For additional cleanup logic use finalizers. + // Return and don't requeue + log.Info("ResourceClaim resource not found. Ignoring since object must be deleted") + return ctrl.Result{}, nil + } + // Error reading the object - requeue the request. + log.Error(err, "Failed to get ResourceClaim") + return ctrl.Result{}, err + } + + // Check if this ResourceClaim is created from our ResourceClaimTemplate + if resourceClaim.Labels == nil { + // No labels, not our ResourceClaim + return ctrl.Result{}, nil + } + + labelValue, exists := resourceClaim.Labels[constants.TensorFusionResourceClaimTemplateLabel] + if !exists || labelValue != constants.TrueStringValue { + // Not our ResourceClaim, ignore + return ctrl.Result{}, nil + } + + log.Info("Processing TensorFusion ResourceClaim", "name", resourceClaim.Name, "namespace", resourceClaim.Namespace) + + // Find the owner Pod to get the CEL expression annotation + ownerPod, err := r.findOwnerPod(ctx, resourceClaim) + if err != nil { + log.Error(err, "Failed to find owner Pod") + return ctrl.Result{}, err + } + + if ownerPod == nil { + log.Info("Owner Pod not found, ResourceClaim may not have OwnerReference yet") + return ctrl.Result{RequeueAfter: constants.PendingRequeueDuration}, nil + } + + // Get CEL expression from Pod annotation + celExpression := ownerPod.Annotations[constants.DRACelExpressionAnnotation] + if celExpression == "" { + log.Info("No CEL expression found in Pod annotation", "pod", ownerPod.Name) + return ctrl.Result{}, nil + } + + // Update ResourceClaim with CEL expression + if err := r.updateResourceClaimCEL(ctx, resourceClaim, celExpression); err != nil { + log.Error(err, "Failed to update ResourceClaim CEL expression") + return ctrl.Result{}, err + } + + log.Info("Successfully updated ResourceClaim with CEL expression", "cel", celExpression) + return ctrl.Result{}, nil +} + +// findOwnerPod finds the Pod that owns this ResourceClaim +func (r *ResourceClaimReconciler) findOwnerPod(ctx context.Context, resourceClaim *resourcev1beta2.ResourceClaim) (*corev1.Pod, error) { + // Find the Pod OwnerReference (there should be exactly 
one) + var podOwnerRef *metav1.OwnerReference + for i, ownerRef := range resourceClaim.OwnerReferences { + if ownerRef.Kind == "Pod" && ownerRef.APIVersion == "v1" { + podOwnerRef = &resourceClaim.OwnerReferences[i] + break + } + } + + if podOwnerRef == nil { + return nil, nil // No Pod owner found + } + + // Get the Pod by name and namespace (UID is automatically verified by Kubernetes) + pod := &corev1.Pod{} + err := r.Get(ctx, types.NamespacedName{ + Name: podOwnerRef.Name, + Namespace: resourceClaim.Namespace, + }, pod) + if err != nil { + if errors.IsNotFound(err) { + return nil, nil // Pod was deleted + } + return nil, fmt.Errorf("failed to get owner Pod %s/%s: %w", resourceClaim.Namespace, podOwnerRef.Name, err) + } + + // Verify the UID matches (additional safety check) + if pod.UID != podOwnerRef.UID { + return nil, fmt.Errorf("Pod UID mismatch: expected %s, got %s", podOwnerRef.UID, pod.UID) + } + + return pod, nil +} + +// updateResourceClaimCEL updates the ResourceClaim's CEL selector expression +func (r *ResourceClaimReconciler) updateResourceClaimCEL(ctx context.Context, resourceClaim *resourcev1beta2.ResourceClaim, celExpression string) error { + // Check if we need to update + if len(resourceClaim.Spec.Devices.Requests) == 0 { + return fmt.Errorf("no device requests found in ResourceClaim") + } + + deviceReq := &resourceClaim.Spec.Devices.Requests[0] + if deviceReq.Exactly == nil { + return fmt.Errorf("no ExactDeviceRequest found") + } + + // Check if CEL expression is already set correctly + if len(deviceReq.Exactly.Selectors) > 0 && + deviceReq.Exactly.Selectors[0].CEL != nil && + deviceReq.Exactly.Selectors[0].CEL.Expression == celExpression { + // Already updated + return nil + } + + // Update the CEL expression + if len(deviceReq.Exactly.Selectors) == 0 { + deviceReq.Exactly.Selectors = []resourcev1beta2.DeviceSelector{{}} + } + + if deviceReq.Exactly.Selectors[0].CEL == nil { + deviceReq.Exactly.Selectors[0].CEL = &resourcev1beta2.CELDeviceSelector{} + } + + deviceReq.Exactly.Selectors[0].CEL.Expression = celExpression + + // Update the ResourceClaim + return r.Update(ctx, resourceClaim) +} + +// SetupWithManager sets up the controller with the Manager. +func (r *ResourceClaimReconciler) SetupWithManager(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + For(&resourcev1beta2.ResourceClaim{}). 
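+		// Note: this registers the reconciler for every ResourceClaim in the cluster;
+		// Reconcile returns early for claims that lack the TensorFusion
+		// resource-claim-template label, so unrelated claims are cheap no-ops.
+		// A label-based event filter here is a possible follow-up, not assumed to exist yet.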
+ Complete(r) +} \ No newline at end of file diff --git a/internal/controller/dra/resourceclaim_controller_test.go b/internal/controller/dra/resourceclaim_controller_test.go new file mode 100644 index 00000000..ece541f9 --- /dev/null +++ b/internal/controller/dra/resourceclaim_controller_test.go @@ -0,0 +1,557 @@ +package dra + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + resourcev1beta2 "k8s.io/api/resource/v1beta2" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + "github.com/NexusGPU/tensor-fusion/internal/constants" +) + +func TestResourceClaimReconciler_Reconcile(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, resourcev1beta2.AddToScheme(scheme)) + require.NoError(t, corev1.AddToScheme(scheme)) + + tests := []struct { + name string + resourceClaim *resourcev1beta2.ResourceClaim + pod *corev1.Pod + expectedResult ctrl.Result + expectError bool + expectUpdate bool + }{ + { + name: "ResourceClaim not found", + expectedResult: ctrl.Result{}, + expectError: false, + }, + { + name: "ResourceClaim without TensorFusion label", + resourceClaim: &resourcev1beta2.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-claim", + Namespace: "default", + }, + }, + expectedResult: ctrl.Result{}, + expectError: false, + }, + { + name: "ResourceClaim with wrong label value", + resourceClaim: &resourcev1beta2.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-claim", + Namespace: "default", + Labels: map[string]string{ + constants.TensorFusionResourceClaimTemplateLabel: "false", + }, + }, + }, + expectedResult: ctrl.Result{}, + expectError: false, + }, + { + name: "ResourceClaim without owner Pod", + resourceClaim: &resourcev1beta2.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-claim", + Namespace: "default", + Labels: map[string]string{ + constants.TensorFusionResourceClaimTemplateLabel: constants.TrueStringValue, + }, + }, + Spec: resourcev1beta2.ResourceClaimSpec{ + Devices: resourcev1beta2.DeviceClaim{ + Requests: []resourcev1beta2.DeviceRequest{ + { + Name: "gpu-request", + Exactly: &resourcev1beta2.ExactDeviceRequest{ + Count: 1, + }, + }, + }, + }, + }, + }, + expectedResult: ctrl.Result{RequeueAfter: constants.PendingRequeueDuration}, + expectError: false, + }, + { + name: "Owner Pod without CEL annotation", + resourceClaim: &resourcev1beta2.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-claim", + Namespace: "default", + Labels: map[string]string{ + constants.TensorFusionResourceClaimTemplateLabel: constants.TrueStringValue, + }, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "v1", + Kind: "Pod", + Name: "test-pod", + UID: "pod-uid-123", + }, + }, + }, + Spec: resourcev1beta2.ResourceClaimSpec{ + Devices: resourcev1beta2.DeviceClaim{ + Requests: []resourcev1beta2.DeviceRequest{ + { + Name: "gpu-request", + Exactly: &resourcev1beta2.ExactDeviceRequest{ + Count: 1, + }, + }, + }, + }, + }, + }, + pod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + Namespace: "default", + UID: "pod-uid-123", + }, + }, + expectedResult: ctrl.Result{}, + expectError: false, + }, + { + name: "Successful CEL expression update", + resourceClaim: &resourcev1beta2.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-claim", + Namespace: "default", + 
Labels: map[string]string{ + constants.TensorFusionResourceClaimTemplateLabel: constants.TrueStringValue, + }, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "v1", + Kind: "Pod", + Name: "test-pod", + UID: "pod-uid-123", + }, + }, + }, + Spec: resourcev1beta2.ResourceClaimSpec{ + Devices: resourcev1beta2.DeviceClaim{ + Requests: []resourcev1beta2.DeviceRequest{ + { + Name: "gpu-request", + Exactly: &resourcev1beta2.ExactDeviceRequest{ + Count: 1, + }, + }, + }, + }, + }, + }, + pod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + Namespace: "default", + UID: "pod-uid-123", + Annotations: map[string]string{ + constants.DRACelExpressionAnnotation: `device.attributes["tflops"].quantity >= quantity("10")`, + }, + }, + }, + expectedResult: ctrl.Result{}, + expectError: false, + expectUpdate: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + var objects []runtime.Object + if tt.resourceClaim != nil { + objects = append(objects, tt.resourceClaim) + } + if tt.pod != nil { + objects = append(objects, tt.pod) + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithRuntimeObjects(objects...). + Build() + + reconciler := &ResourceClaimReconciler{ + Client: fakeClient, + Scheme: scheme, + } + + req := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: "test-claim", + Namespace: "default", + }, + } + + result, err := reconciler.Reconcile(context.Background(), req) + + if tt.expectError { + require.Error(t, err) + } else { + require.NoError(t, err) + } + + assert.Equal(t, tt.expectedResult, result) + + // Check if ResourceClaim was updated with CEL expression + if tt.expectUpdate && tt.resourceClaim != nil { + updatedClaim := &resourcev1beta2.ResourceClaim{} + err := fakeClient.Get(context.Background(), types.NamespacedName{ + Name: tt.resourceClaim.Name, + Namespace: tt.resourceClaim.Namespace, + }, updatedClaim) + require.NoError(t, err) + + require.Len(t, updatedClaim.Spec.Devices.Requests, 1) + deviceReq := updatedClaim.Spec.Devices.Requests[0] + require.NotNil(t, deviceReq.Exactly) + require.Len(t, deviceReq.Exactly.Selectors, 1) + require.NotNil(t, deviceReq.Exactly.Selectors[0].CEL) + assert.Equal(t, `device.attributes["tflops"].quantity >= quantity("10")`, deviceReq.Exactly.Selectors[0].CEL.Expression) + } + }) + } +} + +func TestResourceClaimReconciler_findOwnerPod(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, corev1.AddToScheme(scheme)) + require.NoError(t, resourcev1beta2.AddToScheme(scheme)) + + tests := []struct { + name string + resourceClaim *resourcev1beta2.ResourceClaim + pod *corev1.Pod + expectedPod *corev1.Pod + expectError bool + }{ + { + name: "No owner references", + resourceClaim: &resourcev1beta2.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-claim", + Namespace: "default", + }, + }, + expectedPod: nil, + expectError: false, + }, + { + name: "No Pod owner reference", + resourceClaim: &resourcev1beta2.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-claim", + Namespace: "default", + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "apps/v1", + Kind: "Deployment", + Name: "test-deployment", + UID: "deployment-uid-123", + }, + }, + }, + }, + expectedPod: nil, + expectError: false, + }, + { + name: "Pod owner not found", + resourceClaim: &resourcev1beta2.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-claim", + Namespace: "default", + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "v1", + Kind: 
"Pod", + Name: "nonexistent-pod", + UID: "pod-uid-123", + }, + }, + }, + }, + expectedPod: nil, + expectError: false, + }, + { + name: "Pod UID mismatch", + resourceClaim: &resourcev1beta2.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-claim", + Namespace: "default", + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "v1", + Kind: "Pod", + Name: "test-pod", + UID: "pod-uid-123", + }, + }, + }, + }, + pod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + Namespace: "default", + UID: "different-uid", + }, + }, + expectedPod: nil, + expectError: true, + }, + { + name: "Successful Pod lookup", + resourceClaim: &resourcev1beta2.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-claim", + Namespace: "default", + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "v1", + Kind: "Pod", + Name: "test-pod", + UID: "pod-uid-123", + }, + }, + }, + }, + pod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + Namespace: "default", + UID: "pod-uid-123", + }, + }, + expectedPod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + Namespace: "default", + UID: "pod-uid-123", + }, + }, + expectError: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + var objects []runtime.Object + if tt.pod != nil { + objects = append(objects, tt.pod) + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithRuntimeObjects(objects...). + Build() + + reconciler := &ResourceClaimReconciler{ + Client: fakeClient, + Scheme: scheme, + } + + pod, err := reconciler.findOwnerPod(context.Background(), tt.resourceClaim) + + if tt.expectError { + require.Error(t, err) + assert.Nil(t, pod) + } else { + require.NoError(t, err) + if tt.expectedPod == nil { + assert.Nil(t, pod) + } else { + require.NotNil(t, pod) + assert.Equal(t, tt.expectedPod.Name, pod.Name) + assert.Equal(t, tt.expectedPod.Namespace, pod.Namespace) + assert.Equal(t, tt.expectedPod.UID, pod.UID) + } + } + }) + } +} + +func TestResourceClaimReconciler_updateResourceClaimCEL(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, resourcev1beta2.AddToScheme(scheme)) + + tests := []struct { + name string + resourceClaim *resourcev1beta2.ResourceClaim + celExpression string + expectError bool + expectUpdate bool + }{ + { + name: "No device requests", + resourceClaim: &resourcev1beta2.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-claim", + Namespace: "default", + }, + Spec: resourcev1beta2.ResourceClaimSpec{ + Devices: resourcev1beta2.DeviceClaim{ + Requests: []resourcev1beta2.DeviceRequest{}, + }, + }, + }, + celExpression: `device.attributes["tflops"].quantity >= quantity("10")`, + expectError: true, + }, + { + name: "No ExactDeviceRequest", + resourceClaim: &resourcev1beta2.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-claim", + Namespace: "default", + }, + Spec: resourcev1beta2.ResourceClaimSpec{ + Devices: resourcev1beta2.DeviceClaim{ + Requests: []resourcev1beta2.DeviceRequest{ + { + Name: "gpu-request", + // Exactly is nil + }, + }, + }, + }, + }, + celExpression: `device.attributes["tflops"].quantity >= quantity("10")`, + expectError: true, + }, + { + name: "CEL expression already set correctly", + resourceClaim: &resourcev1beta2.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-claim", + Namespace: "default", + }, + Spec: resourcev1beta2.ResourceClaimSpec{ + Devices: resourcev1beta2.DeviceClaim{ + Requests: []resourcev1beta2.DeviceRequest{ + { + Name: 
"gpu-request", + Exactly: &resourcev1beta2.ExactDeviceRequest{ + Count: 1, + Selectors: []resourcev1beta2.DeviceSelector{ + { + CEL: &resourcev1beta2.CELDeviceSelector{ + Expression: `device.attributes["tflops"].quantity >= quantity("10")`, + }, + }, + }, + }, + }, + }, + }, + }, + }, + celExpression: `device.attributes["tflops"].quantity >= quantity("10")`, + expectError: false, + expectUpdate: false, // No update needed + }, + { + name: "Successful CEL expression update - empty selectors", + resourceClaim: &resourcev1beta2.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-claim", + Namespace: "default", + }, + Spec: resourcev1beta2.ResourceClaimSpec{ + Devices: resourcev1beta2.DeviceClaim{ + Requests: []resourcev1beta2.DeviceRequest{ + { + Name: "gpu-request", + Exactly: &resourcev1beta2.ExactDeviceRequest{ + Count: 1, + }, + }, + }, + }, + }, + }, + celExpression: `device.attributes["tflops"].quantity >= quantity("10")`, + expectError: false, + expectUpdate: true, + }, + { + name: "Successful CEL expression update - nil CEL", + resourceClaim: &resourcev1beta2.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-claim", + Namespace: "default", + }, + Spec: resourcev1beta2.ResourceClaimSpec{ + Devices: resourcev1beta2.DeviceClaim{ + Requests: []resourcev1beta2.DeviceRequest{ + { + Name: "gpu-request", + Exactly: &resourcev1beta2.ExactDeviceRequest{ + Count: 1, + Selectors: []resourcev1beta2.DeviceSelector{ + { + // CEL is nil + }, + }, + }, + }, + }, + }, + }, + }, + celExpression: `device.attributes["vram"].quantity >= quantity("8Gi")`, + expectError: false, + expectUpdate: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithRuntimeObjects(tt.resourceClaim). 
+ Build() + + reconciler := &ResourceClaimReconciler{ + Client: fakeClient, + Scheme: scheme, + } + + err := reconciler.updateResourceClaimCEL(context.Background(), tt.resourceClaim, tt.celExpression) + + if tt.expectError { + require.Error(t, err) + } else { + require.NoError(t, err) + + if tt.expectUpdate { + // Verify the CEL expression was set correctly + require.Len(t, tt.resourceClaim.Spec.Devices.Requests, 1) + deviceReq := tt.resourceClaim.Spec.Devices.Requests[0] + require.NotNil(t, deviceReq.Exactly) + require.Len(t, deviceReq.Exactly.Selectors, 1) + require.NotNil(t, deviceReq.Exactly.Selectors[0].CEL) + assert.Equal(t, tt.celExpression, deviceReq.Exactly.Selectors[0].CEL.Expression) + } + } + }) + } +} \ No newline at end of file diff --git a/internal/webhook/v1/pod_dra.go b/internal/webhook/v1/pod_dra.go index ef2bd0b3..87f73154 100644 --- a/internal/webhook/v1/pod_dra.go +++ b/internal/webhook/v1/pod_dra.go @@ -18,15 +18,10 @@ package v1 import ( "context" - "crypto/rand" - "encoding/hex" "fmt" "strings" corev1 "k8s.io/api/core/v1" - resourcev1beta2 "k8s.io/api/resource/v1beta2" - "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/client" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" @@ -37,16 +32,9 @@ import ( // DRAProcessor handles all DRA-related operations for pod admission type DRAProcessor struct { client.Client - enableDRA bool - resourceClass string // cached resource class to avoid repeated API calls - configLoaded bool // tracks if configuration has been loaded -} - -// generateUniqueID creates a random 8-character hex string for resource claim names -func generateUniqueID() string { - bytes := make([]byte, 4) - _, _ = rand.Read(bytes) // crypto/rand.Read always returns len(bytes), nil on success - return hex.EncodeToString(bytes) + enableDRA bool + resourceClaimTemplateName string // cached ResourceClaimTemplate name + configLoaded bool // tracks if configuration has been loaded } // NewDRAProcessor creates a new DRA processor @@ -67,6 +55,7 @@ func (p *DRAProcessor) InitializeDRAConfig(ctx context.Context) error { // Set defaults first p.enableDRA = false + p.resourceClaimTemplateName = constants.DRAResourceClaimTemplateName templateList := &tfv1.SchedulingConfigTemplateList{} // Use the provided context to respect cancellation @@ -78,29 +67,29 @@ func (p *DRAProcessor) InitializeDRAConfig(ctx context.Context) error { return nil } - // Check if any template has DRA enabled and cache the resource class + // Check if any template has DRA enabled and cache the ResourceClaimTemplateName for _, template := range templateList.Items { if template.Spec.DRA != nil { if template.Spec.DRA.Enable != nil && *template.Spec.DRA.Enable { p.enableDRA = true } - // Cache the resource class from the template - if template.Spec.DRA.ResourceClass != "" { - p.resourceClass = template.Spec.DRA.ResourceClass + // Cache the ResourceClaimTemplateName from the template + if template.Spec.DRA.ResourceClaimTemplateName != "" { + p.resourceClaimTemplateName = template.Spec.DRA.ResourceClaimTemplateName } } } - if p.enableDRA && p.resourceClass == "" { - return fmt.Errorf("resource class is not set") - } - p.configLoaded = true return nil } // IsDRAEnabled checks if DRA is enabled for a specific pod func (p *DRAProcessor) IsDRAEnabled(ctx context.Context, pod *corev1.Pod) bool { + // Load configuration if not yet loaded (lazy loading) + if !p.configLoaded { + _ = p.InitializeDRAConfig(ctx) // Ignore error to maintain 
backward compatibility + } // Check pod-level annotation first (explicit override) if val, ok := pod.Annotations[constants.DRAEnabledAnnotation]; ok && val == constants.TrueStringValue { @@ -121,161 +110,29 @@ func HasDRAClaim(pod *corev1.Pod) bool { return len(pod.Spec.ResourceClaims) > 0 } -// convertToResourceClaim converts GPU resource requests to ResourceClaim -func (p *DRAProcessor) convertToResourceClaim(pod *corev1.Pod, tfInfo *utils.TensorFusionInfo) (*resourcev1beta2.ResourceClaim, error) { +// HandleDRAAdmission handles the complete DRA admission process +func (p *DRAProcessor) HandleDRAAdmission(ctx context.Context, pod *corev1.Pod, tfInfo *utils.TensorFusionInfo, containerIndices []int) error { + // Load DRA configuration if needed + if err := p.InitializeDRAConfig(ctx); err != nil { + return fmt.Errorf("failed to load DRA config: %w", err) + } - // Build CEL selector using DRA helper + // Convert GPU resources to ResourceClaimTemplate reference and store CEL in annotation celSelector, err := BuildCELSelector(pod, tfInfo) if err != nil { - return nil, fmt.Errorf("failed to build CEL selector: %w", err) - } - - // Generate unique claim name with random suffix to avoid conflicts - var baseName string - - if pod.GenerateName != "" { - baseName = strings.TrimSuffix(pod.GenerateName, "-") - } else if pod.Name != "" { - baseName = pod.Name - } - - uniqueID := generateUniqueID() - claimName := fmt.Sprintf(constants.DRAResourceClaimName, baseName, uniqueID) - - // Use cached resource class instead of making API calls - resourceClass := p.resourceClass - - claim := &resourcev1beta2.ResourceClaim{ - TypeMeta: metav1.TypeMeta{ - APIVersion: "resource.k8s.io/v1beta2", - Kind: "ResourceClaim", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: claimName, - Namespace: pod.Namespace, - // Note: We cannot set OwnerReference here because pod.UID is empty during admission. - // The controller will set the proper owner reference once the Pod is created. 
- }, - Spec: resourcev1beta2.ResourceClaimSpec{ - Devices: resourcev1beta2.DeviceClaim{ - Requests: []resourcev1beta2.DeviceRequest{ - { - Name: fmt.Sprintf(constants.DRAResourceClaimRequestName, generateUniqueID()), - Exactly: &resourcev1beta2.ExactDeviceRequest{ - DeviceClassName: resourceClass, - Selectors: []resourcev1beta2.DeviceSelector{ - { - CEL: &resourcev1beta2.CELDeviceSelector{ - Expression: celSelector, - }, - }, - }, - Count: int64(tfInfo.Profile.GPUCount), - }, - }, - }, - }, - }, - } - - return claim, nil -} - -// injectResourceClaimRef adds ResourceClaim reference to Pod spec -func (p *DRAProcessor) injectResourceClaimRef(pod *corev1.Pod, claim *resourcev1beta2.ResourceClaim, containerIndices []int) { - // Add ResourceClaim reference to pod.Spec.ResourceClaims - if pod.Spec.ResourceClaims == nil { - pod.Spec.ResourceClaims = []corev1.PodResourceClaim{} - } - - claimRef := corev1.PodResourceClaim{ - Name: constants.DRAClaimDefineName, - ResourceClaimName: &claim.Name, - } - - // Check if the claim reference already exists to maintain idempotency - claimExists := false - for i, existingClaim := range pod.Spec.ResourceClaims { - if existingClaim.Name == constants.DRAClaimDefineName { - // Update existing claim to point to the new ResourceClaim name - pod.Spec.ResourceClaims[i].ResourceClaimName = &claim.Name - claimExists = true - break - } - } - - if !claimExists { - pod.Spec.ResourceClaims = append(pod.Spec.ResourceClaims, claimRef) - } - - // Add resource claim consumption to containers - for _, containerIndex := range containerIndices { - container := &pod.Spec.Containers[containerIndex] - if container.Resources.Claims == nil { - container.Resources.Claims = []corev1.ResourceClaim{} - } - - // Check if the container already has this claim to maintain idempotency - hasGPUClaim := false - for _, existingClaim := range container.Resources.Claims { - if existingClaim.Name == constants.DRAClaimDefineName { - hasGPUClaim = true - break - } - } - - if !hasGPUClaim { - container.Resources.Claims = append(container.Resources.Claims, corev1.ResourceClaim{ - Name: constants.DRAClaimDefineName, - }) - } - } -} - -// createResourceClaim creates a ResourceClaim object with proper error handling and retries -func (p *DRAProcessor) createResourceClaim(ctx context.Context, claim *resourcev1beta2.ResourceClaim) error { - // Try to create the ResourceClaim - if err := p.Create(ctx, claim); err != nil { - if errors.IsAlreadyExists(err) { - // Check if the existing claim is for the same pod - existingClaim := &resourcev1beta2.ResourceClaim{} - getErr := p.Get(ctx, client.ObjectKey{Name: claim.Name, Namespace: claim.Namespace}, existingClaim) - if getErr != nil { - return fmt.Errorf("failed to check existing ResourceClaim: %w", getErr) - } - // Different pod or missing labels, this is an error - return fmt.Errorf("ResourceClaim %s already exists for a different pod", claim.Name) - } - - if errors.IsInvalid(err) { - return fmt.Errorf("ResourceClaim is invalid: %w", err) - } - - if errors.IsForbidden(err) { - return fmt.Errorf("insufficient permissions to create ResourceClaim: %w", err) - } + return fmt.Errorf("failed to build CEL selector: %w", err) } - return nil -} - -// Note: patchTFClientForDRA is temporarily handled in the main pod_webhook.go -// until we can properly abstract all the TF client patching logic + // Inject ResourceClaimTemplate reference to Pod + p.injectResourceClaimTemplateRef(pod) -// HandleDRAAdmission handles the complete DRA admission process -func (p *DRAProcessor) 
HandleDRAAdmission(ctx context.Context, pod *corev1.Pod, tfInfo *utils.TensorFusionInfo, containerIndices []int) error { - // Convert GPU resources to ResourceClaim - resourceClaim, err := p.convertToResourceClaim(pod, tfInfo) - if err != nil { - return fmt.Errorf("failed to convert to ResourceClaim: %w", err) + // Mark pod with DRA enabled annotation + if pod.Annotations == nil { + pod.Annotations = make(map[string]string) } + pod.Annotations[constants.DRAEnabledAnnotation] = constants.TrueStringValue + pod.Annotations[constants.DRACelExpressionAnnotation] = celSelector - // Create ResourceClaim - if err := p.createResourceClaim(ctx, resourceClaim); err != nil { - return fmt.Errorf("failed to create ResourceClaim: %w", err) - } - // Inject ResourceClaim reference to Pod - p.injectResourceClaimRef(pod, resourceClaim, containerIndices) return nil } @@ -305,3 +162,30 @@ func BuildCELSelector(pod *corev1.Pod, tfInfo *utils.TensorFusionInfo) (string, return strings.Join(conditions, " && "), nil } + +// injectResourceClaimTemplateRef adds ResourceClaimTemplate reference to Pod spec +func (p *DRAProcessor) injectResourceClaimTemplateRef(pod *corev1.Pod) { + // Add ResourceClaimTemplate reference to pod.Spec.ResourceClaims + if pod.Spec.ResourceClaims == nil { + pod.Spec.ResourceClaims = []corev1.PodResourceClaim{} + } + + // Use ResourceClaimTemplate instead of direct ResourceClaim + claimRef := corev1.PodResourceClaim{ + Name: constants.DRAClaimDefineName, + ResourceClaimTemplateName: &p.resourceClaimTemplateName, + } + + // Check if the claim reference already exists to maintain idempotency + claimExists := false + for _, existingClaim := range pod.Spec.ResourceClaims { + if existingClaim.Name == constants.DRAClaimDefineName { + claimExists = true + break + } + } + + if !claimExists { + pod.Spec.ResourceClaims = append(pod.Spec.ResourceClaims, claimRef) + } +} diff --git a/internal/webhook/v1/pod_webhook_dra_test.go b/internal/webhook/v1/pod_webhook_dra_test.go index 09a738b0..fd625cc9 100644 --- a/internal/webhook/v1/pod_webhook_dra_test.go +++ b/internal/webhook/v1/pod_webhook_dra_test.go @@ -7,11 +7,9 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" corev1 "k8s.io/api/core/v1" - resourcev1beta2 "k8s.io/api/resource/v1beta2" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" @@ -74,10 +72,9 @@ func TestDRAProcessor_IsDRAEnabled(t *testing.T) { } } -func TestDRAProcessor_convertToResourceClaim(t *testing.T) { +func TestDRAProcessor_HandleDRAAdmission(t *testing.T) { scheme := runtime.NewScheme() require.NoError(t, tfv1.AddToScheme(scheme)) - require.NoError(t, resourcev1beta2.AddToScheme(scheme)) // Create a SchedulingConfigTemplate with DRA config template := &tfv1.SchedulingConfigTemplate{ @@ -86,8 +83,8 @@ func TestDRAProcessor_convertToResourceClaim(t *testing.T) { }, Spec: tfv1.SchedulingConfigTemplateSpec{ DRA: &tfv1.DRAConfig{ - Enable: &[]bool{true}[0], - ResourceClass: "custom.tensorfusion.ai/gpu", + Enable: &[]bool{true}[0], + ResourceClaimTemplateName: "custom-gpu-template", }, }, } @@ -101,16 +98,15 @@ func TestDRAProcessor_convertToResourceClaim(t *testing.T) { Client: fakeClient, } - // Initialize DRA config to set up the resource class cache - err := processor.InitializeDRAConfig(context.Background()) - require.NoError(t, err) - pod := &corev1.Pod{ 
ObjectMeta: metav1.ObjectMeta{ - Name: "test-pod", - Namespace: "test-namespace", - GenerateName: "test-pod-", - UID: types.UID("test-uid"), + Name: "test-pod", + Namespace: "test-namespace", + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + {Name: "test-container"}, + }, }, } @@ -126,180 +122,64 @@ func TestDRAProcessor_convertToResourceClaim(t *testing.T) { }, } - claim, err := processor.convertToResourceClaim(pod, tfInfo) + containerIndices := []int{0} + + // Test HandleDRAAdmission + err := processor.HandleDRAAdmission(context.Background(), pod, tfInfo, containerIndices) require.NoError(t, err) - require.NotNil(t, claim) - - // Verify claim structure - assert.Contains(t, claim.Name, "test-pod-") - assert.Contains(t, claim.Name, "-gpu-claim") - assert.Equal(t, "test-namespace", claim.Namespace) - assert.Equal(t, "resource.k8s.io/v1beta2", claim.APIVersion) - assert.Equal(t, "ResourceClaim", claim.Kind) - - // Verify labels instead of owner references (since we removed owner references during admission) - require.NotNil(t, claim.Labels) - assert.Equal(t, "test-pod-", claim.Labels["tensorfusion.ai/pod"]) // Uses GenerateName as podIdentifier - assert.Equal(t, "gpu", claim.Labels["tensorfusion.ai/claim-for"]) - - // Verify device claim - require.Len(t, claim.Spec.Devices.Requests, 1) - deviceReq := claim.Spec.Devices.Requests[0] - assert.Equal(t, "gpu", deviceReq.Name) - - // Verify ExactDeviceRequest structure - require.NotNil(t, deviceReq.Exactly) - exactReq := deviceReq.Exactly - assert.Equal(t, "custom.tensorfusion.ai/gpu", exactReq.DeviceClassName) // Uses cached resource class from template - assert.Equal(t, int64(1), exactReq.Count) - - // Verify CEL selector - require.Len(t, exactReq.Selectors, 1) - require.NotNil(t, exactReq.Selectors[0].CEL) - - // The simplified CEL selector should only contain basic resource requirements - celExpression := exactReq.Selectors[0].CEL.Expression - - // Verify it contains the expected resource filters (simplified version) + + // Verify CEL expression is stored in Pod annotation + celExpression := pod.Annotations[constants.DRACelExpressionAnnotation] + require.NotEmpty(t, celExpression) assert.Contains(t, celExpression, `device.attributes["tflops"].quantity >= quantity("10")`) assert.Contains(t, celExpression, `device.attributes["vram"].quantity >= quantity("8Gi")`) - // Verify conditions are combined with AND - assert.Contains(t, celExpression, " && ") -} + // Verify DRA enabled annotation is set + assert.Equal(t, constants.TrueStringValue, pod.Annotations[constants.DRAEnabledAnnotation]) -func TestDRAProcessor_injectResourceClaimRef(t *testing.T) { - processor := &DRAProcessor{} + // Verify ResourceClaimTemplate reference is added to Pod + require.Len(t, pod.Spec.ResourceClaims, 1) + podClaim := pod.Spec.ResourceClaims[0] + assert.Equal(t, constants.DRAClaimDefineName, podClaim.Name) + require.NotNil(t, podClaim.ResourceClaimTemplateName) + assert.Equal(t, "custom-gpu-template", *podClaim.ResourceClaimTemplateName) + + // Verify processor has cached the ResourceClaimTemplateName + assert.Equal(t, "custom-gpu-template", processor.resourceClaimTemplateName) +} +func TestBuildCELSelector(t *testing.T) { pod := &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: "test-pod", Namespace: "test-namespace", }, - Spec: corev1.PodSpec{ - Containers: []corev1.Container{ - {Name: "container1"}, - {Name: "container2"}, - }, - }, - } - - claim := &resourcev1beta2.ResourceClaim{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-claim", - Namespace: 
"test-namespace", - }, - } - - containerIndices := []int{0, 1} - - processor.injectResourceClaimRef(pod, claim, containerIndices) - - // Verify pod resource claims - require.Len(t, pod.Spec.ResourceClaims, 1) - podClaim := pod.Spec.ResourceClaims[0] - assert.Equal(t, "gpu-claim", podClaim.Name) - require.NotNil(t, podClaim.ResourceClaimName) - assert.Equal(t, "test-claim", *podClaim.ResourceClaimName) - - // Verify container resource claims - for _, idx := range containerIndices { - container := pod.Spec.Containers[idx] - require.Len(t, container.Resources.Claims, 1) - assert.Equal(t, "gpu-claim", container.Resources.Claims[0].Name) } - // Verify annotations - require.NotNil(t, pod.Annotations) - assert.Equal(t, constants.TrueStringValue, pod.Annotations[constants.DRAEnabledAnnotation]) -} - -func TestDRAProcessor_createResourceClaim(t *testing.T) { - scheme := runtime.NewScheme() - require.NoError(t, resourcev1beta2.AddToScheme(scheme)) - - tests := []struct { - name string - existingClaim *resourcev1beta2.ResourceClaim - expectError bool - errorType string - }{ - { - name: "successful creation", - expectError: false, - }, - { - name: "claim already exists with same pod", - existingClaim: &resourcev1beta2.ResourceClaim{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-claim", - Namespace: "test-namespace", - Labels: map[string]string{ - "tensorfusion.ai/pod": "test-pod", - "tensorfusion.ai/claim-for": "gpu", - }, - }, - }, - expectError: false, - }, - { - name: "claim already exists with different pod", - existingClaim: &resourcev1beta2.ResourceClaim{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-claim", - Namespace: "test-namespace", - Labels: map[string]string{ - "tensorfusion.ai/pod": "different-pod", - "tensorfusion.ai/claim-for": "gpu", - }, + tfInfo := &utils.TensorFusionInfo{ + Profile: &tfv1.WorkloadProfileSpec{ + GPUCount: 2, + Resources: tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("20"), + Vram: resource.MustParse("16Gi"), }, }, - expectError: true, - errorType: "conflict", + GPUModel: "H100", }, } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - var fakeClient client.Client - if tt.existingClaim != nil { - fakeClient = fake.NewClientBuilder(). - WithScheme(scheme). - WithObjects(tt.existingClaim). - Build() - } else { - fakeClient = fake.NewClientBuilder(). - WithScheme(scheme). 
- Build() - } - - processor := &DRAProcessor{ - Client: fakeClient, - } - - claim := &resourcev1beta2.ResourceClaim{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-claim", - Namespace: "test-namespace", - Labels: map[string]string{ - "tensorfusion.ai/pod": "test-pod", - "tensorfusion.ai/claim-for": "gpu", - }, - }, - } + celExpression, err := BuildCELSelector(pod, tfInfo) + require.NoError(t, err) + require.NotEmpty(t, celExpression) - err := processor.createResourceClaim(context.Background(), claim) + // Verify it contains the expected resource filters + assert.Contains(t, celExpression, `device.attributes["tflops"].quantity >= quantity("20")`) + assert.Contains(t, celExpression, `device.attributes["vram"].quantity >= quantity("16Gi")`) + assert.Contains(t, celExpression, `device.attributes["model"] == "H100"`) - if tt.expectError { - require.Error(t, err) - if tt.errorType == "conflict" { - assert.Contains(t, err.Error(), "already exists for a different pod") - } - } else { - require.NoError(t, err) - } - }) - } + // Verify conditions are combined with AND + assert.Contains(t, celExpression, " && ") } func TestHasDRAClaim(t *testing.T) { @@ -352,8 +232,8 @@ func TestDRAProcessor_LazyConfigLoading(t *testing.T) { ObjectMeta: metav1.ObjectMeta{Name: "template1"}, Spec: tfv1.SchedulingConfigTemplateSpec{ DRA: &tfv1.DRAConfig{ - Enable: &[]bool{true}[0], - ResourceClass: "test.ai/gpu", + Enable: &[]bool{true}[0], + ResourceClaimTemplateName: "test-gpu-template", }, }, }, From ff9efd2244b1550ebc2e5364b430a867332606ea Mon Sep 17 00:00:00 2001 From: dylan Date: Wed, 24 Sep 2025 07:48:13 -0700 Subject: [PATCH 31/34] support resource claim cel builder --- internal/constants/constants.go | 2 - internal/gpuallocator/gpuallocator.go | 2 +- internal/webhook/v1/pod_dra.go | 24 +++- internal/webhook/v1/pod_webhook_dra_test.go | 146 +++++++++++++++++--- 4 files changed, 147 insertions(+), 27 deletions(-) diff --git a/internal/constants/constants.go b/internal/constants/constants.go index 9a44345c..2d8eae1a 100644 --- a/internal/constants/constants.go +++ b/internal/constants/constants.go @@ -74,8 +74,6 @@ const ( // In remote vGPU mode, selected workload is set by user with /workload annotation or generated by system SelectedWorkloadAnnotation = Domain + "/selected-workload" - CELFilterExpressionAnnotation = Domain + "/cel-filter-expression" - WorkloadModeAnnotation = Domain + "/workload-mode" WorkloadModeDynamic = "dynamic" WorkloadModeFixed = "fixed" diff --git a/internal/gpuallocator/gpuallocator.go b/internal/gpuallocator/gpuallocator.go index 44deb3c4..2e8e54fd 100644 --- a/internal/gpuallocator/gpuallocator.go +++ b/internal/gpuallocator/gpuallocator.go @@ -1355,7 +1355,7 @@ func (s *GpuAllocator) ComposeAllocationRequest(pod *v1.Pod) (*tfv1.AllocRequest Limit: gpuLimitResource, DisableCELFilter: disableCELFilter, - CELFilterExpression: pod.Annotations[constants.CELFilterExpressionAnnotation], + CELFilterExpression: pod.Annotations[constants.DRACelExpressionAnnotation], Count: uint(count), GPUModel: pod.Annotations[constants.GPUModelAnnotation], diff --git a/internal/webhook/v1/pod_dra.go b/internal/webhook/v1/pod_dra.go index 87f73154..cd1b7c8c 100644 --- a/internal/webhook/v1/pod_dra.go +++ b/internal/webhook/v1/pod_dra.go @@ -136,7 +136,7 @@ func (p *DRAProcessor) HandleDRAAdmission(ctx context.Context, pod *corev1.Pod, return nil } -// TODO: support more attributes for filtering +// BuildCELSelector constructs a CEL expression for DRA device selection based on TensorFusion requirements func 
BuildCELSelector(pod *corev1.Pod, tfInfo *utils.TensorFusionInfo) (string, error) { var conditions []string @@ -154,6 +154,28 @@ func BuildCELSelector(pod *corev1.Pod, tfInfo *utils.TensorFusionInfo) (string, conditions = append(conditions, fmt.Sprintf(`device.attributes["model"] == "%s"`, tfInfo.Profile.GPUModel)) } + // 3. GPU count requirement (important for multi-GPU workloads) + if tfInfo.Profile.GPUCount > 0 { + conditions = append(conditions, fmt.Sprintf(`int(device.attributes["gpu_count"]) >= %d`, tfInfo.Profile.GPUCount)) + } + + // 4. Pool name filter (for resource isolation and scheduling preferences) + if tfInfo.Profile.PoolName != "" { + conditions = append(conditions, fmt.Sprintf(`device.attributes["pool_name"] == "%s"`, tfInfo.Profile.PoolName)) + } + + // 5. Workload name filter (for workload-specific device assignment) + if tfInfo.WorkloadName != "" { + conditions = append(conditions, fmt.Sprintf(`device.attributes["workload_name"] == "%s"`, tfInfo.WorkloadName)) + // Workload namespace is same as pod namespace in TensorFusion + conditions = append(conditions, fmt.Sprintf(`device.attributes["workload_namespace"] == "%s"`, pod.Namespace)) + } + + // 6. Pod namespace filter (for namespace-based device isolation) + if pod.Namespace != "" { + conditions = append(conditions, fmt.Sprintf(`device.attributes["pod_namespace"] == "%s"`, pod.Namespace)) + } + // Return a basic condition if no specific requirements if len(conditions) == 0 { // Simple condition that should work with most DRA drivers diff --git a/internal/webhook/v1/pod_webhook_dra_test.go b/internal/webhook/v1/pod_webhook_dra_test.go index fd625cc9..e6fce827 100644 --- a/internal/webhook/v1/pod_webhook_dra_test.go +++ b/internal/webhook/v1/pod_webhook_dra_test.go @@ -149,37 +149,137 @@ func TestDRAProcessor_HandleDRAAdmission(t *testing.T) { } func TestBuildCELSelector(t *testing.T) { - pod := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pod", - Namespace: "test-namespace", + tests := []struct { + name string + pod *corev1.Pod + tfInfo *utils.TensorFusionInfo + expectedConditions []string + unexpectedConditions []string + }{ + { + name: "Basic resource filters", + pod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + Namespace: "test-namespace", + }, + }, + tfInfo: &utils.TensorFusionInfo{ + Profile: &tfv1.WorkloadProfileSpec{ + GPUCount: 2, + Resources: tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("20"), + Vram: resource.MustParse("16Gi"), + }, + }, + GPUModel: "H100", + }, + }, + expectedConditions: []string{ + `device.attributes["tflops"].quantity >= quantity("20")`, + `device.attributes["vram"].quantity >= quantity("16Gi")`, + `device.attributes["model"] == "H100"`, + `int(device.attributes["gpu_count"]) >= 2`, + `device.attributes["pod_namespace"] == "test-namespace"`, + }, }, - } - - tfInfo := &utils.TensorFusionInfo{ - Profile: &tfv1.WorkloadProfileSpec{ - GPUCount: 2, - Resources: tfv1.Resources{ - Requests: tfv1.Resource{ - Tflops: resource.MustParse("20"), - Vram: resource.MustParse("16Gi"), + { + name: "All filters including pool and workload", + pod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + Namespace: "production", + }, + }, + tfInfo: &utils.TensorFusionInfo{ + Profile: &tfv1.WorkloadProfileSpec{ + GPUCount: 1, + Resources: tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("10"), + Vram: resource.MustParse("8Gi"), + }, + }, + GPUModel: "A100", + PoolName: "high-priority", + }, + WorkloadName: 
"ml-training-job", + }, + expectedConditions: []string{ + `device.attributes["tflops"].quantity >= quantity("10")`, + `device.attributes["vram"].quantity >= quantity("8Gi")`, + `device.attributes["model"] == "A100"`, + `int(device.attributes["gpu_count"]) >= 1`, + `device.attributes["pool_name"] == "high-priority"`, + `device.attributes["workload_name"] == "ml-training-job"`, + `device.attributes["workload_namespace"] == "production"`, + `device.attributes["pod_namespace"] == "production"`, + }, + }, + { + name: "Zero resources fallback to default condition", + pod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + Namespace: "default", + }, + }, + tfInfo: &utils.TensorFusionInfo{ + Profile: &tfv1.WorkloadProfileSpec{ + GPUCount: 0, // Zero count should not add condition + Resources: tfv1.Resources{ + Requests: tfv1.Resource{ + // Zero resources + }, + }, + }, + }, + expectedConditions: []string{ + `device.attributes["pod_namespace"] == "default"`, + }, + }, + { + name: "Empty resources fallback to basic condition", + pod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + Namespace: "", + }, + }, + tfInfo: &utils.TensorFusionInfo{ + Profile: &tfv1.WorkloadProfileSpec{ + // All empty/zero values }, }, - GPUModel: "H100", + expectedConditions: []string{ + `device.attributes.exists("type")`, + }, }, } - celExpression, err := BuildCELSelector(pod, tfInfo) - require.NoError(t, err) - require.NotEmpty(t, celExpression) + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + celExpression, err := BuildCELSelector(tt.pod, tt.tfInfo) + require.NoError(t, err) + require.NotEmpty(t, celExpression) + + // Verify expected conditions are present + for _, condition := range tt.expectedConditions { + assert.Contains(t, celExpression, condition, "Expected condition not found: %s", condition) + } - // Verify it contains the expected resource filters - assert.Contains(t, celExpression, `device.attributes["tflops"].quantity >= quantity("20")`) - assert.Contains(t, celExpression, `device.attributes["vram"].quantity >= quantity("16Gi")`) - assert.Contains(t, celExpression, `device.attributes["model"] == "H100"`) + // Verify unexpected conditions are not present + for _, condition := range tt.unexpectedConditions { + assert.NotContains(t, celExpression, condition, "Unexpected condition found: %s", condition) + } - // Verify conditions are combined with AND - assert.Contains(t, celExpression, " && ") + // Verify proper AND joining (unless it's the fallback condition) + if len(tt.expectedConditions) > 1 { + assert.Contains(t, celExpression, " && ", "Conditions should be joined with &&") + } + }) + } } func TestHasDRAClaim(t *testing.T) { From 1afc62de324099fd876d7554152dd4c8e07e2736 Mon Sep 17 00:00:00 2001 From: dylan Date: Sun, 28 Sep 2025 05:24:38 -0700 Subject: [PATCH 32/34] fix conflict for gpuresources.go --- internal/scheduler/gpuresources/gpuresources.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/internal/scheduler/gpuresources/gpuresources.go b/internal/scheduler/gpuresources/gpuresources.go index 23ba873b..bc893087 100644 --- a/internal/scheduler/gpuresources/gpuresources.go +++ b/internal/scheduler/gpuresources/gpuresources.go @@ -117,7 +117,7 @@ func (s *GPUFit) PreFilter(ctx context.Context, state fwk.CycleState, pod *v1.Po // Check if DRA mode is enabled for this pod if isDRAEnabled(pod) && hasDRAClaim(pod) { - return nil, framework.NewStatus(framework.Skip, "DRA mode enabled, skipping custom GPU prefilter") + 
return nil, fwk.NewStatus(fwk.Skip, "DRA mode enabled, skipping custom GPU prefilter") } // Skip non tensor-fusion mode @@ -294,7 +294,7 @@ func (s *GPUFit) RemovePod(ctx context.Context, state fwk.CycleState, pod *v1.Po func (s *GPUFit) Filter(ctx context.Context, state fwk.CycleState, pod *v1.Pod, nodeInfo fwk.NodeInfo) *fwk.Status { // Check if DRA mode is enabled for this pod if isDRAEnabled(pod) && hasDRAClaim(pod) { - return framework.NewStatus(framework.Skip, "DRA mode enabled, skipping custom GPU filter") + return fwk.NewStatus(fwk.Skip, "DRA mode enabled, skipping custom GPU filter") } if !utils.IsTensorFusionWorker(pod) { @@ -338,8 +338,8 @@ func (s *GPUFit) Score( ctx context.Context, state fwk.CycleState, pod *v1.Pod, - nodeInfo *framework.NodeInfo, -) (int64, *framework.Status) { + nodeInfo fwk.NodeInfo, +) (int64, *fwk.Status) { // Skip non tensor-fusion mode scheduling if !utils.IsTensorFusionWorker(pod) { return 0, fwk.NewStatus(fwk.Success, "") @@ -377,7 +377,7 @@ func (s *GPUFit) ScoreExtensions() framework.ScoreExtensions { return nil } -func (s *GPUFit) Reserve(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) *framework.Status { +func (s *GPUFit) Reserve(ctx context.Context, state fwk.CycleState, pod *v1.Pod, nodeName string) *fwk.Status { if !utils.IsTensorFusionWorker(pod) { return fwk.NewStatus(fwk.Success, "skip for non tensor-fusion mode") } @@ -423,7 +423,7 @@ func (s *GPUFit) Reserve(ctx context.Context, state *framework.CycleState, pod * return fwk.NewStatus(fwk.Success, "") } -func (s *GPUFit) Unreserve(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) { +func (s *GPUFit) Unreserve(ctx context.Context, state fwk.CycleState, pod *v1.Pod, nodeName string) { if !utils.IsTensorFusionWorker(pod) { return } @@ -442,7 +442,7 @@ func (s *GPUFit) Unreserve(ctx context.Context, state *framework.CycleState, pod }, schedulingResult.FinalGPUs, pod.ObjectMeta) } -func (s *GPUFit) PostBind(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) { +func (s *GPUFit) PostBind(ctx context.Context, state fwk.CycleState, pod *v1.Pod, nodeName string) { if !utils.IsTensorFusionWorker(pod) { return } From efbce3fcf00949e6c8363b2e054bc1adcc0e0495 Mon Sep 17 00:00:00 2001 From: dylan Date: Sun, 28 Sep 2025 08:17:33 -0700 Subject: [PATCH 33/34] 1. support resource slice build and destory 2. 
make resource slice build and dra request build in the same logic --- cmd/main.go | 25 +- internal/constants/constants.go | 4 + .../dra/resourceclaim_controller.go | 58 +++-- .../dra/resourceclaim_controller_test.go | 13 +- .../dra/resourceslice_controller.go | 216 ++++++++++++++++++ internal/webhook/v1/pod_dra.go | 26 +-- 6 files changed, 287 insertions(+), 55 deletions(-) create mode 100644 internal/controller/dra/resourceslice_controller.go diff --git a/cmd/main.go b/cmd/main.go index a50ce745..b0ec36e7 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -67,25 +67,8 @@ import ( "github.com/NexusGPU/tensor-fusion/internal/utils" "github.com/NexusGPU/tensor-fusion/internal/version" webhookcorev1 "github.com/NexusGPU/tensor-fusion/internal/webhook/v1" - "k8s.io/apimachinery/pkg/runtime" - utilruntime "k8s.io/apimachinery/pkg/util/runtime" k8sVer "k8s.io/apimachinery/pkg/util/version" "k8s.io/apiserver/pkg/util/feature" - "k8s.io/client-go/kubernetes" - clientgoscheme "k8s.io/client-go/kubernetes/scheme" - _ "k8s.io/client-go/plugin/pkg/client/auth" - "k8s.io/client-go/rest" - "k8s.io/klog/v2" - "k8s.io/kubernetes/cmd/kube-scheduler/app" - "k8s.io/kubernetes/pkg/scheduler" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/healthz" - "sigs.k8s.io/controller-runtime/pkg/manager" - "sigs.k8s.io/controller-runtime/pkg/metrics/filters" - metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" - "sigs.k8s.io/controller-runtime/pkg/webhook" - "sigs.k8s.io/yaml" // +kubebuilder:scaffold:imports ) @@ -437,6 +420,14 @@ func startCustomResourceController( setupLog.Error(err, "unable to create controller", "controller", "ResourceClaim") os.Exit(1) } + // Setup ResourceSlice controller for DRA Phase 2 + if err = (&dra.ResourceSliceReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "ResourceSlice") + os.Exit(1) + } if err = (&controller.NodeReconciler{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), diff --git a/internal/constants/constants.go b/internal/constants/constants.go index 67a3dde6..77648769 100644 --- a/internal/constants/constants.go +++ b/internal/constants/constants.go @@ -140,6 +140,10 @@ const ( // ResourceClaimTemplate related constants DRAResourceClaimTemplateName = "tensor-fusion-gpu-template" + + // ResourceSlice related constants + DRAResourceSliceName = "tensor-fusion-resource-slice-%s" + DRAResourceSlicePool = "tensor-fusion-resource-slice-pool-%s" ) // for avoid golang lint issues diff --git a/internal/controller/dra/resourceclaim_controller.go b/internal/controller/dra/resourceclaim_controller.go index 679fb8cb..6d18b234 100644 --- a/internal/controller/dra/resourceclaim_controller.go +++ b/internal/controller/dra/resourceclaim_controller.go @@ -20,8 +20,9 @@ import ( "context" "fmt" - resourcev1beta2 "k8s.io/api/resource/v1beta2" + "github.com/NexusGPU/tensor-fusion/internal/utils" corev1 "k8s.io/api/core/v1" + resourcev1beta2 "k8s.io/api/resource/v1beta2" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -88,20 +89,23 @@ func (r *ResourceClaimReconciler) Reconcile(ctx context.Context, req ctrl.Reques return ctrl.Result{RequeueAfter: constants.PendingRequeueDuration}, nil } - // Get CEL expression from Pod annotation - celExpression := ownerPod.Annotations[constants.DRACelExpressionAnnotation] - if 
celExpression == "" { - log.Info("No CEL expression found in Pod annotation", "pod", ownerPod.Name) - return ctrl.Result{}, nil - } - // Update ResourceClaim with CEL expression - if err := r.updateResourceClaimCEL(ctx, resourceClaim, celExpression); err != nil { + if err := r.updateResourceClaimCEL(resourceClaim, ownerPod); err != nil { log.Error(err, "Failed to update ResourceClaim CEL expression") return ctrl.Result{}, err } + // Update ResourceClaim with capacity request + if err := r.updateCapacityRequest(resourceClaim, ownerPod); err != nil { + log.Error(err, "Failed to update ResourceClaim capacity request") + return ctrl.Result{}, err + } + + if err := r.Update(ctx, resourceClaim); err != nil { + log.Error(err, "Failed to update ResourceClaim") + return ctrl.Result{}, err + } - log.Info("Successfully updated ResourceClaim with CEL expression", "cel", celExpression) + log.Info("Successfully updated ResourceClaim") return ctrl.Result{}, nil } @@ -142,7 +146,7 @@ func (r *ResourceClaimReconciler) findOwnerPod(ctx context.Context, resourceClai } // updateResourceClaimCEL updates the ResourceClaim's CEL selector expression -func (r *ResourceClaimReconciler) updateResourceClaimCEL(ctx context.Context, resourceClaim *resourcev1beta2.ResourceClaim, celExpression string) error { +func (r *ResourceClaimReconciler) updateResourceClaimCEL(resourceClaim *resourcev1beta2.ResourceClaim, pod *corev1.Pod) error { // Check if we need to update if len(resourceClaim.Spec.Devices.Requests) == 0 { return fmt.Errorf("no device requests found in ResourceClaim") @@ -153,6 +157,13 @@ func (r *ResourceClaimReconciler) updateResourceClaimCEL(ctx context.Context, re return fmt.Errorf("no ExactDeviceRequest found") } + // Get CEL expression from Pod annotation + celExpression := pod.Annotations[constants.DRACelExpressionAnnotation] + + if celExpression == "" { + return nil + } + // Check if CEL expression is already set correctly if len(deviceReq.Exactly.Selectors) > 0 && deviceReq.Exactly.Selectors[0].CEL != nil && @@ -172,8 +183,27 @@ func (r *ResourceClaimReconciler) updateResourceClaimCEL(ctx context.Context, re deviceReq.Exactly.Selectors[0].CEL.Expression = celExpression - // Update the ResourceClaim - return r.Update(ctx, resourceClaim) + return nil +} + +func (r *ResourceClaimReconciler) updateCapacityRequest(resourceClaim *resourcev1beta2.ResourceClaim, pod *corev1.Pod) error { + if len(resourceClaim.Spec.Devices.Requests) == 0 { + return fmt.Errorf("no device requests found in ResourceClaim") + } + + deviceReq := &resourceClaim.Spec.Devices.Requests[0] + if deviceReq.Exactly == nil { + return fmt.Errorf("no ExactDeviceRequest found") + } + gpuRequestResource, err := utils.GetGPUResource(pod, true) + if err != nil { + return fmt.Errorf("failed to get GPU resource: %w", err) + } + //TODO extract to constants + deviceReq.Exactly.Capacity.Requests["tflops"] = gpuRequestResource.Tflops + deviceReq.Exactly.Capacity.Requests["vram"] = gpuRequestResource.Vram + + return nil } // SetupWithManager sets up the controller with the Manager. @@ -181,4 +211,4 @@ func (r *ResourceClaimReconciler) SetupWithManager(mgr ctrl.Manager) error { return ctrl.NewControllerManagedBy(mgr). For(&resourcev1beta2.ResourceClaim{}). 
Complete(r) -} \ No newline at end of file +} diff --git a/internal/controller/dra/resourceclaim_controller_test.go b/internal/controller/dra/resourceclaim_controller_test.go index ece541f9..aeebbda7 100644 --- a/internal/controller/dra/resourceclaim_controller_test.go +++ b/internal/controller/dra/resourceclaim_controller_test.go @@ -31,7 +31,7 @@ func TestResourceClaimReconciler_Reconcile(t *testing.T) { expectUpdate bool }{ { - name: "ResourceClaim not found", + name: "ResourceClaim not found", expectedResult: ctrl.Result{}, expectError: false, }, @@ -535,7 +535,14 @@ func TestResourceClaimReconciler_updateResourceClaimCEL(t *testing.T) { Scheme: scheme, } - err := reconciler.updateResourceClaimCEL(context.Background(), tt.resourceClaim, tt.celExpression) + mockPod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + constants.DRACelExpressionAnnotation: tt.celExpression, + }, + }, + } + err := reconciler.updateResourceClaimCEL(tt.resourceClaim, mockPod) if tt.expectError { require.Error(t, err) @@ -554,4 +561,4 @@ func TestResourceClaimReconciler_updateResourceClaimCEL(t *testing.T) { } }) } -} \ No newline at end of file +} diff --git a/internal/controller/dra/resourceslice_controller.go b/internal/controller/dra/resourceslice_controller.go new file mode 100644 index 00000000..64a00ce8 --- /dev/null +++ b/internal/controller/dra/resourceslice_controller.go @@ -0,0 +1,216 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package dra + +import ( + "context" + "fmt" + + resourcev1beta2 "k8s.io/api/resource/v1beta2" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/constants" +) + +// ResourceSliceReconciler reconciles ResourceSlice objects based on GPUNode and GPU changes +type ResourceSliceReconciler struct { + client.Client + Scheme *runtime.Scheme +} + +//+kubebuilder:rbac:groups=resource.k8s.io,resources=resourceslices,verbs=get;list;watch;create;update;patch;delete +//+kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpunodes,verbs=get;list;watch +//+kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpus,verbs=get;list;watch +//+kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpupools,verbs=get;list;watch + +// Reconcile processes GPUNode changes and generates/updates corresponding ResourceSlices +func (r *ResourceSliceReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + log := log.FromContext(ctx) + log.Info("Reconciling ResourceSlice for GPUNode", "name", req.Name) + + // Fetch the GPUNode + gpuNode := &tfv1.GPUNode{} + if err := r.Get(ctx, req.NamespacedName, gpuNode); err != nil { + if errors.IsNotFound(err) { + // GPUNode was deleted, clean up associated ResourceSlice + return r.cleanupResourceSlice(ctx, req.Name) + } + log.Error(err, "Failed to get GPUNode") + return ctrl.Result{}, err + } + + // If GPUNode is being deleted, clean up ResourceSlice + if !gpuNode.DeletionTimestamp.IsZero() { + return r.cleanupResourceSlice(ctx, gpuNode.Name) + } + // Get all GPUs owned by this node + gpuList := &tfv1.GPUList{} + if err := r.List(ctx, gpuList, client.MatchingLabels{constants.LabelKeyOwner: gpuNode.Name}); err != nil { + log.Error(err, "Failed to list GPUs for node") + return ctrl.Result{}, err + } + + // Skip if no GPUs discovered yet + if len(gpuList.Items) == 0 { + log.Info("No GPUs discovered for node yet, skipping ResourceSlice generation") + return ctrl.Result{}, nil + } + + // Generate/update ResourceSlice for this node + if err := r.reconcileResourceSlice(ctx, gpuNode, gpuList.Items); err != nil { + log.Error(err, "Failed to reconcile ResourceSlice") + return ctrl.Result{}, err + } + + return ctrl.Result{}, nil +} + +// reconcileResourceSlice creates or updates the ResourceSlice for a GPUNode +func (r *ResourceSliceReconciler) reconcileResourceSlice(ctx context.Context, gpuNode *tfv1.GPUNode, gpus []tfv1.GPU) error { + log := log.FromContext(ctx) + + resourceSliceName := fmt.Sprintf(constants.DRAResourceSliceName, gpuNode.Name) + resourceSlice := &resourcev1beta2.ResourceSlice{ + ObjectMeta: metav1.ObjectMeta{ + Name: resourceSliceName, + }, + } + + _, err := controllerutil.CreateOrUpdate(ctx, r.Client, resourceSlice, func() error { + // Set basic spec fields + resourceSlice.Spec.Driver = constants.DRADriverName + resourceSlice.Spec.NodeName = &gpuNode.Name + resourceSlice.Spec.Pool = resourcev1beta2.ResourcePool{ + Name: gpuNode.Labels[constants.GpuPoolKey], + Generation: gpuNode.Generation, + ResourceSliceCount: 1, + } + + // Generate devices list + devices, 
err := r.generateDevices(ctx, gpus) + if err != nil { + return fmt.Errorf("failed to generate devices: %w", err) + } + resourceSlice.Spec.Devices = devices + + // Set labels for easy identification + if resourceSlice.Labels == nil { + resourceSlice.Labels = make(map[string]string) + } + resourceSlice.Labels[constants.LabelKeyOwner] = gpuNode.Name + return nil + }) + + if err != nil { + return fmt.Errorf("failed to create or update ResourceSlice: %w", err) + } + + log.Info("Successfully reconciled ResourceSlice", "resourceSlice", resourceSliceName) + return nil +} + +// generateDevices creates the device list for ResourceSlice based on physical GPUs +func (r *ResourceSliceReconciler) generateDevices(_ context.Context, gpus []tfv1.GPU) ([]resourcev1beta2.Device, error) { + devices := make([]resourcev1beta2.Device, 0, len(gpus)) + + // Calculate virtual capacities for proportional allocation + + for _, gpu := range gpus { + if gpu.Status.Capacity == nil { + continue + } + //TODO extract to constants + poolName := gpu.Labels[constants.GpuPoolKey] + device := resourcev1beta2.Device{ + Name: gpu.Status.UUID, + Attributes: map[resourcev1beta2.QualifiedName]resourcev1beta2.DeviceAttribute{ + "model": { + StringValue: &gpu.Status.GPUModel, + }, + "pool_name": { + StringValue: &poolName, + }, + "pod_namespace": { + StringValue: &gpu.Namespace, + }, + }, + Capacity: map[resourcev1beta2.QualifiedName]resourcev1beta2.DeviceCapacity{ + "tflops": { + Value: gpu.Status.Capacity.Tflops, + }, + "vram": { + Value: gpu.Status.Capacity.Vram, + }, + }, + AllowMultipleAllocations: func() *bool { b := true; return &b }(), + } + + devices = append(devices, device) + } + + return devices, nil +} + +// cleanupResourceSlice removes the ResourceSlice associated with a deleted GPUNode +func (r *ResourceSliceReconciler) cleanupResourceSlice(ctx context.Context, nodeName string) (ctrl.Result, error) { + log := log.FromContext(ctx) + + resourceSliceName := fmt.Sprintf(constants.DRAResourceSliceName, nodeName) + resourceSlice := &resourcev1beta2.ResourceSlice{ + ObjectMeta: metav1.ObjectMeta{ + Name: resourceSliceName, + }, + } + + err := r.Delete(ctx, resourceSlice) + if err != nil && !errors.IsNotFound(err) { + log.Error(err, "Failed to delete ResourceSlice", "name", resourceSliceName) + return ctrl.Result{}, err + } + + log.Info("Successfully cleaned up ResourceSlice", "name", resourceSliceName) + return ctrl.Result{}, nil +} + +// SetupWithManager sets up the controller with the Manager +func (r *ResourceSliceReconciler) SetupWithManager(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + For(&tfv1.GPUNode{}). + Watches(&tfv1.GPU{}, handler.EnqueueRequestsFromMapFunc( + func(ctx context.Context, obj client.Object) []reconcile.Request { + // Get the owner GPUNode name from GPU labels + if labels := obj.GetLabels(); labels != nil { + if nodeName, ok := labels[constants.LabelKeyOwner]; ok { + return []reconcile.Request{ + {NamespacedName: types.NamespacedName{Name: nodeName}}, + } + } + } + return nil + })). + Complete(r) +} diff --git a/internal/webhook/v1/pod_dra.go b/internal/webhook/v1/pod_dra.go index cd1b7c8c..6a55fc4f 100644 --- a/internal/webhook/v1/pod_dra.go +++ b/internal/webhook/v1/pod_dra.go @@ -140,38 +140,22 @@ func (p *DRAProcessor) HandleDRAAdmission(ctx context.Context, pod *corev1.Pod, func BuildCELSelector(pod *corev1.Pod, tfInfo *utils.TensorFusionInfo) (string, error) { var conditions []string - // 1. 
Basic resource requirements using standard DRA quantity attributes - requests := tfInfo.Profile.Resources.Requests - if !requests.Tflops.IsZero() { - conditions = append(conditions, fmt.Sprintf(`device.attributes["tflops"].quantity >= quantity("%s")`, requests.Tflops.String())) - } - if !requests.Vram.IsZero() { - conditions = append(conditions, fmt.Sprintf(`device.attributes["vram"].quantity >= quantity("%s")`, requests.Vram.String())) - } - - // 2. GPU model filter (if specified - basic attribute that should be widely supported) + // 1. GPU model filter (if specified - basic attribute that should be widely supported) if tfInfo.Profile.GPUModel != "" { conditions = append(conditions, fmt.Sprintf(`device.attributes["model"] == "%s"`, tfInfo.Profile.GPUModel)) } - // 3. GPU count requirement (important for multi-GPU workloads) + // 2. GPU count requirement (important for multi-GPU workloads) if tfInfo.Profile.GPUCount > 0 { - conditions = append(conditions, fmt.Sprintf(`int(device.attributes["gpu_count"]) >= %d`, tfInfo.Profile.GPUCount)) + conditions = append(conditions, fmt.Sprintf(`size(devices) >= %d`, tfInfo.Profile.GPUCount)) } - // 4. Pool name filter (for resource isolation and scheduling preferences) + // 3. Pool name filter (for resource isolation and scheduling preferences) if tfInfo.Profile.PoolName != "" { conditions = append(conditions, fmt.Sprintf(`device.attributes["pool_name"] == "%s"`, tfInfo.Profile.PoolName)) } - // 5. Workload name filter (for workload-specific device assignment) - if tfInfo.WorkloadName != "" { - conditions = append(conditions, fmt.Sprintf(`device.attributes["workload_name"] == "%s"`, tfInfo.WorkloadName)) - // Workload namespace is same as pod namespace in TensorFusion - conditions = append(conditions, fmt.Sprintf(`device.attributes["workload_namespace"] == "%s"`, pod.Namespace)) - } - - // 6. Pod namespace filter (for namespace-based device isolation) + // 4. 
Pod namespace filter (for namespace-based device isolation) if pod.Namespace != "" { conditions = append(conditions, fmt.Sprintf(`device.attributes["pod_namespace"] == "%s"`, pod.Namespace)) } From 7d95fef8e230e4dad255dead35a07674bf8f8d9a Mon Sep 17 00:00:00 2001 From: dylan Date: Sat, 4 Oct 2025 07:47:12 -0700 Subject: [PATCH 34/34] feat: Added DRA CEL filter support - Implemented DRA CEL filters in GPU allocation requests - Added benchmarks for basic and complex expressions - Updated the resource slice controller to support Kubernetes hostname labels --- .../dra/resourceslice_controller.go | 2 + .../cel_filter/cel_filter_benchmark_test.go | 41 ++++ .../filter/cel_filter/dra_cel_filter.go | 216 ++++++++++++++++++ 3 files changed, 259 insertions(+) create mode 100644 internal/gpuallocator/filter/cel_filter/dra_cel_filter.go diff --git a/internal/controller/dra/resourceslice_controller.go b/internal/controller/dra/resourceslice_controller.go index 64a00ce8..fbd03f6f 100644 --- a/internal/controller/dra/resourceslice_controller.go +++ b/internal/controller/dra/resourceslice_controller.go @@ -122,6 +122,7 @@ func (r *ResourceSliceReconciler) reconcileResourceSlice(ctx context.Context, gp resourceSlice.Labels = make(map[string]string) } resourceSlice.Labels[constants.LabelKeyOwner] = gpuNode.Name + resourceSlice.Labels[constants.KubernetesHostNameLabel] = gpuNode.Name return nil }) @@ -144,6 +145,7 @@ func (r *ResourceSliceReconciler) generateDevices(_ context.Context, gpus []tfv1 continue } //TODO extract to constants + //TODO quota support poolName := gpu.Labels[constants.GpuPoolKey] device := resourcev1beta2.Device{ Name: gpu.Status.UUID, diff --git a/internal/gpuallocator/filter/cel_filter/cel_filter_benchmark_test.go b/internal/gpuallocator/filter/cel_filter/cel_filter_benchmark_test.go index 39fcd907..254baf7c 100644 --- a/internal/gpuallocator/filter/cel_filter/cel_filter_benchmark_test.go +++ b/internal/gpuallocator/filter/cel_filter/cel_filter_benchmark_test.go @@ -9,6 +9,7 @@ import ( tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" "github.com/NexusGPU/tensor-fusion/internal/constants" "github.com/NexusGPU/tensor-fusion/internal/gpuallocator/filter" + dracel "k8s.io/dynamic-resource-allocation/cel" ) // Test constants for repeated strings @@ -147,6 +148,46 @@ func BenchmarkFilterPerformance(b *testing.B) { } }) + // Benchmark DRA CEL filter - basic filtering + b.Run("DRACELFilter_Basic", func(b *testing.B) { + request := createTestAllocRequest("A100", "") + cache := dracel.NewCache(100, dracel.Features{}) + + draFilter, err := NewDRACELFilter(request, cache) + if err != nil { + b.Fatal(err) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + filteredGPUs, err := draFilter.Filter(ctx, workerPodKey, gpus) + if err != nil { + b.Fatal(err) + } + _ = filteredGPUs + } + }) + + // Benchmark DRA CEL filter - complex expression + b.Run("DRACELFilter_Complex", func(b *testing.B) { + request := createTestAllocRequest("", "device.attributes['model'].string == 'A100' && device.attributes['label.environment'].string == '"+testEnvironmentProduction+"'") + cache := dracel.NewCache(100, dracel.Features{}) + + draFilter, err := NewDRACELFilter(request, cache) + if err != nil { + b.Fatal(err) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + filteredGPUs, err := draFilter.Filter(ctx, workerPodKey, gpus) + if err != nil { + b.Fatal(err) + } + _ = filteredGPUs + } + }) + // Print performance comparison report after benchmarks printPerformanceComparison(b) } diff --git 
a/internal/gpuallocator/filter/cel_filter/dra_cel_filter.go b/internal/gpuallocator/filter/cel_filter/dra_cel_filter.go new file mode 100644 index 00000000..83b73c93 --- /dev/null +++ b/internal/gpuallocator/filter/cel_filter/dra_cel_filter.go @@ -0,0 +1,216 @@ +package cel_filter + +import ( + "context" + "encoding/json" + "fmt" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/constants" + "github.com/NexusGPU/tensor-fusion/internal/utils" + "github.com/samber/lo" + resourceapi "k8s.io/api/resource/v1" + dracel "k8s.io/dynamic-resource-allocation/cel" + "sigs.k8s.io/controller-runtime/pkg/log" +) + +// DRACELFilter implements CEL filtering using k8s.io/dynamic-resource-allocation/cel +type DRACELFilter struct { + name string + requiredPhases []tfv1.TensorFusionGPUPhase + userExpression string + cache *dracel.Cache + displayExpression string +} + +// NewDRACELFilter creates a new DRA-based CEL filter from allocation request +func NewDRACELFilter(req *tfv1.AllocRequest, cache *dracel.Cache) (*DRACELFilter, error) { + // Extract early filtering criteria + var requiredPhases []tfv1.TensorFusionGPUPhase + var userExpression, displayExpression string + + if req != nil { + requiredPhases = []tfv1.TensorFusionGPUPhase{ + tfv1.TensorFusionGPUPhaseRunning, + tfv1.TensorFusionGPUPhasePending, + } + userExpression = req.CELFilterExpression + displayExpression = buildDisplayExpression(req) + } + + // Handle nil request case + name := "AllocRequest-unknown" + if req != nil { + name = fmt.Sprintf("AllocRequest-%s", req.WorkloadNameNamespace.String()) + } + + // Validate expression if provided + if userExpression != "" && cache != nil { + result := cache.Check(userExpression) + if result.Error != nil { + return nil, fmt.Errorf("failed to compile CEL expression %q: %w", userExpression, result.Error) + } + } + + return &DRACELFilter{ + name: name, + requiredPhases: requiredPhases, + userExpression: userExpression, + cache: cache, + displayExpression: displayExpression, + }, nil +} + +// Name returns the filter name +func (f *DRACELFilter) Name() string { + return f.name +} + +// Filter applies the CEL expression to filter GPUs +func (f *DRACELFilter) Filter(ctx context.Context, workerPodKey tfv1.NameNamespace, gpus []*tfv1.GPU) ([]*tfv1.GPU, error) { + log := log.FromContext(ctx) + if len(gpus) == 0 { + return gpus, nil + } + + // Early filtering phase: apply basic filters first + earlyFilteredGPUs := make([]*tfv1.GPU, 0, len(gpus)) + for _, gpu := range gpus { + // Progressive migration mode check + if utils.IsProgressiveMigration() && gpu.Status.UsedBy != tfv1.UsedByTensorFusion { + continue + } + + // Fast path: check phase first (most common filter) + if f.requiredPhases != nil && !lo.Contains(f.requiredPhases, gpu.Status.Phase) { + continue + } + + earlyFilteredGPUs = append(earlyFilteredGPUs, gpu) + } + + // If no user expression, return early filtered results + if f.userExpression == "" { + log.V(1).Info("DRA CEL filter applied (early filtering only)", + "filter", f.name, + "inputGPUs", len(gpus), + "outputGPUs", len(earlyFilteredGPUs)) + return earlyFilteredGPUs, nil + } + + // If no GPUs passed early filtering, return empty result + if len(earlyFilteredGPUs) == 0 { + return earlyFilteredGPUs, nil + } + + // Get compiled expression from cache + compiledExpr := f.cache.GetOrCompile(f.userExpression) + if compiledExpr.Error != nil { + return nil, fmt.Errorf("failed to compile CEL expression %q: %w", f.userExpression, compiledExpr.Error) + } + + // 
Apply CEL filtering using DRA + filteredGPUs := make([]*tfv1.GPU, 0, len(earlyFilteredGPUs)) + for _, gpu := range earlyFilteredGPUs { + // Convert GPU to DRA Device + device, err := convertGPUToDevice(gpu) + if err != nil { + log.Error(err, "Failed to convert GPU to Device", "gpu", gpu.Name) + continue + } + + // Evaluate CEL expression + matches, details, err := compiledExpr.DeviceMatches(ctx, device) + if err != nil { + log.Error(err, "CEL expression evaluation failed", + "expression", f.userExpression, + "gpu", gpu.Name, + "details", details) + // On error, exclude the GPU (fail-safe) + continue + } + + if matches { + filteredGPUs = append(filteredGPUs, gpu) + } + } + + log.V(1).Info("DRA CEL filter applied", + "filter", f.name, + "displayExpression", f.displayExpression, + "userExpression", f.userExpression, + "inputGPUs", len(gpus), + "earlyFilteredGPUs", len(earlyFilteredGPUs), + "outputGPUs", len(filteredGPUs)) + + return filteredGPUs, nil +} + +// convertGPUToDevice converts tfv1.GPU to dracel.Device +func convertGPUToDevice(gpu *tfv1.GPU) (dracel.Device, error) { + if gpu == nil { + return dracel.Device{}, fmt.Errorf("GPU is nil") + } + + allowMultiple := true + device := dracel.Device{ + Driver: constants.DRADriverName, + AllowMultipleAllocations: &allowMultiple, + Attributes: make(map[resourceapi.QualifiedName]resourceapi.DeviceAttribute), + Capacity: make(map[resourceapi.QualifiedName]resourceapi.DeviceCapacity), + } + + // Map basic attributes + device.Attributes[GPUFieldName] = resourceapi.DeviceAttribute{StringValue: &gpu.Name} + device.Attributes[GPUFieldNamespace] = resourceapi.DeviceAttribute{StringValue: &gpu.Namespace} + model := gpu.Status.GPUModel + device.Attributes[GPUFieldGPUModel] = resourceapi.DeviceAttribute{StringValue: &model} + uuid := gpu.Status.UUID + device.Attributes[GPUFieldUUID] = resourceapi.DeviceAttribute{StringValue: &uuid} + usedBy := string(gpu.Status.UsedBy) + device.Attributes[GPUFieldUsedBy] = resourceapi.DeviceAttribute{StringValue: &usedBy} + message := gpu.Status.Message + device.Attributes[GPUFieldMessage] = resourceapi.DeviceAttribute{StringValue: &message} + + // Map labels with prefix + if len(gpu.Labels) > 0 { + for k, v := range gpu.Labels { + labelValue := v + device.Attributes[resourceapi.QualifiedName(fmt.Sprintf("%s.%s", GPUFieldLabels, k))] = resourceapi.DeviceAttribute{StringValue: &labelValue} + } + } + + // Map annotations with prefix + if len(gpu.Annotations) > 0 { + for k, v := range gpu.Annotations { + annotationValue := v + device.Attributes[resourceapi.QualifiedName(fmt.Sprintf("%s.%s", GPUFieldAnnotations, k))] = resourceapi.DeviceAttribute{StringValue: &annotationValue} + } + } + + // Map nodeSelector with prefix + if len(gpu.Status.NodeSelector) > 0 { + for k, v := range gpu.Status.NodeSelector { + selectorValue := v + device.Attributes[resourceapi.QualifiedName(fmt.Sprintf("%s.%s", GPUFieldNodeSelector, k))] = resourceapi.DeviceAttribute{StringValue: &selectorValue} + } + } + + // Map runningApps as JSON string + if len(gpu.Status.RunningApps) > 0 { + appsJSON, err := json.Marshal(gpu.Status.RunningApps) + if err != nil { + return dracel.Device{}, fmt.Errorf("failed to marshal runningApps: %w", err) + } + appsStr := string(appsJSON) + device.Attributes[GPUFieldRunningApps] = resourceapi.DeviceAttribute{StringValue: &appsStr} + } + + // Map capacity (tflops and vram) - DRA experimental version maintains capacity state + if gpu.Status.Capacity != nil { + device.Capacity[ResourceFieldTFlops] = 
resourceapi.DeviceCapacity{Value: gpu.Status.Capacity.Tflops} + device.Capacity[ResourceFieldVRAM] = resourceapi.DeviceCapacity{Value: gpu.Status.Capacity.Vram} + } + + return device, nil +}
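
For reference, a minimal usage sketch of the DRA-backed CEL filter introduced in this series, following the same setup as the benchmark above. The `AllocRequest` literal, the zero-value worker key, and the attribute name `model` in the expression are illustrative assumptions drawn from the test fixtures; only `dracel.NewCache`, `NewDRACELFilter`, and `Filter` are defined by the patch itself.

```go
// Sketch only: exercising the DRA-backed CEL filter in isolation, under the
// same assumptions as the benchmark (the "model" attribute mirrors the test
// fixtures and the ResourceSlice attributes built by the controller).
package main

import (
	"context"
	"fmt"

	tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
	celfilter "github.com/NexusGPU/tensor-fusion/internal/gpuallocator/filter/cel_filter"
	dracel "k8s.io/dynamic-resource-allocation/cel"
)

func main() {
	ctx := context.Background()

	// Compiled-expression cache shared across allocation requests,
	// sized as in the benchmark (100 entries, default feature set).
	cache := dracel.NewCache(100, dracel.Features{})

	// Hypothetical allocation request; in the real flow the expression comes
	// from the pod's DRACelExpressionAnnotation via ComposeAllocationRequest.
	req := &tfv1.AllocRequest{
		CELFilterExpression: `device.attributes['model'].string == 'A100'`,
	}

	f, err := celfilter.NewDRACELFilter(req, cache)
	if err != nil {
		panic(err) // invalid CEL expressions are rejected at construction time
	}

	// Candidate GPUs would normally come from the allocator's in-memory store;
	// a zero-value worker key is enough for this sketch.
	var candidates []*tfv1.GPU
	selected, err := f.Filter(ctx, tfv1.NameNamespace{}, candidates)
	if err != nil {
		panic(err)
	}
	fmt.Printf("CEL filter kept %d of %d candidate GPUs\n", len(selected), len(candidates))
}
```

In the allocator itself the same pieces are wired through `ComposeAllocationRequest`, which now reads the expression from `constants.DRACelExpressionAnnotation` rather than a separate CEL-filter annotation.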