9 changes: 9 additions & 0 deletions Makefile
@@ -302,6 +302,15 @@ install-prometheus:
uninstall-prometheus:
	kubectl delete -k config/prometheus

.PHONY: install-keda
install-keda:
	helm repo add kedacore https://kedacore.github.io/charts
	helm install keda kedacore/keda --namespace keda --create-namespace

.PHONY: uninstall-keda
uninstall-keda:
	helm uninstall keda -n keda

##@ Release

.PHONY: artifacts
119 changes: 119 additions & 0 deletions docs/examples/serverless/README.md
@@ -0,0 +1,119 @@
# Serverless Configuration and Documentation

## Overview

This document explains how to configure a serverless environment on Kubernetes, integrating Prometheus for monitoring and KEDA for autoscaling. The goal is efficient resource utilization: workloads scale down to zero when idle and back up on demand.

## Concepts

### Prometheus Configuration

Prometheus handles monitoring and alerting. To let a ServiceMonitor discover targets across namespaces, set `namespaceSelector` in its spec; on the Prometheus side, `serviceMonitorSelector` controls which ServiceMonitors the instance picks up.

```yaml
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: qwen2-0--5b-lb-monitor
  namespace: llmaz-system
  labels:
    control-plane: controller-manager
    app.kubernetes.io/name: servicemonitor
spec:
  namespaceSelector:
    any: true
  selector:
    matchLabels:
      llmaz.io/model-name: qwen2-0--5b
  endpoints:
    - port: http
      path: /metrics
      scheme: http
```

- Set `namespaceSelector` to allow cross-namespace monitoring (`any: true` above).
- Label your Services so they match the ServiceMonitor's `selector`; here the load-balancing Service carries `llmaz.io/model-name: qwen2-0--5b`.
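
If Prometheus is managed by the Prometheus Operator, the instance's `serviceMonitorSelector` must also match the labels on the ServiceMonitor above. A minimal sketch, assuming an operator-managed Prometheus in `llmaz-system` (the resource name `prometheus` is illustrative):

```yaml
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  name: prometheus
  namespace: llmaz-system
spec:
  # Select ServiceMonitors carrying this label...
  serviceMonitorSelector:
    matchLabels:
      app.kubernetes.io/name: servicemonitor
  # ...in any namespace (an empty selector matches all namespaces).
  serviceMonitorNamespaceSelector: {}
```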

### KEDA Configuration

KEDA (Kubernetes Event-driven Autoscaling) scales workloads based on external metrics. Integrated with Prometheus, it drives the Playground's replica count, including scaling to and from zero.

```yaml
apiVersion: keda.sh/v1alpha1
kind: ScaledObject
metadata:
  name: qwen2-0--5b-scaler
  namespace: default
spec:
  scaleTargetRef:
    apiVersion: inference.llmaz.io/v1alpha1
    kind: Playground
    name: qwen2-0--5b
  pollingInterval: 30
  cooldownPeriod: 50
  minReplicaCount: 0
  maxReplicaCount: 3
  triggers:
    - type: prometheus
      metadata:
        serverAddress: http://prometheus-operated.llmaz-system.svc.cluster.local:9090
        metricName: llamacpp:requests_processing
        query: sum(llamacpp:requests_processing)
        threshold: "0.2"
```

- Ensure that `serverAddress` points to the correct Prometheus service; here it is the `prometheus-operated` Service created by the operator in `llmaz-system`.
- Tune `pollingInterval` and `cooldownPeriod` to balance responsiveness against flapping, and to avoid conflicts with other scaling mechanisms targeting the same workload.
- KEDA computes the desired replica count roughly as `ceil(query result / threshold)`, so with a threshold of `"0.2"` a single in-flight request is enough to scale up from zero.

### Integration with Activator

Consider pairing this configuration with an activator for scale-from-zero scenarios: while replicas are zero, the activator holds incoming requests, triggers a scale-up, and forwards the requests once a Pod is ready. It can be implemented using a controller pattern or as a standalone goroutine.
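
A rough sketch of the standalone approach is below. This is not the activator shipped in `kube-system`; the backend URL and the `scaleUpIfNeeded` helper are assumptions for illustration.

```go
// Hypothetical scale-from-zero activator: hold the request, ensure at
// least one replica exists, then reverse-proxy to the backend Service.
package main

import (
	"context"
	"net/http"
	"net/http/httputil"
	"net/url"
)

// scaleUpIfNeeded is a stand-in: a real implementation would patch the
// Playground's replica count through the Kubernetes API and poll the
// Endpoints of the -lb Service until an address becomes ready.
func scaleUpIfNeeded(ctx context.Context) error {
	return nil
}

func main() {
	backend, err := url.Parse("http://qwen2-0--5b-lb.default.svc:8080") // assumed backend
	if err != nil {
		panic(err)
	}
	proxy := httputil.NewSingleHostReverseProxy(backend)

	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		// Hold the request while the workload scales up from zero.
		if err := scaleUpIfNeeded(r.Context()); err != nil {
			http.Error(w, err.Error(), http.StatusServiceUnavailable)
			return
		}
		proxy.ServeHTTP(w, r) // forward once a replica is available
	})
	_ = http.ListenAndServe(":8080", nil)
}
```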

### Controller Runtime Framework

The Controller Runtime framework simplifies building Kubernetes controllers by providing abstractions for managing resources and handling events. A minimal skeleton follows the component list below.

#### Key Components

1. **Controller**: Monitors resource states and triggers actions to align actual and desired states.
2. **Reconcile Function**: Core logic for transitioning resource states.
3. **Manager**: Manages the lifecycle of controllers and shared resources.
4. **Client**: Interface for interacting with the Kubernetes API.
5. **Scheme**: Registry for resource types.
6. **Event Source and Handler**: Define event sources and handling logic.

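A minimal skeleton wiring these components together (illustrative only; it watches plain Pods rather than a llmaz resource):

```go
package main

import (
	"context"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/runtime"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/log"
)

type PodReconciler struct {
	client.Client          // Client: reads and writes objects via the API server
	Scheme *runtime.Scheme // Scheme: registry of known resource types
}

// Reconcile holds the core logic that drives actual state toward desired state.
func (r *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	logger := log.FromContext(ctx)

	var pod corev1.Pod
	if err := r.Get(ctx, req.NamespacedName, &pod); err != nil {
		// The object may have been deleted after the event was queued.
		return ctrl.Result{}, client.IgnoreNotFound(err)
	}
	logger.Info("observed pod", "phase", pod.Status.Phase)
	return ctrl.Result{}, nil
}

func main() {
	// Manager: owns controller lifecycle, caches, and shared dependencies.
	mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{})
	if err != nil {
		panic(err)
	}
	// For() wires the event source and handler: watch Pods and enqueue a
	// Request per changed object.
	if err := ctrl.NewControllerManagedBy(mgr).
		For(&corev1.Pod{}).
		Complete(&PodReconciler{Client: mgr.GetClient(), Scheme: mgr.GetScheme()}); err != nil {
		panic(err)
	}
	if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
		panic(err)
	}
}
```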

## Quick Start Guide

1. Install llmaz following the official [Install Guide](https://llmaz.inftyai.com/docs/getting-started/installation/), then install KEDA and Prometheus with the provided Makefile targets.

```bash
helm install llmaz oci://registry-1.docker.io/inftyai/llmaz --namespace llmaz-system --create-namespace --version 0.0.10
make install-keda
make install-prometheus
```

2. Create a ServiceMonitor for Prometheus to discover your services.
```bash
kubectl apply -f service-monitor.yaml
```

3. Create a ScaledObject for KEDA to manage scaling.
```bash
kubectl apply -f scaled-object.yaml
```
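
Under the hood, KEDA creates and manages an HPA for each ScaledObject, so you can confirm the setup took effect:

```bash
kubectl get hpa -n default
```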

4. Test a cold start by sending a request to the scaled-to-zero model through the activator.
```bash
kubectl exec -it -n kube-system deploy/activator -- wget -O- qwen2-0--5b-lb.default.svc:8080
```

5. Monitor metrics and scaling activity through the Prometheus web UI by port-forwarding the service:
```bash
kubectl port-forward services/prometheus-operated 9090:9090 --address 0.0.0.0 -n llmaz-system
```
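
If scaling does not behave as expected, check the KEDA operator logs (the deployment name below assumes the default Helm chart install):

```bash
kubectl logs -n keda deploy/keda-operator
```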

## Conclusion

This guide walked through a serverless setup on Kubernetes: Prometheus collects serving metrics via a ServiceMonitor, KEDA scales the Playground via a ScaledObject, and an activator covers scale-from-zero. Together these pieces provide demand-driven scaling and monitoring for your applications.
76 changes: 76 additions & 0 deletions docs/examples/serverless/basic.yaml
@@ -0,0 +1,76 @@
apiVersion: llmaz.io/v1alpha1
kind: OpenModel
metadata:
  name: qwen2-0--5b
spec:
  familyName: qwen2
  source:
    modelHub:
      modelID: Qwen/Qwen2-0.5B-Instruct-GGUF
      filename: qwen2-0_5b-instruct-q5_k_m.gguf
---
apiVersion: inference.llmaz.io/v1alpha1
kind: Playground
metadata:
  name: qwen2-0--5b
spec:
  replicas: 0
  modelClaim:
    modelName: qwen2-0--5b
  backendRuntimeConfig:
    backendName: llamacpp
    configName: default
    args:
      - -fa # use flash attention
---
apiVersion: gateway.networking.k8s.io/v1
kind: GatewayClass
metadata:
  name: default-envoy-ai-gateway
spec:
  controllerName: gateway.envoyproxy.io/gatewayclass-controller
---
apiVersion: gateway.networking.k8s.io/v1
kind: Gateway
metadata:
  name: default-envoy-ai-gateway
spec:
  gatewayClassName: default-envoy-ai-gateway
  listeners:
    - name: http
      protocol: HTTP
      port: 80
---
apiVersion: aigateway.envoyproxy.io/v1alpha1
kind: AIGatewayRoute
metadata:
  name: default-envoy-ai-gateway
spec:
  schema:
    name: OpenAI
  targetRefs:
    - name: default-envoy-ai-gateway
      kind: Gateway
      group: gateway.networking.k8s.io
  rules:
    - matches:
        - headers:
            - type: Exact
              name: x-ai-eg-model
              value: qwen2-0--5b
      backendRefs:
        - name: qwen2-0--5b
---
apiVersion: aigateway.envoyproxy.io/v1alpha1
kind: AIServiceBackend
metadata:
  name: qwen2-0--5b
spec:
  timeouts:
    request: 3m
  schema:
    name: OpenAI
  backendRef:
    name: qwen2-0--5b-lb
    kind: Service
    port: 8080
21 changes: 21 additions & 0 deletions docs/examples/serverless/scaled-object.yaml
@@ -0,0 +1,21 @@
apiVersion: keda.sh/v1alpha1
kind: ScaledObject
metadata:
  name: qwen2-0--5b-scaler
  namespace: default
spec:
  scaleTargetRef:
    apiVersion: inference.llmaz.io/v1alpha1
    kind: Playground
    name: qwen2-0--5b
  pollingInterval: 30
  cooldownPeriod: 50
  minReplicaCount: 0
  maxReplicaCount: 3
  triggers:
    - type: prometheus
      metadata:
        serverAddress: http://prometheus-operated.llmaz-system.svc.cluster.local:9090
        metricName: llamacpp:requests_processing
        query: sum(llamacpp:requests_processing)
        threshold: "0.2"
18 changes: 18 additions & 0 deletions docs/examples/serverless/service-monitor.yaml
@@ -0,0 +1,18 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: qwen2-0--5b-lb-monitor
  namespace: llmaz-system
  labels:
    control-plane: controller-manager
    app.kubernetes.io/name: servicemonitor
spec:
  namespaceSelector:
    any: true
  selector:
    matchLabels:
      llmaz.io/model-name: qwen2-0--5b
  endpoints:
    - port: http
      path: /metrics
      scheme: http
5 changes: 3 additions & 2 deletions pkg/controller/inference/service_controller.go
@@ -131,7 +131,7 @@ func (r *ServiceReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
	}

	// Create a service for the leader pods of the lws for loadbalancing.
-	if err := CreateServiceIfNotExists(ctx, r.Client, r.Scheme, service); err != nil {
+	if err := CreateServiceIfNotExists(ctx, r.Client, r.Scheme, service, models); err != nil {
		return ctrl.Result{}, err
	}

@@ -419,7 +419,7 @@ func setControllerReferenceForWorkload(owner metav1.Object, lws *applyconfigurat
	return nil
}

-func CreateServiceIfNotExists(ctx context.Context, k8sClient client.Client, Scheme *runtime.Scheme, service *inferenceapi.Service) error {
+func CreateServiceIfNotExists(ctx context.Context, k8sClient client.Client, Scheme *runtime.Scheme, service *inferenceapi.Service, model []*coreapi.OpenModel) error {
	log := ctrl.LoggerFrom(ctx)
	// The load balancing service name.
	svcName := service.Name + "-lb"
@@ -433,6 +433,7 @@ func CreateServiceIfNotExists(ctx context.Context, k8sClient client.Client, Sche
		ObjectMeta: metav1.ObjectMeta{
			Name:      svcName,
			Namespace: service.Namespace,
+			Labels:    modelLabels(model[0]),
		},
		Spec: corev1.ServiceSpec{
			Ports: []corev1.ServicePort{