From f7506017ced54cae1b8c41705acc4aea82304693 Mon Sep 17 00:00:00 2001 From: mnmehta <30246802+mnmehta@users.noreply.github.com> Date: Thu, 5 Jun 2025 23:09:57 -0700 Subject: [PATCH 1/5] For quick validation use 1st decode pod if there are multiple pods (#305) Signed-off-by: mnmehta <30246802+mnmehta@users.noreply.github.com> updated Signed-off-by: rshaw@neuralmagic.com updated Signed-off-by: rshaw@neuralmagic.com --- .../sample-application/modelservice.yaml | 4 +- charts/llm-d/values.yaml | 36 +- .../examples/rob-benchmarking/2P1D-het.yaml | 697 ++++++++++++++++++ quickstart/examples/rob-benchmarking/Justfile | 50 ++ .../examples/rob-benchmarking/Justfile.remote | 36 + .../benchmark-interactive-pod.yaml | 32 + quickstart/test-request.sh | 2 +- 7 files changed, 842 insertions(+), 15 deletions(-) create mode 100644 quickstart/examples/rob-benchmarking/2P1D-het.yaml create mode 100644 quickstart/examples/rob-benchmarking/Justfile create mode 100644 quickstart/examples/rob-benchmarking/Justfile.remote create mode 100644 quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml diff --git a/charts/llm-d/templates/sample-application/modelservice.yaml b/charts/llm-d/templates/sample-application/modelservice.yaml index 6ba5c22..efa35d6 100644 --- a/charts/llm-d/templates/sample-application/modelservice.yaml +++ b/charts/llm-d/templates/sample-application/modelservice.yaml @@ -30,7 +30,7 @@ spec: {{- range .Values.sampleApplication.decode.extraArgs }} - {{ include "common.tplvalues.render" ( dict "value" . "context" $) | quote }} {{- end }} - resources: {{ .Values.sampleApplication.resources | toYaml | nindent 8 }} + resources: {{ .Values.sampleApplication.decode.resources | toYaml | nindent 8 }} env: {{- if eq (include "sampleApplication.modelArtifactType" . ) "hf" }} - name: HF_TOKEN @@ -49,7 +49,7 @@ spec: {{- range .Values.sampleApplication.prefill.extraArgs }} - {{ include "common.tplvalues.render" ( dict "value" . "context" $) | quote }} {{- end }} - resources: {{ .Values.sampleApplication.resources | toYaml | nindent 8 }} + resources: {{ .Values.sampleApplication.decode.resources | toYaml | nindent 8 }} env: {{- if eq (include "sampleApplication.modelArtifactType" . ) "hf" }} - name: HF_TOKEN diff --git a/charts/llm-d/values.yaml b/charts/llm-d/values.yaml index 0d9e000..d0aa57a 100644 --- a/charts/llm-d/values.yaml +++ b/charts/llm-d/values.yaml @@ -125,22 +125,22 @@ sampleApplication: # -- Key within the secret under which the token is located key: HF_TOKEN - # @schema - # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.ResourceRequirements - # @schema - # -- Modify resource limits/requests available to the pods - # -- Resource requests/limits - #
Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container - resources: - limits: - nvidia.com/gpu: "1" - requests: - nvidia.com/gpu: "1" - # -- InferencePool port configuration inferencePoolPort: 8000 prefill: + # @schema + # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.ResourceRequirements + # @schema + # -- Modify resource limits/requests available to the pods + # -- Resource requests/limits + #
Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container + resources: + limits: + nvidia.com/gpu: "1" + requests: + nvidia.com/gpu: "1" + # -- number of desired prefill replicas replicas: 1 @@ -152,6 +152,18 @@ sampleApplication: extraArgs: [] decode: + # @schema + # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.ResourceRequirements + # @schema + # -- Modify resource limits/requests available to the pods + # -- Resource requests/limits + #
Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container + resources: + limits: + nvidia.com/gpu: "1" + requests: + nvidia.com/gpu: "1" + # -- number of desired decode replicas replicas: 1 diff --git a/quickstart/examples/rob-benchmarking/2P1D-het.yaml b/quickstart/examples/rob-benchmarking/2P1D-het.yaml new file mode 100644 index 0000000..2dfc7b4 --- /dev/null +++ b/quickstart/examples/rob-benchmarking/2P1D-het.yaml @@ -0,0 +1,697 @@ +# yaml-language-server: $schema=values.schema.json + +# Default values for the llm-d chart. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. + +# -- Global parameters +# Global Docker image parameters +# Please, note that this will override the image parameters, including dependencies, configured to use the global value +# Current available global Docker image parameters: imageRegistry, imagePullSecrets and storageClass +# @default -- See below +global: + # -- Global Docker image registry + imageRegistry: "" + + # @schema + # items: + # type: string + # @schema + # -- Global Docker registry secret names as an array + #
E.g. `imagePullSecrets: [myRegistryKeySecretName]` + imagePullSecrets: [] + + security: + allowInsecureImages: true + +# @schema +# additionalProperties: true +# @schema +# -- Parameters for bitnami.common dependency +common: {} + +# -- Common parameters +# -- Override Kubernetes version +kubeVersion: "" + +# -- String to partially override common.names.fullname +nameOverride: "" + +# -- String to fully override common.names.fullname +fullnameOverride: "" + +# -- Default Kubernetes cluster domain +clusterDomain: cluster.local + +# @schema +# additionalProperties: true +# @schema +# -- Labels to add to all deployed objects +commonLabels: {} + +# @schema +# additionalProperties: true +# @schema +# -- Annotations to add to all deployed objects +commonAnnotations: {} + +# @schema +# items: +# type: [string, object] +# @schema +# -- Array of extra objects to deploy with the release +extraDeploy: [] + +# -- Helm tests +test: + # -- Enable rendering of helm test resources + enabled: false + + # @default -- See below + image: + # -- Test connection pod image registry + registry: quay.io + + # -- Test connection pod image repository. Note that the image needs to have both the `sh` and `curl` binaries in it. + repository: curl/curl + + # -- Test connection pod image tag. Note that the image needs to have both the `sh` and `curl` binaries in it. + tag: latest + + # -- Specify a imagePullPolicy + imagePullPolicy: "Always" + + # @schema + # items: + # type: string + # @schema + # -- Optionally specify an array of imagePullSecrets (evaluated as templates) + pullSecrets: [] + +# -- Sample application deploying a p-d pair of specific model +# @default -- See below +sampleApplication: + baseConfigMapRefName: basic-gpu-with-nixl-preset + + # -- Enable rendering of sample application resources + enabled: true + + model: + # -- Fully qualified pvc URI: pvc:/// + modelArtifactURI: hf://RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic + + # -- Name of the model + modelName: "RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic" + + # -- Aliases to the Model named vllm will serve with + servedModelNames: [] + + auth: + # -- HF token auth config via k8s secret. + hfToken: + # -- Name of the secret to create to store your huggingface token + name: llm-d-hf-token + # -- Value of the token. 
Do not set this but use `envsubst` in conjunction with the helm chart + key: HF_TOKEN + + # -- InferencePool port configuration + inferencePoolPort: 8000 + + prefill: + # -- number of desired prefill replicas + replicas: 2 + + # @schema + # items: + # type: string + # @schema + # -- args to add to the prefill deployment + extraArgs: + - "--tensor-parallel-size" + - "1" + - "--disable-log-requests" + - "--max-model-len" + - "32768" + - "--distributed-executor-backend" + - "mp" + - "--block-size" + - "128" + - "--max-num-batched-tokens" + - "32768" + + decode: + # -- number of desired decode replicas + replicas: 1 + + # @schema + # items: + # type: string + # @schema + # -- args to add to the decode deployment + extraArgs: + - "--tensor-parallel-size" + - "4" + - "--disable-log-requests" + - "--max-model-len" + - "32768" + - "--distributed-executor-backend" + - "mp" + - "--block-size" + - "128" + +# -- Gateway configuration +# @default -- See below +gateway: + # -- Deploy resources related to Gateway + enabled: true + + # -- String to fully override gateway.fullname + fullnameOverride: "" + + # -- String to partially override gateway.fullname + nameOverride: "" + + # -- Gateway class that determines the backend used + # Currently supported values: "kgateway" or "istio" + gatewayClassName: kgateway + + # @schema + # additionalProperties: true + # @schema + # -- Additional annotations provided to the Gateway resource + annotations: {} + + # Special parameters applied to kGateway via GatewayParameters resource + kGatewayParameters: + # @schema + # type: [number, boolean] + # @schema + proxyUID: false + + # @schema + # items: + # type: object + # properties: + # name: + # description: Name is the name of the Listener. This name MUST be unique within a Gateway + # type: string + # path: + # description: Path to expose via Ingress + # type: string + # port: + # description: Port is the network port. Multiple listeners may use the same port, subject to the Listener compatibility rules + # type: integer + # minimum: 1 + # maximum: 65535 + # protocol: + # description: Protocol specifies the network protocol this listener expects to receive + # type: string + # @schema + # Set of listeners exposed via the Gateway, also propagated to the Ingress if enabled + listeners: + - name: default + path: / + port: 80 + protocol: HTTP + + # -- Gateway's service type. Ingress is only available if the service type is set to NodePort. Accepted values: ["LoadBalancer", "NodePort"] + serviceType: NodePort + +# -- Ingress configuration +# @default -- See below +ingress: + # -- Deploy Ingress + enabled: true + + # -- Name of the IngressClass cluster resource which defines which controller will implement the resource (e.g nginx) + ingressClassName: "" + + # @schema + # additionalProperties: true + # @schema + # -- Additional annotations for the Ingress resource + annotations: {} + + # -- Hostname to be used to expose the NodePort service to the inferencing gateway + host: "" + + # -- List of additional hostnames to be covered with this ingress record (e.g. 
a CNAME) + # + extraHosts: [] + + # -- Path to be used to expose the full route to access the inferencing gateway + path: "/" + + # -- Ingress TLS parameters + tls: + # -- Enable TLS configuration for the host defined at `ingress.host` parameter + enabled: false + + # -- The name to which the TLS Secret will be called + secretName: "" + + # @schema + # items: + # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.networking.v1.IngressTLS + # @schema + # -- The TLS configuration for additional hostnames to be covered with this ingress record. + #
Ref: https://kubernetes.io/docs/concepts/services-networking/ingress/#tls + # + extraTls: [] + + # -- used as part of the host dirivation if not specified from OCP cluster domain (dont edit) + clusterRouterBase: "" + +# -- Model service controller configuration +# @default -- See below +modelservice: + # -- Toggle to deploy modelservice controller related resources + enabled: true + + # -- Enable metrics gathering via podMonitor / ServiceMonitor + metrics: + # -- Enable metrics scraping from prefill and decode services, see `model + enabled: true + + # -- Prometheus ServiceMonitor configuration + #
Ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api-reference/api.md + # @default -- See below + serviceMonitor: + # @schema + # additionalProperties: true + # @schema + # -- Additional annotations provided to the ServiceMonitor + annotations: {} + + # @schema + # additionalProperties: true + # @schema + # -- Additional labels provided to the ServiceMonitor + labels: {} + + # -- ServiceMonitor endpoint port + port: "vllm" + + # -- ServiceMonitor endpoint path + path: "/metrics" + + # -- ServiceMonitor endpoint interval at which metrics should be scraped + interval: "15s" + + # -- ServiceMonitor namespace selector + namespaceSelector: + any: false + + # @schema + # items: + # type: string + # @schema + matchNames: [] + + # -- ServiceMonitor selector matchLabels + #
matchLabels must match labels on modelservice Services + selector: + # @schema + # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.apimachinery.pkg.apis.meta.v1.LabelSelector + # @schema + matchLabels: {} + + # -- String to fully override modelservice.fullname + fullnameOverride: "" + + # -- String to partially override modelservice.fullname + nameOverride: "" + + # -- Number of controller replicas + replicas: 1 + + # -- Modelservice controller image, please change only if appropriate adjustments to the CRD are being made + # @default -- See below + image: + # -- Model Service controller image registry + registry: ghcr.io + + # -- Model Service controller image repository + repository: llm-d/llm-d-model-service + + # -- Model Service controller image tag + tag: "0.0.10" + + # -- Specify a imagePullPolicy + imagePullPolicy: "Always" + + # @schema + # items: + # type: string + # @schema + # -- Optionally specify an array of imagePullSecrets (evaluated as templates) + pullSecrets: [] + + # -- Endpoint picker configuration + # @default -- See below + epp: + # -- Endpoint picker image used in ModelService CR presets + # @default -- See below + image: + # -- Endpoint picker image registry + registry: ghcr.io + + # -- Endpoint picker image repository + repository: llm-d/llm-d-inference-scheduler + + # -- Endpoint picker image tag + tag: 0.0.2 + + # -- Specify a imagePullPolicy + imagePullPolicy: "Always" + + # @schema + # items: + # type: string + # @schema + # -- Optionally specify an array of imagePullSecrets (evaluated as templates) + pullSecrets: [] + + # -- Enable metrics gathering via podMonitor / ServiceMonitor + metrics: + # -- Enable metrics scraping from endpoint picker service + enabled: true + + # -- Prometheus ServiceMonitor configuration + #
Ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api-reference/api.md + # @default -- See below + serviceMonitor: + # @schema + # additionalProperties: true + # @schema + # -- Additional annotations provided to the ServiceMonitor + annotations: {} + + # @schema + # additionalProperties: true + # @schema + # -- Additional labels provided to the ServiceMonitor + labels: {} + + # -- ServiceMonitor endpoint port + port: "metrics" + + # -- ServiceMonitor endpoint path + path: "/metrics" + + # -- ServiceMonitor endpoint interval at which metrics should be scraped + interval: "10s" + + # -- ServiceMonitor namespace selector + namespaceSelector: + any: false + + # @schema + # items: + # type: string + # @schema + matchNames: [] + + # -- ServiceMonitor selector matchLabels + #
matchLabels must match labels on modelservice Services + selector: + # @schema + # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.apimachinery.pkg.apis.meta.v1.LabelSelector + # @schema + matchLabels: {} + + # -- Default environment variables for endpoint picker, use `extraEnvVars` to override default behavior by defining the same variable again. + # Ref: https://github.com/llm-d/llm-d-inference-scheduler/blob/main/docs/architecture.md#scorers--configuration + defaultEnvVars: + - name: ENABLE_KVCACHE_AWARE_SCORER + value: "false" + - name: KVCACHE_AWARE_SCORER_WEIGHT + value: "1" + - name: KVCACHE_INDEXER_REDIS_ADDR + value: '{{ if .Values.redis.enabled }}{{ include "redis.master.service.fullurl" . }}{{ end }}' + - name: ENABLE_PREFIX_AWARE_SCORER + value: "true" + - name: PREFIX_AWARE_SCORER_WEIGHT + value: "2" + - name: ENABLE_LOAD_AWARE_SCORER + value: "true" + - name: LOAD_AWARE_SCORER_WEIGHT + value: "1" + - name: ENABLE_SESSION_AWARE_SCORER + value: "false" + - name: SESSION_AWARE_SCORER_WEIGHT + value: "1" + - name: PD_ENABLED + value: "true" + - name: PD_PROMPT_LEN_THRESHOLD + value: "512" + - name: PREFILL_ENABLE_KVCACHE_AWARE_SCORER + value: "false" + - name: PREFILL_KVCACHE_AWARE_SCORER_WEIGHT + value: "1" + - name: PREFILL_ENABLE_LOAD_AWARE_SCORER + value: "true" + - name: PREFILL_LOAD_AWARE_SCORER_WEIGHT + value: "1" + - name: PREFILL_ENABLE_PREFIX_AWARE_SCORER + value: "true" + - name: PREFILL_PREFIX_AWARE_SCORER_WEIGHT + value: "2" + - name: PREFILL_ENABLE_SESSION_AWARE_SCORER + value: "false" + - name: PREFILL_SESSION_AWARE_SCORER_WEIGHT + value: "1" + - name: DECODE_ENABLE_KVCACHE_AWARE_SCORER + value: "false" + - name: DECODE_KVCACHE_AWARE_SCORER_WEIGHT + value: "1" + - name: DECODE_ENABLE_LOAD_AWARE_SCORER + value: "true" + - name: DECODE_LOAD_AWARE_SCORER_WEIGHT + value: "1" + - name: DECODE_ENABLE_PREFIX_AWARE_SCORER + value: "true" + - name: DECODE_PREFIX_AWARE_SCORER_WEIGHT + value: "2" + - name: DECODE_ENABLE_SESSION_AWARE_SCORER + value: "false" + - name: DECODE_SESSION_AWARE_SCORER_WEIGHT + value: "1" + + # @schema + # items: + # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.EnvVar + # @schema + # -- Additional environment variables for endpoint picker + defaultEnvVarsOverride: [] + + # -- Prefill options + # @default -- See below + prefill: + # @schema + # items: + # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.Toleration + # @schema + # -- Tolerations configuration to deploy prefill pods to tainted nodes + # @default -- See below + tolerations: + # -- default NVIDIA GPU toleration + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + + # -- Decode options + # @default -- See below + decode: + # @schema + # items: + # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.Toleration + # @schema + # -- Tolerations configuration to deploy decode pods to tainted nodes + # @default -- See below + tolerations: + # -- default NVIDIA GPU toleration + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + + # -- vLLM container options + # @default -- See below + vllm: + # -- vLLM image used in ModelService CR presets + # @default -- See below + image: + # -- llm-d image registry + registry: ghcr.io + + # -- llm-d image repository 
+ repository: llm-d/llm-d-dev + + # -- llm-d image tag + tag: 0.0.10 + + # -- Specify a imagePullPolicy + imagePullPolicy: "IfNotPresent" + + # @schema + # items: + # type: string + # @schema + # -- Optionally specify an array of imagePullSecrets (evaluated as templates) + pullSecrets: [] + + # -- Enable metrics gathering via podMonitor / ServiceMonitor + metrics: + # -- Enable metrics scraping from prefill & decode services + enabled: true + + # -- Routing proxy container options + # @default -- See below + routingProxy: + # -- Routing proxy image used in ModelService CR presets + image: + # -- Routing proxy image registry + registry: ghcr.io + + # -- Routing proxy image repository + repository: llm-d/llm-d-routing-sidecar + + # -- Routing proxy image tag + tag: "0.0.6" + + # -- Specify a imagePullPolicy + imagePullPolicy: "IfNotPresent" + + # @schema + # items: + # type: string + # @schema + # -- Optionally specify an array of imagePullSecrets (evaluated as templates) + pullSecrets: [] + + # -- llm-d inference simulator container options + # @default -- See below + inferenceSimulator: + # -- llm-d inference simulator image used in ModelService CR presets + # @default -- See below + image: + # -- llm-d inference simulator image registry + registry: ghcr.io + + # -- llm-d inference simulator image repository + repository: llm-d/llm-d-inference-sim + + # -- llm-d inference simulator image tag + tag: "0.0.4" + + # -- Specify a imagePullPolicy + imagePullPolicy: "IfNotPresent" + + # @schema + # items: + # type: string + # @schema + # -- Optionally specify an array of imagePullSecrets (evaluated as templates) + pullSecrets: [] + + # @schema + # additionalProperties: true + # @schema + # -- Annotations to add to all modelservice resources + annotations: {} + + # @schema + # additionalProperties: true + # @schema + # -- Pod annotations for modelservice + podAnnotations: {} + + # @schema + # additionalProperties: true + # @schema + # -- Pod labels for modelservice + podLabels: {} + + # Model service controller settings + service: + # -- Toggle to deploy a Service resource for Model service controller + enabled: true + + # -- Port number exposed from Model Service controller + port: 8443 + + # -- Service type + type: ClusterIP + + # -- Service Account Configuration + # @default -- See below + serviceAccount: + # -- Enable the creation of a ServiceAccount for Modelservice pods + create: true + + # -- String to fully override modelservice.serviceAccountName, defaults to modelservice.fullname + fullnameOverride: "" + + # -- String to partially override modelservice.serviceAccountName, defaults to modelservice.fullname + nameOverride: "" + + # @schema + # additionalProperties: true + # @schema + # -- Additional custom labels to the service ServiceAccount. + labels: {} + + # @schema + # additionalProperties: true + # @schema + # -- Additional custom annotations for the ServiceAccount. 
+ annotations: {} + + rbac: + # -- Enable the creation of RBAC resources + create: true + +# @schema +# $ref: https://raw.githubusercontent.com/bitnami/charts/refs/tags/redis/20.13.4/bitnami/redis/values.schema.json +# @schema +# -- Bitnami/Redis chart configuration +# @default -- Use sane defaults for minimal Redis deployment +redis: + enabled: false + auth: + enabled: false + existingSecretPasswordKey: "" + existingSecret: "" + architecture: standalone + image: + registry: quay.io + repository: sclorg/redis-7-c9s + tag: c9s + master: + kind: Deployment + resources: + limits: + memory: "256Mi" + cpu: "250m" + requests: + memory: "128Mi" + cpu: "100m" + persistence: + enabled: true + size: "5Gi" + pdb: + create: false + service: + ports: + redis: 8100 + networkPolicy: + enabled: false diff --git a/quickstart/examples/rob-benchmarking/Justfile b/quickstart/examples/rob-benchmarking/Justfile new file mode 100644 index 0000000..209e44f --- /dev/null +++ b/quickstart/examples/rob-benchmarking/Justfile @@ -0,0 +1,50 @@ +NAMESPACE := "pete-davidson" +MODEL := "RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic" + +logs POD: + kubectl logs -f {{POD}} | grep -v "GET /metrics HTTP/1.1" + +get-ips: + just get-pods | awk '/^redhatai-llama-4-maverick-17b-128e-instruct-fp8-(decode|prefill)/ {print $6}' +get-pods: + kubectl get pods -n {{NAMESPACE}} -o wide + +hf-token: + kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN=$HF_TOKEN -n {{NAMESPACE}} + +[working-directory: '../quickstart'] +install VALUES: + ./llmd-installer.sh \ + --hf-token $HF_TOKEN \ + --namespace {{NAMESPACE}} \ + --storage-class shared-vast --storage-size 300Gi \ + --values-file $PWD/../project/{{VALUES}} + +start VALUES: + just install {{VALUES}} && \ + just hf-token && \ + just start-bench + +[working-directory: '../quickstart'] +uninstall VALUES: + ./llmd-installer.sh \ + --hf-token $HF_TOKEN \ + --namespace {{NAMESPACE}} \ + --storage-class shared-vast --storage-size 300Gi \ + --values-file $PWD/../project/{{VALUES}} \ + --uninstall + +gh-token GH_TOKEN: + kubectl create secret generic gh-token-secret --from-literal=GH_TOKEN='{{GH_TOKEN}}' -n {{NAMESPACE}} + +# Interactive benchmark commands: +start-bench: + kubectl apply -n {{NAMESPACE}} -f benchmark-interactive-pod.yaml + +delete-bench: + kubectl delete pod -n {{NAMESPACE}} benchmark-interactive + +exec-bench: + kubectl cp reset_prefixes.sh {{NAMESPACE}}/benchmark-interactive:/app/reset_prefixes.sh && \ + kubectl cp Justfile.remote {{NAMESPACE}}/benchmark-interactive:/app/Justfile && \ + kubectl exec -it -n {{NAMESPACE}} benchmark-interactive -- /bin/bash diff --git a/quickstart/examples/rob-benchmarking/Justfile.remote b/quickstart/examples/rob-benchmarking/Justfile.remote new file mode 100644 index 0000000..bbec981 --- /dev/null +++ b/quickstart/examples/rob-benchmarking/Justfile.remote @@ -0,0 +1,36 @@ +# Use this Justfile within the cluster. 
+ +# MODEL := "RedHatAI/Llama-4-Maverick-17B-128E-Instruct-FP8" +MODEL := "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic" +BASE_URL := "http://llm-d-inference-gateway" + +eval: + lm_eval --model local-completions --tasks gsm8k \ + --model_args model={{MODEL}},base_url={{BASE_URL}}/v1/completions,num_concurrent=50,max_retries=3,tokenized_requests=False \ + --limit 100 + +benchmark RR NUM_REQUESTS INPUT_LEN OUTPUT_LEN: + python vllm/benchmarks/benchmark_serving.py \ + --base-url {{BASE_URL}} \ + --model {{MODEL}} \ + --dataset-name random \ + --random-input-len {{INPUT_LEN}} \ + --random-output-len {{OUTPUT_LEN}} \ + --request-rate {{RR}} \ + --seed $(date +%M%H%M%S) \ + --num-prompts {{NUM_REQUESTS}} \ + --ignore-eos + +# just benchmark 4 1000 15000 5000 <-- current 1P3D setup +# +benchmark_no_pd POD_IP RR NUM_REQUESTS INPUT_LEN OUTPUT_LEN: + python vllm/benchmarks/benchmark_serving.py \ + --base-url http://{{POD_IP}}:8000 \ + --model {{MODEL}} \ + --dataset-name random \ + --random-input-len {{INPUT_LEN}} \ + --random-output-len {{OUTPUT_LEN}} \ + --request-rate {{RR}} \ + --seed $(date +%M%H%M%S) \ + --num-prompts {{NUM_REQUESTS}} \ + --ignore-eos diff --git a/quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml b/quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml new file mode 100644 index 0000000..bcb6434 --- /dev/null +++ b/quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml @@ -0,0 +1,32 @@ +# benchmark-client-interactive-pod.yaml +apiVersion: v1 +kind: Pod +metadata: + name: benchmark-interactive + labels: + app: benchmark-interactive # Labels for organization +spec: + containers: + - name: benchmark-runner + image: "quay.io/tms/pd-disagg-benchmark:0.0.6" + imagePullPolicy: Always + stdin: true + tty: true + resources: + requests: + cpu: "16" + memory: "64Gi" + limits: + cpu: "16" + memory: "64Gi" + env: + - name: PROXY_HOST + value: "custom-llm-proxy-service" + - name: PROXY_PORT + value: "80" + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret # set up with just hf_token + key: HF_TOKEN + restartPolicy: Never diff --git a/quickstart/test-request.sh b/quickstart/test-request.sh index 5635240..26f0afc 100755 --- a/quickstart/test-request.sh +++ b/quickstart/test-request.sh @@ -89,7 +89,7 @@ validation() { # Discover the decode pod IP POD_IP=$(kubectl get pods -n "$NAMESPACE" \ -o jsonpath='{range .items[*]}{.metadata.name}{" "}{.status.podIP}{"\n"}{end}' \ - | grep decode | awk '{print $2}') + | grep decode | awk '{print $2}' | head -1) if [[ -z "$POD_IP" ]]; then echo "Error: no decode pod found in namespace $NAMESPACE" From 36ab0c996e53bda040febc3602a65525cce307f1 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 8 Jun 2025 13:17:08 +0000 Subject: [PATCH 2/5] rmove examples Signed-off-by: rshaw@neuralmagic.com --- .../examples/rob-benchmarking/2P1D-het.yaml | 697 ------------------ quickstart/examples/rob-benchmarking/Justfile | 50 -- .../examples/rob-benchmarking/Justfile.remote | 36 - .../benchmark-interactive-pod.yaml | 32 - 4 files changed, 815 deletions(-) delete mode 100644 quickstart/examples/rob-benchmarking/2P1D-het.yaml delete mode 100644 quickstart/examples/rob-benchmarking/Justfile delete mode 100644 quickstart/examples/rob-benchmarking/Justfile.remote delete mode 100644 quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml diff --git a/quickstart/examples/rob-benchmarking/2P1D-het.yaml b/quickstart/examples/rob-benchmarking/2P1D-het.yaml deleted file mode 100644 index 
2dfc7b4..0000000 --- a/quickstart/examples/rob-benchmarking/2P1D-het.yaml +++ /dev/null @@ -1,697 +0,0 @@ -# yaml-language-server: $schema=values.schema.json - -# Default values for the llm-d chart. -# This is a YAML-formatted file. -# Declare variables to be passed into your templates. - -# -- Global parameters -# Global Docker image parameters -# Please, note that this will override the image parameters, including dependencies, configured to use the global value -# Current available global Docker image parameters: imageRegistry, imagePullSecrets and storageClass -# @default -- See below -global: - # -- Global Docker image registry - imageRegistry: "" - - # @schema - # items: - # type: string - # @schema - # -- Global Docker registry secret names as an array - #
E.g. `imagePullSecrets: [myRegistryKeySecretName]` - imagePullSecrets: [] - - security: - allowInsecureImages: true - -# @schema -# additionalProperties: true -# @schema -# -- Parameters for bitnami.common dependency -common: {} - -# -- Common parameters -# -- Override Kubernetes version -kubeVersion: "" - -# -- String to partially override common.names.fullname -nameOverride: "" - -# -- String to fully override common.names.fullname -fullnameOverride: "" - -# -- Default Kubernetes cluster domain -clusterDomain: cluster.local - -# @schema -# additionalProperties: true -# @schema -# -- Labels to add to all deployed objects -commonLabels: {} - -# @schema -# additionalProperties: true -# @schema -# -- Annotations to add to all deployed objects -commonAnnotations: {} - -# @schema -# items: -# type: [string, object] -# @schema -# -- Array of extra objects to deploy with the release -extraDeploy: [] - -# -- Helm tests -test: - # -- Enable rendering of helm test resources - enabled: false - - # @default -- See below - image: - # -- Test connection pod image registry - registry: quay.io - - # -- Test connection pod image repository. Note that the image needs to have both the `sh` and `curl` binaries in it. - repository: curl/curl - - # -- Test connection pod image tag. Note that the image needs to have both the `sh` and `curl` binaries in it. - tag: latest - - # -- Specify a imagePullPolicy - imagePullPolicy: "Always" - - # @schema - # items: - # type: string - # @schema - # -- Optionally specify an array of imagePullSecrets (evaluated as templates) - pullSecrets: [] - -# -- Sample application deploying a p-d pair of specific model -# @default -- See below -sampleApplication: - baseConfigMapRefName: basic-gpu-with-nixl-preset - - # -- Enable rendering of sample application resources - enabled: true - - model: - # -- Fully qualified pvc URI: pvc:/// - modelArtifactURI: hf://RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic - - # -- Name of the model - modelName: "RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic" - - # -- Aliases to the Model named vllm will serve with - servedModelNames: [] - - auth: - # -- HF token auth config via k8s secret. - hfToken: - # -- Name of the secret to create to store your huggingface token - name: llm-d-hf-token - # -- Value of the token. 
Do not set this but use `envsubst` in conjunction with the helm chart - key: HF_TOKEN - - # -- InferencePool port configuration - inferencePoolPort: 8000 - - prefill: - # -- number of desired prefill replicas - replicas: 2 - - # @schema - # items: - # type: string - # @schema - # -- args to add to the prefill deployment - extraArgs: - - "--tensor-parallel-size" - - "1" - - "--disable-log-requests" - - "--max-model-len" - - "32768" - - "--distributed-executor-backend" - - "mp" - - "--block-size" - - "128" - - "--max-num-batched-tokens" - - "32768" - - decode: - # -- number of desired decode replicas - replicas: 1 - - # @schema - # items: - # type: string - # @schema - # -- args to add to the decode deployment - extraArgs: - - "--tensor-parallel-size" - - "4" - - "--disable-log-requests" - - "--max-model-len" - - "32768" - - "--distributed-executor-backend" - - "mp" - - "--block-size" - - "128" - -# -- Gateway configuration -# @default -- See below -gateway: - # -- Deploy resources related to Gateway - enabled: true - - # -- String to fully override gateway.fullname - fullnameOverride: "" - - # -- String to partially override gateway.fullname - nameOverride: "" - - # -- Gateway class that determines the backend used - # Currently supported values: "kgateway" or "istio" - gatewayClassName: kgateway - - # @schema - # additionalProperties: true - # @schema - # -- Additional annotations provided to the Gateway resource - annotations: {} - - # Special parameters applied to kGateway via GatewayParameters resource - kGatewayParameters: - # @schema - # type: [number, boolean] - # @schema - proxyUID: false - - # @schema - # items: - # type: object - # properties: - # name: - # description: Name is the name of the Listener. This name MUST be unique within a Gateway - # type: string - # path: - # description: Path to expose via Ingress - # type: string - # port: - # description: Port is the network port. Multiple listeners may use the same port, subject to the Listener compatibility rules - # type: integer - # minimum: 1 - # maximum: 65535 - # protocol: - # description: Protocol specifies the network protocol this listener expects to receive - # type: string - # @schema - # Set of listeners exposed via the Gateway, also propagated to the Ingress if enabled - listeners: - - name: default - path: / - port: 80 - protocol: HTTP - - # -- Gateway's service type. Ingress is only available if the service type is set to NodePort. Accepted values: ["LoadBalancer", "NodePort"] - serviceType: NodePort - -# -- Ingress configuration -# @default -- See below -ingress: - # -- Deploy Ingress - enabled: true - - # -- Name of the IngressClass cluster resource which defines which controller will implement the resource (e.g nginx) - ingressClassName: "" - - # @schema - # additionalProperties: true - # @schema - # -- Additional annotations for the Ingress resource - annotations: {} - - # -- Hostname to be used to expose the NodePort service to the inferencing gateway - host: "" - - # -- List of additional hostnames to be covered with this ingress record (e.g. 
a CNAME) - # - extraHosts: [] - - # -- Path to be used to expose the full route to access the inferencing gateway - path: "/" - - # -- Ingress TLS parameters - tls: - # -- Enable TLS configuration for the host defined at `ingress.host` parameter - enabled: false - - # -- The name to which the TLS Secret will be called - secretName: "" - - # @schema - # items: - # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.networking.v1.IngressTLS - # @schema - # -- The TLS configuration for additional hostnames to be covered with this ingress record. - #
Ref: https://kubernetes.io/docs/concepts/services-networking/ingress/#tls - # - extraTls: [] - - # -- used as part of the host dirivation if not specified from OCP cluster domain (dont edit) - clusterRouterBase: "" - -# -- Model service controller configuration -# @default -- See below -modelservice: - # -- Toggle to deploy modelservice controller related resources - enabled: true - - # -- Enable metrics gathering via podMonitor / ServiceMonitor - metrics: - # -- Enable metrics scraping from prefill and decode services, see `model - enabled: true - - # -- Prometheus ServiceMonitor configuration - #
Ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api-reference/api.md - # @default -- See below - serviceMonitor: - # @schema - # additionalProperties: true - # @schema - # -- Additional annotations provided to the ServiceMonitor - annotations: {} - - # @schema - # additionalProperties: true - # @schema - # -- Additional labels provided to the ServiceMonitor - labels: {} - - # -- ServiceMonitor endpoint port - port: "vllm" - - # -- ServiceMonitor endpoint path - path: "/metrics" - - # -- ServiceMonitor endpoint interval at which metrics should be scraped - interval: "15s" - - # -- ServiceMonitor namespace selector - namespaceSelector: - any: false - - # @schema - # items: - # type: string - # @schema - matchNames: [] - - # -- ServiceMonitor selector matchLabels - #
matchLabels must match labels on modelservice Services - selector: - # @schema - # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.apimachinery.pkg.apis.meta.v1.LabelSelector - # @schema - matchLabels: {} - - # -- String to fully override modelservice.fullname - fullnameOverride: "" - - # -- String to partially override modelservice.fullname - nameOverride: "" - - # -- Number of controller replicas - replicas: 1 - - # -- Modelservice controller image, please change only if appropriate adjustments to the CRD are being made - # @default -- See below - image: - # -- Model Service controller image registry - registry: ghcr.io - - # -- Model Service controller image repository - repository: llm-d/llm-d-model-service - - # -- Model Service controller image tag - tag: "0.0.10" - - # -- Specify a imagePullPolicy - imagePullPolicy: "Always" - - # @schema - # items: - # type: string - # @schema - # -- Optionally specify an array of imagePullSecrets (evaluated as templates) - pullSecrets: [] - - # -- Endpoint picker configuration - # @default -- See below - epp: - # -- Endpoint picker image used in ModelService CR presets - # @default -- See below - image: - # -- Endpoint picker image registry - registry: ghcr.io - - # -- Endpoint picker image repository - repository: llm-d/llm-d-inference-scheduler - - # -- Endpoint picker image tag - tag: 0.0.2 - - # -- Specify a imagePullPolicy - imagePullPolicy: "Always" - - # @schema - # items: - # type: string - # @schema - # -- Optionally specify an array of imagePullSecrets (evaluated as templates) - pullSecrets: [] - - # -- Enable metrics gathering via podMonitor / ServiceMonitor - metrics: - # -- Enable metrics scraping from endpoint picker service - enabled: true - - # -- Prometheus ServiceMonitor configuration - #
Ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api-reference/api.md - # @default -- See below - serviceMonitor: - # @schema - # additionalProperties: true - # @schema - # -- Additional annotations provided to the ServiceMonitor - annotations: {} - - # @schema - # additionalProperties: true - # @schema - # -- Additional labels provided to the ServiceMonitor - labels: {} - - # -- ServiceMonitor endpoint port - port: "metrics" - - # -- ServiceMonitor endpoint path - path: "/metrics" - - # -- ServiceMonitor endpoint interval at which metrics should be scraped - interval: "10s" - - # -- ServiceMonitor namespace selector - namespaceSelector: - any: false - - # @schema - # items: - # type: string - # @schema - matchNames: [] - - # -- ServiceMonitor selector matchLabels - #
matchLabels must match labels on modelservice Services - selector: - # @schema - # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.apimachinery.pkg.apis.meta.v1.LabelSelector - # @schema - matchLabels: {} - - # -- Default environment variables for endpoint picker, use `extraEnvVars` to override default behavior by defining the same variable again. - # Ref: https://github.com/llm-d/llm-d-inference-scheduler/blob/main/docs/architecture.md#scorers--configuration - defaultEnvVars: - - name: ENABLE_KVCACHE_AWARE_SCORER - value: "false" - - name: KVCACHE_AWARE_SCORER_WEIGHT - value: "1" - - name: KVCACHE_INDEXER_REDIS_ADDR - value: '{{ if .Values.redis.enabled }}{{ include "redis.master.service.fullurl" . }}{{ end }}' - - name: ENABLE_PREFIX_AWARE_SCORER - value: "true" - - name: PREFIX_AWARE_SCORER_WEIGHT - value: "2" - - name: ENABLE_LOAD_AWARE_SCORER - value: "true" - - name: LOAD_AWARE_SCORER_WEIGHT - value: "1" - - name: ENABLE_SESSION_AWARE_SCORER - value: "false" - - name: SESSION_AWARE_SCORER_WEIGHT - value: "1" - - name: PD_ENABLED - value: "true" - - name: PD_PROMPT_LEN_THRESHOLD - value: "512" - - name: PREFILL_ENABLE_KVCACHE_AWARE_SCORER - value: "false" - - name: PREFILL_KVCACHE_AWARE_SCORER_WEIGHT - value: "1" - - name: PREFILL_ENABLE_LOAD_AWARE_SCORER - value: "true" - - name: PREFILL_LOAD_AWARE_SCORER_WEIGHT - value: "1" - - name: PREFILL_ENABLE_PREFIX_AWARE_SCORER - value: "true" - - name: PREFILL_PREFIX_AWARE_SCORER_WEIGHT - value: "2" - - name: PREFILL_ENABLE_SESSION_AWARE_SCORER - value: "false" - - name: PREFILL_SESSION_AWARE_SCORER_WEIGHT - value: "1" - - name: DECODE_ENABLE_KVCACHE_AWARE_SCORER - value: "false" - - name: DECODE_KVCACHE_AWARE_SCORER_WEIGHT - value: "1" - - name: DECODE_ENABLE_LOAD_AWARE_SCORER - value: "true" - - name: DECODE_LOAD_AWARE_SCORER_WEIGHT - value: "1" - - name: DECODE_ENABLE_PREFIX_AWARE_SCORER - value: "true" - - name: DECODE_PREFIX_AWARE_SCORER_WEIGHT - value: "2" - - name: DECODE_ENABLE_SESSION_AWARE_SCORER - value: "false" - - name: DECODE_SESSION_AWARE_SCORER_WEIGHT - value: "1" - - # @schema - # items: - # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.EnvVar - # @schema - # -- Additional environment variables for endpoint picker - defaultEnvVarsOverride: [] - - # -- Prefill options - # @default -- See below - prefill: - # @schema - # items: - # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.Toleration - # @schema - # -- Tolerations configuration to deploy prefill pods to tainted nodes - # @default -- See below - tolerations: - # -- default NVIDIA GPU toleration - - key: nvidia.com/gpu - operator: Exists - effect: NoSchedule - - # -- Decode options - # @default -- See below - decode: - # @schema - # items: - # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.Toleration - # @schema - # -- Tolerations configuration to deploy decode pods to tainted nodes - # @default -- See below - tolerations: - # -- default NVIDIA GPU toleration - - key: nvidia.com/gpu - operator: Exists - effect: NoSchedule - - # -- vLLM container options - # @default -- See below - vllm: - # -- vLLM image used in ModelService CR presets - # @default -- See below - image: - # -- llm-d image registry - registry: ghcr.io - - # -- llm-d image repository 
- repository: llm-d/llm-d-dev - - # -- llm-d image tag - tag: 0.0.10 - - # -- Specify a imagePullPolicy - imagePullPolicy: "IfNotPresent" - - # @schema - # items: - # type: string - # @schema - # -- Optionally specify an array of imagePullSecrets (evaluated as templates) - pullSecrets: [] - - # -- Enable metrics gathering via podMonitor / ServiceMonitor - metrics: - # -- Enable metrics scraping from prefill & decode services - enabled: true - - # -- Routing proxy container options - # @default -- See below - routingProxy: - # -- Routing proxy image used in ModelService CR presets - image: - # -- Routing proxy image registry - registry: ghcr.io - - # -- Routing proxy image repository - repository: llm-d/llm-d-routing-sidecar - - # -- Routing proxy image tag - tag: "0.0.6" - - # -- Specify a imagePullPolicy - imagePullPolicy: "IfNotPresent" - - # @schema - # items: - # type: string - # @schema - # -- Optionally specify an array of imagePullSecrets (evaluated as templates) - pullSecrets: [] - - # -- llm-d inference simulator container options - # @default -- See below - inferenceSimulator: - # -- llm-d inference simulator image used in ModelService CR presets - # @default -- See below - image: - # -- llm-d inference simulator image registry - registry: ghcr.io - - # -- llm-d inference simulator image repository - repository: llm-d/llm-d-inference-sim - - # -- llm-d inference simulator image tag - tag: "0.0.4" - - # -- Specify a imagePullPolicy - imagePullPolicy: "IfNotPresent" - - # @schema - # items: - # type: string - # @schema - # -- Optionally specify an array of imagePullSecrets (evaluated as templates) - pullSecrets: [] - - # @schema - # additionalProperties: true - # @schema - # -- Annotations to add to all modelservice resources - annotations: {} - - # @schema - # additionalProperties: true - # @schema - # -- Pod annotations for modelservice - podAnnotations: {} - - # @schema - # additionalProperties: true - # @schema - # -- Pod labels for modelservice - podLabels: {} - - # Model service controller settings - service: - # -- Toggle to deploy a Service resource for Model service controller - enabled: true - - # -- Port number exposed from Model Service controller - port: 8443 - - # -- Service type - type: ClusterIP - - # -- Service Account Configuration - # @default -- See below - serviceAccount: - # -- Enable the creation of a ServiceAccount for Modelservice pods - create: true - - # -- String to fully override modelservice.serviceAccountName, defaults to modelservice.fullname - fullnameOverride: "" - - # -- String to partially override modelservice.serviceAccountName, defaults to modelservice.fullname - nameOverride: "" - - # @schema - # additionalProperties: true - # @schema - # -- Additional custom labels to the service ServiceAccount. - labels: {} - - # @schema - # additionalProperties: true - # @schema - # -- Additional custom annotations for the ServiceAccount. 
- annotations: {} - - rbac: - # -- Enable the creation of RBAC resources - create: true - -# @schema -# $ref: https://raw.githubusercontent.com/bitnami/charts/refs/tags/redis/20.13.4/bitnami/redis/values.schema.json -# @schema -# -- Bitnami/Redis chart configuration -# @default -- Use sane defaults for minimal Redis deployment -redis: - enabled: false - auth: - enabled: false - existingSecretPasswordKey: "" - existingSecret: "" - architecture: standalone - image: - registry: quay.io - repository: sclorg/redis-7-c9s - tag: c9s - master: - kind: Deployment - resources: - limits: - memory: "256Mi" - cpu: "250m" - requests: - memory: "128Mi" - cpu: "100m" - persistence: - enabled: true - size: "5Gi" - pdb: - create: false - service: - ports: - redis: 8100 - networkPolicy: - enabled: false diff --git a/quickstart/examples/rob-benchmarking/Justfile b/quickstart/examples/rob-benchmarking/Justfile deleted file mode 100644 index 209e44f..0000000 --- a/quickstart/examples/rob-benchmarking/Justfile +++ /dev/null @@ -1,50 +0,0 @@ -NAMESPACE := "pete-davidson" -MODEL := "RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic" - -logs POD: - kubectl logs -f {{POD}} | grep -v "GET /metrics HTTP/1.1" - -get-ips: - just get-pods | awk '/^redhatai-llama-4-maverick-17b-128e-instruct-fp8-(decode|prefill)/ {print $6}' -get-pods: - kubectl get pods -n {{NAMESPACE}} -o wide - -hf-token: - kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN=$HF_TOKEN -n {{NAMESPACE}} - -[working-directory: '../quickstart'] -install VALUES: - ./llmd-installer.sh \ - --hf-token $HF_TOKEN \ - --namespace {{NAMESPACE}} \ - --storage-class shared-vast --storage-size 300Gi \ - --values-file $PWD/../project/{{VALUES}} - -start VALUES: - just install {{VALUES}} && \ - just hf-token && \ - just start-bench - -[working-directory: '../quickstart'] -uninstall VALUES: - ./llmd-installer.sh \ - --hf-token $HF_TOKEN \ - --namespace {{NAMESPACE}} \ - --storage-class shared-vast --storage-size 300Gi \ - --values-file $PWD/../project/{{VALUES}} \ - --uninstall - -gh-token GH_TOKEN: - kubectl create secret generic gh-token-secret --from-literal=GH_TOKEN='{{GH_TOKEN}}' -n {{NAMESPACE}} - -# Interactive benchmark commands: -start-bench: - kubectl apply -n {{NAMESPACE}} -f benchmark-interactive-pod.yaml - -delete-bench: - kubectl delete pod -n {{NAMESPACE}} benchmark-interactive - -exec-bench: - kubectl cp reset_prefixes.sh {{NAMESPACE}}/benchmark-interactive:/app/reset_prefixes.sh && \ - kubectl cp Justfile.remote {{NAMESPACE}}/benchmark-interactive:/app/Justfile && \ - kubectl exec -it -n {{NAMESPACE}} benchmark-interactive -- /bin/bash diff --git a/quickstart/examples/rob-benchmarking/Justfile.remote b/quickstart/examples/rob-benchmarking/Justfile.remote deleted file mode 100644 index bbec981..0000000 --- a/quickstart/examples/rob-benchmarking/Justfile.remote +++ /dev/null @@ -1,36 +0,0 @@ -# Use this Justfile within the cluster. 
- -# MODEL := "RedHatAI/Llama-4-Maverick-17B-128E-Instruct-FP8" -MODEL := "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic" -BASE_URL := "http://llm-d-inference-gateway" - -eval: - lm_eval --model local-completions --tasks gsm8k \ - --model_args model={{MODEL}},base_url={{BASE_URL}}/v1/completions,num_concurrent=50,max_retries=3,tokenized_requests=False \ - --limit 100 - -benchmark RR NUM_REQUESTS INPUT_LEN OUTPUT_LEN: - python vllm/benchmarks/benchmark_serving.py \ - --base-url {{BASE_URL}} \ - --model {{MODEL}} \ - --dataset-name random \ - --random-input-len {{INPUT_LEN}} \ - --random-output-len {{OUTPUT_LEN}} \ - --request-rate {{RR}} \ - --seed $(date +%M%H%M%S) \ - --num-prompts {{NUM_REQUESTS}} \ - --ignore-eos - -# just benchmark 4 1000 15000 5000 <-- current 1P3D setup -# -benchmark_no_pd POD_IP RR NUM_REQUESTS INPUT_LEN OUTPUT_LEN: - python vllm/benchmarks/benchmark_serving.py \ - --base-url http://{{POD_IP}}:8000 \ - --model {{MODEL}} \ - --dataset-name random \ - --random-input-len {{INPUT_LEN}} \ - --random-output-len {{OUTPUT_LEN}} \ - --request-rate {{RR}} \ - --seed $(date +%M%H%M%S) \ - --num-prompts {{NUM_REQUESTS}} \ - --ignore-eos diff --git a/quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml b/quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml deleted file mode 100644 index bcb6434..0000000 --- a/quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml +++ /dev/null @@ -1,32 +0,0 @@ -# benchmark-client-interactive-pod.yaml -apiVersion: v1 -kind: Pod -metadata: - name: benchmark-interactive - labels: - app: benchmark-interactive # Labels for organization -spec: - containers: - - name: benchmark-runner - image: "quay.io/tms/pd-disagg-benchmark:0.0.6" - imagePullPolicy: Always - stdin: true - tty: true - resources: - requests: - cpu: "16" - memory: "64Gi" - limits: - cpu: "16" - memory: "64Gi" - env: - - name: PROXY_HOST - value: "custom-llm-proxy-service" - - name: PROXY_PORT - value: "80" - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret # set up with just hf_token - key: HF_TOKEN - restartPolicy: Never From ff8ae72771267ab1e40022a3d462c18e1265884a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 8 Jun 2025 13:17:38 +0000 Subject: [PATCH 3/5] fix typo Signed-off-by: rshaw@neuralmagic.com --- charts/llm-d/templates/sample-application/modelservice.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/llm-d/templates/sample-application/modelservice.yaml b/charts/llm-d/templates/sample-application/modelservice.yaml index efa35d6..913a086 100644 --- a/charts/llm-d/templates/sample-application/modelservice.yaml +++ b/charts/llm-d/templates/sample-application/modelservice.yaml @@ -49,7 +49,7 @@ spec: {{- range .Values.sampleApplication.prefill.extraArgs }} - {{ include "common.tplvalues.render" ( dict "value" . "context" $) | quote }} {{- end }} - resources: {{ .Values.sampleApplication.decode.resources | toYaml | nindent 8 }} + resources: {{ .Values.sampleApplication.prefill.resources | toYaml | nindent 8 }} env: {{- if eq (include "sampleApplication.modelArtifactType" . 
) "hf" }} - name: HF_TOKEN From e117b30deb708d67ea705115ab1a86ec4ff89716 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 8 Jun 2025 13:18:15 +0000 Subject: [PATCH 4/5] fix Signed-off-by: rshaw@neuralmagic.com --- quickstart/test-request.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quickstart/test-request.sh b/quickstart/test-request.sh index 26f0afc..5635240 100755 --- a/quickstart/test-request.sh +++ b/quickstart/test-request.sh @@ -89,7 +89,7 @@ validation() { # Discover the decode pod IP POD_IP=$(kubectl get pods -n "$NAMESPACE" \ -o jsonpath='{range .items[*]}{.metadata.name}{" "}{.status.podIP}{"\n"}{end}' \ - | grep decode | awk '{print $2}' | head -1) + | grep decode | awk '{print $2}') if [[ -z "$POD_IP" ]]; then echo "Error: no decode pod found in namespace $NAMESPACE" From 282ee2ad6a1e79d21e1ab72bd1b9a5d56716fada Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 8 Jun 2025 13:58:55 +0000 Subject: [PATCH 5/5] updated schema Signed-off-by: rshaw@neuralmagic.com --- charts/llm-d/values.schema.json | 169 +++++++++++++++++++++----------- 1 file changed, 114 insertions(+), 55 deletions(-) diff --git a/charts/llm-d/values.schema.json b/charts/llm-d/values.schema.json index a1910e9..b405e0a 100644 --- a/charts/llm-d/values.schema.json +++ b/charts/llm-d/values.schema.json @@ -10471,6 +10471,65 @@ "description": "number of desired decode replicas", "required": [], "title": "replicas" + }, + "resources": { + "description": "ResourceRequirements describes the compute resource requirements.", + "properties": { + "claims": { + "description": "Claims lists the names of resources, defined in spec.resourceClaims, that are used by this container.\n\nThis is an alpha field and requires enabling the DynamicResourceAllocation feature gate.\n\nThis field is immutable. It can only be set for containers.", + "items": { + "description": "ResourceClaim references one entry in PodSpec.ResourceClaims.", + "properties": { + "name": { + "description": "Name must match the name of one entry in pod.spec.resourceClaims of the Pod where this field is used. It makes that resource available inside a container.", + "type": "string" + }, + "request": { + "description": "Request is the name chosen for a request in the referenced claim. If empty, everything from the claim is made available, otherwise only the result of this request.", + "type": "string" + } + }, + "required": [ + "name" + ], + "type": "object" + }, + "type": "array", + "x-kubernetes-list-map-keys": [ + "name" + ], + "x-kubernetes-list-type": "map" + }, + "limits": { + "additionalProperties": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "number" + } + ] + }, + "description": "Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/", + "type": "object" + }, + "requests": { + "additionalProperties": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "number" + } + ] + }, + "description": "Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. Requests cannot exceed Limits. 
More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/", + "type": "object" + } + }, + "type": "object" } }, "required": [], @@ -10688,69 +10747,69 @@ "description": "number of desired prefill replicas", "required": [], "title": "replicas" - } - }, - "required": [], - "title": "prefill", - "type": "object" - }, - "resources": { - "description": "ResourceRequirements describes the compute resource requirements.", - "properties": { - "claims": { - "description": "Claims lists the names of resources, defined in spec.resourceClaims, that are used by this container.\n\nThis is an alpha field and requires enabling the DynamicResourceAllocation feature gate.\n\nThis field is immutable. It can only be set for containers.", - "items": { - "description": "ResourceClaim references one entry in PodSpec.ResourceClaims.", - "properties": { - "name": { - "description": "Name must match the name of one entry in pod.spec.resourceClaims of the Pod where this field is used. It makes that resource available inside a container.", - "type": "string" + }, + "resources": { + "description": "ResourceRequirements describes the compute resource requirements.", + "properties": { + "claims": { + "description": "Claims lists the names of resources, defined in spec.resourceClaims, that are used by this container.\n\nThis is an alpha field and requires enabling the DynamicResourceAllocation feature gate.\n\nThis field is immutable. It can only be set for containers.", + "items": { + "description": "ResourceClaim references one entry in PodSpec.ResourceClaims.", + "properties": { + "name": { + "description": "Name must match the name of one entry in pod.spec.resourceClaims of the Pod where this field is used. It makes that resource available inside a container.", + "type": "string" + }, + "request": { + "description": "Request is the name chosen for a request in the referenced claim. If empty, everything from the claim is made available, otherwise only the result of this request.", + "type": "string" + } + }, + "required": [ + "name" + ], + "type": "object" }, - "request": { - "description": "Request is the name chosen for a request in the referenced claim. If empty, everything from the claim is made available, otherwise only the result of this request.", - "type": "string" - } + "type": "array", + "x-kubernetes-list-map-keys": [ + "name" + ], + "x-kubernetes-list-type": "map" }, - "required": [ - "name" - ], - "type": "object" - }, - "type": "array", - "x-kubernetes-list-map-keys": [ - "name" - ], - "x-kubernetes-list-type": "map" - }, - "limits": { - "additionalProperties": { - "oneOf": [ - { - "type": "string" + "limits": { + "additionalProperties": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "number" + } + ] }, - { - "type": "number" - } - ] - }, - "description": "Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/", - "type": "object" - }, - "requests": { - "additionalProperties": { - "oneOf": [ - { - "type": "string" + "description": "Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/", + "type": "object" + }, + "requests": { + "additionalProperties": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "number" + } + ] }, - { - "type": "number" - } - ] + "description": "Requests describes the minimum amount of compute resources required. 
If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. Requests cannot exceed Limits. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/", + "type": "object" + } }, - "description": "Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. Requests cannot exceed Limits. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/", "type": "object" } }, + "required": [], + "title": "prefill", "type": "object" } },
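
With patches 1, 3, and 5 applied, the shared `sampleApplication.resources` block is replaced by per-role `sampleApplication.prefill.resources` and `sampleApplication.decode.resources`, and the modelservice template renders each role from its own block. A minimal values override using the new layout might look like the sketch below; the GPU counts are illustrative (chosen to match a decode pod running with `--tensor-parallel-size 4`), not chart defaults:

sampleApplication:
  prefill:
    replicas: 2
    resources:
      limits:
        nvidia.com/gpu: "1"
      requests:
        nvidia.com/gpu: "1"
  decode:
    replicas: 1
    resources:
      limits:
        nvidia.com/gpu: "4"
      requests:
        nvidia.com/gpu: "4"

A file like this can be passed to the quickstart installer via `--values-file`, as in the example Justfile above, or to `helm upgrade -f` for an existing release.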