From 0308d43856314185394582df701de7b07b78fa47 Mon Sep 17 00:00:00 2001
From: "Tsai, Louie"
Date: Wed, 12 Nov 2025 12:01:19 -0800
Subject: [PATCH] add CPU BKMs

Apply CPU best-known methods (BKMs) to the vLLM serving runtime: expose
tensor parallelism, dtype, block size, and batching limits through
values.yaml, and set the CPU-specific vLLM environment variables.

---
 helm/templates/servingruntime.yaml | 24 +++++++++++++++++++++++-
 helm/values.yaml                   |  7 ++++++-
 2 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/helm/templates/servingruntime.yaml b/helm/templates/servingruntime.yaml
index 8927006..84bee71 100644
--- a/helm/templates/servingruntime.yaml
+++ b/helm/templates/servingruntime.yaml
@@ -24,6 +24,28 @@ spec:
         - {{ .Values.model.maxModelLen | quote }}
         - '--served-model-name'
         - {{ .Values.model.name }}
+        - '--tensor-parallel-size'
+        - {{ .Values.resources.inference.tp | quote }}
+        - '--dtype'
+        - {{ .Values.model.dtype | quote }}
+        - '--enforce-eager'
+        - '--distributed-executor-backend'
+        - "mp"
+        - '--block-size'
+        - {{ .Values.model.block_size | quote }}
+        - '--max-num-batched-tokens'
+        - {{ .Values.model.max_num_batched_tokens | quote }}
+        - '--max-num-seqs'
+        - {{ .Values.model.max_num_seqs | quote }}
+      env:
+        - name: VLLM_CPU_KVCACHE_SPACE
+          value: "4"
+        - name: VLLM_CPU_SGL_KERNEL
+          value: "1"
+        - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
+          value: "1"
+        - name: VLLM_RPC_TIMEOUT
+          value: "100000"
       image: {{ .Values.images.vllmRuntime.repository }}:{{ .Values.images.vllmRuntime.tag }}
       name: kserve-container
       ports:
@@ -33,4 +55,4 @@ spec:
   multiModel: false
   supportedModelFormats:
     - autoSelect: true
-      name: vLLM
\ No newline at end of file
+      name: vLLM
diff --git a/helm/values.yaml b/helm/values.yaml
index 6473877..aaec419 100644
--- a/helm/values.yaml
+++ b/helm/values.yaml
@@ -40,14 +40,19 @@ model:
   storageUri: "oci://quay.io/rh-aiservices-bu/tinyllama:1.0"
   name: "tinyllama"
   maxModelLen: 2048
+  dtype: "bfloat16"
+  block_size: "128"
+  max_num_batched_tokens: "2048"
+  max_num_seqs: "256"
 
 resources:
   inference:
     requests:
       cpu: "2"
       memory: "4Gi"
+    tp: "1"
     limits:
       cpu: "8"
       memory: "8Gi"
 
-storageClassName: gp3-csi
\ No newline at end of file
+storageClassName: gp3-csi
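
Notes:

With the defaults in helm/values.yaml, the new args and env entries render
roughly as below. This is a sketch of `helm template` output, not verified
against a cluster; the `| quote` filters keep numeric-looking values such as
"1" and "128" as YAML strings, which Kubernetes requires for container args.

        - '--tensor-parallel-size'
        - "1"
        - '--dtype'
        - "bfloat16"
        - '--enforce-eager'
        - '--distributed-executor-backend'
        - "mp"
        - '--block-size'
        - "128"
        - '--max-num-batched-tokens'
        - "2048"
        - '--max-num-seqs'
        - "256"
      env:
        - name: VLLM_CPU_KVCACHE_SPACE    # GiB reserved for the CPU KV cache;
          value: "4"                      # keep it below the 8Gi memory limit
        - name: VLLM_CPU_SGL_KERNEL       # opt into the optimized CPU kernels
          value: "1"
        - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN   # permit --max-model-len above
          value: "1"                            # the model-config default
        - name: VLLM_RPC_TIMEOUT          # milliseconds; generous timeout for
          value: "100000"                 # slow CPU engine startup

The BKM knobs can be overridden at install time; the release name and chart
path below are placeholders:

    helm upgrade --install vllm-tinyllama ./helm \
      --set-string resources.inference.tp=1 \
      --set-string model.max_num_seqs=128 \
      --set-string model.max_num_batched_tokens=4096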