From 0308d43856314185394582df701de7b07b78fa47 Mon Sep 17 00:00:00 2001
From: "Tsai, Louie"
Date: Wed, 12 Nov 2025 12:01:19 -0800
Subject: [PATCH] add CPU BKMs

Apply CPU best-known methods (BKMs) to the vLLM serving runtime: expose
tensor parallelism, dtype, block size, and batching limits through
values.yaml, and set the CPU-specific vLLM environment variables.

---
 helm/templates/servingruntime.yaml | 24 +++++++++++++++++++++++-
 helm/values.yaml                   |  7 ++++++-
 2 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/helm/templates/servingruntime.yaml b/helm/templates/servingruntime.yaml
index 8927006..84bee71 100644
--- a/helm/templates/servingruntime.yaml
+++ b/helm/templates/servingruntime.yaml
@@ -24,6 +24,28 @@ spec:
         - {{ .Values.model.maxModelLen | quote }}
         - '--served-model-name'
         - {{ .Values.model.name }}
+        - '--tensor-parallel-size'
+        - {{ .Values.resources.inference.tp | quote }}
+        - '--dtype'
+        - {{ .Values.model.dtype | quote }}
+        - '--enforce-eager'
+        - '--distributed-executor-backend'
+        - "mp"
+        - '--block-size'
+        - {{ .Values.model.block_size | quote }}
+        - '--max-num-batched-tokens'
+        - {{ .Values.model.max_num_batched_tokens | quote }}
+        - '--max-num-seqs'
+        - {{ .Values.model.max_num_seqs | quote }}
+      env:
+        - name: VLLM_CPU_KVCACHE_SPACE
+          value: "4"
+        - name: VLLM_CPU_SGL_KERNEL
+          value: "1"
+        - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
+          value: "1"
+        - name: VLLM_RPC_TIMEOUT
+          value: "100000"
       image: {{ .Values.images.vllmRuntime.repository }}:{{ .Values.images.vllmRuntime.tag }}
       name: kserve-container
       ports:
@@ -33,4 +55,4 @@ spec:
   multiModel: false
   supportedModelFormats:
     - autoSelect: true
-      name: vLLM
\ No newline at end of file
+      name: vLLM
diff --git a/helm/values.yaml b/helm/values.yaml
index 6473877..aaec419 100644
--- a/helm/values.yaml
+++ b/helm/values.yaml
@@ -40,14 +40,19 @@ model:
   storageUri: "oci://quay.io/rh-aiservices-bu/tinyllama:1.0"
   name: "tinyllama"
   maxModelLen: 2048
+  dtype: "bfloat16"
+  block_size: "128"
+  max_num_batched_tokens: "2048"
+  max_num_seqs: "256"
 
 resources:
   inference:
     requests:
       cpu: "2"
       memory: "4Gi"
+    tp: "1"
     limits:
       cpu: "8"
       memory: "8Gi"
 
-storageClassName: gp3-csi
\ No newline at end of file
+storageClassName: gp3-csi
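
Notes:

With the defaults in helm/values.yaml, the new args and env entries render
roughly as below. This is a sketch of `helm template` output, not verified
against a cluster; the `| quote` filters keep numeric-looking values such as
"1" and "128" as YAML strings, which Kubernetes requires for container args.

        - '--tensor-parallel-size'
        - "1"
        - '--dtype'
        - "bfloat16"
        - '--enforce-eager'
        - '--distributed-executor-backend'
        - "mp"
        - '--block-size'
        - "128"
        - '--max-num-batched-tokens'
        - "2048"
        - '--max-num-seqs'
        - "256"
      env:
        - name: VLLM_CPU_KVCACHE_SPACE    # GiB reserved for the CPU KV cache;
          value: "4"                      # keep it below the 8Gi memory limit
        - name: VLLM_CPU_SGL_KERNEL       # opt into the optimized CPU kernels
          value: "1"
        - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN   # permit --max-model-len above
          value: "1"                            # the model-config default
        - name: VLLM_RPC_TIMEOUT          # milliseconds; generous timeout for
          value: "100000"                 # slow CPU engine startup

The BKM knobs can be overridden at install time; the release name and chart
path below are placeholders:

    helm upgrade --install vllm-tinyllama ./helm \
      --set-string resources.inference.tp=1 \
      --set-string model.max_num_seqs=128 \
      --set-string model.max_num_batched_tokens=4096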