From 8d177b08afba82887c04d0f2526169d0f6aa93da Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Tue, 9 Sep 2025 13:02:43 -0700 Subject: [PATCH 01/20] add example for kv cache offloading Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- examples/llm-api/llm_kv_cache_offloading.py | 33 +++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 examples/llm-api/llm_kv_cache_offloading.py diff --git a/examples/llm-api/llm_kv_cache_offloading.py b/examples/llm-api/llm_kv_cache_offloading.py new file mode 100644 index 00000000000..9aa28042120 --- /dev/null +++ b/examples/llm-api/llm_kv_cache_offloading.py @@ -0,0 +1,33 @@ +from tensorrt_llm import LLM +from tensorrt_llm.llmapi import KvCacheConfig + + +def main(): + print("\n=== KV Cache Configuration Example ===") + print("\n1. KV Cache Configuration:") + + llm_advanced = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", + max_batch_size=1, + max_seq_len=1024, + kv_cache_config=KvCacheConfig( + free_gpu_memory_fraction=0.5, + enable_block_reuse=True, + max_tokens=1024, + host_cache_size=1 * 1024 * 1024 * 1024, + tokens_per_block=32)) + + prompts = [ + "Hello, my name is", + "The capital of France is", + "The future of AI is", + ] + + outputs = llm_advanced.generate(prompts) + for i, output in enumerate(outputs): + print(f"Query {i+1}: {output.prompt}") + print(f"Answer: {output.outputs[0].text[:100]}...") + print() + + +if __name__ == "__main__": + main() From 47e36a695ba12c40858d467772389d223bdc23fa Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Tue, 9 Sep 2025 13:26:40 -0700 Subject: [PATCH 02/20] clean Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- examples/llm-api/llm_kv_cache_offloading.py | 89 +++++++++++++++------ 1 file changed, 65 insertions(+), 24 deletions(-) diff --git a/examples/llm-api/llm_kv_cache_offloading.py b/examples/llm-api/llm_kv_cache_offloading.py index 9aa28042120..5475dc795a8 100644 --- a/examples/llm-api/llm_kv_cache_offloading.py +++ b/examples/llm-api/llm_kv_cache_offloading.py @@ -3,30 +3,71 @@ def main(): - print("\n=== KV Cache Configuration Example ===") - print("\n1. KV Cache Configuration:") - - llm_advanced = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", - max_batch_size=1, - max_seq_len=1024, - kv_cache_config=KvCacheConfig( - free_gpu_memory_fraction=0.5, - enable_block_reuse=True, - max_tokens=1024, - host_cache_size=1 * 1024 * 1024 * 1024, - tokens_per_block=32)) - - prompts = [ - "Hello, my name is", - "The capital of France is", - "The future of AI is", - ] - - outputs = llm_advanced.generate(prompts) - for i, output in enumerate(outputs): - print(f"Query {i+1}: {output.prompt}") - print(f"Answer: {output.outputs[0].text[:100]}...") - print() + + prompt_a = ( + "Given the following question and four candidate answers (A, B, C and D), choose the best answer.\n" + "The following excerpt is from a pamphlet.\nYou will do me the justice to remember, " + "that I have always strenuously supported the Right of every man to his own opinion" + ) + + prompt_b = ( + "Question: This question refers to the following information.\nRead the following excerpt." + "\nThe revolutionary seed had penetrated into every country and spread more or less. 
" + "It was greatly developed under") + + # Offloading Off + llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", + max_batch_size=1, + max_seq_len=256, + kv_cache_config=KvCacheConfig(enable_block_reuse=True, + max_tokens=256, + tokens_per_block=16)) + # prompt_a occupies kv cache + output_a = llm.generate(prompt_a) + print(output_a.prompt) + + # since max_batch_size=1, prompt_b clears and update kv cache + output_b = llm.generate(prompt_b) + print(output_b.prompt) + + # prompt_a clears and update kv cache again + output_a = llm.generate(prompt_a) + print(output_a.prompt) + + # prompt_b clears and update kv cache again + output_b = llm.generate(prompt_b) + print(output_b.prompt) + + llm.shutdown() + + # Offloading On + llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", + max_batch_size=1, + max_seq_len=256, + kv_cache_config=KvCacheConfig(enable_block_reuse=True, + max_tokens=256, + tokens_per_block=16, + host_cache_size=1024**3)) + # prompt_a occupies kv cache + output_a = llm.generate(prompt_a) + print(output_a.prompt) + + # since max_batch_size=1, and offloading is enabled, kv cache of prompt_a will be offloaded to host memory. + # kv cache of prompt_b keeps in device memory. + output_b = llm.generate(prompt_b) + print(output_b.prompt) + + # kv cache of prompt_a will be onboarded to device memory, kv cache of prompt_b will be offloaded to host memory. + # kv cache of prompt_a will be reused. + output_a = llm.generate(prompt_a) + print(output_a.prompt) + + # kv cache of prompt_b will be onboarded to device memory, kv cache of prompt_a will be offloaded to host memory. + # kv cache of prompt_b will be reused. + output_b = llm.generate(prompt_b) + print(output_b.prompt) + + llm.shutdown() if __name__ == "__main__": From cd0bb9ff28f3efad217de8c19612f8bee91fef32 Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Tue, 9 Sep 2025 13:27:39 -0700 Subject: [PATCH 03/20] clean Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- examples/llm-api/llm_kv_cache_offloading.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/llm-api/llm_kv_cache_offloading.py b/examples/llm-api/llm_kv_cache_offloading.py index 5475dc795a8..215a7110e3d 100644 --- a/examples/llm-api/llm_kv_cache_offloading.py +++ b/examples/llm-api/llm_kv_cache_offloading.py @@ -22,7 +22,7 @@ def main(): kv_cache_config=KvCacheConfig(enable_block_reuse=True, max_tokens=256, tokens_per_block=16)) - # prompt_a occupies kv cache + # prompt_a occupies kv cache pool output_a = llm.generate(prompt_a) print(output_a.prompt) @@ -31,10 +31,12 @@ def main(): print(output_b.prompt) # prompt_a clears and update kv cache again + # no kv cache reuse happens output_a = llm.generate(prompt_a) print(output_a.prompt) # prompt_b clears and update kv cache again + # no kv cache reuse happens output_b = llm.generate(prompt_b) print(output_b.prompt) @@ -48,7 +50,7 @@ def main(): max_tokens=256, tokens_per_block=16, host_cache_size=1024**3)) - # prompt_a occupies kv cache + # prompt_a occupies kv cache pool output_a = llm.generate(prompt_a) print(output_a.prompt) From 30858323a4ab4f4ba4288dd8bb12cb0eb7bf7c94 Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Tue, 9 Sep 2025 13:33:02 -0700 Subject: [PATCH 04/20] clean Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- examples/llm-api/llm_kv_cache_offloading.py | 26 +++++++++++++-------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git 
a/examples/llm-api/llm_kv_cache_offloading.py b/examples/llm-api/llm_kv_cache_offloading.py index 215a7110e3d..72e6c05ae52 100644 --- a/examples/llm-api/llm_kv_cache_offloading.py +++ b/examples/llm-api/llm_kv_cache_offloading.py @@ -5,23 +5,28 @@ def main(): prompt_a = ( - "Given the following question and four candidate answers (A, B, C and D), choose the best answer.\n" + "Given the following question and four candidate answers (A, B, C and D), choose the best answer." "The following excerpt is from a pamphlet.\nYou will do me the justice to remember, " "that I have always strenuously supported the Right of every man to his own opinion" ) prompt_b = ( - "Question: This question refers to the following information.\nRead the following excerpt." - "\nThe revolutionary seed had penetrated into every country and spread more or less. " + "Question: This question refers to the following information. Read the following excerpt." + "The revolutionary seed had penetrated into every country and spread more or less. " "It was greatly developed under") + kv_cache_max_tokens = 256 + kv_cache_page_size = 16 + kv_cache_host_size_in_bytes = 1024**3 + # Offloading Off llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", max_batch_size=1, max_seq_len=256, - kv_cache_config=KvCacheConfig(enable_block_reuse=True, - max_tokens=256, - tokens_per_block=16)) + kv_cache_config=KvCacheConfig( + enable_block_reuse=True, + max_tokens=kv_cache_max_tokens, + tokens_per_block=kv_cache_page_size)) # prompt_a occupies kv cache pool output_a = llm.generate(prompt_a) print(output_a.prompt) @@ -46,10 +51,11 @@ def main(): llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", max_batch_size=1, max_seq_len=256, - kv_cache_config=KvCacheConfig(enable_block_reuse=True, - max_tokens=256, - tokens_per_block=16, - host_cache_size=1024**3)) + kv_cache_config=KvCacheConfig( + enable_block_reuse=True, + max_tokens=kv_cache_max_tokens, + tokens_per_block=kv_cache_page_size, + host_cache_size=kv_cache_host_size_in_bytes)) # prompt_a occupies kv cache pool output_a = llm.generate(prompt_a) print(output_a.prompt) From b0df6e7b93bf673c5c96b59bd536a9f8c138de3b Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Tue, 9 Sep 2025 13:35:32 -0700 Subject: [PATCH 05/20] clean Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- examples/llm-api/llm_kv_cache_offloading.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/llm-api/llm_kv_cache_offloading.py b/examples/llm-api/llm_kv_cache_offloading.py index 72e6c05ae52..79d25823d37 100644 --- a/examples/llm-api/llm_kv_cache_offloading.py +++ b/examples/llm-api/llm_kv_cache_offloading.py @@ -6,20 +6,20 @@ def main(): prompt_a = ( "Given the following question and four candidate answers (A, B, C and D), choose the best answer." - "The following excerpt is from a pamphlet.\nYou will do me the justice to remember, " - "that I have always strenuously supported the Right of every man to his own opinion" + "The following excerpt is from a pamphlet. You will do me the justice to remember, " ) prompt_b = ( "Question: This question refers to the following information. Read the following excerpt." "The revolutionary seed had penetrated into every country and spread more or less. 
" - "It was greatly developed under") + ) kv_cache_max_tokens = 256 kv_cache_page_size = 16 kv_cache_host_size_in_bytes = 1024**3 # Offloading Off + print("\n === Offloading Off === \n") llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", max_batch_size=1, max_seq_len=256, @@ -48,6 +48,7 @@ def main(): llm.shutdown() # Offloading On + print("\n === Offloading On === \n") llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", max_batch_size=1, max_seq_len=256, From 0362c15e557ea1eb3836de79967437f1be1b0ed6 Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Tue, 9 Sep 2025 13:39:09 -0700 Subject: [PATCH 06/20] clean Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- examples/llm-api/llm_kv_cache_offloading.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/llm-api/llm_kv_cache_offloading.py b/examples/llm-api/llm_kv_cache_offloading.py index 79d25823d37..40492028a4a 100644 --- a/examples/llm-api/llm_kv_cache_offloading.py +++ b/examples/llm-api/llm_kv_cache_offloading.py @@ -6,12 +6,12 @@ def main(): prompt_a = ( "Given the following question and four candidate answers (A, B, C and D), choose the best answer." - "The following excerpt is from a pamphlet. You will do me the justice to remember, " + # "The following excerpt is from a pamphlet. You will do me the justice to remember, " ) prompt_b = ( "Question: This question refers to the following information. Read the following excerpt." - "The revolutionary seed had penetrated into every country and spread more or less. " + # "The revolutionary seed had penetrated into every country and spread more or less. " ) kv_cache_max_tokens = 256 From ab069a9ca19b60beb259e9983eca66ae3ee8d7c8 Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Tue, 9 Sep 2025 13:46:16 -0700 Subject: [PATCH 07/20] clean Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- examples/llm-api/llm_kv_cache_offloading.py | 32 +++++++++++++++------ 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/examples/llm-api/llm_kv_cache_offloading.py b/examples/llm-api/llm_kv_cache_offloading.py index 40492028a4a..4cbec5c42e4 100644 --- a/examples/llm-api/llm_kv_cache_offloading.py +++ b/examples/llm-api/llm_kv_cache_offloading.py @@ -29,21 +29,29 @@ def main(): tokens_per_block=kv_cache_page_size)) # prompt_a occupies kv cache pool output_a = llm.generate(prompt_a) - print(output_a.prompt) + print( + f"Prompt: {output_a.prompt!r}, Generated text: {output_a.outputs[0].text!r}" + ) # since max_batch_size=1, prompt_b clears and update kv cache output_b = llm.generate(prompt_b) - print(output_b.prompt) + print( + f"Prompt: {output_b.prompt!r}, Generated text: {output_a.outputs[0].text!r}" + ) # prompt_a clears and update kv cache again # no kv cache reuse happens output_a = llm.generate(prompt_a) - print(output_a.prompt) + print( + f"Prompt: {output_a.prompt!r}, Generated text: {output_a.outputs[0].text!r}" + ) # prompt_b clears and update kv cache again # no kv cache reuse happens output_b = llm.generate(prompt_b) - print(output_b.prompt) + print( + f"Prompt: {output_b.prompt!r}, Generated text: {output_a.outputs[0].text!r}" + ) llm.shutdown() @@ -59,22 +67,30 @@ def main(): host_cache_size=kv_cache_host_size_in_bytes)) # prompt_a occupies kv cache pool output_a = llm.generate(prompt_a) - print(output_a.prompt) + print( + f"Prompt: {output_a.prompt!r}, Generated text: {output_a.outputs[0].text!r}" + ) # since max_batch_size=1, and offloading is 
enabled, kv cache of prompt_a will be offloaded to host memory. # kv cache of prompt_b keeps in device memory. output_b = llm.generate(prompt_b) - print(output_b.prompt) + print( + f"Prompt: {output_b.prompt!r}, Generated text: {output_a.outputs[0].text!r}" + ) # kv cache of prompt_a will be onboarded to device memory, kv cache of prompt_b will be offloaded to host memory. # kv cache of prompt_a will be reused. output_a = llm.generate(prompt_a) - print(output_a.prompt) + print( + f"Prompt: {output_a.prompt!r}, Generated text: {output_a.outputs[0].text!r}" + ) # kv cache of prompt_b will be onboarded to device memory, kv cache of prompt_a will be offloaded to host memory. # kv cache of prompt_b will be reused. output_b = llm.generate(prompt_b) - print(output_b.prompt) + print( + f"Prompt: {output_b.prompt!r}, Generated text: {output_a.outputs[0].text!r}" + ) llm.shutdown() From fc888e7a9fa95b8f7841a81d9b52744ba8339636 Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Tue, 9 Sep 2025 13:48:11 -0700 Subject: [PATCH 08/20] clean Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- examples/llm-api/llm_kv_cache_offloading.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/llm-api/llm_kv_cache_offloading.py b/examples/llm-api/llm_kv_cache_offloading.py index 4cbec5c42e4..ff65b4eee0b 100644 --- a/examples/llm-api/llm_kv_cache_offloading.py +++ b/examples/llm-api/llm_kv_cache_offloading.py @@ -36,7 +36,7 @@ def main(): # since max_batch_size=1, prompt_b clears and update kv cache output_b = llm.generate(prompt_b) print( - f"Prompt: {output_b.prompt!r}, Generated text: {output_a.outputs[0].text!r}" + f"Prompt: {output_b.prompt!r}, Generated text: {output_b.outputs[0].text!r}" ) # prompt_a clears and update kv cache again @@ -50,7 +50,7 @@ def main(): # no kv cache reuse happens output_b = llm.generate(prompt_b) print( - f"Prompt: {output_b.prompt!r}, Generated text: {output_a.outputs[0].text!r}" + f"Prompt: {output_b.prompt!r}, Generated text: {output_b.outputs[0].text!r}" ) llm.shutdown() @@ -75,7 +75,7 @@ def main(): # kv cache of prompt_b keeps in device memory. output_b = llm.generate(prompt_b) print( - f"Prompt: {output_b.prompt!r}, Generated text: {output_a.outputs[0].text!r}" + f"Prompt: {output_b.prompt!r}, Generated text: {output_b.outputs[0].text!r}" ) # kv cache of prompt_a will be onboarded to device memory, kv cache of prompt_b will be offloaded to host memory. @@ -89,7 +89,7 @@ def main(): # kv cache of prompt_b will be reused. 
output_b = llm.generate(prompt_b) print( - f"Prompt: {output_b.prompt!r}, Generated text: {output_a.outputs[0].text!r}" + f"Prompt: {output_b.prompt!r}, Generated text: {output_b.outputs[0].text!r}" ) llm.shutdown() From 06dea3c91c53ea8e4e05583277c09b2c1335e634 Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Tue, 9 Sep 2025 14:22:30 -0700 Subject: [PATCH 09/20] clean Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- examples/llm-api/llm_kv_cache_offloading.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/examples/llm-api/llm_kv_cache_offloading.py b/examples/llm-api/llm_kv_cache_offloading.py index ff65b4eee0b..86ef1c9388a 100644 --- a/examples/llm-api/llm_kv_cache_offloading.py +++ b/examples/llm-api/llm_kv_cache_offloading.py @@ -1,3 +1,5 @@ +import time + from tensorrt_llm import LLM from tensorrt_llm.llmapi import KvCacheConfig @@ -14,18 +16,17 @@ def main(): # "The revolutionary seed had penetrated into every country and spread more or less. " ) - kv_cache_max_tokens = 256 kv_cache_page_size = 16 kv_cache_host_size_in_bytes = 1024**3 # Offloading Off - print("\n === Offloading Off === \n") + print("\n ====== Offloading Off ====== \n") llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", max_batch_size=1, max_seq_len=256, kv_cache_config=KvCacheConfig( enable_block_reuse=True, - max_tokens=kv_cache_max_tokens, + free_gpu_memory_fraction=0.01, tokens_per_block=kv_cache_page_size)) # prompt_a occupies kv cache pool output_a = llm.generate(prompt_a) @@ -54,15 +55,16 @@ def main(): ) llm.shutdown() + time.sleep(5) # Offloading On - print("\n === Offloading On === \n") + print("\n ====== Offloading On ====== \n") llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", max_batch_size=1, max_seq_len=256, kv_cache_config=KvCacheConfig( enable_block_reuse=True, - max_tokens=kv_cache_max_tokens, + free_gpu_memory_fraction=0.01, tokens_per_block=kv_cache_page_size, host_cache_size=kv_cache_host_size_in_bytes)) # prompt_a occupies kv cache pool @@ -71,21 +73,24 @@ def main(): f"Prompt: {output_a.prompt!r}, Generated text: {output_a.outputs[0].text!r}" ) - # since max_batch_size=1, and offloading is enabled, kv cache of prompt_a will be offloaded to host memory. + # since max_batch_size=1, and offloading is enabled, + # kv cache of prompt_a will be offloaded to host memory. # kv cache of prompt_b keeps in device memory. output_b = llm.generate(prompt_b) print( f"Prompt: {output_b.prompt!r}, Generated text: {output_b.outputs[0].text!r}" ) - # kv cache of prompt_a will be onboarded to device memory, kv cache of prompt_b will be offloaded to host memory. + # kv cache of prompt_a will be onboarded to device memory, + # kv cache of prompt_b will be offloaded to host memory. # kv cache of prompt_a will be reused. output_a = llm.generate(prompt_a) print( f"Prompt: {output_a.prompt!r}, Generated text: {output_a.outputs[0].text!r}" ) - # kv cache of prompt_b will be onboarded to device memory, kv cache of prompt_a will be offloaded to host memory. + # kv cache of prompt_b will be onboarded to device memory, + # kv cache of prompt_a will be offloaded to host memory. # kv cache of prompt_b will be reused. 
output_b = llm.generate(prompt_b) print( From 9e87d5ffaf1d55f7d1b56a2e6c12f6ff525594a4 Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Tue, 9 Sep 2025 14:27:22 -0700 Subject: [PATCH 10/20] clean Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- examples/llm-api/llm_kv_cache_offloading.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/llm-api/llm_kv_cache_offloading.py b/examples/llm-api/llm_kv_cache_offloading.py index 86ef1c9388a..7277bcc40b8 100644 --- a/examples/llm-api/llm_kv_cache_offloading.py +++ b/examples/llm-api/llm_kv_cache_offloading.py @@ -16,6 +16,7 @@ def main(): # "The revolutionary seed had penetrated into every country and spread more or less. " ) + kv_cache_free_gpu_memory_fraction = 0.001 kv_cache_page_size = 16 kv_cache_host_size_in_bytes = 1024**3 @@ -26,7 +27,7 @@ def main(): max_seq_len=256, kv_cache_config=KvCacheConfig( enable_block_reuse=True, - free_gpu_memory_fraction=0.01, + free_gpu_memory_fraction=kv_cache_free_gpu_memory_fraction, tokens_per_block=kv_cache_page_size)) # prompt_a occupies kv cache pool output_a = llm.generate(prompt_a) @@ -64,7 +65,7 @@ def main(): max_seq_len=256, kv_cache_config=KvCacheConfig( enable_block_reuse=True, - free_gpu_memory_fraction=0.01, + free_gpu_memory_fraction=kv_cache_free_gpu_memory_fraction, tokens_per_block=kv_cache_page_size, host_cache_size=kv_cache_host_size_in_bytes)) # prompt_a occupies kv cache pool From d8a32423c7a62f137fe1f60291a12494d551640c Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Tue, 9 Sep 2025 14:32:20 -0700 Subject: [PATCH 11/20] clean Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- examples/llm-api/llm_kv_cache_offloading.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/examples/llm-api/llm_kv_cache_offloading.py b/examples/llm-api/llm_kv_cache_offloading.py index 7277bcc40b8..0cdce95d5cf 100644 --- a/examples/llm-api/llm_kv_cache_offloading.py +++ b/examples/llm-api/llm_kv_cache_offloading.py @@ -8,13 +8,15 @@ def main(): prompt_a = ( "Given the following question and four candidate answers (A, B, C and D), choose the best answer." - # "The following excerpt is from a pamphlet. You will do me the justice to remember, " + "The following excerpt is from a pamphlet. You will do me the justice to remember, " ) prompt_b = ( "Question: This question refers to the following information. Read the following excerpt." - # "The revolutionary seed had penetrated into every country and spread more or less. " + "The revolutionary seed had penetrated into every country and spread more or less. 
" ) + max_batch_size = 1 + max_seq_len = 512 kv_cache_free_gpu_memory_fraction = 0.001 kv_cache_page_size = 16 @@ -23,8 +25,8 @@ def main(): # Offloading Off print("\n ====== Offloading Off ====== \n") llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", - max_batch_size=1, - max_seq_len=256, + max_batch_size=max_batch_size, + max_seq_len=max_seq_len, kv_cache_config=KvCacheConfig( enable_block_reuse=True, free_gpu_memory_fraction=kv_cache_free_gpu_memory_fraction, @@ -61,8 +63,8 @@ def main(): # Offloading On print("\n ====== Offloading On ====== \n") llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", - max_batch_size=1, - max_seq_len=256, + max_batch_size=max_batch_size, + max_seq_len=max_seq_len, kv_cache_config=KvCacheConfig( enable_block_reuse=True, free_gpu_memory_fraction=kv_cache_free_gpu_memory_fraction, From 6a3757518b0ecd4be5c29f5965366a73dfb75957 Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Tue, 9 Sep 2025 14:43:43 -0700 Subject: [PATCH 12/20] clean Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- examples/llm-api/llm_kv_cache_offloading.py | 22 ++++++++++----------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/examples/llm-api/llm_kv_cache_offloading.py b/examples/llm-api/llm_kv_cache_offloading.py index 0cdce95d5cf..38556bcea5d 100644 --- a/examples/llm-api/llm_kv_cache_offloading.py +++ b/examples/llm-api/llm_kv_cache_offloading.py @@ -16,21 +16,20 @@ def main(): "The revolutionary seed had penetrated into every country and spread more or less. " ) max_batch_size = 1 - max_seq_len = 512 + max_seq_len = 256 - kv_cache_free_gpu_memory_fraction = 0.001 + kv_cache_max_tokens = 256 kv_cache_page_size = 16 - kv_cache_host_size_in_bytes = 1024**3 # Offloading Off print("\n ====== Offloading Off ====== \n") llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", max_batch_size=max_batch_size, max_seq_len=max_seq_len, - kv_cache_config=KvCacheConfig( - enable_block_reuse=True, - free_gpu_memory_fraction=kv_cache_free_gpu_memory_fraction, - tokens_per_block=kv_cache_page_size)) + kv_cache_config=KvCacheConfig(enable_block_reuse=True, + max_tokens=kv_cache_max_tokens, + tokens_per_block=kv_cache_page_size, + host_cache_size=0)) # prompt_a occupies kv cache pool output_a = llm.generate(prompt_a) print( @@ -65,11 +64,10 @@ def main(): llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", max_batch_size=max_batch_size, max_seq_len=max_seq_len, - kv_cache_config=KvCacheConfig( - enable_block_reuse=True, - free_gpu_memory_fraction=kv_cache_free_gpu_memory_fraction, - tokens_per_block=kv_cache_page_size, - host_cache_size=kv_cache_host_size_in_bytes)) + kv_cache_config=KvCacheConfig(enable_block_reuse=True, + max_tokens=kv_cache_max_tokens, + tokens_per_block=kv_cache_page_size, + host_cache_size=1024**3)) # prompt_a occupies kv cache pool output_a = llm.generate(prompt_a) print( From bea06fd09f50e9478166f453714c5329f7a78651 Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Tue, 9 Sep 2025 14:58:15 -0700 Subject: [PATCH 13/20] clean Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- examples/llm-api/llm_kv_cache_offloading.py | 90 ++++++++------------- 1 file changed, 34 insertions(+), 56 deletions(-) diff --git a/examples/llm-api/llm_kv_cache_offloading.py b/examples/llm-api/llm_kv_cache_offloading.py index 38556bcea5d..54b690bc52b 100644 --- a/examples/llm-api/llm_kv_cache_offloading.py +++ b/examples/llm-api/llm_kv_cache_offloading.py @@ -1,10 +1,10 @@ -import 
time +import argparse from tensorrt_llm import LLM from tensorrt_llm.llmapi import KvCacheConfig -def main(): +def main(args): prompt_a = ( "Given the following question and four candidate answers (A, B, C and D), choose the best answer." @@ -20,79 +20,52 @@ def main(): kv_cache_max_tokens = 256 kv_cache_page_size = 16 + kv_cache_host_size = 1024**3 if args.enable_offloading else 0 - # Offloading Off - print("\n ====== Offloading Off ====== \n") llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", max_batch_size=max_batch_size, max_seq_len=max_seq_len, kv_cache_config=KvCacheConfig(enable_block_reuse=True, max_tokens=kv_cache_max_tokens, tokens_per_block=kv_cache_page_size, - host_cache_size=0)) + host_cache_size=kv_cache_host_size)) # prompt_a occupies kv cache pool output_a = llm.generate(prompt_a) print( f"Prompt: {output_a.prompt!r}, Generated text: {output_a.outputs[0].text!r}" ) - - # since max_batch_size=1, prompt_b clears and update kv cache - output_b = llm.generate(prompt_b) - print( - f"Prompt: {output_b.prompt!r}, Generated text: {output_b.outputs[0].text!r}" - ) - - # prompt_a clears and update kv cache again - # no kv cache reuse happens - output_a = llm.generate(prompt_a) - print( - f"Prompt: {output_a.prompt!r}, Generated text: {output_a.outputs[0].text!r}" - ) - - # prompt_b clears and update kv cache again - # no kv cache reuse happens - output_b = llm.generate(prompt_b) - print( - f"Prompt: {output_b.prompt!r}, Generated text: {output_b.outputs[0].text!r}" - ) - - llm.shutdown() - time.sleep(5) - - # Offloading On - print("\n ====== Offloading On ====== \n") - llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", - max_batch_size=max_batch_size, - max_seq_len=max_seq_len, - kv_cache_config=KvCacheConfig(enable_block_reuse=True, - max_tokens=kv_cache_max_tokens, - tokens_per_block=kv_cache_page_size, - host_cache_size=1024**3)) - # prompt_a occupies kv cache pool - output_a = llm.generate(prompt_a) - print( - f"Prompt: {output_a.prompt!r}, Generated text: {output_a.outputs[0].text!r}" - ) - - # since max_batch_size=1, and offloading is enabled, - # kv cache of prompt_a will be offloaded to host memory. - # kv cache of prompt_b keeps in device memory. + ''' + since max_batch_size=1, + if enable_offloading=False: + prompt_b clears and update kv cache + if enable_offloading=True: + kv cache of prompt_a will be offloaded to host memory. + kv cache of prompt_b keeps in device memory. + ''' output_b = llm.generate(prompt_b) print( f"Prompt: {output_b.prompt!r}, Generated text: {output_b.outputs[0].text!r}" ) - - # kv cache of prompt_a will be onboarded to device memory, - # kv cache of prompt_b will be offloaded to host memory. - # kv cache of prompt_a will be reused. + ''' + if not enable_offloading: + prompt_a clears and update kv cache again, no kv cache reuse happens + else: + kv cache of prompt_a will be onboarded to device memory, and be reused. + kv cache of prompt_b will be offloaded to host memory. + ''' output_a = llm.generate(prompt_a) print( f"Prompt: {output_a.prompt!r}, Generated text: {output_a.outputs[0].text!r}" ) - - # kv cache of prompt_b will be onboarded to device memory, - # kv cache of prompt_a will be offloaded to host memory. - # kv cache of prompt_b will be reused. + ''' + if not enable_offloading: + prompt_b clears and update kv cache again, no kv cache reuse happens + else: + kv cache of prompt_b will be onboarded to device memory, and be reused. + kv cache of prompt_a will be offloaded to host memory. 
+ ''' + # + # output_b = llm.generate(prompt_b) print( f"Prompt: {output_b.prompt!r}, Generated text: {output_b.outputs[0].text!r}" @@ -102,4 +75,9 @@ def main(): if __name__ == "__main__": - main() + parser = argparse.ArgumentParser() + parser.add_argument('--enable_offloading', + default=False, + action='store_true') + args = parser.parse_args() + main(args) From b999da86c8e524e2a1f3cc13fd6f1f4312ea375a Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Wed, 10 Sep 2025 10:24:02 -0700 Subject: [PATCH 14/20] update Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- examples/llm-api/llm_kv_cache_offloading.py | 30 +++++++++++---------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/examples/llm-api/llm_kv_cache_offloading.py b/examples/llm-api/llm_kv_cache_offloading.py index 54b690bc52b..5317aef593d 100644 --- a/examples/llm-api/llm_kv_cache_offloading.py +++ b/examples/llm-api/llm_kv_cache_offloading.py @@ -1,6 +1,6 @@ import argparse -from tensorrt_llm import LLM +from tensorrt_llm import LLM, SamplingParams from tensorrt_llm.llmapi import KvCacheConfig @@ -22,6 +22,8 @@ def main(args): kv_cache_page_size = 16 kv_cache_host_size = 1024**3 if args.enable_offloading else 0 + sampling_params = SamplingParams(max_tokens=max_seq_len) + llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", max_batch_size=max_batch_size, max_seq_len=max_seq_len, @@ -29,44 +31,44 @@ def main(args): max_tokens=kv_cache_max_tokens, tokens_per_block=kv_cache_page_size, host_cache_size=kv_cache_host_size)) - # prompt_a occupies kv cache pool - output_a = llm.generate(prompt_a) + ''' + prompt_a occupies kv cache pool + ''' + output_a = llm.generate(prompt_a, sampling_params) print( f"Prompt: {output_a.prompt!r}, Generated text: {output_a.outputs[0].text!r}" ) ''' since max_batch_size=1, - if enable_offloading=False: - prompt_b clears and update kv cache - if enable_offloading=True: + if not enable_offloading: + prompt_b clears and updates kv cache + else: kv cache of prompt_a will be offloaded to host memory. - kv cache of prompt_b keeps in device memory. + kv cache of prompt_b will be in device memory. ''' - output_b = llm.generate(prompt_b) + output_b = llm.generate(prompt_b, sampling_params) print( f"Prompt: {output_b.prompt!r}, Generated text: {output_b.outputs[0].text!r}" ) ''' if not enable_offloading: - prompt_a clears and update kv cache again, no kv cache reuse happens + prompt_a clears and updates kv cache again, no kv cache reuse happens else: kv cache of prompt_a will be onboarded to device memory, and be reused. kv cache of prompt_b will be offloaded to host memory. ''' - output_a = llm.generate(prompt_a) + output_a = llm.generate(prompt_a, sampling_params) print( f"Prompt: {output_a.prompt!r}, Generated text: {output_a.outputs[0].text!r}" ) ''' if not enable_offloading: - prompt_b clears and update kv cache again, no kv cache reuse happens + prompt_b clears and updates kv cache again, no kv cache reuse happens else: kv cache of prompt_b will be onboarded to device memory, and be reused. kv cache of prompt_a will be offloaded to host memory. 
''' - # - # - output_b = llm.generate(prompt_b) + output_b = llm.generate(prompt_b, sampling_params) print( f"Prompt: {output_b.prompt!r}, Generated text: {output_b.outputs[0].text!r}" ) From c6ec2d9c0004c935f44875f6a630b88efc4e819f Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Wed, 10 Sep 2025 10:42:43 -0700 Subject: [PATCH 15/20] update Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- examples/llm-api/llm_kv_cache_offloading.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/llm-api/llm_kv_cache_offloading.py b/examples/llm-api/llm_kv_cache_offloading.py index 5317aef593d..385bb981285 100644 --- a/examples/llm-api/llm_kv_cache_offloading.py +++ b/examples/llm-api/llm_kv_cache_offloading.py @@ -12,8 +12,8 @@ def main(args): ) prompt_b = ( - "Question: This question refers to the following information. Read the following excerpt." - "The revolutionary seed had penetrated into every country and spread more or less. " + "Question: Given the following question and four candidate answers (A, B, C and D), choose the best answer." + "The following excerpt is from a pamphlet. You will do me the justice to remember, " ) max_batch_size = 1 max_seq_len = 256 From 1eff9cca3f06c691d307f14f89bd7933ac169681 Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Wed, 10 Sep 2025 10:55:02 -0700 Subject: [PATCH 16/20] update Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- examples/llm-api/llm_kv_cache_offloading.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/llm-api/llm_kv_cache_offloading.py b/examples/llm-api/llm_kv_cache_offloading.py index 385bb981285..70042ac0131 100644 --- a/examples/llm-api/llm_kv_cache_offloading.py +++ b/examples/llm-api/llm_kv_cache_offloading.py @@ -12,7 +12,7 @@ def main(args): ) prompt_b = ( - "Question: Given the following question and four candidate answers (A, B, C and D), choose the best answer." + "the following question and four candidate answers (A, B, C and D), choose the best answer." "The following excerpt is from a pamphlet. You will do me the justice to remember, " ) max_batch_size = 1 From 71250c590369491ba7ae31bd3de80813b1ebd401 Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Wed, 10 Sep 2025 11:01:52 -0700 Subject: [PATCH 17/20] update Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- examples/llm-api/llm_kv_cache_offloading.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/llm-api/llm_kv_cache_offloading.py b/examples/llm-api/llm_kv_cache_offloading.py index 70042ac0131..c2ad354810b 100644 --- a/examples/llm-api/llm_kv_cache_offloading.py +++ b/examples/llm-api/llm_kv_cache_offloading.py @@ -7,12 +7,12 @@ def main(args): prompt_a = ( - "Given the following question and four candidate answers (A, B, C and D), choose the best answer." + "the following question and four candidate answers (A, B, C and D), choose the best answer." "The following excerpt is from a pamphlet. You will do me the justice to remember, " ) prompt_b = ( - "the following question and four candidate answers (A, B, C and D), choose the best answer." + "Given the following question and four candidate answers (A, B, C and D), choose the best answer." "The following excerpt is from a pamphlet. 
You will do me the justice to remember, " ) max_batch_size = 1 From 458f82df675e0ff47e6baa0dd742ba91ae93ded7 Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Wed, 10 Sep 2025 15:29:54 -0700 Subject: [PATCH 18/20] update model to qwen-8b Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- examples/llm-api/llm_kv_cache_offloading.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/examples/llm-api/llm_kv_cache_offloading.py b/examples/llm-api/llm_kv_cache_offloading.py index c2ad354810b..9783d0a1a8d 100644 --- a/examples/llm-api/llm_kv_cache_offloading.py +++ b/examples/llm-api/llm_kv_cache_offloading.py @@ -5,16 +5,11 @@ def main(args): - prompt_a = ( - "the following question and four candidate answers (A, B, C and D), choose the best answer." - "The following excerpt is from a pamphlet. You will do me the justice to remember, " - ) - - prompt_b = ( - "Given the following question and four candidate answers (A, B, C and D), choose the best answer." - "The following excerpt is from a pamphlet. You will do me the justice to remember, " - ) + "Returns the per-iterations statistics computed since last call to this method. " + "Contains at most iter_stats_max_iterations iterations.") + prompt_b = ("Use for skipping decoding step for non generation model, " + "and return the batch_output (such as mm_embeddings)") max_batch_size = 1 max_seq_len = 256 @@ -24,7 +19,7 @@ def main(args): sampling_params = SamplingParams(max_tokens=max_seq_len) - llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", + llm = LLM(model="Qwen/Qwen3-8B", max_batch_size=max_batch_size, max_seq_len=max_seq_len, kv_cache_config=KvCacheConfig(enable_block_reuse=True, From 0540a5d9f10a0ff65263e9e0ead65b6c40dc7684 Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Tue, 16 Sep 2025 17:01:18 +0800 Subject: [PATCH 19/20] polish example Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- examples/llm-api/llm_kv_cache_offloading.py | 121 ++++++++++++++------ 1 file changed, 86 insertions(+), 35 deletions(-) diff --git a/examples/llm-api/llm_kv_cache_offloading.py b/examples/llm-api/llm_kv_cache_offloading.py index 9783d0a1a8d..3cedd211db0 100644 --- a/examples/llm-api/llm_kv_cache_offloading.py +++ b/examples/llm-api/llm_kv_cache_offloading.py @@ -1,3 +1,49 @@ +''' +This script demonstrates the effectiveness of KV cache host offloading in TensorRT-LLM. + +**Scenario:** +The script simulates a scenario where the GPU's KV cache is severely limited, +while multiple requests with recurring prompts (like system prompts) are processed. + +1. **Constrained GPU Cache:** The GPU KV cache is configured to be very small, + only large enough to hold the state for a single request. +2. **Alternating Prompts:** Four requests are sent sequentially (batch size of 1) + with two distinct prompts in an A, B, A, B pattern. +3. **Cache Eviction:** Due to the small GPU cache, processing prompt B will + force the eviction of the cache generated for prompt A. + +**Demonstration:** + +* **Without Offloading (Default):** + - When the first prompt 'A' is processed, its KV cache is stored on the GPU. + - When prompt 'B' arrives, the cache manager needs space and discards the cache for 'A'. + - When prompt 'A' is sent again, its cache must be recomputed from scratch. + - **Expected Outcome:** The log will show `reused blocks: 0` and `cache hit rate: 0`. 
+ +* **With Offloading (`--enable_offloading`):** + - When prompt 'B' arrives, the cache for 'A' is not discarded but is instead + *offloaded* from the fast GPU VRAM to the slower (but larger) host CPU RAM. + - When prompt 'A' is sent again, its KV cache is loaded back from host RAM + to the GPU, which is significantly faster than recomputing it. + - **Expected Outcome:** The log will show positive values for `reused blocks` + and a non-zero `cache hit rate`, confirming that the cache was successfully + reused from the host. + +**How to Run & Verify:** + +1. **Without Offloading:** + ```bash + TLLM_LOG_LEVEL=DEBUG python llm_kv_cache_offloading.py 2>&1 | tee offloading_disabled.log + ``` + (Check the log for zero reuse) + +2. **With Offloading:** + ```bash + TLLM_LOG_LEVEL=DEBUG python llm_kv_cache_offloading.py --enable_offloading 2>&1 | tee offloading_enabled.log + ``` + (Check the log for non-zero reuse) +''' + import argparse from tensorrt_llm import LLM, SamplingParams @@ -5,64 +51,66 @@ def main(args): + # Define two distinct prompts to simulate different requests or system prompts. prompt_a = ( "Returns the per-iterations statistics computed since last call to this method. " "Contains at most iter_stats_max_iterations iterations.") prompt_b = ("Use for skipping decoding step for non generation model, " "and return the batch_output (such as mm_embeddings)") + + # Use a batch size of 1 to process requests sequentially, making the cache + # eviction and reuse cycle easy to observe. max_batch_size = 1 max_seq_len = 256 + # --- KV Cache Configuration --- + # Set a small GPU KV cache size (in number of tokens). This is crucial for the demo, + # as it's only large enough to hold the KV cache for a single request. kv_cache_max_tokens = 256 + # Define the size of a single cache block. kv_cache_page_size = 16 + # Enable a 1 GB host cache if offloading is requested, otherwise disable it (size 0). + # This is the key toggle for the experiment. kv_cache_host_size = 1024**3 if args.enable_offloading else 0 sampling_params = SamplingParams(max_tokens=max_seq_len) - llm = LLM(model="Qwen/Qwen3-8B", - max_batch_size=max_batch_size, - max_seq_len=max_seq_len, - kv_cache_config=KvCacheConfig(enable_block_reuse=True, - max_tokens=kv_cache_max_tokens, - tokens_per_block=kv_cache_page_size, - host_cache_size=kv_cache_host_size)) - ''' - prompt_a occupies kv cache pool - ''' + llm = LLM( + model="Qwen/Qwen3-8B", + max_batch_size=max_batch_size, + max_seq_len=max_seq_len, + kv_cache_config=KvCacheConfig( + enable_block_reuse=True, # Enable reuse of cached blocks + max_tokens=kv_cache_max_tokens, # Max tokens in GPU cache + tokens_per_block=kv_cache_page_size, + host_cache_size=kv_cache_host_size # Host cache size for offloading + )) + + # Process four requests sequentially using two distinct prompts (A, B, A, B). + # This pattern is designed to showcase the cache eviction and reuse behavior. + print("--- First Round ---") + # 1. Process prompt A. Its cache is stored on the GPU. output_a = llm.generate(prompt_a, sampling_params) print( f"Prompt: {output_a.prompt!r}, Generated text: {output_a.outputs[0].text!r}" ) - ''' - since max_batch_size=1, - if not enable_offloading: - prompt_b clears and updates kv cache - else: - kv cache of prompt_a will be offloaded to host memory. - kv cache of prompt_b will be in device memory. - ''' + # 2. Process prompt B. Its cache replaces/offloads A's cache. 
output_b = llm.generate(prompt_b, sampling_params) print( f"Prompt: {output_b.prompt!r}, Generated text: {output_b.outputs[0].text!r}" ) - ''' - if not enable_offloading: - prompt_a clears and updates kv cache again, no kv cache reuse happens - else: - kv cache of prompt_a will be onboarded to device memory, and be reused. - kv cache of prompt_b will be offloaded to host memory. - ''' + + print("\n--- Second Round ---") + # 3. Process prompt A again. + # - Without offloading: Must recompute from scratch. + # - With offloading: Recovers cache from host RAM. output_a = llm.generate(prompt_a, sampling_params) print( f"Prompt: {output_a.prompt!r}, Generated text: {output_a.outputs[0].text!r}" ) - ''' - if not enable_offloading: - prompt_b clears and updates kv cache again, no kv cache reuse happens - else: - kv cache of prompt_b will be onboarded to device memory, and be reused. - kv cache of prompt_a will be offloaded to host memory. - ''' + # 4. Process prompt B again. + # - Without offloading: Must recompute from scratch. + # - With offloading: Recovers cache from host RAM. output_b = llm.generate(prompt_b, sampling_params) print( f"Prompt: {output_b.prompt!r}, Generated text: {output_b.outputs[0].text!r}" @@ -72,9 +120,12 @@ def main(args): if __name__ == "__main__": - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser( + description= + "A script to demonstrate the effectiveness of KV cache host offloading." + ) parser.add_argument('--enable_offloading', - default=False, - action='store_true') + action='store_true', + help='Enable host RAM for KV cache offloading.') args = parser.parse_args() main(args) From f900d2b4fc170a6c66f6fb862755175811fa2ff8 Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Tue, 16 Sep 2025 17:04:15 +0800 Subject: [PATCH 20/20] update Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- docs/source/features/kvcache.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/features/kvcache.md b/docs/source/features/kvcache.md index 3f6b394d0e3..03d7710fa77 100644 --- a/docs/source/features/kvcache.md +++ b/docs/source/features/kvcache.md @@ -64,6 +64,8 @@ Before a block is evicted from GPU memory, it can optionally be offloaded to hos When offloading is enabled, the client can prevent specific blocks from being offloaded by toggling block priority. Blocks with lower priority than a certain threshold are not offloaded; they are evicted directly from GPU memory to reduce traffic between GPU and host. This priority is set with ```secondary_offload_min_priority```. Default value is 35, meaning any block with lower priority than 35 will not be offloaded. +Here is an [example](../../../examples/llm-api/llm_kv_cache_offloading.py) to show how to enable host offloading. + ### Partial Reuse Partial reuse of a block can happen when some but not all tokens are matched. It is enabled by default, but can be disabled by setting ```enable_partial_reuse``` to False.
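
For quick reference, the sketch below condenses the example introduced by this patch series into the minimal `KvCacheConfig` needed to turn host offloading on. It is a minimal sketch under stated assumptions, not the shipped example: the model name, prompts, and sizes are placeholders mirroring the patches above, and the `secondary_offload_min_priority` / `enable_partial_reuse` fields are assumed to be accepted by `KvCacheConfig` as the kvcache.md passage describes — verify them against the installed TensorRT-LLM version.

```python
# Minimal sketch (not the shipped example): enable KV cache host offloading
# via the LLM API, using only settings that appear in the patches and in
# kvcache.md above. secondary_offload_min_priority and enable_partial_reuse
# are assumed to be KvCacheConfig fields as kvcache.md describes.
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import KvCacheConfig

kv_cache_config = KvCacheConfig(
    enable_block_reuse=True,            # block reuse must be on for offloading to pay off
    max_tokens=256,                     # deliberately tiny GPU cache to force eviction
    tokens_per_block=16,
    host_cache_size=1024**3,            # 1 GiB host cache; 0 disables offloading
    secondary_offload_min_priority=35,  # documented default: lower-priority blocks are evicted, not offloaded
    enable_partial_reuse=True,          # reuse a block even when only a prefix of it matches
)

llm = LLM(model="Qwen/Qwen3-8B",        # placeholder model, as in the final patch
          max_batch_size=1,
          max_seq_len=256,
          kv_cache_config=kv_cache_config)

sampling_params = SamplingParams(max_tokens=256)

# A, B, A pattern: with host_cache_size > 0 the third request should report
# reused blocks instead of recomputing prompt_a's KV cache from scratch.
prompt_a = "Summarize what KV cache block reuse does."
prompt_b = "Summarize what KV cache host offloading does."
for prompt in (prompt_a, prompt_b, prompt_a):
    output = llm.generate(prompt, sampling_params)
    print(f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}")

llm.shutdown()
```

Running this with `TLLM_LOG_LEVEL=DEBUG`, as the example's docstring suggests, surfaces the `reused blocks` and `cache hit rate` statistics that show whether offloaded blocks are actually onboarded and reused.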