examples/gke/benchmarks/README.md (2 changes: 1 addition & 1 deletion)
@@ -33,7 +33,7 @@ kubectl apply -f deploy-baseline.yaml

### Option B: vLLM with TPU Host Offload

-This deployment configures vLLM to use a `TPUConnector` for KV cache offload to the host CPU memory. This is specified by the `--kv-transfer-config` argument.
+This deployment configures vLLM to use a `TPUOffloadConnector` for KV cache offload to the host CPU memory. This is specified by the `--kv-transfer-config` argument.

```bash
kubectl apply -f deploy-cpu-offload.yaml
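The `--kv-transfer-config` value referenced above must be a single-line JSON string. For readability, here is an illustrative Python sketch that rebuilds and re-serializes the exact fields used by the manifests in this PR; it adds nothing beyond the JSON shown in the diffs below:

```python
# The --kv-transfer-config value from the manifests, expanded for
# readability. Serialize it back to one line when passing it on the CLI.
import json

kv_transfer_config = {
    # Connector class implementing KV cache offload to host CPU memory.
    "kv_connector": "TPUOffloadConnector",
    # "kv_both": this engine both saves and loads offloaded KV blocks.
    "kv_role": "kv_both",
    # Module where the connector class is defined.
    "kv_connector_module_path": "tpu_inference.distributed.offload.tpu_offload_connector",
}

print(json.dumps(kv_transfer_config))
```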
examples/gke/benchmarks/deploy-cpu-offload.yaml (2 changes: 1 addition & 1 deletion)
@@ -21,7 +21,7 @@ spec:
imagePullPolicy: Always
command: ["/bin/sh", "-c"]
args:
- "vllm serve meta-llama/Llama-3.3-70B-Instruct --kv-transfer-config '{\"kv_connector\":\"TPUConnector\",\"kv_role\":\"kv_both\",\"kv_connector_module_path\":\"tpu_inference.distributed.tpu_connector_local\"}' --port 8000 --max_num_batched_tokens 2048 --enable-chunked-prefill --tensor-parallel-size 8 --seed 42 --enable_prefix_caching --gpu-memory-utilization 0.9"
- "vllm serve meta-llama/Llama-3.3-70B-Instruct --kv-transfer-config '{\"kv_connector\":\"TPUOffloadConnector\",\"kv_role\":\"kv_both\",\"kv_connector_module_path\":\"tpu_inference.distributed.offload.tpu_offload_connector\"}' --port 8000 --max_num_batched_tokens 2048 --enable-chunked-prefill --tensor-parallel-size 8 --seed 42 --enable_prefix_caching --gpu-memory-utilization 0.9"
env:
- name: HUGGING_FACE_HUB_TOKEN
valueFrom:
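Once this deployment is running, the server started by the manifest above can be smoke-tested against vLLM's OpenAI-compatible completions endpoint. A hedged sketch, assuming the pod is reachable on localhost:8000 (for example via `kubectl port-forward`; the address is an assumption, not part of the manifest):

```python
# Sketch: smoke test against the vLLM OpenAI-compatible server launched by
# the deployment above. Assumes port 8000 is forwarded to localhost.
import requests

resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "model": "meta-llama/Llama-3.3-70B-Instruct",
        "prompt": "The capital of France is",
        "max_tokens": 16,
    },
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["text"])
```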
examples/gke/pod_tpu_commons_cpu_offload.yaml (2 changes: 1 addition & 1 deletion)
@@ -18,7 +18,7 @@ spec:
- --tensor_parallel_size=8
- --max_model_len=1024
- --kv-transfer-config
- '{"kv_connector":"TPUConnector","kv_connector_module_path":"tpu_inference.distributed.tpu_connector_local","kv_role":"kv_both"}'
- '{"kv_connector":"TPUOffloadConnector","kv_connector_module_path":"tpu_inference.distributed.offload.tpu_offload_connector","kv_role":"kv_both"}'
env:
- name: HUGGING_FACE_HUB_TOKEN
valueFrom:
examples/gke/pod_tpu_commons_cpu_offload_verification.yaml (6 changes: 3 additions & 3 deletions)
@@ -2,10 +2,10 @@ apiVersion: v1
kind: Pod
metadata:
name: tpu-job-offline-inference
-# This pod verifies the correctness of the TPUConnector implementation.
+# This pod verifies the correctness of the TPUOffloadConnector implementation.
# It runs a script that internally performs two text generations:
# 1. A baseline run with a standard vLLM engine.
-# 2. A test run with the TPUConnector enabled.
+# 2. A test run with the TPUOffloadConnector enabled.
# The pod succeeds only if the outputs from both runs are identical,
# ensuring that the connector does not alter the model's output.
spec:
@@ -25,7 +25,7 @@ spec:
- --max_model_len=1024
- --seed=42
- --kv-transfer-config
- '{"kv_connector":"TPUConnector","kv_connector_module_path":"tpu_inference.distributed.tpu_connector_local","kv_role":"kv_both"}'
- '{"kv_connector":"TPUOffloadConnector","kv_connector_module_path":"tpu_inference.distributed.offload.tpu_offload_connector","kv_role":"kv_both"}'
env:
- name: HUGGING_FACE_HUB_TOKEN
valueFrom:
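As the comments at the top of this manifest describe, the pod succeeds only if the baseline and connector-enabled outputs are identical, so the check reduces to an exact string match between the two runs. A minimal sketch of that comparison (the function name and list-of-strings shape are assumptions, not the script's actual API):

```python
def verify_outputs(baseline: list[str], with_connector: list[str]) -> None:
    """Raise if enabling the connector changed any generated text."""
    if len(baseline) != len(with_connector):
        raise AssertionError("runs produced a different number of outputs")
    for i, (expected, actual) in enumerate(zip(baseline, with_connector)):
        if expected != actual:
            raise AssertionError(
                f"output {i} diverged:\n expected: {expected!r}\n actual: {actual!r}"
            )
```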
examples/gke/pod_tpu_host_offload_unit_tests.yaml (14 changes: 7 additions & 7 deletions)
@@ -2,7 +2,7 @@ apiVersion: v1
kind: Pod
metadata:
name: tpu-job-host-offload-unit-tests
-# This pod runs the distributed unit tests for the TPUConnector
+# This pod runs the distributed unit tests for the TPUOffloadConnector
# and other related functionalities. It executes all tests found in the
# tests/distributed/ directory using pytest.
spec:
@@ -17,12 +17,12 @@ spec:
command:
- /bin/bash
- -c
- "pytest -sv tests/distributed/host_offloading_precompile_test.py"
# - "pytest -sv tests/distributed/cpu_offloading_worker_test.py"
# - "pytest -sv tests/distributed/cpu_offloading_cache_util_test.py"
# - "pytest -sv tests/distributed/host_offloading_accuracy_test.py"
# - "pytest -sv tests/distributed/local_cpu_backend_test.py"
# - "pytest -sv tests/distributed/host_offloading_precompile_test.py"
- "pytest -sv tests/distributed/offload/tpu_offload_cpu_backend_test.py"
- "pytest -sv tests/distributed/offload/tpu_offload_connector_worker_test.py"
- "pytest -sv tests/distributed/offload/tpu_offload_connector_scheduler_test.py"
- "pytest -sv tests/distributed/offload/tpu_offload_utils_test.py"
- "pytest -sv tests/distributed/offload/tpu_offload_manager_test.py"
- "pytest -sv tests/distributed/offload/tpu_offload_accuracy_test.py"
env:
- name: HUGGING_FACE_HUB_TOKEN
valueFrom:
examples/offline_inference_kv_cache_verification.py (6 changes: 3 additions & 3 deletions)
@@ -1,14 +1,14 @@
# SPDX-License-Identifier: Apache-2.0
"""
-This script performs an automated correctness verification for the TPUConnector.
+This script performs an automated correctness verification for the TPUOffloadConnector.

The verification works by performing a two-stage experiment for multiple prompts:
1. Baseline Run: For each prompt, it first runs a text generation using a
standard vLLM engine configuration without any KV cache connector. The
output from this run is considered the "source of truth".

2. Test Run: It then runs the exact same text generation, but this time
-with the TPUConnector enabled via the `--kv-transfer-config` argument.
+with the TPUOffloadConnector enabled via the `--kv-transfer-config` argument.
It runs the generation twice to verify prefix caching.

3. Comparison: The script compares the output from each test run against the
@@ -131,7 +131,7 @@ def main(args: dict):
time.sleep(10)

# 2. Run the test with the local tpu kv connector enabled
print("\n--- Running Test (with TPUConnector) ---")
print("\n--- Running Test (with TPUOffloadConnector) ---")
# With the connector, we run generation twice to test the prefix cache
test_llm, test_params = setup_llm(args)
test_outputs = run_invocations(test_llm,
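As this script's docstring notes, the test run invokes generation twice to verify prefix caching. A rough sketch of that pattern with vLLM's offline API (`llm.generate` is vLLM's offline entry point; the helper name and the timing print are illustrative only, and the authoritative check remains the exact-match comparison against the baseline):

```python
import time

def run_twice(llm, prompts, sampling_params):
    """Run the same prompts twice; the second pass should hit the prefix cache."""
    t0 = time.perf_counter()
    first = llm.generate(prompts, sampling_params)
    t1 = time.perf_counter()
    second = llm.generate(prompts, sampling_params)
    t2 = time.perf_counter()
    # Timing is only a heuristic signal of cache hits, not a correctness proof.
    print(f"cold: {t1 - t0:.2f}s, warm (prefix cache): {t2 - t1:.2f}s")
    return first, second
```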
tests/distributed/cpu_offloading_cache_util_test.py (129 changes: 0 additions & 129 deletions)

This file was deleted.
