
Commit a98a8d6

wip
1 parent 90f21ad commit a98a8d6

File tree

2 files changed: 7 additions & 0 deletions

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+{"total_num_scheduled_tokens": 8192, "num_prefill_tokens": 16349, "num_decode_tokens": 35, "padded_total_num_scheduled_tokens": 16384, "num_reqs": 45}

tpu_inference/layers/common/sharding.py

Lines changed: 6 additions & 0 deletions
@@ -121,13 +121,19 @@ def from_vllm_config(cls,
         if enable_dp_attention:
             # Replicate attention layer when num_kv_heads < TP
             num_kv_heads = vllm_config.model_config.get_total_num_kv_heads()
+
             kv_dtype = utils.get_jax_dtype_from_str_dtype(
                 vllm_config.cache_config.cache_dtype) or jnp.bfloat16
             packing = 4 // jnp.dtype(kv_dtype).itemsize
             # When num_kv_heads * 2 / packing < TP, tensor parallelism would
             # duplicate KV heads across devices, wasting kv cache memory.
             # Use attention DP instead to reduce per-device num_kv_heads and
             # eliminate this waste.
+
+            # if head_dim is 64, multiply packing by 2
+            if vllm_config.model_config.get_head_size() == 64:
+                packing *= 2
+
             num_kv_heads_per_device_in_kv_cache = (num_kv_heads * 2) / packing
             attn_dp = max(
                 int(tensor_parallelism // num_kv_heads_per_device_in_kv_cache),
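
As a quick sanity check on the arithmetic in this hunk, here is a standalone sketch (not code from this repository) of the packing / attention-DP computation, using hypothetical values: a bfloat16 KV cache, head_dim of 64, 8 total KV heads, and tensor parallelism of 16. The second argument to max() is cut off in the diff above, so 1 is assumed here.

```python
# Hypothetical worked example of the packing / attn_dp formula in the diff.
# The concrete values (bfloat16 cache, head_dim=64, 8 KV heads, TP=16) are
# assumptions chosen only to illustrate the arithmetic.
import jax.numpy as jnp

kv_dtype = jnp.bfloat16
tensor_parallelism = 16
num_kv_heads = 8
head_dim = 64

# 4 bytes of packing budget divided by the element size: bfloat16 has
# itemsize 2, so packing starts at 2.
packing = 4 // jnp.dtype(kv_dtype).itemsize

# The added branch: with head_dim == 64 the packing doubles (2 -> 4).
if head_dim == 64:
    packing *= 2

# K and V together contribute 2 * num_kv_heads heads; dividing by packing
# gives the per-device KV-head footprint in the cache: (8 * 2) / 4 = 4.0.
num_kv_heads_per_device_in_kv_cache = (num_kv_heads * 2) / packing

# Attention DP factor: how many times TP exceeds that footprint.
# max(int(16 // 4.0), 1) = 4, i.e. 4-way attention data parallelism.
attn_dp = max(int(tensor_parallelism // num_kv_heads_per_device_in_kv_cache), 1)

print(packing, num_kv_heads_per_device_in_kv_cache, attn_dp)  # 4 4.0 4
```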
