
Commit b45ad8c

experimental multi-host device_put
1 parent 4d85f58 commit b45ad8c

2 files changed (+5, -21 lines)

tpu_inference/runner/tpu_runner.py

Lines changed: 4 additions & 20 deletions

@@ -1350,21 +1350,11 @@ def _prepare_inputs_dp(self, scheduler_output: "VllmSchedulerOutput"):
         logits_indices_cpu = logits_indices
         seq_lens_cpu = seq_lens
 
-        # First, put arrays on a single device.
-        # JAX will then handle efficient device-to-device transfer.
-        input_tuple_single_device = jax.device_put(
-            (input_ids, positions, block_tables, query_start_loc, seq_lens,
-             logits_indices, request_distribution),
-            device=self.devices[0],
-        )
-
-        print(f'{input_tuple_single_device=}')
-
-        # Then, distribute from that single device to all devices in the mesh.
         (input_ids, positions, block_tables, query_start_loc, seq_lens, logits_indices,
          request_distribution) = device_array(
             self.mesh,
-            input_tuple_single_device,
+            (input_ids, positions, block_tables, query_start_loc, seq_lens,
+             logits_indices, request_distribution),
             sharding=data_parallel_attn_sharding,
         )
         # Async scheduling: substitute placeholder tokens for DP
@@ -1553,16 +1543,10 @@ def _prepare_inputs_non_dp(self, scheduler_output: "VllmSchedulerOutput"):
         seq_lens_cpu = seq_lens
 
 
-        logger.info(f"{self.devices=}")
-        logger.info(f"{jax.local_devices()=}")
-        input_tuple_single_device = jax.device_put(
-            (input_ids, positions, block_tables, query_start_loc, seq_lens,
-             logits_indices, request_distribution),
-            device=jax.local_devices()[0],
-        )
         (input_ids, positions, block_tables, query_start_loc, seq_lens,
          logits_indices, request_distribution) = device_array(
-            self.mesh, input_tuple_single_device)
+            self.mesh, (input_ids, positions, block_tables, query_start_loc, seq_lens,
+            logits_indices, request_distribution))
 
         if self.scheduler_config.async_scheduling and len(
                 token_in_tpu_cur_input_indices) > 0:
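
For reference, both hunks converge on the same pattern: the raw step inputs go straight into device_array in a single host-to-mesh transfer, instead of being staged on self.devices[0] (or jax.local_devices()[0]) and redistributed. A minimal runnable sketch of that single-step pattern, using a toy one-axis mesh; the axis name "data" and the stand-in inputs are illustrative assumptions, not taken from the repository:

import jax
import numpy as np
from jax.sharding import Mesh, NamedSharding, PartitionSpec

# Toy stand-ins for the runner's host-side step inputs (shapes illustrative).
input_ids = np.arange(16, dtype=np.int32)
positions = np.arange(16, dtype=np.int32)

# One-axis mesh over all visible devices; the axis name is an assumption.
mesh = Mesh(np.array(jax.devices()), ("data",))

# Replicated placement, matching the helper's PartitionSpec(None) default.
sharding = NamedSharding(mesh, PartitionSpec(None))

# Single-step transfer: every process contributes its local copy and JAX
# assembles one global jax.Array per input, with no single-device staging.
input_ids, positions = (
    jax.make_array_from_process_local_data(sharding, x)
    for x in (input_ids, positions))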

tpu_inference/utils.py

Lines changed: 1 addition & 1 deletion

@@ -243,7 +243,7 @@ def device_array(mesh: Mesh, *args, sharding=None, **kwargs) -> jax.Array:
     """
     if sharding is None:
         sharding = NamedSharding(mesh, PartitionSpec(None))
-    return jax.device_put(*args, device=sharding, **kwargs)
+    return jax.make_array_from_process_local_data(sharding=sharding, *args, **kwargs)
 
 
 def get_hash_fn_by_name(hash_fn_name: str) -> Callable[[Any], bytes]:
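
This one-line swap is the heart of the commit: jax.device_put copies data from the calling process to devices that process can address, which is awkward on multi-host TPU slices, whereas jax.make_array_from_process_local_data builds one global array out of whatever slice each process holds locally. A rough sketch of the distinction; the "data" axis, the row counts, and the even split across devices are assumptions for illustration:

import jax
import numpy as np
from jax.sharding import Mesh, NamedSharding, PartitionSpec

mesh = Mesh(np.array(jax.devices()), ("data",))
row_sharded = NamedSharding(mesh, PartitionSpec("data"))

# Each process holds only its own rows; the global array spans all hosts.
# (Assumes the global row count divides evenly across the mesh devices.)
rows_per_process = 8
global_shape = (rows_per_process * jax.process_count(),)
local = (np.arange(rows_per_process, dtype=np.int32)
         + rows_per_process * jax.process_index())

# jax.device_put would need the target devices to be addressable from this
# process; this call instead stitches the per-process slices together into
# a single global jax.Array.
x = jax.make_array_from_process_local_data(row_sharded, local, global_shape)

print(x.shape)     # global shape, e.g. (8,) on one host, (16,) on two
print(x.sharding)  # NamedSharding partitioned along the "data" axis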
