@@ -4,13 +4,14 @@
 from typing import TYPE_CHECKING, Optional, Tuple, Union, cast
 
 import jax.numpy as jnp
-import vllm.envs as envs
+import vllm.envs as vllm_envs
 from torchax.ops.mappings import j2t_dtype
 from tpu_info import device
 from vllm.inputs import ProcessorInputs, PromptType
 from vllm.platforms.interface import Platform, PlatformEnum
 from vllm.sampling_params import SamplingParams, SamplingType
 
+from tpu_inference import envs
 from tpu_inference.layers.jax.sharding import ShardingConfigManager
 from tpu_inference.logger import init_logger
 
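The rename above frees the short name `envs` for tpu_inference's own flag module, while vLLM's module keeps the `vllm_envs` alias. Below is a hypothetical sketch of how such an `envs` module is commonly structured (a lazy, PEP 562 module-level `__getattr__`, the pattern `vllm.envs` itself uses); only the `MODEL_IMPL_TYPE` name and its `"flax_nnx"` default are grounded in this diff, taken from the `os.getenv` call replaced in a later hunk.

```python
# Hypothetical sketch of tpu_inference/envs.py -- the real module is not shown
# in this diff. Only MODEL_IMPL_TYPE and its "flax_nnx" default are taken from
# the os.getenv() call this PR replaces below.
import os
from typing import Any, Callable, Dict

environment_variables: Dict[str, Callable[[], Any]] = {
    "MODEL_IMPL_TYPE": lambda: os.getenv("MODEL_IMPL_TYPE", "flax_nnx").lower(),
}

def __getattr__(name: str) -> Any:
    # PEP 562 module-level __getattr__: the environment is read lazily, at
    # attribute access time, so envs.MODEL_IMPL_TYPE reflects the current env.
    if name in environment_variables:
        return environment_variables[name]()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```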
@@ -71,7 +72,7 @@ def get_attn_backend_cls(cls, selected_backend: "_Backend", head_size: int,
     @classmethod
     def get_device_name(cls, device_id: int = 0) -> str:
         try:
-            if envs.VLLM_TPU_USING_PATHWAYS:
+            if vllm_envs.VLLM_TPU_USING_PATHWAYS:
                 # Calling jax.devices() would cause multiprocess access to IFRT.
                 return "TPU v6 lite"
             else:
@@ -87,7 +88,7 @@ def get_device_total_memory(cls, device_id: int = 0) -> int:
 
     @classmethod
     def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
-        return not envs.VLLM_USE_V1
+        return not vllm_envs.VLLM_USE_V1
 
     @classmethod
     def get_punica_wrapper(cls) -> str:
@@ -118,11 +119,11 @@ def _initialize_sharding_config(cls, vllm_config: VllmConfig) -> None:
 
     @classmethod
     def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
-        if not envs.VLLM_USE_V1:
+        if not vllm_envs.VLLM_USE_V1:
             raise RuntimeError("VLLM_USE_V1=1 must be set for JAX backend.")
 
-        if envs.VLLM_TPU_USING_PATHWAYS:
-            assert not envs.VLLM_ENABLE_V1_MULTIPROCESSING, (
+        if vllm_envs.VLLM_TPU_USING_PATHWAYS:
+            assert not vllm_envs.VLLM_ENABLE_V1_MULTIPROCESSING, (
                 "VLLM_ENABLE_V1_MULTIPROCESSING must be 0 when using Pathways(JAX_PLATFORMS=proxy)"
             )
         cls._initialize_sharding_config(vllm_config)
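For illustration, here is a standalone, hedged sketch of the Pathways guard above. The env-var names come from the diff, but deriving the Pathways flag from `JAX_PLATFORMS` is an assumption here, not something this hunk shows.

```python
# Standalone sketch of the Pathways guard, runnable without vLLM. The env-var
# names mirror the diff; the derivation of the Pathways flag is an assumption.
import os

def check_pathways_config() -> None:
    using_pathways = "proxy" in os.getenv("JAX_PLATFORMS", "").lower()
    if using_pathways:
        assert os.getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1") == "0", (
            "VLLM_ENABLE_V1_MULTIPROCESSING must be 0 when using Pathways "
            "(JAX_PLATFORMS=proxy)")
```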
@@ -144,7 +145,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         compilation_config.backend = "openxla"
 
         # If we use vLLM's model implementation in PyTorch, we should set it to the torch version of the dtype.
-        impl = os.getenv("MODEL_IMPL_TYPE", "flax_nnx").lower()
+        impl = envs.MODEL_IMPL_TYPE
 
         # NOTE(xiang): convert dtype to jnp.dtype
         # NOTE(wenlong): skip this logic for mm model preprocessing
@@ -164,7 +165,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
             vllm_config.model_config.dtype = j2t_dtype(
                 vllm_config.model_config.dtype.dtype)
 
-        if envs.VLLM_USE_V1:
+        if vllm_envs.VLLM_USE_V1:
             # TODO(cuiq): remove this dependency.
             from vllm.v1.attention.backends.pallas import \
                 PallasAttentionBackend
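The hunk above converts the model dtype back from a JAX dtype to its torch equivalent before the PyTorch model path consumes it. A hedged example of that conversion, assuming `j2t_dtype` accepts a plain `jnp.dtype`:

```python
# Hedged example of the jnp -> torch dtype conversion performed above. That
# j2t_dtype accepts a plain jnp.dtype here is an assumption for illustration.
import jax.numpy as jnp
import torch
from torchax.ops.mappings import j2t_dtype

torch_dtype = j2t_dtype(jnp.dtype(jnp.bfloat16))
assert torch_dtype == torch.bfloat16
```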
@@ -250,7 +251,7 @@ def validate_request(
         """Raises if this request is unsupported on this platform"""
 
         if isinstance(params, SamplingParams):
-            if params.structured_outputs is not None and not envs.VLLM_USE_V1:
+            if params.structured_outputs is not None and not vllm_envs.VLLM_USE_V1:
                 raise ValueError("Structured output is not supported on "
                                  f"{cls.device_name} V0.")
             if params.sampling_type == SamplingType.RANDOM_SEED:
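To make the guard's behavior concrete, here is a minimal self-contained sketch of the V0 structured-output check, with a stand-in params type so it runs without vLLM; everything except the names taken from the hunk is illustrative.

```python
# Self-contained sketch of the structured-output V0 guard above. The stand-in
# dataclass replaces vllm.sampling_params.SamplingParams for illustration only.
import os
from dataclasses import dataclass
from typing import Any, Optional

@dataclass
class FakeSamplingParams:
    structured_outputs: Optional[Any] = None

def validate_request(params: FakeSamplingParams, device_name: str = "tpu") -> None:
    use_v1 = os.getenv("VLLM_USE_V1", "1") == "1"
    if params.structured_outputs is not None and not use_v1:
        raise ValueError(
            f"Structured output is not supported on {device_name} V0.")
```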