From a5a4ca893561581da377027f52fa840b77b319c8 Mon Sep 17 00:00:00 2001 From: Shengqi Chen Date: Fri, 28 Nov 2025 23:42:05 +0800 Subject: [PATCH 01/16] ci: rewrite script to generate index and metadata for nightly build Signed-off-by: Shengqi Chen --- .buildkite/generate_index.py | 46 --- .buildkite/scripts/generate-nightly-index.py | 323 +++++++++++++++++++ .buildkite/scripts/upload-wheels.sh | 88 +++-- 3 files changed, 362 insertions(+), 95 deletions(-) delete mode 100644 .buildkite/generate_index.py create mode 100644 .buildkite/scripts/generate-nightly-index.py diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py deleted file mode 100644 index bbed80ebe847..000000000000 --- a/.buildkite/generate_index.py +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import argparse -import os - -template = """ - - -

-        <h1>Links for vLLM</h1>
-        <a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
-        <a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
- - -""" - -parser = argparse.ArgumentParser() -parser.add_argument("--wheel", help="The wheel path.", required=True) -args = parser.parse_args() - -filename = os.path.basename(args.wheel) - -with open("index.html", "w") as f: - print(f"Generated index.html for {args.wheel}") - # sync the abi tag with .buildkite/scripts/upload-wheels.sh - if "x86_64" in filename: - x86_wheel = filename - arm_wheel = filename.replace("x86_64", "aarch64").replace( - "manylinux1", "manylinux2014" - ) - elif "aarch64" in filename: - x86_wheel = filename.replace("aarch64", "x86_64").replace( - "manylinux2014", "manylinux1" - ) - arm_wheel = filename - else: - raise ValueError(f"Unsupported wheel: {filename}") - # cloudfront requires escaping the '+' character - f.write( - template.format( - x86_wheel=x86_wheel, - x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"), - arm_wheel=arm_wheel, - arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"), - ) - ) diff --git a/.buildkite/scripts/generate-nightly-index.py b/.buildkite/scripts/generate-nightly-index.py new file mode 100644 index 000000000000..33a397bd2a2a --- /dev/null +++ b/.buildkite/scripts/generate-nightly-index.py @@ -0,0 +1,323 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from dataclasses import dataclass, asdict +from pathlib import Path +from typing import Any, Optional, Tuple +import re +import json +from urllib.parse import quote +import argparse + +INDEX_HTML_TEMPLATE = """ + + + +{items} + + +""" + +@dataclass +class WheelFileInfo: + package_name: str + version: str + build_tag: Optional[str] + python_tag: str + abi_tag: str + platform_tag: str + variant: Optional[str] + filename: str + +def parse_from_filename(file: str) -> WheelFileInfo: + """ + Parse wheel file name to extract metadata. + + The format of wheel names: + {package_name}-{version}(-{build_tag})?-{python_tag}-{abi_tag}-{platform_tag}.whl + All versions could contain a variant like '+cu129' or '.cpu' or `.rocm` (or not). + Example: + vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl + vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl + vllm-0.11.1rc8.dev14+gaa384b3c0-cp38-abi3-manylinux2014_aarch64.whl + vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl + """ + wheel_file_re = re.compile( + r'^(?P.+)-(?P[^-]+?)(-(?P[^-]+))?-(?P[^-]+)-(?P[^-]+)-(?P[^-]+)\.whl$' + ) + match = wheel_file_re.match(file) + if not match: + raise ValueError(f"Invalid wheel file name: {file}") + + package_name = match.group('package_name') + version = match.group('version') + build_tag = match.group('build_tag') + python_tag = match.group('python_tag') + abi_tag = match.group('abi_tag') + platform_tag = match.group('platform_tag') + + # extract variant from version + variant = None + if 'dev' in version: + ver_after_dev = version.split('dev')[-1] + if '.' in ver_after_dev: + variant = ver_after_dev.split('.')[-1] + version = version.removesuffix('.' + variant) + else: + if '+' in version: + version, variant = version.split('+') + + return WheelFileInfo( + package_name=package_name, + version=version, + build_tag=build_tag, + python_tag=python_tag, + abi_tag=abi_tag, + platform_tag=platform_tag, + variant=variant, + filename=file + ) + +def generate_project_list(subdir_names: list[str]) -> str: + """ + Generate project list HTML content linking to each project & variant sub-directory. 
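+
+    For illustration only (hypothetical input): given subdir_names = ["vllm", "cu130"],
+    the generated page body would contain:
+        <a href="cu130/">cu130/</a><br/>
+        <a href="vllm/">vllm/</a><br/>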
+    """
+    href_tags = []
+    for name in sorted(subdir_names):
+        name = name.strip('/').strip('.')
+        href_tags.append(f'    <a href="{name}/">{name}/</a><br/>')
+    return INDEX_HTML_TEMPLATE.format(items='\n'.join(href_tags))
+
+
+def generate_package_index_and_metadata(wheel_files: list[WheelFileInfo], wheel_base_dir: Path, index_base_dir: Path) -> Tuple[str, str]:
+    """
+    Generate package index HTML content for a specific package, linking to actual wheel files.
+    """
+    href_tags = []
+    metadata = []
+    for file in sorted(wheel_files, key=lambda x: x.filename):
+        relative_path = wheel_base_dir.relative_to(index_base_dir, walk_up=True) / file.filename
+        href_tags.append(f'    <a href="{quote(relative_path.as_posix())}">{file.filename}</a><br/>
') + file_meta = asdict(file) + file_meta['path'] = relative_path.as_posix() + metadata.append(file_meta) + index_str = INDEX_HTML_TEMPLATE.format(items='\n'.join(href_tags)) + metadata_str = json.dumps(metadata, indent=2) + return index_str, metadata_str + + +def generate_index_and_metadata( + whl_files: list[str], + wheel_base_dir: Path, + index_base_dir: Path, + default_variant: Optional[str] = None, + alias_to_default: Optional[str] = None + ): + """ + Generate index for all wheel files. + + Args: + whl_files (list[str]): List of wheel files (must be directly under `wheel_base_dir`). + wheel_base_dir (Path): Base directory for wheel files. + index_base_dir (Path): Base directory to store index files. + default_variant (Optional[str]): The default variant name, if any. + alias_to_default (Optional[str]): Alias variant name for the default variant, if any. + + First, parse all wheel files to extract metadata. + We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory). + The index for the default variant (if any) is generated in the root index directory. + + If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index + is purely a copy of the corresponding variant index, with only the links adjusted. + Otherwise, all wheels without variant suffixes are treated as the default variant. + + If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content + as the default variant index, but the links are adjusted accordingly. + + Index directory structure: + index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/) + index.html # project list, linking to "vllm/" and other packages, and all variant sub-directories + vllm/ + index.html # package index, pointing to actual files in wheel_base_dir (relative path) + metadata.json # machine-readable metadata for all wheels in this package + cpu/ # cpu variant sub-directory + index.html + vllm/ + index.html + metadata.json + cu129/ # cu129 is actually the alias to default variant + index.html + vllm/ + index.html + metadata.json + cu130/ # cu130 variant sub-directory + index.html + vllm/ + index.html + metadata.json + ... + + metadata.json stores a dump of all wheel files' metadata in a machine-readable format: + [ + { + "package_name": "vllm", + "version": "0.10.2rc2", + "build_tag": null, + "python_tag": "cp38", + "abi_tag": "abi3", + "platform_tag": "manylinux2014_aarch64", + "variant": "cu129", + "filename": "vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl", + "path": "../vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl" # to be concatenated with the directory URL + }, + ... 
+ ] + """ + + parsed_files = [parse_from_filename(f) for f in whl_files] + + if not parsed_files: + print("No wheel files found, skipping index generation.") + return + + # Group by variant + variant_to_files: dict[str, list[WheelFileInfo]] = {} + for file in parsed_files: + variant = file.variant or 'default' + if variant not in variant_to_files: + variant_to_files[variant] = [] + variant_to_files[variant].append(file) + + print(f"Found variants: {list(variant_to_files.keys())}") + + # sanity check for default variant + if default_variant: + if 'default' in variant_to_files: + raise ValueError("All wheel files must have variant suffixes when `default_variant` is specified.") + if default_variant not in variant_to_files: + raise ValueError(f"Default variant '{default_variant}' not found among wheel files.") + + if alias_to_default: + if 'default' not in variant_to_files: + raise ValueError("Alias to default variant specified, but no default variant found.") + if alias_to_default in variant_to_files: + raise ValueError(f"Alias variant name '{alias_to_default}' already exists among wheel files.") + else: + variant_to_files[alias_to_default] = variant_to_files['default'].copy() + print(f"Alias variant '{alias_to_default}' created for default variant.") + + # Generate index for each variant + subdir_names = set() + for variant, files in variant_to_files.items(): + if variant == 'default': + variant_dir = index_base_dir + else: + variant_dir = index_base_dir / variant + subdir_names.add(variant) + + variant_dir.mkdir(parents=True, exist_ok=True) + + # gather all package names in this variant + packages = set(f.package_name for f in files) + if variant == 'default': + # these packages should also appear in the "project list" + # generate after all variants are processed + subdir_names = subdir_names.union(packages) + else: + # generate project list for this variant directly + project_list_str = generate_project_list(sorted(packages)) + with open(variant_dir / 'index.html', 'w') as f: + f.write(project_list_str) + + for package in packages: + # filter files belonging to this package only + package_files = [f for f in files if f.package_name == package] + package_dir = variant_dir / package + package_dir.mkdir(parents=True, exist_ok=True) + index_str, metadata_str = generate_package_index_and_metadata( + package_files, + wheel_base_dir, + package_dir + ) + with open(package_dir / 'index.html', 'w') as f: + f.write(index_str) + with open(package_dir / 'metadata.json', 'w') as f: + f.write(metadata_str) + + # Generate top-level project list index + project_list_str = generate_project_list(sorted(subdir_names)) + with open(index_base_dir / 'index.html', 'w') as f: + f.write(project_list_str) + + +if __name__ == "__main__": + """ + Arguments: + --version : version string for the current build (e.g., commit hash) + --current-objects : path to JSON file containing current S3 objects listing in this version directory + --output-dir : directory to store generated index files + --alias-to-default : (optional) alias variant name for the default variant + """ + + parser = argparse.ArgumentParser(description="Process nightly build wheel files to generate indices.") + parser.add_argument('--version', type=str, required=True, help='Version string for the current build (e.g., commit hash)') + parser.add_argument('--current-objects', type=str, required=True, help='Path to JSON file containing current S3 objects listing in this version directory') + parser.add_argument('--output-dir', type=str, required=True, 
help='Directory to store generated index files') + parser.add_argument('--alias-to-default', type=str, default=None, help='Alias variant name for the default variant') + + args = parser.parse_args() + + version = args.version + if '/' in version or '\\' in version: + raise ValueError("Version string must not contain slashes.") + current_objects_path = Path(args.current_objects) + output_dir = Path(args.output_dir) + if not output_dir.exists(): + output_dir.mkdir(parents=True, exist_ok=True) + + # Read current objects JSON + with open(current_objects_path, 'r') as f: + current_objects: dict[str, list[dict[str, Any]]] = json.load(f) + + # current_objects looks like from list_objects_v2 S3 API: + """ + "Contents": [ + { + "Key": "e2f56c309d2a28899c68975a7e104502d56deb8f/vllm-0.11.2.dev363+ge2f56c309-cp38-abi3-manylinux1_x86_64.whl", + "LastModified": "2025-11-28T14:00:32+00:00", + "ETag": "\"37a38339c7cdb61ca737021b968075df-52\"", + "ChecksumAlgorithm": [ + "CRC64NVME" + ], + "ChecksumType": "FULL_OBJECT", + "Size": 435649349, + "StorageClass": "STANDARD" + }, + ... + ] + """ + + # Extract wheel file keys + wheel_files = [] + for item in current_objects.get('Contents', []): + key: str = item['Key'] + if key.endswith('.whl'): + wheel_files.append(key.split('/')[-1]) # only the filename is used + + print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}") + + # Generate index and metadata, assuming wheels and indices are stored as: + # s3://vllm-wheels/{version}/ + # s3://vllm-wheels// + wheel_base_dir = Path(output_dir).parent / version + index_base_dir = Path(output_dir) + + generate_index_and_metadata( + whl_files=wheel_files, + wheel_base_dir=wheel_base_dir, + index_base_dir=index_base_dir, + default_variant=None, + alias_to_default=args.alias_to_default + ) + print(f"Index and metadata generated in {output_dir}") diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh index 945c5e48c009..45e80c64a09f 100644 --- a/.buildkite/scripts/upload-wheels.sh +++ b/.buildkite/scripts/upload-wheels.sh @@ -2,6 +2,9 @@ set -ex +BUCKET="vllm-wheels" +INDICES_OUTPUT_DIR="indices" + # Assume wheels are in artifacts/dist/*.whl wheel_files=(artifacts/dist/*.whl) @@ -14,70 +17,57 @@ fi # Get the single wheel file wheel="${wheel_files[0]}" -# Detect architecture and rename 'linux' to appropriate manylinux version -arch=$(uname -m) -if [[ $arch == "x86_64" ]]; then - manylinux_version="manylinux1" -elif [[ $arch == "aarch64" ]]; then - manylinux_version="manylinux2014" -else - echo "Warning: Unknown architecture $arch, using manylinux1 as default" - manylinux_version="manylinux1" -fi +# current build image uses ubuntu 20.04, which corresponds to manylinux_2_31 +# refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels +manylinux_version="manylinux_2_31" # Rename 'linux' to the appropriate manylinux version in the wheel filename +if [[ "$wheel" != *"linux"* ]]; then + echo "Error: Wheel filename does not contain 'linux': $wheel" + exit 1 +fi new_wheel="${wheel/linux/$manylinux_version}" mv -- "$wheel" "$new_wheel" wheel="$new_wheel" +echo "Renamed wheel to: $wheel" # Extract the version from the wheel version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) -echo "Version: $version" +echo "Version in wheel: $version" +pure_version="${version%%+*}" +echo "Pure version (without variant): $pure_version" -normal_wheel="$wheel" # Save the original wheel filename +# copy wheel to its own 
bucket
+aws s3 cp "$wheel" "s3://$BUCKET/$BUILDKITE_COMMIT/"
 
-# If the version contains "dev", rename it to v1.0.0.dev for consistency
-if [[ $version == *dev* ]]; then
-    suffix="${version##*.}"
-    if [[ $suffix == cu* ]]; then
-        new_version="1.0.0.dev+${suffix}"
-    else
-        new_version="1.0.0.dev"
-    fi
-    new_wheel="${wheel/$version/$new_version}"
-    # use cp to keep both files in the artifacts directory
-    cp -- "$wheel" "$new_wheel"
-    wheel="$new_wheel"
-    version="$new_version"
-fi
-
-# Upload the wheel to S3
-python3 .buildkite/generate_index.py --wheel "$normal_wheel"
+# list all wheels in the commit directory
+obj_json="$(mktemp).json"
+aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$BUILDKITE_COMMIT/" --delimiter / --output json > "$obj_json"
 
-# generate index for this commit
-aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
-aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
+mkdir -p "$INDICES_OUTPUT_DIR"
 
-if [[ $normal_wheel == *"cu129"* ]]; then
-    # only upload index.html for cu129 wheels (default wheels) as it
-    # is available on both x86 and arm64
-    aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
-    aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
+# call the script to generate indices for all existing wheels
+# these indices use relative paths, which work as long as they sit next to the wheel directory in S3,
+# i.e., the wheels are always in s3://vllm-wheels/<commit>/
+# and the indices can be placed in /<commit>/, or /nightly/, or /<version>/
+if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then
+    alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS"
 else
-    echo "Skipping index files for non-cu129 wheels"
+    alias_arg=""
 fi
+python3 ./buildkite/scripts/generate-nightly-index.py --version "$BUILDKITE_COMMIT" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" "$alias_arg"
 
-# generate index for nightly
-aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
-aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
+# copy indices to /<commit>/ unconditionally
+aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$BUILDKITE_COMMIT/"
 
-if [[ $normal_wheel == *"cu129"* ]]; then
-    # only upload index.html for cu129 wheels (default wheels) as it
-    # is available on both x86 and arm64
-    aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
-else
-    echo "Skipping index files for non-cu129 wheels"
+# copy to /nightly/ only if it is on the main branch and not a PR
+if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
+    echo "Uploading indices to overwrite /nightly/"
+    aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/"
 fi
 
-aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
-aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"
+# copy to /<version>/ only if it does not have "dev" in the version
+if [[ "$version" != *"dev"* ]]; then
+    echo "Uploading indices to overwrite /$pure_version/"
+    aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
+fi

From 67f6bb72101db85e226b251fdbab0aa725b8ecee Mon Sep 17 00:00:00 2001
From: Shengqi Chen
Date: Fri, 28 Nov 2025 23:44:23 +0800
Subject: [PATCH 02/16] build: set VLLM_MAIN_CUDA_VERSION to 12.9

Signed-off-by: Shengqi Chen

---
 .buildkite/release-pipeline.yaml | 2 +-
 vllm/envs.py                     | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index 38c400ba1faf..c4c186cb7907 100644
--- a/.buildkite/release-pipeline.yaml
+++
b/.buildkite/release-pipeline.yaml @@ -8,7 +8,7 @@ steps: commands: # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here: # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7 - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "bash .buildkite/scripts/upload-wheels.sh" diff --git a/vllm/envs.py b/vllm/envs.py index 46f1aa3222be..d0912863e644 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -74,7 +74,7 @@ VLLM_MEDIA_CONNECTOR: str = "http" VLLM_MM_INPUT_CACHE_GIB: int = 4 VLLM_TARGET_DEVICE: str = "cuda" - VLLM_MAIN_CUDA_VERSION: str = "12.8" + VLLM_MAIN_CUDA_VERSION: str = "12.9" MAX_JOBS: str | None = None NVCC_THREADS: str | None = None VLLM_USE_PRECOMPILED: bool = False @@ -445,10 +445,9 @@ def get_vllm_port() -> int | None: # Target device of vLLM, supporting [cuda (by default), # rocm, cpu] "VLLM_TARGET_DEVICE": lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda").lower(), - # Main CUDA version of vLLM, supporting [12.6, 12.8, 12.9], - # 12.8 is the default. This follows PyTorch but can be overridden. + # Main CUDA version of vLLM. This follows PyTorch but can be overridden. "VLLM_MAIN_CUDA_VERSION": lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower() - or "12.8", + or "12.9", # Maximum number of compilation jobs to run in parallel. # By default this is the number of CPUs "MAX_JOBS": lambda: os.getenv("MAX_JOBS", None), From 783b7bd13872b30b4ee5223076007ec970bb9710 Mon Sep 17 00:00:00 2001 From: Shengqi Chen Date: Fri, 28 Nov 2025 23:45:20 +0800 Subject: [PATCH 03/16] ci: remove build-wheel-cuda-12-8 in release-pipeline Signed-off-by: Shengqi Chen --- .buildkite/release-pipeline.yaml | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index c4c186cb7907..fbfc923998f8 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -30,19 +30,6 @@ steps: DOCKER_BUILDKIT: "1" # x86 + CUDA builds - - label: "Build wheel - CUDA 12.8" - depends_on: ~ - id: build-wheel-cuda-12-8 - agents: - queue: cpu_queue_postmerge - commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." 
- - "mkdir artifacts" - - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - - "bash .buildkite/scripts/upload-wheels.sh" - env: - DOCKER_BUILDKIT: "1" - - label: "Build wheel - CUDA 12.9" depends_on: ~ id: build-wheel-cuda-12-9 @@ -109,7 +96,6 @@ steps: - label: "Annotate release workflow" depends_on: - create-multi-arch-manifest - - build-wheel-cuda-12-8 id: annotate-release-workflow agents: queue: cpu_queue_postmerge From b6c1cf44d6001da83e5a1b2c84836d8e14b88090 Mon Sep 17 00:00:00 2001 From: Shengqi Chen Date: Fri, 28 Nov 2025 23:48:34 +0800 Subject: [PATCH 04/16] ci: set DEFAULT_VARIANT_ALIAS=cu129 in upload-wheels.sh Signed-off-by: Shengqi Chen --- .buildkite/scripts/upload-wheels.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh index 45e80c64a09f..89be9f86d3b0 100644 --- a/.buildkite/scripts/upload-wheels.sh +++ b/.buildkite/scripts/upload-wheels.sh @@ -4,6 +4,7 @@ set -ex BUCKET="vllm-wheels" INDICES_OUTPUT_DIR="indices" +DEFAULT_VARIANT_ALIAS="cu129" # align with vLLM_MAIN_CUDA_VERSION in vllm/envs.py # Assume wheels are in artifacts/dist/*.whl wheel_files=(artifacts/dist/*.whl) From e10c42429bf8269d149f576fdfd96bb31c71cbc3 Mon Sep 17 00:00:00 2001 From: Shengqi Chen Date: Sat, 29 Nov 2025 00:23:41 +0800 Subject: [PATCH 05/16] build: use newly added metadata.json instead of hardcoded version to download precompiled wheels Signed-off-by: Shengqi Chen --- setup.py | 68 +++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 20 deletions(-) diff --git a/setup.py b/setup.py index 0022e7fe0bf3..0d45c51bbfe3 100644 --- a/setup.py +++ b/setup.py @@ -13,6 +13,7 @@ import sysconfig from pathlib import Path from shutil import which +from typing import Optional import torch from packaging.version import Version, parse @@ -648,6 +649,17 @@ def _read_requirements(filename: str) -> list[str]: ] } +def _fetch_metadata_for_variant(variant: Optional[str]) -> tuple[list[dict], str]: + base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch() + variant_dir = f"{variant}/" if variant is not None else "" + repo_url = f"https://wheels.vllm.ai/{base_commit}/{variant_dir}vllm/" + meta_url = repo_url + "metadata.json" + from urllib.request import urlopen + with urlopen(meta_url) as resp: + assert resp.status == 200, f"Failed to fetch metadata from {meta_url}" + wheels = json.loads(resp.read().decode("utf-8")) + return wheels, repo_url + # If using precompiled, extract and patch package_data (in advance of setup) if envs.VLLM_USE_PRECOMPILED: assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds" @@ -656,29 +668,45 @@ def _read_requirements(filename: str) -> list[str]: wheel_url = wheel_location else: import platform - arch = platform.machine() - if arch == "x86_64": - wheel_tag = "manylinux1_x86_64" - elif arch == "aarch64": - wheel_tag = "manylinux2014_aarch64" - else: - raise ValueError(f"Unsupported architecture: {arch}") - base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch() - wheel_url = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-{wheel_tag}.whl" - nightly_wheel_url = ( - f"https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-{wheel_tag}.whl" - ) - from urllib.request import urlopen - + # try to fetch the wheel metadata from the nightly wheel repo + variant = 'cu' + envs.VLLM_MAIN_CUDA_VERSION.replace(".", "") try: - with 
urlopen(wheel_url) as resp: - if resp.status != 200: - wheel_url = nightly_wheel_url + wheels, repo_url = _fetch_metadata_for_variant(variant) except Exception as e: - print(f"[warn] Falling back to nightly wheel: {e}") - wheel_url = nightly_wheel_url - + print( + f"Failed to fetch precompiled wheel metadata for variant {variant}: {e}" + ) + print("Warning: Trying the default nightly wheel variant.") + wheels, repo_url = _fetch_metadata_for_variant(None) + # if this also fails, then we have nothing more to try + + # The metadata.json has the following format: + # see .buildkite/scripts/generate-nightly-index.py for details + """[{ +"package_name": "vllm", +"version": "0.11.2.dev278+gdbc3d9991", +"build_tag": null, +"python_tag": "cp38", +"abi_tag": "abi3", +"platform_tag": "manylinux1_x86_64", +"variant": null, +"filename": "vllm-0.11.2.dev278+gdbc3d9991-cp38-abi3-manylinux1_x86_64.whl", +"path": "../vllm-0.11.2.dev278+gdbc3d9991-cp38-abi3-manylinux1_x86_64.whl" +}, +...]""" + for wheel in wheels: + if wheel.get("package_name") == "vllm" and arch in wheel.get("platform_tag", ""): + print(f"Found precompiled wheel metadata: {wheel}") + assert "path" in wheel, f"Wheel metadata missing path: {wheel}" + wheel_url = repo_url + wheel["path"] + print(f"Using precompiled wheel URL: {wheel_url}") + break + else: + raise ValueError( + f"No precompiled vllm wheel found for architecture {arch} " + f"from repo {repo_url}" + ) patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(wheel_url) for pkg, files in patch.items(): package_data.setdefault(pkg, []).extend(files) From fa2a0ad3aa969e97c63b461dc7f28cb06e3d780a Mon Sep 17 00:00:00 2001 From: Shengqi Chen Date: Sat, 29 Nov 2025 01:05:03 +0800 Subject: [PATCH 06/16] chore: fix ruff linting issues Signed-off-by: Shengqi Chen --- .buildkite/scripts/generate-nightly-index.py | 188 +++++++++++-------- setup.py | 13 +- 2 files changed, 123 insertions(+), 78 deletions(-) diff --git a/.buildkite/scripts/generate-nightly-index.py b/.buildkite/scripts/generate-nightly-index.py index 33a397bd2a2a..78e01724c56b 100644 --- a/.buildkite/scripts/generate-nightly-index.py +++ b/.buildkite/scripts/generate-nightly-index.py @@ -2,13 +2,16 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from dataclasses import dataclass, asdict -from pathlib import Path -from typing import Any, Optional, Tuple -import re +# stop ruff from complaining about line length (in comments) +# ruff: noqa: E501 + +import argparse import json +import re +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Any from urllib.parse import quote -import argparse INDEX_HTML_TEMPLATE = """ @@ -19,21 +22,23 @@ """ + @dataclass class WheelFileInfo: package_name: str version: str - build_tag: Optional[str] + build_tag: str | None python_tag: str abi_tag: str platform_tag: str - variant: Optional[str] + variant: str | None filename: str + def parse_from_filename(file: str) -> WheelFileInfo: """ Parse wheel file name to extract metadata. - + The format of wheel names: {package_name}-{version}(-{build_tag})?-{python_tag}-{abi_tag}-{platform_tag}.whl All versions could contain a variant like '+cu129' or '.cpu' or `.rocm` (or not). 
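     For illustration (hypothetical parses, following the rules above):
         "vllm-0.10.2rc2+cu129-..." yields version "0.10.2rc2" and variant "cu129";
         "vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-..." yields
         version "0.11.1rc8.dev14+gaa384b3c0" and variant "cu130".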
@@ -44,30 +49,30 @@ def parse_from_filename(file: str) -> WheelFileInfo: vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl """ wheel_file_re = re.compile( - r'^(?P.+)-(?P[^-]+?)(-(?P[^-]+))?-(?P[^-]+)-(?P[^-]+)-(?P[^-]+)\.whl$' + r"^(?P.+)-(?P[^-]+?)(-(?P[^-]+))?-(?P[^-]+)-(?P[^-]+)-(?P[^-]+)\.whl$" ) match = wheel_file_re.match(file) if not match: raise ValueError(f"Invalid wheel file name: {file}") - - package_name = match.group('package_name') - version = match.group('version') - build_tag = match.group('build_tag') - python_tag = match.group('python_tag') - abi_tag = match.group('abi_tag') - platform_tag = match.group('platform_tag') - + + package_name = match.group("package_name") + version = match.group("version") + build_tag = match.group("build_tag") + python_tag = match.group("python_tag") + abi_tag = match.group("abi_tag") + platform_tag = match.group("platform_tag") + # extract variant from version variant = None - if 'dev' in version: - ver_after_dev = version.split('dev')[-1] - if '.' in ver_after_dev: - variant = ver_after_dev.split('.')[-1] - version = version.removesuffix('.' + variant) + if "dev" in version: + ver_after_dev = version.split("dev")[-1] + if "." in ver_after_dev: + variant = ver_after_dev.split(".")[-1] + version = version.removesuffix("." + variant) else: - if '+' in version: - version, variant = version.split('+') - + if "+" in version: + version, variant = version.split("+") + return WheelFileInfo( package_name=package_name, version=version, @@ -76,44 +81,51 @@ def parse_from_filename(file: str) -> WheelFileInfo: abi_tag=abi_tag, platform_tag=platform_tag, variant=variant, - filename=file + filename=file, ) + def generate_project_list(subdir_names: list[str]) -> str: """ Generate project list HTML content linking to each project & variant sub-directory. """ href_tags = [] for name in sorted(subdir_names): - name = name.strip('/').strip('.') + name = name.strip("/").strip(".") href_tags.append(f' {name}/
') - return INDEX_HTML_TEMPLATE.format(items='\n'.join(href_tags)) + return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags)) -def generate_package_index_and_metadata(wheel_files: list[WheelFileInfo], wheel_base_dir: Path, index_base_dir: Path) -> Tuple[str, str]: +def generate_package_index_and_metadata( + wheel_files: list[WheelFileInfo], wheel_base_dir: Path, index_base_dir: Path +) -> tuple[str, str]: """ Generate package index HTML content for a specific package, linking to actual wheel files. """ href_tags = [] metadata = [] for file in sorted(wheel_files, key=lambda x: x.filename): - relative_path = wheel_base_dir.relative_to(index_base_dir, walk_up=True) / file.filename - href_tags.append(f' {file.filename}
') + relative_path = ( + wheel_base_dir.relative_to(index_base_dir, walk_up=True) / file.filename + ) + href_tags.append( + f' {file.filename}
' + ) file_meta = asdict(file) - file_meta['path'] = relative_path.as_posix() + file_meta["path"] = relative_path.as_posix() metadata.append(file_meta) - index_str = INDEX_HTML_TEMPLATE.format(items='\n'.join(href_tags)) + index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags)) metadata_str = json.dumps(metadata, indent=2) return index_str, metadata_str def generate_index_and_metadata( - whl_files: list[str], - wheel_base_dir: Path, - index_base_dir: Path, - default_variant: Optional[str] = None, - alias_to_default: Optional[str] = None - ): + whl_files: list[str], + wheel_base_dir: Path, + index_base_dir: Path, + default_variant: str | None = None, + alias_to_default: str | None = None, +): """ Generate index for all wheel files. @@ -121,9 +133,9 @@ def generate_index_and_metadata( whl_files (list[str]): List of wheel files (must be directly under `wheel_base_dir`). wheel_base_dir (Path): Base directory for wheel files. index_base_dir (Path): Base directory to store index files. - default_variant (Optional[str]): The default variant name, if any. - alias_to_default (Optional[str]): Alias variant name for the default variant, if any. - + default_variant (str | None): The default variant name, if any. + alias_to_default (str | None): Alias variant name for the default variant, if any. + First, parse all wheel files to extract metadata. We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory). The index for the default variant (if any) is generated in the root index directory. @@ -150,7 +162,7 @@ def generate_index_and_metadata( index.html vllm/ index.html - metadata.json + metadata.json cu130/ # cu130 variant sub-directory index.html vllm/ @@ -184,7 +196,7 @@ def generate_index_and_metadata( # Group by variant variant_to_files: dict[str, list[WheelFileInfo]] = {} for file in parsed_files: - variant = file.variant or 'default' + variant = file.variant or "default" if variant not in variant_to_files: variant_to_files[variant] = [] variant_to_files[variant].append(file) @@ -193,41 +205,49 @@ def generate_index_and_metadata( # sanity check for default variant if default_variant: - if 'default' in variant_to_files: - raise ValueError("All wheel files must have variant suffixes when `default_variant` is specified.") + if "default" in variant_to_files: + raise ValueError( + "All wheel files must have variant suffixes when `default_variant` is specified." + ) if default_variant not in variant_to_files: - raise ValueError(f"Default variant '{default_variant}' not found among wheel files.") + raise ValueError( + f"Default variant '{default_variant}' not found among wheel files." + ) if alias_to_default: - if 'default' not in variant_to_files: - raise ValueError("Alias to default variant specified, but no default variant found.") + if "default" not in variant_to_files: + raise ValueError( + "Alias to default variant specified, but no default variant found." + ) if alias_to_default in variant_to_files: - raise ValueError(f"Alias variant name '{alias_to_default}' already exists among wheel files.") + raise ValueError( + f"Alias variant name '{alias_to_default}' already exists among wheel files." 
+ ) else: - variant_to_files[alias_to_default] = variant_to_files['default'].copy() + variant_to_files[alias_to_default] = variant_to_files["default"].copy() print(f"Alias variant '{alias_to_default}' created for default variant.") # Generate index for each variant subdir_names = set() for variant, files in variant_to_files.items(): - if variant == 'default': + if variant == "default": variant_dir = index_base_dir else: variant_dir = index_base_dir / variant subdir_names.add(variant) variant_dir.mkdir(parents=True, exist_ok=True) - + # gather all package names in this variant packages = set(f.package_name for f in files) - if variant == 'default': + if variant == "default": # these packages should also appear in the "project list" # generate after all variants are processed subdir_names = subdir_names.union(packages) else: # generate project list for this variant directly project_list_str = generate_project_list(sorted(packages)) - with open(variant_dir / 'index.html', 'w') as f: + with open(variant_dir / "index.html", "w") as f: f.write(project_list_str) for package in packages: @@ -236,18 +256,16 @@ def generate_index_and_metadata( package_dir = variant_dir / package package_dir.mkdir(parents=True, exist_ok=True) index_str, metadata_str = generate_package_index_and_metadata( - package_files, - wheel_base_dir, - package_dir + package_files, wheel_base_dir, package_dir ) - with open(package_dir / 'index.html', 'w') as f: + with open(package_dir / "index.html", "w") as f: f.write(index_str) - with open(package_dir / 'metadata.json', 'w') as f: + with open(package_dir / "metadata.json", "w") as f: f.write(metadata_str) - + # Generate top-level project list index project_list_str = generate_project_list(sorted(subdir_names)) - with open(index_base_dir / 'index.html', 'w') as f: + with open(index_base_dir / "index.html", "w") as f: f.write(project_list_str) @@ -260,16 +278,38 @@ def generate_index_and_metadata( --alias-to-default : (optional) alias variant name for the default variant """ - parser = argparse.ArgumentParser(description="Process nightly build wheel files to generate indices.") - parser.add_argument('--version', type=str, required=True, help='Version string for the current build (e.g., commit hash)') - parser.add_argument('--current-objects', type=str, required=True, help='Path to JSON file containing current S3 objects listing in this version directory') - parser.add_argument('--output-dir', type=str, required=True, help='Directory to store generated index files') - parser.add_argument('--alias-to-default', type=str, default=None, help='Alias variant name for the default variant') + parser = argparse.ArgumentParser( + description="Process nightly build wheel files to generate indices." 
+ ) + parser.add_argument( + "--version", + type=str, + required=True, + help="Version string for the current build (e.g., commit hash)", + ) + parser.add_argument( + "--current-objects", + type=str, + required=True, + help="Path to JSON file containing current S3 objects listing in this version directory", + ) + parser.add_argument( + "--output-dir", + type=str, + required=True, + help="Directory to store generated index files", + ) + parser.add_argument( + "--alias-to-default", + type=str, + default=None, + help="Alias variant name for the default variant", + ) args = parser.parse_args() version = args.version - if '/' in version or '\\' in version: + if "/" in version or "\\" in version: raise ValueError("Version string must not contain slashes.") current_objects_path = Path(args.current_objects) output_dir = Path(args.output_dir) @@ -277,7 +317,7 @@ def generate_index_and_metadata( output_dir.mkdir(parents=True, exist_ok=True) # Read current objects JSON - with open(current_objects_path, 'r') as f: + with open(current_objects_path) as f: current_objects: dict[str, list[dict[str, Any]]] = json.load(f) # current_objects looks like from list_objects_v2 S3 API: @@ -300,10 +340,10 @@ def generate_index_and_metadata( # Extract wheel file keys wheel_files = [] - for item in current_objects.get('Contents', []): - key: str = item['Key'] - if key.endswith('.whl'): - wheel_files.append(key.split('/')[-1]) # only the filename is used + for item in current_objects.get("Contents", []): + key: str = item["Key"] + if key.endswith(".whl"): + wheel_files.append(key.split("/")[-1]) # only the filename is used print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}") @@ -318,6 +358,6 @@ def generate_index_and_metadata( wheel_base_dir=wheel_base_dir, index_base_dir=index_base_dir, default_variant=None, - alias_to_default=args.alias_to_default + alias_to_default=args.alias_to_default, ) print(f"Index and metadata generated in {output_dir}") diff --git a/setup.py b/setup.py index 0d45c51bbfe3..d15c573e7cf7 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,6 @@ import sysconfig from pathlib import Path from shutil import which -from typing import Optional import torch from packaging.version import Version, parse @@ -649,17 +648,20 @@ def _read_requirements(filename: str) -> list[str]: ] } -def _fetch_metadata_for_variant(variant: Optional[str]) -> tuple[list[dict], str]: + +def _fetch_metadata_for_variant(variant: str | None) -> tuple[list[dict], str]: base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch() variant_dir = f"{variant}/" if variant is not None else "" repo_url = f"https://wheels.vllm.ai/{base_commit}/{variant_dir}vllm/" meta_url = repo_url + "metadata.json" from urllib.request import urlopen + with urlopen(meta_url) as resp: assert resp.status == 200, f"Failed to fetch metadata from {meta_url}" wheels = json.loads(resp.read().decode("utf-8")) return wheels, repo_url + # If using precompiled, extract and patch package_data (in advance of setup) if envs.VLLM_USE_PRECOMPILED: assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds" @@ -668,9 +670,10 @@ def _fetch_metadata_for_variant(variant: Optional[str]) -> tuple[list[dict], str wheel_url = wheel_location else: import platform + arch = platform.machine() # try to fetch the wheel metadata from the nightly wheel repo - variant = 'cu' + envs.VLLM_MAIN_CUDA_VERSION.replace(".", "") + variant = "cu" + envs.VLLM_MAIN_CUDA_VERSION.replace(".", "") try: wheels, repo_url = 
_fetch_metadata_for_variant(variant)
     except Exception as e:
@@ -696,7 +699,9 @@ def _fetch_metadata_for_variant(variant: Optional[str]) -> tuple[list[dict], str
 },
 ...]"""
     for wheel in wheels:
-        if wheel.get("package_name") == "vllm" and arch in wheel.get("platform_tag", ""):
+        if wheel.get("package_name") == "vllm" and arch in wheel.get(
+            "platform_tag", ""
+        ):
             print(f"Found precompiled wheel metadata: {wheel}")
             assert "path" in wheel, f"Wheel metadata missing path: {wheel}"
             wheel_url = repo_url + wheel["path"]

From 5f40a2fb5a23877426177764e7c1f1100149d968 Mon Sep 17 00:00:00 2001
From: Shengqi Chen
Date: Sat, 29 Nov 2025 09:56:41 +0800
Subject: [PATCH 07/16] ci: fix errors in upload-wheels.sh

Signed-off-by: Shengqi Chen

---
 .buildkite/scripts/upload-wheels.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh
index 89be9f86d3b0..a8ab1b10e753 100644
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@@ -42,9 +42,10 @@ echo "Pure version (without variant): $pure_version"
 aws s3 cp "$wheel" "s3://$BUCKET/$BUILDKITE_COMMIT/"
 
 # list all wheels in the commit directory
+echo "Existing wheels on S3:"
+aws s3 ls "s3://$BUCKET/$BUILDKITE_COMMIT/"
 obj_json="$(mktemp).json"
 aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$BUILDKITE_COMMIT/" --delimiter / --output json > "$obj_json"
-
 mkdir -p "$INDICES_OUTPUT_DIR"
 
 # call the script to generate indices for all existing wheels
@@ -56,7 +57,7 @@ if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then
 else
   alias_arg=""
 fi
-python3 ./buildkite/scripts/generate-nightly-index.py --version "$BUILDKITE_COMMIT" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" "$alias_arg"
+python3 .buildkite/scripts/generate-nightly-index.py --version "$BUILDKITE_COMMIT" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" "$alias_arg"

From b8741bf95f501521abae9f8c168f888065fa5c9d Mon Sep 17 00:00:00 2001
From: Shengqi Chen
Date: Sat, 29 Nov 2025 11:09:48 +0800
Subject: [PATCH 08/16] ci: fix legacy python support in generate-nightly-index.py

Signed-off-by: Shengqi Chen

---
 .buildkite/scripts/generate-nightly-index.py | 14 ++++++++------
 .buildkite/scripts/upload-wheels.sh          |  2 +-
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/.buildkite/scripts/generate-nightly-index.py b/.buildkite/scripts/generate-nightly-index.py
index 78e01724c56b..0ac6f026a66f 100644
--- a/.buildkite/scripts/generate-nightly-index.py
+++ b/.buildkite/scripts/generate-nightly-index.py
@@ -2,15 +2,17 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-# stop ruff from complaining about line length (in comments)
+# do not complain about line length (for docstring)
 # ruff: noqa: E501
+# this script should run with legacy python on buildkite agent
+# ruff: noqa: UP045
 
 import argparse
 import json
 import re
 from dataclasses import asdict, dataclass
 from pathlib import Path
-from typing import Any
+from typing import Any, Optional
 from urllib.parse import quote
 
 INDEX_HTML_TEMPLATE = """
@@ -27,11 +29,11 @@ class WheelFileInfo:
     package_name: str
     version: str
-    build_tag: str | None
+    build_tag: Optional[str]
     python_tag: str
     abi_tag: str
     platform_tag: str
-    variant: str | None
+    variant: Optional[str]
     filename: str
 
@@ -123,8 +125,8 @@ def generate_index_and_metadata(
     whl_files:
list[str], wheel_base_dir: Path, index_base_dir: Path, - default_variant: str | None = None, - alias_to_default: str | None = None, + default_variant: Optional[str] = None, + alias_to_default: Optional[str] = None, ): """ Generate index for all wheel files. diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh index a8ab1b10e753..21db9dfce810 100644 --- a/.buildkite/scripts/upload-wheels.sh +++ b/.buildkite/scripts/upload-wheels.sh @@ -57,7 +57,7 @@ if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then else alias_arg="" fi -python3 .buildkite/scripts/generate-nightly-index.py --version "$BUILDKITE_COMMIT" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" "$alias_arg" +python3 .buildkite/scripts/generate-nightly-index.py --version "$BUILDKITE_COMMIT" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" $alias_arg # copy indices to // unconditionally aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$BUILDKITE_COMMIT/" From 39e4fe2514a669b2f50e716a53256cf5c323b7d8 Mon Sep 17 00:00:00 2001 From: Shengqi Chen Date: Sat, 29 Nov 2025 11:42:10 +0800 Subject: [PATCH 09/16] ci: do not throw error when alias-to-default is specified but no default variant is found Signed-off-by: Shengqi Chen --- .buildkite/scripts/generate-nightly-index.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.buildkite/scripts/generate-nightly-index.py b/.buildkite/scripts/generate-nightly-index.py index 0ac6f026a66f..d017f2b2d678 100644 --- a/.buildkite/scripts/generate-nightly-index.py +++ b/.buildkite/scripts/generate-nightly-index.py @@ -218,10 +218,11 @@ def generate_index_and_metadata( if alias_to_default: if "default" not in variant_to_files: - raise ValueError( - "Alias to default variant specified, but no default variant found." + # e.g. only some wheels are uploaded to S3 currently + print( + "[WARN] Alias to default variant specified, but no default variant found." ) - if alias_to_default in variant_to_files: + elif alias_to_default in variant_to_files: raise ValueError( f"Alias variant name '{alias_to_default}' already exists among wheel files." 
)

From d51da1da2b9f960dcd1e27d9c33e4d5d4ced0257 Mon Sep 17 00:00:00 2001
From: Shengqi Chen
Date: Sat, 29 Nov 2025 12:06:37 +0800
Subject: [PATCH 10/16] ci: ensure generate-nightly-index.py is run with at least python3.10

Signed-off-by: Shengqi Chen

---
 .buildkite/scripts/generate-nightly-index.py | 16 +++++++++-------
 .buildkite/scripts/upload-wheels.sh          | 14 ++++++++++++--
 2 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/.buildkite/scripts/generate-nightly-index.py b/.buildkite/scripts/generate-nightly-index.py
index d017f2b2d678..82bcf83424c5 100644
--- a/.buildkite/scripts/generate-nightly-index.py
+++ b/.buildkite/scripts/generate-nightly-index.py
@@ -4,17 +4,19 @@
 # do not complain about line length (for docstring)
 # ruff: noqa: E501
-# this script should run with legacy python on buildkite agent
-# ruff: noqa: UP045
 
 import argparse
 import json
 import re
+import sys
 from dataclasses import asdict, dataclass
 from pathlib import Path
-from typing import Any, Optional
+from typing import Any
 from urllib.parse import quote
 
+if not sys.version_info >= (3, 10):
+    raise RuntimeError("This script requires Python 3.10 or higher.")
+
 INDEX_HTML_TEMPLATE = """
@@ -29,11 +31,11 @@ class WheelFileInfo:
     package_name: str
     version: str
-    build_tag: Optional[str]
+    build_tag: str | None
     python_tag: str
     abi_tag: str
     platform_tag: str
-    variant: Optional[str]
+    variant: str | None
     filename: str
 
@@ -125,8 +127,8 @@ def generate_index_and_metadata(
     whl_files: list[str],
     wheel_base_dir: Path,
     index_base_dir: Path,
-    default_variant: Optional[str] = None,
-    alias_to_default: Optional[str] = None,
+    default_variant: str | None = None,
+    alias_to_default: str | None = None,
 ):
     """
     Generate index for all wheel files.
diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh
index 21db9dfce810..c99da944b0b0 100644
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@@ -44,7 +44,7 @@ aws s3 cp "$wheel" "s3://$BUCKET/$BUILDKITE_COMMIT/"
 # list all wheels in the commit directory
 echo "Existing wheels on S3:"
 aws s3 ls "s3://$BUCKET/$BUILDKITE_COMMIT/"
-obj_json="$(mktemp).json"
+obj_json="objects.json"
 aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$BUILDKITE_COMMIT/" --delimiter / --output json > "$obj_json"
 mkdir -p "$INDICES_OUTPUT_DIR"
@@ -57,7 +57,17 @@ if [[ !
-z "$DEFAULT_VARIANT_ALIAS" ]]; then else alias_arg="" fi -python3 .buildkite/scripts/generate-nightly-index.py --version "$BUILDKITE_COMMIT" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" $alias_arg + +# detect if python3.10+ is available +has_new_python=$(python3 -c "print(1 if __import__('sys').version_info >= (3,10) else 0)") +if [[ "$has_new_python" -eq 0 ]]; then + # use new python from docker + PYTHON="docker run --rm -v $(pwd):/app -w /app python:3-slim python3" +else + PYTHON="python3" +fi + +$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$BUILDKITE_COMMIT" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" $alias_arg # copy indices to // unconditionally aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$BUILDKITE_COMMIT/" From 14d2952425ff965cc77f92304d5a6009024e1320 Mon Sep 17 00:00:00 2001 From: Shengqi Chen Date: Sat, 29 Nov 2025 12:44:30 +0800 Subject: [PATCH 11/16] ci: add more comments in upload-wheels.sh, ensure no race in index upload Signed-off-by: Shengqi Chen --- .buildkite/scripts/generate-nightly-index.py | 2 +- .buildkite/scripts/upload-wheels.sh | 47 +++++++++++++------- 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/.buildkite/scripts/generate-nightly-index.py b/.buildkite/scripts/generate-nightly-index.py index 82bcf83424c5..a61f08107647 100644 --- a/.buildkite/scripts/generate-nightly-index.py +++ b/.buildkite/scripts/generate-nightly-index.py @@ -365,4 +365,4 @@ def generate_index_and_metadata( default_variant=None, alias_to_default=args.alias_to_default, ) - print(f"Index and metadata generated in {output_dir}") + print(f"Successfully generated index and metadata in {output_dir}") diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh index c99da944b0b0..05accb9cf16d 100644 --- a/.buildkite/scripts/upload-wheels.sh +++ b/.buildkite/scripts/upload-wheels.sh @@ -2,9 +2,27 @@ set -ex +# ======== part 0: setup ======== + BUCKET="vllm-wheels" INDICES_OUTPUT_DIR="indices" DEFAULT_VARIANT_ALIAS="cu129" # align with vLLM_MAIN_CUDA_VERSION in vllm/envs.py +PYTHON=${PYTHON_PROG:=python3} # try to read from env var, otherwise use python3 +SUBPATH=$BUILDKITE_COMMIT +S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/" + +# detect if python3.10+ is available +has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,10) else 0)") +if [[ "$has_new_python" -eq 0 ]]; then + # use new python from docker + docker pull python:3-slim + PYTHON="docker run --rm -v $(pwd):/app -w /app python:3-slim python3" +fi + +echo "Using python interpreter: $PYTHON" +echo "Python version: $($PYTHON --version)" + +# ========= part 1: collect, rename & upload the wheel ========== # Assume wheels are in artifacts/dist/*.whl wheel_files=(artifacts/dist/*.whl) @@ -14,8 +32,6 @@ if [[ ${#wheel_files[@]} -ne 1 ]]; then echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}" exit 1 fi - -# Get the single wheel file wheel="${wheel_files[0]}" # current build image uses ubuntu 20.04, which corresponds to manylinux_2_31 @@ -39,13 +55,20 @@ pure_version="${version%%+*}" echo "Pure version (without variant): $pure_version" # copy wheel to its own bucket -aws s3 cp "$wheel" "s3://$BUCKET/$BUILDKITE_COMMIT/" +aws s3 cp "$wheel" "$S3_COMMIT_PREFIX" + +# ========= part 2: generate and upload indices ========== +# generate indices for all existing wheels in the commit directory +# this script might be run multiple times if there are multiple variants being built 
+# so we need to guarantee there is little chance for "TOCTOU" issues +# i.e., one process is generating indices while another is uploading a new wheel +# so we need to ensure no time-consuming operations happen below # list all wheels in the commit directory echo "Existing wheels on S3:" -aws s3 ls "s3://$BUCKET/$BUILDKITE_COMMIT/" +aws s3 ls "$S3_COMMIT_PREFIX" obj_json="objects.json" -aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$BUILDKITE_COMMIT/" --delimiter / --output json > "$obj_json" +aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json" mkdir -p "$INDICES_OUTPUT_DIR" # call script to generate indicies for all existing wheels @@ -58,19 +81,11 @@ else alias_arg="" fi -# detect if python3.10+ is available -has_new_python=$(python3 -c "print(1 if __import__('sys').version_info >= (3,10) else 0)") -if [[ "$has_new_python" -eq 0 ]]; then - # use new python from docker - PYTHON="docker run --rm -v $(pwd):/app -w /app python:3-slim python3" -else - PYTHON="python3" -fi - -$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$BUILDKITE_COMMIT" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" $alias_arg +$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" $alias_arg # copy indices to // unconditionally -aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$BUILDKITE_COMMIT/" +echo "Uploading indices to $S3_COMMIT_PREFIX" +aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX" # copy to /nightly/ only if it is on the main branch and not a PR if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]]; then From db2c76c9585e7618002f5567c78172d9544ce529 Mon Sep 17 00:00:00 2001 From: Shengqi Chen Date: Sat, 29 Nov 2025 15:17:43 +0800 Subject: [PATCH 12/16] build: more elegant exception handling in setup.py to download precompiled wheels Signed-off-by: Shengqi Chen --- setup.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index d15c573e7cf7..12439a3df212 100644 --- a/setup.py +++ b/setup.py @@ -654,10 +654,10 @@ def _fetch_metadata_for_variant(variant: str | None) -> tuple[list[dict], str]: variant_dir = f"{variant}/" if variant is not None else "" repo_url = f"https://wheels.vllm.ai/{base_commit}/{variant_dir}vllm/" meta_url = repo_url + "metadata.json" + print(f"Trying to fetch metadata from {meta_url}") from urllib.request import urlopen - with urlopen(meta_url) as resp: - assert resp.status == 200, f"Failed to fetch metadata from {meta_url}" + # urlopen raises HTTPError on unexpected status code wheels = json.loads(resp.read().decode("utf-8")) return wheels, repo_url @@ -674,15 +674,18 @@ def _fetch_metadata_for_variant(variant: str | None) -> tuple[list[dict], str]: arch = platform.machine() # try to fetch the wheel metadata from the nightly wheel repo variant = "cu" + envs.VLLM_MAIN_CUDA_VERSION.replace(".", "") + try_default = False try: wheels, repo_url = _fetch_metadata_for_variant(variant) except Exception as e: print( f"Failed to fetch precompiled wheel metadata for variant {variant}: {e}" ) + try_default = True # try outside handler to keep the stacktrace simple + if try_default: print("Warning: Trying the default nightly wheel variant.") wheels, repo_url = _fetch_metadata_for_variant(None) - # if this also fails, then we have nothing more to try + # if this also fails, then we have nothing more to try / cache # The 
metadata.json has the following format: # see .buildkite/scripts/generate-nightly-index.py for details From fcf323d7d594e2de045102422024c804cc421898 Mon Sep 17 00:00:00 2001 From: Shengqi Chen Date: Mon, 1 Dec 2025 17:47:53 +0800 Subject: [PATCH 13/16] build: add VLLM_PRECOMPILED_WHEEL_VARIANT support in setup.py to replace hardcoding Signed-off-by: Shengqi Chen --- setup.py | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/setup.py b/setup.py index 12439a3df212..e24238ab0595 100644 --- a/setup.py +++ b/setup.py @@ -654,8 +654,9 @@ def _fetch_metadata_for_variant(variant: str | None) -> tuple[list[dict], str]: variant_dir = f"{variant}/" if variant is not None else "" repo_url = f"https://wheels.vllm.ai/{base_commit}/{variant_dir}vllm/" meta_url = repo_url + "metadata.json" - print(f"Trying to fetch metadata from {meta_url}") + logger.info("Trying to fetch metadata from {}", meta_url) from urllib.request import urlopen + with urlopen(meta_url) as resp: # urlopen raises HTTPError on unexpected status code wheels = json.loads(resp.read().decode("utf-8")) @@ -664,29 +665,43 @@ def _fetch_metadata_for_variant(variant: str | None) -> tuple[list[dict], str]: # If using precompiled, extract and patch package_data (in advance of setup) if envs.VLLM_USE_PRECOMPILED: - assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds" + # Attempts: + # 1. user-specified wheel location (can be either local or remote, via + # VLLM_PRECOMPILED_WHEEL_LOCATION) + # 2. user-specified variant from nightly repo (current main commit via + # VLLM_PRECOMPILED_WHEEL_VARIANT) + # 3. the variant corresponding to VLLM_MAIN_CUDA_VERSION from nightly repo + # 4. the default variant from nightly repo (current main commit) wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None) if wheel_location is not None: wheel_url = wheel_location + logger.info("Using user-specified precompiled wheel location: {}", wheel_url) else: import platform arch = platform.machine() # try to fetch the wheel metadata from the nightly wheel repo - variant = "cu" + envs.VLLM_MAIN_CUDA_VERSION.replace(".", "") + main_variant = envs.VLLM_MAIN_CUDA_VERSION.replace(".", "") + variant = os.getenv("VLLM_PRECOMPILED_WHEEL_VARIANT", main_variant) + logger.info("Using precompiled wheel variant: {}", variant) try_default = False + wheels, repo_url = None, None try: wheels, repo_url = _fetch_metadata_for_variant(variant) except Exception as e: - print( - f"Failed to fetch precompiled wheel metadata for variant {variant}: {e}" + logger.warning( + "Failed to fetch precompiled wheel metadata for variant {}", + variant, + exc_info=e, ) - try_default = True # try outside handler to keep the stacktrace simple + try_default = True # try outside handler to keep the stacktrace simple if try_default: - print("Warning: Trying the default nightly wheel variant.") + logger.info("Trying the default variant from /nightly/") wheels, repo_url = _fetch_metadata_for_variant(None) # if this also fails, then we have nothing more to try / cache - + assert wheels is not None and repo_url is not None, ( + "Failed to fetch precompiled wheel metadata" + ) # The metadata.json has the following format: # see .buildkite/scripts/generate-nightly-index.py for details """[{ @@ -705,15 +720,17 @@ def _fetch_metadata_for_variant(variant: str | None) -> tuple[list[dict], str]: if wheel.get("package_name") == "vllm" and arch in wheel.get( "platform_tag", "" ): - print(f"Found precompiled wheel metadata: 
-                assert "path" in wheel, f"Wheel metadata missing path: {wheel}"
+                logger.info("Found precompiled wheel metadata: %s", wheel)
+                if "path" not in wheel:
+                    raise ValueError(f"Wheel metadata missing path: {wheel}")
+                # TODO: maybe check more compatibility later? (python_tag, abi_tag, etc.)
                 wheel_url = repo_url + wheel["path"]
-                print(f"Using precompiled wheel URL: {wheel_url}")
+                logger.info("Using precompiled wheel URL: %s", wheel_url)
                 break
         else:
             raise ValueError(
                 f"No precompiled vllm wheel found for architecture {arch} "
-                f"from repo {repo_url}"
+                f"from repo {repo_url}. All available wheels: {wheels}"
             )
     patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(wheel_url)
     for pkg, files in patch.items():

From 3108287726acbabc44e1f2defd4b805e164a36cc Mon Sep 17 00:00:00 2001
From: Shengqi Chen
Date: Mon, 1 Dec 2025 18:28:28 +0800
Subject: [PATCH 14/16] doc: update instructions for installing from the
 revamped nightly index

Signed-off-by: Shengqi Chen
---
 docs/getting_started/installation/cpu.md      |  5 +-
 .../installation/gpu.cuda.inc.md              | 59 ++++++++-----------
 2 files changed, 27 insertions(+), 37 deletions(-)

diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md
index d1beab7855b1..37799de70254 100644
--- a/docs/getting_started/installation/cpu.md
+++ b/docs/getting_started/installation/cpu.md
@@ -46,7 +46,10 @@ vLLM is a Python library that supports the following CPU variants. Select your C
 
 ### Pre-built wheels
 
-Currently, there are no pre-built CPU wheels.
+Please refer to the instructions for [pre-built wheels on GPU](./gpu.md#pre-built-wheels).
+
+When specifying the index URL, please make sure to use the `cpu` variant subdirectory.
+For example, the nightly build index is: `https://wheels.vllm.ai/nightly/cpu/`.
 
 ### Build wheel from source
 
diff --git a/docs/getting_started/installation/gpu.cuda.inc.md b/docs/getting_started/installation/gpu.cuda.inc.md
index 601d3659af88..eaf8d8bf30bf 100644
--- a/docs/getting_started/installation/gpu.cuda.inc.md
+++ b/docs/getting_started/installation/gpu.cuda.inc.md
@@ -26,43 +26,50 @@
 uv pip install vllm --torch-backend=auto
 ```
 
 ??? console "pip"
 
     ```bash
-    # Install vLLM with CUDA 12.8.
-    pip install vllm --extra-index-url https://download.pytorch.org/whl/cu128
+    # Install vLLM with CUDA 12.9.
+    pip install vllm --extra-index-url https://download.pytorch.org/whl/cu129
     ```
 
 We recommend leveraging `uv` to [automatically select the appropriate PyTorch index at runtime](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection) by inspecting the installed CUDA driver version via `--torch-backend=auto` (or `UV_TORCH_BACKEND=auto`). To select a specific backend (e.g., `cu128`), set `--torch-backend=cu128` (or `UV_TORCH_BACKEND=cu128`). If this doesn't work, try running `uv self update` to update `uv` first.
 
 !!! note
     NVIDIA Blackwell GPUs (B200, GB200) require a minimum of CUDA 12.8, so make sure you are installing PyTorch wheels with at least that version. PyTorch itself offers a [dedicated interface](https://pytorch.org/get-started/locally/) to determine the appropriate pip command to run for a given target configuration.
 
-As of now, vLLM's binaries are compiled with CUDA 12.8 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.6, 11.8, and public PyTorch release versions:
+As of now, vLLM's binaries are compiled with CUDA 12.9 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.8, 13.0, and public PyTorch release versions:
 
 ```bash
-# Install vLLM with a specific CUDA version (e.g., 11.8 or 12.6).
+# Install vLLM with a specific CUDA version (e.g., 13.0).
 export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//')
-export CUDA_VERSION=118 # or 126
-uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu${CUDA_VERSION}-cp38-abi3-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu${CUDA_VERSION}
+export CUDA_VERSION=130 # or another provided version
+uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu${CUDA_VERSION}-cp38-abi3-manylinux_2_31_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu${CUDA_VERSION}
 ```
 
 #### Install the latest code
 
-LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on an x86 platform with CUDA 12 for every commit since `v0.5.3`.
+LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for every commit since `v0.5.3` on <https://wheels.vllm.ai>. Several indices are available:
+
+* `https://wheels.vllm.ai/nightly`: the default variant (CUDA, with the version specified in `VLLM_MAIN_CUDA_VERSION`; currently CUDA 12.9), built from the latest commit on the `main` branch.
+* `https://wheels.vllm.ai/nightly/<variant>/`: all other variants, currently `cu130` and `cpu`. The default variant (`cu129`) also has its own subdirectory for consistency.
+
+To install from the nightly index, run:
 
 ```bash
 uv pip install -U vllm \
     --torch-backend=auto \
-    --extra-index-url https://wheels.vllm.ai/nightly
+    --extra-index-url https://wheels.vllm.ai/nightly # add the variant subdirectory here if needed
 ```
 
-??? console "pip"
+!!! warning "`pip` caveat"
+
+    Using `pip` to install from the nightly indices is _not supported_, because `pip` combines packages from `--extra-index-url` and the default index and chooses only the latest version, which makes it difficult to install a development version older than the latest release. In contrast, `uv` gives the extra index [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes).
+
+    If you insist on using `pip`, you have to specify the full URL of the wheel file (which can be found on the index page).
 
     ```bash
-    pip install -U vllm \
-        --pre \
-        --extra-index-url https://wheels.vllm.ai/nightly
+    pip install -U https://wheels.vllm.ai/nightly/vllm-0.11.2.dev399%2Bg3c7461c18-cp38-abi3-manylinux_2_31_x86_64.whl # current nightly build (the filename will change!)
+    pip install -U https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-0.11.2.dev399%2Bg3c7461c18-cp38-abi3-manylinux_2_31_x86_64.whl # from a specific commit (the filename must match that commit's wheel)
     ```
 
-    `--pre` is required for `pip` to consider pre-released versions.
-
 ##### Install specific revisions
 
 If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), you can specify the commit hash in the URL:
 
@@ -71,29 +78,9 @@ If you want to access the wheels for previous commits (e.g. to bisect the behavi
 ```bash
 export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch
 uv pip install vllm \
     --torch-backend=auto \
-    --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT}
+    --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT} # add the variant subdirectory here if needed
 ```
 
-The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-remember command. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version.
-
-??? note "pip"
-    If you want to access the wheels for previous commits (e.g. to bisect the behavior change,
-    performance regression), due to the limitation of `pip`, you have to specify the full URL of the
-    wheel file by embedding the commit hash in the URL:
-
-    ```bash
-    export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
-    pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
-    ```
-
-    Note that the wheels are built with Python 3.8 ABI (see [PEP
-    425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible
-    with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a
-    placeholder to have a unified URL for the wheels, the actual versions of wheels are contained in
-    the wheel metadata (the wheels listed in the extra index url have correct versions). Although we
-    don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the
-    wheels are still built with Python 3.8 ABI to keep the same wheel name as before.
-
 # --8<-- [end:pre-built-wheels]
 
 # --8<-- [start:build-wheel-from-source]

From 26de25d9264282e5b016aa7e10f9f0bd1555684f Mon Sep 17 00:00:00 2001
From: Shengqi Chen
Date: Mon, 1 Dec 2025 18:46:40 +0800
Subject: [PATCH 15/16] build: add VLLM_PRECOMPILED_WHEEL_COMMIT env

Signed-off-by: Shengqi Chen
---
 setup.py | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/setup.py b/setup.py
index e24238ab0595..67226b4447c7 100644
--- a/setup.py
+++ b/setup.py
@@ -310,9 +310,6 @@ def run(self):
 class precompiled_build_ext(build_ext):
     """Disables extension building when using precompiled binaries."""
 
-    def run(self) -> None:
-        assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
-
     def build_extensions(self) -> None:
         print("Skipping build_ext: using precompiled extensions.")
         return
@@ -649,10 +646,11 @@ def _read_requirements(filename: str) -> list[str]:
 }
 
 
-def _fetch_metadata_for_variant(variant: str | None) -> tuple[list[dict], str]:
-    base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch()
+def _fetch_metadata_for_variant(
+    commit: str, variant: str | None
+) -> tuple[list[dict], str]:
     variant_dir = f"{variant}/" if variant is not None else ""
-    repo_url = f"https://wheels.vllm.ai/{base_commit}/{variant_dir}vllm/"
+    repo_url = f"https://wheels.vllm.ai/{commit}/{variant_dir}vllm/"
     meta_url = repo_url + "metadata.json"
     logger.info("Trying to fetch metadata from %s", meta_url)
     from urllib.request import urlopen
@@ -683,11 +681,17 @@ def _fetch_metadata_for_variant(variant: str | None) -> tuple[list[dict], str]:
         # try to fetch the wheel metadata from the nightly wheel repo
         main_variant = envs.VLLM_MAIN_CUDA_VERSION.replace(".", "")
         variant = os.getenv("VLLM_PRECOMPILED_WHEEL_VARIANT", main_variant)
-        logger.info("Using precompiled wheel variant: %s", variant)
+        commit = os.getenv(
+            "VLLM_PRECOMPILED_WHEEL_COMMIT",
+            precompiled_wheel_utils.get_base_commit_in_main_branch(),
+        )
+        logger.info(
+            "Using precompiled wheel commit %s with variant %s", commit, variant
+        )
         try_default = False
         wheels, repo_url = None, None
         try:
-            wheels, repo_url = _fetch_metadata_for_variant(variant)
+            wheels, repo_url = _fetch_metadata_for_variant(commit, variant)
         except Exception as e:
             logger.warning(
                 "Failed to fetch precompiled wheel metadata for variant %s",
@@ -696,8 +700,8 @@ def _fetch_metadata_for_variant(variant: str | None) -> tuple[list[dict], str]:
             )
             try_default = True  # try outside handler to keep the stacktrace simple
         if try_default:
-            logger.info("Trying the default variant from /nightly/")
-            wheels, repo_url = _fetch_metadata_for_variant(None)
+            logger.info("Trying the default variant")
+            wheels, repo_url = _fetch_metadata_for_variant(commit, None)
             # if this also fails, then we have nothing more to try / cache
         assert wheels is not None and repo_url is not None, (
             "Failed to fetch precompiled wheel metadata"

From 73488d20d5e01aeacad6aa65a03e63cb15c5a643 Mon Sep 17 00:00:00 2001
From: Shengqi Chen
Date: Mon, 1 Dec 2025 18:49:36 +0800
Subject: [PATCH 16/16] doc: clarify new python-only build options

Signed-off-by: Shengqi Chen
---
 docs/getting_started/installation/cpu.md          | 10 ++++++++++
 docs/getting_started/installation/gpu.cuda.inc.md | 14 ++++++++++----
 docs/getting_started/installation/gpu.md          |  2 +-
 3 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md
index 37799de70254..18dc6d19434b 100644
--- a/docs/getting_started/installation/cpu.md
+++ b/docs/getting_started/installation/cpu.md
@@ -53,6 +53,16 @@ For example, the nightly build index is: `https://wheels.vllm.ai/nightly/cpu/`.
 
 ### Build wheel from source
 
+#### Set up using Python-only build (without compilation) {#python-only-build}
+
+Please refer to the instructions for the [Python-only build on GPU](./gpu.md#python-only-build), and replace the build command with:
+
+```bash
+VLLM_USE_PRECOMPILED=1 VLLM_PRECOMPILED_WHEEL_VARIANT=cpu VLLM_TARGET_DEVICE=cpu uv pip install --editable .
+```
+
+#### Full build (with compilation) {#full-build}
+
 === "Intel/AMD x86"
 
     --8<-- "docs/getting_started/installation/cpu.x86.inc.md:build-wheel-from-source"
diff --git a/docs/getting_started/installation/gpu.cuda.inc.md b/docs/getting_started/installation/gpu.cuda.inc.md
index eaf8d8bf30bf..ad26672f8092 100644
--- a/docs/getting_started/installation/gpu.cuda.inc.md
+++ b/docs/getting_started/installation/gpu.cuda.inc.md
@@ -84,7 +84,7 @@ uv pip install vllm \
 # --8<-- [end:pre-built-wheels]
 
 # --8<-- [start:build-wheel-from-source]
 
-#### Set up using Python-only build (without compilation)
+#### Set up using Python-only build (without compilation) {#python-only-build}
 
 If you only need to change Python code, you can build and install vLLM without compilation. Using `uv pip`'s [`--editable` flag](https://docs.astral.sh/uv/pip/packages/#editable-packages), changes you make to the code will be reflected when you run vLLM:
 
@@ -108,18 +108,24 @@ This command will do the following:
-In case you see an error about wheel not found when running the above command, it might be because the commit you based on in the main branch was just merged and the wheel is being built. In this case, you can wait for around an hour to try again, or manually assign the previous commit in the installation using the `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable.
+In case you see an error about wheel not found when running the above command, it might be because the commit you based on in the main branch was just merged and the wheel is being built. In this case, you can wait for around an hour and try again, or manually pin an earlier commit for the installation using the `VLLM_PRECOMPILED_WHEEL_COMMIT` environment variable.
 
 ```bash
-export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch
-export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
+export VLLM_PRECOMPILED_WHEEL_COMMIT=$(git rev-parse HEAD~1) # or an earlier commit on main
+export VLLM_USE_PRECOMPILED=1
 uv pip install --editable .
 ```
 
+There are more environment variables that control the behavior of the Python-only build:
+
+* `VLLM_PRECOMPILED_WHEEL_LOCATION`: specify the exact URL or local file path of a pre-compiled wheel to use; all other wheel-discovery logic is skipped.
+* `VLLM_PRECOMPILED_WHEEL_COMMIT`: override the commit hash from which the pre-compiled wheel is downloaded. It can be set to `nightly` to use the latest **already built** commit on the main branch.
+* `VLLM_PRECOMPILED_WHEEL_VARIANT`: specify the variant subdirectory to use on the nightly index, e.g., `cu129` or `cpu`. If not specified, the CUDA variant matching `VLLM_MAIN_CUDA_VERSION` is tried first, with a fallback to the default variant on the remote index.
+
 You can find more information about vLLM's wheels in [Install the latest code](#install-the-latest-code).
 
 !!! note
     There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors.
     It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed.
     Please refer to [Install the latest code](#install-the-latest-code) for instructions on how to install a specified wheel.
 
-#### Full build (with compilation) +#### Full build (with compilation) {#full-build} If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes: diff --git a/docs/getting_started/installation/gpu.md b/docs/getting_started/installation/gpu.md index bc7508b29475..fb750f449985 100644 --- a/docs/getting_started/installation/gpu.md +++ b/docs/getting_started/installation/gpu.md @@ -52,7 +52,7 @@ vLLM is a Python library that supports the following GPU variants. Select your G --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:set-up-using-python" -### Pre-built wheels +### Pre-built wheels {#pre-built-wheels} === "NVIDIA CUDA"
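
For readers who want to exercise the renovated index outside of a build, the wheel-resolution flow that PATCH 12 to PATCH 15 assemble in `setup.py` can be reproduced stand-alone. The sketch below is illustrative only and is not part of the series: it assumes the index layout (`https://wheels.vllm.ai/<commit>/<variant>/vllm/metadata.json`) and the `metadata.json` fields (`package_name`, `platform_tag`, `path`) exactly as described in the patches above, and the helper names `fetch_metadata` and `pick_wheel_url` are hypothetical.

```python
# Stand-alone sketch (not part of any patch above): resolve a precompiled
# vLLM wheel URL from the nightly index, mirroring the selection logic
# added to setup.py. Field names follow the metadata.json format documented
# in .buildkite/scripts/generate-nightly-index.py; fetch_metadata and
# pick_wheel_url are hypothetical helper names.
import json
import platform
from urllib.request import urlopen


def fetch_metadata(commit: str, variant: str | None) -> tuple[list[dict], str]:
    variant_dir = f"{variant}/" if variant is not None else ""
    repo_url = f"https://wheels.vllm.ai/{commit}/{variant_dir}vllm/"
    # urlopen raises HTTPError on an unexpected status code
    with urlopen(repo_url + "metadata.json") as resp:
        return json.loads(resp.read().decode("utf-8")), repo_url


def pick_wheel_url(commit: str, variant: str | None = None) -> str:
    try:
        wheels, repo_url = fetch_metadata(commit, variant)
    except Exception:
        # same fallback as setup.py: retry with the default variant
        wheels, repo_url = fetch_metadata(commit, None)
    arch = platform.machine()
    for wheel in wheels:
        if wheel.get("package_name") == "vllm" and arch in wheel.get("platform_tag", ""):
            return repo_url + wheel["path"]
    raise ValueError(f"no vllm wheel for {arch} under {repo_url}")


if __name__ == "__main__":
    # "nightly" aliases the latest already-built commit on main (see PATCH 16)
    print(pick_wheel_url("nightly"))
```

Under these assumptions, `pick_wheel_url("nightly", "cu130")` should print the same URL that a Python-only build with `VLLM_USE_PRECOMPILED=1`, `VLLM_PRECOMPILED_WHEEL_COMMIT=nightly`, and `VLLM_PRECOMPILED_WHEEL_VARIANT=cu130` would download.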