From 16e6e3b9e0a8be480af22d9648e8eaba4ba1d27c Mon Sep 17 00:00:00 2001
From: Deep Mistry
Date: Wed, 28 Jan 2026 15:44:48 -0500
Subject: [PATCH 1/2] Set control plane zones from region when not explicitly configured

---
 .../gcp/zones/ipi-conf-gcp-zones-commands.sh  | 52 ++++++++++++++++++-
 1 file changed, 51 insertions(+), 1 deletion(-)

diff --git a/ci-operator/step-registry/ipi/conf/gcp/zones/ipi-conf-gcp-zones-commands.sh b/ci-operator/step-registry/ipi/conf/gcp/zones/ipi-conf-gcp-zones-commands.sh
index d260bc9a27d98..271cb3b0e4301 100755
--- a/ci-operator/step-registry/ipi/conf/gcp/zones/ipi-conf-gcp-zones-commands.sh
+++ b/ci-operator/step-registry/ipi/conf/gcp/zones/ipi-conf-gcp-zones-commands.sh
@@ -12,10 +12,57 @@ ZONES_COUNT=3
 
 function join_by { local IFS="$1"; shift; echo "$*"; }
 
+# Select up to ZONES_COUNT random zones that are UP in GCP_REGION, skipping AI
+# zones (names containing "-ai" followed by a digit, e.g. us-central1-ai1a).
+# Globals: reads GCP_REGION and ZONES_COUNT; writes AVAILABILITY_ZONES, ZONES
+#          and ZONES_STR. ZONES_STR is left empty when no zone is found, so
+#          the caller's non-empty check skips writing "zones: [  ]".
+function get_zones_from_region() {
+  # Get all zones from the region, filtering out AI zones
+  mapfile -t AVAILABILITY_ZONES < <(gcloud compute zones list --filter="region:${GCP_REGION} AND status:UP" --format='value(name)' | grep -v -e '-ai[0-9]' | shuf)
+
+  if [[ ${#AVAILABILITY_ZONES[@]} -eq 0 ]]; then
+    echo "WARNING: no usable zones found in GCP region ${GCP_REGION}; zones left unset" >&2
+    ZONES=()
+    ZONES_STR=""
+    return 0
+  fi
+
+  # Take the first ZONES_COUNT zones
+  ZONES=("${AVAILABILITY_ZONES[@]:0:${ZONES_COUNT}}")
+  ZONES_STR="[ $(join_by , "${ZONES[@]}") ]"
+  echo "GCP region: ${GCP_REGION} (zones: ${ZONES_STR})"
+}
+
 function get_zones_by_machine_type() {
   local machine_type=$1
 
-  mapfile -t AVAILABILITY_ZONES < <(gcloud compute machine-types list --filter="zone~${GCP_REGION} AND name=${machine_type}" --format='value(zone)' | sort)
+  # Get all zones that support this machine type
+  mapfile -t AVAILABILITY_ZONES < <(gcloud compute machine-types list --filter="zone~${GCP_REGION} AND name=${machine_type}" --format='value(zone)')
+
+  # Filter out AI zones if this is not an AI machine type (AI types start with "a2-")
+  if [[ ! "${machine_type}" =~ ^a2- ]]; then
+    # Filter out zones containing "-ai" followed by a digit (e.g., us-central1-ai1a)
+    local zone filtered_zones=()
+    for zone in "${AVAILABILITY_ZONES[@]}"; do
+      if [[ ! "${zone}" =~ -ai[0-9] ]]; then
+        filtered_zones+=("${zone}")
+      fi
+    done
+    # Only use filtered zones if we found non-AI zones, otherwise use all zones
+    if [[ ${#filtered_zones[@]} -gt 0 ]]; then
+      AVAILABILITY_ZONES=("${filtered_zones[@]}")
+    fi
+  fi
+
+  # Shuffle zones randomly to spread load across zones instead of always picking alphabetically first.
+  # Skipped for fewer than two zones: printf on an empty array would emit one empty line,
+  # which mapfile would store as a bogus empty zone.
+  if [[ ${#AVAILABILITY_ZONES[@]} -gt 1 ]]; then
+    mapfile -t AVAILABILITY_ZONES < <(printf '%s\n' "${AVAILABILITY_ZONES[@]}" | shuf)
+  fi
+
+  # Take the first ZONES_COUNT zones
   ZONES=("${AVAILABILITY_ZONES[@]:0:${ZONES_COUNT}}")
   ZONES_STR="[ $(join_by , "${ZONES[@]}") ]"
   echo "[${machine_type}] GCP region: ${GCP_REGION} (zones: ${ZONES_STR})"
@@ -51,6 +98,9 @@ if [[ -n "${CONTROL_PLANE_ZONES}" ]]; then
   ZONES_STR="${CONTROL_PLANE_ZONES}"
 elif [[ -n "${CONTROL_PLANE_NODE_TYPE}" ]]; then
   get_zones_by_machine_type "${CONTROL_PLANE_NODE_TYPE}"
+else
+  # If no zones are set, get standard zones from the region (excluding AI zones)
+  get_zones_from_region
 fi
 if [[ -n "${ZONES_STR}" ]]; then
   cat >> "${PATCH}" << EOF

From ff3888932942ee28e08fcec252ae97955d84c4f0 Mon Sep 17 00:00:00 2001
From: Deep Mistry
Date: Wed, 28 Jan 2026 16:15:06 -0500
Subject: [PATCH 2/2] Set zones from regions with AI zones (us-central1, us-south1) to avoid AI zone selection

---
 .../ipi/conf/gcp/ipi-conf-gcp-commands.sh     | 59 +++++++++++++++++++
 1 file changed, 59 insertions(+)

diff --git a/ci-operator/step-registry/ipi/conf/gcp/ipi-conf-gcp-commands.sh b/ci-operator/step-registry/ipi/conf/gcp/ipi-conf-gcp-commands.sh
index bb63c34e1a9cd..4fe9d7d608c08 100755
--- a/ci-operator/step-registry/ipi/conf/gcp/ipi-conf-gcp-commands.sh
+++ b/ci-operator/step-registry/ipi/conf/gcp/ipi-conf-gcp-commands.sh
@@ -61,6 +61,25 @@ if [[ -z "${COMPUTE_NODE_TYPE}" ]]; then
   fi
 fi
 
+# Get standard zones from the region (excluding AI zones) and randomize selection
+# This prevents control plane nodes from being placed in AI zones when zones aren't explicitly set
+# Arguments: $1 - maximum number of zones to return (default 3)
+# Globals:   reads GCP_REGION; writes AVAILABILITY_ZONES
+# Outputs:   a YAML flow sequence such as "[zone1, zone2, zone3]" on stdout,
+#            or "[]" when no zone could be listed
+function get_zones_from_region() {
+  local zone_count=${1:-3}
+  local joined
+  # gcloud's stderr is left visible on purpose: a failed listing yields "[]"
+  # and the job log must show why.
+  mapfile -t AVAILABILITY_ZONES < <(gcloud compute zones list --filter="region:${GCP_REGION} AND status:UP" --format='value(name)' | grep -v -e '-ai[0-9]' | shuf)
+
+  # Take the first zone_count zones
+  local zones=("${AVAILABILITY_ZONES[@]:0:${zone_count}}")
+  # printf repeats the format once per zone; the trailing ", " is stripped afterwards
+  joined=$(printf '%s, ' "${zones[@]}")
+  echo "[${joined%, }]"
+}
 
 cat >> "${CONFIG}" << EOF
 baseDomain: ${GCP_BASE_DOMAIN}
@@ -87,6 +106,46 @@ compute:
       type: ${COMPUTE_NODE_TYPE}
 EOF
 
+# Set zones for control plane and compute in regions with AI zones to avoid AI zones
+# AI zones (e.g., us-central1-ai1a, us-south1-ai1b) are optimized for GPU/AI machine types
+# and should not be used for standard machine types like control plane nodes
+if [[ "${GCP_REGION}" == "us-central1" ]] || [[ "${GCP_REGION}" == "us-south1" ]]; then
+  export GCP_SHARED_CREDENTIALS_FILE="${CLUSTER_PROFILE_DIR}/gce.json"
+  # "// empty" turns a missing key into "" instead of the literal string "null"
+  GOOGLE_PROJECT_ID=$(jq -r '.project_id // empty' "${GCP_SHARED_CREDENTIALS_FILE}" 2>/dev/null || echo "")
+  if [[ -n "${GOOGLE_PROJECT_ID}" ]]; then
+    sa_email=$(jq -r '.client_email // empty' "${GCP_SHARED_CREDENTIALS_FILE}" 2>/dev/null || echo "")
+    # Match the active account as a fixed string: the e-mail contains regex metacharacters
+    if [[ -n "${sa_email}" ]] && ! gcloud auth list --filter="status:ACTIVE" --format="value(account)" 2>/dev/null | grep -qxF -- "${sa_email}"; then
+      # Best effort: if activation fails, the zone lookup below yields "[]" and no zones are pinned
+      gcloud auth activate-service-account --key-file="${GCP_SHARED_CREDENTIALS_FILE}" || echo "WARNING: failed to activate service account from ${GCP_SHARED_CREDENTIALS_FILE}" >&2
+      gcloud config set project "${GOOGLE_PROJECT_ID}" || echo "WARNING: failed to set gcloud project ${GOOGLE_PROJECT_ID}" >&2
+    fi
+
+    # Get zones for control plane (3 zones for HA)
+    CONTROL_PLANE_ZONES_STR=$(get_zones_from_region 3)
+    # Get zones for compute (same zones for consistency)
+    COMPUTE_ZONES_STR="${CONTROL_PLANE_ZONES_STR}"
+
+    # Apply zones via patch if we got valid zones
+    if [[ -n "${CONTROL_PLANE_ZONES_STR}" ]] && [[ "${CONTROL_PLANE_ZONES_STR}" != "[]" ]]; then
+      PATCH="${SHARED_DIR}/install-config-zones.yaml.patch"
+      cat > "${PATCH}" << ZONESPATCH
+controlPlane:
+  platform:
+    gcp:
+      zones: ${CONTROL_PLANE_ZONES_STR}
+compute:
+- platform:
+    gcp:
+      zones: ${COMPUTE_ZONES_STR}
+ZONESPATCH
+      yq-go m -x -i "${CONFIG}" "${PATCH}"
+      rm "${PATCH}"
+    fi
+  fi
+fi
+
 if [ ${RT_ENABLED} = "true" ]; then
   cat > "${SHARED_DIR}/manifest_mc-kernel-rt.yml" << EOF
 apiVersion: machineconfiguration.openshift.io/v1