Skip to content

Commit 6b6dde6

Browse files
committed
Fix IBM Power/Z (ppc64le/s390x) e2e test reliability
Comprehensive fixes for IBM Power and Z e2e test reliability on shared Evergreen CI machines.

Problems fixed:
- Process namespace join errors from orphaned conmon processes
- Stale crun lock files and container state from previous runs
- Registry IPv6 connection refused (podman tries ::1 before 127.0.0.1)
- Registry cleanup during kicbase image build
- Rootless vs. rootful podman mode conflicts
- Non-idempotent minikube setup
- Missing boto3/requests due to skipped requirements install

Key changes:
- Clean orphaned conmon processes and stale lock files before setup
- Bind registry to 127.0.0.1:5000 to avoid IPv6 issues
- Use 127.0.0.1 for all curl health checks
- Add registry restart before podman push
- Use rootful podman (--rootless=false) for reliable CNI networking
- Add idempotent checks (skip if minikube is already healthy)
- Always install requirements.txt
1 parent 59b9ccf commit 6b6dde6

File tree

6 files changed

+117
-79
lines changed

6 files changed

+117
-79
lines changed

.evergreen-functions.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -297,7 +297,6 @@ functions:
297297
params:
298298
env:
299299
SKIP_MINIKUBE_SETUP: ${skip_minikube_setup!|false}
300-
SKIP_INSTALL_REQUIREMENTS: ${skip_install_python_requirements!|true}
301300
working_dir: src/github.com/mongodb/mongodb-kubernetes
302301
add_to_path:
303302
- ${workdir}/bin

scripts/dev/recreate_python_venv.sh

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -118,14 +118,8 @@ PYENV_VERSION="${PYTHON_VERSION}" python -m venv venv
118118
source venv/bin/activate
119119
pip install --upgrade pip
120120

121-
skip_requirements="${SKIP_INSTALL_REQUIREMENTS:-false}"
122-
if [[ "${skip_requirements}" != "true" ]]; then
123-
echo "Installing requirements.txt..."
124-
pip install -r requirements.txt
125-
else
126-
echo "Skipping requirements.txt installation."
127-
pip install requests
128-
fi
121+
echo "Installing requirements.txt..."
122+
pip install -r requirements.txt
129123

130124
echo "Python venv was recreated successfully."
131125
echo "Using Python: $(which python) ($(python --version))" >&2
Lines changed: 61 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,43 +1,74 @@
11
#!/usr/bin/env bash
22

3-
set -Eeou pipefail
3+
set -Eeoux pipefail
44

5-
echo "Cleaning DNF cache..."
6-
sudo dnf clean all && sudo rm -r /var/cache/dnf
5+
echo "Setting up IBM container runtime (rootful podman for minikube)"
76

8-
echo "Installing/upgrading crun..."
9-
sudo dnf upgrade -y crun --disableplugin=subscription-manager || \
10-
sudo dnf install -y crun --disableplugin=subscription-manager || \
11-
sudo yum upgrade -y crun --disableplugin=subscription-manager || \
12-
sudo yum install -y crun --disableplugin=subscription-manager
13-
14-
if ! crun --version &>/dev/null; then
15-
echo "❌ crun installation failed"
16-
exit 1
7+
# Install crun if not present (OCI runtime for cgroup v2)
8+
if ! command -v crun &>/dev/null; then
9+
echo "Installing crun..."
10+
sudo dnf install -y crun --disableplugin=subscription-manager 2>/dev/null || \
11+
sudo yum install -y crun --disableplugin=subscription-manager 2>/dev/null || \
12+
echo "Warning: Could not install crun"
13+
else
14+
echo "crun already installed: $(crun --version | head -1)"
1715
fi
1816

19-
current_version=$(crun --version | head -n1)
20-
echo "✅ Using crun: ${current_version}"
17+
# Clean up stale container state (safe for shared CI machines)
18+
cleanup_stale_state() {
19+
echo "Cleaning up stale container state..."
2120

22-
# Clean up any existing conflicting configurations
23-
echo "Cleaning up existing container configurations..."
24-
rm -f ~/.config/containers/containers.conf 2>/dev/null || true
25-
sudo rm -f /root/.config/containers/containers.conf 2>/dev/null || true
26-
sudo rm -f /etc/containers/containers.conf 2>/dev/null || true
21+
# Skip if minikube is running
22+
if command -v minikube &>/dev/null && minikube status &>/dev/null 2>&1; then
23+
echo " Minikube running - skipping cleanup"
24+
return 0
25+
fi
2726

28-
crun_path=$(which crun)
29-
echo "Using crun path: ${crun_path}"
27+
# Kill orphaned root conmon processes (PPID=1 means orphaned)
28+
for pid in $(sudo pgrep conmon 2>/dev/null); do
29+
ppid=$(ps -o ppid= -p "${pid}" 2>/dev/null | tr -d ' ')
30+
if [[ "${ppid}" == "1" ]]; then
31+
echo " Killing orphaned conmon ${pid}"
32+
sudo kill -9 "${pid}" 2>/dev/null || true
33+
fi
34+
done
3035

31-
config="[containers]
32-
cgroup_manager = \"cgroupfs\"
36+
# Clean stale lock files (safe since minikube isn't running)
37+
sudo find /run/crun -name "*.lock" -delete 2>/dev/null || true
3338

34-
[engine]
35-
runtime = \"crun\""
39+
# Prune exited containers and dangling volumes
40+
sudo podman container prune -f 2>/dev/null || true
41+
sudo podman volume prune -f 2>/dev/null || true
42+
}
3643

37-
mkdir -p ~/.config/containers
38-
echo "${config}" > ~/.config/containers/containers.conf
44+
cleanup_stale_state
3945

40-
sudo mkdir -p /root/.config/containers
41-
echo "${config}" | sudo tee /root/.config/containers/containers.conf >/dev/null
46+
# Test sudo podman (used by minikube in rootful mode)
47+
echo "Testing sudo podman..."
48+
if ! sudo podman run --rm docker.io/library/alpine:latest echo "sudo podman works" 2>/dev/null; then
49+
echo "Sudo podman not working, resetting..."
50+
sudo podman system reset --force 2>/dev/null || true
51+
sleep 1
52+
53+
if sudo podman run --rm docker.io/library/alpine:latest echo "sudo podman works" 2>/dev/null; then
54+
echo "Sudo podman working after reset"
55+
else
56+
echo "Warning: Sudo podman still not working"
57+
fi
58+
else
59+
echo "Sudo podman working"
60+
fi
61+
62+
# Configure root-level podman
63+
sudo mkdir -p /etc/containers
64+
sudo tee /etc/containers/containers.conf > /dev/null << 'EOF'
65+
[containers]
66+
cgroup_manager = "systemd"
67+
68+
[engine]
69+
runtime = "crun"
70+
EOF
4271

43-
echo "✅ Configured crun"
72+
echo "Container runtime setup complete"
73+
echo " crun: $(crun --version 2>/dev/null | head -1 || echo 'not found')"
74+
echo " podman: $(sudo podman --version)"

scripts/evergreen/setup_minikube_host.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@ run_setup_step() {
4040

4141
# Set up the Python environment (needed for AWS CLI pip installation)
4242
export GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
43-
export SKIP_INSTALL_REQUIREMENTS=${SKIP_INSTALL_REQUIREMENTS:-true}
4443
run_setup_step "Python Virtual Environment" "scripts/dev/recreate_python_venv.sh"
4544

4645
run_setup_step "AWS CLI Setup" "scripts/evergreen/setup_aws.sh"

scripts/funcs/install

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,9 @@ download_and_install_binary() {
4646

4747
mkdir -p "${dir}"
4848
echo "Downloading ${url}"
49-
curl --retry 5 --retry-delay 3 --retry-all-errors --fail --show-error --max-time 180 --silent -L "${url}" -o "${bin}"
49+
# Use longer timeout (10 min) for large binaries like minikube (~140MB) on slow IBM networks
50+
# NOTE: -C - (resume) is not passed below, so a retried transfer starts over rather than resuming a partial download
51+
curl --retry 5 --retry-delay 10 --retry-all-errors --fail --show-error --max-time 600 -L "${url}" -o "${bin}"
5052
chmod +x "${bin}"
5153
mv "${bin}" "${dir}"
5254
echo "Installed ${bin} to ${dir}"

scripts/minikube/setup_minikube.sh

Lines changed: 51 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
source scripts/dev/set_env_context.sh
55
source scripts/funcs/install
66

7+
78
set -Eeou pipefail
89

910
set_limits() {
@@ -48,10 +49,10 @@ setup_local_registry_and_custom_image() {
4849
if [[ "${ARCH}" == "ppc64le" ]]; then
4950
echo ">>> Setting up local registry and custom kicbase image for ppc64le..."
5051

51-
# Check if local registry is running (with fallback for namespace issues)
52+
# Check if local registry is running (use 127.0.0.1 to avoid IPv6 fallback delay)
5253
registry_running=false
53-
if curl -s http://localhost:5000/v2/_catalog >/dev/null 2>&1; then
54-
echo "Registry detected via HTTP check (podman ps failed)"
54+
if curl -s --max-time 5 http://127.0.0.1:5000/v2/_catalog >/dev/null 2>&1; then
55+
echo "Registry detected via HTTP check"
5556
registry_running=true
5657
fi
5758

@@ -61,15 +62,15 @@ setup_local_registry_and_custom_image() {
6162
# Clean up any existing registry first
6263
sudo podman rm -f registry 2>/dev/null || true
6364

64-
if ! sudo podman run -d -p 5000:5000 --name registry --restart=always docker.io/library/registry:2; then
65+
if ! sudo podman run -d -p 127.0.0.1:5000:5000 --name registry --restart=always docker.io/library/registry:2; then
6566
echo "❌ Failed to start local registry - trying alternative approach"
6667
exit 1
6768
fi
6869

6970
# Wait for registry to be ready
7071
echo "Waiting for registry to be ready..."
7172
for _ in {1..30}; do
72-
if curl -s http://localhost:5000/v2/_catalog >/dev/null 2>&1; then
73+
if curl -s --max-time 5 http://127.0.0.1:5000/v2/_catalog >/dev/null 2>&1; then
7374
break
7475
fi
7576
sleep 1
@@ -78,31 +79,18 @@ setup_local_registry_and_custom_image() {
7879
echo "✅ Local registry already running"
7980
fi
8081

81-
# Configure podman to trust local registry (both user and root level for minikube)
82+
# Configure podman to trust local registry (rootful only since minikube uses sudo podman)
8283
echo "Configuring registries.conf to trust local registry..."
83-
84-
# User-level config
85-
mkdir -p ~/.config/containers
86-
cat > ~/.config/containers/registries.conf << 'EOF'
87-
[[registry]]
88-
location = "localhost:5000"
89-
insecure = true
90-
EOF
91-
92-
# Root-level config (since minikube uses sudo podman)
9384
sudo mkdir -p /root/.config/containers
9485
sudo tee /root/.config/containers/registries.conf << 'EOF' >/dev/null
9586
[[registry]]
9687
location = "localhost:5000"
9788
insecure = true
9889
EOF
90+
echo "✅ Registry configuration created"
9991

100-
echo "✅ Registry configuration created for both user and root"
101-
custom_image_tag="localhost:5000/kicbase:v0.0.47"
102-
103-
# Determine image tag
104-
custom_image_tag="localhost:5000/kicbase:v0.0.47"
105-
if curl -s http://localhost:5000/v2/kicbase/tags/list | grep -q "v0.0.47"; then
92+
custom_image_tag="localhost:5000/kicbase:v0.0.48"
93+
if curl -s --max-time 5 http://127.0.0.1:5000/v2/kicbase/tags/list | grep -q "v0.0.48"; then
10694
echo "Custom kicbase image already exists in local registry"
10795
return 0
10896
fi
@@ -113,7 +101,7 @@ EOF
113101
# Build custom kicbase image
114102
mkdir -p "${PROJECT_DIR:-.}/scripts/minikube/kicbase"
115103
cat > "${PROJECT_DIR:-.}/scripts/minikube/kicbase/Dockerfile" << 'EOF'
116-
FROM gcr.io/k8s-minikube/kicbase:v0.0.47
104+
FROM gcr.io/k8s-minikube/kicbase:v0.0.48
117105
RUN if [ "$(uname -m)" = "ppc64le" ]; then \
118106
CRICTL_VERSION="v1.28.0" && \
119107
curl -L "https://github.com/kubernetes-sigs/cri-tools/releases/download/${CRICTL_VERSION}/crictl-${CRICTL_VERSION}-linux-ppc64le.tar.gz" \
@@ -129,6 +117,21 @@ EOF
129117
echo "Failed to build custom image"
130118
return 1
131119
}
120+
121+
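# Make sure the registry is still up before pushing (it can get cleaned up during the image build).
# Use 127.0.0.1 rather than localhost: podman on ppc64le may try IPv6 (::1) first, where the registry is not listening.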
122+
if ! curl -s --max-time 5 http://127.0.0.1:5000/v2/_catalog >/dev/null 2>&1; then
123+
echo "Registry not responding, restarting..."
124+
sudo podman rm -f registry 2>/dev/null || true
125+
sudo podman run -d -p 127.0.0.1:5000:5000 --name registry --restart=always docker.io/library/registry:2
126+
for _ in {1..15}; do
127+
if curl -s --max-time 5 http://127.0.0.1:5000/v2/_catalog >/dev/null 2>&1; then
128+
echo "Registry restarted successfully"
129+
break
130+
fi
131+
sleep 1
132+
done
133+
fi
134+
132135
sudo podman push "${custom_image_tag}" --tls-verify=false || {
133136
echo "Failed to push to registry"
134137
return 1
@@ -139,9 +142,19 @@ EOF
139142
return 0
140143
}
141144

142-
# Start minikube with podman driver
145+
# Start minikube with podman driver (rootful mode for reliable networking)
143146
start_minikube_cluster() {
144-
echo ">>> Starting minikube cluster with podman driver..."
147+
echo ">>> Starting minikube cluster with podman driver (rootful mode)..."
148+
149+
if "${PROJECT_DIR:-.}/bin/minikube" status &>/dev/null; then
150+
echo "✅ Minikube is already running - verifying health..."
151+
if "${PROJECT_DIR:-.}/bin/minikube" kubectl -- get nodes &>/dev/null; then
152+
echo "✅ Minikube cluster is healthy - skipping setup"
153+
return 0
154+
else
155+
echo "⚠️ Minikube running but unhealthy - will recreate"
156+
fi
157+
fi
145158

146159
# Clean up any existing minikube state to avoid cached configuration issues
147160
echo "Cleaning up any existing minikube state..."
@@ -153,19 +166,27 @@ start_minikube_cluster() {
153166
echo "Ensuring clean minikube state..."
154167
"${PROJECT_DIR:-.}/bin/minikube" delete 2>/dev/null || true
155168

156-
local start_args=("--driver=podman")
169+
# Clean up the stale podman volume and network left behind by a previous minikube run
170+
echo "Cleaning up stale podman volumes..."
171+
sudo podman volume rm -f minikube 2>/dev/null || true
172+
sudo podman network rm -f minikube 2>/dev/null || true
173+
174+
# Use rootful podman - rootless has iptables/CNI issues on ppc64le and s390x
175+
local start_args=("--driver=podman" "--container-runtime=containerd" "--rootless=false")
157176
start_args+=("--cpus=4" "--memory=8g")
158177

159178
if [[ "${ARCH}" == "ppc64le" ]]; then
160179
echo "Using custom kicbase image for ppc64le with crictl..."
161180

162-
start_args+=("--base-image=localhost:5000/kicbase:v0.0.47")
181+
start_args+=("--base-image=localhost:5000/kicbase:v0.0.48")
163182
start_args+=("--insecure-registry=localhost:5000")
183+
# Use bridge CNI for ppc64le - kindnet doesn't have ppc64le images
184+
start_args+=("--cni=bridge")
185+
elif [[ "${ARCH}" == "s390x" ]]; then
186+
# Use bridge CNI for s390x to avoid potential image availability issues
187+
start_args+=("--cni=bridge")
164188
fi
165189

166-
# Use default bridge CNI to avoid Docker Hub rate limiting issues
167-
# start_args+=("--cni=bridge")
168-
169190
echo "Starting minikube with args: ${start_args[*]}"
170191
if "${PROJECT_DIR:-.}/bin/minikube" start "${start_args[@]}"; then
171192
echo "✅ Minikube started successfully"
@@ -194,14 +215,6 @@ else
194215
exit 1
195216
fi
196217

197-
if [[ "${ARCH}" == "ppc64le" ]]; then
198-
echo ""
199-
echo ">>> Note: crictl will be patched into the minikube container after startup"
200-
else
201-
echo ""
202-
echo ">>> Using standard kicbase image (crictl included for x86_64/aarch64/s390x)"
203-
fi
204-
205218
# Start the minikube cluster
206219
start_minikube_cluster
207220

0 commit comments

Comments
 (0)