Commit ffcf704

[Feature] Remove Docker container and NodePort from compatibility test (ray-project#844)
Removes the Docker container + Ray Client + NodePort setup from the compatibility tests and replaces it with simpler `kubectl exec` command execution.
Parent: 21a3611
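The replacement pattern is a thin wrapper around `kubectl exec`. Below is a minimal sketch of such a helper, inferred from the call sites in the diff; the actual `pod_exec_command` in `framework.utils` may be implemented differently.

```python
import subprocess

def pod_exec_command(pod_name: str, namespace: str, command: str, check: bool = True) -> int:
    """Run `command` inside a Pod via `kubectl exec` and return its exit code."""
    # Equivalent to: kubectl exec <pod> -n <namespace> -- <command>
    result = subprocess.run(
        f"kubectl exec {pod_name} -n {namespace} -- {command}",
        shell=True,
    )
    if check and result.returncode != 0:
        raise Exception(f"kubectl exec failed (exit code {result.returncode}): {command}")
    return result.returncode
```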

11 files changed (+266, -350)


tests/compatibility-test.py

Lines changed: 71 additions & 89 deletions
```diff
@@ -5,7 +5,6 @@
 import os
 import random
 import string
-import docker
 
 import kuberay_utils.utils as utils
 from framework.prototype import (
@@ -16,6 +15,8 @@
 )
 
 from framework.utils import (
+    get_head_pod,
+    pod_exec_command,
     shell_subprocess_run,
     CONST,
     K8S_CLUSTER_MANAGER,
@@ -38,7 +39,8 @@
 
 
 class BasicRayTestCase(unittest.TestCase):
-    cluster_template_file = CONST.REPO_ROOT.joinpath("tests/config/ray-cluster.mini.yaml.template")
+    """Test the basic functionalities of RayCluster by executing simple jobs."""
+    cluster_template = CONST.REPO_ROOT.joinpath("tests/config/ray-cluster.mini.yaml.template")
 
     @classmethod
     def setUpClass(cls):
@@ -51,46 +53,26 @@ def setUpClass(cls):
         }
         operator_manager = OperatorManager(image_dict)
         operator_manager.prepare_operator()
-        utils.create_ray_cluster(BasicRayTestCase.cluster_template_file,
-                                 ray_version, ray_image)
+        utils.create_ray_cluster(BasicRayTestCase.cluster_template, ray_version, ray_image)
 
     def test_simple_code(self):
         """
         Run a simple example in the head Pod to test the basic functionality of the Ray cluster.
         The example is from https://docs.ray.io/en/latest/ray-core/walkthrough.html#running-a-task.
         """
         cluster_namespace = "default"
-        k8s_v1_api = K8S_CLUSTER_MANAGER.k8s_client_dict[CONST.K8S_V1_CLIENT_KEY]
-        headpods = k8s_v1_api.list_namespaced_pod(
-            namespace = cluster_namespace, label_selector='ray.io/node-type=head')
-        headpod_name = headpods.items[0].metadata.name
-        shell_subprocess_run(f"kubectl exec {headpod_name} -n {cluster_namespace} --" +
-            " python -c '''{}'''".format(
-                '''
-import ray
-ray.init()
-
-# Define the square task.
-@ray.remote
-def square(x):
-    return x * x
-
-# Launch four parallel square tasks.
-futures = [square.remote(i) for i in range(4)]
-
-# Check results.
-assert ray.get(futures) == [0, 1, 4, 9]
-                '''
-            )
-        )
+        headpod = get_head_pod(cluster_namespace)
+        headpod_name = headpod.metadata.name
+        pod_exec_command(headpod_name, cluster_namespace, "python samples/simple_code.py")
 
     def test_cluster_info(self):
         """Execute "print(ray.cluster_resources())" in the head Pod."""
         EasyJobRule().assert_rule()
 
 
 class RayFTTestCase(unittest.TestCase):
-    cluster_template_file = CONST.REPO_ROOT.joinpath("tests/config/ray-cluster.ray-ft.yaml.template")
+    """Test Ray GCS Fault Tolerance"""
+    cluster_template = CONST.REPO_ROOT.joinpath("tests/config/ray-cluster.ray-ft.yaml.template")
 
     @classmethod
     def setUpClass(cls):
@@ -104,8 +86,7 @@ def setUpClass(cls):
         }
         operator_manager = OperatorManager(image_dict)
         operator_manager.prepare_operator()
-        utils.create_ray_cluster(RayFTTestCase.cluster_template_file,
-                                 ray_version, ray_image)
+        utils.create_ray_cluster(RayFTTestCase.cluster_template, ray_version, ray_image)
 
     @unittest.skip("Skip test_kill_head due to its flakiness.")
     def test_kill_head(self):
@@ -128,106 +109,107 @@ def test_kill_head(self):
             raise Exception(f"Nonzero return code {rtn} in test_kill_head()")
 
     def test_ray_serve(self):
+        """Kill GCS process on the head Pod and then test a deployed Ray Serve model."""
         cluster_namespace = "default"
-        docker_client = docker.from_env()
-        container = docker_client.containers.run(ray_image, remove=True, detach=True, stdin_open=True, tty=True,
-                                                 network_mode='host', command=["/bin/sh"])
-        # Deploy a model with ray serve
+        headpod = get_head_pod(cluster_namespace)
+        headpod_name = headpod.metadata.name
+
+        # RAY_NAMESPACE is an abstraction in Ray. It is not a Kubernetes namespace.
         ray_namespace = ''.join(random.choices(string.ascii_lowercase, k=10))
-        logger.info(f'namespace: {ray_namespace}')
+        logger.info('Ray namespace: %s', ray_namespace)
 
-        utils.copy_to_container(container, 'tests/scripts', '/usr/local/', 'test_ray_serve_1.py')
-        exit_code, _ = utils.exec_run_container(container, f'python3 /usr/local/test_ray_serve_1.py {ray_namespace}', timeout_sec = 180)
+        # Deploy a Ray Serve model.
+        exit_code = pod_exec_command(headpod_name, cluster_namespace,
+            f" python samples/test_ray_serve_1.py {ray_namespace}",
+            check = False
+        )
 
         if exit_code != 0:
             show_cluster_info(cluster_namespace)
-            raise Exception(f"There was an exception during the execution of test_ray_serve_1.py. The exit code is {exit_code}." +
-                "See above for command output. The output will be printed by the function exec_run_container.")
-
-        # KubeRay only allows at most 1 head pod per RayCluster instance at the same time. In addition,
-        # if we have 0 head pods at this moment, it indicates that the head pod crashes unexpectedly.
-        headpods = utils.get_pod(namespace=cluster_namespace,
-            label_selector='ray.io/node-type=head')
-        assert(len(headpods.items) == 1)
-        old_head_pod = headpods.items[0]
+            raise Exception(
+                f"Fail to execute test_ray_serve_1.py. The exit code is {exit_code}."
+            )
+
+        old_head_pod = get_head_pod(cluster_namespace)
         old_head_pod_name = old_head_pod.metadata.name
         restart_count = old_head_pod.status.container_statuses[0].restart_count
 
-        # Kill the gcs_server process on head node. If fate sharing is enabled, the whole head node pod
-        # will terminate.
-        exec_command = ['pkill gcs_server']
-        utils.pod_exec_command(pod_name=old_head_pod_name,
-            namespace=cluster_namespace, exec_command=exec_command)
+        # Kill the gcs_server process on head node. If fate sharing is enabled, the whole head
+        # node pod will be terminated.
+        pod_exec_command(old_head_pod_name, cluster_namespace, "pkill gcs_server")
 
         # Waiting for all pods become ready and running.
         utils.wait_for_new_head(old_head_pod_name, restart_count,
            cluster_namespace, timeout=300, retry_interval_ms=1000)
 
         # Try to connect to the deployed model again
-        utils.copy_to_container(container, 'tests/scripts', '/usr/local/', 'test_ray_serve_2.py')
-        exit_code, _ = utils.exec_run_container(container, f'python3 /usr/local/test_ray_serve_2.py {ray_namespace}', timeout_sec = 180)
+        headpod = get_head_pod(cluster_namespace)
+        headpod_name = headpod.metadata.name
+        exit_code = pod_exec_command(headpod_name, cluster_namespace,
+            f" python samples/test_ray_serve_2.py {ray_namespace}",
+            check = False
+        )
 
         if exit_code != 0:
             show_cluster_info(cluster_namespace)
-            raise Exception(f"There was an exception during the execution of test_ray_serve_2.py. The exit code is {exit_code}." +
-                "See above for command output. The output will be printed by the function exec_run_container.")
-
-        container.stop()
-        docker_client.close()
+            raise Exception(
+                f"Fail to execute test_ray_serve_2.py. The exit code is {exit_code}."
            )
 
     def test_detached_actor(self):
+        """Kill GCS process on the head Pod and then test a detached actor."""
         cluster_namespace = "default"
-        docker_client = docker.from_env()
-        container = docker_client.containers.run(ray_image, remove=True, detach=True, stdin_open=True, tty=True,
-                                                 network_mode='host', command=["/bin/sh"])
+        headpod = get_head_pod(cluster_namespace)
+        headpod_name = headpod.metadata.name
+
+        # RAY_NAMESPACE is an abstraction in Ray. It is not a Kubernetes namespace.
        ray_namespace = ''.join(random.choices(string.ascii_lowercase, k=10))
-        logger.info(f'namespace: {ray_namespace}')
+        logger.info('Ray namespace: %s', ray_namespace)
 
         # Register a detached actor
-        utils.copy_to_container(container, 'tests/scripts', '/usr/local/', 'test_detached_actor_1.py')
-        exit_code, _ = utils.exec_run_container(container, f'python3 /usr/local/test_detached_actor_1.py {ray_namespace}', timeout_sec = 180)
+        exit_code = pod_exec_command(headpod_name, cluster_namespace,
+            f" python samples/test_detached_actor_1.py {ray_namespace}",
+            check = False
+        )
 
         if exit_code != 0:
             show_cluster_info(cluster_namespace)
-            raise Exception(f"There was an exception during the execution of test_detached_actor_1.py. The exit code is {exit_code}." +
-                "See above for command output. The output will be printed by the function exec_run_container.")
-
-        # KubeRay only allows at most 1 head pod per RayCluster instance at the same time. In addition,
-        # if we have 0 head pods at this moment, it indicates that the head pod crashes unexpectedly.
-        headpods = utils.get_pod(namespace=cluster_namespace,
-            label_selector='ray.io/node-type=head')
-        assert(len(headpods.items) == 1)
-        old_head_pod = headpods.items[0]
+            raise Exception(
+                f"Fail to execute test_detached_actor_1.py. The exit code is {exit_code}."
+            )
+
+        old_head_pod = get_head_pod(cluster_namespace)
         old_head_pod_name = old_head_pod.metadata.name
         restart_count = old_head_pod.status.container_statuses[0].restart_count
 
-        # Kill the gcs_server process on head node. If fate sharing is enabled, the whole head node pod
-        # will terminate.
-        exec_command = ['pkill gcs_server']
-        utils.pod_exec_command(pod_name=old_head_pod_name,
-            namespace=cluster_namespace, exec_command=exec_command)
+        # Kill the gcs_server process on head node. If fate sharing is enabled, the whole head
+        # node pod will be terminated.
+        pod_exec_command(old_head_pod_name, cluster_namespace, "pkill gcs_server")
 
         # Waiting for all pods become ready and running.
         utils.wait_for_new_head(old_head_pod_name, restart_count,
             cluster_namespace, timeout=300, retry_interval_ms=1000)
 
         # Try to connect to the detached actor again.
-        # [Note] When all pods become running and ready, the RayCluster still needs tens of seconds to relaunch actors. Hence,
-        # `test_detached_actor_2.py` will retry until a Ray client connection succeeds.
-        utils.copy_to_container(container, 'tests/scripts', '/usr/local/', 'test_detached_actor_2.py')
-        exit_code, _ = utils.exec_run_container(container, f'python3 /usr/local/test_detached_actor_2.py {ray_namespace}', timeout_sec = 180)
+        # [Note] When all pods become running and ready, the RayCluster still needs tens of seconds
+        # to relaunch actors. Hence, `test_detached_actor_2.py` will retry until a Ray client
+        # connection succeeds.
+        headpod = get_head_pod(cluster_namespace)
+        headpod_name = headpod.metadata.name
+        exit_code = pod_exec_command(headpod_name, cluster_namespace,
+            f" python samples/test_detached_actor_2.py {ray_namespace}",
+            check = False
+        )
 
         if exit_code != 0:
             show_cluster_info(cluster_namespace)
-            raise Exception(f"There was an exception during the execution of test_detached_actor_2.py. The exit code is {exit_code}." +
-                "See above for command output. The output will be printed by the function exec_run_container.")
-
-        container.stop()
-        docker_client.close()
+            raise Exception(
+                f"Fail to execute test_detached_actor_2.py. The exit code is {exit_code}."
            )
 
 class RayServiceTestCase(unittest.TestCase):
     """Integration tests for RayService"""
-    service_template_file = 'tests/config/ray-service.yaml.template'
+    service_template = 'tests/config/ray-service.yaml.template'
 
     # The previous logic for testing updates was problematic.
     # We need to test RayService updates.
@@ -247,7 +229,7 @@ def setUpClass(cls):
     def test_ray_serve_work(self):
         """Create a RayService, send a request to RayService via `curl`, and compare the result."""
         cr_event = utils.create_ray_service(
-            RayServiceTestCase.service_template_file, ray_version, ray_image)
+            RayServiceTestCase.service_template, ray_version, ray_image)
         # When Pods are READY and RUNNING, RayService still needs tens of seconds to be ready
         # for serving requests. This `sleep` function is a workaround, and should be removed
         # when https://github.com/ray-project/kuberay/pull/730 is merged.
```
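For context, the inline lookup this commit removes shows what the new `get_head_pod` helper must do: list Pods matching the `ray.io/node-type=head` label selector and assert that exactly one exists. A hedged sketch using the Kubernetes Python client follows; the real helper in `framework.utils` may differ (e.g. it likely reuses `K8S_CLUSTER_MANAGER`'s cached API client rather than loading kubeconfig each call).

```python
from kubernetes import client, config

def get_head_pod(namespace: str):
    """Return the single head Pod of the RayCluster in `namespace`."""
    config.load_kube_config()  # assumption: the framework reuses a cached client instead
    k8s_v1_api = client.CoreV1Api()
    headpods = k8s_v1_api.list_namespaced_pod(
        namespace=namespace, label_selector="ray.io/node-type=head")
    # KubeRay allows at most 1 head Pod per RayCluster at a time; 0 head Pods
    # indicates the head crashed unexpectedly, so exactly one is expected.
    assert len(headpods.items) == 1
    return headpods.items[0]
```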

tests/config/ray-cluster.mini.yaml.template

Lines changed: 31 additions & 0 deletions
```diff
@@ -35,6 +35,17 @@ spec:
             name: dashboard
           - containerPort: 10001
             name: client
+          volumeMounts:
+            - mountPath: /home/ray/samples
+              name: test-script-configmap
+        volumes:
+          - name: test-script-configmap
+            configMap:
+              name: test-script
+              # An array of keys from the ConfigMap to create as files
+              items:
+                - key: simple_code.py
+                  path: simple_code.py
   workerGroupSpecs:
   - replicas: 1
     minReplicas: 1
@@ -55,3 +66,23 @@ spec:
         containers:
         - name: ray-worker
           image: $ray_image
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: test-script
+data:
+  simple_code.py: |
+    import ray
+    ray.init()
+
+    # Define the square task.
+    @ray.remote
+    def square(x):
+        return x * x
+
+    # Launch four parallel square tasks.
+    futures = [square.remote(i) for i in range(4)]
+
+    # Check results.
+    assert ray.get(futures) == [0, 1, 4, 9]
```
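With the script baked into a ConfigMap and mounted at `/home/ray/samples`, the test no longer needs a sidecar container or a Ray Client connection; it executes the mounted file in place. A sketch of the resulting flow, mirroring `test_simple_code` above:

```python
# Locate the head Pod and run the ConfigMap-mounted sample inside it. The Ray
# image's working directory is assumed to be /home/ray, so the relative path
# samples/simple_code.py resolves to the mount shown in the diff above.
headpod = get_head_pod("default")
pod_exec_command(headpod.metadata.name, "default", "python samples/simple_code.py")
```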
