 import os
 import random
 import string
-import docker

 import kuberay_utils.utils as utils
 from framework.prototype import (
 )

 from framework.utils import (
+    get_head_pod,
+    pod_exec_command,
     shell_subprocess_run,
     CONST,
     K8S_CLUSTER_MANAGER,

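+# `get_head_pod` and `pod_exec_command` are new helpers in framework.utils whose
+# implementations are not part of this diff. Presumably `get_head_pod(namespace)`
+# returns the single head Pod (label selector `ray.io/node-type=head`), and
+# `pod_exec_command(pod_name, namespace, command, check=True)` runs the command in
+# the Pod via `kubectl exec` and returns its exit code, raising unless check=False.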

 class BasicRayTestCase(unittest.TestCase):
-    cluster_template_file = CONST.REPO_ROOT.joinpath("tests/config/ray-cluster.mini.yaml.template")
+    """Test the basic functionality of RayCluster by executing simple jobs."""
+    cluster_template = CONST.REPO_ROOT.joinpath("tests/config/ray-cluster.mini.yaml.template")

     @classmethod
     def setUpClass(cls):
@@ -51,46 +53,26 @@ def setUpClass(cls):
         }
         operator_manager = OperatorManager(image_dict)
         operator_manager.prepare_operator()
-        utils.create_ray_cluster(BasicRayTestCase.cluster_template_file,
-                                 ray_version, ray_image)
+        utils.create_ray_cluster(BasicRayTestCase.cluster_template, ray_version, ray_image)

     def test_simple_code(self):
         """
         Run a simple example in the head Pod to test the basic functionality of the Ray cluster.
         The example is from https://docs.ray.io/en/latest/ray-core/walkthrough.html#running-a-task.
         """
         cluster_namespace = "default"
-        k8s_v1_api = K8S_CLUSTER_MANAGER.k8s_client_dict[CONST.K8S_V1_CLIENT_KEY]
-        headpods = k8s_v1_api.list_namespaced_pod(
-            namespace=cluster_namespace, label_selector='ray.io/node-type=head')
-        headpod_name = headpods.items[0].metadata.name
-        shell_subprocess_run(f"kubectl exec {headpod_name} -n {cluster_namespace} --" +
-            " python -c '''{}'''".format(
-                '''
-import ray
-ray.init()
-
-# Define the square task.
-@ray.remote
-def square(x):
-    return x * x
-
-# Launch four parallel square tasks.
-futures = [square.remote(i) for i in range(4)]
-
-# Check results.
-assert ray.get(futures) == [0, 1, 4, 9]
-'''
-            )
-        )
+        headpod = get_head_pod(cluster_namespace)
+        headpod_name = headpod.metadata.name
+        pod_exec_command(headpod_name, cluster_namespace, "python samples/simple_code.py")
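+        # `samples/simple_code.py` is assumed to hold the script that was previously
+        # inlined here: define a @ray.remote `square` task, launch four tasks in
+        # parallel, and assert that ray.get(futures) == [0, 1, 4, 9].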

     def test_cluster_info(self):
         """Execute "print(ray.cluster_resources())" in the head Pod."""
         EasyJobRule().assert_rule()
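+        # EasyJobRule comes from framework.prototype; judging by the docstring, it
+        # presumably submits the job to the head Pod and asserts that it succeeds.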


 class RayFTTestCase(unittest.TestCase):
-    cluster_template_file = CONST.REPO_ROOT.joinpath("tests/config/ray-cluster.ray-ft.yaml.template")
+    """Test Ray GCS fault tolerance."""
+    cluster_template = CONST.REPO_ROOT.joinpath("tests/config/ray-cluster.ray-ft.yaml.template")

     @classmethod
     def setUpClass(cls):
@@ -104,8 +86,7 @@ def setUpClass(cls):
         }
         operator_manager = OperatorManager(image_dict)
         operator_manager.prepare_operator()
-        utils.create_ray_cluster(RayFTTestCase.cluster_template_file,
-                                 ray_version, ray_image)
+        utils.create_ray_cluster(RayFTTestCase.cluster_template, ray_version, ray_image)

     @unittest.skip("Skip test_kill_head due to its flakiness.")
     def test_kill_head(self):
@@ -128,106 +109,107 @@ def test_kill_head(self):
             raise Exception(f"Nonzero return code {rtn} in test_kill_head()")

     def test_ray_serve(self):
+        """Kill the GCS process on the head Pod, then test the deployed Ray Serve model."""
         cluster_namespace = "default"
-        docker_client = docker.from_env()
-        container = docker_client.containers.run(ray_image, remove=True, detach=True, stdin_open=True, tty=True,
-                                                 network_mode='host', command=["/bin/sh"])
-        # Deploy a model with ray serve
+        headpod = get_head_pod(cluster_namespace)
+        headpod_name = headpod.metadata.name
+
+        # RAY_NAMESPACE is an abstraction in Ray. It is not a Kubernetes namespace.
         ray_namespace = ''.join(random.choices(string.ascii_lowercase, k=10))
-        logger.info(f'namespace: {ray_namespace}')
+        logger.info('Ray namespace: %s', ray_namespace)

-        utils.copy_to_container(container, 'tests/scripts', '/usr/local/', 'test_ray_serve_1.py')
-        exit_code, _ = utils.exec_run_container(container, f'python3 /usr/local/test_ray_serve_1.py {ray_namespace}', timeout_sec=180)
+        # Deploy a Ray Serve model.
+        exit_code = pod_exec_command(headpod_name, cluster_namespace,
+                                     f"python samples/test_ray_serve_1.py {ray_namespace}",
+                                     check=False)
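+        # `check=False` presumably makes pod_exec_command return the nonzero exit code
+        # instead of raising, so the test can dump cluster info below before failing.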

         if exit_code != 0:
             show_cluster_info(cluster_namespace)
-            raise Exception(f"There was an exception during the execution of test_ray_serve_1.py. The exit code is {exit_code}." +
-                "See above for command output. The output will be printed by the function exec_run_container.")
-
-        # KubeRay only allows at most 1 head pod per RayCluster instance at the same time. In addition,
-        # if we have 0 head pods at this moment, it indicates that the head pod crashes unexpectedly.
-        headpods = utils.get_pod(namespace=cluster_namespace,
-                                 label_selector='ray.io/node-type=head')
-        assert(len(headpods.items) == 1)
-        old_head_pod = headpods.items[0]
+            raise Exception(
+                f"Failed to execute test_ray_serve_1.py. The exit code is {exit_code}."
+            )
+
+        old_head_pod = get_head_pod(cluster_namespace)
         old_head_pod_name = old_head_pod.metadata.name
         restart_count = old_head_pod.status.container_statuses[0].restart_count
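+        # `restart_count` is recorded so that `wait_for_new_head` below can presumably
+        # distinguish an in-place container restart from a newly created head Pod.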

-        # Kill the gcs_server process on head node. If fate sharing is enabled, the whole head node pod
-        # will terminate.
-        exec_command = ['pkill gcs_server']
-        utils.pod_exec_command(pod_name=old_head_pod_name,
-                               namespace=cluster_namespace, exec_command=exec_command)
+        # Kill the gcs_server process on the head node. If fate sharing is enabled, the
+        # whole head node Pod will be terminated.
+        pod_exec_command(old_head_pod_name, cluster_namespace, "pkill gcs_server")

         # Wait for all Pods to become ready and running.
         utils.wait_for_new_head(old_head_pod_name, restart_count,
                                 cluster_namespace, timeout=300, retry_interval_ms=1000)

         # Try to connect to the deployed model again.
-        utils.copy_to_container(container, 'tests/scripts', '/usr/local/', 'test_ray_serve_2.py')
-        exit_code, _ = utils.exec_run_container(container, f'python3 /usr/local/test_ray_serve_2.py {ray_namespace}', timeout_sec=180)
+        headpod = get_head_pod(cluster_namespace)
+        headpod_name = headpod.metadata.name
+        exit_code = pod_exec_command(headpod_name, cluster_namespace,
+                                     f"python samples/test_ray_serve_2.py {ray_namespace}",
+                                     check=False)

         if exit_code != 0:
             show_cluster_info(cluster_namespace)
-            raise Exception(f"There was an exception during the execution of test_ray_serve_2.py. The exit code is {exit_code}." +
-                "See above for command output. The output will be printed by the function exec_run_container.")
-
-        container.stop()
-        docker_client.close()
+            raise Exception(
+                f"Failed to execute test_ray_serve_2.py. The exit code is {exit_code}."
+            )

     def test_detached_actor(self):
+        """Kill the GCS process on the head Pod, then test a detached actor."""
         cluster_namespace = "default"
-        docker_client = docker.from_env()
-        container = docker_client.containers.run(ray_image, remove=True, detach=True, stdin_open=True, tty=True,
-                                                 network_mode='host', command=["/bin/sh"])
+        headpod = get_head_pod(cluster_namespace)
+        headpod_name = headpod.metadata.name
+
+        # RAY_NAMESPACE is an abstraction in Ray. It is not a Kubernetes namespace.
         ray_namespace = ''.join(random.choices(string.ascii_lowercase, k=10))
-        logger.info(f'namespace: {ray_namespace}')
+        logger.info('Ray namespace: %s', ray_namespace)

         # Register a detached actor.
-        utils.copy_to_container(container, 'tests/scripts', '/usr/local/', 'test_detached_actor_1.py')
-        exit_code, _ = utils.exec_run_container(container, f'python3 /usr/local/test_detached_actor_1.py {ray_namespace}', timeout_sec=180)
+        exit_code = pod_exec_command(headpod_name, cluster_namespace,
+                                     f"python samples/test_detached_actor_1.py {ray_namespace}",
+                                     check=False)

         if exit_code != 0:
             show_cluster_info(cluster_namespace)
-            raise Exception(f"There was an exception during the execution of test_detached_actor_1.py. The exit code is {exit_code}." +
-                "See above for command output. The output will be printed by the function exec_run_container.")
-
-        # KubeRay only allows at most 1 head pod per RayCluster instance at the same time. In addition,
-        # if we have 0 head pods at this moment, it indicates that the head pod crashes unexpectedly.
-        headpods = utils.get_pod(namespace=cluster_namespace,
-                                 label_selector='ray.io/node-type=head')
-        assert(len(headpods.items) == 1)
-        old_head_pod = headpods.items[0]
+            raise Exception(
+                f"Failed to execute test_detached_actor_1.py. The exit code is {exit_code}."
+            )
+
+        old_head_pod = get_head_pod(cluster_namespace)
         old_head_pod_name = old_head_pod.metadata.name
         restart_count = old_head_pod.status.container_statuses[0].restart_count

-        # Kill the gcs_server process on head node. If fate sharing is enabled, the whole head node pod
-        # will terminate.
-        exec_command = ['pkill gcs_server']
-        utils.pod_exec_command(pod_name=old_head_pod_name,
-                               namespace=cluster_namespace, exec_command=exec_command)
+        # Kill the gcs_server process on the head node. If fate sharing is enabled, the
+        # whole head node Pod will be terminated.
+        pod_exec_command(old_head_pod_name, cluster_namespace, "pkill gcs_server")

         # Wait for all Pods to become ready and running.
         utils.wait_for_new_head(old_head_pod_name, restart_count,
                                 cluster_namespace, timeout=300, retry_interval_ms=1000)

         # Try to connect to the detached actor again.
-        # [Note] When all pods become running and ready, the RayCluster still needs tens of seconds to relaunch actors. Hence,
-        # `test_detached_actor_2.py` will retry until a Ray client connection succeeds.
-        utils.copy_to_container(container, 'tests/scripts', '/usr/local/', 'test_detached_actor_2.py')
-        exit_code, _ = utils.exec_run_container(container, f'python3 /usr/local/test_detached_actor_2.py {ray_namespace}', timeout_sec=180)
+        # [Note] When all Pods are running and ready, the RayCluster still needs tens of
+        # seconds to relaunch actors. Hence, `test_detached_actor_2.py` retries until a
+        # Ray client connection succeeds.
+        headpod = get_head_pod(cluster_namespace)
+        headpod_name = headpod.metadata.name
+        exit_code = pod_exec_command(headpod_name, cluster_namespace,
+                                     f"python samples/test_detached_actor_2.py {ray_namespace}",
+                                     check=False)

         if exit_code != 0:
             show_cluster_info(cluster_namespace)
-            raise Exception(f"There was an exception during the execution of test_detached_actor_2.py. The exit code is {exit_code}." +
-                "See above for command output. The output will be printed by the function exec_run_container.")
-
-        container.stop()
-        docker_client.close()
+            raise Exception(
+                f"Failed to execute test_detached_actor_2.py. The exit code is {exit_code}."
+            )

 class RayServiceTestCase(unittest.TestCase):
     """Integration tests for RayService"""
-    service_template_file = 'tests/config/ray-service.yaml.template'
+    service_template = 'tests/config/ray-service.yaml.template'

     # The previous logic for testing updates was problematic.
     # We need to test RayService updates.
@@ -247,7 +229,7 @@ def setUpClass(cls):
     def test_ray_serve_work(self):
         """Create a RayService, send a request to RayService via `curl`, and compare the result."""
         cr_event = utils.create_ray_service(
-            RayServiceTestCase.service_template_file, ray_version, ray_image)
+            RayServiceTestCase.service_template, ray_version, ray_image)
         # When Pods are READY and RUNNING, RayService still needs tens of seconds to be ready
         # for serving requests. This `sleep` function is a workaround, and should be removed
         # when https://github.com/ray-project/kuberay/pull/730 is merged.