13
13
# limitations under the License.
14
14
15
15
"""
16
- The config sub-module contains the definition of the RayJobClusterConfigV2 dataclass,
16
+ The config sub-module contains the definition of the RayJobClusterConfig dataclass,
17
17
which is used to specify resource requirements and other details when creating a
18
18
Cluster object.
19
19
"""
@@ -139,6 +139,14 @@ class RayJobClusterConfig:
139
139
A list of V1Volume objects to add to the Cluster
140
140
volume_mounts:
141
141
A list of V1VolumeMount objects to add to the Cluster
142
+ enable_gcs_ft:
143
+ A boolean indicating whether to enable GCS fault tolerance.
144
+ redis_address:
145
+ The address of the Redis server to use for GCS fault tolerance, required when enable_gcs_ft is True.
146
+ redis_password_secret:
147
+ A Kubernetes secret reference containing the Redis password, e.g. {"name": "secret-name", "key": "password-key"}
148
+ external_storage_namespace:
149
+ The storage namespace to use for GCS fault tolerance. By default, KubeRay sets it to the UID of RayCluster.
142
150
"""
143
151
144
152
head_cpu_requests : Union [int , str ] = 2
@@ -165,8 +173,33 @@ class RayJobClusterConfig:
165
173
annotations : Dict [str , str ] = field (default_factory = dict )
166
174
volumes : list [V1Volume ] = field (default_factory = list )
167
175
volume_mounts : list [V1VolumeMount ] = field (default_factory = list )
176
+ enable_gcs_ft : bool = False
177
+ redis_address : Optional [str ] = None
178
+ redis_password_secret : Optional [Dict [str , str ]] = None
179
+ external_storage_namespace : Optional [str ] = None
168
180
169
181
def __post_init__ (self ):
182
+ if self .enable_gcs_ft :
183
+ if not self .redis_address :
184
+ raise ValueError (
185
+ "redis_address must be provided when enable_gcs_ft is True"
186
+ )
187
+
188
+ if self .redis_password_secret and not isinstance (
189
+ self .redis_password_secret , dict
190
+ ):
191
+ raise ValueError (
192
+ "redis_password_secret must be a dictionary with 'name' and 'key' fields"
193
+ )
194
+
195
+ if self .redis_password_secret and (
196
+ "name" not in self .redis_password_secret
197
+ or "key" not in self .redis_password_secret
198
+ ):
199
+ raise ValueError (
200
+ "redis_password_secret must contain both 'name' and 'key' fields"
201
+ )
202
+
170
203
self ._validate_types ()
171
204
self ._memory_to_string ()
172
205
self ._validate_gpu_config (self .head_accelerators )
@@ -251,6 +284,11 @@ def build_ray_cluster_spec(self, cluster_name: str) -> Dict[str, Any]:
251
284
"workerGroupSpecs" : [self ._build_worker_group_spec (cluster_name )],
252
285
}
253
286
287
+ # Add GCS fault tolerance if enabled
288
+ if self .enable_gcs_ft :
289
+ gcs_ft_options = self ._build_gcs_ft_options ()
290
+ ray_cluster_spec ["gcsFaultToleranceOptions" ] = gcs_ft_options
291
+
254
292
return ray_cluster_spec
255
293
256
294
def _build_head_group_spec (self ) -> Dict [str , Any ]:
@@ -453,3 +491,25 @@ def _generate_volumes(self) -> list:
453
491
def _build_env_vars(self) -> list:
    """Convert ``self.envs`` into a list of ``V1EnvVar`` objects.

    Each key/value pair of ``self.envs`` becomes one ``V1EnvVar`` whose
    ``name`` is the key and whose ``value`` is the mapped value.
    """
    env_items = self.envs.items()
    return [
        V1EnvVar(name=env_name, value=env_value)
        for env_name, env_value in env_items
    ]
494
+
495
+ def _build_gcs_ft_options (self ) -> Dict [str , Any ]:
496
+ """Build GCS fault tolerance options."""
497
+ gcs_ft_options = {"redisAddress" : self .redis_address }
498
+
499
+ if (
500
+ hasattr (self , "external_storage_namespace" )
501
+ and self .external_storage_namespace
502
+ ):
503
+ gcs_ft_options ["externalStorageNamespace" ] = self .external_storage_namespace
504
+
505
+ if hasattr (self , "redis_password_secret" ) and self .redis_password_secret :
506
+ gcs_ft_options ["redisPassword" ] = {
507
+ "valueFrom" : {
508
+ "secretKeyRef" : {
509
+ "name" : self .redis_password_secret ["name" ],
510
+ "key" : self .redis_password_secret ["key" ],
511
+ }
512
+ }
513
+ }
514
+
515
+ return gcs_ft_options
0 commit comments