Skip to content

Commit 49e7d69

Browse files
committed
feat(RHOAIENG-29330):Deny RayCluster creation with Ray Version mismatches
1 parent d467810 commit 49e7d69

File tree

6 files changed

+485
-0
lines changed

6 files changed

+485
-0
lines changed

docs/sphinx/user-docs/cluster-configuration.rst

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,79 @@ requirements for creating the Ray Cluster.
4444
documentation on building a custom image
4545
`here <https://github.com/opendatahub-io/distributed-workloads/tree/main/images/runtime/examples>`__.
4646

47+
Ray Version Compatibility
48+
-------------------------
49+
50+
The CodeFlare SDK requires that the Ray version in your runtime image matches the Ray version used by the SDK itself. When you specify a custom runtime image, the SDK will automatically validate that the Ray version in the image matches this requirement.
51+
52+
Version Validation Behavior
53+
~~~~~~~~~~~~~~~~~~~~~~~~~~~
54+
55+
The SDK performs the following validation when creating a cluster:
56+
57+
1. **Compatible versions**: If the runtime image contains the same Ray version as the SDK (2.47.1 at the time of writing), the cluster will be created successfully.
58+
59+
2. **Version mismatch**: If the runtime image contains a different Ray version, cluster creation will fail with a detailed error message explaining the mismatch.
60+
61+
3. **Unknown versions**: If the SDK cannot determine the Ray version from the image name (e.g., images pinned by SHA digest, or non-semantic tags such as ``latest``), a warning will be issued but cluster creation will continue.
62+
63+
Examples
64+
~~~~~~~~
65+
66+
**Compatible image (recommended)**:
67+
68+
.. code:: python
69+
70+
# This will work - versions match
71+
cluster = Cluster(ClusterConfiguration(
72+
name='ray-example',
73+
image='quay.io/modh/ray:2.47.1-py311-cu121'
74+
))
75+
76+
**Incompatible image (will fail)**:
77+
78+
.. code:: python
79+
80+
# This will fail with a version mismatch error
81+
cluster = Cluster(ClusterConfiguration(
82+
name='ray-example',
83+
image='ray:2.46.0' # Different version!
84+
))
85+
86+
**SHA-based image (will warn)**:
87+
88+
.. code:: python
89+
90+
# This will issue a warning but continue
91+
cluster = Cluster(ClusterConfiguration(
92+
name='ray-example',
93+
image='quay.io/modh/ray@sha256:abc123...'
94+
))
95+
96+
Best Practices
97+
~~~~~~~~~~~~~~
98+
99+
- **Use versioned tags**: Always use semantic version tags (e.g., ``ray:2.47.1``) rather than ``latest`` or SHA-based references, so that the SDK can detect the Ray version.
100+
101+
- **Test compatibility**: When building custom images, test them with the CodeFlare SDK to ensure compatibility.
102+
103+
- **Check SDK version**: You can check the Ray version used by the SDK with:
104+
105+
.. code:: python
106+
107+
from codeflare_sdk.common.utils.constants import RAY_VERSION
108+
print(f"CodeFlare SDK uses Ray version: {RAY_VERSION}")
109+
110+
**Why is version matching important?**
111+
112+
Ray version mismatches can cause:
113+
114+
- Incompatible API calls between the SDK and Ray cluster
115+
- Unexpected behavior in job submission and cluster management
116+
- Potential data corruption or job failures
117+
- Difficult-to-debug runtime errors
118+
119+
47120
Ray Usage Statistics
48121
-------------------
49122

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Copyright 2024 IBM, Red Hat
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pytest
16+
from codeflare_sdk.common.utils.constants import RAY_VERSION, CUDA_RUNTIME_IMAGE
17+
18+
19+
class TestConstants:
    """Sanity checks for the values exported by the constants module."""

    def test_ray_version_is_defined(self):
        """RAY_VERSION is a string pinned to the expected Ray release."""
        version = RAY_VERSION
        assert version is not None
        assert isinstance(version, str)
        assert version == "2.47.1"

    def test_cuda_runtime_image_is_defined(self):
        """CUDA_RUNTIME_IMAGE is a string that points at the quay.io/modh/ray repository."""
        image = CUDA_RUNTIME_IMAGE
        assert image is not None
        assert isinstance(image, str)
        assert "quay.io/modh/ray" in image
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
# Copyright 2024 IBM, Red Hat
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pytest
16+
from codeflare_sdk.common.utils.validation import (
17+
extract_ray_version_from_image,
18+
validate_ray_version_compatibility,
19+
)
20+
from codeflare_sdk.common.utils.constants import RAY_VERSION
21+
22+
23+
class TestRayVersionDetection:
    """Checks for extracting the Ray version from container image references."""

    def test_extract_ray_version_standard_format(self):
        """Bare ``ray:<version>`` references yield exactly that version."""
        for version in ("2.47.1", "2.46.0", "1.13.0"):
            assert extract_ray_version_from_image(f"ray:{version}") == version

    def test_extract_ray_version_with_registry(self):
        """A registry or organisation prefix does not affect extraction."""
        images = (
            "quay.io/ray:2.47.1",
            "docker.io/rayproject/ray:2.47.1",
            "gcr.io/my-project/ray:2.47.1",
        )
        for image in images:
            assert extract_ray_version_from_image(image) == "2.47.1"

    def test_extract_ray_version_with_suffixes(self):
        """Tag suffixes after the version (python, GPU flavour) are ignored."""
        images = (
            "quay.io/modh/ray:2.47.1-py311-cu121",
            "ray:2.47.1-py311",
            "ray:2.47.1-gpu",
            "ray:2.47.1-rocm62",
        )
        for image in images:
            assert extract_ray_version_from_image(image) == "2.47.1"

    def test_extract_ray_version_complex_registry_paths(self):
        """Multi-segment registry paths still resolve to the tagged version."""
        images = (
            "quay.io/modh/ray:2.47.1-py311-cu121",
            "registry.company.com/team/ray:2.47.1",
        )
        for image in images:
            assert extract_ray_version_from_image(image) == "2.47.1"

    def test_extract_ray_version_no_version_found(self):
        """References without an extractable Ray version return None."""
        unversioned = (
            # Digest-pinned reference
            "quay.io/modh/ray@sha256:6d076aeb38ab3c34a6a2ef0f58dc667089aa15826fa08a73273c629333e12f1e",
            # Non-semantic tags
            "ray:latest",
            "ray:nightly",
            "ray:v2.47",  # Missing patch version
            # Images that are not Ray
            "python:3.11",
            "ubuntu:20.04",
            # Empty or missing reference
            "",
            None,
        )
        for image in unversioned:
            assert extract_ray_version_from_image(image) is None

    def test_extract_ray_version_edge_cases(self):
        """Unusual tags: a 'v' prefix is rejected, the first version in a tag wins."""
        # A leading 'v' is not part of the supported pattern
        v_prefixed = extract_ray_version_from_image("ray:v2.47.1")
        assert v_prefixed is None

        # Several version-like substrings: the one right after the tag separator is used
        multi_version = extract_ray_version_from_image(
            "registry/ray:2.47.1-based-on-1.0.0"
        )
        assert multi_version == "2.47.1"
76+
77+
78+
class TestRayVersionValidation:
    """Checks for the SDK / runtime-image Ray version compatibility validator."""

    def test_validate_compatible_versions(self):
        """Images tagged with the SDK's Ray version are reported as compatible."""
        matching_images = (
            # Exact match
            f"ray:{RAY_VERSION}",
            # With registry and suffixes
            f"quay.io/modh/ray:{RAY_VERSION}-py311-cu121",
        )
        for image in matching_images:
            ok, text = validate_ray_version_compatibility(image)
            assert ok is True
            assert "Ray versions match" in text

    def test_validate_incompatible_versions(self):
        """Images tagged with another Ray version are rejected with an explanation."""
        # Different version: the message names both sides of the mismatch
        ok, text = validate_ray_version_compatibility("ray:2.46.0")
        assert ok is False
        for fragment in (
            "Ray version mismatch detected",
            "CodeFlare SDK uses Ray",
            "runtime image uses Ray",
        ):
            assert fragment in text

        # Older version
        ok, text = validate_ray_version_compatibility("ray:1.13.0")
        assert ok is False
        assert "Ray version mismatch detected" in text

    def test_validate_empty_image(self):
        """No custom image (empty string or None) means the default image is used."""
        for image in ("", None):
            ok, text = validate_ray_version_compatibility(image)
            assert ok is True
            assert "Using default Ray image compatible with SDK" in text

    def test_validate_unknown_version(self):
        """Images whose version cannot be read pass validation with a warning message."""
        undetectable_images = (
            # Digest-pinned image
            "quay.io/modh/ray@sha256:6d076aeb38ab3c34a6a2ef0f58dc667089aa15826fa08a73273c629333e12f1e",
            # Custom image without a version tag
            "my-custom-ray:latest",
        )
        for image in undetectable_images:
            ok, text = validate_ray_version_compatibility(image)
            assert ok is True
            assert "Warning: Cannot determine Ray version" in text

    def test_validate_custom_sdk_version(self):
        """An explicit sdk_ray_version argument replaces the default SDK version."""
        # Image matches the supplied SDK version
        ok, text = validate_ray_version_compatibility("ray:2.46.0", "2.46.0")
        assert ok is True
        assert "Ray versions match" in text

        # Image differs from the supplied SDK version
        ok, text = validate_ray_version_compatibility("ray:2.47.1", "2.46.0")
        assert ok is False
        assert "CodeFlare SDK uses Ray 2.46.0" in text
        assert "runtime image uses Ray 2.47.1" in text

    def test_validate_message_content(self):
        """The mismatch message includes guidance on how to resolve the problem."""
        ok, text = validate_ray_version_compatibility("ray:2.46.0")
        assert ok is False

        lowered = text.lower()
        assert "compatibility issues" in lowered
        assert "unexpected behavior" in lowered
        assert "please use a runtime image" in lowered
        assert "update your sdk version" in lowered
155+
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
# Copyright 2024 IBM, Red Hat
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""
16+
Validation utilities for the CodeFlare SDK.
17+
18+
This module contains validation functions used across the SDK for ensuring
19+
configuration compatibility and correctness.
20+
"""
21+
22+
import re
23+
from typing import Optional, Tuple
24+
from .constants import RAY_VERSION
25+
26+
27+
# Compiled once at import time and tried in order. Each pattern captures the
# MAJOR.MINOR.PATCH version that immediately follows the tag separator.
_RAY_VERSION_PATTERNS = (
    # "...ray:<version>" - e.g. ray:2.47.1 or quay.io/modh/ray:2.47.1-py311.
    # Because this is an unanchored search it also covers "/ray:<version>",
    # so no separate pattern is needed for registry-prefixed images.
    re.compile(r"ray:(\d+\.\d+\.\d+)"),
    # "...ray/<path>:<version>" - a path segment ending in "ray" followed by
    # further segments before the tag, e.g. quay.io/ray/worker:2.47.1.
    re.compile(r"ray/[^:]*:(\d+\.\d+\.\d+)"),
)


def extract_ray_version_from_image(image_name: Optional[str]) -> Optional[str]:
    """
    Extract Ray version from a container image name.

    Supports various image naming patterns:
    - quay.io/modh/ray:2.47.1-py311-cu121
    - ray:2.47.1
    - some-registry/ray:2.47.1-py311
    - quay.io/modh/ray@sha256:... (falls back to None)

    Only a plain three-part numeric version directly after the ':' is
    recognised; tags such as ``latest``, ``v2.47.1`` or ``2.47`` yield None.
    Anything after the version (e.g. ``-py311``) is ignored.

    Args:
        image_name: The container image name/tag. May be None or empty.

    Returns:
        The extracted Ray version, or None if not found
    """
    if not image_name:
        return None

    for pattern in _RAY_VERSION_PATTERNS:
        match = pattern.search(image_name)
        if match:
            return match.group(1)

    # No pattern matched: the version is unknown
    return None
61+
62+
63+
def validate_ray_version_compatibility(
    image_name: Optional[str], sdk_ray_version: str = RAY_VERSION
) -> Tuple[bool, str]:
    """
    Validate that the Ray version in the runtime image matches the SDK's Ray version.

    Args:
        image_name: The container image name/tag. None or an empty string means
            no custom image was specified.
        sdk_ray_version: The Ray version used by the CodeFlare SDK

    Returns:
        tuple: (is_compatible, message)
            - is_compatible: True if versions match or cannot be determined, False if mismatch
            - message: Descriptive message about the validation result. When the
              version cannot be determined the message starts with "Warning:".
    """
    if not image_name:
        # No custom image specified, will use default - this is compatible
        return True, "Using default Ray image compatible with SDK"

    image_ray_version = extract_ray_version_from_image(image_name)

    if image_ray_version is None:
        # Cannot determine version from image name, issue a warning but allow
        return True, (
            f"Warning: Cannot determine Ray version from image '{image_name}'. "
            f"Please ensure it's compatible with Ray {sdk_ray_version}"
        )

    if image_ray_version != sdk_ray_version:
        # Version mismatch detected
        message = (
            "Ray version mismatch detected!\n"
            f"CodeFlare SDK uses Ray {sdk_ray_version}, but runtime image uses Ray {image_ray_version}.\n"
            "This mismatch can cause compatibility issues and unexpected behavior.\n"
            f"Please use a runtime image with Ray {sdk_ray_version} or update your SDK version."
        )
        return False, message

    # Versions match
    return True, f"Ray versions match: SDK and runtime image both use Ray {sdk_ray_version}"
98+

src/codeflare_sdk/ray/cluster/config.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from dataclasses import dataclass, field, fields
2424
from typing import Dict, List, Optional, Union, get_args, get_origin
2525
from kubernetes.client import V1Toleration, V1Volume, V1VolumeMount
26+
from ...common.utils.validation import validate_ray_version_compatibility
2627

2728
dir = pathlib.Path(__file__).parent.parent.resolve()
2829

@@ -182,6 +183,7 @@ def __post_init__(self):
182183
self._validate_extended_resource_requests(
183184
self.worker_extended_resource_requests
184185
)
186+
self._validate_ray_version_compatibility()
185187

186188
def _combine_extended_resource_mapping(self):
187189
if overwritten := set(self.extended_resource_mapping.keys()).intersection(
@@ -289,3 +291,15 @@ def check_type(value, expected_type):
289291
return isinstance(value, expected_type)
290292

291293
return check_type(value, expected_type)
294+
295+
def _validate_ray_version_compatibility(self):
    """
    Check the configured runtime image against the SDK's Ray version.

    Delegates to the shared validator using ``self.image``. Raises ValueError
    when the validator reports a mismatch; emits a warning when the returned
    message contains the "Warning:" marker.
    """
    compatible, details = validate_ray_version_compatibility(self.image)

    # Guard clause: a mismatch stops configuration immediately
    if not compatible:
        raise ValueError(details)

    # Compatible, but the validator flagged something worth surfacing
    if "Warning:" in details:
        warnings.warn(details)

0 commit comments

Comments
 (0)