Skip to content

Commit fef17f1

Browse files
committed
feat: return cluster error messages on wait handler
1 parent e2dcaa7 commit fef17f1

File tree

5 files changed

+85
-2
lines changed

5 files changed

+85
-2
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,11 @@
3131
- Add field `Labels` (type `*map[string]string`) to structs `LoadBalancer`, `CreateLoadBalancerPayload`, `UpdateLoadBalancerPayload`
3232
- `git`: [v0.8.0](services/git/CHANGELOG.md#v080)
3333
- **Feature:** Add support for the instance patch operation
34+
- `ske`
35+
- [v1.4.1](services/ske/CHANGELOG.md#v141)
36+
- **Feature:** Exit `CreateOrUpdateClusterWaitHandler` early when the cluster reports structured errors and is in a failure state.
37+
38+
3439

3540
## Release (2025-09-11)
3641
- `cdn`: [v1.5.0](services/cdn/CHANGELOG.md#v150)

services/ske/CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
## v1.4.1
2+
- **Feature:** Exit `CreateOrUpdateClusterWaitHandler` early when the cluster reports structured errors and is in a failure state.
3+
14
## v1.4.0
25
- **Feature:** Add new field `Kubernetes` to `Nodepool` model
36

services/ske/VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
v1.4.0
1+
v1.4.1

services/ske/wait/wait.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ const (
3333
CredentialsRotationStateCompleted = "COMPLETED"
3434
// Deprecated: InvalidArgusInstanceErrorCode is deprecated and will be removed after 14th November 2025. Use [ske.RUNTIMEERRORCODE_OBSERVABILITY_INSTANCE_NOT_FOUND] instead.
3535
InvalidArgusInstanceErrorCode = "SKE_ARGUS_INSTANCE_NOT_FOUND"
36+
37+
ClusterNoValidHostFound = "SKE_NODE_NO_VALID_HOST_FOUND"
3638
)
3739

3840
type APIClientClusterInterface interface {
@@ -42,6 +44,8 @@ type APIClientClusterInterface interface {
4244

4345
// CreateOrUpdateClusterWaitHandler will wait for cluster creation or update
4446
func CreateOrUpdateClusterWaitHandler(ctx context.Context, a APIClientClusterInterface, projectId, region, name string) *wait.AsyncActionHandler[ske.Cluster] {
47+
startTime := time.Now()
48+
4549
handler := wait.New(func() (waitFinished bool, response *ske.Cluster, err error) {
4650
s, err := a.GetClusterExecute(ctx, projectId, region, name)
4751
if err != nil {
@@ -56,6 +60,26 @@ func CreateOrUpdateClusterWaitHandler(ctx context.Context, a APIClientClusterInt
5660
return true, s, nil
5761
}
5862

63+
// If cluster is UNSPECIFIED or UNHEALTHY and has structured errors, exit early
64+
hasStructuredErrors := s.Status.Errors != nil && len(*s.Status.Errors) > 0
65+
if (state == ske.CLUSTERSTATUSSTATE_UNSPECIFIED || state == ske.CLUSTERSTATUSSTATE_UNHEALTHY) && hasStructuredErrors {
66+
for _, clusterError := range *s.Status.Errors {
67+
if clusterError.Code != nil && clusterError.Message != nil {
68+
return true, s, nil
69+
}
70+
}
71+
}
72+
73+
// Waiter has been running more than 15 minutes and cluster is still in CREATING or RECONCILING state with errors
74+
if time.Since(startTime) > 15*time.Minute &&
75+
(state == ske.CLUSTERSTATUSSTATE_CREATING || state == ske.CLUSTERSTATUSSTATE_RECONCILING) && hasStructuredErrors {
76+
for _, clusterError := range *s.Status.Errors {
77+
if clusterError.Code != nil && clusterError.Message != nil {
78+
return true, s, nil
79+
}
80+
}
81+
}
82+
5983
if state == ske.CLUSTERSTATUSSTATE_HEALTHY || state == ske.CLUSTERSTATUSSTATE_HIBERNATED {
6084
return true, s, nil
6185
}

services/ske/wait/wait_test.go

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ type apiClientClusterMocked struct {
1818
name string
1919
resourceState ske.ClusterStatusState
2020
invalidArgusInstance bool
21+
errorList *[]ske.ClusterError
2122
}
2223

2324
const testRegion = "eu01"
@@ -45,7 +46,17 @@ func (a *apiClientClusterMocked) GetClusterExecute(_ context.Context, _, _, _ st
4546
return &ske.Cluster{
4647
Name: utils.Ptr("cluster"),
4748
Status: &ske.ClusterStatus{
48-
Aggregated: &rs,
49+
Aggregated: utils.Ptr(rs),
50+
Error: func() *ske.RuntimeError {
51+
if a.invalidArgusInstance {
52+
return &ske.RuntimeError{
53+
Code: utils.Ptr(ske.RUNTIMEERRORCODE_OBSERVABILITY_INSTANCE_NOT_FOUND),
54+
Message: utils.Ptr("invalid argus instance"),
55+
}
56+
}
57+
return nil
58+
}(),
59+
Errors: a.errorList,
4960
},
5061
}, nil
5162
}
@@ -77,6 +88,7 @@ func TestCreateOrUpdateClusterWaitHandler(t *testing.T) {
7788
invalidArgusInstance bool
7889
wantErr bool
7990
wantResp bool
91+
errorList *[]ske.ClusterError
8092
}{
8193
{
8294
desc: "create_succeeded",
@@ -120,6 +132,40 @@ func TestCreateOrUpdateClusterWaitHandler(t *testing.T) {
120132
wantErr: true,
121133
wantResp: false,
122134
},
135+
{
136+
desc: "status_errors_present_state_unhealthy",
137+
getFails: false,
138+
resourceState: ske.CLUSTERSTATUSSTATE_UNHEALTHY,
139+
errorList: &[]ske.ClusterError{
140+
{
141+
Code: utils.Ptr("ERR_CODE"),
142+
Message: utils.Ptr("Error 1"),
143+
},
144+
{
145+
Code: utils.Ptr("ERR_OTHER"),
146+
Message: utils.Ptr("Error 2"),
147+
},
148+
},
149+
wantErr: false,
150+
wantResp: true,
151+
},
152+
{
153+
desc: "status_errors_present_state_unspecified",
154+
getFails: false,
155+
resourceState: ske.CLUSTERSTATUSSTATE_UNSPECIFIED,
156+
errorList: &[]ske.ClusterError{
157+
{
158+
Code: utils.Ptr("ERR_CODE"),
159+
Message: utils.Ptr("Error 1"),
160+
},
161+
{
162+
Code: utils.Ptr("ERR_OTHER"),
163+
Message: utils.Ptr("Error 2"),
164+
},
165+
},
166+
wantErr: false,
167+
wantResp: true,
168+
},
123169
}
124170
for _, tt := range tests {
125171
t.Run(tt.desc, func(t *testing.T) {
@@ -130,6 +176,7 @@ func TestCreateOrUpdateClusterWaitHandler(t *testing.T) {
130176
name: name,
131177
resourceState: tt.resourceState,
132178
invalidArgusInstance: tt.invalidArgusInstance,
179+
errorList: tt.errorList,
133180
}
134181
var wantRes *ske.Cluster
135182
rs := ske.ClusterStatusState(tt.resourceState)
@@ -147,6 +194,10 @@ func TestCreateOrUpdateClusterWaitHandler(t *testing.T) {
147194
Message: utils.Ptr("invalid argus instance"),
148195
}
149196
}
197+
198+
if tt.errorList != nil && len(*tt.errorList) > 0 {
199+
wantRes.Status.Errors = tt.errorList
200+
}
150201
}
151202

152203
handler := CreateOrUpdateClusterWaitHandler(context.Background(), apiClient, "", testRegion, name)

0 commit comments

Comments
 (0)