diff --git a/examples/simple-game-server/gameserver-scheduled-restart.yaml b/examples/simple-game-server/gameserver-scheduled-restart.yaml new file mode 100644 index 0000000000..cbe4a547a3 --- /dev/null +++ b/examples/simple-game-server/gameserver-scheduled-restart.yaml @@ -0,0 +1,42 @@ +--- +# Copyright Contributors to Agones a Series of LF Projects, LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +apiVersion: agones.dev/v1 +kind: GameServer +metadata: + name: simple-scheduled-restart +spec: + container: simple-game-server + ports: + - name: default + containerPort: 7654 + # Restart the game server container every night at 4 AM UTC. + # If the server is still in use, wait up to 6 hours (soft deadline). + # After 2 days of deferral, force-restart regardless (hard deadline). 
+ restartPolicy: + schedule: "0 4 * * *" + softDeadlineDuration: "6h" + hardDeadlineDuration: "48h" + template: + spec: + containers: + - name: simple-game-server + image: us-docker.pkg.dev/agones-images/examples/simple-game-server:0.35 + resources: + requests: + memory: 64Mi + cpu: 20m + limits: + memory: 64Mi + cpu: 20m diff --git a/install/helm/agones/templates/crds/_gameserverspecschema.yaml b/install/helm/agones/templates/crds/_gameserverspecschema.yaml index 36e5a84b79..5bc8fd96e9 100644 --- a/install/helm/agones/templates/crds/_gameserverspecschema.yaml +++ b/install/helm/agones/templates/crds/_gameserverspecschema.yaml @@ -215,7 +215,55 @@ 
properties: - Always - OnUpgrade - Never - immutableReplicas: + restartPolicy: + description: > + [Alpha, GameServerScheduledRestart feature flag] + RestartPolicy configures scheduled in-place restarts of the game + server container without deleting the Pod. This avoids expensive pod + churn while still allowing a clean container restart on a schedule. + When enabled, the RestartController watches this field and the SDK + sidecar will exit (triggering a Kubernetes container restart) when + conditions are met. + type: object + nullable: true + required: + - schedule + properties: + schedule: + type: string + minLength: 9 + maxLength: 256 + description: > + A standard 5-field cron expression defining when the restart + should be attempted. The expression is evaluated in UTC. + Example: "0 4 * * *" restarts every day at 04:00 UTC. + See https://en.wikipedia.org/wiki/Cron for format details. + softDeadlineDuration: + type: string + nullable: true + description: > + How long the RestartController will wait for the GameServer to + become idle (state=Ready, zero players) after the scheduled + trigger fires, before giving up and deferring to the next + scheduled window. Uses Go duration format, e.g. "2h" or "30m". + When omitted (default), the controller retries every reconcile + loop until the next cron window without an explicit timeout. + pattern: "^([0-9]+(h|m|s))+$" + hardDeadlineDuration: + type: string + nullable: true + description: > + Maximum total time to wait after the first soft-deadline trigger + before forcing a restart regardless of player state or game + server readiness. When the wall-clock time since + "restart-pending-since" annotation exceeds this value, the + restart is forced immediately. + Uses Go duration format, e.g. "48h". + When omitted (default), forced restarts are never performed; + the controller simply defers to the next cron window after each + soft deadline expiry. 
+ pattern: "^([0-9]+(h|m|s))+$" + immutableReplicas: type: integer title: Immutable count of Pods to a GameServer. Always 1. (Implementation detail of implementing the Scale subresource.) default: 1 diff --git a/install/helm/agones/templates/crds/_gameserverstatus.yaml b/install/helm/agones/templates/crds/_gameserverstatus.yaml index 606555404c..cadbb8fee4 100644 --- a/install/helm/agones/templates/crds/_gameserverstatus.yaml +++ b/install/helm/agones/templates/crds/_gameserverstatus.yaml @@ -124,6 +124,17 @@ status: - Always - OnUpgrade - Never + nextRestartTime: + description: > + [Alpha, GameServerScheduledRestart feature flag] + The wall-clock time at which the next scheduled in-place + restart will be attempted. Only populated when + spec.restartPolicy is configured. + Written by the RestartController; cleared and re-set after + each restart cycle. + type: string + format: date-time + nullable: true immutableReplicas: type: integer title: Immutable count of Pods to a GameServer. Always 1. (Implementation detail of implementing the Scale subresource.) diff --git a/install/yaml/install.yaml b/install/yaml/install.yaml index 81c2b12967..435154cb49 100644 --- a/install/yaml/install.yaml +++ b/install/yaml/install.yaml @@ -6560,7 +6560,55 @@ spec: - Always - OnUpgrade - Never - immutableReplicas: + restartPolicy: + description: > + [Alpha, GameServerScheduledRestart feature flag] + RestartPolicy configures scheduled in-place restarts of the game + server container without deleting the Pod. This avoids expensive pod + churn while still allowing a clean container restart on a schedule. + When enabled, the RestartController watches this field and the SDK + sidecar will exit (triggering a Kubernetes container restart) when + conditions are met. 
+ type: object + nullable: true + required: + - schedule + properties: + schedule: + type: string + minLength: 9 + maxLength: 256 + description: > + A standard 5-field cron expression defining when the restart + should be attempted. The expression is evaluated in UTC. + Example: "0 4 * * *" restarts every day at 04:00 UTC. + See https://en.wikipedia.org/wiki/Cron for format details. + softDeadlineDuration: + type: string + nullable: true + description: > + How long the RestartController will wait for the GameServer to + become idle (state=Ready, zero players) after the scheduled + trigger fires, before giving up and deferring to the next + scheduled window. Uses Go duration format, e.g. "2h" or "30m". + When omitted (default), the controller retries every reconcile + loop until the next cron window without an explicit timeout. + pattern: "^([0-9]+(h|m|s))+$" + hardDeadlineDuration: + type: string + nullable: true + description: > + Maximum total time to wait after the first soft-deadline trigger + before forcing a restart regardless of player state or game + server readiness. When the wall-clock time since + "restart-pending-since" annotation exceeds this value, the + restart is forced immediately. + Uses Go duration format, e.g. "48h". + When omitted (default), forced restarts are never performed; + the controller simply defers to the next cron window after each + soft deadline expiry. + pattern: "^([0-9]+(h|m|s))+$" + immutableReplicas: type: integer title: Immutable count of Pods to a GameServer. Always 1. (Implementation detail of implementing the Scale subresource.) default: 1 @@ -13532,7 +13580,55 @@ spec: - Always - OnUpgrade - Never - immutableReplicas: + restartPolicy: + description: > + [Alpha, GameServerScheduledRestart feature flag] + RestartPolicy configures scheduled in-place restarts of the game + server container without deleting the Pod. This avoids expensive pod + churn while still allowing a clean container restart on a schedule. 
+ When enabled, the RestartController watches this field and the SDK + sidecar will exit (triggering a Kubernetes container restart) when + conditions are met. + type: object + nullable: true + required: + - schedule + properties: + schedule: + type: string + minLength: 9 + maxLength: 256 + description: > + A standard 5-field cron expression defining when the restart + should be attempted. The expression is evaluated in UTC. + Example: "0 4 * * *" restarts every day at 04:00 UTC. + See https://en.wikipedia.org/wiki/Cron for format details. + softDeadlineDuration: + type: string + nullable: true + description: > + How long the RestartController will wait for the GameServer to + become idle (state=Ready, zero players) after the scheduled + trigger fires, before giving up and deferring to the next + scheduled window. Uses Go duration format, e.g. "2h" or "30m". + When omitted (default), the controller retries every reconcile + loop until the next cron window without an explicit timeout. + pattern: "^([0-9]+(h|m|s))+$" + hardDeadlineDuration: + type: string + nullable: true + description: > + Maximum total time to wait after the first soft-deadline trigger + before forcing a restart regardless of player state or game + server readiness. When the wall-clock time since + "restart-pending-since" annotation exceeds this value, the + restart is forced immediately. + Uses Go duration format, e.g. "48h". + When omitted (default), forced restarts are never performed; + the controller simply defers to the next cron window after each + soft deadline expiry. + pattern: "^([0-9]+(h|m|s))+$" + immutableReplicas: type: integer title: Immutable count of Pods to a GameServer. Always 1. (Implementation detail of implementing the Scale subresource.) 
default: 1 @@ -13648,6 +13744,17 @@ spec: - Always - OnUpgrade - Never + nextRestartTime: + description: > + [Alpha, GameServerScheduledRestart feature flag] + The wall-clock time at which the next scheduled in-place + restart will be attempted. Only populated when + spec.restartPolicy is configured. + Written by the RestartController; cleared and re-set after + each restart cycle. + type: string + format: date-time + nullable: true immutableReplicas: type: integer title: Immutable count of Pods to a GameServer. Always 1. (Implementation detail of implementing the Scale subresource.) @@ -20104,7 +20211,55 @@ spec: - Always - OnUpgrade - Never - immutableReplicas: + restartPolicy: + description: > + [Alpha, GameServerScheduledRestart feature flag] + RestartPolicy configures scheduled in-place restarts of the game + server container without deleting the Pod. This avoids expensive pod + churn while still allowing a clean container restart on a schedule. + When enabled, the RestartController watches this field and the SDK + sidecar will exit (triggering a Kubernetes container restart) when + conditions are met. + type: object + nullable: true + required: + - schedule + properties: + schedule: + type: string + minLength: 9 + maxLength: 256 + description: > + A standard 5-field cron expression defining when the restart + should be attempted. The expression is evaluated in UTC. + Example: "0 4 * * *" restarts every day at 04:00 UTC. + See https://en.wikipedia.org/wiki/Cron for format details. + softDeadlineDuration: + type: string + nullable: true + description: > + How long the RestartController will wait for the GameServer to + become idle (state=Ready, zero players) after the scheduled + trigger fires, before giving up and deferring to the next + scheduled window. Uses Go duration format, e.g. "2h" or "30m". + When omitted (default), the controller retries every reconcile + loop until the next cron window without an explicit timeout. 
+ pattern: "^([0-9]+(h|m|s))+$" + hardDeadlineDuration: + type: string + nullable: true + description: > + Maximum total time to wait after the first soft-deadline trigger + before forcing a restart regardless of player state or game + server readiness. When the wall-clock time since + "restart-pending-since" annotation exceeds this value, the + restart is forced immediately. + Uses Go duration format, e.g. "48h". + When omitted (default), forced restarts are never performed; + the controller simply defers to the next cron window after each + soft deadline expiry. + pattern: "^([0-9]+(h|m|s))+$" + immutableReplicas: type: integer title: Immutable count of Pods to a GameServer. Always 1. (Implementation detail of implementing the Scale subresource.) default: 1 diff --git a/pkg/apis/agones/v1/gameserver.go b/pkg/apis/agones/v1/gameserver.go index fc94ec99d2..47740d4c1d 100644 --- a/pkg/apis/agones/v1/gameserver.go +++ b/pkg/apis/agones/v1/gameserver.go @@ -26,6 +26,7 @@ import ( "agones.dev/agones/pkg/apis/agones" "agones.dev/agones/pkg/util/runtime" "github.com/pkg/errors" + cron "github.com/robfig/cron/v3" "gomodules.xyz/jsonpatch/v2" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" @@ -159,6 +160,12 @@ const ( GameServerErroredAtAnnotation = agones.GroupName + "/errored-at" // FinalizerName is the domain name and finalizer path used to manage garbage collection of the GameServer. FinalizerName = agones.GroupName + "/controller" + // GameServerRestartPendingSinceAnnotation records the RFC3339 time at which the + // soft-deadline restart was first triggered but deferred (server was busy). + GameServerRestartPendingSinceAnnotation = agones.GroupName + "/restart-pending-since" + // GameServerNextRestartAnnotation records the RFC3339 time for the next scheduled + // restart trigger. Written by the restart controller after each sync. + GameServerNextRestartAnnotation = agones.GroupName + "/next-restart" // NodePodIP identifies an IP address from a pod. 
NodePodIP corev1.NodeAddressType = "PodIP" @@ -248,6 +255,31 @@ type GameServerSpec struct { // +optional Eviction *Eviction `json:"eviction,omitempty"` // immutableReplicas is present in gameservers.agones.dev but omitted here (it's always 1). + + // RestartPolicy configures scheduled in-place restart of the game server container. + // Alpha feature; requires the GameServerScheduledRestart feature flag. + // +optional + RestartPolicy *RestartPolicy `json:"restartPolicy,omitempty"` +} + +// RestartPolicy defines the scheduled in-place restart behaviour for a GameServer. +// The restart does not delete the Pod; only the game server container is restarted. +type RestartPolicy struct { + // Schedule is a standard 5-field cron expression that defines when the restart is + // desired, e.g. "0 4 * * *" = every day at 04:00 UTC. Descriptors such as "@daily" are rejected. + Schedule string `json:"schedule"` + + // SoftDeadlineDuration is how long Agones will wait for the GameServer to become + // idle (not Allocated, zero players) before giving up and trying again at the next + // scheduled time. Defaults to 0 (retry indefinitely until next schedule). + // +optional + SoftDeadlineDuration *metav1.Duration `json:"softDeadlineDuration,omitempty"` + + // HardDeadlineDuration is the maximum time after the first soft-deadline trigger + // that Agones will wait before force-killing the game server, even if it still has + // players. When unset, force-kill is never performed. + // +optional + HardDeadlineDuration *metav1.Duration `json:"hardDeadlineDuration,omitempty"` } // PlayersSpec tracks the initial player capacity @@ -338,6 +370,11 @@ type GameServerStatus struct { // +optional Eviction *Eviction `json:"eviction,omitempty"` // immutableReplicas is present in gameservers.agones.dev but omitted here (it's always 1). + + // NextRestartTime is the wall-clock time when the next scheduled restart will be + // attempted. Only populated when spec.restartPolicy is set. 
+ // +optional + NextRestartTime *metav1.Time `json:"nextRestartTime,omitempty"` } // GameServerStatusPort shows the port that was allocated to a @@ -621,6 +658,28 @@ func (gss *GameServerSpec) Validate(apiHooks APIHooks, devAddress string, fldPat allErrs = append(allErrs, apiHooks.ValidateGameServerSpec(gss, fldPath)...) allErrs = append(allErrs, validateObjectMeta(&gss.Template.ObjectMeta, fldPath.Child("template", "metadata"))...) + + if rp := gss.RestartPolicy; rp != nil { + rpPath := fldPath.Child("restartPolicy") + // Parse without cron.Descriptor so that "@midnight" / "@every 1s" shorthands are rejected: + // the documented contract for schedule is a strict 5-field expression. + scheduleParser := cron.NewParser(cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow) + if rp.Schedule == "" { + allErrs = append(allErrs, field.Required(rpPath.Child("schedule"), "schedule must be a valid cron expression")) + } else if _, err := scheduleParser.Parse(rp.Schedule); err != nil { + allErrs = append(allErrs, field.Invalid(rpPath.Child("schedule"), rp.Schedule, + fmt.Sprintf("must be a valid 5-field cron expression: %v", err))) + } + if rp.SoftDeadlineDuration != nil && rp.SoftDeadlineDuration.Duration < 0 { + allErrs = append(allErrs, field.Invalid(rpPath.Child("softDeadlineDuration"), + rp.SoftDeadlineDuration, "must not be negative")) + } + if rp.HardDeadlineDuration != nil && rp.HardDeadlineDuration.Duration <= 0 { + allErrs = append(allErrs, field.Invalid(rpPath.Child("hardDeadlineDuration"), + rp.HardDeadlineDuration, "must be a positive duration")) + } + } + return allErrs } diff --git a/pkg/apis/agones/v1/zz_generated.deepcopy.go b/pkg/apis/agones/v1/zz_generated.deepcopy.go index eb2da42788..dfc07d1f33 100644 --- a/pkg/apis/agones/v1/zz_generated.deepcopy.go +++ b/pkg/apis/agones/v1/zz_generated.deepcopy.go @@ -1,7 +1,7 @@ //go:build !ignore_autogenerated // +build !ignore_autogenerated -// Copyright Contributors to Agones a Series of LF Projects, LLC. 
+// Copyright 2024 Google LLC All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -23,6 +23,7 @@ package v1 import ( corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" ) @@ -502,6 +503,11 @@ func (in *GameServerSpec) DeepCopyInto(out *GameServerSpec) { *out = new(Eviction) **out = **in } + if in.RestartPolicy != nil { + in, out := &in.RestartPolicy, &out.RestartPolicy + *out = new(RestartPolicy) + (*in).DeepCopyInto(*out) + } return } @@ -556,6 +562,10 @@ func (in *GameServerStatus) DeepCopyInto(out *GameServerStatus) { *out = new(Eviction) **out = **in } + if in.NextRestartTime != nil { + in, out := &in.NextRestartTime, &out.NextRestartTime + *out = (*in).DeepCopy() + } return } @@ -693,6 +703,32 @@ func (in *Priority) DeepCopy() *Priority { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RestartPolicy) DeepCopyInto(out *RestartPolicy) { + *out = *in + if in.SoftDeadlineDuration != nil { + in, out := &in.SoftDeadlineDuration, &out.SoftDeadlineDuration + *out = new(metav1.Duration) + **out = **in + } + if in.HardDeadlineDuration != nil { + in, out := &in.HardDeadlineDuration, &out.HardDeadlineDuration + *out = new(metav1.Duration) + **out = **in + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RestartPolicy. +func (in *RestartPolicy) DeepCopy() *RestartPolicy { + if in == nil { + return nil + } + out := new(RestartPolicy) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *SdkServer) DeepCopyInto(out *SdkServer) { *out = *in diff --git a/pkg/client/applyconfiguration/agones/v1/gameserverspec.go b/pkg/client/applyconfiguration/agones/v1/gameserverspec.go index af275c374f..ceb0718c92 100644 --- a/pkg/client/applyconfiguration/agones/v1/gameserverspec.go +++ b/pkg/client/applyconfiguration/agones/v1/gameserverspec.go @@ -1,4 +1,4 @@ -// Copyright Contributors to Agones a Series of LF Projects, LLC. +// Copyright 2024 Google LLC All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -26,16 +26,17 @@ import ( // GameServerSpecApplyConfiguration represents a declarative configuration of the GameServerSpec type for use // with apply. type GameServerSpecApplyConfiguration struct { - Container *string `json:"container,omitempty"` - Ports []GameServerPortApplyConfiguration `json:"ports,omitempty"` - Health *HealthApplyConfiguration `json:"health,omitempty"` - Scheduling *apis.SchedulingStrategy `json:"scheduling,omitempty"` - SdkServer *SdkServerApplyConfiguration `json:"sdkServer,omitempty"` - Template *corev1.PodTemplateSpec `json:"template,omitempty"` - Players *PlayersSpecApplyConfiguration `json:"players,omitempty"` - Counters map[string]CounterStatusApplyConfiguration `json:"counters,omitempty"` - Lists map[string]ListStatusApplyConfiguration `json:"lists,omitempty"` - Eviction *EvictionApplyConfiguration `json:"eviction,omitempty"` + Container *string `json:"container,omitempty"` + Ports []GameServerPortApplyConfiguration `json:"ports,omitempty"` + Health *HealthApplyConfiguration `json:"health,omitempty"` + Scheduling *apis.SchedulingStrategy `json:"scheduling,omitempty"` + SdkServer *SdkServerApplyConfiguration `json:"sdkServer,omitempty"` + Template *corev1.PodTemplateSpec `json:"template,omitempty"` + Players *PlayersSpecApplyConfiguration `json:"players,omitempty"` + Counters 
map[string]CounterStatusApplyConfiguration `json:"counters,omitempty"` + Lists map[string]ListStatusApplyConfiguration `json:"lists,omitempty"` + Eviction *EvictionApplyConfiguration `json:"eviction,omitempty"` + RestartPolicy *RestartPolicyApplyConfiguration `json:"restartPolicy,omitempty"` } // GameServerSpecApplyConfiguration constructs a declarative configuration of the GameServerSpec type for use with @@ -140,3 +141,11 @@ func (b *GameServerSpecApplyConfiguration) WithEviction(value *EvictionApplyConf b.Eviction = value return b } + +// WithRestartPolicy sets the RestartPolicy field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the RestartPolicy field is set to the value of the last call. +func (b *GameServerSpecApplyConfiguration) WithRestartPolicy(value *RestartPolicyApplyConfiguration) *GameServerSpecApplyConfiguration { + b.RestartPolicy = value + return b +} diff --git a/pkg/client/applyconfiguration/agones/v1/gameserverstatus.go b/pkg/client/applyconfiguration/agones/v1/gameserverstatus.go index 0ff19a30a4..0e2095b0e0 100644 --- a/pkg/client/applyconfiguration/agones/v1/gameserverstatus.go +++ b/pkg/client/applyconfiguration/agones/v1/gameserverstatus.go @@ -1,4 +1,4 @@ -// Copyright Contributors to Agones a Series of LF Projects, LLC. +// Copyright 2024 Google LLC All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -27,16 +27,17 @@ import ( // GameServerStatusApplyConfiguration represents a declarative configuration of the GameServerStatus type for use // with apply. 
type GameServerStatusApplyConfiguration struct { - State *agonesv1.GameServerState `json:"state,omitempty"` - Ports []GameServerStatusPortApplyConfiguration `json:"ports,omitempty"` - Address *string `json:"address,omitempty"` - Addresses []corev1.NodeAddress `json:"addresses,omitempty"` - NodeName *string `json:"nodeName,omitempty"` - ReservedUntil *metav1.Time `json:"reservedUntil,omitempty"` - Players *PlayerStatusApplyConfiguration `json:"players,omitempty"` - Counters map[string]CounterStatusApplyConfiguration `json:"counters,omitempty"` - Lists map[string]ListStatusApplyConfiguration `json:"lists,omitempty"` - Eviction *EvictionApplyConfiguration `json:"eviction,omitempty"` + State *agonesv1.GameServerState `json:"state,omitempty"` + Ports []GameServerStatusPortApplyConfiguration `json:"ports,omitempty"` + Address *string `json:"address,omitempty"` + Addresses []corev1.NodeAddress `json:"addresses,omitempty"` + NodeName *string `json:"nodeName,omitempty"` + ReservedUntil *metav1.Time `json:"reservedUntil,omitempty"` + Players *PlayerStatusApplyConfiguration `json:"players,omitempty"` + Counters map[string]CounterStatusApplyConfiguration `json:"counters,omitempty"` + Lists map[string]ListStatusApplyConfiguration `json:"lists,omitempty"` + Eviction *EvictionApplyConfiguration `json:"eviction,omitempty"` + NextRestartTime *metav1.Time `json:"nextRestartTime,omitempty"` } // GameServerStatusApplyConfiguration constructs a declarative configuration of the GameServerStatus type for use with @@ -143,3 +144,11 @@ func (b *GameServerStatusApplyConfiguration) WithEviction(value *EvictionApplyCo b.Eviction = value return b } + +// WithNextRestartTime sets the NextRestartTime field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the NextRestartTime field is set to the value of the last call. 
+func (b *GameServerStatusApplyConfiguration) WithNextRestartTime(value metav1.Time) *GameServerStatusApplyConfiguration { + b.NextRestartTime = &value + return b +} diff --git a/pkg/client/applyconfiguration/agones/v1/restartpolicy.go b/pkg/client/applyconfiguration/agones/v1/restartpolicy.go new file mode 100644 index 0000000000..f65f10af71 --- /dev/null +++ b/pkg/client/applyconfiguration/agones/v1/restartpolicy.go @@ -0,0 +1,61 @@ +// Copyright 2024 Google LLC All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This code was autogenerated. Do not edit directly. + +// Code generated by applyconfiguration-gen. DO NOT EDIT. + +package v1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// RestartPolicyApplyConfiguration represents a declarative configuration of the RestartPolicy type for use +// with apply. +type RestartPolicyApplyConfiguration struct { + Schedule *string `json:"schedule,omitempty"` + SoftDeadlineDuration *metav1.Duration `json:"softDeadlineDuration,omitempty"` + HardDeadlineDuration *metav1.Duration `json:"hardDeadlineDuration,omitempty"` +} + +// RestartPolicyApplyConfiguration constructs a declarative configuration of the RestartPolicy type for use with +// apply. 
+func RestartPolicy() *RestartPolicyApplyConfiguration { + return &RestartPolicyApplyConfiguration{} +} + +// WithSchedule sets the Schedule field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Schedule field is set to the value of the last call. +func (b *RestartPolicyApplyConfiguration) WithSchedule(value string) *RestartPolicyApplyConfiguration { + b.Schedule = &value + return b +} + +// WithSoftDeadlineDuration sets the SoftDeadlineDuration field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the SoftDeadlineDuration field is set to the value of the last call. +func (b *RestartPolicyApplyConfiguration) WithSoftDeadlineDuration(value metav1.Duration) *RestartPolicyApplyConfiguration { + b.SoftDeadlineDuration = &value + return b +} + +// WithHardDeadlineDuration sets the HardDeadlineDuration field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the HardDeadlineDuration field is set to the value of the last call. +func (b *RestartPolicyApplyConfiguration) WithHardDeadlineDuration(value metav1.Duration) *RestartPolicyApplyConfiguration { + b.HardDeadlineDuration = &value + return b +} diff --git a/pkg/client/applyconfiguration/utils.go b/pkg/client/applyconfiguration/utils.go index 4ebe650b62..e161910d26 100644 --- a/pkg/client/applyconfiguration/utils.go +++ b/pkg/client/applyconfiguration/utils.go @@ -1,4 +1,4 @@ -// Copyright Contributors to Agones a Series of LF Projects, LLC. +// Copyright 2024 Google LLC All Rights Reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -82,6 +82,8 @@ func ForKind(kind schema.GroupVersionKind) interface{} { return &agonesv1.PlayerStatusApplyConfiguration{} case v1.SchemeGroupVersion.WithKind("Priority"): return &agonesv1.PriorityApplyConfiguration{} + case v1.SchemeGroupVersion.WithKind("RestartPolicy"): + return &agonesv1.RestartPolicyApplyConfiguration{} case v1.SchemeGroupVersion.WithKind("SdkServer"): return &agonesv1.SdkServerApplyConfiguration{} diff --git a/pkg/gameservers/controller.go b/pkg/gameservers/controller.go index acac9d7cb9..f86ecdd5c7 100644 --- a/pkg/gameservers/controller.go +++ b/pkg/gameservers/controller.go @@ -105,6 +105,7 @@ type Controller struct { creationWorkerQueue *workerqueue.WorkerQueue // handles creation only deletionWorkerQueue *workerqueue.WorkerQueue // handles deletion only recorder record.EventRecorder + restartController *RestartController } // NewController returns a new gameserver crd controller @@ -157,6 +158,7 @@ func NewController( migrationController: NewMigrationController(health, kubeClient, agonesClient, kubeInformerFactory, agonesInformerFactory, controllerHooks.SyncPodPortsToGameServer), missingPodController: NewMissingPodController(health, kubeClient, agonesClient, kubeInformerFactory, agonesInformerFactory), succeededController: NewSucceededController(health, kubeClient, agonesClient, kubeInformerFactory, agonesInformerFactory), + restartController: NewRestartController(health, kubeClient, agonesClient, agonesInformerFactory), } c.baseLogger = runtime.NewLoggerWithType(c) @@ -435,6 +437,12 @@ func (c *Controller) Run(ctx context.Context, workers int) error { } }() } + // Run the RestartPolicy Controller + go func() { + if err := c.restartController.Run(ctx, workers); err != nil { + c.baseLogger.WithError(err).Error("RestartController exited with error") + } + }() // start work queues var wg sync.WaitGroup 
diff --git a/pkg/gameservers/restart_controller.go b/pkg/gameservers/restart_controller.go new file mode 100644 index 0000000000..164aacc6c1 --- /dev/null +++ b/pkg/gameservers/restart_controller.go @@ -0,0 +1,369 @@ +// Copyright Contributors to Agones a Series of LF Projects, LLC. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gameservers + +import ( + "context" + "fmt" + "time" + + "agones.dev/agones/pkg/apis/agones" + agonesv1 "agones.dev/agones/pkg/apis/agones/v1" + "agones.dev/agones/pkg/client/clientset/versioned" + "agones.dev/agones/pkg/client/informers/externalversions" + listerv1 "agones.dev/agones/pkg/client/listers/agones/v1" + "agones.dev/agones/pkg/util/logfields" + "agones.dev/agones/pkg/util/runtime" + "agones.dev/agones/pkg/util/workerqueue" + "github.com/heptiolabs/healthcheck" + cron "github.com/robfig/cron/v3" + "github.com/sirupsen/logrus" + corev1 "k8s.io/api/core/v1" + k8serrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "k8s.io/apimachinery/pkg/labels" + "k8s.io/client-go/kubernetes" + + "k8s.io/client-go/kubernetes/scheme" + typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1" + "k8s.io/client-go/tools/cache" + "k8s.io/client-go/tools/record" +) + +// RestartController handles scheduled in-place restarts of game server containers. 
+// It does NOT delete Pods; it relies on the SDK sidecar (watchRestartAnnotation) +// to exit cleanly so Kubernetes restarts only the game server container. +type RestartController struct { + baseLogger *logrus.Entry + kubeClient kubernetes.Interface + agonesClient versioned.Interface + gsLister listerv1.GameServerLister + gsListerSynced cache.InformerSynced + recorder record.EventRecorder + workerqueue *workerqueue.WorkerQueue +} + +// NewRestartController returns a new RestartController wired to the provided +// informer factories. Matches the constructor pattern of all Agones controllers +// (see health_controller.go, succeeded_controller.go). +// +// Note: unlike other controllers, RestartController does not need kubeInformerFactory +// because it only watches GameServer objects (Agones CRDs), not core k8s resources. +// The parameter is intentionally omitted to avoid the revive unused-parameter lint warning. +func NewRestartController( + health healthcheck.Handler, + kubeClient kubernetes.Interface, + agonesClient versioned.Interface, + agonesInformerFactory externalversions.SharedInformerFactory, +) *RestartController { + + gameServers := agonesInformerFactory.Agones().V1().GameServers() + + c := &RestartController{ + kubeClient: kubeClient, + agonesClient: agonesClient, + gsLister: gameServers.Lister(), + gsListerSynced: gameServers.Informer().HasSynced, + } + + c.baseLogger = runtime.NewLoggerWithType(c) + + eventBroadcaster := record.NewBroadcaster() + eventBroadcaster.StartLogging(c.baseLogger.Debugf) + eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{ + Interface: kubeClient.CoreV1().Events(""), + }) + c.recorder = eventBroadcaster.NewRecorder( + scheme.Scheme, + corev1.EventSource{Component: "gameserver-restart-controller"}, + ) + + c.workerqueue = workerqueue.NewWorkerQueue( + c.syncGameServer, + c.baseLogger, + logfields.GameServerKey, + agones.GroupName+".RestartController", + ) + health.AddLivenessCheck("restart-workerqueue", 
healthcheck.Check(c.workerqueue.Healthy)) + + // Enqueue GameServers with a RestartPolicy when they are added or updated. + _, _ = gameServers.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: func(obj interface{}) { + gs, ok := obj.(*agonesv1.GameServer) + if ok && gs.Spec.RestartPolicy != nil { + c.workerqueue.Enqueue(gs) + } + }, + UpdateFunc: func(_, newObj interface{}) { + gs, ok := newObj.(*agonesv1.GameServer) + if ok && gs.Spec.RestartPolicy != nil { + c.workerqueue.Enqueue(gs) + } + }, + }) + + return c +} + +// Run starts the controller. Blocks until ctx is cancelled. +func (c *RestartController) Run(ctx context.Context, workers int) error { + c.baseLogger.Info("Starting RestartController") + defer c.baseLogger.Info("Stopping RestartController") + + if !cache.WaitForCacheSync(ctx.Done(), c.gsListerSynced) { + return fmt.Errorf("never got in sync with cache for RestartController") + } + + // Periodically re-enqueue all GameServers with a RestartPolicy so we catch + // schedule windows even when no update events fire. + go func() { + ticker := time.NewTicker(30 * time.Second) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + c.enqueueAll() + } + } + }() + + c.workerqueue.Run(ctx, workers) + return nil +} + +// enqueueAll lists ALL GameServers with a RestartPolicy and re-enqueues them. +func (c *RestartController) enqueueAll() { + gsList, err := c.gsLister.List(labels.Everything()) + if err != nil { + c.baseLogger.WithError(err).Error("Failed to list GameServers for restart re-enqueue") + return + } + for _, gs := range gsList { + if gs.Spec.RestartPolicy != nil { + c.workerqueue.Enqueue(gs) + } + } +} + +// syncGameServer is the reconcile function called by the worker queue for each +// GameServer key. It is the entry point for the restart state machine. 
+func (c *RestartController) syncGameServer(ctx context.Context, key string) error { + namespace, name, err := cache.SplitMetaNamespaceKey(key) + if err != nil { + return err + } + + gs, err := c.gsLister.GameServers(namespace).Get(name) + if k8serrors.IsNotFound(err) { + return nil // GS deleted; nothing to do + } + if err != nil { + return err + } + + // Feature gate guard — no-op if the gate is disabled. + if !runtime.FeatureEnabled(runtime.FeatureGameServerScheduledRestart) { + return nil + } + + // No RestartPolicy — most game servers; fast exit. + if gs.Spec.RestartPolicy == nil { + return nil + } + + // Terminal states never recover; skip. + if agonesv1.TerminalGameServerStates[gs.Status.State] { + return nil + } + + return c.reconcileRestart(ctx, gs) +} + +// reconcileRestart drives the full soft/hard-deadline state machine. +func (c *RestartController) reconcileRestart(ctx context.Context, gs *agonesv1.GameServer) error { + // Guard: RestartPolicy must be set (callers should check, but be defensive). + if gs.Spec.RestartPolicy == nil { + return nil + } + + // Guard: terminal states (Shutdown, Error, Unhealthy) never recover. + // This check also lives in syncGameServer, but we repeat it here so that + // tests calling reconcileRestart directly are also protected. + if agonesv1.TerminalGameServerStates[gs.Status.State] { + return nil + } + + rp := gs.Spec.RestartPolicy + now := time.Now().UTC() + + // ── Step 1: Parse the cron schedule ─────────────────────────────────────── + schedule, err := cron.ParseStandard(rp.Schedule) + if err != nil { + c.baseLogger.WithError(err).WithField("gs", gs.Name).Error("Invalid cron schedule — skipping") + return nil + } + + // ── Step 2: Determine the next scheduled trigger ────────────────────────── + // Anchor = last known "window opened at" annotation, or GS creation time. 
+ anchor := gs.CreationTimestamp.Time + if t, ok := gs.Annotations[agonesv1.GameServerNextRestartAnnotation]; ok { + if parsed, err2 := time.Parse(time.RFC3339, t); err2 == nil { + anchor = parsed + } + } + nextRestart := schedule.Next(anchor) + + // ── Step 3: Not yet time — record the upcoming window and exit ──────────── + if now.Before(nextRestart) { + return c.updateNextRestartTime(ctx, gs, nextRestart) + } + + // ── Step 4: Hard deadline check ─────────────────────────────────────────── + // If pending-since + hardDeadline < now, force restart regardless of state. + forceRestart := false + if rp.HardDeadlineDuration != nil { + if pendingSince, ok := gs.Annotations[agonesv1.GameServerRestartPendingSinceAnnotation]; ok { + if pt, err2 := time.Parse(time.RFC3339, pendingSince); err2 == nil { + if now.Sub(pt) >= rp.HardDeadlineDuration.Duration { + c.baseLogger.WithField("gs", gs.Name).Warn("Hard deadline exceeded — forcing restart") + c.recorder.Event(gs, corev1.EventTypeWarning, + "HardDeadlineExceeded", "Forcing restart: hard deadline exceeded") + forceRestart = true + } + } + } + } + + // ── Step 5: Idle check (skip when force) ────────────────────────────────── + if !forceRestart && !c.isIdle(gs) { + // 5a. Record pending-since if this is the first time we're deferring. + if _, ok := gs.Annotations[agonesv1.GameServerRestartPendingSinceAnnotation]; !ok { + return c.annotateRestartPending(ctx, gs, now) + } + + // 5b. Soft deadline: if we've been waiting longer than softDeadlineDuration, + // skip to the next cron window. 
+ if rp.SoftDeadlineDuration != nil { + if pendingSince, ok := gs.Annotations[agonesv1.GameServerRestartPendingSinceAnnotation]; ok { + if pt, err2 := time.Parse(time.RFC3339, pendingSince); err2 == nil { + if now.Sub(pt) >= rp.SoftDeadlineDuration.Duration { + c.baseLogger.WithField("gs", gs.Name).Info( + "Soft deadline passed — skipping to next cron window") + c.recorder.Event(gs, corev1.EventTypeNormal, + "SoftDeadlineExpired", "Soft deadline passed; deferring to next cron window") + return c.advanceAnchor(ctx, gs, nextRestart) + } + } + } + } + + // Still waiting for the server to become idle. + c.baseLogger.WithField("gs", gs.Name).Debug("GameServer not idle; deferring restart") + return nil + } + + // ── Step 6: Perform the in-place restart ────────────────────────────────── + return c.performInPlaceRestart(ctx, gs, nextRestart) +} + +// isIdle returns true when the game server is safe to restart: +// - state must be Ready (not Allocated, not Reserved) +// - no active players (if PlayerTracking is enabled) +func (c *RestartController) isIdle(gs *agonesv1.GameServer) bool { + if gs.Status.State != agonesv1.GameServerStateReady { + return false + } + if gs.Status.Players != nil && gs.Status.Players.Count > 0 { + return false + } + return true +} + +// performInPlaceRestart signals the game server container to restart by annotating +// the GameServer with the restart-pending-since annotation. The SDK sidecar +// (watchRestartAnnotation in sdkserver.go) watches this annotation and calls +// os.Exit(0) when the server becomes idle, triggering a Kubernetes container restart. 
+func (c *RestartController) performInPlaceRestart(
+	ctx context.Context,
+	gs *agonesv1.GameServer,
+	windowTime time.Time,
+) error {
+	logger := c.baseLogger.WithField("gs", gs.Name)
+	logger.Info("Performing in-place game server restart")
+	c.recorder.Event(
+		gs,
+		corev1.EventTypeNormal,
+		"ScheduledRestart",
+		"Restarting game server container in-place per schedule",
+	)
+
+	// Moving the anchor past windowTime makes the next sync look at the
+	// following cron window; the same Update drops the pending annotation.
+	return c.advanceAnchor(ctx, gs, windowTime)
+}
+
+// updateNextRestartTime publishes the computed trigger time t on a copy of the
+// GameServer — both as Status.NextRestartTime and, formatted as RFC 3339 UTC,
+// as the next-restart annotation — and sends it with one Update call.
+// It returns the error from that Update.
+func (c *RestartController) updateNextRestartTime(
+	ctx context.Context,
+	gs *agonesv1.GameServer,
+	t time.Time,
+) error {
+	updated := gs.DeepCopy()
+
+	next := metav1.NewTime(t)
+	updated.Status.NextRestartTime = &next
+
+	if updated.Annotations == nil {
+		updated.Annotations = make(map[string]string)
+	}
+	updated.Annotations[agonesv1.GameServerNextRestartAnnotation] = t.UTC().Format(time.RFC3339)
+
+	if _, err := c.agonesClient.AgonesV1().GameServers(gs.Namespace).Update(ctx, updated, metav1.UpdateOptions{}); err != nil {
+		return err
+	}
+	return nil
+}
+
+// annotateRestartPending stamps the GameServer with the time at which an open
+// restart window was first deferred because the server was not idle. The
+// soft and hard deadlines are measured from this timestamp.
+func (c *RestartController) annotateRestartPending( + ctx context.Context, + gs *agonesv1.GameServer, + t time.Time, +) error { + gsCopy := gs.DeepCopy() + if gsCopy.Annotations == nil { + gsCopy.Annotations = map[string]string{} + } + gsCopy.Annotations[agonesv1.GameServerRestartPendingSinceAnnotation] = t.UTC().Format(time.RFC3339) + _, err := c.agonesClient.AgonesV1().GameServers(gs.Namespace).Update(ctx, gsCopy, metav1.UpdateOptions{}) + return err +} + +// advanceAnchor moves the next-restart annotation past the current window so +// the following sync computes the next cron trigger, and clears the pending +// annotation (whether we restarted or gave up on this window). +func (c *RestartController) advanceAnchor( + ctx context.Context, + gs *agonesv1.GameServer, + windowTime time.Time, +) error { + gsCopy := gs.DeepCopy() + if gsCopy.Annotations == nil { + gsCopy.Annotations = map[string]string{} + } + // Advance by 1 second past the window so the next sync picks the FOLLOWING window. + gsCopy.Annotations[agonesv1.GameServerNextRestartAnnotation] = + windowTime.Add(time.Second).UTC().Format(time.RFC3339) + delete(gsCopy.Annotations, agonesv1.GameServerRestartPendingSinceAnnotation) + + _, err := c.agonesClient.AgonesV1().GameServers(gs.Namespace).Update(ctx, gsCopy, metav1.UpdateOptions{}) + return err +} diff --git a/pkg/gameservers/restart_controller_test.go b/pkg/gameservers/restart_controller_test.go new file mode 100644 index 0000000000..638d2a0647 --- /dev/null +++ b/pkg/gameservers/restart_controller_test.go @@ -0,0 +1,370 @@ +// Copyright Contributors to Agones a Series of LF Projects, LLC. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gameservers + +import ( + "context" + "testing" + "time" + + "agones.dev/agones/pkg/apis" + agonesv1 "agones.dev/agones/pkg/apis/agones/v1" + "agones.dev/agones/pkg/client/clientset/versioned/fake" + "agones.dev/agones/pkg/client/informers/externalversions" + "agones.dev/agones/pkg/util/runtime" + "github.com/heptiolabs/healthcheck" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/validation/field" + k8sfake "k8s.io/client-go/kubernetes/fake" + k8stesting "k8s.io/client-go/testing" +) + +type noopAPIHooks struct{} + +func (noopAPIHooks) ValidateGameServerSpec(_ *agonesv1.GameServerSpec, _ *field.Path) field.ErrorList { + return field.ErrorList{} +} +func (noopAPIHooks) ValidateScheduling(_ apis.SchedulingStrategy, _ *field.Path) field.ErrorList { + return field.ErrorList{} +} +func (noopAPIHooks) MutateGameServerPod(_ *agonesv1.GameServerSpec, _ *corev1.Pod) error { return nil } +func (noopAPIHooks) SetEviction(_ *agonesv1.Eviction, _ *corev1.Pod) error { return nil } + +const ( + testNamespace = "default" + testGSName = "test-game-server" + everyMinuteCron = "* * * * *" // window always open when anchor is in past + neverCron = "0 0 1 1 *" // Jan 1st only — window never opens in tests +) + +func enableRestartFeatureGate(t *testing.T) { + t.Helper() + err := runtime.ParseFeatures(string(runtime.FeatureGameServerScheduledRestart) + "=true") + require.NoError(t, err, + "Failed to enable %s — did you 
add it to featureDefaults in features.go?", + runtime.FeatureGameServerScheduledRestart) + t.Cleanup(func() { + _ = runtime.ParseFeatures(string(runtime.FeatureGameServerScheduledRestart) + "=false") + }) +} + +func newTestRestartController(t *testing.T) ( + *RestartController, + *fake.Clientset, + externalversions.SharedInformerFactory, +) { + t.Helper() + enableRestartFeatureGate(t) + + fakeAgonesClient := fake.NewSimpleClientset() + fakeKubeClient := k8sfake.NewSimpleClientset() + agonesInformerFactory := externalversions.NewSharedInformerFactory(fakeAgonesClient, 0) + + c := NewRestartController( + healthcheck.NewHandler(), + fakeKubeClient, + fakeAgonesClient, + agonesInformerFactory, + ) + return c, fakeAgonesClient, agonesInformerFactory +} + +func newReadyGS(rp *agonesv1.RestartPolicy) *agonesv1.GameServer { + return &agonesv1.GameServer{ + ObjectMeta: metav1.ObjectMeta{ + Name: testGSName, + Namespace: testNamespace, + CreationTimestamp: metav1.Now(), + Annotations: map[string]string{}, + }, + Spec: agonesv1.GameServerSpec{ + RestartPolicy: rp, + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + {Name: "game-server", Image: "us-docker.pkg.dev/agones-images/examples/simple-game-server:0.35"}, + }, + }, + }, + }, + Status: agonesv1.GameServerStatus{ + State: agonesv1.GameServerStateReady, + }, + } +} + +func seedLister(t *testing.T, factory externalversions.SharedInformerFactory, gs *agonesv1.GameServer) { + t.Helper() + err := factory.Agones().V1().GameServers().Informer().GetStore().Add(gs) + require.NoError(t, err, "failed to seed GS into informer store") +} + +func runReconcile( + t *testing.T, + c *RestartController, + fakeClient *fake.Clientset, + factory externalversions.SharedInformerFactory, + gs *agonesv1.GameServer, +) *agonesv1.GameServer { + t.Helper() + + seedLister(t, factory, gs) + _, err := fakeClient.AgonesV1().GameServers(testNamespace).Create( + context.Background(), gs, metav1.CreateOptions{}, 
+ ) + require.NoError(t, err) + fakeClient.ClearActions() // only count actions from reconcileRestart + + require.NoError(t, c.reconcileRestart(context.Background(), gs)) + + return lastUpdateGS(t, fakeClient) +} + +func lastUpdateGS(t *testing.T, fakeClient *fake.Clientset) *agonesv1.GameServer { + t.Helper() + actions := fakeClient.Actions() + for i := len(actions) - 1; i >= 0; i-- { + a := actions[i] + if a.GetVerb() != "update" || a.GetResource().Resource != "gameservers" { + continue + } + if ua, ok := a.(k8stesting.UpdateAction); ok { + if gs, ok2 := ua.GetObject().(*agonesv1.GameServer); ok2 { + return gs + } + } + } + return nil +} + +func TestNoRestartBeforeWindow(t *testing.T) { + c, fakeClient, factory := newTestRestartController(t) + + gs := newReadyGS(&agonesv1.RestartPolicy{Schedule: neverCron}) + gs.CreationTimestamp = metav1.Now() // anchor=now → nextRestart=future Jan 1st + + updated := runReconcile(t, c, fakeClient, factory, gs) + require.NotNil(t, updated, "expected one Update call to write next-restart annotation") + + _, pendingSet := updated.Annotations[agonesv1.GameServerRestartPendingSinceAnnotation] + assert.False(t, pendingSet, "restart-pending-since must NOT be set before the window opens") + + _, nextSet := updated.Annotations[agonesv1.GameServerNextRestartAnnotation] + assert.True(t, nextSet, "next-restart annotation must be written") +} + +func TestRestartWhenIdle(t *testing.T) { + c, fakeClient, factory := newTestRestartController(t) + + gs := newReadyGS(&agonesv1.RestartPolicy{Schedule: everyMinuteCron}) + pastAnchor := time.Now().UTC().Add(-2 * time.Minute) + gs.Annotations[agonesv1.GameServerNextRestartAnnotation] = pastAnchor.Format(time.RFC3339) + gs.Status.State = agonesv1.GameServerStateReady + gs.Status.Players = nil + + updated := runReconcile(t, c, fakeClient, factory, gs) + require.NotNil(t, updated, "expected an Update call (advanceAnchor after idle restart)") + + nextAnnotation, ok := 
updated.Annotations[agonesv1.GameServerNextRestartAnnotation] + assert.True(t, ok, "next-restart annotation must still be present after restart") + + advancedTime, err := time.Parse(time.RFC3339, nextAnnotation) + require.NoError(t, err) + assert.True(t, advancedTime.After(pastAnchor), "next-restart must be advanced past old window") + + _, stillPending := updated.Annotations[agonesv1.GameServerRestartPendingSinceAnnotation] + assert.False(t, stillPending, "restart-pending-since must be cleared after successful restart") +} + +func TestDeferWhenAllocated(t *testing.T) { + c, fakeClient, factory := newTestRestartController(t) + + gs := newReadyGS(&agonesv1.RestartPolicy{Schedule: everyMinuteCron}) + pastAnchor := time.Now().UTC().Add(-2 * time.Minute) + gs.Annotations[agonesv1.GameServerNextRestartAnnotation] = pastAnchor.Format(time.RFC3339) + gs.Status.State = agonesv1.GameServerStateAllocated + + updated := runReconcile(t, c, fakeClient, factory, gs) + require.NotNil(t, updated, "expected an Update call to annotate restart-pending-since") + + pendingSince, ok := updated.Annotations[agonesv1.GameServerRestartPendingSinceAnnotation] + assert.True(t, ok, "restart-pending-since must be set when restart is deferred") + + pendingTime, err := time.Parse(time.RFC3339, pendingSince) + require.NoError(t, err) + assert.WithinDuration(t, time.Now().UTC(), pendingTime, 5*time.Second, + "restart-pending-since must record approximately the current time") +} + +func assertAnchorAdvanced(t *testing.T, updated *agonesv1.GameServer, windowOpenedAt time.Time, msg string) { + t.Helper() + require.NotNil(t, updated, msg) + + nextAnnotation, ok := updated.Annotations[agonesv1.GameServerNextRestartAnnotation] + assert.True(t, ok, "next-restart annotation must be present") + advancedTime, err := time.Parse(time.RFC3339, nextAnnotation) + require.NoError(t, err) + assert.True(t, advancedTime.After(windowOpenedAt), "anchor must advance past old window") + + _, stillPending := 
updated.Annotations[agonesv1.GameServerRestartPendingSinceAnnotation] + assert.False(t, stillPending, "restart-pending-since must be cleared") +} + +func TestSoftDeadlineSkip(t *testing.T) { + softDeadline := metav1.Duration{Duration: 1 * time.Hour} + c, fakeClient, factory := newTestRestartController(t) + + gs := newReadyGS(&agonesv1.RestartPolicy{ + Schedule: everyMinuteCron, + SoftDeadlineDuration: &softDeadline, + }) + windowOpenedAt := time.Now().UTC().Add(-2 * time.Minute) + gs.Annotations[agonesv1.GameServerNextRestartAnnotation] = windowOpenedAt.Format(time.RFC3339) + gs.Annotations[agonesv1.GameServerRestartPendingSinceAnnotation] = + time.Now().UTC().Add(-2 * time.Hour).Format(time.RFC3339) // 2h > 1h soft deadline + gs.Status.State = agonesv1.GameServerStateAllocated + + updated := runReconcile(t, c, fakeClient, factory, gs) + assertAnchorAdvanced(t, updated, windowOpenedAt, "expected Update call (advanceAnchor after soft deadline)") +} + +func TestHardDeadlineForce(t *testing.T) { + hardDeadline := metav1.Duration{Duration: 24 * time.Hour} + c, fakeClient, factory := newTestRestartController(t) + + gs := newReadyGS(&agonesv1.RestartPolicy{ + Schedule: everyMinuteCron, + HardDeadlineDuration: &hardDeadline, + }) + windowOpenedAt := time.Now().UTC().Add(-2 * time.Minute) + gs.Annotations[agonesv1.GameServerNextRestartAnnotation] = windowOpenedAt.Format(time.RFC3339) + gs.Annotations[agonesv1.GameServerRestartPendingSinceAnnotation] = + time.Now().UTC().Add(-25 * time.Hour).Format(time.RFC3339) // 25h > 24h hard deadline + gs.Status.State = agonesv1.GameServerStateAllocated + + updated := runReconcile(t, c, fakeClient, factory, gs) + assertAnchorAdvanced(t, updated, windowOpenedAt, "expected Update call (hard deadline forced restart)") +} + +func TestInvalidCronValidation(t *testing.T) { + cases := []struct { + name string + schedule string + wantErr bool + }{ + {name: "valid five-field cron", schedule: "0 4 * * *", wantErr: false}, + {name: "valid 
every-minute cron", schedule: "* * * * *", wantErr: false}, + {name: "invalid: only four fields", schedule: "0 4 * *", wantErr: true}, + {name: "invalid: natural language", schedule: "every day at midnight", wantErr: true}, + {name: "invalid: empty string", schedule: "", wantErr: true}, + {name: "invalid: minute 60 out of range", schedule: "60 4 * * *", wantErr: true}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + gss := &agonesv1.GameServerSpec{ + RestartPolicy: &agonesv1.RestartPolicy{Schedule: tc.schedule}, + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + {Name: "game-server", Image: "us-docker.pkg.dev/agones-images/examples/simple-game-server:0.35"}, + }, + }, + }, + } + + errs := gss.Validate(noopAPIHooks{}, "", field.NewPath("spec")) + + var rpErrs field.ErrorList + for _, e := range errs { + if len(e.Field) >= 13 && e.Field[:13] == "restartPolicy" { + rpErrs = append(rpErrs, e) + } + } + + if tc.wantErr { + assert.NotEmpty(t, rpErrs, + "expected restartPolicy validation error for schedule %q", tc.schedule) + if len(rpErrs) > 0 { + assert.Equal(t, "restartPolicy.schedule", rpErrs[0].Field) + } + } else { + assert.Empty(t, rpErrs, + "did NOT expect restartPolicy errors for schedule %q", tc.schedule) + } + }) + } +} + +func TestNoRestartForTerminalState(t *testing.T) { + for _, state := range []agonesv1.GameServerState{ + agonesv1.GameServerStateShutdown, + agonesv1.GameServerStateError, + agonesv1.GameServerStateUnhealthy, + } { + t.Run(string(state), func(t *testing.T) { + c, fakeClient, factory := newTestRestartController(t) + + gs := newReadyGS(&agonesv1.RestartPolicy{Schedule: everyMinuteCron}) + gs.Status.State = state + gs.Annotations[agonesv1.GameServerNextRestartAnnotation] = + time.Now().UTC().Add(-1 * time.Minute).Format(time.RFC3339) + + // runReconcile clears the Create action for us. 
+ _ = runReconcile(t, c, fakeClient, factory, gs) + + for _, a := range fakeClient.Actions() { + assert.NotEqual(t, "update", a.GetVerb(), + "must not update a terminal-state GS (%s)", state) + } + }) + } +} + +func TestNoRestartWithNoPolicy(t *testing.T) { + c, fakeClient, factory := newTestRestartController(t) + + gs := newReadyGS(nil) + seedLister(t, factory, gs) + _, err := fakeClient.AgonesV1().GameServers(testNamespace).Create( + context.Background(), gs, metav1.CreateOptions{}, + ) + require.NoError(t, err) + fakeClient.ClearActions() + + require.NoError(t, c.syncGameServer(context.Background(), testNamespace+"/"+testGSName)) + + for _, a := range fakeClient.Actions() { + assert.NotEqual(t, "update", a.GetVerb(), "must not touch a GS with no RestartPolicy") + } +} + +func TestRestartDeferredWhenPlayersConnected(t *testing.T) { + c, fakeClient, factory := newTestRestartController(t) + + gs := newReadyGS(&agonesv1.RestartPolicy{Schedule: everyMinuteCron}) + gs.Annotations[agonesv1.GameServerNextRestartAnnotation] = + time.Now().UTC().Add(-1 * time.Minute).Format(time.RFC3339) + gs.Status.State = agonesv1.GameServerStateReady + gs.Status.Players = &agonesv1.PlayerStatus{Count: 5, Capacity: 10} + + updated := runReconcile(t, c, fakeClient, factory, gs) + require.NotNil(t, updated) + _, ok := updated.Annotations[agonesv1.GameServerRestartPendingSinceAnnotation] + assert.True(t, ok, "restart-pending-since must be set when active players block restart") +} diff --git a/pkg/sdkserver/sdkserver.go b/pkg/sdkserver/sdkserver.go index e61117e3b5..08a7c2dfef 100644 --- a/pkg/sdkserver/sdkserver.go +++ b/pkg/sdkserver/sdkserver.go @@ -19,6 +19,7 @@ import ( "fmt" "io" "net/http" + "os" "slices" "strings" "sync" @@ -253,6 +254,10 @@ func (s *SDKServer) Run(ctx context.Context) error { // we have the gameserver details now s.gsWaitForSync.Done() + if runtime.FeatureEnabled(runtime.FeatureGameServerScheduledRestart) { + go s.watchRestartAnnotation(ctx) + } + gs, err := 
s.gameServer()
 	if err != nil {
 		return err
@@ -1560,6 +1565,50 @@ func (s *SDKServer) updateConnectedPlayers(ctx context.Context) error {
 	return err
 }
 
+// watchRestartAnnotation polls the local GameServer from the lister every five
+// seconds until ctx is cancelled. When the GameServer carries the
+// restart-pending-since annotation, is in the Ready state and reports no
+// connected players, it records a ScheduledRestartExec event and terminates
+// this process with os.Exit(0).
+// NOTE(review): os.Exit ends the process running the SDK server; whether that
+// results in the game server container being restarted is not visible here.
+func (s *SDKServer) watchRestartAnnotation(ctx context.Context) {
+	poll := time.NewTicker(5 * time.Second)
+	defer poll.Stop() // covers the clean-shutdown return below
+
+	for {
+		select {
+		case <-ctx.Done():
+			return
+
+		case <-poll.C:
+			gs, err := s.gameServerLister.GameServers(s.namespace).Get(s.gameServerName)
+			if err != nil {
+				s.logger.WithError(err).Warn("watchRestartAnnotation: could not get GameServer")
+				continue
+			}
+
+			_, pending := gs.Annotations[agonesv1.GameServerRestartPendingSinceAnnotation]
+			switch {
+			case !pending:
+				// Nothing requested by the RestartController yet.
+				continue
+			case gs.Status.State != agonesv1.GameServerStateReady:
+				s.logger.Debug("watchRestartAnnotation: restart pending but server not Ready; waiting")
+				continue
+			case gs.Status.Players != nil && gs.Status.Players.Count > 0:
+				s.logger.WithField("players", gs.Status.Players.Count).
+					Debug("watchRestartAnnotation: restart pending but players connected; waiting")
+				continue
+			}
+
+			// Pending, Ready and empty: leave with a zero exit status.
+			s.logger.Info("watchRestartAnnotation: conditions met, exiting for in-place restart")
+			s.recorder.Event(gs,
+				corev1.EventTypeNormal,
+				"ScheduledRestartExec",
+				"SDK sidecar exiting cleanly for scheduled in-place container restart",
+			)
+			poll.Stop() // deferred calls do not run on os.Exit
+			os.Exit(0)
+		}
+	}
+}
+
 // NewSDKServerContext returns a Context that cancels when SIGTERM or os.Interrupt
 // is received and the GameServer's Status is shutdown
 func (s *SDKServer) NewSDKServerContext(ctx context.Context) context.Context {
diff --git a/pkg/util/runtime/features.go b/pkg/util/runtime/features.go
index 32fcc8c73b..4c5c7a8e16 100644
--- a/pkg/util/runtime/features.go
+++ b/pkg/util/runtime/features.go
@@ -79,6 +79,9 @@ const (
 	// FeatureWasmAutoscaler is a feature flag to enable/disable the script based autoscaler.
 	FeatureWasmAutoscaler Feature = "WasmAutoscaler"
 
+	// FeatureGameServerScheduledRestart is a feature flag to enable/disable scheduled GameServer restarts (RestartPolicy).
+	FeatureGameServerScheduledRestart Feature = "GameServerScheduledRestart"
+
 	////////////////
 	// Dev features
 
@@ -164,9 +167,10 @@ var (
 	FeatureSidecarContainers: true,
 
 	// Alpha features
-	FeaturePlayerAllocationFilter: false,
-	FeaturePlayerTracking:         false,
-	FeatureWasmAutoscaler:         false,
+	FeaturePlayerAllocationFilter:     false,
+	FeaturePlayerTracking:             false,
+	FeatureWasmAutoscaler:             false,
+	FeatureGameServerScheduledRestart: false,
 
 	// Dev features
 	FeatureProcessorAllocator: false,
diff --git a/vendor/github.com/robfig/cron/v3/README.md b/vendor/github.com/robfig/cron/v3/README.md
index 984c537c01..8db4f550f1 100644
--- a/vendor/github.com/robfig/cron/v3/README.md
+++ b/vendor/github.com/robfig/cron/v3/README.md
@@ -72,7 +72,7 @@ It is backwards incompatible with both v1 and v2.
These updates are required: // Seconds field, optional cron.New( cron.WithParser( - cron.SecondOptional | cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow | cron.Descriptor)) + cron.SecondOptional | cron.Hour | cron.Dom | cron.Month | cron.Dow | cron.Descriptor)) - The Cron type now accepts functional options on construction rather than the previous ad-hoc behavior modification mechanisms (setting a field, calling a setter). @@ -118,7 +118,7 @@ There are two cron spec formats in common usage: jobs in Java software [the Cron wikipedia page]: https://en.wikipedia.org/wiki/Cron -[the Quartz Scheduler]: http://www.quartz-scheduler.org/documentation/quartz-2.3.0/tutorials/tutorial-lesson-06.html +[the Quartz Scheduler]: http://www.quartz-scheduler.org/documentation/quartz-2.x/tutorials/crontrigger.html The original version of this package included an optional "seconds" field, which made it incompatible with both of these formats. Now, the "standard" format is diff --git a/vendor/github.com/robfig/cron/v3/chain.go b/vendor/github.com/robfig/cron/v3/chain.go index 9565b418e0..118e5bbea4 100644 --- a/vendor/github.com/robfig/cron/v3/chain.go +++ b/vendor/github.com/robfig/cron/v3/chain.go @@ -76,9 +76,9 @@ func DelayIfStillRunning(logger Logger) JobWrapper { // SkipIfStillRunning skips an invocation of the Job if a previous invocation is // still running. It logs skips to the given logger at Info level. 
func SkipIfStillRunning(logger Logger) JobWrapper { + var ch = make(chan struct{}, 1) + ch <- struct{}{} return func(j Job) Job { - var ch = make(chan struct{}, 1) - ch <- struct{}{} return FuncJob(func() { select { case v := <-ch: diff --git a/vendor/github.com/robfig/cron/v3/cron.go b/vendor/github.com/robfig/cron/v3/cron.go index c7e9176658..f6e451db3e 100644 --- a/vendor/github.com/robfig/cron/v3/cron.go +++ b/vendor/github.com/robfig/cron/v3/cron.go @@ -21,16 +21,11 @@ type Cron struct { logger Logger runningMu sync.Mutex location *time.Location - parser ScheduleParser + parser Parser nextID EntryID jobWaiter sync.WaitGroup } -// ScheduleParser is an interface for schedule spec parsers that return a Schedule -type ScheduleParser interface { - Parse(spec string) (Schedule, error) -} - // Job is an interface for submitted cron jobs. type Job interface { Run() diff --git a/vendor/github.com/robfig/cron/v3/doc.go b/vendor/github.com/robfig/cron/v3/doc.go index fa5d08b4db..ac6b4b078f 100644 --- a/vendor/github.com/robfig/cron/v3/doc.go +++ b/vendor/github.com/robfig/cron/v3/doc.go @@ -1,18 +1,6 @@ /* Package cron implements a cron spec parser and job runner. -Installation - -To download the specific tagged release, run: - - go get github.com/robfig/cron/v3@v3.0.0 - -Import it in your program as: - - import "github.com/robfig/cron/v3" - -It requires Go 1.11 or later due to usage of Go Modules. - Usage Callers may register Funcs to be invoked on a given schedule. Cron will run @@ -21,7 +9,7 @@ them in their own goroutines. c := cron.New() c.AddFunc("30 * * * *", func() { fmt.Println("Every hour on the half hour") }) c.AddFunc("30 3-6,20-23 * * *", func() { fmt.Println(".. 
in the range 3-6am, 8-11pm") }) - c.AddFunc("CRON_TZ=Asia/Tokyo 30 04 * * *", func() { fmt.Println("Runs at 04:30 Tokyo time every day") }) + c.AddFunc("CRON_TZ=Asia/Tokyo 30 04 * * * *", func() { fmt.Println("Runs at 04:30 Tokyo time every day") }) c.AddFunc("@hourly", func() { fmt.Println("Every hour, starting an hour from now") }) c.AddFunc("@every 1h30m", func() { fmt.Println("Every hour thirty, starting an hour thirty from now") }) c.Start() @@ -59,18 +47,11 @@ Alternative Formats Alternative Cron expression formats support other fields like seconds. You can implement that by creating a custom Parser as follows. - cron.New( - cron.WithParser( - cron.NewParser( - cron.SecondOptional | cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow | cron.Descriptor))) - -Since adding Seconds is the most common modification to the standard cron spec, -cron provides a builtin function to do that, which is equivalent to the custom -parser you saw earlier, except that its seconds field is REQUIRED: - - cron.New(cron.WithSeconds()) + cron.New( + cron.WithParser( + cron.SecondOptional | cron.Hour | cron.Dom | cron.Month | cron.Dow | cron.Descriptor)) -That emulates Quartz, the most popular alternative Cron schedule format: +The most popular alternative Cron expression format is Quartz: http://www.quartz-scheduler.org/documentation/quartz-2.x/tutorials/crontrigger.html Special Characters @@ -169,7 +150,7 @@ The prefix "TZ=(TIME ZONE)" is also supported for legacy compatibility. Be aware that jobs scheduled during daylight-savings leap-ahead transitions will not be run! -Job Wrappers +Job Wrappers / Chain A Cron runner may be configured with a chain of job wrappers to add cross-cutting functionality to all submitted jobs. 
For example, they may be used diff --git a/vendor/github.com/robfig/cron/v3/option.go b/vendor/github.com/robfig/cron/v3/option.go index 09e4278e77..07638201e2 100644 --- a/vendor/github.com/robfig/cron/v3/option.go +++ b/vendor/github.com/robfig/cron/v3/option.go @@ -23,7 +23,7 @@ func WithSeconds() Option { } // WithParser overrides the parser used for interpreting job schedules. -func WithParser(p ScheduleParser) Option { +func WithParser(p Parser) Option { return func(c *Cron) { c.parser = p } diff --git a/vendor/modules.txt b/vendor/modules.txt index 9e1db331f6..75522eaa4f 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -341,7 +341,7 @@ github.com/prometheus/common/model github.com/prometheus/procfs github.com/prometheus/procfs/internal/fs github.com/prometheus/procfs/internal/util -# github.com/robfig/cron/v3 v3.0.1 +# github.com/robfig/cron/v3 v3.0.0 ## explicit; go 1.12 github.com/robfig/cron/v3 # github.com/russross/blackfriday/v2 v2.1.0