diff --git a/api/v1beta1/nodeset_keys.go b/api/v1beta1/nodeset_keys.go index 06dcd590..6b74460c 100644 --- a/api/v1beta1/nodeset_keys.go +++ b/api/v1beta1/nodeset_keys.go @@ -23,3 +23,11 @@ func (o *NodeSet) HeadlessServiceKey() types.NamespacedName { Namespace: o.Namespace, } } + +func (o *NodeSet) SshHostKeys() types.NamespacedName { + key := o.Key() + return types.NamespacedName{ + Name: fmt.Sprintf("%s-ssh-host-keys", key.Name), + Namespace: o.Namespace, + } +} diff --git a/api/v1beta1/nodeset_types.go b/api/v1beta1/nodeset_types.go index 8e36536c..f6052e88 100644 --- a/api/v1beta1/nodeset_types.go +++ b/api/v1beta1/nodeset_types.go @@ -37,6 +37,10 @@ type NodeSetSpec struct { // +optional Slurmd ContainerWrapper `json:"slurmd,omitempty"` + // SSH configuration for worker pods. + // +optional + Ssh NodeSetSsh `json:"ssh,omitzero"` + // The logfile sidecar configuration. // +optional LogFile ContainerWrapper `json:"logfile,omitzero"` @@ -112,6 +116,14 @@ type NodeSetPartition struct { Config string `json:"config,omitzero"` } +// NodeSetSsh defines SSH configuration for NodeSet worker pods. +type NodeSetSsh struct { + // Enabled controls whether SSH access is enabled for this NodeSet. + // When enabled, SSH host keys will be created and mounted, and port 22 will be exposed. + // +default:=false + Enabled bool `json:"enabled"` +} + // NodeSetUpdateStrategy indicates the strategy that the NodeSet // controller will be used to perform updates. It includes any additional // parameters necessary to perform the update for the indicated strategy. diff --git a/api/v1beta1/zz_generated.deepcopy.go b/api/v1beta1/zz_generated.deepcopy.go index bad48853..1667c646 100644 --- a/api/v1beta1/zz_generated.deepcopy.go +++ b/api/v1beta1/zz_generated.deepcopy.go @@ -567,6 +567,7 @@ func (in *NodeSetSpec) DeepCopyInto(out *NodeSetSpec) { **out = **in } in.Slurmd.DeepCopyInto(&out.Slurmd) + out.Ssh = in.Ssh in.LogFile.DeepCopyInto(&out.LogFile) in.Template.DeepCopyInto(&out.Template) out.Partition = in.Partition @@ -600,6 +601,21 @@ func (in *NodeSetSpec) DeepCopy() *NodeSetSpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *NodeSetSsh) DeepCopyInto(out *NodeSetSsh) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeSetSsh. +func (in *NodeSetSsh) DeepCopy() *NodeSetSsh { + if in == nil { + return nil + } + out := new(NodeSetSsh) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *NodeSetStatus) DeepCopyInto(out *NodeSetStatus) { *out = *in diff --git a/config/crd/bases/slinky.slurm.net_nodesets.yaml b/config/crd/bases/slinky.slurm.net_nodesets.yaml index c67763ef..ef411654 100644 --- a/config/crd/bases/slinky.slurm.net_nodesets.yaml +++ b/config/crd/bases/slinky.slurm.net_nodesets.yaml @@ -173,6 +173,18 @@ spec: Ref: https://github.com/kubernetes/api/blob/master/core/v1/types.go#L2885 type: object x-kubernetes-preserve-unknown-fields: true + ssh: + description: SSH configuration for worker pods. + properties: + enabled: + default: false + description: |- + Enabled controls whether SSH access is enabled for this NodeSet. + When enabled, SSH host keys will be created and mounted, and port 22 will be exposed. + type: boolean + required: + - enabled + type: object taintKubeNodes: default: false description: |- diff --git a/helm/slurm-operator-crds/templates/slinky.slurm.net_nodesets.yaml b/helm/slurm-operator-crds/templates/slinky.slurm.net_nodesets.yaml index c67763ef..ef411654 100644 --- a/helm/slurm-operator-crds/templates/slinky.slurm.net_nodesets.yaml +++ b/helm/slurm-operator-crds/templates/slinky.slurm.net_nodesets.yaml @@ -173,6 +173,18 @@ spec: Ref: https://github.com/kubernetes/api/blob/master/core/v1/types.go#L2885 type: object x-kubernetes-preserve-unknown-fields: true + ssh: + description: SSH configuration for worker pods. + properties: + enabled: + default: false + description: |- + Enabled controls whether SSH access is enabled for this NodeSet. + When enabled, SSH host keys will be created and mounted, and port 22 will be exposed. + type: boolean + required: + - enabled + type: object taintKubeNodes: default: false description: |- diff --git a/helm/slurm/templates/nodeset/nodeset-cr.yaml b/helm/slurm/templates/nodeset/nodeset-cr.yaml index 22842b8e..385bd633 100644 --- a/helm/slurm/templates/nodeset/nodeset-cr.yaml +++ b/helm/slurm/templates/nodeset/nodeset-cr.yaml @@ -51,6 +51,10 @@ spec: config: {{ include "slurm.worker.partitionConfig" $nodeset.partition }} {{- end }}{{- /* if (include "slurm.worker.partitionConfig" $nodeset.partition) */}} {{- end }}{{- /* with $nodeset.partition */}} + {{- with $nodeset.ssh }} + ssh: + {{- toYaml . | nindent 4 }} + {{- end }}{{- /* with $nodeset.ssh */}} replicas: {{ $nodeset.replicas }} slurmd: {{- $_ := set $nodeset.slurmd "imagePullPolicy" (default $.Values.imagePullPolicy $nodeset.slurmd.imagePullPolicy) -}} diff --git a/helm/slurm/values.yaml b/helm/slurm/values.yaml index b27a1605..9a3e93d0 100644 --- a/helm/slurm/values.yaml +++ b/helm/slurm/values.yaml @@ -643,6 +643,11 @@ nodesets: configMap: {} # State: UP # MaxTime: UNLIMITED + # SSH configuration for this NodeSet. + # ssh: + # -- Enable SSH access to worker pods with pam_slurm_adopt. + # Ref: https://slurm.schedmd.com/pam_slurm_adopt.html + # enabled: false # -- Enable propagation of container `resources.limits` into slurmd. useResourceLimits: true # Update strategy configuration. diff --git a/internal/builder/login_secret.go b/internal/builder/login_secret.go index 3df02f9e..3f1f96aa 100644 --- a/internal/builder/login_secret.go +++ b/internal/builder/login_secret.go @@ -32,12 +32,12 @@ func (b *Builder) BuildLoginSshHostKeys(loginset *slinkyv1beta1.LoginSet) (*core Key: loginset.SshHostKeys(), Metadata: loginset.Spec.Template.PodMetadata, Data: map[string][]byte{ - sshHostEcdsaKeyFile: keyPairRsa.PrivateKey(), - sshHostEcdsaPubKeyFile: keyPairRsa.PublicKey(), + sshHostEcdsaKeyFile: keyPairEcdsa.PrivateKey(), + sshHostEcdsaPubKeyFile: keyPairEcdsa.PublicKey(), sshHostEd25519KeyFile: keyPairEd25519.PrivateKey(), sshHostEd25519PubKeyFile: keyPairEd25519.PublicKey(), - sshHostRsaKeyFile: keyPairEcdsa.PrivateKey(), - sshHostRsaPubKeyFile: keyPairEcdsa.PublicKey(), + sshHostRsaKeyFile: keyPairRsa.PrivateKey(), + sshHostRsaPubKeyFile: keyPairRsa.PublicKey(), }, Immutable: true, } diff --git a/internal/builder/worker_app.go b/internal/builder/worker_app.go index c3e7c81e..254a247d 100644 --- a/internal/builder/worker_app.go +++ b/internal/builder/worker_app.go @@ -23,6 +23,7 @@ import ( const ( SlurmdPort = 6818 + SshPort = 22 slurmdUser = "root" @@ -67,7 +68,7 @@ func (b *Builder) BuildWorkerPodTemplate(nodeset *slinkyv1beta1.NodeSet, control InitContainers: []corev1.Container{ b.logfileContainer(spec.LogFile, slurmdLogFilePath), }, - Volumes: nodesetVolumes(controller), + Volumes: nodesetVolumes(nodeset, controller), Tolerations: []corev1.Toleration{ slurmtaints.TolerationWorkerNode, }, @@ -78,7 +79,7 @@ func (b *Builder) BuildWorkerPodTemplate(nodeset *slinkyv1beta1.NodeSet, control return b.buildPodTemplate(opts) } -func nodesetVolumes(controller *slinkyv1beta1.Controller) []corev1.Volume { +func nodesetVolumes(nodeset *slinkyv1beta1.NodeSet, controller *slinkyv1beta1.Controller) []corev1.Volume { out := []corev1.Volume{ { Name: slurmEtcVolume, @@ -102,23 +103,83 @@ func nodesetVolumes(controller *slinkyv1beta1.Controller) []corev1.Volume { }, logFileVolume(), } + + // Add SSH host keys volume if SSH is enabled + if nodeset.Spec.Ssh.Enabled { + out = append(out, corev1.Volume{ + Name: sshHostKeysVolume, + VolumeSource: corev1.VolumeSource{ + Projected: &corev1.ProjectedVolumeSource{ + DefaultMode: ptr.To[int32](0o600), + Sources: []corev1.VolumeProjection{ + { + Secret: &corev1.SecretProjection{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: nodeset.SshHostKeys().Name, + }, + Items: []corev1.KeyToPath{ + {Key: sshHostRsaKeyFile, Path: sshHostRsaKeyFile, Mode: ptr.To[int32](0o600)}, + {Key: sshHostRsaPubKeyFile, Path: sshHostRsaPubKeyFile, Mode: ptr.To[int32](0o644)}, + {Key: sshHostEd25519KeyFile, Path: sshHostEd25519KeyFile, Mode: ptr.To[int32](0o600)}, + {Key: sshHostEd25519PubKeyFile, Path: sshHostEd25519PubKeyFile, Mode: ptr.To[int32](0o644)}, + {Key: sshHostEcdsaKeyFile, Path: sshHostEcdsaKeyFile, Mode: ptr.To[int32](0o600)}, + {Key: sshHostEcdsaPubKeyFile, Path: sshHostEcdsaPubKeyFile, Mode: ptr.To[int32](0o644)}, + }, + }, + }, + }, + }, + }, + }) + } + return out } func (b *Builder) slurmdContainer(nodeset *slinkyv1beta1.NodeSet, controller *slinkyv1beta1.Controller) corev1.Container { merge := nodeset.Spec.Slurmd.Container + // Base ports always include slurmd + ports := []corev1.ContainerPort{ + { + Name: labels.WorkerApp, + ContainerPort: SlurmdPort, + Protocol: corev1.ProtocolTCP, + }, + } + + // Add SSH port if enabled + if nodeset.Spec.Ssh.Enabled { + ports = append(ports, corev1.ContainerPort{ + Name: "ssh", + ContainerPort: SshPort, + Protocol: corev1.ProtocolTCP, + }) + } + + // Base volume mounts + volumeMounts := []corev1.VolumeMount{ + {Name: slurmEtcVolume, MountPath: slurmEtcDir, ReadOnly: true}, + {Name: slurmLogFileVolume, MountPath: slurmLogFileDir}, + } + + // Add SSH host key mounts if enabled + if nodeset.Spec.Ssh.Enabled { + volumeMounts = append(volumeMounts, + corev1.VolumeMount{Name: sshHostKeysVolume, MountPath: sshHostRsaKeyFilePath, SubPath: sshHostRsaKeyFile, ReadOnly: true}, + corev1.VolumeMount{Name: sshHostKeysVolume, MountPath: sshHostRsaKeyPubFilePath, SubPath: sshHostRsaPubKeyFile, ReadOnly: true}, + corev1.VolumeMount{Name: sshHostKeysVolume, MountPath: sshHostEd25519KeyFilePath, SubPath: sshHostEd25519KeyFile, ReadOnly: true}, + corev1.VolumeMount{Name: sshHostKeysVolume, MountPath: sshHostEd25519PubKeyFilePath, SubPath: sshHostEd25519PubKeyFile, ReadOnly: true}, + corev1.VolumeMount{Name: sshHostKeysVolume, MountPath: sshHostEcdsaKeyFilePath, SubPath: sshHostEcdsaKeyFile, ReadOnly: true}, + corev1.VolumeMount{Name: sshHostKeysVolume, MountPath: sshHostEcdsaPubKeyFilePath, SubPath: sshHostEcdsaPubKeyFile, ReadOnly: true}, + ) + } + opts := ContainerOpts{ base: corev1.Container{ - Name: labels.WorkerApp, - Args: slurmdArgs(nodeset, controller), - Ports: []corev1.ContainerPort{ - { - Name: labels.WorkerApp, - ContainerPort: SlurmdPort, - Protocol: corev1.ProtocolTCP, - }, - }, + Name: labels.WorkerApp, + Args: slurmdArgs(nodeset, controller), + Ports: ports, StartupProbe: &corev1.Probe{ ProbeHandler: corev1.ProbeHandler{ HTTPGet: &corev1.HTTPGetAction{ @@ -169,10 +230,7 @@ func (b *Builder) slurmdContainer(nodeset *slinkyv1beta1.NodeSet, controller *sl }, }, }, - VolumeMounts: []corev1.VolumeMount{ - {Name: slurmEtcVolume, MountPath: slurmEtcDir, ReadOnly: true}, - {Name: slurmLogFileVolume, MountPath: slurmLogFileDir}, - }, + VolumeMounts: volumeMounts, }, merge: merge, } diff --git a/internal/builder/worker_secret.go b/internal/builder/worker_secret.go new file mode 100644 index 00000000..31502b0a --- /dev/null +++ b/internal/builder/worker_secret.go @@ -0,0 +1,48 @@ +// SPDX-FileCopyrightText: Copyright (C) SchedMD LLC. +// SPDX-License-Identifier: Apache-2.0 + +package builder + +import ( + "fmt" + + corev1 "k8s.io/api/core/v1" + + slinkyv1beta1 "github.com/SlinkyProject/slurm-operator/api/v1beta1" + "github.com/SlinkyProject/slurm-operator/internal/builder/labels" + "github.com/SlinkyProject/slurm-operator/internal/utils/crypto" + "github.com/SlinkyProject/slurm-operator/internal/utils/structutils" +) + +func (b *Builder) BuildWorkerSshHostKeys(nodeset *slinkyv1beta1.NodeSet) (*corev1.Secret, error) { + keyPairRsa, err := crypto.NewKeyPair(crypto.WithType(crypto.KeyPairRsa)) + if err != nil { + return nil, fmt.Errorf("failed to create RSA key pair: %w", err) + } + keyPairEd25519, err := crypto.NewKeyPair(crypto.WithType(crypto.KeyPairEd25519)) + if err != nil { + return nil, fmt.Errorf("failed to create ED25519 key pair: %w", err) + } + keyPairEcdsa, err := crypto.NewKeyPair(crypto.WithType(crypto.KeyPairEcdsa)) + if err != nil { + return nil, fmt.Errorf("failed to create ECDSA key pair: %w", err) + } + + opts := SecretOpts{ + Key: nodeset.SshHostKeys(), + Metadata: nodeset.Spec.Template.PodMetadata, + Data: map[string][]byte{ + sshHostEcdsaKeyFile: keyPairEcdsa.PrivateKey(), + sshHostEcdsaPubKeyFile: keyPairEcdsa.PublicKey(), + sshHostEd25519KeyFile: keyPairEd25519.PrivateKey(), + sshHostEd25519PubKeyFile: keyPairEd25519.PublicKey(), + sshHostRsaKeyFile: keyPairRsa.PrivateKey(), + sshHostRsaPubKeyFile: keyPairRsa.PublicKey(), + }, + Immutable: true, + } + + opts.Metadata.Labels = structutils.MergeMaps(opts.Metadata.Labels, labels.NewBuilder().WithWorkerLabels(nodeset).Build()) + + return b.BuildSecret(opts, nodeset) +} diff --git a/internal/builder/worker_secret_test.go b/internal/builder/worker_secret_test.go new file mode 100644 index 00000000..134cbefa --- /dev/null +++ b/internal/builder/worker_secret_test.go @@ -0,0 +1,73 @@ +// SPDX-FileCopyrightText: Copyright (C) SchedMD LLC. +// SPDX-License-Identifier: Apache-2.0 + +package builder + +import ( + "testing" + + slinkyv1beta1 "github.com/SlinkyProject/slurm-operator/api/v1beta1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func TestBuilder_BuildWorkerSshHostKeys(t *testing.T) { + type fields struct { + client client.Client + } + type args struct { + nodeset *slinkyv1beta1.NodeSet + } + tests := []struct { + name string + fields fields + args args + wantErr bool + }{ + { + name: "default", + fields: fields{ + client: fake.NewFakeClient(), + }, + args: args{ + nodeset: &slinkyv1beta1.NodeSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "slurm", + }, + }, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + b := New(tt.fields.client) + got, err := b.BuildWorkerSshHostKeys(tt.args.nodeset) + if (err != nil) != tt.wantErr { + t.Errorf("Builder.BuildWorkerSshHostKeys() error = %v, wantErr %v", err, tt.wantErr) + return + } + + if err != nil { + return + } + + switch { + case got.Data[sshHostEcdsaKeyFile] == nil && got.StringData[sshHostEcdsaKeyFile] == "": + t.Errorf("got.Data[%s] = %v", sshHostEcdsaKeyFile, got.Data[sshHostEcdsaKeyFile]) + case got.Data[sshHostEcdsaPubKeyFile] == nil && got.StringData[sshHostEcdsaPubKeyFile] == "": + t.Errorf("got.Data[%s] = %v", sshHostEcdsaPubKeyFile, got.Data[sshHostEcdsaPubKeyFile]) + + case got.Data[sshHostEd25519KeyFile] == nil && got.StringData[sshHostEd25519KeyFile] == "": + t.Errorf("got.Data[%s] = %v", sshHostEd25519KeyFile, got.Data[sshHostEd25519KeyFile]) + case got.Data[sshHostEd25519PubKeyFile] == nil && got.StringData[sshHostEd25519PubKeyFile] == "": + t.Errorf("got.Data[%s] = %v", sshHostEd25519PubKeyFile, got.Data[sshHostEd25519PubKeyFile]) + + case got.Data[sshHostRsaKeyFile] == nil && got.StringData[sshHostRsaKeyFile] == "": + t.Errorf("got.Data[%s] = %v", sshHostRsaKeyFile, got.Data[sshHostRsaKeyFile]) + case got.Data[sshHostRsaPubKeyFile] == nil && got.StringData[sshHostRsaPubKeyFile] == "": + t.Errorf("got.Data[%s] = %v", sshHostRsaPubKeyFile, got.Data[sshHostRsaPubKeyFile]) + } + }) + } +} diff --git a/internal/controller/nodeset/nodeset_sync.go b/internal/controller/nodeset/nodeset_sync.go index 39a209ae..ed0dcf91 100644 --- a/internal/controller/nodeset/nodeset_sync.go +++ b/internal/controller/nodeset/nodeset_sync.go @@ -231,6 +231,10 @@ func (r *NodeSetReconciler) sync( return err } + if err := r.syncSshHostKeys(ctx, nodeset); err != nil { + return err + } + if err := r.syncSlurmDeadline(ctx, nodeset, pods); err != nil { return err } @@ -1119,3 +1123,25 @@ func (r *NodeSetReconciler) syncClusterWorkerPDB( return nil } + +// syncSshHostKeys manages SSH host keys secret for the NodeSet if SSH is enabled +func (r *NodeSetReconciler) syncSshHostKeys( + ctx context.Context, + nodeset *slinkyv1beta1.NodeSet, +) error { + // Only create SSH host keys if SSH is enabled + if !nodeset.Spec.Ssh.Enabled { + return nil + } + + secret, err := r.builder.BuildWorkerSshHostKeys(nodeset) + if err != nil { + return fmt.Errorf("failed to build SSH host keys secret: %w", err) + } + + if err := objectutils.SyncObject(r.Client, ctx, secret, true); err != nil { + return fmt.Errorf("failed to sync SSH host keys secret (%s): %w", klog.KObj(secret), err) + } + + return nil +}