From 5278f2db99d67c7f203f74c20116908ac794935f Mon Sep 17 00:00:00 2001 From: Soichi Hayashi Date: Thu, 12 Sep 2019 19:32:24 +0000 Subject: [PATCH 01/10] fixed various install issues --- .gitignore | 1 + README.md | 16 +-------------- clouds.yaml | 11 ---------- headnode_create.sh | 51 ++++++++++++++++++++++++++++++---------------- install.sh | 12 ++++++++++- 5 files changed, 46 insertions(+), 45 deletions(-) create mode 100644 .gitignore delete mode 100644 clouds.yaml diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3dc4c38 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +openrc.sh diff --git a/README.md b/README.md index 8380ee8..6b5d36e 100644 --- a/README.md +++ b/README.md @@ -21,21 +21,7 @@ To build your own Virtual cluster, starting on your localhost: 1. Clone this repo. 1. Copy the openrc for the allocation in which you'd like to create a - virtual cluster to this repo. Additionally, please create a file - called ```clouds.yaml``` with the following format (replace values in - all caps with actual values, similar to your openrc file) -``` -clouds: - THE-NAME-OF-YOUR-CLOUD: - auth: - username: OS-USERNAME-SAME-AS-OPENRC - auth_url: SAME-AS-OPENRC - project_name: SAME-AS-OPENRC - password: SAME-AS-OPENRC - user_domain_name: SAME-AS-OPENRC - project_domain_name: SAME-AS-OPENRC - identity_api_version: 3 -``` + virtual cluster to this repo. 1. If you'd like to modify your cluster, now is a good time! 
This local copy of the repo will be re-created on the headnode, but diff --git a/clouds.yaml b/clouds.yaml deleted file mode 100644 index eaefb1e..0000000 --- a/clouds.yaml +++ /dev/null @@ -1,11 +0,0 @@ -clouds: - THE-NAME-OF-YOUR-CLOUD: - auth: - username: OS-USERNAME-SAME-AS-OPENRC - auth_url: SAME-AS-OPENRC - project_name: SAME-AS-OPENRC - password: SAME-AS-OPENRC - user_domain_name: SAME-AS-OPENRC - project_domain_name: SAME-AS-OPENRC - identity_api_version: 3 - diff --git a/headnode_create.sh b/headnode_create.sh index 3b18123..384848b 100755 --- a/headnode_create.sh +++ b/headnode_create.sh @@ -9,6 +9,7 @@ if [[ -z "$1" ]]; then echo "NO SERVER NAME GIVEN! Please re-run with ./headnode_create.sh " exit fi +server_name=$1 if [[ ! -e ${HOME}/.ssh/id_rsa.pub ]]; then #This may be temporary... but seems fairly reasonable. @@ -21,25 +22,29 @@ source ./openrc.sh # Defining a function here to check for quotas, and exit if this script will cause problems! # also, storing 'quotas' in a global var, so we're not calling it every single time quotas=$(openstack quota show) -quota_check () -{ -quota_name=$1 -type_name=$2 #the name for a quota and the name for the thing itself are not the same -number_created=$3 #number of the thing that we'll create here. +quota_check () { + quota_name=$1 + type_name=$2 #the name for a quota and the name for the thing itself are not the same + number_created=$3 #number of the thing that we'll create here. 
-current_num=$(openstack $type_name list -f value | wc -l) + echo "checking quota $quota_name" + current_num=$(openstack $type_name list -f value | wc -l) -max_types=$(echo "$quotas" | awk -v quota=$quota_name '$0 ~ quota {print $4}') + max_types=$(echo "$quotas" | awk -v quota=$quota_name '$0 ~ quota {print $4}') -#echo "checking quota for $quota_name of $type_name to create $number_created - want $current_num to be less than $max_types" + #echo "checking quota for $quota_name of $type_name to create $number_created - want $current_num to be less than $max_types" -if [[ "$current_num" -lt "$((max_types + number_created))" ]]; then - return 0 -fi -return 1 + if [[ "$current_num" -lt "$((max_types + number_created))" ]]; then + return 0 + fi + + return 1 } +set -x #show use which commands are executed +set -e #terminate as soon as any command fails +quota_check "secgroups" "security group" 1 quota_check "networks" "network" 1 quota_check "subnets" "subnet" 1 quota_check "routers" "router" 1 @@ -77,7 +82,7 @@ if [[ -e ${HOME}/.ssh/id_rsa.pub ]]; then fi openstack_keys=$(openstack keypair list -f value) -home_key_in_OS=$(echo "$openstack_keys" | awk -v mykey=$home_key_fingerprint '$2 ~ mykey {print $1}') +home_key_in_OS=$(echo "$openstack_keys" | awk -v mykey=$home_key_fingerprint '$2 ~ mykey {print $server_name}') if [[ -n "$home_key_in_OS" ]]; then OS_keyname=$home_key_in_OS @@ -92,17 +97,27 @@ else OS_keyname=${OS_USERNAME}-elastic-key fi -image_name=$(openstack image list -f value | grep -i JS-API-Featured-Centos7- | grep -vi Intel | cut -f 2 -d' ') -echo "openstack server create --user-data prevent-updates.ci --flavor m1.small --image $image_name --key-name $OS_keyname --security-group $OS_USERNAME-global-ssh --security-group $OS_USERNAME-cluster-internal --nic net-id=${OS_USERNAME}-elastic-net $1" -openstack server create --user-data prevent-updates.ci --flavor m1.small --image $image_name --key-name $OS_keyname --security-group ${OS_USERNAME}-global-ssh 
--security-group ${OS_USERNAME}-cluster-internal --nic net-id=${OS_USERNAME}-elastic-net $1 +image_name=$(openstack image list -f value | grep -i JS-API-Featured-Centos7- | grep -vi Intel | cut -f 2 -d' ' | tail -1) + +openstack server create + --user-data prevent-updates.ci \ + --flavor m1.small \ + --image $image_name \ + --key-name $OS_keyname \ + --security-group ${OS_USERNAME}-global-ssh \ + --security-group ${OS_USERNAME}-cluster-internal \ + --nic net-id=${OS_USERNAME}-elastic-net \ + $server_name + public_ip=$(openstack floating ip create public | awk '/floating_ip_address/ {print $4}') + #For some reason there's a time issue here - adding a sleep command to allow network to become ready sleep 10 -openstack server add floating ip $1 $public_ip +openstack server add floating ip $server_name $public_ip hostname_test=$(ssh -q -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no centos@$public_ip 'hostname') echo "test1: $hostname_test" -until [[ $hostname_test =~ "$1" ]]; do +until [[ $hostname_test =~ "$server_name" ]]; do sleep 2 hostname_test=$(ssh -q -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no centos@$public_ip 'hostname') echo "ssh -q -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no centos@$public_ip 'hostname'" diff --git a/install.sh b/install.sh index 3e504e6..1d75ac3 100755 --- a/install.sh +++ b/install.sh @@ -5,6 +5,14 @@ if [[ ! 
-e ./openrc.sh ]]; then exit fi +if [[ $EUID -ne 0 ]]; then + echo "This script must be run as root" + exit 1 +fi + +set -e +set -x + yum -y install https://github.com/openhpc/ohpc/releases/download/v1.3.GA/ohpc-release-1.3-1.el7.x86_64.rpm centos-release-openstack-rocky yum -y install ohpc-slurm-server vim ansible mailx lmod-ohpc bash-completion gnu-compilers-ohpc openmpi-gnu-ohpc lmod-defaults-gnu-openmpi-ohpc moreutils bind-utils python-openstackclient @@ -27,7 +35,7 @@ echo -e "clouds: project_name: ${OS_PROJECT_NAME} password: ${OS_PASSWORD} user_domain_name: ${OS_USER_DOMAIN_NAME} - project_domain_name: ${OS_PROJECT_DOMAIN_NAME} + project_domain_id: ${OS_PROJECT_DOMAIN_ID} identity_api_version: 3" > clouds.yaml # Defining a function here to check for quotas, and exit if this script will cause problems! @@ -187,3 +195,5 @@ systemctl enable slurmctld munge nfs-server nfs-lock nfs rpcbind nfs-idmap systemctl start munge slurmctld nfs-server nfs-lock nfs rpcbind nfs-idmap echo -e "If you wish to enable an email when node state is drain or down, please uncomment \nthe cron-node-check.sh job in /etc/crontab, and place your email of choice in the 'email_addr' variable \nat the beginning of /usr/local/sbin/cron-node-check.sh" + +rm openrc.sh From a5f8b8694bb412e2ec49b2e31f0983750b6710cc Mon Sep 17 00:00:00 2001 From: Soichi Hayashi Date: Thu, 12 Sep 2019 19:42:34 +0000 Subject: [PATCH 02/10] removing clouds.yaml also --- install.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/install.sh b/install.sh index 1d75ac3..599cb0b 100755 --- a/install.sh +++ b/install.sh @@ -196,4 +196,6 @@ systemctl start munge slurmctld nfs-server nfs-lock nfs rpcbind nfs-idmap echo -e "If you wish to enable an email when node state is drain or down, please uncomment \nthe cron-node-check.sh job in /etc/crontab, and place your email of choice in the 'email_addr' variable \nat the beginning of /usr/local/sbin/cron-node-check.sh" +echo "removing openrc.sh and clouds.yaml which contains 
openstack password. (once you are done with install, you can remove this whole directory)". rm openrc.sh +rm clouds.yaml From ff52845cc17d41aee498a3898ebf2ec06c00a223 Mon Sep 17 00:00:00 2001 From: Soichi Hayashi Date: Thu, 12 Sep 2019 20:43:04 +0000 Subject: [PATCH 03/10] updated install.sh so that it can be rerun added useradd for "user" user that can be used to run jobs as regular user --- install.sh | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/install.sh b/install.sh index 599cb0b..c9c2c3a 100755 --- a/install.sh +++ b/install.sh @@ -2,7 +2,7 @@ if [[ ! -e ./openrc.sh ]]; then echo "NO OPENRC FOUND! CREATE ONE, AND TRY AGAIN!" - exit + exit 1 fi if [[ $EUID -ne 0 ]]; then @@ -15,14 +15,28 @@ set -x yum -y install https://github.com/openhpc/ohpc/releases/download/v1.3.GA/ohpc-release-1.3-1.el7.x86_64.rpm centos-release-openstack-rocky -yum -y install ohpc-slurm-server vim ansible mailx lmod-ohpc bash-completion gnu-compilers-ohpc openmpi-gnu-ohpc lmod-defaults-gnu-openmpi-ohpc moreutils bind-utils python-openstackclient - -#Comment these next three steps out if re-running locally! -ssh-keygen -b 2048 -t rsa -P "" -f slurm-key +yum -y install \ + ohpc-slurm-server \ + vim ansible \ + mailx \ + lmod-ohpc \ + bash-completion \ + gnu-compilers-ohpc \ + openmpi-gnu-ohpc \ + lmod-defaults-gnu-openmpi-ohpc \ + moreutils \ + bind-utils \ + jq \ + git \ + python-openstackclient + +#create user that can be used to submit jobs +[ ! -d /home/user ] && useradd -m user + +[ ! -f slurm-key ] && ssh-keygen -b 2048 -t rsa -P "" -f slurm-key # generate a local key for centos for after homedirs are mounted! -su centos - -c 'ssh-keygen -t rsa -b 2048 -P "" -f /home/centos/.ssh/id_rsa' -su centos - -c 'cat /home/centos/.ssh/id_rsa.pub >> /home/centos/.ssh/authorized_keys' +[ ! 
-f /home/centos/.ssh/id_rsa ] && su centos - -c 'ssh-keygen -t rsa -b 2048 -P "" -f /home/centos/.ssh/id_rsa && cat /home/centos/.ssh/id_rsa.pub >> /home/centos/.ssh/authorized_keys' source ./openrc.sh @@ -134,7 +148,7 @@ cp prevent-updates.ci /etc/slurm/ chown slurm:slurm /etc/slurm/prevent-updates.ci -mkdir /var/log/slurm +mkdir -p /var/log/slurm touch /var/log/slurm/slurm_elastic.log touch /var/log/slurm/os_clean.log @@ -195,7 +209,3 @@ systemctl enable slurmctld munge nfs-server nfs-lock nfs rpcbind nfs-idmap systemctl start munge slurmctld nfs-server nfs-lock nfs rpcbind nfs-idmap echo -e "If you wish to enable an email when node state is drain or down, please uncomment \nthe cron-node-check.sh job in /etc/crontab, and place your email of choice in the 'email_addr' variable \nat the beginning of /usr/local/sbin/cron-node-check.sh" - -echo "removing openrc.sh and clouds.yaml which contains openstack password. (once you are done with install, you can remove this whole directory)". -rm openrc.sh -rm clouds.yaml From 0172b93643314f6f6c3490eb70ea6e221f66da6f Mon Sep 17 00:00:00 2001 From: Soichi Hayashi Date: Thu, 12 Sep 2019 21:12:17 +0000 Subject: [PATCH 04/10] added a step to create "user" --- compute_build_base_img.yml | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/compute_build_base_img.yml b/compute_build_base_img.yml index e17de83..cfe4ded 100644 --- a/compute_build_base_img.yml +++ b/compute_build_base_img.yml @@ -3,7 +3,7 @@ - hosts: localhost vars: - compute_base_image: "JS-API-Featured-CentOS7-May-20-2019" + compute_base_image: "JS-API-Featured-CentOS7-Sep-09-2019" sec_group_global: "{{ clouds.tacc.auth.username }}-global-ssh" sec_group_internal: "{{ clouds.tacc.auth.username }}-cluster-internal" compute_base_size: "m1.small" @@ -56,6 +56,14 @@ - "openmpi-gnu-ohpc" - "ohpc-slurm-client" - "lmod-ohpc" + - "epel-release" + - "git" + - "bind-utils" + - "gcc" + - "wget" + - "jq" + - "nodejs" + - "singularity" tasks: @@ 
-203,6 +211,10 @@ - name: enable slurmd service: name=slurmd enabled=yes + - name: create local user called "user" + user: + name: user + #cat /etc/systemd/system/multi-user.target.wants/slurmd.service #[Unit] #Description=Slurm node daemon From 420f78896c5fa573cfd79cf09a652ece914a9af9 Mon Sep 17 00:00:00 2001 From: Soichi Hayashi Date: Fri, 13 Sep 2019 22:53:46 +0000 Subject: [PATCH 05/10] replaced OS_USRNAME with cluster_name fixed a few bugs that I've created while updating --- compute_build_base_img.yml | 17 +++++++------ compute_take_snapshot.sh | 7 ++++-- headnode_create.sh | 51 +++++++++++++++++++------------------- install.sh | 33 ++++++++++++++---------- slurm.conf | 23 +++++++++-------- slurm_resume.sh | 9 ++++--- 6 files changed, 77 insertions(+), 63 deletions(-) diff --git a/compute_build_base_img.yml b/compute_build_base_img.yml index cfe4ded..4954a8d 100644 --- a/compute_build_base_img.yml +++ b/compute_build_base_img.yml @@ -4,11 +4,11 @@ vars: compute_base_image: "JS-API-Featured-CentOS7-Sep-09-2019" - sec_group_global: "{{ clouds.tacc.auth.username }}-global-ssh" - sec_group_internal: "{{ clouds.tacc.auth.username }}-cluster-internal" + sec_group_global: "{{ clouds.tacc.cluster_name }}-global-ssh" + sec_group_internal: "{{ clouds.tacc.cluster_name }}-cluster-internal" compute_base_size: "m1.small" - network_name: "{{ clouds.tacc.auth.username }}-elastic-net" - JS_ssh_keyname: "{{ clouds.tacc.auth.username }}-{{ clouds.tacc.auth.project_name }}-slurm-key" + network_name: "{{ clouds.tacc.cluster_name }}-elastic-net" + JS_ssh_keyname: "{{ clouds.tacc.cluster_name }}-{{ clouds.tacc.auth.project_name }}-slurm-key" vars_files: - clouds.yaml @@ -19,7 +19,7 @@ os_server: timeout: 300 state: present - name: "compute-{{ clouds.tacc.auth.username }}-base-instance" + name: "compute-{{ clouds.tacc.cluster_name }}-base-instance" cloud: "tacc" image: "{{ compute_base_image }}" key_name: "{{ JS_ssh_keyname }}" @@ -56,7 +56,6 @@ - "openmpi-gnu-ohpc" - 
"ohpc-slurm-client" - "lmod-ohpc" - - "epel-release" - "git" - "bind-utils" - "gcc" @@ -254,11 +253,13 @@ tasks: - name: create compute instance snapshot - script: ./compute_take_snapshot.sh + script: ./compute_take_snapshot.sh "compute-{{ clouds.tacc.cluster_name }}-base-instance" - name: remove compute instance os_server: timeout: 200 state: absent - name: "compute-{{ clouds.tacc.auth.username }}-base-instance" + name: "compute-{{ clouds.tacc.cluster_name }}-base-instance" cloud: "tacc" + + diff --git a/compute_take_snapshot.sh b/compute_take_snapshot.sh index 454a833..3de4f36 100755 --- a/compute_take_snapshot.sh +++ b/compute_take_snapshot.sh @@ -2,8 +2,11 @@ source openrc.sh -compute_image="${OS_USERNAME}-compute-image-$(date +%m-%d-%Y)" -compute_instance="compute-${OS_USERNAME}-base-instance" +#compute_instance="compute-${cluster_name}-base-instance" +cluster_name=$(hostname -s) + +compute_instance=$1 +compute_image="${cluster_name}-compute-image-$(date +%m-%d-%Y)" openstack server stop $compute_instance diff --git a/headnode_create.sh b/headnode_create.sh index 384848b..309e1f2 100755 --- a/headnode_create.sh +++ b/headnode_create.sh @@ -52,28 +52,27 @@ quota_check "key-pairs" "keypair" 1 quota_check "instances" "server" 1 # Ensure that the correct private network/router/subnet exists -if [[ -z "$(openstack network list | grep ${OS_USERNAME}-elastic-net)" ]]; then - openstack network create ${OS_USERNAME}-elastic-net - openstack subnet create --network ${OS_USERNAME}-elastic-net --subnet-range 10.0.0.0/24 ${OS_USERNAME}-elastic-subnet1 +if [[ -z "$(openstack network list | grep ${server_name}-elastic-net)" ]]; then + openstack network create ${server_name}-elastic-net + openstack subnet create --network ${server_name}-elastic-net --subnet-range 10.0.0.0/24 ${server_name}-elastic-subnet1 fi ##openstack subnet list -if [[ -z "$(openstack router list | grep ${OS_USERNAME}-elastic-router)" ]]; then - openstack router create ${OS_USERNAME}-elastic-router - 
openstack router add subnet ${OS_USERNAME}-elastic-router ${OS_USERNAME}-elastic-subnet1 - openstack router set --external-gateway public ${OS_USERNAME}-elastic-router +if [[ -z "$(openstack router list | grep ${server_name}-elastic-router)" ]]; then + openstack router create ${server_name}-elastic-router + openstack router add subnet ${server_name}-elastic-router ${server_name}-elastic-subnet1 + openstack router set --external-gateway public ${server_name}-elastic-router fi -#openstack router show ${OS_USERNAME}-api-router security_groups=$(openstack security group list -f value) -if [[ ! ("$security_groups" =~ "${OS_USERNAME}-global-ssh") ]]; then - openstack security group create --description "ssh \& icmp enabled" $OS_USERNAME-global-ssh - openstack security group rule create --protocol tcp --dst-port 22:22 --remote-ip 0.0.0.0/0 $OS_USERNAME-global-ssh - openstack security group rule create --protocol icmp $OS_USERNAME-global-ssh +if [[ ! ("$security_groups" =~ "${server_name}-global-ssh") ]]; then + openstack security group create --description "ssh \& icmp enabled" $server_name-global-ssh + openstack security group rule create --protocol tcp --dst-port 22:22 --remote-ip 0.0.0.0/0 $server_name-global-ssh + openstack security group rule create --protocol icmp $server_name-global-ssh fi -if [[ ! ("$security_groups" =~ "${OS_USERNAME}-cluster-internal") ]]; then - openstack security group create --description "internal group for cluster" $OS_USERNAME-cluster-internal - openstack security group rule create --protocol tcp --dst-port 1:65535 --remote-ip 10.0.0.0/0 $OS_USERNAME-cluster-internal - openstack security group rule create --protocol icmp $OS_USERNAME-cluster-internal +if [[ ! 
("$security_groups" =~ "${server_name}-cluster-internal") ]]; then + openstack security group create --description "internal group for cluster" $server_name-cluster-internal + openstack security group rule create --protocol tcp --dst-port 1:65535 --remote-ip 10.0.0.0/24 $server_name-cluster-internal + openstack security group rule create --protocol icmp $server_name-cluster-internal fi #Check if ${HOME}/.ssh/id_rsa.pub exists in JS @@ -86,27 +85,27 @@ home_key_in_OS=$(echo "$openstack_keys" | awk -v mykey=$home_key_fingerprint '$2 if [[ -n "$home_key_in_OS" ]]; then OS_keyname=$home_key_in_OS -elif [[ -n $(echo "$openstack_keys" | grep ${OS_USERNAME}-elastic-key) ]]; then - openstack keypair delete ${OS_USERNAME}-elastic-key +elif [[ -n $(echo "$openstack_keys" | grep ${server_name}-elastic-key) ]]; then + openstack keypair delete ${server_name}-elastic-key # This doesn't need to depend on the OS_PROJECT_NAME, as the slurm-key does, in install.sh and slurm_resume - openstack keypair create --public-key ${HOME}/.ssh/id_rsa.pub ${OS_USERNAME}-elastic-key - OS_keyname=${OS_USERNAME}-elastic-key + openstack keypair create --public-key ${HOME}/.ssh/id_rsa.pub ${server_name}-elastic-key + OS_keyname=${server_name}-elastic-key else # This doesn't need to depend on the OS_PROJECT_NAME, as the slurm-key does, in install.sh and slurm_resume - openstack keypair create --public-key ${HOME}/.ssh/id_rsa.pub ${OS_USERNAME}-elastic-key - OS_keyname=${OS_USERNAME}-elastic-key + openstack keypair create --public-key ${HOME}/.ssh/id_rsa.pub ${server_name}-elastic-key + OS_keyname=${server_name}-elastic-key fi image_name=$(openstack image list -f value | grep -i JS-API-Featured-Centos7- | grep -vi Intel | cut -f 2 -d' ' | tail -1) -openstack server create +openstack server create \ --user-data prevent-updates.ci \ --flavor m1.small \ --image $image_name \ --key-name $OS_keyname \ - --security-group ${OS_USERNAME}-global-ssh \ - --security-group ${OS_USERNAME}-cluster-internal \ - --nic
net-id=${OS_USERNAME}-elastic-net \ + --security-group ${server_name}-global-ssh \ + --security-group ${server_name}-cluster-internal \ + --nic net-id=${server_name}-elastic-net \ $server_name public_ip=$(openstack floating ip create public | awk '/floating_ip_address/ {print $4}') diff --git a/install.sh b/install.sh index c9c2c3a..bcdf3e0 100755 --- a/install.sh +++ b/install.sh @@ -28,6 +28,7 @@ yum -y install \ bind-utils \ jq \ git \ + singularity \ python-openstackclient #create user that can be used to submit jobs @@ -40,6 +41,8 @@ yum -y install \ source ./openrc.sh +cluster_name=$(hostname -s) + #create clouds.yaml file from contents of openrc echo -e "clouds: tacc: @@ -48,6 +51,7 @@ echo -e "clouds: auth_url: ${OS_AUTH_URL} project_name: ${OS_PROJECT_NAME} password: ${OS_PASSWORD} + cluster_name: $cluster_name user_domain_name: ${OS_USER_DOMAIN_NAME} project_domain_id: ${OS_PROJECT_DOMAIN_ID} identity_api_version: 3" > clouds.yaml @@ -84,24 +88,24 @@ fi #quota_check "instances" "server" 1 -if [[ -n $(openstack keypair list | grep ${OS_USERNAME}-${OS_PROJECT_NAME}-slurm-key) ]]; then - openstack keypair delete ${OS_USERNAME}-${OS_PROJECT_NAME}-slurm-key - openstack keypair create --public-key slurm-key.pub ${OS_USERNAME}-${OS_PROJECT_NAME}-slurm-key +if [[ -n $(openstack keypair list | grep ${cluster_name}-${OS_PROJECT_NAME}-slurm-key) ]]; then - openstack keypair delete ${cluster_name}-${OS_PROJECT_NAME}-slurm-key + openstack keypair create --public-key slurm-key.pub ${cluster_name}-${OS_PROJECT_NAME}-slurm-key else - openstack keypair create --public-key slurm-key.pub ${OS_USERNAME}-${OS_PROJECT_NAME}-slurm-key + openstack keypair create --public-key slurm-key.pub ${cluster_name}-${OS_PROJECT_NAME}-slurm-key fi #make sure security groups exist... this could cause issues. if [[ !
("$security_groups" =~ "global-ssh") ]]; then - openstack security group create --description "ssh \& icmp enabled" ${OS_USERNAME}-global-ssh - openstack security group rule create --protocol tcp --dst-port 22:22 --remote-ip 0.0.0.0/0 ${OS_USERNAME}-global-ssh - openstack security group rule create --protocol icmp ${OS_USERNAME}-global-ssh + openstack security group create --description "ssh \& icmp enabled" ${cluster_name}-global-ssh + openstack security group rule create --protocol tcp --dst-port 22:22 --remote-ip 0.0.0.0/0 ${cluster_name}-global-ssh + openstack security group rule create --protocol icmp ${cluster_name}-global-ssh fi if [[ ! ("$security_groups" =~ "cluster-internal") ]]; then - openstack security group create --description "internal 10.0.0.0/24 network allowed" ${OS_USERNAME}-cluster-internal - openstack security group rule create --protocol tcp --dst-port 1:65535 --remote-ip 10.0.0.0/24 ${OS_USERNAME}-cluster-internal - openstack security group rule create --protocol udp --dst-port 1:65535 --remote-ip 10.0.0.0/24 ${OS_USERNAME}-cluster-internal - openstack security group rule create --protocol icmp ${OS_USERNAME}-cluster-internal + openstack security group create --description "internal 10.0.0.0/24 network allowed" ${cluster_name}-cluster-internal + openstack security group rule create --protocol tcp --dst-port 1:65535 --remote-ip 10.0.0.0/24 ${cluster_name}-cluster-internal + openstack security group rule create --protocol udp --dst-port 1:65535 --remote-ip 10.0.0.0/24 ${cluster_name}-cluster-internal + openstack security group rule create --protocol icmp ${cluster_name}-cluster-internal fi #TACC-specific changes: @@ -117,8 +121,8 @@ fi #sed -i "s/network_name=.*/network_name=$headnode_os_subnet/" ./slurm_resume.sh #Set compute node names to $OS_USERNAME-compute- -sed -i "s/=compute-*/=${OS_USERNAME}-compute-/" ./slurm.conf -sed -i "s/Host compute-*/Host ${OS_USERNAME}-compute-/" ./ssh.cfg +sed -i "s/=compute-*/=${cluster_name}-compute-/" 
./slurm.conf +sed -i "s/Host compute-*/Host ${cluster_name}-compute-/" ./ssh.cfg # Deal with files required by slurm - better way to encapsulate this section? @@ -161,6 +165,7 @@ setfacl -m u:slurm:rw /etc/ansible/hosts setfacl -m u:slurm:rwx /etc/ansible/ cp slurm_*.sh /usr/local/sbin/ +#sed -i "s/node_size=.*/node_size=m1.xlarge/" /usr/local/sbin/slurm_resume.sh cp cron-node-check.sh /usr/local/sbin/ cp clean-os-error.sh /usr/local/sbin/ @@ -209,3 +214,5 @@ systemctl enable slurmctld munge nfs-server nfs-lock nfs rpcbind nfs-idmap systemctl start munge slurmctld nfs-server nfs-lock nfs rpcbind nfs-idmap echo -e "If you wish to enable an email when node state is drain or down, please uncomment \nthe cron-node-check.sh job in /etc/crontab, and place your email of choice in the 'email_addr' variable \nat the beginning of /usr/local/sbin/cron-node-check.sh" + + diff --git a/slurm.conf b/slurm.conf index 7c7a271..e851c30 100644 --- a/slurm.conf +++ b/slurm.conf @@ -68,7 +68,8 @@ SchedulerType=sched/backfill #SchedulerRootFilter= #SelectType=select/linear SelectType=select/cons_res -SelectTypeParameters=CR_CPU +#SelectTypeParameters=CR_CPU +SelectTypeParameters=CR_CPU_Memory FastSchedule=0 #PriorityType=priority/multifactor #PriorityDecayHalfLife=14-0 @@ -98,18 +99,20 @@ AccountingStorageLoc=/var/log/slurm/slurm_jobacct.log #AccountingStorageUser= # #GENERAL RESOURCE -GresTypes="" +#GresTypes="" # #CLOUD CONFIGURATION PrivateData=cloud ResumeProgram=/usr/local/sbin/slurm_resume.sh SuspendProgram=/usr/local/sbin/slurm_suspend.sh -ResumeRate=0 #number of nodes per minute that can be created; 0 means no limit +ResumeRate=1 #number of nodes per minute that can be created; 0 means no limit ResumeTimeout=900 #max time in seconds between ResumeProgram running and when the node is ready for use -SuspendRate=0 #number of nodes per minute that can be suspended/destroyed -SuspendTime=60 #time in seconds before an idle node is suspended -SuspendTimeout=30 #time between running 
SuspendProgram and the node being completely down -#COMPUTE NODES -NodeName=compute-[0-1] State=CLOUD CPUs=2 -#PARTITIONS -PartitionName=cloud Nodes=compute-[0-1] Default=YES MaxTime=INFINITE State=UP +SuspendRate=1 #number of nodes per minute that can be suspended/destroyed +SuspendTime=300 #time in seconds before an idle node is suspended +SuspendTimeout=300 #time between running SuspendProgram and the node being completely down + +NodeName=compute-[0-10] State=CLOUD CPUs=24 RealMemory=60388 +#NodeName=hayashis-compute-[1-10] State=CLOUD Sockets=24 CoresPerSocket=1 ThreadsPerCore=1 RealMemory=60388 +PartitionName=cloud LLN=YES Nodes=compute-[1-10] Default=YES MaxTime=48:00:00 State=UP Shared=YES + + diff --git a/slurm_resume.sh b/slurm_resume.sh index f113c18..4642ac6 100755 --- a/slurm_resume.sh +++ b/slurm_resume.sh @@ -2,10 +2,11 @@ source /etc/slurm/openrc.sh +cluster_name=$(hostname -s) node_size="m1.small" -node_image=$(openstack image list -f value | grep -i ${OS_USERNAME}-compute-image- | cut -f 2 -d' '| tail -n 1) -key_name="${OS_USERNAME}-${OS_PROJECT_NAME}-slurm-key" -network_name=${OS_USERNAME}-elastic-net +node_image=$(openstack image list -f value | grep -i ${cluster_name}-compute-image- | cut -f 2 -d' '| tail -n 1) +key_name="${cluster_name}-${OS_PROJECT_NAME}-slurm-key" +network_name=${cluster_name}-elastic-net log_loc=/var/log/slurm/slurm_elastic.log echo "Node resume invoked: $0 $*" >> $log_loc @@ -29,7 +30,7 @@ do --image $node_image \ --key-name $key_name \ --user-data <(cat /etc/slurm/prevent-updates.ci && echo -e "hostname: $host \npreserve_hostname: true\ndebug:") \ - --security-group ${OS_USERNAME}-global-ssh --security-group ${OS_USERNAME}-cluster-internal \ + --security-group ${cluster_name}-global-ssh --security-group ${cluster_name}-cluster-internal \ --nic net-id=$network_name 2>&1 \ | tee -a $log_loc | awk '/status/ {print $4}') From e2ae1be82eaa49ac561bb1668074660789e02c05 Mon Sep 17 00:00:00 2001 From: Soichi Hayashi Date: Sat, 14 
Sep 2019 00:55:01 +0000 Subject: [PATCH 06/10] added line to enable hostfs for singularity --- compute_build_base_img.yml | 25 ++++++------------------- install.sh | 1 + 2 files changed, 7 insertions(+), 19 deletions(-) diff --git a/compute_build_base_img.yml b/compute_build_base_img.yml index 4954a8d..56bd2eb 100644 --- a/compute_build_base_img.yml +++ b/compute_build_base_img.yml @@ -95,25 +95,12 @@ name: "{{ compute_base_package_list }}" state: present lock_timeout: 300 - # - "quantum-espresso-openmpi" - # - "quantum-espresso" - # - "rsync" - # - "epel-release" - # - "openmpi-devel" #torque - # - "gcc" - # - "gcc-c++" - # - "gcc-gfortran" - # - "openssl-devel" - # - "libxml2-devel" - # - "boost-devel" - # - "net-tools" - # - "strace" - # - "wget" # needed for building QE - # - "readline-devel" #req for slurm - # - "pam-devel" # req for slurm - # - "perl-ExtUtils-MakeMaker" # req for slurm - # - "fftw" # req for QE... need a better way to specify these!!! - # + + - name: enable hostfs mount on singularity + replace: + path: /etc/singularity/singularity.conf + regexp: "^(mount hostfs = no)$" + replace: "mount hostfs = yes" - name: fix slurm user uid user: diff --git a/install.sh b/install.sh index bcdf3e0..f7a64f0 100755 --- a/install.sh +++ b/install.sh @@ -26,6 +26,7 @@ yum -y install \ lmod-defaults-gnu-openmpi-ohpc \ moreutils \ bind-utils \ + nodejs \ jq \ git \ singularity \ From 3b16128a5b0cfa028e5cec8d29b255d7aae6eddc Mon Sep 17 00:00:00 2001 From: Cicada Dennis Date: Fri, 7 Feb 2020 12:39:32 -0500 Subject: [PATCH 07/10] Cicada Dennis - added quotes around variable name, so awk doesn't choke if the variable has spaces in it. 
--- headnode_create.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/headnode_create.sh b/headnode_create.sh index 97360d4..84c81dd 100755 --- a/headnode_create.sh +++ b/headnode_create.sh @@ -79,7 +79,7 @@ if [[ -e ${HOME}/.ssh/id_rsa.pub ]]; then fi openstack_keys=$(openstack keypair list -f value) -home_key_in_OS=$(echo "${openstack_keys}" | awk -v mykey=${home_key_fingerprint} '$2 ~ mykey {print $1}') +home_key_in_OS=$(echo "${openstack_keys}" | awk -v mykey="${home_key_fingerprint}" '$2 ~ mykey {print $1}') if [[ -n "${home_key_in_OS}" ]]; then OS_keyname=${home_key_in_OS} From 527ec7cf85d5090c8bc61cfb0f0071c2a7d69b94 Mon Sep 17 00:00:00 2001 From: Eric Coulter Date: Tue, 18 Feb 2020 13:43:56 -0500 Subject: [PATCH 08/10] Added note about node_size modification --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index c92ce61..2945b71 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,9 @@ To build your own Virtual cluster, starting on your localhost: the NodeName and PartitionName line. * If you'd like to change the default node size, the ```node_size=```line in ```slurm_resume.sh``` must be changed. + This should take values corresponding to instance sizes in Jetstream, like + "m1.small" or "m1.large". Be sure to edit the ```slurm.conf``` file to + reflect the number of CPUs available. * If you'd like to enable any specific software, you should edit ```compute_build_base_img.yml```. 
The task named "install basic packages" can be easily extended to install anything available from a yum From 2ad3712b0d2f3dcd83559c240c078407d12de771 Mon Sep 17 00:00:00 2001 From: Eric Coulter Date: Mon, 2 Mar 2020 14:57:26 -0500 Subject: [PATCH 09/10] added openrc generation Generate openrc in /etc/slurm/ from env vars rather than copy since some openrc files ask for the password Signed-off-by: Eric Coulter --- install.sh | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/install.sh b/install.sh index 6fd5333..7442170 100755 --- a/install.sh +++ b/install.sh @@ -145,9 +145,16 @@ setfacl -m u:slurm:rwx /etc/ chmod +t /etc -#How to generate a working openrc in the cloud-init script for this? Bash vars available? -# Gonna be tough, since openrc requires a password... -cp openrc.sh /etc/slurm/ +#Possible to handle this at the cloud-init level? From a machine w/ +# pre-loaded openrc, possible via user-data and write_files, yes. +echo -e "export OS_PROJECT_DOMAIN_NAME=tacc +export OS_USER_DOMAIN_NAME=tacc +export OS_PROJECT_NAME=${OS_PROJECT_NAME} +export OS_USERNAME=${OS_USERNAME} +export OS_PASSWORD=${OS_PASSWORD} +export OS_AUTH_URL=${OS_AUTH_URL} +export OS_IDENTITY_API_VERSION=3" > /etc/slurm/openrc.sh + chown slurm:slurm /etc/slurm/openrc.sh From 8d0cee053722730b91a7eb1c114ebdc9ebb37942 Mon Sep 17 00:00:00 2001 From: Eric Coulter Date: Mon, 2 Mar 2020 14:58:20 -0500 Subject: [PATCH 10/10] update slurm.conf at node creation time pull in slurm.conf at node creation via cloud-config write_files module, rather than having to rebuild the instance image when only changing number/size of nodes or partitions Signed-off-by: Eric Coulter --- slurm_resume.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/slurm_resume.sh b/slurm_resume.sh index 3da1508..5ceea00 100755 --- a/slurm_resume.sh +++ b/slurm_resume.sh @@ -15,12 +15,15 @@ for host in $(scontrol show hostname $1) do #Launch compute nodes and check for new ip
address in same subprocess - with 2s delay between Openstack requests + #--user-data <(cat /etc/slurm/prevent-updates.ci && echo -e "hostname: $host \npreserve_hostname: true\ndebug:") \ + # the current --user-data pulls in the slurm.conf as well, to avoid rebuilding node images + # when adding / changing partitions (echo "creating $host" >> $log_loc; openstack server create $host \ --flavor $node_size \ --image $node_image \ --key-name $key_name \ - --user-data <(cat /etc/slurm/prevent-updates.ci && echo -e "hostname: $host \npreserve_hostname: true\ndebug:") \ + --user-data <(cat /etc/slurm/prevent-updates.ci && echo -e "hostname: $host \npreserve_hostname: true\ndebug:" && echo -e "write_files:\n - encoding: b64\n owner: slurm\n path: /etc/slurm/slurm.conf\n permissions: 0644\n content: |\n$(cat /etc/slurm/slurm.conf | base64 | sed 's/^/ /')\n") \ --security-group ${OS_USERNAME}-global-ssh --security-group ${OS_USERNAME}-cluster-internal \ --nic net-id=$network_name 2>&1 \ | tee -a $log_loc | awk '/status/ {print $4}' >> $log_loc 2>&1;