diff --git a/lib/common.sh b/lib/common.sh index 25ae9932..d5e90d68 100644 --- a/lib/common.sh +++ b/lib/common.sh @@ -22,6 +22,11 @@ MOUNTMAPv3="${OPTDIRDATA}/mountmap" # MOUNTMAPv4="${OPTDIRDATA}/mountmapv4" +# +# This stores the map of local IP and share name an external file endpoint IP. +# +MOUNTMAPv4NONTLS="${OPTDIRDATA}/mountmapv4nontls" + RED="\e[2;31m" GREEN="\e[2;32m" YELLOW="\e[2;33m" @@ -52,7 +57,77 @@ RELEASE_NUMBER_FOR_AKS=x.y.z # How often does the watchdog look for unmounts and/or IP address changes for # Blob and nfs file endpoints. # -MONITOR_INTERVAL_SECS=5 +MONITOR_INTERVAL_SECS=30 + +# +# ------------------ Common definitions from nfsv3mountscript.sh -------------------- +# + +# +# Default order in which we try the network prefixes for a free local IP to use. +# This can be overridden using AZNFS_IP_PREFIXES environment variable. +# +DEFAULT_AZNFS_IP_PREFIXES="10.161 192.168 172.16" +IP_PREFIXES="${AZNFS_IP_PREFIXES:-${DEFAULT_AZNFS_IP_PREFIXES}}" + +# Aznfs port, defaults to 2048. +AZNFS_PORT="${AZNFS_PORT:-2048}" + +# Default to checking azure nconnect support. +AZNFS_CHECK_AZURE_NCONNECT="${AZNFS_CHECK_AZURE_NCONNECT:-1}" + +# Default to fixing mount options passed in to help the user. +AZNFS_FIX_MOUNT_OPTIONS="${AZNFS_FIX_MOUNT_OPTIONS:-1}" + +# Default to fixing dirty bytes config to help the user. +AZNFS_FIX_DIRTY_BYTES_CONFIG="${AZNFS_FIX_DIRTY_BYTES_CONFIG:-1}" + +# Read ahead size in KB defaults to 16384. +AZNFS_READ_AHEAD_KB="${AZNFS_READ_AHEAD_KB:-16384}" + +# +# Use noresvport mount option to allow using non-reserve ports by client. +# This allows much higher number of local ports to be used by NFS client and +# hence may alleviate some issues due to running out of very small resv port range. +# Blob NFS doesn't require clients to use reserve ports so we can use non-reserve +# port with Blob NFS but Linux NFS client doesn't reuse source port while reconnecting +# if noresvport option is used. 
This does not work well with the DRC cache. +# +AZNFS_USE_NORESVPORT="${AZNFS_USE_NORESVPORT:-0}" + +# Set the fingerprint GUID as an environment variable with a default value. +AZNFS_FINGERPRINT="${AZNFS_FINGERPRINT:-80a18d5c-9553-4c64-88dd-d7553c6b3beb}" + +# +# Default to maximum number of mount retries in case of server-side returns failure. +# Retries make the mount process more robust. Currently, we don't distinguish between +# access denied failure due to intermittent issues or genuine mount failures. We retry anyways. +# +AZNFS_MAX_MOUNT_RETRIES="${AZNFS_MAX_MOUNT_RETRIES:-3}" + +# +# Maximum number of accounts that can be mounted from the same tenant/cluster. +# Any number of containers on these many accounts can be mounted. +# With ~350 reserved ports and 16 connections per mount (with nconnect=16) leaving +# some room, 20 is a reasonable limit. +# +MAX_ACCOUNTS_MOUNTABLE_FROM_SINGLE_TENANT=20 + +# +# Local IP that is free to use. +# +LOCAL_IP="" + +# +# Choose the local IP based on last used IP in MOUNTMAPv3 if this flag is enabled. +# +OPTIMIZE_GET_FREE_LOCAL_IP=true + +# +# True if user has asked to use port 2047 using 'port=2047' mount option. +# This signifies server side nconnect which has some special needs. +# +USING_PORT_2047=false _log() { @@ -339,26 +414,31 @@ is_private_ip() } # -# Mount helper must call this function to grab a timed lease on all MOUNTMAPv3 +# Mount helper must call this function to grab a timed lease on all mountmap # entries. It should do this if it decides to use any of the entries. Once -# this is called aznfswatchdog is guaranteed to not delete any MOUNTMAPv3 till -# the next 5 minutes. +# this is called aznfswatchdog is guaranteed to not delete any mountmap entries +# till the next 5 minutes. # -# Must be called with MOUNTMAPv3 lock held. +# Must be called with mountmap lock held. 
# -touch_mountmapv3() +# Parameters: +# $1 - mountmap_file: The mountmap file to touch +# +touch_mountmap() { - chattr -f -i $MOUNTMAPv3 - touch $MOUNTMAPv3 + local mountmap_file=$1 + + chattr -f -i $mountmap_file + touch $mountmap_file if [ $? -ne 0 ]; then - chattr -f +i $MOUNTMAPv3 - eecho "Failed to touch ${MOUNTMAPv3}!" + chattr -f +i $mountmap_file + eecho "Failed to touch ${mountmap_file}!" return 1 fi - chattr -f +i $MOUNTMAPv3 + chattr -f +i $mountmap_file } -# Create mount map file +# Create mount map file MOUNTMAPv3 or MOUNTMAPv4 create_mountmap_file() { local mountmap_filename=MOUNTMAPv$AZNFS_VERSION @@ -372,113 +452,201 @@ create_mountmap_file() fi } +# Create mountmap file MOUNTMAPv4NONTLS +create_mountmap_file_nontlsv4() +{ + local mountmap_filename_nontls=MOUNTMAPv4NONTLS + if [ ! -f ${!mountmap_filename_nontls} ]; then + touch ${!mountmap_filename_nontls} + if [ $? -ne 0 ]; then + eecho "[FATAL] Not able to create '${!mountmap_filename_nontls}'!" + return 1 + fi + chattr -f +i ${!mountmap_filename_nontls} + fi + + local fslocation_filename=VIRTUALFSLOCATION + + if [ ! -f ${!fslocation_filename} ]; then + touch ${!fslocation_filename} + if [ $? -ne 0 ]; then + eecho "[FATAL] Not able to create '${!fslocation_filename}'!" + return 1 + fi + chattr -f +i ${!fslocation_filename} + fi +} + +# +# Calculate control file name based on storage account hostname. +# Returns: AZNFSCtrl.txt where hash is derived from the account name. 
+# +get_aznfs_ctrl_filename() +{ + local hostname="$1" + local account_name=${hostname%%.*} + local key="abc" + local keylen=${#key} + local acc=0 + + for (( i=0; i<${#account_name}; ++i )); do + # Extract single character (byte) from each string + local ch="${account_name:i:1}" + local kch="${key:i%keylen:1}" + + # Get decimal byte values + local b=$(printf '%d' "'$ch") + local kb=$(printf '%d' "'$kch") + + local xored=$(( (b ^ kb) & 0xFF )) + local shift_amt=$(( (i % 4) * 8 )) + acc=$(( acc ^ (xored << shift_amt ) )) + done + + acc=$(( acc & 0xFFFFFFFF )) + echo "AZNFSCtrl.txt${acc}" +} + # # MOUNTMAPv3 is accessed by both mount.aznfs and aznfswatchdog service. Update it # only after taking exclusive lock. # -# Add entry to MOUNTMAPv3 in case of a new mount or IP change for blob FQDN. +# Add entry to mountmap in case of a new mount or IP change for blob/file FQDN. # -# This also ensures that the corresponding DNAT rule is created so that MOUNTMAPv3 +# This also ensures that the corresponding DNAT rule is created so that mountmap # entry and DNAT rule are always in sync. +# For Nfsv4 Non TLS, also add CRC32 based on the account name +# +# Parameters: +# $1 - entry: The entry to add (format: "host ip nfsip") +# $2 - mountmap_file: The mountmap file to update # -ensure_mountmapv3_exist_nolock() +ensure_mountmap_exist_nolock() { - IFS=" " read l_host l_ip l_nfsip <<< "$1" + local entry=$1 + local mountmap_file=$2 + + IFS=" " read l_host l_ip l_nfsip <<< "$entry" if ! ensure_iptable_entry $l_ip $l_nfsip; then - eecho "[$1] failed to add to ${MOUNTMAPv3}!" + eecho "[$entry] failed to add to ${mountmap_file}!" return 1 fi + line="$entry" + if [ "$AZNFS_VERSION" = "4" ]; then + #calculate crc32 and then append to the line + local ctrl_filename=$(get_aznfs_ctrl_filename "$l_host") + vecho "Control file for $l_host: $ctrl_filename" + line+=" $ctrl_filename" + fi - egrep -q "^${1}$" $MOUNTMAPv3 + egrep -q "^${line}$" $mountmap_file if [ $? 
-ne 0 ]; then - chattr -f -i $MOUNTMAPv3 - echo "$1" >> $MOUNTMAPv3 + chattr -f -i $mountmap_file + echo "$line" >> $mountmap_file if [ $? -ne 0 ]; then - chattr -f +i $MOUNTMAPv3 - eecho "[$1] failed to add to ${MOUNTMAPv3}!" - # Could not add MOUNTMAPv3 entry, delete the DNAT rule added above. + chattr -f +i $mountmap_file + eecho "[$entry] failed to add to ${mountmap_file}!" + # Could not add mountmap entry, delete the DNAT rule added above. ensure_iptable_entry_not_exist $l_ip $l_nfsip return 1 fi - chattr -f +i $MOUNTMAPv3 + chattr -f +i $mountmap_file else - pecho "[$1] already exists in ${MOUNTMAPv3}." + pecho "[$entry] already exists in ${mountmap_file}." fi } -ensure_mountmapv3_exist() +# +# Add entry to mountmap with exclusive lock. +# +# Parameters: +# $1 - entry: The entry to add (format: "host ip nfsip") +# $2 - mountmap_file: The mountmap file to update +# +ensure_mountmap_exist() { + local entry=$1 + local mountmap_file=$2 + ( flock -e 999 - ensure_mountmapv3_exist_nolock "$1" + ensure_mountmap_exist_nolock "$entry" "$mountmap_file" return $? - ) 999<$MOUNTMAPv3 + ) 999<$mountmap_file } # -# Delete entry from MOUNTMAPv3 and also the corresponding iptable rule. +# Delete entry from mountmap and also the corresponding iptable rule. +# +# Parameters: +# $1 - line: The entry to delete +# $2 - mountmap_file: The mountmap file to update +# $3 - ifmatch (optional): Only delete if mountmap mtime matches this value # -ensure_mountmapv3_not_exist() +ensure_mountmap_not_exist() { + local line=$1 + local mountmap_file=$2 + local ifmatch="$3" + ( flock -e 999 # - # If user wants to delete the entry only if MOUNTMAPv3 has not changed since + # If user wants to delete the entry only if mountmap has not changed since # he looked up, honour that. 
# - local ifmatch="$2" if [ -n "$ifmatch" ]; then - local mtime=$(stat -c%Y $MOUNTMAPv3) + local mtime=$(stat -c%Y $mountmap_file) if [ "$mtime" != "$ifmatch" ]; then - eecho "[$1] Refusing to remove from ${MOUNTMAPv3} as $mtime != $ifmatch!" + eecho "[$line] Refusing to remove from ${mountmap_file} as $mtime != $ifmatch!" return 1 fi fi - # Delete iptable rule corresponding to the outgoing MOUNTMAPv3 entry. - IFS=" " read l_host l_ip l_nfsip <<< "$1" + # Delete iptable rule corresponding to the outgoing mountmap entry. + IFS=" " read l_host l_ip l_nfsip l_aznfsctrlfile <<< "$line" if [ -n "$l_host" -a -n "$l_ip" -a -n "$l_nfsip" ]; then if ! ensure_iptable_entry_not_exist $l_ip $l_nfsip; then - eecho "[$1] Refusing to remove from ${MOUNTMAPv3} as iptable entry could not be deleted!" + eecho "[$line] Refusing to remove from ${mountmap_file} as iptable entry could not be deleted!" return 1 fi fi - chattr -f -i $MOUNTMAPv3 + chattr -f -i $mountmap_file # # We do this thing instead of inplace update by sed as that has a - # very bad side-effect of creating a new MOUNTMAPv3 file. This breaks + # very bad side-effect of creating a new mountmap file. This breaks # any locking that we dependent on the old file. # - out=$(sed "\%^${1}$%d" $MOUNTMAPv3) + out=$(sed "\%^${line}$%d" $mountmap_file) ret=$? if [ $ret -eq 0 ]; then # - # If this echo fails then MOUNTMAPv3 could be truncated. In that case we need + # If this echo fails then mountmap could be truncated. In that case we need # to reconcile it from the mount info and iptable info. That needs to be done # out-of-band. # - echo "$out" > $MOUNTMAPv3 + echo "$out" > $mountmap_file ret=$? out= if [ $ret -ne 0 ]; then - eecho "*** [FATAL] MOUNTMAPv3 may be in inconsistent state, contact Microsoft support ***" + eecho "*** [FATAL] $mountmap_file may be in inconsistent state, contact Microsoft support ***" fi fi if [ $ret -ne 0 ]; then - chattr -f +i $MOUNTMAPv3 - eecho "[$1] failed to remove from ${MOUNTMAPv3}!" 
+ chattr -f +i $mountmap_file + eecho "[$line] failed to remove from ${mountmap_file}!" # Reinstate DNAT rule deleted above. ensure_iptable_entry $l_ip $l_nfsip return 1 fi - chattr -f +i $MOUNTMAPv3 + chattr -f +i $mountmap_file # Return the mtime after our mods. - echo $(stat -c%Y $MOUNTMAPv3) - ) 999<$MOUNTMAPv3 + echo $(stat -c%Y $mountmap_file) + ) 999<$mountmap_file } # @@ -487,66 +655,451 @@ ensure_mountmapv3_not_exist() # corresponding to old entry and adding the DNAT rule corresponding to the new # entry. # -update_mountmapv3_entry() +# Parameters: +# $1 - old: The old entry to replace +# $2 - new: The new entry to replace with +# $3 - mountmap_file: The mountmap file to update +# +update_mountmap_entry() { local old=$1 local new=$2 + local mountmap_file=$3 - vecho "Updating mountmapv3 entry [$old -> $new]" + vecho "Updating mountmap entry [$old -> $new] in $mountmap_file" ( flock -e 999 - IFS=" " read l_host l_ip l_nfsip_old <<< "$old" + IFS=" " read l_host l_ip l_nfsip_old l_aznfsctrlfile <<< "$old" if [ -n "$l_host" -a -n "$l_ip" -a -n "$l_nfsip_old" ]; then if ! ensure_iptable_entry_not_exist $l_ip $l_nfsip_old; then - eecho "[$old] Refusing to remove from ${MOUNTMAPv3} as old iptable entry could not be deleted!" + eecho "[$old] Refusing to remove from ${mountmap_file} as old iptable entry could not be deleted!" return 1 fi fi - IFS=" " read l_host l_ip l_nfsip_new <<< "$new" + IFS=" " read l_host l_ip l_nfsip_new l_aznfsctrlfile <<< "$new" if [ -n "$l_host" -a -n "$l_ip" -a -n "$l_nfsip_new" ]; then if ! ensure_iptable_entry $l_ip $l_nfsip_new; then - eecho "[$new] Refusing to remove from ${MOUNTMAPv3} as new iptable entry could not be added!" + eecho "[$new] Refusing to remove from ${mountmap_file} as new iptable entry could not be added!" # Roll back. 
ensure_iptable_entry $l_ip $l_nfsip_old return 1 fi fi - chattr -f -i $MOUNTMAPv3 + chattr -f -i $mountmap_file # # We do this thing instead of inplace update by sed as that has a - # very bad side-effect of creating a new MOUNTMAPv3 file. This breaks + # very bad side-effect of creating a new mountmap file. This breaks # any locking that we dependent on the old file. # - out=$(sed "s%^${old}$%${new}%g" $MOUNTMAPv3) + out=$(sed "s%^${old}$%${new}%g" $mountmap_file) ret=$? if [ $ret -eq 0 ]; then # - # If this echo fails then MOUNTMAPv3 could be truncated. In that case we need + # If this echo fails then mountmap could be truncated. In that case we need # to reconcile it from the mount info and iptable info. That needs to be done # out-of-band. # - echo "$out" > $MOUNTMAPv3 + echo "$out" > $mountmap_file ret=$? out= if [ $ret -ne 0 ]; then - eecho "*** [FATAL] MOUNTMAPv3 may be in inconsistent state, contact Microsoft support ***" + eecho "*** [FATAL] $mountmap_file may be in inconsistent state, contact Microsoft support ***" fi fi if [ $ret -ne 0 ]; then - chattr -f +i $MOUNTMAPv3 - eecho "[$old -> $new] failed to update ${MOUNTMAPv3}!" + chattr -f +i $mountmap_file + eecho "[$old -> $new] failed to update ${mountmap_file}!" # Roll back. ensure_iptable_entry_not_exist $l_ip $l_nfsip_new ensure_iptable_entry $l_ip $l_nfsip_old return 1 fi - chattr -f +i $MOUNTMAPv3 - ) 999<$MOUNTMAPv3 + chattr -f +i $mountmap_file + ) 999<$mountmap_file +} + +# +# Is the given address one of the host addresses? +# +is_host_ip() +{ + # + # Do not make this local as status gathering does not work well when + # collecting command o/p to local variables. + # + route=$(ip -4 route get fibmatch $1 2>/dev/null) + if [ $? -ne 0 ]; then + return 1 + fi + + if ! echo "$route" | grep -q "scope host"; then + return 1 + fi + + return 0 +} + +# +# Is the given address one of the addresses directly reachable from the host? 
#
# Parameters:
#   $1 - IPv4 address (or network prefix) to look up in the routing table.
#
# Returns 0 if the route matching $1 has "scope link", 1 otherwise (including
# when the route lookup itself fails).
#
is_link_ip()
{
    #
    # Declare first and assign separately: "local route=$(...)" would make $?
    # report the status of 'local' instead of the status of the ip command.
    # Keeping it local also avoids clobbering a caller's 'route' variable.
    #
    local route

    if ! route=$(ip -4 route get fibmatch "$1" 2>/dev/null); then
        return 1
    fi

    if grep -q "scope link" <<< "$route"; then
        return 0
    fi

    return 1
}

#
# Check if a given IPv4 address is responding to ICMP pings.
# Uses a 3 secs timeout to bail out in time if address is not responding.
#
# Parameters:
#   $1 - IPv4 address to probe.
#
# Returns 0 if the address answered the ping. Returns 1 if probing is disabled
# and otherwise returns the exit status of ping.
#
is_pinging()
{
    #
    # Unless env var AZNFS_PING_LOCAL_IP_BEFORE_USE is set to 1, pretend the
    # address is not responding, i.e. it is available for use.
    #
    if [ "$AZNFS_PING_LOCAL_IP_BEFORE_USE" != "1" ]; then
        return 1
    fi

    local ip=$1

    # 3 secs timeout should be good.
    ping -4 -W3 -c1 "$ip" > /dev/null 2>&1
}

#
# Returns number of octets in an IPv4 prefix.
# If IP prefix is not valid or is not a private IP address prefix, it returns 0.
#
# f.e. For 10 it will return 1, for 10.10 it will return 2, for 10.10.10 it will
# return 3 and for 10.10.10.10, it will return 4.
#
# Parameters:
#   $1 - IPv4 prefix to examine.
#
# The count is written to stdout; the exit status is not meaningful.
# Validity of the prefix is decided by is_valid_ipv4_prefix.
#
octets_in_ipv4_prefix()
{
    local ip=$1
    local octet="[0-9]{1,3}"
    local -a octets=()

    # Quoted so that an empty argument is still passed to the validator.
    if ! is_valid_ipv4_prefix "$ip"; then
        echo 0
        return
    fi

    #
    # Check if the IP prefix belongs to the private IP range (10.0.0.0/8,
    # 172.16.0.0/12, or 192.168.0.0/16), i.e., will the user provided prefix
    # result in a private IP address.
    #
    if ! [[ $ip =~ ^10(\.${octet})*$ ]] &&
       ! [[ $ip =~ ^172\.(1[6-9]|2[0-9]|3[0-1])(\.${octet})*$ ]] &&
       ! [[ $ip =~ ^192\.168(\.${octet})*$ ]]; then
        echo 0
        return
    fi

    #
    # The checks above guarantee the form "octet(.octet)*", so splitting on
    # the dots yields exactly one array element per octet.
    #
    IFS="." read -r -a octets <<< "$ip"

    if (( ${#octets[@]} >= 1 && ${#octets[@]} <= 4 )); then
        echo "${#octets[@]}"
        return
    fi

    # More than 4 octets is not an IPv4 prefix.
    echo 0
}

#
# Search for a free local IP with the given prefix.
# Takes the IP prefix and the mountmap file to use.
#
# Parameters:
#   $1 - initial_ip_prefix: private IPv4 prefix with 2 or 3 octets
#   $2 - mountmap_file: mountmap file listing the local IPs already in use
#
# Globals read:    OPTIMIZE_GET_FREE_LOCAL_IP, nfs_host, nfs_ip
# Globals written: LOCAL_IP
#
# Returns 0 with LOCAL_IP set once a free address has been found and recorded
# through ensure_mountmap_exist_nolock. Returns 1 if the prefix is invalid, no
# free address exists within the prefix or the mountmap entry could not be
# added (LOCAL_IP is reset to empty in the last case).
#
search_free_local_ip_with_prefix()
{
    local initial_ip_prefix=$1
    local mountmap_file=$2
    # All working variables are local so the caller's variables are untouched.
    local ip_prefix=$initial_ip_prefix
    local num_octets
    local local_ip=""
    local optimize_get_free_local_ip=false
    local used_local_ips_with_same_prefix
    local iptable_entries
    local last_used_ip last_used_3rd_octet last_used_4th_octet
    local _3rdoctet=100
    local _4thoctet

    # Validate the prefix we were actually given, not a caller's variable.
    num_octets=$(octets_in_ipv4_prefix "$initial_ip_prefix")

    if [[ "$num_octets" != "2" && "$num_octets" != "3" ]]; then
        eecho "Invalid IPv4 prefix: ${initial_ip_prefix}"
        eecho "Valid prefix must have either 2 or 3 octets and must be a valid private IPv4 address prefix."
        eecho "Examples of valid private IPv4 prefixes are 10.10, 10.10.10, 192.168, 192.168.10 etc."
        return 1
    fi

    #
    # Local IPs (2nd mountmap column) starting with "<prefix>.", sorted
    # numerically octet by octet. The prefix is compared as a literal string
    # so that the dots in it are not treated as regex wildcards.
    #
    used_local_ips_with_same_prefix=$(awk -v prefix="${initial_ip_prefix}." \
        'index($2, prefix) == 1 { print $2 }' "$mountmap_file" |
        sort -t . -k 1,1n -k 2,2n -k 3,3n -k 4,4n)
    iptable_entries=$(iptables-save -t nat)

    #
    # Optimize the process to get free local IP by starting the loop to choose
    # 3rd and 4th octet from the number which was used last and still exist in
    # mountmap instead of starting it from 100.
    #
    if [[ "$OPTIMIZE_GET_FREE_LOCAL_IP" == true && -n "$used_local_ips_with_same_prefix" ]]; then
        last_used_ip=$(tail -n1 <<< "$used_local_ips_with_same_prefix")

        IFS="." read -r _ _ last_used_3rd_octet last_used_4th_octet <<< "$last_used_ip"

        if (( num_octets == 2 )); then
            if [[ "$last_used_3rd_octet" == "254" && "$last_used_4th_octet" == "254" ]]; then
                return 1
            fi

            _3rdoctet=$last_used_3rd_octet
        elif [[ "$last_used_4th_octet" == "254" ]]; then
            return 1
        fi

        optimize_get_free_local_ip=true
    fi

    while true; do
        if (( num_octets == 2 )); then
            for ((; _3rdoctet < 255; _3rdoctet++)); do
                ip_prefix="${initial_ip_prefix}.${_3rdoctet}"

                if is_link_ip "$ip_prefix"; then
                    vecho "Skipping link network ${ip_prefix}!"
                    continue
                fi

                break
            done

            if (( _3rdoctet == 255 )); then
                #
                # If the IP prefix had 2 octets and we exhausted all possible
                # values of the 3rd and 4th octet, then we have failed the
                # search for free local IP within the given prefix.
                #
                return 1
            fi
        fi

        # Only the first pass resumes after the last used 4th octet.
        if $optimize_get_free_local_ip; then
            _4thoctet=$(( last_used_4th_octet + 1 ))
            optimize_get_free_local_ip=false
        else
            _4thoctet=100
        fi

        for ((; _4thoctet < 255; _4thoctet++)); do
            local_ip="${ip_prefix}.${_4thoctet}"

            # Whole-line, fixed-string match against the IPs aznfs already uses.
            if grep -Fxq -- "$local_ip" <<< "$used_local_ips_with_same_prefix"; then
                vecho "$local_ip is in use by aznfs!"
                continue
            fi

            if is_host_ip "$local_ip"; then
                vecho "Skipping host address ${local_ip}!"
                continue
            fi

            if is_link_ip "$local_ip"; then
                vecho "Skipping link network ${local_ip}!"
                continue
            fi

            if [[ "$nfs_ip" == "$local_ip" ]]; then
                vecho "Skipping private endpoint IP ${nfs_ip}!"
                continue
            fi

            # Whole-word, fixed-string match against the nat table dump.
            if grep -Fwq -- "$local_ip" <<< "$iptable_entries"; then
                vecho "$local_ip is already present in iptables!"
                continue
            fi

            #
            # Try pinging the address to be sure it is not in use in the
            # client network.
            #
            # Note: If the address exists but not responding to ICMP ping then
            #       we will incorrectly treat it as non-existent.
            #
            if is_pinging "$local_ip"; then
                vecho "Skipping $local_ip as it appears to be in use on the network!"
                continue
            fi

            vecho "Using local IP ($local_ip) for aznfs."
            break
        done

        if (( _4thoctet == 255 )); then
            if (( num_octets == 2 )); then
                # This /24 is exhausted, move on to the next 3rd octet.
                _3rdoctet=$(( _3rdoctet + 1 ))
                continue
            fi

            #
            # If the IP prefix had 3 octets and we exhausted all possible
            # values of the 4th octet, then we have failed the search for
            # free local IP within the given prefix.
            #
            return 1
        fi

        #
        # Happy path!
        #
        # Add this entry to mountmap while we have the mountmap lock.
        # This is to avoid assigning same local ip to parallel mount requests
        # for different endpoints.
        # ensure_mountmap_exist_nolock will also create a matching iptable DNAT rule.
        #
        LOCAL_IP=$local_ip
        if ! ensure_mountmap_exist_nolock "$nfs_host $LOCAL_IP $nfs_ip" "$mountmap_file"; then
            #
            # Do not report success for an address that has neither a mountmap
            # entry nor a DNAT rule.
            #
            LOCAL_IP=""
            return 1
        fi

        return 0
    done

    # We will never reach here.
}

#
# Get a local IP that is free to use. Set global variable LOCAL_IP if found.
# Takes the mountmap file to use for tracking used IPs.
#
# Parameters:
#   $1 - mountmap_file: The mountmap file to use
#
# Globals read:    IP_PREFIXES
# Globals written: LOCAL_IP (through search_free_local_ip_with_prefix),
#                  OPTIMIZE_GET_FREE_LOCAL_IP (set to false if the first pass fails)
#
# Returns 0 if a free local IP was found, 1 otherwise.
#
get_free_local_ip()
{
    local mountmap_file=$1
    local prefix

    # IP_PREFIXES is a space separated list, word splitting is intended here.
    for prefix in $IP_PREFIXES; do
        vecho "Trying IP prefix ${prefix}."
        if search_free_local_ip_with_prefix "$prefix" "$mountmap_file"; then
            return 0
        fi
    done

    #
    # If the above loop is not able to find a free local IP using optimized way,
    # do a linear search to get the free local IP.
    #
    vecho "Falling back to linear search for free ip!"
    OPTIMIZE_GET_FREE_LOCAL_IP=false
    for prefix in $IP_PREFIXES; do
        vecho "Trying IP prefix ${prefix}."
        if search_free_local_ip_with_prefix "$prefix" "$mountmap_file"; then
            return 0
        fi
    done

    # If we come here we did not get a free address to use.
    return 1
}

#
# To maintain consistency in case of regional account and in general to avoid creating
# multiple DNAT entries corresponding to one LOCAL_IP, first check for resolved IP in mountmap.
# This will help keep mountmap and DNAT entries in sync with each other.
# If the current resolved IP is different from the one stored in mountmap then it means that the IP has changed
# since the mountmap entry was created (could be due to migration or more likely due to RAs roundrobin DNS).
# In any case this will be properly handled by aznfswatchdog next time it checks for IP change for this fqdn.
+# +# Parameters: +# $1 - fqdn: The FQDN to resolve +# $2 - mountmap_file: The mountmap file to check for existing IP +# +resolve_ipv4_with_preference_to_mountmap() +{ + local fqdn=$1 + local mountmap_file=$2 + + exec {fd}<$mountmap_file + flock -e $fd + + local mountmap_entry=$(grep -m1 "^${fqdn} " $mountmap_file) + + flock -u $fd + exec {fd}<&- + + IFS=" " read _ local_ip old_nfs_ip <<< "$mountmap_entry" + if [ -n "$old_nfs_ip" ]; then + echo "$old_nfs_ip" + return 2 + fi + + # + # Resolve FQDN to IPv4 using DNS if not found in the mountmap. + # + resolve_ipv4 "$fqdn" "true" +} + +# +# For the given AZNFS endpoint FQDN return a local IP that should proxy it. +# If there is at least one mount to the same FQDN it MUST return the local IP +# used for that, else assign a new free local IP. +# +# Parameters: +# $1 - fqdn: The FQDN to get a local IP for +# $2 - mountmap_file: The mountmap file to use +# +get_local_ip_for_fqdn() +{ + local fqdn=$1 + local mountmap_file=$2 + local mountmap_entry=$(grep -m1 "^${fqdn} " $mountmap_file) + # One local ip per fqdn, so return existing one if already present. + IFS=" " read _ local_ip _ <<< "$mountmap_entry" + + if [ -n "$local_ip" ]; then + LOCAL_IP=$local_ip + + # + # Ask aznfswatchdog to stay away while we are using this proxy IP. + # This is similar to holding a timed lease, we can safely use this + # proxy IP w/o worrying about aznfswatchdog deleting it for 5 minutes. + # + touch_mountmap $mountmap_file + + # + # This is not really needed since iptable entry must also be present, + # but it's always better to ensure mountmap and iptable entries are + # in sync. + # + ensure_iptable_entry $local_ip $nfs_ip + return 0 + fi + + # + # First mount of an account on this client. + # + get_free_local_ip $mountmap_file } # @@ -744,6 +1297,12 @@ if ! create_mountmap_file; then exit 1 fi +# Create mount map file nontls v4 + +if ! 
create_mountmap_file_nontlsv4; then + exit 1 +fi + ulimitfd=$(ulimit -n 2>/dev/null) if [ -n "$ulimitfd" -a $ulimitfd -lt 131072 ]; then ulimit -n 131072 diff --git a/src/aznfswatchdog b/src/aznfswatchdog index 77cf1dc1..eef76e77 100644 --- a/src/aznfswatchdog +++ b/src/aznfswatchdog @@ -26,13 +26,6 @@ AZNFS_SKIP_UNMOUNT_CLEANUP="${AZNFS_SKIP_UNMOUNT_CLEANUP:-0}" # TIMEWAIT timeout to be used for conntrack entries. AZNFS_TIMEWAIT_TIMEOUT="${AZNFS_TIMEWAIT_TIMEOUT:-65}" -# -# Environment variable to control skipping of IP change detection for regional accounts. -# By default we want to skip IP change detection for regional accounts, but if we want to -# disable skipping we can set this environment variable to 0. -# -AZNFS_SKIP_IP_CHANGE_DETECTION_FOR_REGIONAL_ACCOUNTS="${AZNFS_SKIP_IP_CHANGE_DETECTION_FOR_REGIONAL_ACCOUNTS:-1}" - next_ip_change_detection_epoch=0 # @@ -61,6 +54,14 @@ if [ ! -s $RANDBYTES ]; then fi chattr -f +i $RANDBYTES +# +# Environment variable to control skipping of IP change detection for regional accounts. +# By default we want to skip IP change detection for regional accounts, but if we want to +# disable skipping we can set this environment variable to 0. 
+# +AZNFS_SKIP_IP_CHANGE_DETECTION_FOR_REGIONAL_ACCOUNTS="${AZNFS_SKIP_IP_CHANGE_DETECTION_FOR_REGIONAL_ACCOUNTS:-1}" + +# Associative arrays for regional account tracking declare -A ip_change_count declare -A last_ip_change_time declare -A regional_accounts @@ -342,7 +343,7 @@ process_nfsv3_mounts() # exec {fd}<$MOUNTMAPv3 flock -e $fd - mtime_mountmap=$(stat -c%Y $MOUNTMAPv3) + mtime_mountmapv3=$(stat -c%Y $MOUNTMAPv3) IFS=$'\n' lines=$(cat $MOUNTMAPv3) flock -u $fd exec {fd}<&- @@ -360,7 +361,7 @@ process_nfsv3_mounts() # do_unmount_gc=false if [ "$AZNFS_SKIP_UNMOUNT_CLEANUP" == "0" ]; then - if [ $epoch_now -ge $(expr $mtime_mountmap + $MOUNTMAP_INACTIVITY_SECS) ]; then + if [ $epoch_now -ge $(expr $mtime_mountmapv3 + $MOUNTMAP_INACTIVITY_SECS) ]; then do_unmount_gc=true fi fi @@ -398,24 +399,24 @@ process_nfsv3_mounts() if [ -z "$l_host" -o -z "$l_ip" -o -z "$l_nfsip" ]; then wecho "[FATAL] Deleting invalid line in $MOUNTMAPv3: [$line]!" - l_mtime=$(ensure_mountmapv3_not_exist "$line") - [ $? -eq 0 ] && mtime_mountmap=$l_mtime + l_mtime=$(ensure_mountmap_not_exist "$line" "$MOUNTMAPv3") + [ $? -eq 0 ] && mtime_mountmapv3=$l_mtime continue fi # Since we added it to the MOUNTMAPv3 file, it cannot be invalid. if ! is_private_ip "$l_ip"; then wecho "[FATAL] local ip ($l_ip) is invalid!" - l_mtime=$(ensure_mountmapv3_not_exist "$line") - [ $? -eq 0 ] && mtime_mountmap=$l_mtime + l_mtime=$(ensure_mountmap_not_exist "$line" "$MOUNTMAPv3") + [ $? -eq 0 ] && mtime_mountmapv3=$l_mtime continue fi # Since we added it to the MOUNTMAPv3 file, it cannot be invalid. if ! is_valid_ipv4_address "$l_nfsip"; then wecho "[FATAL] Blob endpoint ip ($l_nfsip) is invalid!" - l_mtime=$(ensure_mountmapv3_not_exist "$line") - [ $? -eq 0 ] && mtime_mountmap=$l_mtime + l_mtime=$(ensure_mountmap_not_exist "$line" "$MOUNTMAPv3") + [ $? 
-eq 0 ] && mtime_mountmapv3=$l_mtime continue fi @@ -429,16 +430,16 @@ process_nfsv3_mounts() pecho "No mounted shares for host $l_host, deleting from ${MOUNTMAPv3} [$line]." # Delete IFF mountmap is not changed since we read it above. - l_mtime=$(ensure_mountmapv3_not_exist "$line" "$mtime_mountmap") + l_mtime=$(ensure_mountmap_not_exist "$line" "$MOUNTMAPv3" "$mtime_mountmapv3") # # Update ifmatch time in case of successful updation of MOUNTMAPv3, # so that we can distinguish between MOUNTMAPv3 mtime changing because # of our action or some mount helper changing it. In the former case - # it's safe to update the MOUNTMAPv3, so update mtime_mountmap to the + # it's safe to update the MOUNTMAPv3, so update mtime_mountmapv3 to the # mtime after this update. # - [ $? -eq 0 ] && mtime_mountmap=$l_mtime + [ $? -eq 0 ] && mtime_mountmapv3=$l_mtime continue fi else @@ -493,8 +494,8 @@ process_nfsv3_mounts() if [ "$new_ip" == "NXDOMAIN" ]; then pecho "Account corresponding to $l_host seems to have been deleted, deleting from ${MOUNTMAPv3} [$line]!" - l_mtime=$(ensure_mountmapv3_not_exist "$line" "$mtime_mountmap") - [ $? -eq 0 ] && mtime_mountmap=$l_mtime + l_mtime=$(ensure_mountmap_not_exist "$line" "$MOUNTMAPv3" "$mtime_mountmapv3") + [ $? -eq 0 ] && mtime_mountmapv3=$l_mtime else eecho "resolve_ipv4($l_host) failed: $new_ip" fi @@ -510,7 +511,7 @@ process_nfsv3_mounts() check_for_regional_account "$l_host" # This will update DNAT rule as well. - if ! update_mountmapv3_entry "$line" "$l_host $l_ip $new_ip"; then + if ! update_mountmap_entry "$line" "$l_host $l_ip $new_ip" "$MOUNTMAPv3"; then eecho "Will reattempt the operation in next iteration." 
else mountpoint2048=$(echo "$findmnt" 2>/dev/null | egrep -m1 " nfs ${l_ip}:.*\" | awk '{print $4}') diff --git a/src/aznfswatchdogv4 b/src/aznfswatchdogv4 index ca0091b3..769d37f5 100644 --- a/src/aznfswatchdogv4 +++ b/src/aznfswatchdogv4 @@ -100,6 +100,11 @@ is_nfs_server_active_for_target() done < "$NFSFS_SERVERS_FILE" return 1 } +mtime_mountmapv4=0 +mtime_mountmapv4nontls=0 +# How often do we check for change in FQDN->IP? +IP_CHANGE_DETECTION_FREQUENCY=60 +next_ip_change_detection_epoch=0 # # Kill stunnel process and clean up stunnel files generated by aznfs mount helper @@ -138,6 +143,37 @@ cleanup_stunnel_files() fi } +# +# Kill Stunnel for Migration and remove pid and log files. +# +cleanup_stunnel_files() +{ + local l_conf=$1 + local l_log=$2 + local l_pid=$3 + local accept_port + + # Kill stunnel process first. + pid=$(cat $l_pid) + accept_port=$(cat $l_conf | grep accept | cut -d ':' -f 2) + pecho "killing stunnel process with pid: $pid on port: $accept_port" + kill -9 $pid + if [ $? -ne 0 ]; then + eecho "Unable to kill stunnel process $pid!" + fi + + # Cleanup stunnel files + rm $l_log + if [ $? -ne 0 ]; then + eecho "[FATAL] Unable to delete stunnel log file $l_log!" + fi + + rm $l_pid + if [ $? -ne 0 ]; then + eecho "[FATAL] Unable to delete stunnel pid file $l_pid!" + fi +} + # # Delete entry from MOUNTMAPv4. # @@ -201,17 +237,17 @@ cleanup_mount() flock -e $fd2 # Delete IFF mountmap is not changed since we read it above. - l_mtime=$(ensure_mountmapv4_not_exist "$line" "$mtime_mountmap") + l_mtime=$(ensure_mountmapv4_not_exist "$line" "$mtime_mountmapv4") # # Update mountmap mtime in case of successful updation of MOUNTMAPv4, # so that we can distinguish between MOUNTMAPv4 mtime changing because # of our action or some mount helper changing it. In the former case - # it's safe to update the MOUNTMAPv4, so update mtime_mountmap to the + # it's safe to update the MOUNTMAPv4, so update mtime_mountmapv4 to the # mtime after this update. # if [ $? 
-eq 0 ]; then - mtime_mountmap=$l_mtime + mtime_mountmapv4=$l_mtime else # If the mountmap file is changed since we read it, we need to read it again - don't modify anything. eecho "Failed to delete entry from ${MOUNTMAPv4}! Entry: [$line]" @@ -247,7 +283,7 @@ process_nfsv4_mounts() # exec {fd}<$MOUNTMAPv4 flock -e $fd - mtime_mountmap=$(stat -c%Y $MOUNTMAPv4) + mtime_mountmapv4=$(stat -c%Y $MOUNTMAPv4) IFS=$'\n' lines=$(cat $MOUNTMAPv4) flock -u $fd exec {fd}<&- @@ -258,7 +294,7 @@ process_nfsv4_mounts() # for sure that it's not in use by any mount and can be removed. # findmnt=$(findmnt | grep 'nfs4\|$LOCALHOST' 2>&1) - + #findmnt=$(findmnt --raw --noheading -o MAJ:MIN,FSTYPE,SOURCE,TARGET,OPTIONS -t nfs 2>&1) # # For no matching mounts also, findmnt exits with a failure return, so check # for both exit status and non-empty error o/p. @@ -281,19 +317,20 @@ process_nfsv4_mounts() if [ -z "$line" ]; then continue fi - # # MOUNTMAPv4 line is of the form: - # ;;;;;; + # ;;;;;;; # - IFS=";" read l_ip l_conf l_log l_pid l_checksumhash l_status l_timeout <<< "$line" + + IFS=";" read l_ip l_conf l_log l_pid l_checksumhash l_status l_timeout l_crc_32 <<< "$line" + IP_changed=false #we don't need to worry abour regional accounts with l_crc32 if [ -z "$l_ip" -o -z "$l_conf" -o -z "$l_pid" ]; then wecho "[FATAL] Deleting invalid line in $MOUNTMAPv4: [$line]!" exec {fd2}<$MOUNTMAPv4 flock -e $fd2 l_mtime=$(ensure_mountmapv4_not_exist "$line") - [ $? -eq 0 ] && mtime_mountmap=$l_mtime + [ $? -eq 0 ] && mtime_mountmapv4=$l_mtime flock -u $fd2 exec {fd2}<&- continue @@ -311,26 +348,116 @@ process_nfsv4_mounts() fi accept_port=$(cat $l_conf | grep accept | cut -d ':' -f 2) - # vecho "accept_port: $accept_port" + #vecho "accept_port: $accept_port" # # Delete entry from MOUNTMAPv4 if there are no mounted shares on that host. # As long as we have at least one mount using the MOUNTMAPv4 entry, we leave # it around. # - if ! 
echo "$findmnt" | grep "$accept_port" >/dev/null; then + findmnt_output=$(echo "$findmnt" | grep "$accept_port") + vecho "findmnt_output: $findmnt_output" + if [ -z "$findmnt_output" ]; then vecho "findmnt shows no mount for accept_port=$accept_port (line=[$line])" if is_nfs_server_active_for_target "$LOCALHOST" "$accept_port"; then pecho "NFS server entry still active for $LOCALHOST:$accept_port; skipping cleanup for [$line]." continue fi - pecho "No mounted shares for host $l_ip with accept port $accept_port, deleting from ${MOUNTMAPv4} [$line]." cleanup_mount $l_conf $l_log $l_pid $line continue else - # vecho "Mounted shares found for host $l_ip with accept port $accept_port." + vecho "Mounted shares found for host $l_ip with accept port $accept_port." + # + # Fetch FSLocationIP from the virtual file and check if it's updated for migration scenario. + # The virtual file is at the mountpoint and named with the crc32 value in the mountmap entry. + # + local mountpoint=$(echo "$findmnt_output" | awk '{print $1}' | sed 's/^[└├│─]*//g') + l_fslocation_line=$(cat "${mountpoint}/${l_crc_32}") + #read now from the mountpoint + read_status=$? + IFS=";" read l_prt l_fslocationIP <<< "$l_fslocation_line" + #process the output + if [ $read_status -ne 0 ]; then + eecho "Failed to read from $l_crc32_file" + return 1 + fi + + # + # If FSLocationIP is not empty, and updated from the IP stored in MountMap and PRT is in migration stage, update. + # Mountmap, config, log, and checksumhash need to be updated accordingly. Then restart stunnel with new config file. + # If it's empty, it will skip this logic as ip will not be changed. - before server upgrade + # + if [ -n "$l_fslocationIP" -a "$l_fslocationIP" != "$l_ip" -a "$l_prt" -ne 0 ]; then + vecho " IP for $l_host changed [$l_ip -> $l_fslocationIP]." + + new_ip=$l_fslocationIP + IP_changed=true + + #Update stunnel config file with new IP + out=$(sed "s/${l_ip}/${new_ip}/g" $l_conf) + + ret=$?
+ + #kill old stunnel process and remove old files + cleanup_stunnel_files $l_conf $l_log $l_pid + + #update the contents of the l_conf file with the new ip + #changing log file location, and pid. Then recalculate checksum hash + if [ $ret -eq 0 ]; then + chattr -f -i $l_conf + # + # If this echo fails then MOUNTMAPv4 could be truncated. + # + echo "$out" > $l_conf + ret=$? + out= + if [ $ret -ne 0 ]; then + eecho "*** [FATAL] MOUNTMAPv4 may be in inconsistent state, contact Microsoft support ***" + fi + chattr -f +i $l_conf + fi + + #update filepath for config file and move to new path + new_conf_file_path=$(echo "$l_conf" | sed "s/${l_ip}/${new_ip}/g") + chattr -f -i $l_conf + mv -vf $l_conf $new_conf_file_path + chattr -f +i $new_conf_file_path + + #update mountmapentry to update checksumhash and ip files + newchecksumhash=`cksum $new_conf_file_path | awk '{print $1}'` + chattr -f -i $MOUNTMAPv4 + outmountmap=$(sed -e "s/$l_checksumhash/$newchecksumhash/g" -e "s/$l_ip/$new_ip/g" $MOUNTMAPv4) + + #Update MOUNTMAPv4 with new IP and new Checksum hash + ret=$? + if [ $ret -eq 0 ]; then + # + # If this echo fails then MOUNTMAPv4 could be truncated. + # + echo "$outmountmap" > $MOUNTMAPv4 + ret=$? + out= + if [ $ret -ne 0 ]; then + eecho "*** [FATAL] MOUNTMAPv4 may be in inconsistent state, contact Microsoft support ***" + fi + fi + + chattr -f +i $MOUNTMAPv4 + + # Update all variable paths to reflect new IP + l_conf=$new_conf_file_path + l_log=$(echo "$l_log" | sed "s/${l_ip}/${new_ip}/g") + l_pid=$(echo "$l_pid" | sed "s/${l_ip}/${new_ip}/g") + l_ip=$new_ip + l_checksumhash=$newchecksumhash + + # Start the new stunnel process + stunnel_status_new=$(stunnel $l_conf 2>&1) + + # done + fi # Check if checksumHash for stunnel.conf file has changed. # Customers should not modify stunnel.conf files created by aznfs mount helper. @@ -339,7 +466,8 @@ process_nfsv4_mounts() eecho "Failed to get the checksum hash of file: '${l_conf}'!" 
fi - if [ $checksumHash != $l_checksumhash ]; then + # If files were updated outside of IP being changed, then cleanup the mount. + if [ $checksumHash != $l_checksumhash ] && [ "$IP_changed" = "false" ]; then eecho "'${l_conf}' file has modified!" eecho "It's not recommended to modify '${l_conf}' file created by aznfs mount helper!" eecho "watchdog service will do cleanup, kill stunnel process with pid:$(cat $l_pid) and remove '${l_conf}'; '${l_log}'; '${l_pid}'!" @@ -348,7 +476,7 @@ process_nfsv4_mounts() cleanup_mount $l_conf $l_log $l_pid $line continue fi - + is_stunnel_running=$($NETSTATCOMMAND -anp | grep stunnel | grep `cat $l_pid`) if [ -z "$is_stunnel_running" ]; then vecho "Watchdog: stunnel is not running! Restarting the stunnel" @@ -376,6 +504,180 @@ process_nfsv4_mounts() done } +process_nfsv4_nontlsmounts() +{ + epoch_now=$(date +%s) + # + # Go over all lines in MOUNTMAPv4NONTLS and check them for two things: + # 1. Is that entry still in use by at least one aznfs mount, if not remove the entry. + # 2. Has the Blob endpoint address changed from what is stored? + # If yes, update DNAT rule to point to the new address and update entry accordingly. + # + # Sample line in MOUNTMAPv4NONTLS. + # account.file.preprod.core.windows.net 10.100.100.100 52.230.170.200 AZNFSCtrl.txt12345 + # + # where the format is + # fileendpoint_fqdn proxy_ip fileendpoint_ip crc_32 + # + # We store the mtime of MOUNTMAPv4NONTLS while inside the lock so that if any mount helper process + # updates it after this we will skip modification for sake of safety. We will come to it + # in the next iteration when it's safer. + # + exec {fd}<$MOUNTMAPv4NONTLS + flock -e $fd + mtime_mountmapv4nontls=$(stat -c%Y $MOUNTMAPv4NONTLS) + IFS=$'\n' lines=$(cat $MOUNTMAPv4NONTLS) + flock -u $fd + exec {fd}<&- + + #Can get rid of the below if every 30 seconds is fine. 
+ do_ip_change_detection=false + + if [ $epoch_now -ge $next_ip_change_detection_epoch ]; then + do_ip_change_detection=true + next_ip_change_detection_epoch=$(expr $(date +%s) + $IP_CHANGE_DETECTION_FREQUENCY) + fi + + for line in $lines; do + if [ -z "$line" ]; then + continue + fi + # + # MOUNTMAPv4nontls line is of the form: + # account.file.core.windows.net [] + # compare publicIP and crc32 + # + + IFS=" " read l_host l_ip l_nfsip l_crc32 <<< "$line" + + accname=$(echo $l_host | awk -F'.file' '{print $1}') #can change to l_ip as the fix has been made. whatever is safer. + mountpoint=$(findmnt -r -n | grep 'nfs4' | grep -F -- "$accname" | head -n1 | awk '{print $1}') + + # Since we added it to the MOUNTMAPv4 file, it cannot be invalid. + if ! is_private_ip "$l_ip"; then + wecho "[FATAL] local ip ($l_ip) is invalid!" + l_mtime=$(ensure_mountmap_not_exist "$line" "$MOUNTMAPv4NONTLS") + [ $? -eq 0 ] && mtime_mountmapv4nontls=$l_mtime + continue + fi + + # Since we added it to the MOUNTMAPv4NONTLS file, it cannot be invalid. + if ! is_valid_ipv4_address "$l_nfsip"; then + wecho "[FATAL] File endpoint ip ($l_nfsip) is invalid!" + l_mtime=$(ensure_mountmap_not_exist "$line" "$MOUNTMAPv4NONTLS") + [ $? -eq 0 ] && mtime_mountmapv4nontls=$l_mtime + continue + fi + + # + # Do unmount GC only if MOUNTMAPv4nontls file is not modified in the last + # MOUNTMAP_INACTIVITY_SECS seconds. We don't want to incorrectly delete an + # entry while some aznfs mount is ongoing. + + #BUGBUG 300 seconds seems like a long time. Double check what you want to do here. + # + do_unmount_gc_nfsv4=true + # if [ "$AZNFS_SKIP_UNMOUNT_CLEANUP" == "0" ]; then + # if [ $epoch_now -ge $(expr $mtime_mountmapv4nontls + $MOUNTMAP_INACTIVITY_SECS) ]; then + # do_unmount_gc_nfsv4=true + # fi + # fi + + # + # Delete entry from MOUNTMAPv4NONTLS if there are no mounted shares on that host. + # As long as we have at least one mount using the MOUNTMAPv4NONTLS entry, we leave + # it around. 
Can just search for account name as we don't support multi-mount migration scenarios on AZNFS for now. + # + findmnt=$(findmnt -m | grep "${accname}") + + if [ -z "$findmnt" ] ; then + if $do_unmount_gc_nfsv4; then + eecho "No mounted shares for host $l_host, deleting from ${MOUNTMAPv4NONTLS} [$line]." + + # Delete IFF mountmap is not changed since we read it above. + l_mtime=$(ensure_mountmap_not_exist "$line" "$MOUNTMAPv4NONTLS" "$mtime_mountmapv4nontls") + + # + # Update ifmatch time in case of successful updation of MOUNTMAPv4NONTLS, + # so that we can distinguish between MOUNTMAPv4NONTLS mtime changing because + # of our action or some mount helper changing it. In the former case + # it's safe to update the MOUNTMAPv4NONTLS, so update mtime_mountmapv4nontls to the + # mtime after this update. + # + [ $? -eq 0 ] && mtime_mountmapv4nontls=$l_mtime + continue + fi + else + # + # Verify that iptable entry should be present for corresponding + # MOUNTMAPv4NONTLS entry if the share is not unmounted. + # + # Note: This is extra protection in case user flushes the iptable + # entries or removes it by mistake. This should not be + # required normally. + # + # We also reconcile conntrack entries stuck in some bad states which + # may hamper communication, f.e., in older kernels there's a bug due to + # which conntrack entry may get stuck in SYN_SENT state if client + # reuse the source port and keep retransmitting SYNs before the entry + # can timeout. + # + reconcile_conntrack "$l_ip" "$l_nfsip" + verify_iptable_entry "$l_ip" "$l_nfsip" + fi + + #read into the line here to do the ip change detection. this is per Line. + l_fslocation_line=$(cat "${mountpoint}/${l_crc32}") + read_status=$? + + #rearrange this, we are not using the status really + if [ $read_status -ne 0 ]; then + eecho "Failed to read from $l_crc32_file" + return 1 + fi + + # Ingest l_prt and l_fslocationIP from fslocation line. 
The format of the line is prt;fslocationIP + IFS=";" read l_prt l_fslocationIP <<< "$l_fslocation_line" + + # should we umount or do something else like log since the server may not be at that build yet? + if [ -z "$l_host" -o -z "$l_ip" -o -z "$l_nfsip" -o -z "$l_crc32" ]; then + wecho "[FATAL] Deleting invalid line in $MOUNTMAPv4NONTLS: [$line]!" + l_mtime=$(ensure_mountmap_not_exist "$line" "$MOUNTMAPv4NONTLS") + [ $? -eq 0 ] && mtime_mountmapv4nontls=$l_mtime + continue + fi + + # + # We probably don't need this as we are not making any DNS calls but instead reading from fslocation + # BUGBUG for reviewer and Dan - look to remove this from v4 + if ! $do_ip_change_detection; then + continue + fi + + # + # Check if the PRT is in the migration stage and if the IP Reported from Virtual File is updated yet. + # + if [ -n "$l_fslocationIP" -a "$l_fslocationIP" != "$l_nfsip" -a $l_prt != 0 ]; then + eecho "IP for $l_host changed [$l_nfsip -> $l_fslocationIP]." + + # This will update DNAT rule as well. + if ! update_mountmap_entry "$line" "$l_host $l_ip $l_fslocationIP $l_crc32" "$MOUNTMAPv4NONTLS"; then + eecho "Will reattempt the operation in next iteration." + else + #pings the new endpoint to get a TCP RST + mountpoint2048=$(echo "$findmnt" 2>/dev/null | egrep -m1 " nfs ${l_ip}:.*\" | awk '{print $4}') + mountpoint2048=$(echo -e "$mountpoint2048") + if [ -n "$mountpoint2048" ]; then + ping_new_endpoint "$mountpoint2048" & + fi + fi + fi + done + + + +} + # Load common aznfs helpers. AZNFS_VERSION=4 . /opt/microsoft/aznfs/common.sh @@ -389,7 +691,12 @@ if ! chattr -f +i $MOUNTMAPv4; then wecho "chattr does not work for ${MOUNTMAPv4}!" fi +if ! chattr -f +i $MOUNTMAPv4NONTLS; then + wecho "chattr does not work for ${MOUNTMAPv4NONTLS}!"
+fi + while :; do sleep $MONITOR_INTERVAL_SECS process_nfsv4_mounts + process_nfsv4_nontlsmounts done diff --git a/src/nfsv3mountscript.sh b/src/nfsv3mountscript.sh index 9d526ae7..4837f138 100644 --- a/src/nfsv3mountscript.sh +++ b/src/nfsv3mountscript.sh @@ -548,367 +548,6 @@ check_account_count() return 0 } -# -# To maintain consistency in case of regional account and in general to avoid creating -# multiple DNAT entries corrosponding to one LOCAL_IP, first check for resolved IP in mountmap. -# This will help keep mountmap and DNAT entries in sync with each other. -# If the current resolved IP is different from the one stored in mountmap then it means that the IP has changed -# since the mountmap entry was created (could be due to migration or more likely due to RAs roundrobin DNS). -# In any case this will be properly handled by aznfswatchdog next time it checks for IP change for this fqdn. -# -resolve_ipv4_with_preference_to_mountmapv3() -{ - local fqdn=$1 - - exec {fd}<$MOUNTMAPv3 - flock -e $fd - - local mountmap_entry=$(grep -m1 "^${fqdn} " $MOUNTMAPv3) - - flock -u $fd - exec {fd}<&- - - IFS=" " read _ local_ip old_nfs_ip <<< "$mountmap_entry" - if [ -n "$old_nfs_ip" ]; then - echo "$old_nfs_ip" - return 2 - fi - - # - # Resolve FQDN to IPv4 using DNS if not found in the mountmap. - # - resolve_ipv4 "$fqdn" "true" -} - -# -# Is the given address one of the host addresses? -# -is_host_ip() -{ - # - # Do not make this local as status gathering does not work well when - # collecting command o/p to local variables. - # - route=$(ip -4 route get fibmatch $1 2>/dev/null) - if [ $? -ne 0 ]; then - return 1 - fi - - if ! echo "$route" | grep -q "scope host"; then - return 1 - fi - - return 0 -} - -# -# Is the given address one of the addresses directly reachable from the host? -# -is_link_ip() -{ - # - # Do not make this local as status gathering does not work well when - # collecting command o/p to local variables. 
- # - route=$(ip -4 route get fibmatch $1 2>/dev/null) - if [ $? -ne 0 ]; then - return 1 - fi - - if ! echo "$route" | grep -q "scope link"; then - return 1 - fi - - return 0 -} - -# -# Check if a given IPv4 address is responding to ICMP pings. -# Uses a 3 secs timeout to bail out in time if address is not responding. -# -is_pinging() -{ - # - # Unless env var AZNFS_PING_LOCAL_IP_BEFORE_USE is set, pretend IP address - # is available. - # - if [ "$AZNFS_PING_LOCAL_IP_BEFORE_USE" != "1" ]; then - return 1 - fi - - local ip=$1 - - # 3 secs timeout should be good. - ping -4 -W3 -c1 $ip > /dev/null 2>&1 -} - -# -# Returns number of octets in an IPv4 prefix. -# If IP prefix is not valid or is not a private IP address prefix, it returns 0. -# -# f.e. For 10 it will return 1, for 10.10 it will return 2, for 10.10.10 it will -# return 3 and for 10.10.10.10, it will return 4. -# -octets_in_ipv4_prefix() -{ - local ip=$1 - local octet="[0-9]{1,3}" - local octetdot="${octet}\." - - if ! is_valid_ipv4_prefix $ip; then - echo 0 - return - fi - - # - # Check if the IP prefix belongs to the private IP range (10.0.0.0/8, - # 172.16.0.0/12, or 192.168.0.0/16), i.e., will the user provided prefix - # result in a private IP address. - # - [[ $ip =~ ^10(\.${octet})*$ ]] || - [[ $ip =~ ^172\.(1[6-9]|2[0-9]|3[0-1])(\.${octet})*$ ]] || - [[ $ip =~ ^192\.168(\.${octet})*$ ]] - - if [ $? -ne 0 ]; then - echo 0 - return - fi - - # 4 octets. - [[ $ip =~ ^(${octetdot}){3}${octet}$ ]] && echo 4 && return; - - # 3 octets - [[ $ip =~ ^(${octetdot}){2}${octet}$ ]] && echo 3 && return; - - # 2 octets. - [[ $ip =~ ^(${octetdot}){1}${octet}$ ]] && echo 2 && return; - - # 1 octet. 
- [[ $ip =~ ^${octet}$ ]] && echo 1 && return; - - echo 0 -} - -search_free_local_ip_with_prefix() -{ - initial_ip_prefix=$1 - num_octets=$(octets_in_ipv4_prefix $ip_prefix) - - if [ $num_octets -ne 2 -a $num_octets -ne 3 ]; then - eecho "Invalid IPv4 prefix: ${ip_prefix}" - eecho "Valid prefix must have either 2 or 3 octets and must be a valid private IPv4 address prefix." - eecho "Examples of valid private IPv4 prefixes are 10.10, 10.10.10, 192.168, 192.168.10 etc." - return 1 - fi - - local local_ip="" - local optimize_get_free_local_ip=false - local used_local_ips_with_same_prefix=$(cat $MOUNTMAPv3 | awk '{print $2}' | grep "^${initial_ip_prefix}\." | sort -t . -k 1,1n -k 2,2n -k 3,3n -k 4,4n) - local iptable_entries=$(iptables-save -t nat) - - _3rdoctet=100 - ip_prefix=$initial_ip_prefix - - # - # Optimize the process to get free local IP by starting the loop to choose - # 3rd and 4th octet from the number which was used last and still exist in - # MOUNTMAPv3 instead of starting it from 100. - # - if [ $OPTIMIZE_GET_FREE_LOCAL_IP == true -a -n "$used_local_ips_with_same_prefix" ]; then - - last_used_ip=$(echo "$used_local_ips_with_same_prefix" | tail -n1) - - IFS="." read _ _ last_used_3rd_octet last_used_4th_octet <<< "$last_used_ip" - - if [ $num_octets -eq 2 ]; then - if [ "$last_used_3rd_octet" == "254" -a "$last_used_4th_octet" == "254" ]; then - return 1 - fi - - _3rdoctet=$last_used_3rd_octet - optimize_get_free_local_ip=true - else - if [ "$last_used_4th_octet" == "254" ]; then - return 1 - fi - - optimize_get_free_local_ip=true - fi - fi - - while true; do - if [ $num_octets -eq 2 ]; then - for ((; _3rdoctet<255; _3rdoctet++)); do - ip_prefix="${initial_ip_prefix}.$_3rdoctet" - - if is_link_ip $ip_prefix; then - vecho "Skipping link network ${ip_prefix}!" 
- continue - fi - - break - done - - if [ $_3rdoctet -eq 255 ]; then - # - # If the IP prefix had 2 octets and we exhausted all possible - # values of the 3rd and 4th octet, then we have failed the - # search for free local IP within the given prefix. - # - return 1 - fi - fi - - if $optimize_get_free_local_ip; then - _4thoctet=$(expr ${last_used_4th_octet} + 1) - optimize_get_free_local_ip=false - else - _4thoctet=100 - fi - - for ((; _4thoctet<255; _4thoctet++)); do - local_ip="${ip_prefix}.$_4thoctet" - - is_ip_used_by_aznfs=$(echo "$used_local_ips_with_same_prefix" | grep "^${local_ip}$") - if [ -n "$is_ip_used_by_aznfs" ]; then - vecho "$local_ip is in use by aznfs!" - continue - fi - - if is_host_ip $local_ip; then - vecho "Skipping host address ${local_ip}!" - continue - fi - - if is_link_ip $local_ip; then - vecho "Skipping link network ${local_ip}!" - continue - fi - - if [ "$nfs_ip" == "$local_ip" ]; then - vecho "Skipping private endpoint IP ${nfs_ip}!" - continue - fi - - is_present_in_iptables=$(echo "$iptable_entries" | grep -c "\<${local_ip}\>") - if [ $is_present_in_iptables -ne 0 ]; then - vecho "$local_ip is already present in iptables!" - continue - fi - - # - # Try pinging the address to be sure it is not in use in the - # client network. - # - # Note: If the address exists but not responding to ICMP ping then - # we will incorrectly treat it as non-exixtent. - # - if is_pinging $local_ip; then - vecho "Skipping $local_ip as it appears to be in use on the network!" - continue - fi - - vecho "Using local IP ($local_ip) for aznfs." - break - done - - if [ $_4thoctet -eq 255 ]; then - if [ $num_octets -eq 2 ]; then - let _3rdoctet++ - continue - else - # - # If the IP prefix had 3 octets and we exhausted all possible - # values of the 4th octet, then we have failed the search for - # free local IP within the given prefix. - # - return 1 - fi - fi - - # - # Happy path! - # - # Add this entry to MOUNTMAPv3 while we have the MOUNTMAPv3 lock. 
- # This is to avoid assigning same local ip to parallel mount requests - # for different endpoints. - # ensure_mountmapv3_exist will also create a matching iptable DNAT rule. - # - LOCAL_IP=$local_ip - ensure_mountmapv3_exist_nolock "$nfs_host $LOCAL_IP $nfs_ip" - - return 0 - done - - # We will never reach here. -} - -# -# Get a local IP that is free to use. Set global variable LOCAL_IP if found. -# -get_free_local_ip() -{ - for ip_prefix in $IP_PREFIXES; do - vecho "Trying IP prefix ${ip_prefix}." - if search_free_local_ip_with_prefix "$ip_prefix"; then - return 0 - fi - done - - # - # If the above loop is not able to find a free local IP using optimized way, - # do a linear search to get the free local IP. - # - vecho "Falling back to linear search for free ip!" - OPTIMIZE_GET_FREE_LOCAL_IP=false - for ip_prefix in $IP_PREFIXES; do - vecho "Trying IP prefix ${ip_prefix}." - if search_free_local_ip_with_prefix "$ip_prefix"; then - return 0 - fi - done - - # If we come here we did not get a free address to use. - return 1 -} - -# -# For the given AZNFS endpoint FQDN return a local IP that should proxy it. -# If there is at least one mount to the same FQDN it MUST return the local IP -# used for that, else assign a new free local IP. -# -get_local_ip_for_fqdn() -{ - local fqdn=$1 - local mountmap_entry=$(grep -m1 "^${fqdn} " $MOUNTMAPv3) - # One local ip per fqdn, so return existing one if already present. - IFS=" " read _ local_ip _ <<< "$mountmap_entry" - - if [ -n "$local_ip" ]; then - LOCAL_IP=$local_ip - - # - # Ask aznfswatchdog to stay away while we are using this proxy IP. - # This is similar to holding a timed lease, we can safely use this - # proxy IP w/o worrying about aznfswatchdog deleting it for 5 minutes. - # - touch_mountmapv3 - - # - # This is not really needed since iptable entry must also be present, - # but it's always better to ensure MOUNTMAPv3 and iptable entries are - # in sync. 
- # - ensure_iptable_entry $local_ip $nfs_ip - return 0 - fi - - # - # First mount of an account on this client. - # - get_free_local_ip -} - # # Perform a pseudo mount to generate a gatepass for the actual mount call. # This request is expected to fail with "server access denied" if server-side changes are enabled, @@ -1144,7 +783,7 @@ if [ ! -f "$MOUNTMAPv3" ]; then fi # Resolve the IP address for the NFS host -nfs_ip=$(resolve_ipv4_with_preference_to_mountmapv3 "$nfs_host") +nfs_ip=$(resolve_ipv4_with_preference_to_mountmap "$nfs_host" $MOUNTMAPv3) status=$? if [ $status -ne 0 ]; then if [ $status -eq 2 ]; then @@ -1169,7 +808,7 @@ flock -e $fd # cause "accounts mounted on one client" to exceed the limit. # if check_account_count; then - get_local_ip_for_fqdn $nfs_host + get_local_ip_for_fqdn $nfs_host $MOUNTMAPv3 ret=$? account_limit_exceeded=0 else diff --git a/src/nfsv4mountscript.sh b/src/nfsv4mountscript.sh index af09b75b..183a3223 100644 --- a/src/nfsv4mountscript.sh +++ b/src/nfsv4mountscript.sh @@ -49,6 +49,12 @@ cleanup() { exec {fd2}<&- } +# +# Local IP that is free to use. +# +LOCAL_IP="" + + get_next_available_port() { for ((port=NFSV4_PORT_RANGE_START; port<=NFSV4_PORT_RANGE_END; port++)) @@ -303,7 +309,7 @@ add_stunnel_configuration() eecho "Failed to 'client = yes' to $stunnel_conf_file!" return 1 fi - + echo "accept = $LOCALHOST:$available_port" >> $stunnel_conf_file if [ $? -ne 0 ]; then chattr -f +i $stunnel_conf_file @@ -340,6 +346,7 @@ add_stunnel_configuration() chattr -f +i $stunnel_conf_file } + check_if_notls_mount_exists() { # Check if the mount to the same endpoint exists that is using clear text (without TLS). @@ -421,7 +428,9 @@ tls_nfsv4_files_share_mount() exec {fd2}<$MOUNTMAPv4 flock -e $fd2 - vecho "nfs_dir=[$nfs_dir], nfs_host_ip=[$storageaccount_ip], mount_point=[$mount_point], options=[$OPTIONS], mount_options=[$MOUNT_OPTIONS]." 
+ crc32=$(get_aznfs_ctrl_filename "$nfs_host") + + vecho "nfs_dir=[$nfs_dir], nfs_host_ip=[$storageaccount_ip], mount_point=[$mount_point], crc_32=[$crc32], options=[$OPTIONS], mount_options=[$MOUNT_OPTIONS]." IFS=/ read _ storageaccount container extra <<< "$nfs_dir" @@ -450,7 +459,7 @@ tls_nfsv4_files_share_mount() # # If this echo fails then MOUNTMAPv4 could be truncated. # - echo "$out" > $MOUNTMAPv4 + echo "$out" > $MOUNTMAPv4 ret=$? out= if [ $ret -ne 0 ]; then @@ -590,8 +599,11 @@ tls_nfsv4_files_share_mount() # Waiting: mountmap entry is added but mount command is not executed yet. Watchdog can ignore this entry. # Mounted: mount command is executed successfully. If the mount is unmounted, watchdog can remove this entry. # Failed: mount command failed. Watchdog can remove this entry. + # NOTE: Since multi-mount scenario is not officially supported, if multiple accounts are mounted to the same server IP during a migration, + # if some accounts are not migrated, this may cause an issue for that leftover account. - local mountmap_entry="$storageaccount_ip;$stunnel_conf_file;$stunnel_log_file;$stunnel_pid_file;$checksumHash;waiting;$mount_timeout" + local mountmap_entry="$storageaccount_ip;$stunnel_conf_file;$stunnel_log_file;$stunnel_pid_file;$checksumHash;waiting;$mount_timeout;$crc32" + vecho "Adding mountmap entry: [$mountmap_entry] to $MOUNTMAPv4" chattr -f -i $MOUNTMAPv4 echo "$mountmap_entry" >> $MOUNTMAPv4 if [ $? -ne 0 ]; then @@ -855,8 +867,30 @@ if [[ "$MOUNT_OPTIONS" == *"notls"* ]]; then MOUNT_OPTIONS=${MOUNT_OPTIONS//,notls/} fi - # Do the actual mount. - mount_output=$(mount -t nfs -o "$MOUNT_OPTIONS" "${nfs_host}:${nfs_dir}" "$mount_point" 2>&1) + # Resolve the IP address for the NFS host + nfs_ip=$(resolve_ipv4_with_preference_to_mountmap "$nfs_host" $MOUNTMAPv4NONTLS) + vecho "Resolved IP address for FQDN from mountmap [$nfs_host -> $nfs_ip]" + status=$? 
+ if [ $status -ne 0 ]; then + if [ $status -eq 2 ]; then + vecho "Resolved IP address for FQDN from mountmap [$nfs_host -> $nfs_ip]" + else + echo "$nfs_ip" + eecho "Cannot resolve IP address for ${nfs_host}!" + eecho "Mount failed!" + exit 1 + fi + fi + + # get local ip for fqdn, this here maps to target get_local_ip is from the IPTable + # this also creates the mountmap entries etc if they do not exist, so it has to be after the TLS and non-TLS mountmap checking + get_local_ip_for_fqdn $nfs_host $MOUNTMAPv4NONTLS + ret=$? + + vecho "nfs_host=[$nfs_host], nfs_ip=[$nfs_ip], nfs_dir=[$nfs_dir], mount_point=[$mount_point], options=[$OPTIONS], mount_options=[$MOUNT_OPTIONS], local_ip=[$LOCAL_IP]." + + # Do the actual non tls mount. + mount_output=$(mount -t nfs -o "$MOUNT_OPTIONS" "${LOCAL_IP}:${nfs_dir}" "$mount_point" 2>&1) mount_status=$? flock -u $fd2