From d20d1cc88cd60d8105dd0414a8857e600c5a4c99 Mon Sep 17 00:00:00 2001 From: Daniel Wong Date: Thu, 12 Dec 2024 10:54:55 -0800 Subject: [PATCH] initial commit with only refactoring of fname quick push refactor add common methods to common.sh and add dns resolver for v4 mounts add log add common functions switch to mounting with local ip for nfsv4 non tls add fslocation file add process nontls mount add logging for watchdog disable ipcheck comment out looped for nontlsprocess uncomment comment first process so only one process is running remove process nontls mounts for now as it's causing crash enable nontls mount watchdog add correct findmnt fix findmnt update watchdog with grep update to 30 seconds and turn on ip detection update watchdog update watchdog hardcode detection true add vecho add local_host echo change to eecho log fslocation entry wrap in paren update ipchange detection remove the extra ping_newendpoint and see if it can just update the ip in iptable and keep localip the same added logs quick fix add new scenarios for watchdog add IP change detection for EIT quick sync failed since 2 add placeholder Resolve merge conflict issues Remove nontls Remove comment update for ip change detection update with file name changes and updating mountmap entries update with checksumhash updatre echo with quotes updates with logging and other ip updates add -i+i add so it only updates for newip chatattr to update files and filepaths update for logs debug with logging add logging for the outmountmap and see why mountmap isn't being updated update for fork add gflag add restart stunnel add newchecksumcalculation and to the mountmap update checksumhash update with newfilepath for checksum log add an extra -e update for checking netstat add wrapper for if fslocation line is empty and only do the ip detection then if there is even an fslocation to begin with add quote get rid of foorloop add space update conf file name and run stunnel off of l_conf remove space around= 
reset the others add kill pid before stunnel_status_new update with logs for pid only set new path locally, let stunnel create the new file path reset l_pid after move log l_conf before stunnel start log stunnel status properly remove old log file only track nontls mounts bump poll to 30 update to track only tls update test package small update test vlaue for testaccount1 add test update for test to see if we need to fix test update add to aznfsnontls file add crc in the function if aznfs version is 4 add to line and pick update with the latest ip information for change detection test with string add mountpoint updated mountpoint -r -n fix findmntM update echo and findmnt command remove colon add lock to file so it waits until after the mount command is completed change interval to 60 rearrange locks update with old line and new line for updatemountmapv3 entry add some more logging adding more logging add fix update egrep update with adding back check without grabbing crc32 change to mountpoint for ip detection for now update to use account name fo rnow update just for demo update to remove check for file for demo Remove warning for ipv4 Reverted the last 3 commits due to issues update to ensure cleanup for ip tables update to remove the devnull output override update with echos add braces update spacing cleanup spit out correct val update the grab add aznfsctrltxt correct name mountmapv4nontls duplicate entry prevention remove semicolon add logs trytt updated code Refactor test with crc32 for tls update with crc32 in common and call for reads and writes into the mountscript update with logging add mountpoint add mountpoint updated with setting the correct conf files and starting stunnel after a migration detection has been changed small fix with pid and conf paths etc fake for testing fake 2 update with fix for the fake unfake update for refactor Rearrange so that ip deteciton reading of crc is after we check for empty mount remove m_time logic for now refactor 
ensure_mountmapv3_not_exist remove locking afterwards to ensure it doesn't mess it up with stat updated with ensure Refactor with func names as well refactor mtime Refactor a bit and also skip some cehcks and reuse the local ip for second mount of the same account Refactor the entire stack for mounting in mountmapv3 and mountmapv4nontls to common.sh and update callers. refactor mountmap methods remove some comments that are sent to aznfs.log test with do unmount gc setting revert that update Remove settings --- lib/common.sh | 687 ++++++++++++++++++++++++++++++++++++---- src/aznfswatchdog | 43 +-- src/aznfswatchdogv4 | 337 +++++++++++++++++++- src/nfsv3mountscript.sh | 365 +-------------------- src/nfsv4mountscript.sh | 46 ++- 5 files changed, 1009 insertions(+), 469 deletions(-) diff --git a/lib/common.sh b/lib/common.sh index 25ae99323..d5e90d683 100644 --- a/lib/common.sh +++ b/lib/common.sh @@ -22,6 +22,11 @@ MOUNTMAPv3="${OPTDIRDATA}/mountmap" # MOUNTMAPv4="${OPTDIRDATA}/mountmapv4" +# +# This stores the map of local IP and share name and external file endpoint IP. +# +MOUNTMAPv4NONTLS="${OPTDIRDATA}/mountmapv4nontls" + RED="\e[2;31m" GREEN="\e[2;32m" YELLOW="\e[2;33m" @@ -52,7 +57,77 @@ RELEASE_NUMBER_FOR_AKS=x.y.z # How often does the watchdog look for unmounts and/or IP address changes for # Blob and nfs file endpoints. # -MONITOR_INTERVAL_SECS=5 +MONITOR_INTERVAL_SECS=30 + +# +# ------------------ Common definitions from nfsv3mountscript.sh -------------------- +# + +# +# Default order in which we try the network prefixes for a free local IP to use. +# This can be overridden using AZNFS_IP_PREFIXES environment variable. +# +DEFAULT_AZNFS_IP_PREFIXES="10.161 192.168 172.16" +IP_PREFIXES="${AZNFS_IP_PREFIXES:-${DEFAULT_AZNFS_IP_PREFIXES}}" + +# Aznfs port, defaults to 2048. +AZNFS_PORT="${AZNFS_PORT:-2048}" + +# Default to checking azure nconnect support. 
+AZNFS_CHECK_AZURE_NCONNECT="${AZNFS_CHECK_AZURE_NCONNECT:-1}" + +# Default to fixing mount options passed in to help the user. +AZNFS_FIX_MOUNT_OPTIONS="${AZNFS_FIX_MOUNT_OPTIONS:-1}" + +# Default to fixing dirty bytes config to help the user. +AZNFS_FIX_DIRTY_BYTES_CONFIG="${AZNFS_FIX_DIRTY_BYTES_CONFIG:-1}" + +# Read ahead size in KB defaults to 16384. +AZNFS_READ_AHEAD_KB="${AZNFS_READ_AHEAD_KB:-16384}" + +# +# Use noresvport mount option to allow using non-reserve ports by client. +# This allows much higher number of local ports to be used by NFS client and +# hence may alleviate some issues due to running out of very small resv port range. +# Blob NFS doesn't require clients to use reserve ports so we can use non-reserve +# port with Blob NFS but Linux NFS client doesn't reuse source port while reconnecting +# if noresvport option is used. This does not work well with the DRC cache. +# +AZNFS_USE_NORESVPORT="${AZNFS_USE_NORESVPORT:-0}" + +# Set the fingerprint GUID as an environment variable with a default value. +AZNFS_FINGERPRINT="${AZNFS_FINGERPRINT:-80a18d5c-9553-4c64-88dd-d7553c6b3beb}" + +# +# Default to maximum number of mount retries in case the server side returns a failure. +# Retries make the mount process more robust. Currently, we don't distinguish between +# access denied failure due to intermittent issues or genuine mount failures. We retry anyways. +# +AZNFS_MAX_MOUNT_RETRIES="${AZNFS_MAX_MOUNT_RETRIES:-3}" + +# +# Maximum number of accounts that can be mounted from the same tenant/cluster. +# Any number of containers on these many accounts can be mounted. +# With ~350 reserved ports and 16 connections per mount (with nconnect=16) leaving +# some room, 20 is a reasonable limit. +# +MAX_ACCOUNTS_MOUNTABLE_FROM_SINGLE_TENANT=20 + +# +# Local IP that is free to use. +# +LOCAL_IP="" + +# +# Choose the local IP based on last used IP in MOUNTMAPv3 if this flag is enabled. 
+# +OPTIMIZE_GET_FREE_LOCAL_IP=true + +# +# True if user has asked to use port 2047 using 'port=2047' mount option. +# This signifies server side nconnect which has some special needs. +# +USING_PORT_2047=false _log() { @@ -339,26 +414,31 @@ is_private_ip() } # -# Mount helper must call this function to grab a timed lease on all MOUNTMAPv3 +# Mount helper must call this function to grab a timed lease on all mountmap # entries. It should do this if it decides to use any of the entries. Once -# this is called aznfswatchdog is guaranteed to not delete any MOUNTMAPv3 till -# the next 5 minutes. +# this is called aznfswatchdog is guaranteed to not delete any mountmap entries +# till the next 5 minutes. # -# Must be called with MOUNTMAPv3 lock held. +# Must be called with mountmap lock held. # -touch_mountmapv3() +# Parameters: +# $1 - mountmap_file: The mountmap file to touch +# +touch_mountmap() { - chattr -f -i $MOUNTMAPv3 - touch $MOUNTMAPv3 + local mountmap_file=$1 + + chattr -f -i $mountmap_file + touch $mountmap_file if [ $? -ne 0 ]; then - chattr -f +i $MOUNTMAPv3 - eecho "Failed to touch ${MOUNTMAPv3}!" + chattr -f +i $mountmap_file + eecho "Failed to touch ${mountmap_file}!" return 1 fi - chattr -f +i $MOUNTMAPv3 + chattr -f +i $mountmap_file } -# Create mount map file +# Create mount map file MOUNTMAPv3 or MOUNTMAPv4 create_mountmap_file() { local mountmap_filename=MOUNTMAPv$AZNFS_VERSION @@ -372,113 +452,201 @@ create_mountmap_file() fi } +# Create mountmap file MOUNTMAPv4NONTLS +create_mountmap_file_nontlsv4() +{ + local mountmap_filename_nontls=MOUNTMAPv4NONTLS + if [ ! -f ${!mountmap_filename_nontls} ]; then + touch ${!mountmap_filename_nontls} + if [ $? -ne 0 ]; then + eecho "[FATAL] Not able to create '${!mountmap_filename_nontls}'!" + return 1 + fi + chattr -f +i ${!mountmap_filename_nontls} + fi + + local fslocation_filename=VIRTUALFSLOCATION + + if [ ! -f ${!fslocation_filename} ]; then + touch ${!fslocation_filename} + if [ $? 
-ne 0 ]; then + eecho "[FATAL] Not able to create '${!fslocation_filename}'!" + return 1 + fi + chattr -f +i ${!fslocation_filename} + fi +} + +# +# Calculate control file name based on storage account hostname. +# Returns: AZNFSCtrl.txtNNN, where NNN is a decimal hash derived from the account name. +# +get_aznfs_ctrl_filename() +{ + local hostname="$1" + local account_name=${hostname%%.*} + local key="abc" + local keylen=${#key} + local acc=0 + + for (( i=0; i<${#account_name}; ++i )); do + # Extract single character (byte) from each string + local ch="${account_name:i:1}" + local kch="${key:i%keylen:1}" + + # Get decimal byte values + local b=$(printf '%d' "'$ch") + local kb=$(printf '%d' "'$kch") + + local xored=$(( (b ^ kb) & 0xFF )) + local shift_amt=$(( (i % 4) * 8 )) + acc=$(( acc ^ (xored << shift_amt ) )) + done + + acc=$(( acc & 0xFFFFFFFF )) + echo "AZNFSCtrl.txt${acc}" +} + # # MOUNTMAPv3 is accessed by both mount.aznfs and aznfswatchdog service. Update it # only after taking exclusive lock. # -# Add entry to MOUNTMAPv3 in case of a new mount or IP change for blob FQDN. +# Add entry to mountmap in case of a new mount or IP change for blob/file FQDN. # -# This also ensures that the corresponding DNAT rule is created so that MOUNTMAPv3 +# This also ensures that the corresponding DNAT rule is created so that mountmap # entry and DNAT rule are always in sync. +# For Nfsv4 Non TLS, also add CRC32 based on the account name +# +# Parameters: +# $1 - entry: The entry to add (format: "host ip nfsip") +# $2 - mountmap_file: The mountmap file to update # -ensure_mountmapv3_exist_nolock() +ensure_mountmap_exist_nolock() { - IFS=" " read l_host l_ip l_nfsip <<< "$1" + local entry=$1 + local mountmap_file=$2 + + IFS=" " read l_host l_ip l_nfsip <<< "$entry" if ! ensure_iptable_entry $l_ip $l_nfsip; then - eecho "[$1] failed to add to ${MOUNTMAPv3}!" + eecho "[$entry] failed to add to ${mountmap_file}!" 
return 1 fi + line="$entry" + if [ "$AZNFS_VERSION" = "4" ]; then + #calculate crc32 and then append to the line + local ctrl_filename=$(get_aznfs_ctrl_filename "$l_host") + vecho "Control file for $l_host: $ctrl_filename" + line+=" $ctrl_filename" + fi - egrep -q "^${1}$" $MOUNTMAPv3 + egrep -q "^${line}$" $mountmap_file if [ $? -ne 0 ]; then - chattr -f -i $MOUNTMAPv3 - echo "$1" >> $MOUNTMAPv3 + chattr -f -i $mountmap_file + echo "$line" >> $mountmap_file if [ $? -ne 0 ]; then - chattr -f +i $MOUNTMAPv3 - eecho "[$1] failed to add to ${MOUNTMAPv3}!" - # Could not add MOUNTMAPv3 entry, delete the DNAT rule added above. + chattr -f +i $mountmap_file + eecho "[$entry] failed to add to ${mountmap_file}!" + # Could not add mountmap entry, delete the DNAT rule added above. ensure_iptable_entry_not_exist $l_ip $l_nfsip return 1 fi - chattr -f +i $MOUNTMAPv3 + chattr -f +i $mountmap_file else - pecho "[$1] already exists in ${MOUNTMAPv3}." + pecho "[$entry] already exists in ${mountmap_file}." fi } -ensure_mountmapv3_exist() +# +# Add entry to mountmap with exclusive lock. +# +# Parameters: +# $1 - entry: The entry to add (format: "host ip nfsip") +# $2 - mountmap_file: The mountmap file to update +# +ensure_mountmap_exist() { + local entry=$1 + local mountmap_file=$2 + ( flock -e 999 - ensure_mountmapv3_exist_nolock "$1" + ensure_mountmap_exist_nolock "$entry" "$mountmap_file" return $? - ) 999<$MOUNTMAPv3 + ) 999<$mountmap_file } # -# Delete entry from MOUNTMAPv3 and also the corresponding iptable rule. +# Delete entry from mountmap and also the corresponding iptable rule. 
+# +# Parameters: +# $1 - line: The entry to delete +# $2 - mountmap_file: The mountmap file to update +# $3 - ifmatch (optional): Only delete if mountmap mtime matches this value # -ensure_mountmapv3_not_exist() +ensure_mountmap_not_exist() { + local line=$1 + local mountmap_file=$2 + local ifmatch="$3" + ( flock -e 999 # - # If user wants to delete the entry only if MOUNTMAPv3 has not changed since + # If user wants to delete the entry only if mountmap has not changed since # he looked up, honour that. # - local ifmatch="$2" if [ -n "$ifmatch" ]; then - local mtime=$(stat -c%Y $MOUNTMAPv3) + local mtime=$(stat -c%Y $mountmap_file) if [ "$mtime" != "$ifmatch" ]; then - eecho "[$1] Refusing to remove from ${MOUNTMAPv3} as $mtime != $ifmatch!" + eecho "[$line] Refusing to remove from ${mountmap_file} as $mtime != $ifmatch!" return 1 fi fi - # Delete iptable rule corresponding to the outgoing MOUNTMAPv3 entry. - IFS=" " read l_host l_ip l_nfsip <<< "$1" + # Delete iptable rule corresponding to the outgoing mountmap entry. + IFS=" " read l_host l_ip l_nfsip l_aznfsctrlfile <<< "$line" if [ -n "$l_host" -a -n "$l_ip" -a -n "$l_nfsip" ]; then if ! ensure_iptable_entry_not_exist $l_ip $l_nfsip; then - eecho "[$1] Refusing to remove from ${MOUNTMAPv3} as iptable entry could not be deleted!" + eecho "[$line] Refusing to remove from ${mountmap_file} as iptable entry could not be deleted!" return 1 fi fi - chattr -f -i $MOUNTMAPv3 + chattr -f -i $mountmap_file # # We do this thing instead of inplace update by sed as that has a - # very bad side-effect of creating a new MOUNTMAPv3 file. This breaks + # very bad side-effect of creating a new mountmap file. This breaks # any locking that we dependent on the old file. # - out=$(sed "\%^${1}$%d" $MOUNTMAPv3) + out=$(sed "\%^${line}$%d" $mountmap_file) ret=$? if [ $ret -eq 0 ]; then # - # If this echo fails then MOUNTMAPv3 could be truncated. In that case we need + # If this echo fails then mountmap could be truncated. 
In that case we need # to reconcile it from the mount info and iptable info. That needs to be done # out-of-band. # - echo "$out" > $MOUNTMAPv3 + echo "$out" > $mountmap_file ret=$? out= if [ $ret -ne 0 ]; then - eecho "*** [FATAL] MOUNTMAPv3 may be in inconsistent state, contact Microsoft support ***" + eecho "*** [FATAL] $mountmap_file may be in inconsistent state, contact Microsoft support ***" fi fi if [ $ret -ne 0 ]; then - chattr -f +i $MOUNTMAPv3 - eecho "[$1] failed to remove from ${MOUNTMAPv3}!" + chattr -f +i $mountmap_file + eecho "[$line] failed to remove from ${mountmap_file}!" # Reinstate DNAT rule deleted above. ensure_iptable_entry $l_ip $l_nfsip return 1 fi - chattr -f +i $MOUNTMAPv3 + chattr -f +i $mountmap_file # Return the mtime after our mods. - echo $(stat -c%Y $MOUNTMAPv3) - ) 999<$MOUNTMAPv3 + echo $(stat -c%Y $mountmap_file) + ) 999<$mountmap_file } # @@ -487,66 +655,451 @@ ensure_mountmapv3_not_exist() # corresponding to old entry and adding the DNAT rule corresponding to the new # entry. # -update_mountmapv3_entry() +# Parameters: +# $1 - old: The old entry to replace +# $2 - new: The new entry to replace with +# $3 - mountmap_file: The mountmap file to update +# +update_mountmap_entry() { local old=$1 local new=$2 + local mountmap_file=$3 - vecho "Updating mountmapv3 entry [$old -> $new]" + vecho "Updating mountmap entry [$old -> $new] in $mountmap_file" ( flock -e 999 - IFS=" " read l_host l_ip l_nfsip_old <<< "$old" + IFS=" " read l_host l_ip l_nfsip_old l_aznfsctrlfile <<< "$old" if [ -n "$l_host" -a -n "$l_ip" -a -n "$l_nfsip_old" ]; then if ! ensure_iptable_entry_not_exist $l_ip $l_nfsip_old; then - eecho "[$old] Refusing to remove from ${MOUNTMAPv3} as old iptable entry could not be deleted!" + eecho "[$old] Refusing to remove from ${mountmap_file} as old iptable entry could not be deleted!" 
return 1 fi fi - IFS=" " read l_host l_ip l_nfsip_new <<< "$new" + IFS=" " read l_host l_ip l_nfsip_new l_aznfsctrlfile <<< "$new" if [ -n "$l_host" -a -n "$l_ip" -a -n "$l_nfsip_new" ]; then if ! ensure_iptable_entry $l_ip $l_nfsip_new; then - eecho "[$new] Refusing to remove from ${MOUNTMAPv3} as new iptable entry could not be added!" + eecho "[$new] Refusing to remove from ${mountmap_file} as new iptable entry could not be added!" # Roll back. ensure_iptable_entry $l_ip $l_nfsip_old return 1 fi fi - chattr -f -i $MOUNTMAPv3 + chattr -f -i $mountmap_file # # We do this thing instead of inplace update by sed as that has a - # very bad side-effect of creating a new MOUNTMAPv3 file. This breaks + # very bad side-effect of creating a new mountmap file. This breaks # any locking that we dependent on the old file. # - out=$(sed "s%^${old}$%${new}%g" $MOUNTMAPv3) + out=$(sed "s%^${old}$%${new}%g" $mountmap_file) ret=$? if [ $ret -eq 0 ]; then # - # If this echo fails then MOUNTMAPv3 could be truncated. In that case we need + # If this echo fails then mountmap could be truncated. In that case we need # to reconcile it from the mount info and iptable info. That needs to be done # out-of-band. # - echo "$out" > $MOUNTMAPv3 + echo "$out" > $mountmap_file ret=$? out= if [ $ret -ne 0 ]; then - eecho "*** [FATAL] MOUNTMAPv3 may be in inconsistent state, contact Microsoft support ***" + eecho "*** [FATAL] $mountmap_file may be in inconsistent state, contact Microsoft support ***" fi fi if [ $ret -ne 0 ]; then - chattr -f +i $MOUNTMAPv3 - eecho "[$old -> $new] failed to update ${MOUNTMAPv3}!" + chattr -f +i $mountmap_file + eecho "[$old -> $new] failed to update ${mountmap_file}!" # Roll back. ensure_iptable_entry_not_exist $l_ip $l_nfsip_new ensure_iptable_entry $l_ip $l_nfsip_old return 1 fi - chattr -f +i $MOUNTMAPv3 - ) 999<$MOUNTMAPv3 + chattr -f +i $mountmap_file + ) 999<$mountmap_file +} + +# +# Is the given address one of the host addresses? 
+# +is_host_ip() +{ + # + # Do not make this local as status gathering does not work well when + # collecting command o/p to local variables. + # + route=$(ip -4 route get fibmatch $1 2>/dev/null) + if [ $? -ne 0 ]; then + return 1 + fi + + if ! echo "$route" | grep -q "scope host"; then + return 1 + fi + + return 0 +} + +# +# Is the given address one of the addresses directly reachable from the host? +# +is_link_ip() +{ + # + # Do not make this local as status gathering does not work well when + # collecting command o/p to local variables. + # + route=$(ip -4 route get fibmatch $1 2>/dev/null) + if [ $? -ne 0 ]; then + return 1 + fi + + if ! echo "$route" | grep -q "scope link"; then + return 1 + fi + + return 0 +} + +# +# Check if a given IPv4 address is responding to ICMP pings. +# Uses a 3 secs timeout to bail out in time if address is not responding. +# +is_pinging() +{ + # + # Unless env var AZNFS_PING_LOCAL_IP_BEFORE_USE is set, pretend IP address + # is available. + # + if [ "$AZNFS_PING_LOCAL_IP_BEFORE_USE" != "1" ]; then + return 1 + fi + + local ip=$1 + + # 3 secs timeout should be good. + ping -4 -W3 -c1 $ip > /dev/null 2>&1 +} + +# +# Returns number of octets in an IPv4 prefix. +# If IP prefix is not valid or is not a private IP address prefix, it returns 0. +# +# f.e. For 10 it will return 1, for 10.10 it will return 2, for 10.10.10 it will +# return 3 and for 10.10.10.10, it will return 4. +# +octets_in_ipv4_prefix() +{ + local ip=$1 + local octet="[0-9]{1,3}" + local octetdot="${octet}\." + + if ! is_valid_ipv4_prefix $ip; then + echo 0 + return + fi + + # + # Check if the IP prefix belongs to the private IP range (10.0.0.0/8, + # 172.16.0.0/12, or 192.168.0.0/16), i.e., will the user provided prefix + # result in a private IP address. + # + [[ $ip =~ ^10(\.${octet})*$ ]] || + [[ $ip =~ ^172\.(1[6-9]|2[0-9]|3[0-1])(\.${octet})*$ ]] || + [[ $ip =~ ^192\.168(\.${octet})*$ ]] + + if [ $? -ne 0 ]; then + echo 0 + return + fi + + # 4 octets. 
+ [[ $ip =~ ^(${octetdot}){3}${octet}$ ]] && echo 4 && return; + + # 3 octets + [[ $ip =~ ^(${octetdot}){2}${octet}$ ]] && echo 3 && return; + + # 2 octets. + [[ $ip =~ ^(${octetdot}){1}${octet}$ ]] && echo 2 && return; + + # 1 octet. + [[ $ip =~ ^${octet}$ ]] && echo 1 && return; + + echo 0 +} + +# +# Search for a free local IP with the given prefix. +# Takes the IP prefix and the mountmap file to use. +# +search_free_local_ip_with_prefix() +{ + local initial_ip_prefix=$1 + local mountmap_file=$2 + local num_octets=$(octets_in_ipv4_prefix $ip_prefix) + + if [ $num_octets -ne 2 -a $num_octets -ne 3 ]; then + eecho "Invalid IPv4 prefix: ${ip_prefix}" + eecho "Valid prefix must have either 2 or 3 octets and must be a valid private IPv4 address prefix." + eecho "Examples of valid private IPv4 prefixes are 10.10, 10.10.10, 192.168, 192.168.10 etc." + return 1 + fi + + local local_ip="" + local optimize_get_free_local_ip=false + local used_local_ips_with_same_prefix=$(cat $mountmap_file | awk '{print $2}' | grep "^${initial_ip_prefix}\." | sort -t . -k 1,1n -k 2,2n -k 3,3n -k 4,4n) + local iptable_entries=$(iptables-save -t nat) + + _3rdoctet=100 + ip_prefix=$initial_ip_prefix + + # + # Optimize the process to get free local IP by starting the loop to choose + # 3rd and 4th octet from the number which was used last and still exist in + # mountmap instead of starting it from 100. + # + if [ $OPTIMIZE_GET_FREE_LOCAL_IP == true -a -n "$used_local_ips_with_same_prefix" ]; then + + last_used_ip=$(echo "$used_local_ips_with_same_prefix" | tail -n1) + + IFS="." 
read _ _ last_used_3rd_octet last_used_4th_octet <<< "$last_used_ip" + + if [ $num_octets -eq 2 ]; then + if [ "$last_used_3rd_octet" == "254" -a "$last_used_4th_octet" == "254" ]; then + return 1 + fi + + _3rdoctet=$last_used_3rd_octet + optimize_get_free_local_ip=true + else + if [ "$last_used_4th_octet" == "254" ]; then + return 1 + fi + + optimize_get_free_local_ip=true + fi + fi + + while true; do + if [ $num_octets -eq 2 ]; then + for ((; _3rdoctet<255; _3rdoctet++)); do + ip_prefix="${initial_ip_prefix}.$_3rdoctet" + + if is_link_ip $ip_prefix; then + vecho "Skipping link network ${ip_prefix}!" + continue + fi + + break + done + + if [ $_3rdoctet -eq 255 ]; then + # + # If the IP prefix had 2 octets and we exhausted all possible + # values of the 3rd and 4th octet, then we have failed the + # search for free local IP within the given prefix. + # + return 1 + fi + fi + + if $optimize_get_free_local_ip; then + _4thoctet=$(expr ${last_used_4th_octet} + 1) + optimize_get_free_local_ip=false + else + _4thoctet=100 + fi + + for ((; _4thoctet<255; _4thoctet++)); do + local_ip="${ip_prefix}.$_4thoctet" + + is_ip_used_by_aznfs=$(echo "$used_local_ips_with_same_prefix" | grep "^${local_ip}$") + if [ -n "$is_ip_used_by_aznfs" ]; then + vecho "$local_ip is in use by aznfs!" + continue + fi + + if is_host_ip $local_ip; then + vecho "Skipping host address ${local_ip}!" + continue + fi + + if is_link_ip $local_ip; then + vecho "Skipping link network ${local_ip}!" + continue + fi + + if [ "$nfs_ip" == "$local_ip" ]; then + vecho "Skipping private endpoint IP ${nfs_ip}!" + continue + fi + + is_present_in_iptables=$(echo "$iptable_entries" | grep -c "\<${local_ip}\>") + if [ $is_present_in_iptables -ne 0 ]; then + vecho "$local_ip is already present in iptables!" + continue + fi + + # + # Try pinging the address to be sure it is not in use in the + # client network. 
+ # + # Note: If the address exists but is not responding to ICMP ping then + # we will incorrectly treat it as non-existent. + # + if is_pinging $local_ip; then + vecho "Skipping $local_ip as it appears to be in use on the network!" + continue + fi + + vecho "Using local IP ($local_ip) for aznfs." + break + done + + if [ $_4thoctet -eq 255 ]; then + if [ $num_octets -eq 2 ]; then + let _3rdoctet++ + continue + else + # + # If the IP prefix had 3 octets and we exhausted all possible + # values of the 4th octet, then we have failed the search for + # free local IP within the given prefix. + # + return 1 + fi + fi + + # + # Happy path! + # + # Add this entry to mountmap while we have the mountmap lock. + # This is to avoid assigning same local ip to parallel mount requests + # for different endpoints. + # ensure_mountmap_exist_nolock will also create a matching iptable DNAT rule. + # + LOCAL_IP=$local_ip + ensure_mountmap_exist_nolock "$nfs_host $LOCAL_IP $nfs_ip" "$mountmap_file" + + return 0 + done + + # We will never reach here. +} + +# +# Get a local IP that is free to use. Set global variable LOCAL_IP if found. +# Takes the mountmap file to use for tracking used IPs. +# +get_free_local_ip() +{ + local mountmap_file=$1 + + for ip_prefix in $IP_PREFIXES; do + vecho "Trying IP prefix ${ip_prefix}." + if search_free_local_ip_with_prefix "$ip_prefix" "$mountmap_file"; then + return 0 + fi + done + + # + # If the above loop is not able to find a free local IP using optimized way, + # do a linear search to get the free local IP. + # + vecho "Falling back to linear search for free ip!" + OPTIMIZE_GET_FREE_LOCAL_IP=false + for ip_prefix in $IP_PREFIXES; do + vecho "Trying IP prefix ${ip_prefix}." + if search_free_local_ip_with_prefix "$ip_prefix" "$mountmap_file"; then + return 0 + fi + done + + # If we come here we did not get a free address to use. 
+ return 1 +} + +# +# To maintain consistency in case of regional account and in general to avoid creating +# multiple DNAT entries corresponding to one LOCAL_IP, first check for resolved IP in mountmap. +# This will help keep mountmap and DNAT entries in sync with each other. +# If the current resolved IP is different from the one stored in mountmap then it means that the IP has changed +# since the mountmap entry was created (could be due to migration or more likely due to RAs roundrobin DNS). +# In any case this will be properly handled by aznfswatchdog next time it checks for IP change for this fqdn. +# +# Parameters: +# $1 - fqdn: The FQDN to resolve +# $2 - mountmap_file: The mountmap file to check for existing IP +# +resolve_ipv4_with_preference_to_mountmap() +{ + local fqdn=$1 + local mountmap_file=$2 + + exec {fd}<$mountmap_file + flock -e $fd + + local mountmap_entry=$(grep -m1 "^${fqdn} " $mountmap_file) + + flock -u $fd + exec {fd}<&- + + IFS=" " read _ local_ip old_nfs_ip <<< "$mountmap_entry" + if [ -n "$old_nfs_ip" ]; then + echo "$old_nfs_ip" + return 2 + fi + + # + # Resolve FQDN to IPv4 using DNS if not found in the mountmap. + # + resolve_ipv4 "$fqdn" "true" +} + +# +# For the given AZNFS endpoint FQDN return a local IP that should proxy it. +# If there is at least one mount to the same FQDN it MUST return the local IP +# used for that, else assign a new free local IP. +# +# Parameters: +# $1 - fqdn: The FQDN to get a local IP for +# $2 - mountmap_file: The mountmap file to use +# +get_local_ip_for_fqdn() +{ + local fqdn=$1 + local mountmap_file=$2 + local mountmap_entry=$(grep -m1 "^${fqdn} " $mountmap_file) + # One local ip per fqdn, so return existing one if already present. + IFS=" " read _ local_ip _ <<< "$mountmap_entry" + + if [ -n "$local_ip" ]; then + LOCAL_IP=$local_ip + + # + # Ask aznfswatchdog to stay away while we are using this proxy IP. 
+ # This is similar to holding a timed lease, we can safely use this + # proxy IP w/o worrying about aznfswatchdog deleting it for 5 minutes. + # + touch_mountmap $mountmap_file + + # + # This is not really needed since iptable entry must also be present, + # but it's always better to ensure mountmap and iptable entries are + # in sync. + # + ensure_iptable_entry $local_ip $nfs_ip + return 0 + fi + + # + # First mount of an account on this client. + # + get_free_local_ip $mountmap_file } # @@ -744,6 +1297,12 @@ if ! create_mountmap_file; then exit 1 fi +# Create mount map file nontls v4 + +if ! create_mountmap_file_nontlsv4; then + exit 1 +fi + ulimitfd=$(ulimit -n 2>/dev/null) if [ -n "$ulimitfd" -a $ulimitfd -lt 131072 ]; then ulimit -n 131072 diff --git a/src/aznfswatchdog b/src/aznfswatchdog index 77cf1dc19..eef76e77b 100644 --- a/src/aznfswatchdog +++ b/src/aznfswatchdog @@ -26,13 +26,6 @@ AZNFS_SKIP_UNMOUNT_CLEANUP="${AZNFS_SKIP_UNMOUNT_CLEANUP:-0}" # TIMEWAIT timeout to be used for conntrack entries. AZNFS_TIMEWAIT_TIMEOUT="${AZNFS_TIMEWAIT_TIMEOUT:-65}" -# -# Environment variable to control skipping of IP change detection for regional accounts. -# By default we want to skip IP change detection for regional accounts, but if we want to -# disable skipping we can set this environment variable to 0. -# -AZNFS_SKIP_IP_CHANGE_DETECTION_FOR_REGIONAL_ACCOUNTS="${AZNFS_SKIP_IP_CHANGE_DETECTION_FOR_REGIONAL_ACCOUNTS:-1}" - next_ip_change_detection_epoch=0 # @@ -61,6 +54,14 @@ if [ ! -s $RANDBYTES ]; then fi chattr -f +i $RANDBYTES +# +# Environment variable to control skipping of IP change detection for regional accounts. +# By default we want to skip IP change detection for regional accounts, but if we want to +# disable skipping we can set this environment variable to 0. 
+# +AZNFS_SKIP_IP_CHANGE_DETECTION_FOR_REGIONAL_ACCOUNTS="${AZNFS_SKIP_IP_CHANGE_DETECTION_FOR_REGIONAL_ACCOUNTS:-1}" + +# Associative arrays for regional account tracking declare -A ip_change_count declare -A last_ip_change_time declare -A regional_accounts @@ -342,7 +343,7 @@ process_nfsv3_mounts() # exec {fd}<$MOUNTMAPv3 flock -e $fd - mtime_mountmap=$(stat -c%Y $MOUNTMAPv3) + mtime_mountmapv3=$(stat -c%Y $MOUNTMAPv3) IFS=$'\n' lines=$(cat $MOUNTMAPv3) flock -u $fd exec {fd}<&- @@ -360,7 +361,7 @@ process_nfsv3_mounts() # do_unmount_gc=false if [ "$AZNFS_SKIP_UNMOUNT_CLEANUP" == "0" ]; then - if [ $epoch_now -ge $(expr $mtime_mountmap + $MOUNTMAP_INACTIVITY_SECS) ]; then + if [ $epoch_now -ge $(expr $mtime_mountmapv3 + $MOUNTMAP_INACTIVITY_SECS) ]; then do_unmount_gc=true fi fi @@ -398,24 +399,24 @@ process_nfsv3_mounts() if [ -z "$l_host" -o -z "$l_ip" -o -z "$l_nfsip" ]; then wecho "[FATAL] Deleting invalid line in $MOUNTMAPv3: [$line]!" - l_mtime=$(ensure_mountmapv3_not_exist "$line") - [ $? -eq 0 ] && mtime_mountmap=$l_mtime + l_mtime=$(ensure_mountmap_not_exist "$line" "$MOUNTMAPv3") + [ $? -eq 0 ] && mtime_mountmapv3=$l_mtime continue fi # Since we added it to the MOUNTMAPv3 file, it cannot be invalid. if ! is_private_ip "$l_ip"; then wecho "[FATAL] local ip ($l_ip) is invalid!" - l_mtime=$(ensure_mountmapv3_not_exist "$line") - [ $? -eq 0 ] && mtime_mountmap=$l_mtime + l_mtime=$(ensure_mountmap_not_exist "$line" "$MOUNTMAPv3") + [ $? -eq 0 ] && mtime_mountmapv3=$l_mtime continue fi # Since we added it to the MOUNTMAPv3 file, it cannot be invalid. if ! is_valid_ipv4_address "$l_nfsip"; then wecho "[FATAL] Blob endpoint ip ($l_nfsip) is invalid!" - l_mtime=$(ensure_mountmapv3_not_exist "$line") - [ $? -eq 0 ] && mtime_mountmap=$l_mtime + l_mtime=$(ensure_mountmap_not_exist "$line" "$MOUNTMAPv3") + [ $? 
-eq 0 ] && mtime_mountmapv3=$l_mtime continue fi @@ -429,16 +430,16 @@ process_nfsv3_mounts() pecho "No mounted shares for host $l_host, deleting from ${MOUNTMAPv3} [$line]." # Delete IFF mountmap is not changed since we read it above. - l_mtime=$(ensure_mountmapv3_not_exist "$line" "$mtime_mountmap") + l_mtime=$(ensure_mountmap_not_exist "$line" "$MOUNTMAPv3" "$mtime_mountmapv3") # # Update ifmatch time in case of successful updation of MOUNTMAPv3, # so that we can distinguish between MOUNTMAPv3 mtime changing because # of our action or some mount helper changing it. In the former case - # it's safe to update the MOUNTMAPv3, so update mtime_mountmap to the + # it's safe to update the MOUNTMAPv3, so update mtime_mountmapv3 to the # mtime after this update. # - [ $? -eq 0 ] && mtime_mountmap=$l_mtime + [ $? -eq 0 ] && mtime_mountmapv3=$l_mtime continue fi else @@ -493,8 +494,8 @@ process_nfsv3_mounts() if [ "$new_ip" == "NXDOMAIN" ]; then pecho "Account corresponding to $l_host seems to have been deleted, deleting from ${MOUNTMAPv3} [$line]!" - l_mtime=$(ensure_mountmapv3_not_exist "$line" "$mtime_mountmap") - [ $? -eq 0 ] && mtime_mountmap=$l_mtime + l_mtime=$(ensure_mountmap_not_exist "$line" "$MOUNTMAPv3" "$mtime_mountmapv3") + [ $? -eq 0 ] && mtime_mountmapv3=$l_mtime else eecho "resolve_ipv4($l_host) failed: $new_ip" fi @@ -510,7 +511,7 @@ process_nfsv3_mounts() check_for_regional_account "$l_host" # This will update DNAT rule as well. - if ! update_mountmapv3_entry "$line" "$l_host $l_ip $new_ip"; then + if ! update_mountmap_entry "$line" "$l_host $l_ip $new_ip" "$MOUNTMAPv3"; then eecho "Will reattempt the operation in next iteration." 
#
# Kill stunnel for migration and remove its pid and log files.
#
# Unlike a full mount cleanup, the stunnel config file is left in place so
# that the caller can rewrite it for the new endpoint and restart stunnel.
#
# Arguments:
#   $1 - path of the stunnel config file (only read, to log the accept port)
#   $2 - path of the stunnel log file (deleted)
#   $3 - path of the stunnel pid file (read, then deleted)
#
# Failures are logged through eecho and are not fatal; every step is still
# attempted.
#
# NOTE(review): the patch context shows another cleanup_stunnel_files()
# defined immediately above this one. In bash the later definition replaces
# the earlier one for every caller (including cleanup_mount) - confirm this
# is intended, or give this variant its own name and update its caller.
#
cleanup_stunnel_files()
{
    local l_conf=$1
    local l_log=$2
    local l_pid=$3
    local accept_port
    # Keep pid local so it does not leak into (or clobber) a global.
    local pid

    # Kill stunnel process first.
    pid=$(cat -- "$l_pid")
    accept_port=$(grep accept "$l_conf" | cut -d ':' -f 2)
    pecho "killing stunnel process with pid: $pid on port: $accept_port"

    # An empty pid (missing/empty pid file) must not reach kill as a bare
    # "kill -9", so treat it as a failed kill.
    if [ -z "$pid" ] || ! kill -9 "$pid"; then
        eecho "Unable to kill stunnel process $pid!"
    fi

    # Cleanup stunnel files.
    if ! rm -- "$l_log"; then
        eecho "[FATAL] Unable to delete stunnel log file $l_log!"
    fi

    if ! rm -- "$l_pid"; then
        eecho "[FATAL] Unable to delete stunnel pid file $l_pid!"
    fi
}
-eq 0 ]; then - mtime_mountmap=$l_mtime + mtime_mountmapv4=$l_mtime else # If the mountmap file is changed since we read it, we need to read it again - don't modify anything. eecho "Failed to delete entry from ${MOUNTMAPv4}! Entry: [$line]" @@ -247,7 +283,7 @@ process_nfsv4_mounts() # exec {fd}<$MOUNTMAPv4 flock -e $fd - mtime_mountmap=$(stat -c%Y $MOUNTMAPv4) + mtime_mountmapv4=$(stat -c%Y $MOUNTMAPv4) IFS=$'\n' lines=$(cat $MOUNTMAPv4) flock -u $fd exec {fd}<&- @@ -258,7 +294,7 @@ process_nfsv4_mounts() # for sure that it's not in use by any mount and can be removed. # findmnt=$(findmnt | grep 'nfs4\|$LOCALHOST' 2>&1) - + #findmnt=$(findmnt --raw --noheading -o MAJ:MIN,FSTYPE,SOURCE,TARGET,OPTIONS -t nfs 2>&1) # # For no matching mounts also, findmnt exits with a failure return, so check # for both exit status and non-empty error o/p. @@ -281,19 +317,20 @@ process_nfsv4_mounts() if [ -z "$line" ]; then continue fi - # # MOUNTMAPv4 line is of the form: - # ;;;;;; + # ;;;;;;; # - IFS=";" read l_ip l_conf l_log l_pid l_checksumhash l_status l_timeout <<< "$line" + + IFS=";" read l_ip l_conf l_log l_pid l_checksumhash l_status l_timeout l_crc_32 <<< "$line" + IP_changed=false #we don't need to worry abour regional accounts with l_crc32 if [ -z "$l_ip" -o -z "$l_conf" -o -z "$l_pid" ]; then wecho "[FATAL] Deleting invalid line in $MOUNTMAPv4: [$line]!" exec {fd2}<$MOUNTMAPv4 flock -e $fd2 l_mtime=$(ensure_mountmapv4_not_exist "$line") - [ $? -eq 0 ] && mtime_mountmap=$l_mtime + [ $? -eq 0 ] && mtime_mountmapv4=$l_mtime flock -u $fd2 exec {fd2}<&- continue @@ -311,26 +348,116 @@ process_nfsv4_mounts() fi accept_port=$(cat $l_conf | grep accept | cut -d ':' -f 2) - # vecho "accept_port: $accept_port" + #vecho "accept_port: $accept_port" # # Delete entry from MOUNTMAPv4 if there are no mounted shares on that host. # As long as we have at least one mount using the MOUNTMAPv4 entry, we leave # it around. # - if ! 
echo "$findmnt" | grep "$accept_port" >/dev/null; then + findmnt_output=$(echo "$findmnt" | grep "$accept_port") + vecho "findmnt_output: $findmnt_output" + if [ -z "$findmnt_output" ]; then vecho "findmnt shows no mount for accept_port=$accept_port (line=[$line])" if is_nfs_server_active_for_target "$LOCALHOST" "$accept_port"; then pecho "NFS server entry still active for $LOCALHOST:$accept_port; skipping cleanup for [$line]." continue fi - pecho "No mounted shares for host $l_ip with accept port $accept_port, deleting from ${MOUNTMAPv4} [$line]." cleanup_mount $l_conf $l_log $l_pid $line continue else - # vecho "Mounted shares found for host $l_ip with accept port $accept_port." + vecho "Mounted shares found for host $l_ip with accept port $accept_port." + # + # Fetch FSLocationIP from the virtual file and check if it's updated for migration scenario. + # The virtual file is at the mountpoint and named with the crc32 value in the mountmap entry. + # + local mountpoint=$(echo "$findmnt_output" | awk '{print $1}' | sed 's/^[└├│─]*//g') + l_fslocation_line=$(cat "${mountpoint}/${l_crc_32}") + #read now from the mountpoint + read_status=$? + IFS=";" read l_prt l_fslocationIP <<< "$l_fslocation_line" + #process the output + if [ $read_status -ne 0 ]; then + eecho "Failed to read from $l_crc32_file" + return 1 + fi + + # + # If FSLocationIP is not empty, and updated from the IP stored in MountMap and PRT is in migration stage, update. + # Mountmap, config, log, and checksumhash need to be updtaed accordingly. Then restart stunnel with new config file. + # If it's empty, it will skip this logic as ip will not be changed. - before server upgrade + # + if [ -n "$l_fslocationIP" -a "$l_fslocationIP" != "$l_ip" -a "$l_prt" -ne 0 ]; then + vecho " IP for $l_host changed [$l_ip -> $l_fslocationIP]." + + new_ip=$l_fslocationIP + IP_changed=true + + #Update stunnel config file with new IP + out=$(sed "s/${l_ip}/${new_ip}/g" $l_conf) + + ret=$? 
+ + #kill old stunnel process and remove old files + cleanup_stunnel_files $l_conf $l_log $l_pid + + #update the contents of the l_conf file with the new ip + #changing log file location, and pid. Then recalculate checksum hash + if [ $ret -eq 0 ]; then + chattr -f -i $l_conf + # + # If this echo fails then MOUNTMAPv4 could be truncated. + # + echo "$out" > $l_conf + ret=$? + out= + if [ $ret -ne 0 ]; then + eecho "*** [FATAL] MOUNTMAPv4 may be in inconsistent state, contact Microsoft support ***" + fi + chattr -f +i $l_conf + fi + + #update filepath for config file and move to new path + new_conf_file_path=$(echo "$l_conf" | sed "s/${l_ip}/${new_ip}/g") + chattr -f -i $l_conf + mv -vf $l_conf $new_conf_file_path + chattr -f +i $new_conf_file_path + + #update mountmapentry to update checksumhash and ip files + newchecksumhash=`cksum $new_conf_file_path | awk '{print $1}'` + chattr -f -i $MOUNTMAPv4 + outmountmap=$(sed -e "s/$l_checksumhash/$newchecksumhash/g" -e "s/$l_ip/$new_ip/g" $MOUNTMAPv4) + + #Update MOUNTMAPv4 with new IP and new Checksum hash + ret=$? + if [ $ret -eq 0 ]; then + # + # If this echo fails then MOUNTMAPv4 could be truncated. + # + echo "$outmountmap" > $MOUNTMAPv4 + ret=$? + out= + if [ $ret -ne 0 ]; then + eecho "*** [FATAL] MOUNTMAPv4 may be in inconsistent state, contact Microsoft support ***" + fi + fi + + chattr -f +i $MOUNTMAPv4 + + # Update all variable paths to reflect new IP + l_conf=$new_conf_file_path + l_log=$(echo "$l_log" | sed "s/${l_ip}/${new_ip}/g") + l_pid=$(echo "$l_pid" | sed "s/${l_ip}/${new_ip}/g") + l_ip=$new_ip + l_checksumhash=$newchecksumhash + + # Start the new stunnel process + stunnel_status_new=$(stunnel $l_conf 2>&1) + + # done + fi # Check if checksumHash for stunnel.conf file has changed. # Customers should not modify stunnel.conf files created by aznfs mount helper. @@ -339,7 +466,8 @@ process_nfsv4_mounts() eecho "Failed to get the checksum hash of file: '${l_conf}'!" 
#
# Go over all lines in MOUNTMAPv4NONTLS and check them for two things:
# 1. Is that entry still in use by at least one aznfs mount, if not remove
#    the entry.
# 2. Has the file endpoint address changed from what is stored? The new
#    address is read from the control file "<mountpoint>/<crc_32>", whose
#    content is "<prt>;<fslocation_ip>". If it changed, update the entry
#    (which also updates the DNAT rule).
#
# Sample line in MOUNTMAPv4NONTLS:
#   account.file.preprod.core.windows.net 10.100.100.100 52.230.170.200 AZNFSCtrl.txt12345
# where the format is
#   <fileendpoint_fqdn> <proxy_ip> <fileendpoint_ip> <crc_32>
#
# Globals read:    MOUNTMAPv4NONTLS, IP_CHANGE_DETECTION_FREQUENCY
# Globals written: epoch_now, mtime_mountmapv4nontls,
#                  next_ip_change_detection_epoch
# Returns: 1 if MOUNTMAPv4NONTLS cannot be opened. A problem with a single
#          entry is logged and only that entry is skipped.
#
process_nfsv4_nontlsmounts()
{
    local fd lines line
    local l_host l_ip l_nfsip l_crc32 l_mtime
    local accname mtab_mounts raw_mounts mounted mountpoint
    local l_crc32_file l_fslocation_line l_prt l_fslocationIP
    local do_ip_change_detection=false
    #
    # NOTE(review): unmount GC is unconditional here. The v3 watchdog only
    # GCs when the mountmap has been idle for MOUNTMAP_INACTIVITY_SECS (and
    # AZNFS_SKIP_UNMOUNT_CLEANUP is 0) so that an in-progress mount is not
    # raced; confirm whether that protection is wanted for v4 as well.
    #
    local do_unmount_gc_nfsv4=true
    # Split $lines on newlines only; scoped to this function and its callees.
    local IFS=$'\n'

    epoch_now=$(date +%s)

    #
    # We store the mtime of MOUNTMAPv4NONTLS while inside the lock so that if
    # any mount helper process updates it after this we will skip modification
    # for sake of safety. We will come to it in the next iteration.
    #
    if ! exec {fd}<"$MOUNTMAPv4NONTLS"; then
        eecho "Failed to open ${MOUNTMAPv4NONTLS}!"
        return 1
    fi
    flock -e $fd
    mtime_mountmapv4nontls=$(stat -c%Y "$MOUNTMAPv4NONTLS")
    lines=$(cat "$MOUNTMAPv4NONTLS")
    flock -u $fd
    exec {fd}<&-

    if [ "$epoch_now" -ge "${next_ip_change_detection_epoch:-0}" ]; then
        do_ip_change_detection=true
        next_ip_change_detection_epoch=$(( $(date +%s) + IP_CHANGE_DETECTION_FREQUENCY ))
    fi

    # Take both mount snapshots once; they do not depend on the entry.
    mtab_mounts=$(findmnt -m)
    raw_mounts=$(findmnt -r -n | grep 'nfs4')

    for line in $lines; do
        if [ -z "$line" ]; then
            continue
        fi

        IFS=" " read -r l_host l_ip l_nfsip l_crc32 <<< "$line"

        # Validate before any field is used (same order as the v3 watchdog).
        if [ -z "$l_host" ] || [ -z "$l_ip" ] || [ -z "$l_nfsip" ] || [ -z "$l_crc32" ]; then
            wecho "[FATAL] Deleting invalid line in $MOUNTMAPv4NONTLS: [$line]!"
            l_mtime=$(ensure_mountmap_not_exist "$line" "$MOUNTMAPv4NONTLS")
            [ $? -eq 0 ] && mtime_mountmapv4nontls=$l_mtime
            continue
        fi

        # Since we added it to the MOUNTMAPv4NONTLS file, it cannot be invalid.
        if ! is_private_ip "$l_ip"; then
            wecho "[FATAL] local ip ($l_ip) is invalid!"
            l_mtime=$(ensure_mountmap_not_exist "$line" "$MOUNTMAPv4NONTLS")
            [ $? -eq 0 ] && mtime_mountmapv4nontls=$l_mtime
            continue
        fi

        # Since we added it to the MOUNTMAPv4NONTLS file, it cannot be invalid.
        if ! is_valid_ipv4_address "$l_nfsip"; then
            wecho "[FATAL] File endpoint ip ($l_nfsip) is invalid!"
            l_mtime=$(ensure_mountmap_not_exist "$line" "$MOUNTMAPv4NONTLS")
            [ $? -eq 0 ] && mtime_mountmapv4nontls=$l_mtime
            continue
        fi

        #
        # Account name is everything before the literal ".file." label. Plain
        # suffix stripping is used because an awk field separator of '.file'
        # is a regex and would also split inside names such as "myprofile".
        #
        accname=${l_host%%.file.*}

        #
        # Delete entry from MOUNTMAPv4NONTLS if there are no mounted shares on
        # that host. We can just search for the account name as multi-mount
        # migration scenarios are not supported on AZNFS for now.
        #
        mounted=$(grep -F -- "$accname" <<< "$mtab_mounts")

        if [ -z "$mounted" ]; then
            if $do_unmount_gc_nfsv4; then
                pecho "No mounted shares for host $l_host, deleting from ${MOUNTMAPv4NONTLS} [$line]."

                # Delete IFF mountmap is not changed since we read it above.
                l_mtime=$(ensure_mountmap_not_exist "$line" "$MOUNTMAPv4NONTLS" "$mtime_mountmapv4nontls")

                #
                # On success remember the new mtime so that our own update is
                # not mistaken for a mount helper changing the file.
                #
                [ $? -eq 0 ] && mtime_mountmapv4nontls=$l_mtime
            fi
            continue
        fi

        #
        # Extra protection in case the user flushed the iptable entries, and
        # reconcile conntrack entries stuck in a bad state.
        #
        reconcile_conntrack "$l_ip" "$l_nfsip"
        verify_iptable_entry "$l_ip" "$l_nfsip"

        if ! $do_ip_change_detection; then
            continue
        fi

        # First nfs4 mount whose row mentions the account; column 1 is TARGET.
        mountpoint=$(grep -F -- "$accname" <<< "$raw_mounts" | head -n1 | awk '{print $1}')
        # findmnt -r escapes special characters as \xHH; decode them.
        mountpoint=$(printf '%b' "$mountpoint")
        if [ -z "$mountpoint" ]; then
            eecho "No nfs4 mountpoint found for $l_host, skipping IP change detection for [$line]."
            continue
        fi

        #
        # Read "<prt>;<fslocation_ip>" from the control file. The server may
        # not expose the file yet, so a failed read only skips this entry.
        #
        l_crc32_file="${mountpoint}/${l_crc32}"
        if ! l_fslocation_line=$(cat -- "$l_crc32_file"); then
            eecho "Failed to read from $l_crc32_file"
            continue
        fi

        IFS=";" read -r l_prt l_fslocationIP <<< "$l_fslocation_line"
        l_prt=${l_prt//[[:space:]]/}
        l_fslocationIP=${l_fslocationIP//[[:space:]]/}

        #
        # Only act when the PRT is in the migration stage (non-zero) and the
        # reported IP differs from the stored one.
        #
        if [ -z "$l_fslocationIP" ] || [ "$l_fslocationIP" == "$l_nfsip" ] ||
           [ -z "$l_prt" ] || [ "$l_prt" == "0" ]; then
            continue
        fi

        # Never push unvalidated file content into the mountmap / DNAT rule.
        if ! is_valid_ipv4_address "$l_fslocationIP"; then
            eecho "Ignoring invalid fslocation IP ($l_fslocationIP) reported for $l_host!"
            continue
        fi

        eecho "IP for $l_host changed [$l_nfsip -> $l_fslocationIP]."

        # This will update DNAT rule as well.
        if ! update_mountmap_entry "$line" "$l_host $l_ip $l_fslocationIP $l_crc32" "$MOUNTMAPv4NONTLS"; then
            eecho "Will reattempt the operation in next iteration."
        else
            # Pings the new endpoint to get a TCP RST.
            # NOTE(review): assumes ping_new_endpoint takes a mount point, as
            # in the v3 call site - confirm.
            ping_new_endpoint "$mountpoint" &
        fi
    done
}
+fi + while :; do sleep $MONITOR_INTERVAL_SECS process_nfsv4_mounts + process_nfsv4_nontlsmounts done diff --git a/src/nfsv3mountscript.sh b/src/nfsv3mountscript.sh index 9d526ae79..4837f1387 100644 --- a/src/nfsv3mountscript.sh +++ b/src/nfsv3mountscript.sh @@ -548,367 +548,6 @@ check_account_count() return 0 } -# -# To maintain consistency in case of regional account and in general to avoid creating -# multiple DNAT entries corrosponding to one LOCAL_IP, first check for resolved IP in mountmap. -# This will help keep mountmap and DNAT entries in sync with each other. -# If the current resolved IP is different from the one stored in mountmap then it means that the IP has changed -# since the mountmap entry was created (could be due to migration or more likely due to RAs roundrobin DNS). -# In any case this will be properly handled by aznfswatchdog next time it checks for IP change for this fqdn. -# -resolve_ipv4_with_preference_to_mountmapv3() -{ - local fqdn=$1 - - exec {fd}<$MOUNTMAPv3 - flock -e $fd - - local mountmap_entry=$(grep -m1 "^${fqdn} " $MOUNTMAPv3) - - flock -u $fd - exec {fd}<&- - - IFS=" " read _ local_ip old_nfs_ip <<< "$mountmap_entry" - if [ -n "$old_nfs_ip" ]; then - echo "$old_nfs_ip" - return 2 - fi - - # - # Resolve FQDN to IPv4 using DNS if not found in the mountmap. - # - resolve_ipv4 "$fqdn" "true" -} - -# -# Is the given address one of the host addresses? -# -is_host_ip() -{ - # - # Do not make this local as status gathering does not work well when - # collecting command o/p to local variables. - # - route=$(ip -4 route get fibmatch $1 2>/dev/null) - if [ $? -ne 0 ]; then - return 1 - fi - - if ! echo "$route" | grep -q "scope host"; then - return 1 - fi - - return 0 -} - -# -# Is the given address one of the addresses directly reachable from the host? -# -is_link_ip() -{ - # - # Do not make this local as status gathering does not work well when - # collecting command o/p to local variables. 
- # - route=$(ip -4 route get fibmatch $1 2>/dev/null) - if [ $? -ne 0 ]; then - return 1 - fi - - if ! echo "$route" | grep -q "scope link"; then - return 1 - fi - - return 0 -} - -# -# Check if a given IPv4 address is responding to ICMP pings. -# Uses a 3 secs timeout to bail out in time if address is not responding. -# -is_pinging() -{ - # - # Unless env var AZNFS_PING_LOCAL_IP_BEFORE_USE is set, pretend IP address - # is available. - # - if [ "$AZNFS_PING_LOCAL_IP_BEFORE_USE" != "1" ]; then - return 1 - fi - - local ip=$1 - - # 3 secs timeout should be good. - ping -4 -W3 -c1 $ip > /dev/null 2>&1 -} - -# -# Returns number of octets in an IPv4 prefix. -# If IP prefix is not valid or is not a private IP address prefix, it returns 0. -# -# f.e. For 10 it will return 1, for 10.10 it will return 2, for 10.10.10 it will -# return 3 and for 10.10.10.10, it will return 4. -# -octets_in_ipv4_prefix() -{ - local ip=$1 - local octet="[0-9]{1,3}" - local octetdot="${octet}\." - - if ! is_valid_ipv4_prefix $ip; then - echo 0 - return - fi - - # - # Check if the IP prefix belongs to the private IP range (10.0.0.0/8, - # 172.16.0.0/12, or 192.168.0.0/16), i.e., will the user provided prefix - # result in a private IP address. - # - [[ $ip =~ ^10(\.${octet})*$ ]] || - [[ $ip =~ ^172\.(1[6-9]|2[0-9]|3[0-1])(\.${octet})*$ ]] || - [[ $ip =~ ^192\.168(\.${octet})*$ ]] - - if [ $? -ne 0 ]; then - echo 0 - return - fi - - # 4 octets. - [[ $ip =~ ^(${octetdot}){3}${octet}$ ]] && echo 4 && return; - - # 3 octets - [[ $ip =~ ^(${octetdot}){2}${octet}$ ]] && echo 3 && return; - - # 2 octets. - [[ $ip =~ ^(${octetdot}){1}${octet}$ ]] && echo 2 && return; - - # 1 octet. 
- [[ $ip =~ ^${octet}$ ]] && echo 1 && return; - - echo 0 -} - -search_free_local_ip_with_prefix() -{ - initial_ip_prefix=$1 - num_octets=$(octets_in_ipv4_prefix $ip_prefix) - - if [ $num_octets -ne 2 -a $num_octets -ne 3 ]; then - eecho "Invalid IPv4 prefix: ${ip_prefix}" - eecho "Valid prefix must have either 2 or 3 octets and must be a valid private IPv4 address prefix." - eecho "Examples of valid private IPv4 prefixes are 10.10, 10.10.10, 192.168, 192.168.10 etc." - return 1 - fi - - local local_ip="" - local optimize_get_free_local_ip=false - local used_local_ips_with_same_prefix=$(cat $MOUNTMAPv3 | awk '{print $2}' | grep "^${initial_ip_prefix}\." | sort -t . -k 1,1n -k 2,2n -k 3,3n -k 4,4n) - local iptable_entries=$(iptables-save -t nat) - - _3rdoctet=100 - ip_prefix=$initial_ip_prefix - - # - # Optimize the process to get free local IP by starting the loop to choose - # 3rd and 4th octet from the number which was used last and still exist in - # MOUNTMAPv3 instead of starting it from 100. - # - if [ $OPTIMIZE_GET_FREE_LOCAL_IP == true -a -n "$used_local_ips_with_same_prefix" ]; then - - last_used_ip=$(echo "$used_local_ips_with_same_prefix" | tail -n1) - - IFS="." read _ _ last_used_3rd_octet last_used_4th_octet <<< "$last_used_ip" - - if [ $num_octets -eq 2 ]; then - if [ "$last_used_3rd_octet" == "254" -a "$last_used_4th_octet" == "254" ]; then - return 1 - fi - - _3rdoctet=$last_used_3rd_octet - optimize_get_free_local_ip=true - else - if [ "$last_used_4th_octet" == "254" ]; then - return 1 - fi - - optimize_get_free_local_ip=true - fi - fi - - while true; do - if [ $num_octets -eq 2 ]; then - for ((; _3rdoctet<255; _3rdoctet++)); do - ip_prefix="${initial_ip_prefix}.$_3rdoctet" - - if is_link_ip $ip_prefix; then - vecho "Skipping link network ${ip_prefix}!" 
- continue - fi - - break - done - - if [ $_3rdoctet -eq 255 ]; then - # - # If the IP prefix had 2 octets and we exhausted all possible - # values of the 3rd and 4th octet, then we have failed the - # search for free local IP within the given prefix. - # - return 1 - fi - fi - - if $optimize_get_free_local_ip; then - _4thoctet=$(expr ${last_used_4th_octet} + 1) - optimize_get_free_local_ip=false - else - _4thoctet=100 - fi - - for ((; _4thoctet<255; _4thoctet++)); do - local_ip="${ip_prefix}.$_4thoctet" - - is_ip_used_by_aznfs=$(echo "$used_local_ips_with_same_prefix" | grep "^${local_ip}$") - if [ -n "$is_ip_used_by_aznfs" ]; then - vecho "$local_ip is in use by aznfs!" - continue - fi - - if is_host_ip $local_ip; then - vecho "Skipping host address ${local_ip}!" - continue - fi - - if is_link_ip $local_ip; then - vecho "Skipping link network ${local_ip}!" - continue - fi - - if [ "$nfs_ip" == "$local_ip" ]; then - vecho "Skipping private endpoint IP ${nfs_ip}!" - continue - fi - - is_present_in_iptables=$(echo "$iptable_entries" | grep -c "\<${local_ip}\>") - if [ $is_present_in_iptables -ne 0 ]; then - vecho "$local_ip is already present in iptables!" - continue - fi - - # - # Try pinging the address to be sure it is not in use in the - # client network. - # - # Note: If the address exists but not responding to ICMP ping then - # we will incorrectly treat it as non-exixtent. - # - if is_pinging $local_ip; then - vecho "Skipping $local_ip as it appears to be in use on the network!" - continue - fi - - vecho "Using local IP ($local_ip) for aznfs." - break - done - - if [ $_4thoctet -eq 255 ]; then - if [ $num_octets -eq 2 ]; then - let _3rdoctet++ - continue - else - # - # If the IP prefix had 3 octets and we exhausted all possible - # values of the 4th octet, then we have failed the search for - # free local IP within the given prefix. - # - return 1 - fi - fi - - # - # Happy path! - # - # Add this entry to MOUNTMAPv3 while we have the MOUNTMAPv3 lock. 
- # This is to avoid assigning same local ip to parallel mount requests - # for different endpoints. - # ensure_mountmapv3_exist will also create a matching iptable DNAT rule. - # - LOCAL_IP=$local_ip - ensure_mountmapv3_exist_nolock "$nfs_host $LOCAL_IP $nfs_ip" - - return 0 - done - - # We will never reach here. -} - -# -# Get a local IP that is free to use. Set global variable LOCAL_IP if found. -# -get_free_local_ip() -{ - for ip_prefix in $IP_PREFIXES; do - vecho "Trying IP prefix ${ip_prefix}." - if search_free_local_ip_with_prefix "$ip_prefix"; then - return 0 - fi - done - - # - # If the above loop is not able to find a free local IP using optimized way, - # do a linear search to get the free local IP. - # - vecho "Falling back to linear search for free ip!" - OPTIMIZE_GET_FREE_LOCAL_IP=false - for ip_prefix in $IP_PREFIXES; do - vecho "Trying IP prefix ${ip_prefix}." - if search_free_local_ip_with_prefix "$ip_prefix"; then - return 0 - fi - done - - # If we come here we did not get a free address to use. - return 1 -} - -# -# For the given AZNFS endpoint FQDN return a local IP that should proxy it. -# If there is at least one mount to the same FQDN it MUST return the local IP -# used for that, else assign a new free local IP. -# -get_local_ip_for_fqdn() -{ - local fqdn=$1 - local mountmap_entry=$(grep -m1 "^${fqdn} " $MOUNTMAPv3) - # One local ip per fqdn, so return existing one if already present. - IFS=" " read _ local_ip _ <<< "$mountmap_entry" - - if [ -n "$local_ip" ]; then - LOCAL_IP=$local_ip - - # - # Ask aznfswatchdog to stay away while we are using this proxy IP. - # This is similar to holding a timed lease, we can safely use this - # proxy IP w/o worrying about aznfswatchdog deleting it for 5 minutes. - # - touch_mountmapv3 - - # - # This is not really needed since iptable entry must also be present, - # but it's always better to ensure MOUNTMAPv3 and iptable entries are - # in sync. 
- # - ensure_iptable_entry $local_ip $nfs_ip - return 0 - fi - - # - # First mount of an account on this client. - # - get_free_local_ip -} - # # Perform a pseudo mount to generate a gatepass for the actual mount call. # This request is expected to fail with "server access denied" if server-side changes are enabled, @@ -1144,7 +783,7 @@ if [ ! -f "$MOUNTMAPv3" ]; then fi # Resolve the IP address for the NFS host -nfs_ip=$(resolve_ipv4_with_preference_to_mountmapv3 "$nfs_host") +nfs_ip=$(resolve_ipv4_with_preference_to_mountmap "$nfs_host" $MOUNTMAPv3) status=$? if [ $status -ne 0 ]; then if [ $status -eq 2 ]; then @@ -1169,7 +808,7 @@ flock -e $fd # cause "accounts mounted on one client" to exceed the limit. # if check_account_count; then - get_local_ip_for_fqdn $nfs_host + get_local_ip_for_fqdn $nfs_host $MOUNTMAPv3 ret=$? account_limit_exceeded=0 else diff --git a/src/nfsv4mountscript.sh b/src/nfsv4mountscript.sh index af09b75be..183a32233 100644 --- a/src/nfsv4mountscript.sh +++ b/src/nfsv4mountscript.sh @@ -49,6 +49,12 @@ cleanup() { exec {fd2}<&- } +# +# Local IP that is free to use. +# +LOCAL_IP="" + + get_next_available_port() { for ((port=NFSV4_PORT_RANGE_START; port<=NFSV4_PORT_RANGE_END; port++)) @@ -303,7 +309,7 @@ add_stunnel_configuration() eecho "Failed to 'client = yes' to $stunnel_conf_file!" return 1 fi - + echo "accept = $LOCALHOST:$available_port" >> $stunnel_conf_file if [ $? -ne 0 ]; then chattr -f +i $stunnel_conf_file @@ -340,6 +346,7 @@ add_stunnel_configuration() chattr -f +i $stunnel_conf_file } + check_if_notls_mount_exists() { # Check if the mount to the same endpoint exists that is using clear text (without TLS). @@ -421,7 +428,9 @@ tls_nfsv4_files_share_mount() exec {fd2}<$MOUNTMAPv4 flock -e $fd2 - vecho "nfs_dir=[$nfs_dir], nfs_host_ip=[$storageaccount_ip], mount_point=[$mount_point], options=[$OPTIONS], mount_options=[$MOUNT_OPTIONS]." 
+ crc32=$(get_aznfs_ctrl_filename "$nfs_host") + + vecho "nfs_dir=[$nfs_dir], nfs_host_ip=[$storageaccount_ip], mount_point=[$mount_point], crc_32=[$crc32], options=[$OPTIONS], mount_options=[$MOUNT_OPTIONS]." IFS=/ read _ storageaccount container extra <<< "$nfs_dir" @@ -450,7 +459,7 @@ tls_nfsv4_files_share_mount() # # If this echo fails then MOUNTMAPv4 could be truncated. # - echo "$out" > $MOUNTMAPv4 + echo "$out" > $MOUNTMAPv4 ret=$? out= if [ $ret -ne 0 ]; then @@ -590,8 +599,11 @@ tls_nfsv4_files_share_mount() # Waiting: mountmap entry is added but mount command is not executed yet. Watchdog can ignore this entry. # Mounted: mount command is executed successfully. If the mount is unmounted, watchdog can remove this entry. # Failed: mount command failed. Watchdog can remove this entry. + # NOTE: Since multi-mount scenario is not officially supported, if multiple accounts are mounted to the same server IP during a migration, + # if some accounts are not migrated, this may cause an issue for that leftover account. - local mountmap_entry="$storageaccount_ip;$stunnel_conf_file;$stunnel_log_file;$stunnel_pid_file;$checksumHash;waiting;$mount_timeout" + local mountmap_entry="$storageaccount_ip;$stunnel_conf_file;$stunnel_log_file;$stunnel_pid_file;$checksumHash;waiting;$mount_timeout;$crc32" + vecho "Adding mountmap entry: [$mountmap_entry] to $MOUNTMAPv4" chattr -f -i $MOUNTMAPv4 echo "$mountmap_entry" >> $MOUNTMAPv4 if [ $? -ne 0 ]; then @@ -855,8 +867,30 @@ if [[ "$MOUNT_OPTIONS" == *"notls"* ]]; then MOUNT_OPTIONS=${MOUNT_OPTIONS//,notls/} fi - # Do the actual mount. - mount_output=$(mount -t nfs -o "$MOUNT_OPTIONS" "${nfs_host}:${nfs_dir}" "$mount_point" 2>&1) + # Resolve the IP address for the NFS host + nfs_ip=$(resolve_ipv4_with_preference_to_mountmap "$nfs_host" $MOUNTMAPv4NONTLS) + vecho "Resolved IP address for FQDN from mountmap [$nfs_host -> $nfs_ip]" + status=$? 
+ if [ $status -ne 0 ]; then + if [ $status -eq 2 ]; then + vecho "Resolved IP address for FQDN from mountmap [$nfs_host -> $nfs_ip]" + else + echo "$nfs_ip" + eecho "Cannot resolve IP address for ${nfs_host}!" + eecho "Mount failed!" + exit 1 + fi + fi + + # get local ip for fqdn, this here maps to target get_local_ip is from the IPTable + # this also creates the mountmap entries etc if they do not exist, so it has to be after the TLS and non-TLS mountmap checking + get_local_ip_for_fqdn $nfs_host $MOUNTMAPv4NONTLS + ret=$? + + vecho "nfs_host=[$nfs_host], nfs_ip=[$nfs_ip], nfs_dir=[$nfs_dir], mount_point=[$mount_point], options=[$OPTIONS], mount_options=[$MOUNT_OPTIONS], local_ip=[$LOCAL_IP]." + + # Do the actual non tls mount. + mount_output=$(mount -t nfs -o "$MOUNT_OPTIONS" "${LOCAL_IP}:${nfs_dir}" "$mount_point" 2>&1) mount_status=$? flock -u $fd2