diff --git a/cli/src/bacc/mpi_bm.py b/cli/src/bacc/mpi_bm.py
index 2bb72e9..e793cad 100644
--- a/cli/src/bacc/mpi_bm.py
+++ b/cli/src/bacc/mpi_bm.py
@@ -61,7 +61,7 @@ def populate_arguments(loader):
         "pool_id",
         options_list=["--pool-id", "-p"],
         help="The ID of the pool to use for the job.",
-        choices=['almalinux', 'rhel8']
+        # choices=['almalinux', 'rhel8']
     )
     c.argument(
         "await_completion",
@@ -72,8 +72,8 @@ def populate_arguments(loader):
     c.argument(
         "mpi_impl",
         options_list=["--mpi-implementation", "-m"],
-        help="The MPI implementation to use for the job.",
-        choices=['hpcx'],
+        help="The MPI implementation to use for the job. Ensure that the MPI implementation is available on the compute nodes.",
+        choices=['hpcx', 'openmpi', 'impi-2021', 'mvapich2'],
         arg_group="MPI Arguments",
     )

@@ -172,8 +172,12 @@ def execute(resource_group_name:str, subscription_id:str,
     job_id = "{}-{}".format('custom', uid)

     num_ranks_per_node = math.ceil(num_ranks / num_nodes)
-    if mpi_impl == "hpcx":
-        mpi_cmd=f"mpirun -host $(get_openmpi_hosts_with_slots) -x UCX_TLS=rc -x LD_LIBRARY_PATH --map-by ppr:{num_ranks_per_node}:node -np {num_ranks}"
+    if mpi_impl == "hpcx" or mpi_impl == "openmpi":
+        mpi_cmd=f"mpirun -host $(get_openmpi_hosts_with_slots) -mca coll_hcoll_enable 0 -x UCX_TLS=tcp -x LD_LIBRARY_PATH -x UCX_NET_DEVICES=eth0 --map-by ppr:{num_ranks_per_node}:node -np {num_ranks}"
+    elif mpi_impl == "impi-2021":
+        mpi_cmd=f"mpirun -hosts $(echo $AZ_BATCH_NODE_LIST | sed \"s/;/,/g\") -genv I_MPI_DEBUG 5 -genv I_MPI_FABRICS ofi -ppn {num_ranks_per_node} -np {num_ranks}"
+    elif mpi_impl == "mvapich2":
+        mpi_cmd=f"mpirun -host $(get_openmpi_hosts_with_slots) -x LD_LIBRARY_PATH --map-by ppr:{num_ranks_per_node}:node -np {num_ranks}"

     wrk_command = f"$(find {prefix}/{mpi_impl}/ -name {bm_exe} -type f | head -n 1) {bm_args}"
     task_cmd = f"bash -c 'source /etc/profile.d/modules.sh && source /mnt/batch_utils.sh && module load mpi/{mpi_impl} && {mpi_cmd} {wrk_command}'"
diff --git a/examples/mpi-benchmarks/deployment.bicep b/examples/mpi-benchmarks/deployment.bicep
index 7eb131c..ed1b229 100644
--- a/examples/mpi-benchmarks/deployment.bicep
+++ b/examples/mpi-benchmarks/deployment.bicep
@@ -45,6 +45,9 @@ param addressPrefix string = '10.121.0.0/16'
 @description('Batch Service Object Id (az ad sp show --id "ddbf3205-c6bd-46ae-8127-60eb93363864" --query id)')
 param batchServiceObjectId string

+@description('Log Analytics workspace resource id (optional). Leave empty to disable Log Analytics.')
+param logAnalyticsWorkspaceId string = ''
+
 //------------------------------------------------------------------------------
 var extraArgs = !empty(mpiWorkloadGitUrl) && !empty(mpiWorkloadGitBranch) && !empty(mpiWorkloadGitCMakePath) ? '-g ${mpiWorkloadGitUrl} -b ${mpiWorkloadGitBranch} -p ${mpiWorkloadGitCMakePath}' : ''
 var c0 = replace(loadTextContent('./config.jsonc'), '\${sku}', sku)
@@ -60,12 +63,18 @@ var peerings = !empty(vnetPeerResourceGroupName) && !empty(vnetPeerName) ? [{
   useGateway: true
 }] : []

-var hubConfig = !empty(peerings) ? {
+var hc0 = !empty(peerings) ? {
   network: {
     peerings: peerings
   }
 } : {}

+var hc1 = !empty(logAnalyticsWorkspaceId) ? {
+  diagnostics: {
+    logAnalyticsWorkspace: { id: logAnalyticsWorkspaceId }
+  }
+} : {}
+
 @description('suffix used for all nested deployments')
 var dplSuffix = uniqueString(deployment().name, location, resourceGroupName)

@@ -74,7 +83,7 @@ module mdlInfrastructure '../../modules/infrastructure.bicep' = {
   name: 'infrastructure-${dplSuffix}'
   params: {
     config: config
-    hubConfig: hubConfig
+    hubConfig: union(hc0, hc1)
     resourceGroupName: resourceGroupName
     location: location
     tags: tags
diff --git a/examples/mpi-benchmarks/start_task.sh b/examples/mpi-benchmarks/start_task.sh
index 441e1d4..68581f0 100755
--- a/examples/mpi-benchmarks/start_task.sh
+++ b/examples/mpi-benchmarks/start_task.sh
@@ -246,12 +246,6 @@ EOF
 install_intel_benchmarks () {
     mpi_impl=$1

-    # check if arguments are valid
-    if [ "$mpi_impl" != "hpcx" ]; then
-        echo "Invalid MPI implementation: ${mpi_impl}"
-        exit 1
-    fi
-
     status_file="${STATUS_PREFIX}/intel_benchmarks_installed_${mpi_impl}"
     if [ -f "${status_file}" ]; then
         echo "Intel MPI Benchmarks (${mpi_impl}) already installed. Skipping."
@@ -284,12 +278,6 @@ install_osu_benchmarks () {
     #--------
     mpi_impl=$1

-    # check if arguments are valid
-    if [ "$mpi_impl" != "hpcx" ]; then
-        echo "Invalid MPI implementation: ${mpi_impl}"
-        exit 1
-    fi
-
     status_file="${STATUS_PREFIX}/osu_benchmarks_installed_${mpi_impl}"
     if [ -f "${status_file}" ]; then
         echo "OSU Benchmarks (${mpi_impl}) already installed. Skipping."
@@ -303,7 +291,7 @@ install_osu_benchmarks () {
     tar -xvf osu-micro-benchmarks-7.0.1.tar.gz
     pushd osu-micro-benchmarks-7.0.1
-    ./configure CC=mpicc CXX=mpicxx --prefix=/mnt/osu-micro-benchmarks/${mpi_impl}
+    ./configure CC=mpicc CXX=mpicxx --prefix=${INSTALL_PREFIX}/osu-micro-benchmarks/${mpi_impl}
     make -j $(nproc)
     make install
     popd

@@ -366,7 +354,7 @@ EOF

 save_batch_utils () {
     # This function has utility functions for Batch tasks
-    cat << EOF > /mnt/batch_utils.sh
+    cat << EOF > ${INSTALL_PREFIX}/batch_utils.sh
 #!/usr/bin/env bash

 # This script has utility functions for Batch tasks
@@ -390,6 +378,17 @@ export AZ_BATCH_OMPI_HOSTS=\$(get_openmpi_hosts_with_slots)
 EOF
 }

+get_mpi_impls () {
+    mpi_impls=""
+    mpis_to_test="hpcx openmpi mvapich2 impi-2021"
+    for mpi in $mpis_to_test; do
+        if [ $(module avail -t mpi 2>&1 | grep -c "$mpi") -gt 0 ]; then
+            mpi_impls="$mpi_impls $mpi"
+        fi
+    done
+    echo $mpi_impls
+}
+
 if [ "${_arg_mofed}" = "on" ]; then
     echo "Installing Mellanox OFED drivers"
     install_dependencies
@@ -403,24 +402,32 @@ if [ "${_arg_mpis}" = "on" ]; then
 fi

 source /etc/profile.d/modules.sh
+mpi_impls=$(get_mpi_impls)
+
 if [ "${_arg_ibm}" = "on" ]; then
     echo "Installing Intel MPI Benchmarks"
-    install_intel_benchmarks hpcx
+    for mpi in $mpi_impls; do
+        install_intel_benchmarks $mpi
+    done
     module purge
 fi

 if [ "${_arg_osu}" = "on" ]; then
     echo "Installing OSU Micro Benchmarks"
-    install_osu_benchmarks hpcx
+    for mpi in $mpi_impls; do
+        install_osu_benchmarks $mpi
+    done
     module purge
 fi

 # build mpi workload
 if [ -n "${_arg_git_url}" ]; then
     echo "Building MPI workload from git repo"
-    install_mpi_workload "${_arg_git_url}" "${_arg_git_branch}" "${_arg_git_path}" "hpcx"
+    for mpi in $mpi_impls; do
+        install_mpi_workload "${_arg_git_url}" "${_arg_git_branch}" "${_arg_git_path}" $mpi
+    done
     module purge
 fi

-# save batch_utils to /mnt/batch_utils.sh
+# save batch_utils to ${INSTALL_PREFIX}/batch_utils.sh
 save_batch_utils
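
Usage sketch for the expanded --mpi-implementation flag. Only the flags
--pool-id/-p and --mpi-implementation/-m and the choices hpcx, openmpi,
impi-2021, and mvapich2 are confirmed by this patch; the "bacc mpi-bm"
subcommand name and the --subscription-id/--resource-group spellings are
hypothetical placeholders inferred from execute()'s parameters. It assumes the
pool's start task ran with MPI installation enabled, so a matching mpi/<impl>
environment module exists on the compute nodes.

    # Hypothetical invocation: submit the benchmark job with one of the
    # newly supported implementations (here, Intel MPI 2021).
    bacc mpi-bm \
        --subscription-id <subscription-id> \
        --resource-group <resource-group> \
        --pool-id <pool-id> \
        --mpi-implementation impi-2021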