diff --git a/.github/workflows/workflow.yml b/.github/workflows/workflow.yml index f94e15a..31bfe5c 100644 --- a/.github/workflows/workflow.yml +++ b/.github/workflows/workflow.yml @@ -166,7 +166,7 @@ jobs: git config user.name "github-actions[bot]" git config user.email "github-actions[bot]@users.noreply.github.com" git fetch origin gh-pages - git checkout gh-pages + git checkout -f gh-pages cp /tmp/index.html index.html git add index.html git diff --cached --quiet || git commit -m "Update docs for ${{ steps.get_version.outputs.version }}" diff --git a/Dockerfile.devenv b/Dockerfile.devenv index cdbff7a..cca15bb 100644 --- a/Dockerfile.devenv +++ b/Dockerfile.devenv @@ -14,6 +14,7 @@ RUN apt-get update && apt-get install -y \ g++-11 \ make \ cmake \ + lsof \ clang-format \ libboost-dev \ libboost-program-options-dev \ diff --git a/client-scripts/notebooks/analytical_congestion_aware_sample.ipynb b/client-scripts/notebooks/analytical_congestion_aware_sample.ipynb new file mode 100644 index 0000000..793dbce --- /dev/null +++ b/client-scripts/notebooks/analytical_congestion_aware_sample.ipynb @@ -0,0 +1,209 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "42708369", + "metadata": {}, + "source": [ + "##### Import the required modules and configure the system path to locate them" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "970611b8", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"../utils\")\n", + "from astra_sim import AstraSim, Collective, NetworkBackend" + ] + }, + { + "cell_type": "markdown", + "id": "fa81522b", + "metadata": {}, + "source": [ + "##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a89f699a", + "metadata": {}, + "outputs": [], + "source": [ + "astra = AstraSim(server_endpoint = \"172.17.0.2:8989\", tag = \"analytical_congestion_aware_sample\")" + ] + }, + { + "cell_type": "markdown", + "id": "67e495d4", + "metadata": {}, + "source": [ + "##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a3f00619", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLTOALL, coll_size= 8*1024*1024, npu_range=[0,8])\n", + "print(astra.configuration.common_config.workload)" + ] + }, + { + "cell_type": "markdown", + "id": "f7fe6ba5", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim system config" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2bc05826", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO\n", + "astra.configuration.common_config.system.endpoint_delay = 10\n", + "astra.configuration.common_config.system.active_chunks_per_dimension = 1\n", + "astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING]\n", + "astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT]\n", + "astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.RING]\n", + "astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE\n", + "astra.configuration.common_config.system.local_mem_bw = 1600\n", + "print(astra.configuration.common_config.system)" + ] + 
}, + { + "cell_type": "markdown", + "id": "292def3f", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim remote memory configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6353e6c6", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION\n", + "print(astra.configuration.common_config.remote_memory)" + ] + }, + { + "cell_type": "markdown", + "id": "76e50273", + "metadata": {}, + "source": [ + "##### Configure the network backend and topology" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0384143f", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "# astra.configuration.network_backend.choice = astra.configuration.network_backend.ANALYTICAL_CONGESTION_AWARE\n", + "astra.configuration.network_backend.analytical_congestion_aware.topology.network.clear()\n", + "astra.configuration.network_backend.analytical_congestion_aware.topology.network.add(\"switch\", 8, 100, 0.005) # add(type_of_topology, number_of_nodes, bandwidth_in_gbps, latency_in_ns)\n", + "print(\"Network backend set to\", astra.configuration.network_backend.choice)\n", + "print(\"network backend choice set to:\",astra.configuration.network_backend.analytical_congestion_aware.topology.choice)" + ] + }, + { + "cell_type": "markdown", + "id": "8a9b435c", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim cmd parameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f2cbf6ec", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "astra.configuration.common_config.cmd_parameters.comm_scale = 1\n", + "astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False\n", + "astra.configuration.common_config.cmd_parameters.injection_scale = 1\n", + "\n", + 
"print(astra.configuration.network_backend.analytical_congestion_aware.topology.network)" + ] + }, + { + "cell_type": "markdown", + "id": "a6857638", + "metadata": {}, + "source": [ + "#### Start the simulation by specifying the network backend" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8e0cfef", + "metadata": {}, + "outputs": [], + "source": [ + "astra.run_simulation(NetworkBackend.ANALYTICAL_CONGESTION_AWARE)" + ] + }, + { + "cell_type": "markdown", + "id": "0d74c3ca", + "metadata": {}, + "source": [ + "##### Download all the configurations as a zip" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46bbb3e0", + "metadata": {}, + "outputs": [], + "source": [ + "astra.download_configuration()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/client-scripts/notebooks/analytical_congestion_aware_sample.py b/client-scripts/notebooks/analytical_congestion_aware_sample.py deleted file mode 100644 index a166181..0000000 --- a/client-scripts/notebooks/analytical_congestion_aware_sample.py +++ /dev/null @@ -1,92 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.19.1 -# kernelspec: -# display_name: Python 3 -# language: python -# name: python3 -# --- - -# %% [markdown] -# ##### Import the required modules and configure the system path to locate them - -# %% -import sys -sys.path.append("../utils") -from astra_sim import AstraSim, Collective, NetworkBackend - -# %% [markdown] -# ##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs. 
- -# %% -astra = AstraSim(server_endpoint = "172.17.0.2:8989", tag = "analytical_congestion_aware_sample") - -# %% [markdown] -# ##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration - -# %% -astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLTOALL, coll_size= 8*1024*1024, npu_range=[0,8]) -print(astra.configuration.common_config.workload) - - -# %% [markdown] -# ##### Configure ASTRA-sim system config - -# %% -astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO -astra.configuration.common_config.system.endpoint_delay = 10 -astra.configuration.common_config.system.active_chunks_per_dimension = 1 -astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING] -astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT] -astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.RING] -astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE -astra.configuration.common_config.system.local_mem_bw = 1600 -print(astra.configuration.common_config.system) - - -# %% [markdown] -# ##### Configure ASTRA-sim remote memory configuration - -# %% -astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION -print(astra.configuration.common_config.remote_memory) - - -# %% [markdown] -# ##### Configure the network backend and topology - -# %% -# astra.configuration.network_backend.choice = astra.configuration.network_backend.ANALYTICAL_CONGESTION_AWARE -astra.configuration.network_backend.analytical_congestion_aware.topology.network.clear() 
-astra.configuration.network_backend.analytical_congestion_aware.topology.network.add("switch", 8, 100, 0.005) # add(type_of_topology, number_of_nodes, bandwidth_in_gbps, latency_in_ns) -print("Network backend set to", astra.configuration.network_backend.choice) -print("network backend choice set to:",astra.configuration.network_backend.analytical_congestion_aware.topology.choice) - - -# %% [markdown] -# ##### Configure ASTRA-sim cmd parameters - -# %% -astra.configuration.common_config.cmd_parameters.comm_scale = 1 -astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False -astra.configuration.common_config.cmd_parameters.injection_scale = 1 - -print(astra.configuration.network_backend.analytical_congestion_aware.topology.network) - - -# %% [markdown] -# #### Start the simulation by specifying the network backend - -# %% -astra.run_simulation(NetworkBackend.ANALYTICAL_CONGESTION_AWARE) - -# %% [markdown] -# ##### Download all the configurations as a zip - -# %% -astra.download_configuration() diff --git a/client-scripts/notebooks/analytical_congestion_unaware_sample.ipynb b/client-scripts/notebooks/analytical_congestion_unaware_sample.ipynb new file mode 100644 index 0000000..f203f31 --- /dev/null +++ b/client-scripts/notebooks/analytical_congestion_unaware_sample.ipynb @@ -0,0 +1,207 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "eb43ba22", + "metadata": {}, + "source": [ + "##### Import the required modules and configure the system path to locate them" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3060cfd4", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"../utils\")\n", + "from astra_sim import AstraSim, Collective, NetworkBackend" + ] + }, + { + "cell_type": "markdown", + "id": "70c0620f", + "metadata": {}, + "source": [ + "##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create 
a tagged folder for configs, results, and logs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1d58edab", + "metadata": {}, + "outputs": [], + "source": [ + "astra = AstraSim(server_endpoint =\"172.17.0.2:8989\", tag = \"analytical_congestion_unaware_sample\")" + ] + }, + { + "cell_type": "markdown", + "id": "9ccee35f", + "metadata": {}, + "source": [ + "##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e81c44d", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 1024*1024*1024, npu_range=[0,8])\n", + "print(astra.configuration.common_config.workload)" + ] + }, + { + "cell_type": "markdown", + "id": "64b74479", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim system config" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "329a9e55", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO\n", + "astra.configuration.common_config.system.endpoint_delay = 10\n", + "astra.configuration.common_config.system.active_chunks_per_dimension = 1\n", + "astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING]\n", + "astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT]\n", + "astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.RING]\n", + "astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE\n", + "astra.configuration.common_config.system.local_mem_bw = 1600\n", +
"print(astra.configuration.common_config.system)" + ] + }, + { + "cell_type": "markdown", + "id": "714ee503", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim remote memory configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2dedb40d", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION\n", + "print(astra.configuration.common_config.remote_memory)" + ] + }, + { + "cell_type": "markdown", + "id": "b08c6473", + "metadata": {}, + "source": [ + "##### Configure the network backend and topology" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "260c0790", + "metadata": {}, + "outputs": [], + "source": [ + "# astra.configuration.network_backend.choice = astra.configuration.network_backend.ANALYTICAL_CONGESTION_UNAWARE\n", + "astra.configuration.network_backend.analytical_congestion_unaware.topology.network.clear()\n", + "astra.configuration.network_backend.analytical_congestion_unaware.topology.network.add(\"fullyconnected\", 8, 100, 0.005) # add(type_of_topology, number_of_nodes, bandwidth_in_gbps, latency_in_ns)\n", + "print(\"Network backend set to\", astra.configuration.network_backend.choice)\n", + "print(\"network backend choice set to:\",astra.configuration.network_backend.analytical_congestion_unaware.topology.choice)" + ] + }, + { + "cell_type": "markdown", + "id": "e8c1dfdf", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim cmd parameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2415e723", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "astra.configuration.common_config.cmd_parameters.comm_scale = 1\n", + "astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False\n", + "astra.configuration.common_config.cmd_parameters.injection_scale = 1" + ] +
}, + { + "cell_type": "markdown", + "id": "99679487", + "metadata": {}, + "source": [ + "#### Start the simulation by specifying the network backend" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "073d2540", + "metadata": {}, + "outputs": [], + "source": [ + "astra.run_simulation(NetworkBackend.ANALYTICAL_CONGESTION_UNAWARE)" + ] + }, + { + "cell_type": "markdown", + "id": "beba5b30", + "metadata": {}, + "source": [ + "##### Download all the configurations as a zip" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aaba6af4", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "astra.download_configuration()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/client-scripts/notebooks/analytical_congestion_unaware_sample.py b/client-scripts/notebooks/analytical_congestion_unaware_sample.py deleted file mode 100644 index d699c45..0000000 --- a/client-scripts/notebooks/analytical_congestion_unaware_sample.py +++ /dev/null @@ -1,90 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.19.1 -# kernelspec: -# display_name: Python 3 -# language: python -# name: python3 -# --- - -# %% [markdown] -# ##### Import the required modules and configure the system path to locate them - -# %% -import sys -sys.path.append("../utils") -from astra_sim import AstraSim, Collective, NetworkBackend - -# %% [markdown] -# ##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs. 
- -# %% -astra = AstraSim(server_endpoint ="172.17.0.2:8989", tag = "analytical_congestion_unaware_sample") - -# %% [markdown] -# ##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration - -# %% -astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 1024*1024*1024, npu_range=[0,8]) -print(astra.configuration.common_config.workload) - - -# %% [markdown] -# ##### Configure ASTRA-sim system config - -# %% -astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO -astra.configuration.common_config.system.endpoint_delay = 10 -astra.configuration.common_config.system.active_chunks_per_dimension = 1 -astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING] -astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT] -astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.RING] -astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE -astra.configuration.common_config.system.local_mem_bw = 1600 -print(astra.configuration.common_config.system) - - -# %% [markdown] -# ##### Configure ASTRA-sim remote memory configuration - -# %% -astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION -print(astra.configuration.common_config.remote_memory) - - -# %% [markdown] -# ##### Configure the network backend and topology - -# %% -# astra.configuration.network_backend.choice = astra.configuration.network_backend.ANALYTICAL_CONGESTION_UNAWARE -astra.configuration.network_backend.analytical_congestion_aware.topology.network.clear() 
-astra.configuration.network_backend.analytical_congestion_unaware.topology.network.add("fullyconnected", 8, 100, 0.005) # add(type_of_topology, number_of_nodes, bandwidth_in_gbps, latency_in_ns) -print("Network backend set to", astra.configuration.network_backend.choice) -print("network backend choice set to:",astra.configuration.network_backend.analytical_congestion_unaware.topology.choice) - -# %% [markdown] -# ##### Configure ASTRA-sim cmd parameters - -# %% -astra.configuration.common_config.cmd_parameters.comm_scale = 1 -astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False -astra.configuration.common_config.cmd_parameters.injection_scale = 1 - - -# %% [markdown] -# #### Start the simulation by specifying the network backend - -# %% -astra.run_simulation(NetworkBackend.ANALYTICAL_CONGESTION_UNAWARE) - -# %% [markdown] -# ##### Download all the configurations as a zip - -# %% -astra.download_configuration() - diff --git a/client-scripts/notebooks/config_to_schema_sample.ipynb b/client-scripts/notebooks/config_to_schema_sample.ipynb new file mode 100644 index 0000000..df99828 --- /dev/null +++ b/client-scripts/notebooks/config_to_schema_sample.ipynb @@ -0,0 +1,258 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "794e5901", + "metadata": {}, + "source": [ + "##### Import the required modules and configure the system path to locate them" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9cd4574f", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "sys.path.append(\"../utils\")\n", + "from config_to_schema import TranslateConfig\n", + "import astra_sim_sdk.astra_sim_sdk as astra_sim" + ] + }, + { + "cell_type": "markdown", + "id": "ac016638", + "metadata": {}, + "source": [ + "##### Initialize astra-sim sdk" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3f03bf76", + "metadata": {}, + "outputs": [], + "source": [ + "config = astra_sim.Config()" + ] + 
}, + { + "cell_type": "code", + "execution_count": null, + "id": "86088050", + "metadata": {}, + "outputs": [], + "source": [ + "RESOURCES_DIR = os.path.join(os.getcwd(), \"../resourcess/\")" + ] + }, + { + "cell_type": "markdown", + "id": "6748d449", + "metadata": {}, + "source": [ + "##### Translate Remote Memory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5c2ae4df", + "metadata": {}, + "outputs": [], + "source": [ + "remote_mem_path = os.path.join(RESOURCES_DIR, \"RemoteMemory.json\")\n", + "TranslateConfig.translate_remote_memory(remote_mem_path, config)" + ] + }, + { + "cell_type": "markdown", + "id": "b9d359ef", + "metadata": {}, + "source": [ + "##### Translate System Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ba5028dc", + "metadata": {}, + "outputs": [], + "source": [ + "system_config_path = os.path.join(RESOURCES_DIR, \"system.json\")\n", + "TranslateConfig.translate_system_configuration(system_config_path, config)" + ] + }, + { + "cell_type": "markdown", + "id": "2ee6f37d", + "metadata": {}, + "source": [ + "##### Translate Communicator group Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5394e3a6", + "metadata": {}, + "outputs": [], + "source": [ + "communicator_config_path = os.path.join(RESOURCES_DIR, \"communicator_group.json\")\n", + "TranslateConfig.translate_communicator_configuration(communicator_config_path, config)" + ] + }, + { + "cell_type": "markdown", + "id": "7bd8daf5", + "metadata": {}, + "source": [ + "##### Translate nc-topology Configuration\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30286e0b", + "metadata": {}, + "outputs": [], + "source": [ + "nc_topology_file_path = os.path.join(RESOURCES_DIR, \"nc-topology-file.txt\")\n", + "TranslateConfig.translate_ns3_nc_topology_configuration(nc_topology_file_path, config)" + ] + }, + { + "cell_type": "markdown", + "id":
"55c3383b", + "metadata": {}, + "source": [ + "##### Translate network Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bb0467cd", + "metadata": {}, + "outputs": [], + "source": [ + "network_config_path = os.path.join(RESOURCES_DIR, \"network_config.txt\")\n", + "TranslateConfig.translate_ns3_network_configuration(network_config_path, config)" + ] + }, + { + "cell_type": "markdown", + "id": "d7ec022b", + "metadata": {}, + "source": [ + "##### Translate logical Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fef72781", + "metadata": {}, + "outputs": [], + "source": [ + "logical_dim_file = os.path.join(RESOURCES_DIR, \"logical.json\")\n", + "TranslateConfig.translate_ns3_logical_configuration(logical_dim_file, config)" + ] + }, + { + "cell_type": "markdown", + "id": "595828c0", + "metadata": {}, + "source": [ + "##### Translate analytical network Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a0dfea3", + "metadata": {}, + "outputs": [], + "source": [ + "analytical_network_file = os.path.join(RESOURCES_DIR, \"network.yaml\")\n", + "# There are three available backends — analytical_congestion_aware, analytical_congestion_unaware, and HTsim — all of which use the analytical_network file format, so specify the backend_name\n", + "TranslateConfig.translate_analytical_network(analytical_network_file, config, \"analytical_congestion_aware\")" + ] + }, + { + "cell_type": "markdown", + "id": "250e418b", + "metadata": {}, + "source": [ + "##### Translate HTsim fat tree topology" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6ee56854", + "metadata": {}, + "outputs": [], + "source": [ + "htsim_fat_tree_file = os.path.join(RESOURCES_DIR, \"8nodes.topo\")\n", + "TranslateConfig.translate_htsim_fat_tree_topology(htsim_fat_tree_file, config)" + ] + }, + { + "cell_type": "markdown", + "id": "1d586a45", + "metadata": {}, + "source": [ + "##### 
Translate ns3 trace file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9eebd213", + "metadata": {}, + "outputs": [], + "source": [ + "ns3_trace_file = os.path.join(RESOURCES_DIR, \"trace.txt\")\n", + "TranslateConfig.translate_ns3_trace_file_to_schema(ns3_trace_file, config)" + ] + }, + { + "cell_type": "markdown", + "id": "fd2fc03c", + "metadata": {}, + "source": [ + "##### Translate logging file to schema" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b60a44a", + "metadata": {}, + "outputs": [], + "source": [ + "logging_toml_file = os.path.join(RESOURCES_DIR, \"logging_config.toml\")\n", + "TranslateConfig.translate_logging_file_to_schema(logging_toml_file, config)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/client-scripts/notebooks/config_to_schema_sample.py b/client-scripts/notebooks/config_to_schema_sample.py deleted file mode 100644 index f57131c..0000000 --- a/client-scripts/notebooks/config_to_schema_sample.py +++ /dev/null @@ -1,104 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.19.1 -# kernelspec: -# display_name: Python 3 -# language: python -# name: python3 -# --- - -# %% [markdown] -# ##### Import the required modules and configure the system path to locate them - -# %% -import sys -import os -sys.path.append("../utils") -from config_to_schema import TranslateConfig -import astra_sim_sdk.astra_sim_sdk as astra_sim - -# %% [markdown] -# ##### Initialize astra-sim sdk - -# %% -config = astra_sim.Config() - -# %% -RESOURCES_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../resourcess/") - -# %% [markdown] -# ##### Translate Remote Memory - -# %% -remote_mem_path = 
os.path.join(RESOURCES_DIR, "RemoteMemory.json") -TranslateConfig.translate_remote_memory(remote_mem_path, config) - -# %% [markdown] -# ##### Translate System Configuration - -# %% -system_config_path = os.path.join(RESOURCES_DIR, "system.json") -TranslateConfig.translate_system_configuration(system_config_path, config) - -# %% [markdown] -# ##### Translate Communicator group Configuration - -# %% -communicator_config_path = os.path.join(RESOURCES_DIR, "communicator_group.json") -TranslateConfig.translate_communicator_configuration(communicator_config_path, config) - -# %% [markdown] -# ##### Translate nc-topology Configuration -# - -# %% -nc_topology_file_path = os.path.join(RESOURCES_DIR, "nc-topology-file.txt") -TranslateConfig.translate_ns3_nc_topology_configuration(nc_topology_file_path, config) - -# %% [markdown] -# ##### Translate network Configuration - -# %% -network_config_path = os.path.join(RESOURCES_DIR, "network_config.txt") -TranslateConfig.translate_ns3_network_configuration(network_config_path, config) - -# %% [markdown] -# ##### Translate logical Configuration - -# %% -logical_dim_file = os.path.join(RESOURCES_DIR, "logical.json") -TranslateConfig.translate_ns3_logical_configuration(logical_dim_file, config) - -# %% [markdown] -# ##### Translate analytical network Configuration - -# %% -analytical_network_file = os.path.join(RESOURCES_DIR, "network.yaml") -# There are three available backends — analytical_congestion_aware, analytical_congestion_unaware, and HTsim — all of which use the analytical_network file format, so specify the backend_name -TranslateConfig.translate_analytical_network(analytical_network_file, config, "analytical_congestion_aware") - -# %% [markdown] -# ##### Translate HTsim fat tree topology - -# %% -htsim_fat_tree_file = os.path.join(RESOURCES_DIR, "8nodes.topo") -TranslateConfig.translate_htsim_fat_tree_topology(htsim_fat_tree_file, config) - -# %% [markdown] -# ##### Translate ns3 trace file - -# %% -ns3_trace_file = 
os.path.join(RESOURCES_DIR, "trace.txt") -TranslateConfig.translate_ns3_trace_file_to_schema(ns3_trace_file, config) - -# %% [markdown] -# ##### Translate logging file to schema - -# %% -logging_toml_file = os.path.join(RESOURCES_DIR, "logging_config.toml") -TranslateConfig.translate_logging_file_to_schema(logging_toml_file, config) diff --git a/client-scripts/notebooks/htsim_sample.ipynb b/client-scripts/notebooks/htsim_sample.ipynb new file mode 100644 index 0000000..f6eedba --- /dev/null +++ b/client-scripts/notebooks/htsim_sample.ipynb @@ -0,0 +1,240 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8ca5060e", + "metadata": {}, + "source": [ + "##### Import the required modules and configure the system path to locate them" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0578d4a5", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"../utils\")\n", + "from astra_sim import AstraSim, Collective, NetworkBackend" + ] + }, + { + "cell_type": "markdown", + "id": "2758ebfe", + "metadata": {}, + "source": [ + "##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed5628bc", + "metadata": {}, + "outputs": [], + "source": [ + "astra = AstraSim(server_endpoint =\"172.17.0.2:8989\",tag = \"htsim_sample\")" + ] + }, + { + "cell_type": "markdown", + "id": "768c6eaa", + "metadata": {}, + "source": [ + "##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0531dfd", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 1 * 1024 * 1024, npu_range=[0, 8])\n", + "print(astra.configuration.common_config.workload)" + ] + }, + { + "cell_type": "markdown", + "id": "3a7b4d31", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim system config" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "015c17a2", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO\n", + "astra.configuration.common_config.system.endpoint_delay = 10\n", + "astra.configuration.common_config.system.active_chunks_per_dimension = 1\n", + "astra.configuration.common_config.system.preferred_dataset_splits = 4\n", + "astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING]\n", + "astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT]\n", + "astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.RING]\n", + "astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE\n", + "astra.configuration.common_config.system.local_mem_bw = 1600\n", + "astra.configuration.common_config.system.peak_perf = 
900\n", + "astra.configuration.common_config.system.roofline_enabled = 0\n", + "print(astra.configuration.common_config.system)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "8dedb720", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim remote memory configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b59596b", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION\n", + "print(astra.configuration.common_config.remote_memory)" + ] + }, + { + "cell_type": "markdown", + "id": "cc132acb", + "metadata": {}, + "source": [ + "##### Configure the network backend, htsim protocol and topology" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f6fbfff", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.network_backend.htsim.topology.network_topology_configuration.network.clear()\n", + "astra.configuration.network_backend.htsim.topology.network_topology_configuration.network.add(\"ring\", 8, 100, 0.005) # add(type_of_topology, number_of_nodes, bandwidth_in_gbps, latency_in_ns)\n", + "astra.configuration.network_backend.htsim.htsim_protocol.choice = astra.configuration.network_backend.htsim.htsim_protocol.TCP\n", + "print(\"Network backend set to\", astra.configuration.network_backend.choice)\n", + "print(\"network backend choice set to:\",astra.configuration.network_backend.htsim.topology.choice)\n", + "print(\"protocol set to\", astra.configuration.network_backend.htsim.htsim_protocol.choice)" + ] + }, + { + "cell_type": "markdown", + "id": "b3c1ac6b", + "metadata": {}, + "source": [ + "##### Configure the fat tree topology." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a468797", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "# Configuring topo file\n", + "astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.nodes = 8\n", + "astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.podsize = 4\n", + "astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tiers = 3\n", + "\n", + "# Configuring values for each tier\n", + "# Configuring values for tier 0\n", + "astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_0.set(\n", + " downlink_speed_gbps=200\n", + ")\n", + "astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_0.radix_down = 2\n", + "astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_0.radix_up = 2\n", + "astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_0.downlink_latency_ns = 1000\n", + "\n", + "# Configuring values for tier 1\n", + "astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_1.set(\n", + " downlink_speed_gbps=200\n", + ")\n", + "astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_1.radix_down = 2\n", + "astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_1.radix_up = 4\n", + "astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_1.downlink_latency_ns = 1000\n", + "astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_1.bundle = 1\n", +
"\n", + "# Configuring values for tier 2\n", + "astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_2.set(\n", + " downlink_speed_gbps=100\n", + ")\n", + "astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_2.radix_down = 4\n", + "astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_2.downlink_latency_ns = 1000\n", + "astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_2.bundle = 2\n", + "\n", + "astra.configuration.network_backend.htsim.htsim_protocol.tcp.nodes = \"8\"\n", + "print(astra.configuration.network_backend.htsim.topology.network_topology_configuration)" + ] + }, + { + "cell_type": "markdown", + "id": "00a5b3d4", + "metadata": {}, + "source": [ + "#### Start the simulation by specifying the network backend" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fca076da", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "astra.run_simulation(NetworkBackend.HTSIM)" + ] + }, + { + "cell_type": "markdown", + "id": "2d1c5c1e", + "metadata": {}, + "source": [ + "##### Download all the configurations as a zip" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6e88a73e", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "astra.download_configuration()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/client-scripts/notebooks/htsim_sample.py b/client-scripts/notebooks/htsim_sample.py deleted file mode 100644 index 4d2e364..0000000 --- a/client-scripts/notebooks/htsim_sample.py +++ /dev/null @@ -1,125 +0,0 @@ -# --- -# jupyter: -# 
jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.19.1 -# kernelspec: -# display_name: Python 3 -# language: python -# name: python3 -# --- - -# %% [markdown] -# ##### Import the required modules and configure the system path to locate them - -# %% -import sys -sys.path.append("../utils") -from astra_sim import AstraSim, Collective, NetworkBackend - -# %% [markdown] -# ##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs. - -# %% -astra = AstraSim(server_endpoint ="172.17.0.2:8989",tag = "htsim_sample") - -# %% [markdown] -# ##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration - -# %% -astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 1 * 1024 * 1024, npu_range=[0, 8]) -print(astra.configuration.common_config.workload) - -# %% [markdown] -# ##### Configure ASTRA-sim system config - -# %% -astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO -astra.configuration.common_config.system.endpoint_delay = 10 -astra.configuration.common_config.system.active_chunks_per_dimension = 1 -astra.configuration.common_config.system.preferred_dataset_splits = 4 -astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING] -astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT] -astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.RING] -astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE -astra.configuration.common_config.system.local_mem_bw = 1600 
-astra.configuration.common_config.system.peak_perf = 900 -astra.configuration.common_config.system.roofline_enabled = 0 -print(astra.configuration.common_config.system) - - - -# %% [markdown] -# ##### Configure ASTRA-sim remote memory configuration - -# %% -astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION -print(astra.configuration.common_config.remote_memory) - - -# %% [markdown] -# ##### Configure the network backend, htsim protocol and topology - -# %% -astra.configuration.network_backend.htsim.topology.network_topology_configuration.network.clear() -astra.configuration.network_backend.htsim.topology.network_topology_configuration.network.add("ring", 8, 100, 0.005) # add(type_of_topology, number_of_nodes, bandwidth_in_gbps, latency_in_ns) -astra.configuration.network_backend.htsim.htsim_protocol.choice = astra.configuration.network_backend.htsim.htsim_protocol.TCP -print("Network backend set to", astra.configuration.network_backend.choice) -print("network backend choice set to:",astra.configuration.network_backend.htsim.topology.choice) -print("protocol set to", astra.configuration.network_backend.htsim.htsim_protocol.choice) - -# %% [markdown] -# ##### Configure the fat tree topology. 
- -# %% -# Configuring topo file -astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.nodes = 8 -astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.podsize = 4 -astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tiers = 3 - -# Configuring values for each tiers -# Configuring values for tier 0 -astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_0.set( - downlink_speed_gbps=200 -) -astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_0.radix_down = 2 -astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_0.radix_up = 2 -astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_0.downlink_latency_ns = 1000 - -# Configuring values for tier 1 -astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_1.set( - downlink_speed_gbps=200 -) -astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_1.radix_down = 2 -astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_1.radix_up = 4 -astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_1.downlink_latency_ns = 1000 -astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_1.bundle = 1 - -# Configuring values for tier 2 -astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_2.set( - downlink_speed_gbps=100 -) 
-astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_2.radix_down = 4 -astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_2.downlink_latency_ns = 1000 -astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_2.bundle = 2 - -astra.configuration.network_backend.htsim.htsim_protocol.tcp.nodes = "8" -print(astra.configuration.network_backend.htsim.topology.network_topology_configuration) - - -# %% [markdown] -# #### Start the simulation by specifying the network backend - -# %% -astra.run_simulation(NetworkBackend.HTSIM) - - -# %% [markdown] -# ##### Download all the configurations as a zip - -# %% -astra.download_configuration() - diff --git a/client-scripts/notebooks/infragraph/analytical_dgx_device.ipynb b/client-scripts/notebooks/infragraph/analytical_dgx_device.ipynb new file mode 100644 index 0000000..12c7a39 --- /dev/null +++ b/client-scripts/notebooks/infragraph/analytical_dgx_device.ipynb @@ -0,0 +1,346 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "650e72e0", + "metadata": {}, + "source": [ + "##### Import the required modules and configure the system path to locate them" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c7bf2c9", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"../../utils\")\n", + "import networkx\n", + "import yaml\n", + "import os\n", + "import subprocess\n", + "import astra_sim_sdk.astra_sim_sdk as astra_sim_kit\n", + "from common import FileFolderUtils\n", + "from IPython.display import IFrame\n", + "from astra_sim import AstraSim, Collective, NetworkBackend\n", + "from infragraph.infragraph_service import InfraGraphService\n", + "from infragraph.blueprints.devices.nvidia.dgx import NvidiaDGX, DgxProfile\n", + "from infragraph import Infrastructure\n", + "\n" + ] + }, + { + "cell_type": 
"markdown", + "id": "d5857661", + "metadata": {}, + "source": [ + "##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c4ff237b", + "metadata": {}, + "outputs": [], + "source": [ + "astra = AstraSim(server_endpoint = \"172.17.0.2:8989\", tag = \"analytical_dgx_device\")" + ] + }, + { + "cell_type": "markdown", + "id": "0cc43938", + "metadata": {}, + "source": [ + "##### Get all available DGX variants" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dccce3c9", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import get_args\n", + "print(get_args(DgxProfile))" + ] + }, + { + "cell_type": "markdown", + "id": "25987b17", + "metadata": {}, + "source": [ + "##### Create a Nvidia DGX device fabric using infragraph device blueprint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a0b157d", + "metadata": {}, + "outputs": [], + "source": [ + "server = NvidiaDGX()\n", + "infrastructure = Infrastructure()\n", + "infrastructure.devices.append(server)\n", + "infrastructure.instances.add(name=server.name, device=server.name, count=1)\n", + "astra.configuration.infragraph.infrastructure.deserialize(infrastructure.serialize())\n", + "print(astra.configuration.infragraph.infrastructure)" + ] + }, + { + "cell_type": "markdown", + "id": "a27e7ce9", + "metadata": {}, + "source": [ + "##### Initialize the Infragraph service, display the fabric topology, and retrieve/set the total number of NPUs to generate the collective" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24921fe9", + "metadata": {}, + "outputs": [], + "source": [ + "service = InfraGraphService()\n", + "service.set_graph(infrastructure)\n", + "total_npus = service.get_component(device=server, type=\"xpu\").count\n", + "g = 
service.get_networkx_graph()\n", + "print(networkx.write_network_text(g, vertical_chains=True))" + ] + }, + { + "cell_type": "markdown", + "id": "b0f2bfc4", + "metadata": {}, + "source": [ + "##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "793d7563", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 1 *1024*1024, npu_range=[0, total_npus])" + ] + }, + { + "cell_type": "markdown", + "id": "99de70db", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim system config" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b56c38cf", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO\n", + "astra.configuration.common_config.system.endpoint_delay = 10\n", + "astra.configuration.common_config.system.active_chunks_per_dimension = 1\n", + "astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING]\n", + "astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT]\n", + "astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.ONERING]\n", + "astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE\n", + "astra.configuration.common_config.system.local_mem_bw = 1600" + ] + }, + { + "cell_type": "markdown", + "id": "d339364c", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim remote memory configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "36490835", + "metadata": {}, + "outputs": [], + "source": [ + 
"astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION\n", + "print(astra.configuration.common_config.remote_memory)" + ] + }, + { + "cell_type": "markdown", + "id": "8836e23b", + "metadata": {}, + "source": [ + "##### Set ASTRA-sim network backend to ANALYTICAL_CONGESTION_AWARE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1efef83a", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.network_backend.analytical_congestion_aware.topology.choice = astra.configuration.network_backend.analytical_congestion_aware.topology.INFRAGRAPH" + ] + }, + { + "cell_type": "markdown", + "id": "bf4c4369", + "metadata": {}, + "source": [ + "##### Adding ASTRA-sim - Infragraph specific annotation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70bf271e", + "metadata": {}, + "outputs": [], + "source": [ + "host_device_spec = astra_sim_kit.AnnotationDeviceSpecifications()\n", + "host_device_spec.device_bandwidth_gbps = 100\n", + "host_device_spec.device_latency_ms = 0.05\n", + "host_device_spec.device_name = server.name\n", + "host_device_spec.device_type = \"host\"\n", + "astra.configuration.infragraph.annotations.device_specifications.append(host_device_spec)" + ] + }, + { + "cell_type": "markdown", + "id": "cd0bc41f", + "metadata": {}, + "source": [ + "##### Save infragraph as a yaml" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d13f9225", + "metadata": {}, + "outputs": [], + "source": [ + "with open(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,\"../infrastructure\",f\"{astra.tag}.yaml\"),\"w\") as f:\n", + " data = infrastructure.serialize(\"dict\")\n", + " yaml.dump(data, f, default_flow_style=False, indent=4)\n", + "\n", + "print(\"saved yaml to:\", os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,\"..\",f\"{astra.tag}.yaml\"))" + ] + }, + { + "cell_type": "markdown", + "id": "94ac0705", + 
"metadata": {}, + "source": [ + "##### Visualize the Infragraph" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9aa872cb", + "metadata": {}, + "outputs": [], + "source": [ + "# VISUALIZER_START\n", + "PORT = 8765\n", + "\n", + "subprocess.run(\n", + " f\"lsof -ti:{PORT} | xargs -r kill -9\",\n", + " shell=True\n", + ")\n", + "\n", + "infra_yaml_path = os.path.join(\n", + " FileFolderUtils.get_instance().OUTPUT_DIR,\n", + " \"../infrastructure\",\n", + " f\"{astra.tag}.yaml\"\n", + ")\n", + "\n", + "infra_dir = os.path.dirname(infra_yaml_path)\n", + "visual_output_dir = os.path.normpath(os.path.join(infra_dir, \"../visuals\"))\n", + "\n", + "subprocess.run(\n", + " [\"infragraph\", \"visualize\", \"--input\", infra_yaml_path, \"--output\", visual_output_dir],\n", + " check=True\n", + ")\n", + "\n", + "subprocess.Popen(\n", + " [\"python3\", \"-m\", \"http.server\", f\"{PORT}\"],\n", + " cwd=visual_output_dir\n", + ")\n", + "IFrame(f\"http://localhost:{PORT}/index.html\", width=\"100%\", height=700)\n", + "# VISUALIZER_END" + ] + }, + { + "cell_type": "markdown", + "id": "e146c4a2", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim cmd parameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c096641", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.cmd_parameters.comm_scale = 1\n", + "astra.configuration.common_config.cmd_parameters.injection_scale = 1\n", + "astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False" + ] + }, + { + "cell_type": "markdown", + "id": "080a8923", + "metadata": {}, + "source": [ + "#### Start the simulation by specifying the network backend" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fbcb30d1", + "metadata": {}, + "outputs": [], + "source": [ + "astra.run_simulation(NetworkBackend.ANALYTICAL_CONGESTION_AWARE)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + 
"language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/client-scripts/notebooks/infragraph/analytical_dgx_device.py b/client-scripts/notebooks/infragraph/analytical_dgx_device.py deleted file mode 100644 index d0a816c..0000000 --- a/client-scripts/notebooks/infragraph/analytical_dgx_device.py +++ /dev/null @@ -1,132 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.19.1 -# kernelspec: -# display_name: Python 3 -# language: python -# name: python3 -# --- - -# %% [markdown] -# ##### Import the required modules and configure the system path to locate them - -# %% -import sys -sys.path.append("../../utils") -import networkx -import astra_sim_sdk.astra_sim_sdk as astra_sim_kit -from astra_sim import AstraSim, Collective, NetworkBackend -from infragraph.infragraph_service import InfraGraphService -from infragraph.blueprints.devices.nvidia.dgx import NvidiaDGX, DgxProfile -from infragraph import Infrastructure - - - -# %% [markdown] -# ##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs - -# %% -astra = AstraSim(server_endpoint = "172.17.0.2:8989", tag = "analytical_dgx_device") - -# %% [markdown] -# ##### Get all available DGX variants - -# %% -from typing import get_args -print(get_args(DgxProfile)) - -# %% [markdown] -# ##### Create a Nvidia DGX device fabric using infragraph device blueprint - -# %% -server = NvidiaDGX() -infrastructure = Infrastructure() -infrastructure.devices.append(server) 
-infrastructure.instances.add(name=server.name, device=server.name, count=1) -astra.configuration.infragraph.infrastructure.deserialize(infrastructure.serialize()) -print(astra.configuration.infragraph.infrastructure) - -# %% [markdown] -# ##### Initialize the Infragraph service, display the fabric topology, and retrieve/set the total number of NPUs to generate the collective - -# %% -service = InfraGraphService() -service.set_graph(infrastructure) -total_npus = service.get_component(device=server, type="xpu").count -g = service.get_networkx_graph() -print(networkx.write_network_text(g, vertical_chains=True)) - -# %% [markdown] -# ##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration - -# %% -astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 1 *1024*1024, npu_range=[0, total_npus]) - -# %% [markdown] -# ##### Configure ASTRA-sim system config - -# %% -astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO -astra.configuration.common_config.system.endpoint_delay = 10 -astra.configuration.common_config.system.active_chunks_per_dimension = 1 -astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING] -astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT] -astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.ONERING] -astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE -astra.configuration.common_config.system.local_mem_bw = 1600 - -# %% [markdown] -# ##### Configure ASTRA-sim remote memory configuration - -# %% -astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION 
-print(astra.configuration.common_config.remote_memory) - -# %% [markdown] -# ##### Set ASTRA-sim network backend to ANALYTICAL_CONGESTION_AWARE - -# %% -astra.configuration.network_backend.analytical_congestion_aware.topology.choice = astra.configuration.network_backend.analytical_congestion_aware.topology.INFRAGRAPH - -# %% [markdown] -# ##### Adding ASTRA-sim - Infragraph specific annotation - -# %% -host_device_spec = astra_sim_kit.AnnotationDeviceSpecifications() -host_device_spec.device_bandwidth_gbps = 100 -host_device_spec.device_latency_ms = 0.05 -host_device_spec.device_name = server.name -host_device_spec.device_type = "host" -astra.configuration.infragraph.annotations.device_specifications.append(host_device_spec) - -# %% [markdown] -# ##### Configure ASTRA-sim cmd parameters - -# %% -astra.configuration.common_config.cmd_parameters.comm_scale = 1 -astra.configuration.common_config.cmd_parameters.injection_scale = 1 -astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False - -# %% [markdown] -# #### Start the simulation by specifying the network backend - -# %% -astra.run_simulation(NetworkBackend.ANALYTICAL_CONGESTION_AWARE) - -# %% [markdown] -# ##### Save infragraph as a yaml - -# %% -import yaml -import os -from common import FileFolderUtils -with open(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"../infrastructure","analytical_dgx_device.yaml"),"w") as f: - data = infrastructure.serialize("dict") - yaml.dump(data, f, default_flow_style=False, indent=4) - -print("saved yaml to:", os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"..","analytical_dgx_device.yaml")) diff --git a/client-scripts/notebooks/infragraph/analytical_ironwood_rack.ipynb b/client-scripts/notebooks/infragraph/analytical_ironwood_rack.ipynb new file mode 100644 index 0000000..6e082a4 --- /dev/null +++ b/client-scripts/notebooks/infragraph/analytical_ironwood_rack.ipynb @@ -0,0 +1,347 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": 
"bea06936", + "metadata": {}, + "source": [ + "##### Import the required modules and configure the system path to locate them" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6d3813d", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"../../utils\")\n", + "import networkx\n", + "import yaml\n", + "import os\n", + "import subprocess\n", + "import astra_sim_sdk.astra_sim_sdk as astra_sim_kit\n", + "from common import FileFolderUtils\n", + "from IPython.display import IFrame\n", + "from astra_sim import AstraSim, Collective, NetworkBackend\n", + "from infragraph.infragraph_service import InfraGraphService\n", + "from infragraph.blueprints.devices.ironwood_rack import IronwoodRack\n", + "from infragraph import Infrastructure" + ] + }, + { + "cell_type": "markdown", + "id": "cc09d155", + "metadata": {}, + "source": [ + "##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75fc79b2", + "metadata": {}, + "outputs": [], + "source": [ + "astra = AstraSim(server_endpoint = \"172.17.0.2:8989\", tag = \"analytical_ironwood_rack\")" + ] + }, + { + "cell_type": "markdown", + "id": "e1d35925", + "metadata": {}, + "source": [ + "##### Create a ironwood rack device fabric using infragraph device blueprint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e149fef8", + "metadata": {}, + "outputs": [], + "source": [ + "server = IronwoodRack()\n", + "infrastructure = Infrastructure()\n", + "infrastructure.devices.append(server)\n", + "\n", + "infrastructure.instances.add(name=server.name, device=server.name, count=1)\n", + "astra.configuration.infragraph.infrastructure.deserialize(infrastructure.serialize())\n", + "print(astra.configuration.infragraph.infrastructure)" + ] + }, + { + "cell_type": 
"markdown", + "id": "cba694b6", + "metadata": {}, + "source": [ + "##### Initialize the Infragraph service, display the fabric topology, and retrieve/set the total number of NPUs to generate the collective" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4fb89a22", + "metadata": {}, + "outputs": [], + "source": [ + "service = InfraGraphService()\n", + "service.set_graph(infrastructure)\n", + "total_npus = service.get_component(device=server, type=\"xpu\").count\n", + "g = service.get_networkx_graph()\n", + "print(networkx.write_network_text(g, vertical_chains=True))" + ] + }, + { + "cell_type": "markdown", + "id": "77f53072", + "metadata": {}, + "source": [ + "##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eeefed28", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 1 *1024*1024, npu_range=[0, total_npus])" + ] + }, + { + "cell_type": "markdown", + "id": "7d9049f7", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim system config" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c2f1788", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO\n", + "astra.configuration.common_config.system.endpoint_delay = 10\n", + "astra.configuration.common_config.system.active_chunks_per_dimension = 1\n", + "astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING, astra.configuration.common_config.system.RING, astra.configuration.common_config.system.RING]\n", + "astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.RING, 
astra.configuration.common_config.system.RING, astra.configuration.common_config.system.RING]\n", + "astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.RING, astra.configuration.common_config.system.RING, astra.configuration.common_config.system.RING]\n", + "astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE\n", + "astra.configuration.common_config.system.local_mem_bw = 1600" + ] + }, + { + "cell_type": "markdown", + "id": "49377270", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim remote memory configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "17aeee6e", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION\n", + "print(astra.configuration.common_config.remote_memory)" + ] + }, + { + "cell_type": "markdown", + "id": "90c09fed", + "metadata": {}, + "source": [ + "##### Configure network backend to ANALYTICAL_CONGESTION_UNAWARE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b66e035b", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.network_backend.analytical_congestion_unaware.topology.choice = astra.configuration.network_backend.analytical_congestion_unaware.topology.INFRAGRAPH" + ] + }, + { + "cell_type": "markdown", + "id": "178ac732", + "metadata": {}, + "source": [ + "##### Adding ASTRA-sim - Infragraph specific annotation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2409f0aa", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "host_device_spec = astra_sim_kit.AnnotationDeviceSpecifications()\n", + "host_device_spec.device_bandwidth_gbps = 100\n", + "host_device_spec.device_latency_ms = 0.05\n", + "host_device_spec.device_name = server.name\n", +
"host_device_spec.device_type = \"host\"\n", + "astra.configuration.infragraph.annotations.device_specifications.append(host_device_spec)" + ] + }, + { + "cell_type": "markdown", + "id": "88e593c6", + "metadata": {}, + "source": [ + "##### Save infragraph as a yaml" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d283686b", + "metadata": {}, + "outputs": [], + "source": [ + "with open(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,\"../infrastructure\",f\"{astra.tag}.yaml\"),\"w\") as f:\n", + " data = infrastructure.serialize(\"dict\")\n", + " yaml.dump(data, f, default_flow_style=False, indent=4)\n", + "\n", + "print(\"saved yaml to:\", os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,\"..\",f\"{astra.tag}.yaml\"))" + ] + }, + { + "cell_type": "markdown", + "id": "700ecdd2", + "metadata": {}, + "source": [ + "##### Visualize the Infragraph" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e49b253", + "metadata": {}, + "outputs": [], + "source": [ + "# VISUALIZER_START\n", + "PORT = 8765\n", + "\n", + "subprocess.run(\n", + " f\"lsof -ti:{PORT} | xargs -r kill -9\",\n", + " shell=True\n", + ")\n", + "\n", + "infra_yaml_path = os.path.join(\n", + " FileFolderUtils.get_instance().OUTPUT_DIR,\n", + " \"../infrastructure\",\n", + " f\"{astra.tag}.yaml\"\n", + ")\n", + "\n", + "infra_dir = os.path.dirname(infra_yaml_path)\n", + "visual_output_dir = os.path.normpath(os.path.join(infra_dir, \"../visuals\"))\n", + "\n", + "subprocess.run(\n", + " [\"infragraph\", \"visualize\", \"--input\", infra_yaml_path, \"--output\", visual_output_dir],\n", + " check=True\n", + ")\n", + "\n", + "subprocess.Popen(\n", + " [\"python3\", \"-m\", \"http.server\", f\"{PORT}\"],\n", + " cwd=visual_output_dir\n", + ")\n", + "IFrame(f\"http://localhost:{PORT}/index.html\", width=\"100%\", height=700)\n", + "# VISUALIZER_END" + ] + }, + { + "cell_type": "markdown", + "id": "0f9a2c48", + "metadata": {}, + "source": [ + "##### 
Configure ASTRA-sim cmd parameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1628a250", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.cmd_parameters.comm_scale = 1\n", + "astra.configuration.common_config.cmd_parameters.injection_scale = 1\n", + "astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False" + ] + }, + { + "cell_type": "markdown", + "id": "d99092a6", + "metadata": {}, + "source": [ + "#### Start the simulation by specifying the network backend" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46c2a21b", + "metadata": {}, + "outputs": [], + "source": [ + "astra.run_simulation(NetworkBackend.ANALYTICAL_CONGESTION_UNAWARE)" + ] + }, + { + "cell_type": "markdown", + "id": "45269b28", + "metadata": {}, + "source": [ + "##### Download all the configurations as a zip" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2adbbcda", + "metadata": {}, + "outputs": [], + "source": [ + "astra.download_configuration()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/client-scripts/notebooks/infragraph/analytical_ironwood_rack.py b/client-scripts/notebooks/infragraph/analytical_ironwood_rack.py deleted file mode 100644 index ce51e7a..0000000 --- a/client-scripts/notebooks/infragraph/analytical_ironwood_rack.py +++ /dev/null @@ -1,131 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.19.1 -# kernelspec: -# display_name: Python 3 -# language: python 
-# name: python3 -# --- - -# %% [markdown] -# ##### Import the required modules and configure the system path to locate them - -# %% -import sys -sys.path.append("../../utils") -import networkx -import astra_sim_sdk.astra_sim_sdk as astra_sim_kit -from astra_sim import AstraSim, Collective, NetworkBackend -from infragraph.infragraph_service import InfraGraphService -from infragraph.blueprints.devices.ironwood_rack import IronwoodRack -from infragraph import Infrastructure - -# %% [markdown] -# ##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs - -# %% -astra = AstraSim(server_endpoint = "172.17.0.2:8989", tag = "analytical_ironwood_rack") - -# %% [markdown] -# ##### Create a ironwood rack device fabric using infragraph device blueprint - -# %% -server = IronwoodRack() -infrastructure = Infrastructure() -infrastructure.devices.append(server) - -infrastructure.instances.add(name=server.name, device=server.name, count=1) -astra.configuration.infragraph.infrastructure.deserialize(infrastructure.serialize()) -print(astra.configuration.infragraph.infrastructure) - -# %% [markdown] -# ##### Initialize the Infragraph service, display the fabric topology, and retrieve/set the total number of NPUs to generate the collective - -# %% -service = InfraGraphService() -service.set_graph(infrastructure) -total_npus = service.get_component(device=server, type="xpu").count -g = service.get_networkx_graph() -print(networkx.write_network_text(g, vertical_chains=True)) - -# %% [markdown] -# ##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration - -# %% -astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 1 *1024*1024, npu_range=[0, total_npus]) - -# %% [markdown] -# ##### Configure ASTRA-sim system config - -# %% 
-astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO -astra.configuration.common_config.system.endpoint_delay = 10 -astra.configuration.common_config.system.active_chunks_per_dimension = 1 -astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING, astra.configuration.common_config.system.RING, astra.configuration.common_config.system.RING] -astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.RING, astra.configuration.common_config.system.RING, astra.configuration.common_config.system.RING] -astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.RING, astra.configuration.common_config.system.RING, astra.configuration.common_config.system.RING] -astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE -astra.configuration.common_config.system.local_mem_bw = 1600 - -# %% [markdown] -# ##### Configure ASTRA-sim remote memory configuration - -# %% -astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION -print(astra.configuration.common_config.remote_memory) - -# %% [markdown] -# ##### Configure network backend to ANALYTICAL_CONGESTION_AWARE - -# %% -astra.configuration.network_backend.analytical_congestion_unaware.topology.choice = astra.configuration.network_backend.analytical_congestion_unaware.topology.INFRAGRAPH - -# %% [markdown] -# ##### Adding ASTRA-sim - Infragraph specific annotation - -# %% -host_device_spec = astra_sim_kit.AnnotationDeviceSpecifications() -host_device_spec.device_bandwidth_gbps = 100 -host_device_spec.device_latency_ms = 0.05 -host_device_spec.device_name = server.name -host_device_spec.device_type = "host" 
-astra.configuration.infragraph.annotations.device_specifications.append(host_device_spec) - - -# %% [markdown] -# ##### Configure ASTRA-sim cmd parameters - -# %% -astra.configuration.common_config.cmd_parameters.comm_scale = 1 -astra.configuration.common_config.cmd_parameters.injection_scale = 1 -astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False - -# %% [markdown] -# #### Start the simulation by specifying the network backend - -# %% -astra.run_simulation(NetworkBackend.ANALYTICAL_CONGESTION_UNAWARE) - -# %% [markdown] -# ##### Download all the configurations as a zip - -# %% -astra.download_configuration() - -# %% [markdown] -# ##### Save infragraph as a yaml - -# %% -import yaml -import os -from common import FileFolderUtils -with open(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"../infrastructure","analytical_dgx_device.yaml"),"w") as f: - data = infrastructure.serialize("dict") - yaml.dump(data, f, default_flow_style=False, indent=4) - -print("saved yaml to:", os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"..","analytical_dgx_device.yaml")) diff --git a/client-scripts/notebooks/infragraph/htsim_clos_fabric_2tier.ipynb b/client-scripts/notebooks/infragraph/htsim_clos_fabric_2tier.ipynb new file mode 100644 index 0000000..cf4a3a0 --- /dev/null +++ b/client-scripts/notebooks/infragraph/htsim_clos_fabric_2tier.ipynb @@ -0,0 +1,375 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8de400f9", + "metadata": {}, + "source": [ + "##### Import the required modules and configure the system path to locate them" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cccb5e49", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"../../utils\")\n", + "import networkx\n", + "import yaml\n", + "import subprocess\n", + "import os\n", + "import astra_sim_sdk.astra_sim_sdk as astra_sim_kit\n", + "from common import FileFolderUtils\n", + "from IPython.display import 
IFrame\n", + "from astra_sim import AstraSim, Collective, NetworkBackend\n", + "from infragraph.infragraph_service import InfraGraphService\n", + "from infragraph.blueprints.fabrics.clos_fat_tree_fabric import ClosFatTreeFabric\n", + "from infragraph.blueprints.devices.generic.server import Server\n", + "from infragraph.blueprints.devices.generic.generic_switch import Switch" + ] + }, + { + "cell_type": "markdown", + "id": "64688a84", + "metadata": {}, + "source": [ + "##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d3a3b48", + "metadata": {}, + "outputs": [], + "source": [ + "astra = AstraSim(server_endpoint = \"172.17.0.2:8989\", tag = \"htsim_clos_fabric_2tier\")" + ] + }, + { + "cell_type": "markdown", + "id": "084c8cc6", + "metadata": {}, + "source": [ + "##### Create a two-tier clos fabric using infragraph fabric blueprint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a5b0ec8f", + "metadata": {}, + "outputs": [], + "source": [ + "server = Server()\n", + "switch = Switch(port_count=8)\n", + "infrastructure = ClosFatTreeFabric(switch, server, 2,[])\n", + "astra.configuration.infragraph.infrastructure.deserialize(infrastructure.serialize())\n", + "print(astra.configuration.infragraph.infrastructure)" + ] + }, + { + "cell_type": "markdown", + "id": "10a5c5f3", + "metadata": {}, + "source": [ + "##### Initialize the Infragraph service, display the fabric topology, and retrieve/set the total number of NPUs to generate the collective" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "648027ad", + "metadata": {}, + "outputs": [], + "source": [ + "service = InfraGraphService()\n", + "service.set_graph(infrastructure)\n", + "g = service.get_networkx_graph()\n", + "print(networkx.write_network_text(g, 
vertical_chains=True))\n", + "total_npus = 16" + ] + }, + { + "cell_type": "markdown", + "id": "dcaf99c4", + "metadata": {}, + "source": [ + "##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a9c74682", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 1 *1024*1024, npu_range=[0, total_npus])" + ] + }, + { + "cell_type": "markdown", + "id": "4e052d35", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim system config" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb7010b3", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO\n", + "astra.configuration.common_config.system.endpoint_delay = 10\n", + "astra.configuration.common_config.system.active_chunks_per_dimension = 1\n", + "astra.configuration.common_config.system.preferred_dataset_splits = 4\n", + "astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING]\n", + "astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT]\n", + "astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.RING]\n", + "astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE\n", + "astra.configuration.common_config.system.local_mem_bw = 1600\n", + "astra.configuration.common_config.system.peak_perf = 900\n", + "astra.configuration.common_config.system.roofline_enabled = 0\n", + "print(astra.configuration.common_config.system)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "30ec2cad", + "metadata": {}, + 
"source": [ + "##### Configure the remote memory configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b4100d4", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION\n", + "print(astra.configuration.common_config.remote_memory)" + ] + }, + { + "cell_type": "markdown", + "id": "96c7a0bc", + "metadata": {}, + "source": [ + "##### Configure the selected network backend and the topology (infragraph or network_topology_configuration)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d14185a", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "astra.configuration.network_backend.choice = astra.configuration.network_backend.HTSIM\n", + "astra.configuration.network_backend.htsim.topology.choice = astra.configuration.network_backend.htsim.topology.INFRAGRAPH" + ] + }, + { + "cell_type": "markdown", + "id": "5b7f489f", + "metadata": {}, + "source": [ + "##### Select htsim protocol" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5298b22f", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.network_backend.htsim.htsim_protocol.choice = astra.configuration.network_backend.htsim.htsim_protocol.TCP\n", + "print(\"Network backend set to\", astra.configuration.network_backend.choice)\n", + "print(\"network topology choice set to:\",astra.configuration.network_backend.htsim.topology.choice)\n", + "print(\"protocol set to\", astra.configuration.network_backend.htsim.htsim_protocol)\n", + "astra.configuration.network_backend.htsim.htsim_protocol.tcp.nodes = str(total_npus)" + ] + }, + { + "cell_type": "markdown", + "id": "8cdb444c", + "metadata": {}, + "source": [ + "##### Adding ASTRA-sim - Infragraph specific annotation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26e98e61", + "metadata": {}, + 
"outputs": [], + "source": [ + "host_device_spec = astra_sim_kit.AnnotationDeviceSpecifications()\n", + "host_device_spec.device_bandwidth_gbps = 1000\n", + "host_device_spec.device_latency_ms = 0.005\n", + "host_device_spec.device_name = \"server\"\n", + "host_device_spec.device_type = \"host\"\n", + "astra.configuration.infragraph.annotations.device_specifications.append(host_device_spec)\n", + "\n", + "switch_device_spec = astra_sim_kit.AnnotationDeviceSpecifications()\n", + "switch_device_spec.device_bandwidth_gbps = 1000\n", + "switch_device_spec.device_latency_ms = 0.005\n", + "switch_device_spec.device_name = \"switch\"\n", + "switch_device_spec.device_type = \"switch\"\n", + "astra.configuration.infragraph.annotations.device_specifications.append(\n", + " switch_device_spec\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "2c97616a", + "metadata": {}, + "source": [ + "##### Save infragraph as a yaml" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09917a91", + "metadata": {}, + "outputs": [], + "source": [ + "with open(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,\"../infrastructure\",f\"{astra.tag}.yaml\"),\"w\") as f:\n", + " data = infrastructure.serialize(\"dict\")\n", + " yaml.dump(data, f, default_flow_style=False, indent=4)\n", + "\n", + "print(\"saved yaml to:\", os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,\"..\",f\"{astra.tag}.yaml\"))" + ] + }, + { + "cell_type": "markdown", + "id": "69fddfae", + "metadata": {}, + "source": [ + "##### Visualize the Infragraph" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "304516db", + "metadata": {}, + "outputs": [], + "source": [ + "# VISUALIZER_START\n", + "PORT = 8765\n", + "\n", + "subprocess.run(\n", + " f\"lsof -ti:{PORT} | xargs -r kill -9\",\n", + " shell=True\n", + ")\n", + "\n", + "infra_yaml_path = os.path.join(\n", + " FileFolderUtils.get_instance().OUTPUT_DIR,\n", + " \"../infrastructure\",\n", + " 
f\"{astra.tag}.yaml\"\n", + ")\n", + "\n", + "infra_dir = os.path.dirname(infra_yaml_path)\n", + "visual_output_dir = os.path.normpath(os.path.join(infra_dir, \"../visuals\"))\n", + "\n", + "subprocess.run(\n", + " [\"infragraph\", \"visualize\", \"--input\", infra_yaml_path, \"--output\", visual_output_dir],\n", + " check=True\n", + ")\n", + "\n", + "subprocess.Popen(\n", + " [\"python3\", \"-m\", \"http.server\", f\"{PORT}\"],\n", + " cwd=visual_output_dir\n", + ")\n", + "IFrame(f\"http://localhost:{PORT}/index.html\", width=\"100%\", height=700)\n", + "# VISUALIZER_END" + ] + }, + { + "cell_type": "markdown", + "id": "dfe625b5", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim cmd parameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e99a61a", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.cmd_parameters.comm_scale = 1\n", + "astra.configuration.common_config.cmd_parameters.injection_scale = 1\n", + "astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False" + ] + }, + { + "cell_type": "markdown", + "id": "bf92ca9b", + "metadata": {}, + "source": [ + "#### Start the simulation by specifying the network backend" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e3b4263", + "metadata": {}, + "outputs": [], + "source": [ + "astra.run_simulation(NetworkBackend.HTSIM)" + ] + }, + { + "cell_type": "markdown", + "id": "fcb4b56e", + "metadata": {}, + "source": [ + "##### Download all the configurations as a zip" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f9f13ee", + "metadata": {}, + "outputs": [], + "source": [ + "astra.download_configuration()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git 
a/client-scripts/notebooks/infragraph/htsim_clos_fabric_2tier.py b/client-scripts/notebooks/infragraph/htsim_clos_fabric_2tier.py deleted file mode 100644 index 4f05442..0000000 --- a/client-scripts/notebooks/infragraph/htsim_clos_fabric_2tier.py +++ /dev/null @@ -1,156 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.19.1 -# kernelspec: -# display_name: Python 3 -# language: python -# name: python3 -# --- - -# %% [markdown] -# ##### Import the required modules and configure the system path to locate them - -# %% -import sys -sys.path.append("../../utils") -import networkx -import astra_sim_sdk.astra_sim_sdk as astra_sim_kit -from astra_sim import AstraSim, Collective, NetworkBackend -from infragraph.infragraph_service import InfraGraphService -from infragraph.blueprints.fabrics.clos_fat_tree_fabric import ClosFatTreeFabric -from infragraph.blueprints.devices.generic.server import Server -from infragraph.blueprints.devices.generic.generic_switch import Switch - -# %% [markdown] -# ##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs. 
- -# %% -astra = AstraSim(server_endpoint = "172.17.0.2:8989", tag = "htsim_clos_fabric_2tier") - -# %% [markdown] -# ##### Create a two-tier clos fabric using infragraph fabric blueprint - -# %% -server = Server() -switch = Switch(port_count=8) -infrastructure = ClosFatTreeFabric(switch, server, 2,[]) -astra.configuration.infragraph.infrastructure.deserialize(infrastructure.serialize()) -print(astra.configuration.infragraph.infrastructure) - -# %% [markdown] -# ##### Initialize the Infragraph service, display the fabric topology, and retrieve/set the total number of NPUs to generate the collective - -# %% -service = InfraGraphService() -service.set_graph(infrastructure) -g = service.get_networkx_graph() -print(networkx.write_network_text(g, vertical_chains=True)) -total_npus = 16 - -# %% [markdown] -# ##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration - -# %% -astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 1 *1024*1024, npu_range=[0, total_npus]) - -# %% [markdown] -# ##### Configure ASTRA-sim system config - -# %% -astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO -astra.configuration.common_config.system.endpoint_delay = 10 -astra.configuration.common_config.system.active_chunks_per_dimension = 1 -astra.configuration.common_config.system.preferred_dataset_splits = 4 -astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING] -astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT] -astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.RING] -astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE 
-astra.configuration.common_config.system.local_mem_bw = 1600 -astra.configuration.common_config.system.peak_perf = 900 -astra.configuration.common_config.system.roofline_enabled = 0 -print(astra.configuration.common_config.system) - - - -# %% [markdown] -# ##### Configure the remote memory configuration - -# %% -astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION -print(astra.configuration.common_config.remote_memory) - -# %% [markdown] -# ##### Configure the selected network backend and the topology (infragraph or network_topology_configuration) - -# %% -astra.configuration.network_backend.choice = astra.configuration.network_backend.HTSIM -astra.configuration.network_backend.htsim.topology.choice = astra.configuration.network_backend.htsim.topology.INFRAGRAPH - - -# %% [markdown] -# ##### Select htsim protocol - -# %% -astra.configuration.network_backend.htsim.htsim_protocol.choice = astra.configuration.network_backend.htsim.htsim_protocol.TCP -print("Network backend set to", astra.configuration.network_backend.choice) -print("network topology choice set to:",astra.configuration.network_backend.htsim.topology.choice) -print("protocol set to", astra.configuration.network_backend.htsim.htsim_protocol) -astra.configuration.network_backend.htsim.htsim_protocol.tcp.nodes = str(total_npus) - -# %% [markdown] -# ##### Adding ASTRA-sim - Infragraph specific annotation - -# %% -host_device_spec = astra_sim_kit.AnnotationDeviceSpecifications() -host_device_spec.device_bandwidth_gbps = 1000 -host_device_spec.device_latency_ms = 0.005 -host_device_spec.device_name = "server" -host_device_spec.device_type = "host" -astra.configuration.infragraph.annotations.device_specifications.append(host_device_spec) - -switch_device_spec = astra_sim_kit.AnnotationDeviceSpecifications() -switch_device_spec.device_bandwidth_gbps = 1000 -switch_device_spec.device_latency_ms = 0.005 -switch_device_spec.device_name 
= "switch" -switch_device_spec.device_type = "switch" -astra.configuration.infragraph.annotations.device_specifications.append( - switch_device_spec -) - -# %% [markdown] -# ##### Configure ASTRA-sim cmd parameters - -# %% -astra.configuration.common_config.cmd_parameters.comm_scale = 1 -astra.configuration.common_config.cmd_parameters.injection_scale = 1 -astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False - -# %% [markdown] -# #### Start the simulation by specifying the network backend - -# %% -astra.run_simulation(NetworkBackend.HTSIM) - -# %% [markdown] -# ##### Download all the configurations as a zip - -# %% -astra.download_configuration() - -# %% [markdown] -# ##### Save infragraph as a yaml - -# %% -import yaml -import os -from common import FileFolderUtils -with open(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"../infrastructure","htsim_clos_fabric_2tier.yaml"),"w") as f: - data = infrastructure.serialize("dict") - yaml.dump(data, f, default_flow_style=False, indent=4) - -print("saved yaml to:", os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"..","htsim_clos_fabric_2tier.yaml")) diff --git a/client-scripts/notebooks/infragraph/htsim_clos_fabric_3tier.ipynb b/client-scripts/notebooks/infragraph/htsim_clos_fabric_3tier.ipynb new file mode 100644 index 0000000..ca64d2b --- /dev/null +++ b/client-scripts/notebooks/infragraph/htsim_clos_fabric_3tier.ipynb @@ -0,0 +1,381 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "63705c57", + "metadata": {}, + "source": [ + "##### Import the required modules and configure the system path to locate them" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff8f875a", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"../../utils\")\n", + "import networkx\n", + "import yaml\n", + "import os\n", + "import subprocess\n", + "import astra_sim_sdk.astra_sim_sdk as astra_sim_kit\n", + "from common import 
FileFolderUtils\n", + "from IPython.display import IFrame\n", + "from astra_sim import AstraSim, Collective, NetworkBackend\n", + "from infragraph.infragraph_service import InfraGraphService\n", + "from infragraph.blueprints.fabrics.clos_fat_tree_fabric import ClosFatTreeFabric\n", + "from infragraph.blueprints.devices.generic.server import Server\n", + "from infragraph.blueprints.devices.generic.generic_switch import Switch" + ] + }, + { + "cell_type": "markdown", + "id": "add4ea8b", + "metadata": {}, + "source": [ + "##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bdd78f3c", + "metadata": {}, + "outputs": [], + "source": [ + "astra = AstraSim(server_endpoint = \"172.17.0.2:8989\", tag = \"htsim_clos_fabric_3tier\")" + ] + }, + { + "cell_type": "markdown", + "id": "07f4718b", + "metadata": {}, + "source": [ + "##### Create a three-tier clos fabric using infragraph fabric blueprint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f73252d5", + "metadata": {}, + "outputs": [], + "source": [ + "server = Server()\n", + "switch = Switch(port_count=8)\n", + "infrastructure = ClosFatTreeFabric(switch, server, 3,[])\n", + "astra.configuration.infragraph.infrastructure.deserialize(infrastructure.serialize())\n", + "print(astra.configuration.infragraph.infrastructure)" + ] + }, + { + "cell_type": "markdown", + "id": "4b090b75", + "metadata": {}, + "source": [ + "##### Initialize the Infragraph service, display the fabric topology, and retrieve/set the total number of NPUs to generate the collective" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d0db31e7", + "metadata": {}, + "outputs": [], + "source": [ + "service = InfraGraphService()\n", + "service.set_graph(infrastructure)\n", + "g = 
service.get_networkx_graph()\n", + "print(networkx.write_network_text(g, vertical_chains=True))\n", + "total_npus = 64" + ] + }, + { + "cell_type": "markdown", + "id": "7415f64c", + "metadata": {}, + "source": [ + "##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "510b7454", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 1 *1024*1024, npu_range=[0, total_npus])" + ] + }, + { + "cell_type": "markdown", + "id": "b3177578", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim system config" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d4d9c0f", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO\n", + "astra.configuration.common_config.system.endpoint_delay = 10\n", + "astra.configuration.common_config.system.active_chunks_per_dimension = 1\n", + "astra.configuration.common_config.system.preferred_dataset_splits = 4\n", + "astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING]\n", + "astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT]\n", + "astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.RING]\n", + "astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE\n", + "astra.configuration.common_config.system.local_mem_bw = 1600\n", + "astra.configuration.common_config.system.peak_perf = 900\n", + "astra.configuration.common_config.system.roofline_enabled = 0\n", + "print(astra.configuration.common_config.system)\n", + "\n" + ] + 
}, + { + "cell_type": "markdown", + "id": "cb6fb5b5", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim remote memory configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db186db1", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION\n", + "print(astra.configuration.common_config.remote_memory)" + ] + }, + { + "cell_type": "markdown", + "id": "dbefce6f", + "metadata": {}, + "source": [ + "##### Configure the selected network backend and the topology (infragraph or network_topology_configuration)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88c0c6eb", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.network_backend.choice = astra.configuration.network_backend.HTSIM\n", + "astra.configuration.network_backend.htsim.topology.choice = astra.configuration.network_backend.htsim.topology.INFRAGRAPH" + ] + }, + { + "cell_type": "markdown", + "id": "4949d933", + "metadata": {}, + "source": [ + "##### Select htsim protocol" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "647d1b0b", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.network_backend.htsim.htsim_protocol.choice = astra.configuration.network_backend.htsim.htsim_protocol.TCP\n", + "print(\"Network backend set to\", astra.configuration.network_backend.choice)\n", + "print(\"network topology choice set to:\",astra.configuration.network_backend.htsim.topology.choice)\n", + "print(\"protocol set to\", astra.configuration.network_backend.htsim.htsim_protocol)\n", + "astra.configuration.network_backend.htsim.htsim_protocol.tcp.nodes = str(total_npus)" + ] + }, + { + "cell_type": "markdown", + "id": "83470713", + "metadata": {}, + "source": [ + "##### Adding ASTRA-sim - Infragraph specific annotation" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "9f81ecac", + "metadata": {}, + "outputs": [], + "source": [ + "host_device_spec = astra_sim_kit.AnnotationDeviceSpecifications()\n", + "host_device_spec.device_bandwidth_gbps = 1000\n", + "host_device_spec.device_latency_ms = 0.005\n", + "host_device_spec.device_name = \"server\"\n", + "host_device_spec.device_type = \"host\"\n", + "astra.configuration.infragraph.annotations.device_specifications.append(host_device_spec)\n", + "\n", + "switch_device_spec = astra_sim_kit.AnnotationDeviceSpecifications()\n", + "switch_device_spec.device_bandwidth_gbps = 1000\n", + "switch_device_spec.device_latency_ms = 0.005\n", + "switch_device_spec.device_name = \"switch\"\n", + "switch_device_spec.device_type = \"switch\"\n", + "astra.configuration.infragraph.annotations.device_specifications.append(\n", + " switch_device_spec\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "8afd82de", + "metadata": {}, + "source": [ + "##### Save infragraph as a yaml" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "94461949", + "metadata": {}, + "outputs": [], + "source": [ + "with open(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,\"../infrastructure\",f\"{astra.tag}.yaml\"),\"w\") as f:\n", + " data = infrastructure.serialize(\"dict\")\n", + " yaml.dump(data, f, default_flow_style=False, indent=4)\n", + "\n", + "print(\"saved yaml to:\", os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,\"..\",f\"{astra.tag}.yaml\"))" + ] + }, + { + "cell_type": "markdown", + "id": "aa9940b4", + "metadata": {}, + "source": [ + "##### Visualize the Infragraph" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe828d35", + "metadata": {}, + "outputs": [], + "source": [ + "# VISUALIZER_START\n", + "PORT = 8765\n", + "\n", + "subprocess.run(\n", + " f\"lsof -ti:{PORT} | xargs -r kill -9\",\n", + " shell=True\n", + ")\n", + "\n", + "infra_yaml_path = os.path.join(\n", + " 
FileFolderUtils.get_instance().OUTPUT_DIR,\n", + " \"../infrastructure\",\n", + " f\"{astra.tag}.yaml\"\n", + ")\n", + "\n", + "infra_dir = os.path.dirname(infra_yaml_path)\n", + "visual_output_dir = os.path.normpath(os.path.join(infra_dir, \"../visuals\"))\n", + "\n", + "subprocess.run(\n", + " [\"infragraph\", \"visualize\", \"--input\", infra_yaml_path, \"--output\", visual_output_dir],\n", + " check=True\n", + ")\n", + "\n", + "subprocess.Popen(\n", + " [\"python3\", \"-m\", \"http.server\", f\"{PORT}\"],\n", + " cwd=visual_output_dir\n", + ")\n", + "IFrame(f\"http://localhost:{PORT}/index.html\", width=\"100%\", height=700)\n", + "# VISUALIZER_END" + ] + }, + { + "cell_type": "markdown", + "id": "f7a3ad17", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim cmd parameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "96c2d8df", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.cmd_parameters.comm_scale = 1\n", + "astra.configuration.common_config.cmd_parameters.injection_scale = 1\n", + "astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False" + ] + }, + { + "cell_type": "markdown", + "id": "63b6acae", + "metadata": {}, + "source": [ + "#### Start the simulation by specifying the network backend" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a0e647f", + "metadata": {}, + "outputs": [], + "source": [ + "astra.run_simulation(NetworkBackend.HTSIM)" + ] + }, + { + "cell_type": "markdown", + "id": "62b6b049", + "metadata": {}, + "source": [ + "##### Download all the configurations as a zip" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a4094f6c", + "metadata": {}, + "outputs": [], + "source": [ + "astra.download_configuration()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + 
"version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0rc1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/client-scripts/notebooks/infragraph/htsim_clos_fabric_3tier.py b/client-scripts/notebooks/infragraph/htsim_clos_fabric_3tier.py deleted file mode 100644 index 6832a61..0000000 --- a/client-scripts/notebooks/infragraph/htsim_clos_fabric_3tier.py +++ /dev/null @@ -1,155 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.19.1 -# kernelspec: -# display_name: Python 3 -# language: python -# name: python3 -# --- - -# %% [markdown] -# ##### Import the required modules and configure the system path to locate them - -# %% -import sys -sys.path.append("../../utils") -import networkx -import astra_sim_sdk.astra_sim_sdk as astra_sim_kit -from astra_sim import AstraSim, Collective, NetworkBackend -from infragraph.infragraph_service import InfraGraphService -from infragraph.blueprints.fabrics.clos_fat_tree_fabric import ClosFatTreeFabric -from infragraph.blueprints.devices.generic.server import Server -from infragraph.blueprints.devices.generic.generic_switch import Switch - -# %% [markdown] -# ##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs - -# %% -astra = AstraSim(server_endpoint = "172.17.0.2:8989", tag = "htsim_clos_fabric_3tier") - -# %% [markdown] -# ##### Create a three-tier clos fabric using infragraph fabric blueprint - -# %% -server = Server() -switch = Switch(port_count=8) -infrastructure = ClosFatTreeFabric(switch, server, 3,[]) -astra.configuration.infragraph.infrastructure.deserialize(infrastructure.serialize()) -print(astra.configuration.infragraph.infrastructure) - -# %% 
[markdown] -# ##### Initialize the Infragraph service, display the fabric topology, and retrieve/set the total number of NPUs to generate the collective - -# %% -service = InfraGraphService() -service.set_graph(infrastructure) -g = service.get_networkx_graph() -print(networkx.write_network_text(g, vertical_chains=True)) -total_npus = 64 - -# %% [markdown] -# ##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration - -# %% -astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 1 *1024*1024, npu_range=[0, total_npus]) - -# %% [markdown] -# ##### Configure ASTRA-sim system config - -# %% -astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO -astra.configuration.common_config.system.endpoint_delay = 10 -astra.configuration.common_config.system.active_chunks_per_dimension = 1 -astra.configuration.common_config.system.preferred_dataset_splits = 4 -astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING] -astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT] -astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.RING] -astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE -astra.configuration.common_config.system.local_mem_bw = 1600 -astra.configuration.common_config.system.peak_perf = 900 -astra.configuration.common_config.system.roofline_enabled = 0 -print(astra.configuration.common_config.system) - - - -# %% [markdown] -# ##### Configure ASTRA-sim remote memory configuration - -# %% -astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION 
-print(astra.configuration.common_config.remote_memory) - -# %% [markdown] -# ##### Configure the selected network backend and the topology (infragraph or network_topology_configuration) - -# %% -astra.configuration.network_backend.choice = astra.configuration.network_backend.HTSIM -astra.configuration.network_backend.htsim.topology.choice = astra.configuration.network_backend.htsim.topology.INFRAGRAPH - -# %% [markdown] -# ##### Select htsim protocol - -# %% -astra.configuration.network_backend.htsim.htsim_protocol.choice = astra.configuration.network_backend.htsim.htsim_protocol.TCP -print("Network backend set to", astra.configuration.network_backend.choice) -print("network topology choice set to:",astra.configuration.network_backend.htsim.topology.choice) -print("protocol set to", astra.configuration.network_backend.htsim.htsim_protocol) -astra.configuration.network_backend.htsim.htsim_protocol.tcp.nodes = str(total_npus) - -# %% [markdown] -# ##### Adding ASTRA-sim - Infragraph specific annotation - -# %% -host_device_spec = astra_sim_kit.AnnotationDeviceSpecifications() -host_device_spec.device_bandwidth_gbps = 1000 -host_device_spec.device_latency_ms = 0.005 -host_device_spec.device_name = "server" -host_device_spec.device_type = "host" -astra.configuration.infragraph.annotations.device_specifications.append(host_device_spec) - -switch_device_spec = astra_sim_kit.AnnotationDeviceSpecifications() -switch_device_spec.device_bandwidth_gbps = 1000 -switch_device_spec.device_latency_ms = 0.005 -switch_device_spec.device_name = "switch" -switch_device_spec.device_type = "switch" -astra.configuration.infragraph.annotations.device_specifications.append( - switch_device_spec -) - -# %% [markdown] -# ##### Configure ASTRA-sim cmd parameters - -# %% -astra.configuration.common_config.cmd_parameters.comm_scale = 1 -astra.configuration.common_config.cmd_parameters.injection_scale = 1 -astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False - -# %% 
[markdown] -# #### Start the simulation by specifying the network backend - -# %% -astra.run_simulation(NetworkBackend.HTSIM) - -# %% [markdown] -# ##### Download all the configurations as a zip - -# %% -astra.download_configuration() - -# %% [markdown] -# ##### Save infragraph as a yaml - -# %% -import yaml -import os -from common import FileFolderUtils -with open(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"../infrastructure","htsim_clos_fabric_3tier.yaml"),"w") as f: - data = infrastructure.serialize("dict") - yaml.dump(data, f, default_flow_style=False, indent=4) - -print("saved yaml to:", os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"..","htsim_clos_fabric_3tier.yaml")) diff --git a/client-scripts/notebooks/infragraph/ns3_clos_fabric_2tier.ipynb b/client-scripts/notebooks/infragraph/ns3_clos_fabric_2tier.ipynb new file mode 100644 index 0000000..769fa65 --- /dev/null +++ b/client-scripts/notebooks/infragraph/ns3_clos_fabric_2tier.ipynb @@ -0,0 +1,401 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "959a81cb", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"../../utils\")\n", + "import networkx\n", + "import yaml\n", + "import os\n", + "import subprocess\n", + "import pandas as pd\n", + "import astra_sim_sdk.astra_sim_sdk as astra_sim_kit\n", + "from astra_sim import AstraSim, Collective, NetworkBackend\n", + "from common import FileFolderUtils\n", + "from IPython.display import IFrame\n", + "from infragraph.infragraph_service import InfraGraphService\n", + "from infragraph.blueprints.fabrics.clos_fat_tree_fabric import ClosFatTreeFabric\n", + "from infragraph.blueprints.devices.generic.server import Server\n", + "from infragraph.blueprints.devices.generic.generic_switch import Switch" + ] + }, + { + "cell_type": "markdown", + "id": "9006ee5d", + "metadata": {}, + "source": [ + "##### Call the AstraSim client helper with the server endpoint and tag to connect to the 
ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68b9ec39", + "metadata": {}, + "outputs": [], + "source": [ + "astra = AstraSim(server_endpoint = \"172.17.0.2:8989\", tag = \"ns3_clos_fabric_2tier\")" + ] + }, + { + "cell_type": "markdown", + "id": "4122cb3d", + "metadata": {}, + "source": [ + "##### Create a two-tier clos fabric using infragraph fabric blueprint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fcc45b74", + "metadata": {}, + "outputs": [], + "source": [ + "server = Server()\n", + "switch = Switch(port_count=8)\n", + "infrastructure = ClosFatTreeFabric(switch, server, 2,[])\n", + "astra.configuration.infragraph.infrastructure.deserialize(infrastructure.serialize())\n", + "print(astra.configuration.infragraph.infrastructure)" + ] + }, + { + "cell_type": "markdown", + "id": "40964f88", + "metadata": {}, + "source": [ + "##### Initialize the Infragraph service, display the fabric topology, and retrieve/set the total number of NPUs to generate the collective" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15473e5b", + "metadata": {}, + "outputs": [], + "source": [ + "service = InfraGraphService()\n", + "service.set_graph(infrastructure)\n", + "g = service.get_networkx_graph()\n", + "print(networkx.write_network_text(g, vertical_chains=True))\n", + "total_npus = 32" + ] + }, + { + "cell_type": "markdown", + "id": "e147483d", + "metadata": {}, + "source": [ + "##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6927185", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 8 *1024*1024, npu_range=[0, total_npus])" + ] + }, + { + 
"cell_type": "markdown", + "id": "745eafea", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim system config" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5889f7c", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO\n", + "astra.configuration.common_config.system.endpoint_delay = 10\n", + "astra.configuration.common_config.system.active_chunks_per_dimension = 1\n", + "astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING]\n", + "astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT]\n", + "astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.ONERING]\n", + "astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE\n", + "astra.configuration.common_config.system.local_mem_bw = 1600" + ] + }, + { + "cell_type": "markdown", + "id": "dba2ad39", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim remote memory configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a821017", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION\n", + "print(astra.configuration.common_config.remote_memory)" + ] + }, + { + "cell_type": "markdown", + "id": "9467f70e", + "metadata": {}, + "source": [ + "##### Configure the selected network backend and the topology (infragraph or nc_topology)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d7a2e89", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "astra.configuration.network_backend.choice = 
astra.configuration.network_backend.NS3\n", + "astra.configuration.network_backend.ns3.topology.choice = astra.configuration.network_backend.ns3.topology.INFRAGRAPH\n", + "astra.configuration.network_backend.ns3.network.packet_payload_size = int(8192)" + ] + }, + { + "cell_type": "markdown", + "id": "42b279ab", + "metadata": {}, + "source": [ + "##### Adding ns3 trace and logical dimension " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b72624a", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.network_backend.ns3.logical_topology.logical_dimensions = [total_npus]\n", + "astra.configuration.network_backend.ns3.trace.trace_ids = []\n", + "for i in range(0, total_npus):\n", + " astra.configuration.network_backend.ns3.trace.trace_ids.append(i)" + ] + }, + { + "cell_type": "markdown", + "id": "80dd7153", + "metadata": {}, + "source": [ + "##### Adding ASTRA-sim - Infragraph specific annotation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "133aa87c", + "metadata": {}, + "outputs": [], + "source": [ + "host_device_spec = astra_sim_kit.AnnotationDeviceSpecifications()\n", + "host_device_spec.device_bandwidth_gbps = 100\n", + "host_device_spec.device_latency_ms = 0.05\n", + "host_device_spec.device_name = \"server\"\n", + "host_device_spec.device_type = \"host\"\n", + "astra.configuration.infragraph.annotations.device_specifications.append(host_device_spec)\n", + "\n", + "switch_device_spec = astra_sim_kit.AnnotationDeviceSpecifications()\n", + "switch_device_spec.device_bandwidth_gbps = 100\n", + "switch_device_spec.device_latency_ms = 0.05\n", + "switch_device_spec.device_name = \"switch\"\n", + "switch_device_spec.device_type = \"switch\"\n", + "astra.configuration.infragraph.annotations.device_specifications.append(\n", + " switch_device_spec\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "4335d7ce", + "metadata": {}, + "source": [ + "##### Save infragraph as a yaml" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "3a767455", + "metadata": {}, + "outputs": [], + "source": [ + "with open(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,\"../infrastructure\",f\"{astra.tag}.yaml\"),\"w\") as f:\n", + " data = infrastructure.serialize(\"dict\")\n", + " yaml.dump(data, f, default_flow_style=False, indent=4)\n", + "\n", + "print(\"saved yaml to:\", os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,\"..\",f\"{astra.tag}.yaml\"))" + ] + }, + { + "cell_type": "markdown", + "id": "4ccf0ef4", + "metadata": {}, + "source": [ + "##### Visualize the Infragraph" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46c8c907", + "metadata": {}, + "outputs": [], + "source": [ + "# VISUALIZER_START\n", + "PORT = 8765\n", + "\n", + "subprocess.run(\n", + " f\"lsof -ti:{PORT} | xargs -r kill -9\",\n", + " shell=True\n", + ")\n", + "\n", + "infra_yaml_path = os.path.join(\n", + " FileFolderUtils.get_instance().OUTPUT_DIR,\n", + " \"../infrastructure\",\n", + " f\"{astra.tag}.yaml\"\n", + ")\n", + "\n", + "infra_dir = os.path.dirname(infra_yaml_path)\n", + "visual_output_dir = os.path.normpath(os.path.join(infra_dir, \"../visuals\"))\n", + "\n", + "subprocess.run(\n", + " [\"infragraph\", \"visualize\", \"--input\", infra_yaml_path, \"--output\", visual_output_dir],\n", + " check=True\n", + ")\n", + "\n", + "subprocess.Popen(\n", + " [\"python3\", \"-m\", \"http.server\", f\"{PORT}\"],\n", + " cwd=visual_output_dir\n", + ")\n", + "IFrame(f\"http://localhost:{PORT}/index.html\", width=\"100%\", height=700)\n", + "# VISUALIZER_END" + ] + }, + { + "cell_type": "markdown", + "id": "526c9236", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim cmd parameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eca0f534", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.cmd_parameters.comm_scale = 1\n", + 
"astra.configuration.common_config.cmd_parameters.injection_scale = 1\n", + "astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False" + ] + }, + { + "cell_type": "markdown", + "id": "3532aab7", + "metadata": {}, + "source": [ + "#### Start the simulation by specifying the network backend" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "84ba0157", + "metadata": {}, + "outputs": [], + "source": [ + "astra.run_simulation(NetworkBackend.NS3)" + ] + }, + { + "cell_type": "markdown", + "id": "210a5874", + "metadata": {}, + "source": [ + "##### Download all the configurations as a zip" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b577e24", + "metadata": {}, + "outputs": [], + "source": [ + "astra.download_configuration()" + ] + }, + { + "cell_type": "markdown", + "id": "2cc12528", + "metadata": {}, + "source": [ + "##### Read output files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d82d053a", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR, \"fct.csv\"))\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5132f8c2", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR, \"flow_stats.csv\"))\n", + "df.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/client-scripts/notebooks/infragraph/ns3_clos_fabric_2tier.py b/client-scripts/notebooks/infragraph/ns3_clos_fabric_2tier.py deleted file mode 100644 
index dec494a..0000000 --- a/client-scripts/notebooks/infragraph/ns3_clos_fabric_2tier.py +++ /dev/null @@ -1,164 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.19.1 -# kernelspec: -# display_name: Python 3 -# language: python -# name: python3 -# --- - -# %% [markdown] -# ##### Import the required modules and configure the system path to locate them - -# %% -import sys -sys.path.append("../../utils") -import networkx -import astra_sim_sdk.astra_sim_sdk as astra_sim_kit -from astra_sim import AstraSim, Collective, NetworkBackend -from infragraph.infragraph_service import InfraGraphService -from infragraph.blueprints.fabrics.clos_fat_tree_fabric import ClosFatTreeFabric -from infragraph.blueprints.devices.generic.server import Server -from infragraph.blueprints.devices.generic.generic_switch import Switch - -# %% [markdown] -# ##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs - -# %% -astra = AstraSim(server_endpoint = "172.17.0.2:8989", tag = "ns3_clos_fabric_2tier") - -# %% [markdown] -# ##### Create a two-tier clos fabric using infragraph fabric blueprint - -# %% -server = Server() -switch = Switch(port_count=8) -infrastructure = ClosFatTreeFabric(switch, server, 2,[]) -astra.configuration.infragraph.infrastructure.deserialize(infrastructure.serialize()) -print(astra.configuration.infragraph.infrastructure) - -# %% [markdown] -# ##### Initialize the Infragraph service, display the fabric topology, and retrieve/set the total number of NPUs to generate the collective - -# %% -service = InfraGraphService() -service.set_graph(infrastructure) -g = service.get_networkx_graph() -print(networkx.write_network_text(g, vertical_chains=True)) -total_npus = 32 - -# %% [markdown] -# ##### Generate workload execution traces for each 
rank and set the required data size for AstraSim configuration - -# %% -astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 8 *1024*1024, npu_range=[0, total_npus]) - -# %% [markdown] -# ##### Configure ASTRA-sim system config - -# %% -astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO -astra.configuration.common_config.system.endpoint_delay = 10 -astra.configuration.common_config.system.active_chunks_per_dimension = 1 -astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING] -astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT] -astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.ONERING] -astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE -astra.configuration.common_config.system.local_mem_bw = 1600 - -# %% [markdown] -# ##### Configure ASTRA-sim remote memory configuration - -# %% -astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION -print(astra.configuration.common_config.remote_memory) - -# %% [markdown] -# ##### Configure the selected network backend and the topology (infragraph or nc_topology) - -# %% -astra.configuration.network_backend.choice = astra.configuration.network_backend.NS3 -astra.configuration.network_backend.ns3.topology.choice = astra.configuration.network_backend.ns3.topology.INFRAGRAPH -astra.configuration.network_backend.ns3.network.packet_payload_size = int(8192) - - -# %% [markdown] -# ##### Adding ns3 trace and logical dimension - -# %% -astra.configuration.network_backend.ns3.logical_topology.logical_dimensions = [total_npus] -astra.configuration.network_backend.ns3.trace.trace_ids = [] -for i 
in range(0, total_npus): - astra.configuration.network_backend.ns3.trace.trace_ids.append(i) - -# %% [markdown] -# ##### Adding ASTRA-sim - Infragraph specific annotation - -# %% -host_device_spec = astra_sim_kit.AnnotationDeviceSpecifications() -host_device_spec.device_bandwidth_gbps = 100 -host_device_spec.device_latency_ms = 0.05 -host_device_spec.device_name = "server" -host_device_spec.device_type = "host" -astra.configuration.infragraph.annotations.device_specifications.append(host_device_spec) - -switch_device_spec = astra_sim_kit.AnnotationDeviceSpecifications() -switch_device_spec.device_bandwidth_gbps = 100 -switch_device_spec.device_latency_ms = 0.05 -switch_device_spec.device_name = "switch" -switch_device_spec.device_type = "switch" -astra.configuration.infragraph.annotations.device_specifications.append( - switch_device_spec -) - -# %% [markdown] -# ##### Configure ASTRA-sim cmd parameters - -# %% -astra.configuration.common_config.cmd_parameters.comm_scale = 1 -astra.configuration.common_config.cmd_parameters.injection_scale = 1 -astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False - -# %% [markdown] -# #### Start the simulation by specifying the network backend - -# %% -astra.run_simulation(NetworkBackend.NS3) - -# %% [markdown] -# ##### Download all the configurations as a zip - -# %% -astra.download_configuration() - -# %% [markdown] -# ##### Read output files - -# %% -import pandas as pd -import os -from common import FileFolderUtils -df = pd.read_csv(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR, "fct.csv")) -df.head() - -# %% -df = pd.read_csv(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR, "flow_stats.csv")) -df.head() - -# %% [markdown] -# ##### Save infragraph as a yaml - -# %% -import yaml -import os -from common import FileFolderUtils -with open(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"../infrastructure","ns3_clos_fabric_2tier.yaml"),"w") as f: - data = 
infrastructure.serialize("dict") - yaml.dump(data, f, default_flow_style=False, indent=4) - -print("saved yaml to:", os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"..","ns3_clos_fabric_2tier.yaml")) diff --git a/client-scripts/notebooks/infragraph/ns3_clos_fabric_3tier.ipynb b/client-scripts/notebooks/infragraph/ns3_clos_fabric_3tier.ipynb new file mode 100644 index 0000000..e451166 --- /dev/null +++ b/client-scripts/notebooks/infragraph/ns3_clos_fabric_3tier.ipynb @@ -0,0 +1,398 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8bf1e79b", + "metadata": {}, + "source": [ + "##### Import the required modules and configure the system path to locate them" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19cb437c", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"../../utils\")\n", + "import networkx\n", + "import yaml\n", + "import subprocess\n", + "import os\n", + "import pandas as pd\n", + "import astra_sim_sdk.astra_sim_sdk as astra_sim_kit\n", + "from common import FileFolderUtils\n", + "from IPython.display import IFrame\n", + "from astra_sim import AstraSim, Collective, NetworkBackend\n", + "from infragraph.infragraph_service import InfraGraphService\n", + "from infragraph.blueprints.fabrics.clos_fat_tree_fabric import ClosFatTreeFabric\n", + "from infragraph.blueprints.devices.generic.server import Server\n", + "from infragraph.blueprints.devices.generic.generic_switch import Switch" + ] + }, + { + "cell_type": "markdown", + "id": "e5d24eab", + "metadata": {}, + "source": [ + "##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7854a50", + "metadata": {}, + "outputs": [], + "source": [ + "astra = AstraSim(server_endpoint = \"172.17.0.2:8989\", tag = 
\"ns3_clos_fabric_3tier\")" + ] + }, + { + "cell_type": "markdown", + "id": "e344f8f2", + "metadata": {}, + "source": [ + "##### Create a three-tier clos fabric using infragraph fabric blueprint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "963f0ddc", + "metadata": {}, + "outputs": [], + "source": [ + "server = Server()\n", + "switch = Switch(port_count=4)\n", + "infrastructure = ClosFatTreeFabric(switch, server, 3,[])\n", + "astra.configuration.infragraph.infrastructure.deserialize(infrastructure.serialize())\n", + "print(astra.configuration.infragraph.infrastructure)" + ] + }, + { + "cell_type": "markdown", + "id": "46c37a6d", + "metadata": {}, + "source": [ + "##### Initialize the Infragraph service, display the fabric topology, and retrieve/set the total number of NPUs to generate the collective" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d212803", + "metadata": {}, + "outputs": [], + "source": [ + "service = InfraGraphService()\n", + "service.set_graph(infrastructure)\n", + "g = service.get_networkx_graph()\n", + "print(networkx.write_network_text(g, vertical_chains=True))\n", + "total_npus = 16" + ] + }, + { + "cell_type": "markdown", + "id": "0e2bc6cc", + "metadata": {}, + "source": [ + "##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2629ebf2", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 1 *1024*1024, npu_range=[0, total_npus])" + ] + }, + { + "cell_type": "markdown", + "id": "319ad188", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim system config" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79f4934e", + "metadata": {}, + "outputs": [], + "source": [ + 
"astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO\n", + "astra.configuration.common_config.system.endpoint_delay = 10\n", + "astra.configuration.common_config.system.active_chunks_per_dimension = 1\n", + "astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING]\n", + "astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT]\n", + "astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.ONERING]\n", + "astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE\n", + "astra.configuration.common_config.system.local_mem_bw = 1600" + ] + }, + { + "cell_type": "markdown", + "id": "b5841017", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim remote memory configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59c66bcb", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION\n", + "print(astra.configuration.common_config.remote_memory)" + ] + }, + { + "cell_type": "markdown", + "id": "a4528f6f", + "metadata": {}, + "source": [ + "##### Configure the selected network backend and the topology (infragraph or nc_topology)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "928b1ca5", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "astra.configuration.network_backend.choice = astra.configuration.network_backend.NS3\n", + "astra.configuration.network_backend.ns3.topology.choice = astra.configuration.network_backend.ns3.topology.INFRAGRAPH\n", + "astra.configuration.network_backend.ns3.network.packet_payload_size = int(8192)" + ] + }, + { + "cell_type": "markdown", + 
"id": "dd4d69e3", + "metadata": {}, + "source": [ + "##### Adding ns3 trace and logical dimension " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d3ebad7", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.network_backend.ns3.logical_topology.logical_dimensions = [total_npus]\n", + "astra.configuration.network_backend.ns3.trace.trace_ids = []\n", + "for i in range(0, total_npus):\n", + " astra.configuration.network_backend.ns3.trace.trace_ids.append(i)" + ] + }, + { + "cell_type": "markdown", + "id": "1bde2431", + "metadata": {}, + "source": [ + "##### Adding ASTRA-sim - Infragraph specific annotation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "66b36271", + "metadata": {}, + "outputs": [], + "source": [ + "host_device_spec = astra_sim_kit.AnnotationDeviceSpecifications()\n", + "host_device_spec.device_bandwidth_gbps = 100\n", + "host_device_spec.device_latency_ms = 0.05\n", + "host_device_spec.device_name = \"server\"\n", + "host_device_spec.device_type = \"host\"\n", + "astra.configuration.infragraph.annotations.device_specifications.append(host_device_spec)\n", + "\n", + "switch_device_spec = astra_sim_kit.AnnotationDeviceSpecifications()\n", + "switch_device_spec.device_bandwidth_gbps = 100\n", + "switch_device_spec.device_latency_ms = 0.05\n", + "switch_device_spec.device_name = \"switch\"\n", + "switch_device_spec.device_type = \"switch\"\n", + "astra.configuration.infragraph.annotations.device_specifications.append(\n", + " switch_device_spec\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "c801cb45", + "metadata": {}, + "source": [ + "##### Save infragraph as a yaml" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dcfe7486", + "metadata": {}, + "outputs": [], + "source": [ + "with open(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,\"../infrastructure\",f\"{astra.tag}.yaml\"),\"w\") as f:\n", + " data = 
infrastructure.serialize(\"dict\")\n", + " yaml.dump(data, f, default_flow_style=False, indent=4)\n", + "\n", + "print(\"saved yaml to:\", os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,\"..\",f\"{astra.tag}.yaml\"))" + ] + }, + { + "cell_type": "markdown", + "id": "d963cc69", + "metadata": {}, + "source": [ + "##### Visualize the Infragraph" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de2caa59", + "metadata": {}, + "outputs": [], + "source": [ + "# VISUALIZER_START\n", + "PORT = 8765\n", + "\n", + "subprocess.run(\n", + " f\"lsof -ti:{PORT} | xargs -r kill -9\",\n", + " shell=True\n", + ")\n", + "\n", + "infra_yaml_path = os.path.join(\n", + " FileFolderUtils.get_instance().OUTPUT_DIR,\n", + " \"../infrastructure\",\n", + " f\"{astra.tag}.yaml\"\n", + ")\n", + "\n", + "infra_dir = os.path.dirname(infra_yaml_path)\n", + "visual_output_dir = os.path.normpath(os.path.join(infra_dir, \"../visuals\"))\n", + "\n", + "subprocess.run(\n", + " [\"infragraph\", \"visualize\", \"--input\", infra_yaml_path, \"--output\", visual_output_dir],\n", + " check=True\n", + ")\n", + "\n", + "subprocess.Popen(\n", + " [\"python3\", \"-m\", \"http.server\", f\"{PORT}\"],\n", + " cwd=visual_output_dir\n", + ")\n", + "IFrame(f\"http://localhost:{PORT}/index.html\", width=\"100%\", height=700)\n", + "# VISUALIZER_END" + ] + }, + { + "cell_type": "markdown", + "id": "c08df70c", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim cmd parameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f903e215", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.cmd_parameters.comm_scale = 1\n", + "astra.configuration.common_config.cmd_parameters.injection_scale = 1\n", + "astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False" + ] + }, + { + "cell_type": "markdown", + "id": "b9a8de4f", + "metadata": {}, + "source": [ + "#### Start the simulation by specifying the network 
backend" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72675b9d", + "metadata": {}, + "outputs": [], + "source": [ + "astra.run_simulation(NetworkBackend.NS3)" + ] + }, + { + "cell_type": "markdown", + "id": "96bcc2bc", + "metadata": {}, + "source": [ + "##### Download all the configurations as a zip" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6548eff5", + "metadata": {}, + "outputs": [], + "source": [ + "astra.download_configuration()" + ] + }, + { + "cell_type": "markdown", + "id": "bf6b5659", + "metadata": {}, + "source": [ + "##### Read output files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b418f9c", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR, \"fct.csv\"))\n", + "df.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0rc1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/client-scripts/notebooks/infragraph/ns3_clos_fabric_3tier.py b/client-scripts/notebooks/infragraph/ns3_clos_fabric_3tier.py deleted file mode 100644 index 2a63ece..0000000 --- a/client-scripts/notebooks/infragraph/ns3_clos_fabric_3tier.py +++ /dev/null @@ -1,160 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.19.1 -# kernelspec: -# display_name: Python 3 -# language: python -# name: python3 -# --- - -# %% [markdown] -# ##### Import the required modules and configure the system path to locate them - -# %% -import sys -sys.path.append("../../utils") -import networkx -import 
astra_sim_sdk.astra_sim_sdk as astra_sim_kit -from astra_sim import AstraSim, Collective, NetworkBackend -from infragraph.infragraph_service import InfraGraphService -from infragraph.blueprints.fabrics.clos_fat_tree_fabric import ClosFatTreeFabric -from infragraph.blueprints.devices.generic.server import Server -from infragraph.blueprints.devices.generic.generic_switch import Switch - -# %% [markdown] -# ##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs - -# %% -astra = AstraSim(server_endpoint = "172.17.0.2:8989", tag = "ns3_clos_fabric_3tier") - -# %% [markdown] -# ##### Create a three-tier clos fabric using infragraph fabric blueprint - -# %% -server = Server() -switch = Switch(port_count=4) -infrastructure = ClosFatTreeFabric(switch, server, 3,[]) -astra.configuration.infragraph.infrastructure.deserialize(infrastructure.serialize()) -print(astra.configuration.infragraph.infrastructure) - -# %% [markdown] -# ##### Initialize the Infragraph service, display the fabric topology, and retrieve/set the total number of NPUs to generate the collective - -# %% -service = InfraGraphService() -service.set_graph(infrastructure) -g = service.get_networkx_graph() -print(networkx.write_network_text(g, vertical_chains=True)) -total_npus = 16 - -# %% [markdown] -# ##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration - -# %% -astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 1 *1024*1024, npu_range=[0, total_npus]) - -# %% [markdown] -# ##### Configure ASTRA-sim system config - -# %% -astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO -astra.configuration.common_config.system.endpoint_delay = 10 -astra.configuration.common_config.system.active_chunks_per_dimension 
= 1 -astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING] -astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT] -astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.ONERING] -astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE -astra.configuration.common_config.system.local_mem_bw = 1600 - -# %% [markdown] -# ##### Configure ASTRA-sim remote memory configuration - -# %% -astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION -print(astra.configuration.common_config.remote_memory) - -# %% [markdown] -# ##### Configure the selected network backend and the topology (infragraph or nc_topology) - -# %% -astra.configuration.network_backend.choice = astra.configuration.network_backend.NS3 -astra.configuration.network_backend.ns3.topology.choice = astra.configuration.network_backend.ns3.topology.INFRAGRAPH -astra.configuration.network_backend.ns3.network.packet_payload_size = int(8192) - - -# %% [markdown] -# ##### Adding ns3 trace and logical dimension - -# %% -astra.configuration.network_backend.ns3.logical_topology.logical_dimensions = [total_npus] -astra.configuration.network_backend.ns3.trace.trace_ids = [] -for i in range(0, total_npus): - astra.configuration.network_backend.ns3.trace.trace_ids.append(i) - -# %% [markdown] -# ##### Adding ASTRA-sim - Infragraph specific annotation - -# %% -host_device_spec = astra_sim_kit.AnnotationDeviceSpecifications() -host_device_spec.device_bandwidth_gbps = 100 -host_device_spec.device_latency_ms = 0.05 -host_device_spec.device_name = "server" -host_device_spec.device_type = "host" -astra.configuration.infragraph.annotations.device_specifications.append(host_device_spec) - -switch_device_spec = 
astra_sim_kit.AnnotationDeviceSpecifications() -switch_device_spec.device_bandwidth_gbps = 100 -switch_device_spec.device_latency_ms = 0.05 -switch_device_spec.device_name = "switch" -switch_device_spec.device_type = "switch" -astra.configuration.infragraph.annotations.device_specifications.append( - switch_device_spec -) - -# %% [markdown] -# ##### Configure ASTRA-sim cmd parameters - -# %% -astra.configuration.common_config.cmd_parameters.comm_scale = 1 -astra.configuration.common_config.cmd_parameters.injection_scale = 1 -astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False - -# %% [markdown] -# #### Start the simulation by specifying the network backend - -# %% -astra.run_simulation(NetworkBackend.NS3) - -# %% [markdown] -# ##### Download all the configurations as a zip - -# %% -astra.download_configuration() - -# %% [markdown] -# ##### Read output files - -# %% -import pandas as pd -import os -from common import FileFolderUtils -df = pd.read_csv(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR, "fct.csv")) -df.head() - -# %% [markdown] -# ##### Save infragraph as a yaml - -# %% -import yaml -import os -from common import FileFolderUtils -with open(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"../infrastructure","ns3_clos_fabric_3tier"),"w") as f: - data = infrastructure.serialize("dict") - yaml.dump(data, f, default_flow_style=False, indent=4) - -print("saved yaml to:", os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"..","ns3_clos_fabric_3tier.yaml")) diff --git a/client-scripts/notebooks/infragraph/ns3_single_dgx_device.ipynb b/client-scripts/notebooks/infragraph/ns3_single_dgx_device.ipynb new file mode 100644 index 0000000..9c19bca --- /dev/null +++ b/client-scripts/notebooks/infragraph/ns3_single_dgx_device.ipynb @@ -0,0 +1,408 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "af1898cf", + "metadata": {}, + "source": [ + "##### Import the required modules and configure the system path to locate them" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6c70c622", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"../../utils\")\n", + "import networkx\n", + "import yaml\n", + "import subprocess\n", + "import os\n", + "import pandas as pd\n", + "import astra_sim_sdk.astra_sim_sdk as astra_sim_kit\n", + "from common import FileFolderUtils\n", + "from IPython.display import IFrame\n", + "from astra_sim import AstraSim, Collective, NetworkBackend\n", + "from infragraph.infragraph_service import InfraGraphService\n", + "from infragraph.blueprints.devices.nvidia.dgx import NvidiaDGX, DgxProfile\n", + "from infragraph import Infrastructure" + ] + }, + { + "cell_type": "markdown", + "id": "f5e5fc47", + "metadata": {}, + "source": [ + "##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "80ea543f", + "metadata": {}, + "outputs": [], + "source": [ + "astra = AstraSim(server_endpoint = \"172.17.0.2:8989\", tag = \"ns3_single_dgx\")" + ] + }, + { + "cell_type": "markdown", + "id": "2dd5d3db", + "metadata": {}, + "source": [ + "##### Get all available DGX variants" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "437af895", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import get_args\n", + "print(get_args(DgxProfile))" + ] + }, + { + "cell_type": "markdown", + "id": "2eef31b1", + "metadata": {}, + "source": [ + "##### Create an Nvidia DGX device fabric using infragraph device blueprint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6cda575", + "metadata": {}, + "outputs": [], + "source": [ + "server = NvidiaDGX(\"dgx_h100\")\n", + "infrastructure = Infrastructure()\n", + "infrastructure.devices.append(server)\n", +
"infrastructure.instances.add(name=server.name, device=server.name, count=1)\n", + "astra.configuration.infragraph.infrastructure.deserialize(infrastructure.serialize())\n", + "print(astra.configuration.infragraph.infrastructure)" + ] + }, + { + "cell_type": "markdown", + "id": "0a79a343", + "metadata": {}, + "source": [ + "##### Initialize the Infragraph service, display the fabric topology, and retrieve/set the total number of NPUs to generate the collective" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39479cbb", + "metadata": {}, + "outputs": [], + "source": [ + "service = InfraGraphService()\n", + "service.set_graph(infrastructure)\n", + "total_npus = service.get_component(device=server, type=\"xpu\").count\n", + "g = service.get_networkx_graph()\n", + "print(networkx.write_network_text(g, vertical_chains=True))" + ] + }, + { + "cell_type": "markdown", + "id": "f6db5a71", + "metadata": {}, + "source": [ + "##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe69674a", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 1 *1024*1024, npu_range=[0, total_npus])" + ] + }, + { + "cell_type": "markdown", + "id": "5111b9ad", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim system config" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aae1858b", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO\n", + "astra.configuration.common_config.system.endpoint_delay = 10\n", + "astra.configuration.common_config.system.active_chunks_per_dimension = 1\n", + "astra.configuration.common_config.system.all_gather_implementation = 
[astra.configuration.common_config.system.RING]\n", + "astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT]\n", + "astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.ONERING]\n", + "astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE\n", + "astra.configuration.common_config.system.local_mem_bw = 1600" + ] + }, + { + "cell_type": "markdown", + "id": "90fd7a36", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim remote memory configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cd0b8270", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION\n", + "print(astra.configuration.common_config.remote_memory)" + ] + }, + { + "cell_type": "markdown", + "id": "ddec6271", + "metadata": {}, + "source": [ + "##### Configure the selected network backend and the topology (infragraph or nc_topology)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be5bbf10", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "astra.configuration.network_backend.choice = astra.configuration.network_backend.NS3\n", + "astra.configuration.network_backend.ns3.topology.choice = astra.configuration.network_backend.ns3.topology.INFRAGRAPH\n", + "astra.configuration.network_backend.ns3.network.packet_payload_size = int(8192)" + ] + }, + { + "cell_type": "markdown", + "id": "2cdf8df4", + "metadata": {}, + "source": [ + "##### Adding ns3 trace and logical dimension " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0299501f", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.network_backend.ns3.logical_topology.logical_dimensions = [total_npus]\n", + 
"astra.configuration.network_backend.ns3.trace.trace_ids = []\n", + "for i in range(0, total_npus):\n", + " astra.configuration.network_backend.ns3.trace.trace_ids.append(i)" + ] + }, + { + "cell_type": "markdown", + "id": "33487d2f", + "metadata": {}, + "source": [ + "##### Adding ASTRA-sim - Infragraph specific annotation for Nvidia DGX" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "84e218f9", + "metadata": {}, + "outputs": [], + "source": [ + "host_device_spec = astra_sim_kit.AnnotationDeviceSpecifications()\n", + "host_device_spec.device_bandwidth_gbps = 100\n", + "host_device_spec.device_latency_ms = 0.05\n", + "host_device_spec.device_name = server.name\n", + "host_device_spec.device_type = \"host\"\n", + "astra.configuration.infragraph.annotations.device_specifications.append(host_device_spec)" + ] + }, + { + "cell_type": "markdown", + "id": "088410f5", + "metadata": {}, + "source": [ + "##### Save infragraph as a yaml" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "552cc24c", + "metadata": {}, + "outputs": [], + "source": [ + "with open(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,\"../infrastructure\",f\"{astra.tag}.yaml\"),\"w\") as f:\n", + " data = infrastructure.serialize(\"dict\")\n", + " yaml.dump(data, f, default_flow_style=False, indent=4)\n", + "\n", + "print(\"saved yaml to:\", os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,\"..\",f\"{astra.tag}.yaml\"))" + ] + }, + { + "cell_type": "markdown", + "id": "cb1986f2", + "metadata": {}, + "source": [ + "##### Visualize the Infragraph" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa072bd7", + "metadata": {}, + "outputs": [], + "source": [ + "# VISUALIZER_START\n", + "PORT = 8765\n", + "\n", + "subprocess.run(\n", + " f\"lsof -ti:{PORT} | xargs -r kill -9\",\n", + " shell=True\n", + ")\n", + "\n", + "infra_yaml_path = os.path.join(\n", + " FileFolderUtils.get_instance().OUTPUT_DIR,\n", + " 
\"../infrastructure\",\n", + " f\"{astra.tag}.yaml\"\n", + ")\n", + "\n", + "infra_dir = os.path.dirname(infra_yaml_path)\n", + "visual_output_dir = os.path.normpath(os.path.join(infra_dir, \"../visuals\"))\n", + "\n", + "subprocess.run(\n", + " [\"infragraph\", \"visualize\", \"--input\", infra_yaml_path, \"--output\", visual_output_dir],\n", + " check=True\n", + ")\n", + "\n", + "subprocess.Popen(\n", + " [\"python3\", \"-m\", \"http.server\", f\"{PORT}\"],\n", + " cwd=visual_output_dir\n", + ")\n", + "IFrame(f\"http://localhost:{PORT}/index.html\", width=\"100%\", height=700)\n", + "# VISUALIZER_END" + ] + }, + { + "cell_type": "markdown", + "id": "ecebaf71", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim cmd parameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0229cf9", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.cmd_parameters.comm_scale = 1\n", + "astra.configuration.common_config.cmd_parameters.injection_scale = 1\n", + "astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False" + ] + }, + { + "cell_type": "markdown", + "id": "95f727ad", + "metadata": {}, + "source": [ + "#### Start the simulation by specifying the network backend" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f69375f6", + "metadata": {}, + "outputs": [], + "source": [ + "astra.run_simulation(NetworkBackend.NS3)" + ] + }, + { + "cell_type": "markdown", + "id": "bbc76926", + "metadata": {}, + "source": [ + "##### Download all the configurations as a zip" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4848847e", + "metadata": {}, + "outputs": [], + "source": [ + "astra.download_configuration()" + ] + }, + { + "cell_type": "markdown", + "id": "10f938aa", + "metadata": {}, + "source": [ + "##### Read output files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "490c6acb", + "metadata": {}, + "outputs": [], + 
"source": [ + "df = pd.read_csv(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR, \"fct.csv\"))\n", + "df.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0rc1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/client-scripts/notebooks/infragraph/ns3_single_dgx_device.py b/client-scripts/notebooks/infragraph/ns3_single_dgx_device.py deleted file mode 100644 index 794f63a..0000000 --- a/client-scripts/notebooks/infragraph/ns3_single_dgx_device.py +++ /dev/null @@ -1,158 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.19.1 -# kernelspec: -# display_name: Python 3 -# language: python -# name: python3 -# --- - -# %% [markdown] -# ##### Import the required modules and configure the system path to locate them - -# %% -import sys -sys.path.append("../../utils") -import networkx -import astra_sim_sdk.astra_sim_sdk as astra_sim_kit -from astra_sim import AstraSim, Collective, NetworkBackend -from infragraph.infragraph_service import InfraGraphService -from infragraph.blueprints.devices.nvidia.dgx import NvidiaDGX, DgxProfile -from infragraph import Infrastructure - -# %% [markdown] -# ##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs - -# %% -astra = AstraSim(server_endpoint = "172.17.0.2:8989", tag = "ns3_single_dgx") - -# %% [markdown] -# ##### Get all available DGX variants - -# %% -from typing import get_args -print(get_args(DgxProfile)) - -# %% [markdown] -# ##### Create a Nvidia 
DGX device fabric using infragraph device blueprint - -# %% -server = NvidiaDGX("dgx_h100") -infrastructure = Infrastructure() -infrastructure.devices.append(server) -infrastructure.instances.add(name=server.name, device=server.name, count=1) -astra.configuration.infragraph.infrastructure.deserialize(infrastructure.serialize()) -print(astra.configuration.infragraph.infrastructure) - -# %% [markdown] -# ##### Initialize the Infragraph service, display the fabric topology, and retrieve/set the total number of NPUs to generate the collective - -# %% -service = InfraGraphService() -service.set_graph(infrastructure) -total_npus = service.get_component(device=server, type="xpu").count -g = service.get_networkx_graph() -print(networkx.write_network_text(g, vertical_chains=True)) - -# %% [markdown] -# ##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration - -# %% -astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 1 *1024*1024, npu_range=[0, total_npus]) - -# %% [markdown] -# ##### Configure ASTRA-sim system config - -# %% -astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO -astra.configuration.common_config.system.endpoint_delay = 10 -astra.configuration.common_config.system.active_chunks_per_dimension = 1 -astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING] -astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT] -astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.ONERING] -astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE -astra.configuration.common_config.system.local_mem_bw = 1600 - -# %% [markdown] -# ##### Configure ASTRA-sim remote 
memory configuration - -# %% -astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION -print(astra.configuration.common_config.remote_memory) - -# %% [markdown] -# ##### Configure the selected network backend and the topology (infragraph or nc_topology) - -# %% -astra.configuration.network_backend.choice = astra.configuration.network_backend.NS3 -astra.configuration.network_backend.ns3.topology.choice = astra.configuration.network_backend.ns3.topology.INFRAGRAPH -astra.configuration.network_backend.ns3.network.packet_payload_size = int(8192) - - -# %% [markdown] -# ##### Adding ns3 trace and logical dimension - -# %% -astra.configuration.network_backend.ns3.logical_topology.logical_dimensions = [total_npus] -astra.configuration.network_backend.ns3.trace.trace_ids = [] -for i in range(0, total_npus): - astra.configuration.network_backend.ns3.trace.trace_ids.append(i) - -# %% [markdown] -# ##### Adding ASTRA-sim - Infragraph specific annotation for Nvidia DGX - -# %% -host_device_spec = astra_sim_kit.AnnotationDeviceSpecifications() -host_device_spec.device_bandwidth_gbps = 100 -host_device_spec.device_latency_ms = 0.05 -host_device_spec.device_name = server.name -host_device_spec.device_type = "host" -astra.configuration.infragraph.annotations.device_specifications.append(host_device_spec) - -# %% [markdown] -# ##### Configure ASTRA-sim cmd parameters - -# %% -astra.configuration.common_config.cmd_parameters.comm_scale = 1 -astra.configuration.common_config.cmd_parameters.injection_scale = 1 -astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False - -# %% [markdown] -# #### Start the simulation by specifying the network backend - -# %% -astra.run_simulation(NetworkBackend.NS3) - -# %% [markdown] -# ##### Download all the configurations as a zip - -# %% -astra.download_configuration() - -# %% [markdown] -# ##### Read output files - -# %% -import pandas as pd -import os 
-from common import FileFolderUtils -df = pd.read_csv(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR, "fct.csv")) -df.head() - -# %% [markdown] -# ##### Save infragraph as a yaml - -# %% -import yaml -import os -from common import FileFolderUtils -with open(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"../infrastructure","ns3_single_dgx"),"w") as f: - data = infrastructure.serialize("dict") - yaml.dump(data, f, default_flow_style=False, indent=4) - -print("saved yaml to:", os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"..","ns3_single_dgx.yaml")) diff --git a/client-scripts/notebooks/infragraph/ns3_single_ironwood_device.ipynb b/client-scripts/notebooks/infragraph/ns3_single_ironwood_device.ipynb new file mode 100644 index 0000000..f6aa0fd --- /dev/null +++ b/client-scripts/notebooks/infragraph/ns3_single_ironwood_device.ipynb @@ -0,0 +1,390 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c24f2ab6", + "metadata": {}, + "source": [ + "##### Import the required modules and configure the system path to locate them" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0928b9a", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"../../utils\")\n", + "import networkx\n", + "import yaml\n", + "import os\n", + "import subprocess\n", + "import pandas as pd\n", + "import astra_sim_sdk.astra_sim_sdk as astra_sim_kit\n", + "from common import FileFolderUtils\n", + "from IPython.display import IFrame\n", + "from astra_sim import AstraSim, Collective, NetworkBackend\n", + "from infragraph.infragraph_service import InfraGraphService\n", + "from infragraph.blueprints.devices.ironwood_rack import IronwoodRack\n", + "from infragraph import Infrastructure" + ] + }, + { + "cell_type": "markdown", + "id": "0d02f54c", + "metadata": {}, + "source": [ + "##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create 
a tagged folder for configs, results, and logs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3228a246", + "metadata": {}, + "outputs": [], + "source": [ + "astra = AstraSim(server_endpoint = \"172.17.0.2:8989\", tag = \"ns3_single_ironwood\")" + ] + }, + { + "cell_type": "markdown", + "id": "36b53490", + "metadata": {}, + "source": [ + "##### Create an Ironwood rack device fabric using infragraph device blueprint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "975c511f", + "metadata": {}, + "outputs": [], + "source": [ + "server = IronwoodRack()\n", + "infrastructure = Infrastructure()\n", + "infrastructure.devices.append(server)\n", + "infrastructure.instances.add(name=server.name, device=server.name, count=1)\n", + "astra.configuration.infragraph.infrastructure.deserialize(infrastructure.serialize())\n", + "print(astra.configuration.infragraph.infrastructure)" + ] + }, + { + "cell_type": "markdown", + "id": "a1a959f5", + "metadata": {}, + "source": [ + "##### Initialize the Infragraph service, display the fabric topology, and retrieve/set the total number of NPUs to generate the collective" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d33bcaf7", + "metadata": {}, + "outputs": [], + "source": [ + "service = InfraGraphService()\n", + "service.set_graph(infrastructure)\n", + "g = service.get_networkx_graph()\n", + "print(networkx.write_network_text(g, vertical_chains=True))\n", + "total_npus = service.get_component(server, \"xpu\").count\n", + "print(total_npus)" + ] + }, + { + "cell_type": "markdown", + "id": "0e46c84f", + "metadata": {}, + "source": [ + "##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a724c9e4", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.workload =
astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 1 *1024*1024, npu_range=[0, total_npus])" + ] + }, + { + "cell_type": "markdown", + "id": "9e2f7e1d", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim system config" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f65a427", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO\n", + "astra.configuration.common_config.system.endpoint_delay = 10\n", + "astra.configuration.common_config.system.active_chunks_per_dimension = 1\n", + "astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING, astra.configuration.common_config.system.RING, astra.configuration.common_config.system.RING]\n", + "astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.RING, astra.configuration.common_config.system.RING, astra.configuration.common_config.system.RING]\n", + "astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.RING, astra.configuration.common_config.system.RING, astra.configuration.common_config.system.RING]\n", + "astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE\n", + "astra.configuration.common_config.system.local_mem_bw = 1600" + ] + }, + { + "cell_type": "markdown", + "id": "7ad2dc66", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim remote memory configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e45ec44b", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION\n", + "print(astra.configuration.common_config.remote_memory)" + ] + }, + { + "cell_type": 
"markdown", + "id": "3e832bae", + "metadata": {}, + "source": [ + "##### Configure the selected network backend and the topology (infragraph or nc_topology)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "502d3b25", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "astra.configuration.network_backend.choice = astra.configuration.network_backend.NS3\n", + "astra.configuration.network_backend.ns3.topology.choice = astra.configuration.network_backend.ns3.topology.INFRAGRAPH\n", + "astra.configuration.network_backend.ns3.network.packet_payload_size = int(8192)" + ] + }, + { + "cell_type": "markdown", + "id": "324c7cba", + "metadata": {}, + "source": [ + "##### Adding ns3 trace and logical dimension " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d62c44de", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.network_backend.ns3.logical_topology.logical_dimensions = [4, 4, 4]\n", + "astra.configuration.network_backend.ns3.trace.trace_ids = []\n", + "for i in range(0, total_npus):\n", + " astra.configuration.network_backend.ns3.trace.trace_ids.append(i)" + ] + }, + { + "cell_type": "markdown", + "id": "08871bd5", + "metadata": {}, + "source": [ + "##### Adding ASTRA-sim - Infragraph specific annotation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e6165bc", + "metadata": {}, + "outputs": [], + "source": [ + "host_device_spec = astra_sim_kit.AnnotationDeviceSpecifications()\n", + "host_device_spec.device_bandwidth_gbps = 100\n", + "host_device_spec.device_latency_ms = 0.05\n", + "host_device_spec.device_name = server.name\n", + "host_device_spec.device_type = \"host\"\n", + "astra.configuration.infragraph.annotations.device_specifications.append(host_device_spec)" + ] + }, + { + "cell_type": "markdown", + "id": "2921597a", + "metadata": {}, + "source": [ + "##### Save infragraph as a yaml" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "id": "d58aca6d", + "metadata": {}, + "outputs": [], + "source": [ + "with open(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,\"../infrastructure\",f\"{astra.tag}.yaml\"),\"w\") as f:\n", + " data = infrastructure.serialize(\"dict\")\n", + " yaml.dump(data, f, default_flow_style=False, indent=4)\n", + "\n", + "print(\"saved yaml to:\", os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,\"..\",f\"{astra.tag}.yaml\"))" + ] + }, + { + "cell_type": "markdown", + "id": "1e387eeb", + "metadata": {}, + "source": [ + "##### Visualize the Infragraph" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "86125fba", + "metadata": {}, + "outputs": [], + "source": [ + "# VISUALIZER_START\n", + "PORT = 8765\n", + "\n", + "subprocess.run(\n", + " f\"lsof -ti:{PORT} | xargs -r kill -9\",\n", + " shell=True\n", + ")\n", + "\n", + "infra_yaml_path = os.path.join(\n", + " FileFolderUtils.get_instance().OUTPUT_DIR,\n", + " \"../infrastructure\",\n", + " f\"{astra.tag}.yaml\"\n", + ")\n", + "\n", + "infra_dir = os.path.dirname(infra_yaml_path)\n", + "visual_output_dir = os.path.normpath(os.path.join(infra_dir, \"../visuals\"))\n", + "\n", + "subprocess.run(\n", + " [\"infragraph\", \"visualize\", \"--input\", infra_yaml_path, \"--output\", visual_output_dir],\n", + " check=True\n", + ")\n", + "\n", + "subprocess.Popen(\n", + " [\"python3\", \"-m\", \"http.server\", f\"{PORT}\"],\n", + " cwd=visual_output_dir\n", + ")\n", + "IFrame(f\"http://localhost:{PORT}/index.html\", width=\"100%\", height=700)\n", + "# VISUALIZER_END" + ] + }, + { + "cell_type": "markdown", + "id": "c2a87754", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim cmd parameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2d2f976", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.cmd_parameters.comm_scale = 1\n", + "astra.configuration.common_config.cmd_parameters.injection_scale = 1\n", + 
"astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False" + ] + }, + { + "cell_type": "markdown", + "id": "62be0629", + "metadata": {}, + "source": [ + "#### Start the simulation by specifying the network backend" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be8221a6", + "metadata": {}, + "outputs": [], + "source": [ + "astra.run_simulation(NetworkBackend.NS3)" + ] + }, + { + "cell_type": "markdown", + "id": "5349d2b6", + "metadata": {}, + "source": [ + "##### Download all the configurations as a zip" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e67e3d72", + "metadata": {}, + "outputs": [], + "source": [ + "astra.download_configuration()" + ] + }, + { + "cell_type": "markdown", + "id": "ed835cb8", + "metadata": {}, + "source": [ + "##### Read output files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a36bfc22", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR, \"fct.csv\"))\n", + "df.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0rc1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/client-scripts/notebooks/infragraph/ns3_single_ironwood_device.py b/client-scripts/notebooks/infragraph/ns3_single_ironwood_device.py deleted file mode 100644 index 40343a3..0000000 --- a/client-scripts/notebooks/infragraph/ns3_single_ironwood_device.py +++ /dev/null @@ -1,152 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.19.1 -# kernelspec: -# display_name: 
Python 3 -# language: python -# name: python3 -# --- - -# %% [markdown] -# ##### Import the required modules and configure the system path to locate them - -# %% -import sys -sys.path.append("../../utils") -import networkx -import astra_sim_sdk.astra_sim_sdk as astra_sim_kit -from astra_sim import AstraSim, Collective, NetworkBackend -from infragraph.infragraph_service import InfraGraphService -from infragraph.blueprints.devices.ironwood_rack import IronwoodRack -from infragraph import Infrastructure - -# %% [markdown] -# ##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs - -# %% -astra = AstraSim(server_endpoint = "172.17.0.2:8989", tag = "ns3_single_ironwood") - -# %% [markdown] -# ##### Create a ironwood rack device fabric using infragraph device blueprint - -# %% -server = IronwoodRack() -infrastructure = Infrastructure() -infrastructure.devices.append(server) -infrastructure.instances.add(name=server.name, device=server.name, count=1) -astra.configuration.infragraph.infrastructure.deserialize(infrastructure.serialize()) -print(astra.configuration.infragraph.infrastructure) - -# %% [markdown] -# ##### Initialize the Infragraph service, display the fabric topology, and retrieve/set the total number of NPUs to generate the collective - -# %% -service = InfraGraphService() -service.set_graph(infrastructure) -g = service.get_networkx_graph() -print(networkx.write_network_text(g, vertical_chains=True)) -total_npus = service.get_component(server, "xpu").count -print(total_npus) - -# %% [markdown] -# ##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration - -# %% -astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 1 *1024*1024, npu_range=[0, total_npus]) - -# %% [markdown] -# ##### Configure ASTRA-sim system 
config - -# %% -astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO -astra.configuration.common_config.system.endpoint_delay = 10 -astra.configuration.common_config.system.active_chunks_per_dimension = 1 -astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING, astra.configuration.common_config.system.RING, astra.configuration.common_config.system.RING] -astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.RING, astra.configuration.common_config.system.RING, astra.configuration.common_config.system.RING] -astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.RING, astra.configuration.common_config.system.RING, astra.configuration.common_config.system.RING] -astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE -astra.configuration.common_config.system.local_mem_bw = 1600 - -# %% [markdown] -# ##### Configure ASTRA-sim remote memory configuration - -# %% -astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION -print(astra.configuration.common_config.remote_memory) - -# %% [markdown] -# ##### Configure the selected network backend and the topology (infragraph or nc_topology) - -# %% -astra.configuration.network_backend.choice = astra.configuration.network_backend.NS3 -astra.configuration.network_backend.ns3.topology.choice = astra.configuration.network_backend.ns3.topology.INFRAGRAPH -astra.configuration.network_backend.ns3.network.packet_payload_size = int(8192) - - -# %% [markdown] -# ##### Adding ns3 trace and logical dimension - -# %% -astra.configuration.network_backend.ns3.logical_topology.logical_dimensions = [4, 4, 4] -astra.configuration.network_backend.ns3.trace.trace_ids = [] -for i in 
range(0, total_npus): - astra.configuration.network_backend.ns3.trace.trace_ids.append(i) - -# %% [markdown] -# ##### Adding ASTRA-sim - Infragraph specific annotation - -# %% -host_device_spec = astra_sim_kit.AnnotationDeviceSpecifications() -host_device_spec.device_bandwidth_gbps = 100 -host_device_spec.device_latency_ms = 0.05 -host_device_spec.device_name = server.name -host_device_spec.device_type = "host" -astra.configuration.infragraph.annotations.device_specifications.append(host_device_spec) - -# %% [markdown] -# ##### Configure ASTRA-sim cmd parameters - -# %% -astra.configuration.common_config.cmd_parameters.comm_scale = 1 -astra.configuration.common_config.cmd_parameters.injection_scale = 1 -astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False - -# %% [markdown] -# #### Start the simulation by specifying the network backend - -# %% -astra.run_simulation(NetworkBackend.NS3) - -# %% [markdown] -# ##### Download all the configurations as a zip - -# %% -astra.download_configuration() - -# %% [markdown] -# ##### Read output files - -# %% -import pandas as pd -import os -from common import FileFolderUtils -df = pd.read_csv(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR, "fct.csv")) -df.head() - -# %% [markdown] -# ##### Save infragraph as a yaml - -# %% -import yaml -import os -from common import FileFolderUtils -with open(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"../infrastructure","ns3_single_ironwood"),"w") as f: - data = infrastructure.serialize("dict") - yaml.dump(data, f, default_flow_style=False, indent=4) - -print("saved yaml to:", os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"..","ns3_single_ironwood.yaml")) diff --git a/client-scripts/notebooks/infragraph/ns3_single_tier_with_dgx.ipynb b/client-scripts/notebooks/infragraph/ns3_single_tier_with_dgx.ipynb new file mode 100644 index 0000000..6470bb2 --- /dev/null +++ b/client-scripts/notebooks/infragraph/ns3_single_tier_with_dgx.ipynb @@ -0,0 
+1,399 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "48077a93", + "metadata": {}, + "source": [ + "##### Import the required modules and configure the system path to locate them" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8588a9f6", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"../../utils\")\n", + "import networkx\n", + "import yaml\n", + "import os\n", + "import subprocess\n", + "import pandas as pd\n", + "import astra_sim_sdk.astra_sim_sdk as astra_sim_kit\n", + "from common import FileFolderUtils\n", + "from IPython.display import IFrame\n", + "from astra_sim import AstraSim, Collective, NetworkBackend\n", + "from infragraph.blueprints.devices.nvidia.dgx import NvidiaDGX\n", + "from infragraph.blueprints.fabrics.single_tier_fabric import SingleTierFabric\n", + "from infragraph.infragraph_service import InfraGraphService" + ] + }, + { + "cell_type": "markdown", + "id": "19c27d41", + "metadata": {}, + "source": [ + "##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aafe93fe", + "metadata": {}, + "outputs": [], + "source": [ + "astra = AstraSim(server_endpoint = \"172.17.0.2:8989\", tag = \"ns3_single_tier_with_dgx\")" + ] + }, + { + "cell_type": "markdown", + "id": "b0f45279", + "metadata": {}, + "source": [ + "##### Create a single tier rack device with two Nvidia DGX and a single switch using infragraph device, fabric blueprint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "025b0ed5", + "metadata": {}, + "outputs": [], + "source": [ + "dgx_count = 2\n", + "server = NvidiaDGX()\n", + "infrastructure = SingleTierFabric(server, dgx_count)\n", + "astra.configuration.infragraph.infrastructure.deserialize(infrastructure.serialize())" + ] + }, + { + 
"cell_type": "markdown", + "id": "cf5457f6", + "metadata": {}, + "source": [ + "##### Initialize the Infragraph service, display the fabric topology, and retrieve/set the total number of NPUs to generate the collective" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73913147", + "metadata": {}, + "outputs": [], + "source": [ + "service = InfraGraphService()\n", + "service.set_graph(infrastructure)\n", + "\n", + "g = service.get_networkx_graph()\n", + "print(networkx.write_network_text(g, vertical_chains=True))\n", + "\n", + "total_npus = service.get_component(server, \"xpu\").count * dgx_count\n", + "print(total_npus)" + ] + }, + { + "cell_type": "markdown", + "id": "209e8c57", + "metadata": {}, + "source": [ + "##### Save infragraph as a yaml" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc20308e", + "metadata": {}, + "outputs": [], + "source": [ + "with open(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,\"../infrastructure\",f\"{astra.tag}.yaml\"),\"w\") as f:\n", + " data = infrastructure.serialize(\"dict\")\n", + " yaml.dump(data, f, default_flow_style=False, indent=4)\n", + "\n", + "print(\"saved yaml to:\", os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,\"..\",f\"{astra.tag}.yaml\"))" + ] + }, + { + "cell_type": "markdown", + "id": "58879baf", + "metadata": {}, + "source": [ + "##### Visualize the Infragraph" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "afbbf33f", + "metadata": {}, + "outputs": [], + "source": [ + "# VISUALIZER_START\n", + "PORT = 8765\n", + "\n", + "subprocess.run(\n", + " f\"lsof -ti:{PORT} | xargs -r kill -9\",\n", + " shell=True\n", + ")\n", + "\n", + "infra_yaml_path = os.path.join(\n", + " FileFolderUtils.get_instance().OUTPUT_DIR,\n", + " \"../infrastructure\",\n", + " f\"{astra.tag}.yaml\"\n", + ")\n", + "\n", + "infra_dir = os.path.dirname(infra_yaml_path)\n", + "visual_output_dir = os.path.normpath(os.path.join(infra_dir, 
\"../visuals\"))\n", + "\n", + "subprocess.run(\n", + " [\"infragraph\", \"visualize\", \"--input\", infra_yaml_path, \"--output\", visual_output_dir],\n", + " check=True\n", + ")\n", + "\n", + "subprocess.Popen(\n", + " [\"python3\", \"-m\", \"http.server\", f\"{PORT}\"],\n", + " cwd=visual_output_dir\n", + ")\n", + "IFrame(f\"http://localhost:{PORT}/index.html\", width=\"100%\", height=700)\n", + "# VISUALIZER_END" + ] + }, + { + "cell_type": "markdown", + "id": "33f2afbf", + "metadata": {}, + "source": [ + "##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2c7e4b4", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 8 *1024*1024, npu_range=[0, total_npus])" + ] + }, + { + "cell_type": "markdown", + "id": "76aa4588", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim system config" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21a5ae3b", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO\n", + "astra.configuration.common_config.system.endpoint_delay = 10\n", + "astra.configuration.common_config.system.active_chunks_per_dimension = 1\n", + "astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING]\n", + "astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT]\n", + "astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.ONERING]\n", + "astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE\n", + 
"astra.configuration.common_config.system.local_mem_bw = 1600" + ] + }, + { + "cell_type": "markdown", + "id": "9cb52015", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim remote memory configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "065afac5", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION\n", + "print(astra.configuration.common_config.remote_memory)" + ] + }, + { + "cell_type": "markdown", + "id": "0a9615e2", + "metadata": {}, + "source": [ + "##### Configure the selected network backend and the topology (infragraph or nc_topology)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d136387", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "astra.configuration.network_backend.choice = astra.configuration.network_backend.NS3\n", + "astra.configuration.network_backend.ns3.topology.choice = astra.configuration.network_backend.ns3.topology.INFRAGRAPH\n", + "astra.configuration.network_backend.ns3.network.packet_payload_size = int(8192)" + ] + }, + { + "cell_type": "markdown", + "id": "8d38c643", + "metadata": {}, + "source": [ + "##### Adding ns3 trace and logical dimension " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee937faf", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.network_backend.ns3.logical_topology.logical_dimensions = [total_npus]\n", + "astra.configuration.network_backend.ns3.trace.trace_ids = []\n", + "for i in range(0, total_npus):\n", + " astra.configuration.network_backend.ns3.trace.trace_ids.append(i)" + ] + }, + { + "cell_type": "markdown", + "id": "abeed10f", + "metadata": {}, + "source": [ + "##### Adding ASTRA-sim - Infragraph specific annotation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b7edefa", + "metadata": {}, + 
"outputs": [], + "source": [ + "host_device_spec = astra_sim_kit.AnnotationDeviceSpecifications()\n", + "host_device_spec.device_bandwidth_gbps = 100\n", + "host_device_spec.device_latency_ms = 0.05\n", + "host_device_spec.device_name = server.name\n", + "host_device_spec.device_type = \"host\"\n", + "astra.configuration.infragraph.annotations.device_specifications.append(host_device_spec)\n", + "\n", + "switch_device_spec = astra_sim_kit.AnnotationDeviceSpecifications()\n", + "switch_device_spec.device_bandwidth_gbps = 100\n", + "switch_device_spec.device_latency_ms = 0.05\n", + "switch_device_spec.device_name = \"switch\"\n", + "switch_device_spec.device_type = \"switch\"\n", + "astra.configuration.infragraph.annotations.device_specifications.append(\n", + " switch_device_spec\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "602fcbbd", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim cmd parameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "192d6dd3", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.cmd_parameters.comm_scale = 1\n", + "astra.configuration.common_config.cmd_parameters.injection_scale = 1\n", + "astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False" + ] + }, + { + "cell_type": "markdown", + "id": "0969c7ae", + "metadata": {}, + "source": [ + "#### Start the simulation by specifying the network backend" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c60d06aa", + "metadata": {}, + "outputs": [], + "source": [ + "astra.run_simulation(NetworkBackend.NS3)" + ] + }, + { + "cell_type": "markdown", + "id": "9b573b67", + "metadata": {}, + "source": [ + "##### Download all the configurations as a zip" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd80bf0d", + "metadata": {}, + "outputs": [], + "source": [ + "astra.download_configuration()" + ] + }, + { + "cell_type": "markdown", + "id": "cc7c3593", 
+ "metadata": {}, + "source": [ + "##### Read output files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca675ef8", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR, \"fct.csv\"))\n", + "df.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0rc1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/client-scripts/notebooks/infragraph/ns3_single_tier_with_dgx.py b/client-scripts/notebooks/infragraph/ns3_single_tier_with_dgx.py deleted file mode 100644 index d35356d..0000000 --- a/client-scripts/notebooks/infragraph/ns3_single_tier_with_dgx.py +++ /dev/null @@ -1,161 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.19.1 -# kernelspec: -# display_name: Python 3 -# language: python -# name: python3 -# --- - -# %% [markdown] -# ##### Import the required modules and configure the system path to locate them - -# %% -import sys -sys.path.append("../../utils") -from astra_sim import AstraSim, Collective, NetworkBackend -from infragraph.blueprints.devices.nvidia.dgx import NvidiaDGX -from infragraph.blueprints.fabrics.single_tier_fabric import SingleTierFabric -import networkx -from infragraph.infragraph_service import InfraGraphService -import astra_sim_sdk.astra_sim_sdk as astra_sim_kit - -# %% [markdown] -# ##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs - -# %% -astra = 
AstraSim(server_endpoint = "172.17.0.2:8989", tag = "ns3_single_tier_with_dgx") - -# %% [markdown] -# ##### Create a single tier rack device with two Nvidia DGX and a single switch using infragraph device, fabric blueprint - -# %% -dgx_count = 2 -server = NvidiaDGX() -infrastructure = SingleTierFabric(server, dgx_count) -astra.configuration.infragraph.infrastructure.deserialize(infrastructure.serialize()) - -# %% [markdown] -# ##### Initialize the Infragraph service, display the fabric topology, and retrieve/set the total number of NPUs to generate the collective - -# %% -service = InfraGraphService() -service.set_graph(infrastructure) - -g = service.get_networkx_graph() -print(networkx.write_network_text(g, vertical_chains=True)) - -total_npus = service.get_component(server, "xpu").count * dgx_count -print(total_npus) - -# %% [markdown] -# ##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration - -# %% -astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 8 *1024*1024, npu_range=[0, total_npus]) - -# %% [markdown] -# ##### Configure ASTRA-sim system config - -# %% -astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO -astra.configuration.common_config.system.endpoint_delay = 10 -astra.configuration.common_config.system.active_chunks_per_dimension = 1 -astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING] -astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT] -astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.ONERING] -astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE -astra.configuration.common_config.system.local_mem_bw = 1600 - -# %% 
[markdown] -# ##### Configure ASTRA-sim remote memory configuration - -# %% -astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION -print(astra.configuration.common_config.remote_memory) - -# %% [markdown] -# ##### Configure the selected network backend and the topology (infragraph or nc_topology) - -# %% -astra.configuration.network_backend.choice = astra.configuration.network_backend.NS3 -astra.configuration.network_backend.ns3.topology.choice = astra.configuration.network_backend.ns3.topology.INFRAGRAPH -astra.configuration.network_backend.ns3.network.packet_payload_size = int(8192) - - -# %% [markdown] -# ##### Adding ns3 trace and logical dimension - -# %% -astra.configuration.network_backend.ns3.logical_topology.logical_dimensions = [total_npus] -astra.configuration.network_backend.ns3.trace.trace_ids = [] -for i in range(0, total_npus): - astra.configuration.network_backend.ns3.trace.trace_ids.append(i) - -# %% [markdown] -# ##### Adding ASTRA-sim - Infragraph specific annotation - -# %% -host_device_spec = astra_sim_kit.AnnotationDeviceSpecifications() -host_device_spec.device_bandwidth_gbps = 100 -host_device_spec.device_latency_ms = 0.05 -host_device_spec.device_name = server.name -host_device_spec.device_type = "host" -astra.configuration.infragraph.annotations.device_specifications.append(host_device_spec) - -switch_device_spec = astra_sim_kit.AnnotationDeviceSpecifications() -switch_device_spec.device_bandwidth_gbps = 100 -switch_device_spec.device_latency_ms = 0.05 -switch_device_spec.device_name = "switch" -switch_device_spec.device_type = "switch" -astra.configuration.infragraph.annotations.device_specifications.append( - switch_device_spec -) - -# %% [markdown] -# ##### Configure ASTRA-sim cmd parameters - -# %% -astra.configuration.common_config.cmd_parameters.comm_scale = 1 -astra.configuration.common_config.cmd_parameters.injection_scale = 1 
-astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False - -# %% [markdown] -# #### Start the simulation by specifying the network backend - -# %% -astra.run_simulation(NetworkBackend.NS3) - -# %% [markdown] -# ##### Download all the configurations as a zip - -# %% -astra.download_configuration() - -# %% [markdown] -# ##### Read output files - -# %% -import pandas as pd -import os -from common import FileFolderUtils -df = pd.read_csv(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR, "fct.csv")) -df.head() - -# %% [markdown] -# ##### Save infragraph as a yaml - -# %% -import yaml -import os -from common import FileFolderUtils -with open(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"../infrastructure","ns3_single_tier_with_dgx"),"w") as f: - data = infrastructure.serialize("dict") - yaml.dump(data, f, default_flow_style=False, indent=4) - -print("saved yaml to:", os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"..","ns3_single_tier_with_dgx.yaml")) diff --git a/client-scripts/notebooks/infragraph/ns3_single_tier_with_generic_server.ipynb b/client-scripts/notebooks/infragraph/ns3_single_tier_with_generic_server.ipynb new file mode 100644 index 0000000..f9d587d --- /dev/null +++ b/client-scripts/notebooks/infragraph/ns3_single_tier_with_generic_server.ipynb @@ -0,0 +1,418 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "d8fa9641", + "metadata": {}, + "source": [ + "##### Import the required modules and configure the system path to locate them" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d903d960", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"../../utils\")\n", + "import networkx\n", + "import yaml\n", + "import os\n", + "import subprocess\n", + "import pandas as pd\n", + "import astra_sim_sdk.astra_sim_sdk as astra_sim_kit\n", + "from common import FileFolderUtils\n", + "from IPython.display import IFrame\n", + "from astra_sim import AstraSim, 
Collective, NetworkBackend\n", + "from astra_sim_sdk import Device, Component\n", + "from infragraph import Component, InfrastructureEdge\n", + "from infragraph.infragraph_service import InfraGraphService\n", + "from infragraph.blueprints.devices.generic.server import Server\n", + "from infragraph.blueprints.devices.generic.generic_switch import Switch" + ] + }, + { + "cell_type": "markdown", + "id": "abc15479", + "metadata": {}, + "source": [ + "##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "76bd56cc", + "metadata": {}, + "outputs": [], + "source": [ + "astra = AstraSim(server_endpoint=\"172.17.0.2:8989\", tag = \"ns3_single_tier_with_generic_server\")" + ] + }, + { + "cell_type": "markdown", + "id": "313c13d4", + "metadata": {}, + "source": [ + "##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b1aa9daf", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 8 *1024*1024, npu_range=[0,8])\n", + "print(astra.configuration.common_config.workload)" + ] + }, + { + "cell_type": "markdown", + "id": "60c6e60e", + "metadata": {}, + "source": [ + "##### Configure the ASTRA-sim system config" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f856dc0c", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO\n", + "astra.configuration.common_config.system.endpoint_delay = 10\n", + "astra.configuration.common_config.system.active_chunks_per_dimension = 1\n", + 
"astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING]\n", + "astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT]\n", + "astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.ONERING]\n", + "astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE\n", + "astra.configuration.common_config.system.local_mem_bw = 1600\n", + "print(astra.configuration.common_config.system)" + ] + }, + { + "cell_type": "markdown", + "id": "f0a5fa7a", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim remote memory configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7bb834c2", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION\n", + "print(astra.configuration.common_config.remote_memory)" + ] + }, + { + "cell_type": "markdown", + "id": "1c90091f", + "metadata": {}, + "source": [ + "##### Configure the selected network backend and the topology (infragraph or nc_topology)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1df92ed", + "metadata": {}, + "outputs": [], + "source": [ + "# We need to configure the network backend here since we are translating the topology from infragraph and not creating it directly from the sdk.\n", + "astra.configuration.network_backend.choice = astra.configuration.network_backend.NS3\n", + "astra.configuration.network_backend.ns3.topology.choice = astra.configuration.network_backend.ns3.topology.INFRAGRAPH\n", + "astra.configuration.network_backend.ns3.network.packet_payload_size = int(8192)\n", + "astra.configuration.network_backend.ns3.logical_topology.logical_dimensions = [8]\n", + 
"astra.configuration.network_backend.ns3.trace.trace_ids = [0, 1, 2, 3, 4, 5, 6, 7]" + ] + }, + { + "cell_type": "markdown", + "id": "e262c619", + "metadata": {}, + "source": [ + "##### Creating infrastructure with four hosts and one rack switch device" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1d577543", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.infragraph.infrastructure.name = \"1host-4ranks\"\n", + "\n", + "server = Device()\n", + "server.deserialize((Server(npu_factor=1).serialize()))\n", + "\n", + "hosts = astra.configuration.infragraph.infrastructure.instances.add(\n", + " name=\"host\", device=server.name, count=4\n", + ")\n", + "switch = Device()\n", + "switch.deserialize(Switch(port_count=16).serialize())\n", + "\n", + "rack_switch = astra.configuration.infragraph.infrastructure.instances.add(\n", + " name=\"rack_switch\", device=switch.name, count=1\n", + ")\n", + "\n", + "astra.configuration.infragraph.infrastructure.devices.append(server).append(switch)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "05264a01", + "metadata": {}, + "source": [ + "##### Creating Links" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "143c7f6e", + "metadata": {}, + "outputs": [], + "source": [ + "rack_link = astra.configuration.infragraph.infrastructure.links.add(\n", + " name=\"rack-link\",\n", + " description=\"Link characteristics for connectivity between servers and rack switch\",\n", + ")\n", + "rack_link.physical.bandwidth.gigabits_per_second = 200" + ] + }, + { + "cell_type": "markdown", + "id": "5e972048", + "metadata": {}, + "source": [ + "##### Adding edges and annotations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ec821926", + "metadata": {}, + "outputs": [], + "source": [ + "host_component = InfraGraphService.get_component(server, Component.NIC)\n", + "switch_component = InfraGraphService.get_component(switch, Component.PORT)\n", + "# link each
host to one leaf switch\n", + "for idx in range(hosts.count):\n", + " edge = astra.configuration.infragraph.infrastructure.edges.add(\n", + " scheme=InfrastructureEdge.ONE2ONE, link=rack_link.name\n", + " )\n", + " edge.ep1.instance = f\"{hosts.name}[{idx}]\"\n", + " edge.ep1.component = f\"{host_component.name}[0]\"\n", + " edge.ep2.instance = f\"{rack_switch.name}[0]\"\n", + " edge.ep2.component = f\"{switch_component.name}[{idx * 2}]\"\n", + " edge = astra.configuration.infragraph.infrastructure.edges.add(\n", + " scheme=InfrastructureEdge.ONE2ONE, link=rack_link.name\n", + " )\n", + " edge.ep1.instance = f\"{hosts.name}[{idx}]\"\n", + " edge.ep1.component = f\"{host_component.name}[1]\"\n", + " edge.ep2.instance = f\"{rack_switch.name}[0]\"\n", + " edge.ep2.component = f\"{switch_component.name}[{idx * 2 + 1}]\"\n", + "\n", + "# annotation\n", + "host_device_spec = astra_sim_kit.AnnotationDeviceSpecifications()\n", + "host_device_spec.device_bandwidth_gbps = 200\n", + "host_device_spec.device_latency_ms = 0.05\n", + "host_device_spec.device_name = \"server\"\n", + "host_device_spec.device_type = \"host\"\n", + "astra.configuration.infragraph.annotations.device_specifications.append(host_device_spec)\n", + "\n", + "switch_device_spec = astra_sim_kit.AnnotationDeviceSpecifications()\n", + "switch_device_spec.device_bandwidth_gbps = 200\n", + "switch_device_spec.device_latency_ms = 0.05\n", + "switch_device_spec.device_name = \"switch\"\n", + "switch_device_spec.device_type = \"switch\"\n", + "astra.configuration.infragraph.annotations.device_specifications.append(\n", + " switch_device_spec\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "c4414c8a", + "metadata": {}, + "source": [ + "##### Save infragraph as a yaml" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "74d5c82a", + "metadata": {}, + "outputs": [], + "source": [ + "with 
open(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,\"../infrastructure\",f\"{astra.tag}.yaml\"),\"w\") as f:\n", + " data = astra.configuration.infragraph.infrastructure.serialize(\"dict\")\n", + " yaml.dump(data, f, default_flow_style=False, indent=4)\n", + "\n", + "print(\"saved yaml to:\", os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,\"..\",f\"{astra.tag}.yaml\"))" + ] + }, + { + "cell_type": "markdown", + "id": "70fa14c9", + "metadata": {}, + "source": [ + "##### Visualize the Infragraph" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2bb45bf7", + "metadata": {}, + "outputs": [], + "source": [ + "# VISUALIZER_START\n", + "PORT = 8765\n", + "\n", + "subprocess.run(\n", + " f\"lsof -ti:{PORT} | xargs -r kill -9\",\n", + " shell=True\n", + ")\n", + "\n", + "infra_yaml_path = os.path.join(\n", + " FileFolderUtils.get_instance().OUTPUT_DIR,\n", + " \"../infrastructure\",\n", + " f\"{astra.tag}.yaml\"\n", + ")\n", + "\n", + "infra_dir = os.path.dirname(infra_yaml_path)\n", + "visual_output_dir = os.path.normpath(os.path.join(infra_dir, \"../visuals\"))\n", + "\n", + "subprocess.run(\n", + " [\"infragraph\", \"visualize\", \"--input\", infra_yaml_path, \"--output\", visual_output_dir],\n", + " check=True\n", + ")\n", + "\n", + "subprocess.Popen(\n", + " [\"python3\", \"-m\", \"http.server\", f\"{PORT}\"],\n", + " cwd=visual_output_dir\n", + ")\n", + "IFrame(f\"http://localhost:{PORT}/index.html\", width=\"100%\", height=700)\n", + "# VISUALIZER_END" + ] + }, + { + "cell_type": "markdown", + "id": "665a4334", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim cmd parameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90e58a65", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.cmd_parameters.comm_scale = 1\n", + "astra.configuration.common_config.cmd_parameters.injection_scale = 1\n", + 
"astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False" + ] + }, + { + "cell_type": "markdown", + "id": "697afcfc", + "metadata": {}, + "source": [ + "#### Start the simulation by specifying the network backend" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df571e8e", + "metadata": {}, + "outputs": [], + "source": [ + "astra.run_simulation(NetworkBackend.NS3)" + ] + }, + { + "cell_type": "markdown", + "id": "212005d3", + "metadata": {}, + "source": [ + "##### Download all the configurations as a zip" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8630fd8f", + "metadata": {}, + "outputs": [], + "source": [ + "astra.download_configuration()" + ] + }, + { + "cell_type": "markdown", + "id": "b7bacc46", + "metadata": {}, + "source": [ + "##### Read output files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "551020a9", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR, \"fct.csv\"))\n", + "df.head()\n", + "\n", + "df = pd.read_csv(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR, \"flow_stats.csv\"))\n", + "df.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/client-scripts/notebooks/infragraph/ns3_single_tier_with_generic_server.py b/client-scripts/notebooks/infragraph/ns3_single_tier_with_generic_server.py deleted file mode 100644 index c71c989..0000000 --- a/client-scripts/notebooks/infragraph/ns3_single_tier_with_generic_server.py +++ /dev/null @@ -1,192 +0,0 @@ -# --- -# jupyter: -# jupytext: -# 
text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.19.1 -# kernelspec: -# display_name: Python 3 -# language: python -# name: python3 -# --- - -# %% [markdown] -# ##### Import the required modules and configure the system path to locate them - -# %% -import sys -sys.path.append("../../utils") -from astra_sim import AstraSim, Collective, NetworkBackend -from astra_sim_sdk import Device, Component -from infragraph import Component, InfrastructureEdge -from infragraph.infragraph_service import InfraGraphService -from infragraph.blueprints.devices.generic.server import Server -from infragraph.blueprints.devices.generic.generic_switch import Switch -import astra_sim_sdk.astra_sim_sdk as astra_sim_kit - -# %% [markdown] -# ##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs - -# %% -astra = AstraSim(server_endpoint="172.17.0.2:8989", tag = "ns3_single_tier_with_generic_server") - -# %% [markdown] -# ##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration - -# %% -astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 8 *1024*1024, npu_range=[0,8]) -print(astra.configuration.common_config.workload) - - -# %% [markdown] -# ##### Configure the ASTRA-sim system config - -# %% -astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO -astra.configuration.common_config.system.endpoint_delay = 10 -astra.configuration.common_config.system.active_chunks_per_dimension = 1 -astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING] -astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT] 
-astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.ONERING] -astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE -astra.configuration.common_config.system.local_mem_bw = 1600 -print(astra.configuration.common_config.system) - -# %% [markdown] -# ##### Configure ASTRA-sim remote memory configuration - -# %% -astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION -print(astra.configuration.common_config.remote_memory) - -# %% [markdown] -# ##### Configure the selected network backend and the topology (infragraph or nc_topology) - -# %% -# We need to configure the network backend here since we are translating the topology from infragraph and not creating it directly from the sdk. -astra.configuration.network_backend.choice = astra.configuration.network_backend.NS3 -astra.configuration.network_backend.ns3.topology.choice = astra.configuration.network_backend.ns3.topology.INFRAGRAPH -astra.configuration.network_backend.ns3.network.packet_payload_size = int(8192) -astra.configuration.network_backend.ns3.logical_topology.logical_dimensions = [8] -astra.configuration.network_backend.ns3.trace.trace_ids = [0, 1, 2, 3, 4, 5, 6, 7] - -# %% [markdown] -# ##### Creating Infrastructure with four host and one rack Device - -# %% -astra.configuration.infragraph.infrastructure.name = "1host-4ranks" - -server = Device() -server.deserialize((Server(npu_factor=1).serialize())) - -hosts = astra.configuration.infragraph.infrastructure.instances.add( - name="host", device=server.name, count=4 -) -switch = Device() -switch.deserialize(Switch(port_count=16).serialize()) - -rack_switch = astra.configuration.infragraph.infrastructure.instances.add( - name="rack_switch", device=switch.name, count=1 -) - -astra.configuration.infragraph.infrastructure.devices.append(server).append(switch) - - 
- -# %% [markdown] -# ##### Creating Links - -# %% -rack_link = astra.configuration.infragraph.infrastructure.links.add( - name="rack-link", - description="Link characteristics for connectivity between servers and rack switch", -) -rack_link.physical.bandwidth.gigabits_per_second = 200 - -# %% [markdown] -# ##### Adding edges and annotations - -# %% -host_component = InfraGraphService.get_component(server, Component.NIC) -switch_component = InfraGraphService.get_component(switch, Component.PORT) -# link each host to one leaf switch -for idx in range(hosts.count): - edge = astra.configuration.infragraph.infrastructure.edges.add( - scheme=InfrastructureEdge.ONE2ONE, link=rack_link.name - ) - edge.ep1.instance = f"{hosts.name}[{idx}]" - edge.ep1.component = f"{host_component.name}[0]" - edge.ep2.instance = f"{rack_switch.name}[0]" - edge.ep2.component = f"{switch_component.name}[{idx * 2}]" - edge = astra.configuration.infragraph.infrastructure.edges.add( - scheme=InfrastructureEdge.ONE2ONE, link=rack_link.name - ) - edge.ep1.instance = f"{hosts.name}[{idx}]" - edge.ep1.component = f"{host_component.name}[1]" - edge.ep2.instance = f"{rack_switch.name}[0]" - edge.ep2.component = f"{switch_component.name}[{idx * 2 + 1}]" - -# annotation -host_device_spec = astra_sim_kit.AnnotationDeviceSpecifications() -host_device_spec.device_bandwidth_gbps = 200 -host_device_spec.device_latency_ms = 0.05 -host_device_spec.device_name = "server" -host_device_spec.device_type = "host" -astra.configuration.infragraph.annotations.device_specifications.append(host_device_spec) - -switch_device_spec = astra_sim_kit.AnnotationDeviceSpecifications() -switch_device_spec.device_bandwidth_gbps = 200 -switch_device_spec.device_latency_ms = 0.05 -switch_device_spec.device_name = "switch" -switch_device_spec.device_type = "switch" -astra.configuration.infragraph.annotations.device_specifications.append( - switch_device_spec -) - -# %% [markdown] -# ##### Configure ASTRA-sim cmd parameters - -# %% 
-astra.configuration.common_config.cmd_parameters.comm_scale = 1 -astra.configuration.common_config.cmd_parameters.injection_scale = 1 -astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False - -# %% [markdown] -# #### Start the simulation by specifying the network backend - -# %% -astra.run_simulation(NetworkBackend.NS3) - -# %% [markdown] -# ##### Download all the configurations as a zip - -# %% -astra.download_configuration() - -# %% [markdown] -# ##### Read output files - -# %% -import pandas as pd -import os -from common import FileFolderUtils -df = pd.read_csv(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR, "fct.csv")) -df.head() - -df = pd.read_csv(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR, "flow_stats.csv")) -df.head() - -# %% [markdown] -# ##### Save infragraph as a yaml - -# %% -import yaml -import os -from common import FileFolderUtils -with open(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"../infrastructure","ns3_single_tier_with_dgx"),"w") as f: - data = astra.configuration.infragraph.infrastructure.serialize("dict") - yaml.dump(data, f, default_flow_style=False, indent=4) - -print("saved yaml to:", os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"..","ns3_single_tier_with_dgx.yaml")) diff --git a/client-scripts/notebooks/load_existing_et_example.ipynb b/client-scripts/notebooks/load_existing_et_example.ipynb new file mode 100644 index 0000000..ab30dae --- /dev/null +++ b/client-scripts/notebooks/load_existing_et_example.ipynb @@ -0,0 +1,267 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "16fbc12d", + "metadata": {}, + "source": [ + "##### Import the required modules and configure the system path to locate them" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df1b7b72", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "import pandas as pd\n", + "sys.path.append(\"../utils\")\n", + "from common import FileFolderUtils\n", + 
"from astra_sim import AstraSim, NetworkBackend\n", + "from pathlib import Path" + ] + }, + { + "cell_type": "markdown", + "id": "0cc9f260", + "metadata": {}, + "source": [ + "##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73e5e3b2", + "metadata": {}, + "outputs": [], + "source": [ + "astra = AstraSim(server_endpoint=\"172.17.0.2:8989\", tag=\"load_existing_et_example\")" + ] + }, + { + "cell_type": "markdown", + "id": "4027ad0d", + "metadata": {}, + "source": [ + "##### Add existing workload execution traces by giving the path to the workload with basename included, mandatory for AstraSim workload configuration." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c96cc0d", + "metadata": {}, + "outputs": [], + "source": [ + "cwd = os.path.dirname(os.path.abspath(__file__)) if \"__file__\" in globals() else os.getcwd()\n", + "astra.configuration.common_config.workload = os.path.join(cwd, \"../resources/example_workload/workload/all_reduce\")\n", + "print(astra.configuration.common_config.workload)" + ] + }, + { + "cell_type": "markdown", + "id": "45ddaaa6", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim system config" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6ebb2d7", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO\n", + "astra.configuration.common_config.system.endpoint_delay = 10\n", + "astra.configuration.common_config.system.active_chunks_per_dimension = 1\n", + "astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING]\n", + "astra.configuration.common_config.system.all_to_all_implementation = 
[astra.configuration.common_config.system.DIRECT]\n", + "astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.ONERING]\n", + "astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE\n", + "astra.configuration.common_config.system.local_mem_bw = 1600\n", + "print(astra.configuration.common_config.system)" + ] + }, + { + "cell_type": "markdown", + "id": "2ae7c917", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim remote memory configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e41016ce", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION\n", + "print(astra.configuration.common_config.remote_memory)" + ] + }, + { + "cell_type": "markdown", + "id": "e9361e39", + "metadata": {}, + "source": [ + "##### Configure the network backend" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a08892e0", + "metadata": {}, + "outputs": [], + "source": [ + "# astra.configuration.network_backend.choice = astra.configuration.network_backend.NS3\n", + "astra.configuration.network_backend.ns3.network.packet_payload_size = int(8192)\n", + "astra.configuration.network_backend.ns3.logical_topology.logical_dimensions = [8]\n", + "astra.configuration.network_backend.ns3.trace.trace_ids = [0, 1, 2, 3,4 ,5 ,6, 7]\n", + "print(\"network backend choice set to:\",astra.configuration.network_backend.ns3.topology.choice)\n", + "print(astra.configuration.network_backend.ns3.network.packet_payload_size)\n", + "print(astra.configuration.network_backend.ns3.logical_topology)\n", + "print(astra.configuration.network_backend.ns3.trace)" + ] + }, + { + "cell_type": "markdown", + "id": "37f3dda6", + "metadata": {}, + "source": [ + "##### Configure the network topology" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "787e8f1e", + "metadata": {}, + "outputs": [], + "source": [ + "# astra.configuration.network_backend.ns3.topology.choice = astra.configuration.network_backend.ns3.topology.NC_TOPOLOGY\n", + "# the topology configuration will be set automatically if we configure the nc_topology\n", + "astra.configuration.network_backend.ns3.topology.nc_topology.total_nodes = 9\n", + "astra.configuration.network_backend.ns3.topology.nc_topology.total_switches = 1\n", + "astra.configuration.network_backend.ns3.topology.nc_topology.total_links = 8\n", + "astra.configuration.network_backend.ns3.topology.nc_topology.switch_ids = [8]\n", + "astra.configuration.network_backend.ns3.topology.nc_topology.connections.clear()\n", + "astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(0, 8, \"100Gbps\", \"0.005ms\", \"0\")\n", + "astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(1, 8, \"100Gbps\", \"0.005ms\", \"0\")\n", + "astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(2, 8, \"100Gbps\", \"0.005ms\", \"0\")\n", + "astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(3, 8, \"100Gbps\", \"0.005ms\", \"0\")\n", + "astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(4, 8, \"100Gbps\", \"0.005ms\", \"0\")\n", + "astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(5, 8, \"100Gbps\", \"0.005ms\", \"0\")\n", + "astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(6, 8, \"100Gbps\", \"0.005ms\", \"0\")\n", + "astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(7, 8, \"100Gbps\", \"0.005ms\", \"0\")\n", + "print(astra.configuration.network_backend.ns3.topology.choice)\n", + "print(astra.configuration.network_backend.ns3.topology.nc_topology)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "0853b519", + "metadata": {}, + "source": [ + 
"##### Configure ASTRA-sim cmd parameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "06ac809f", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.cmd_parameters.comm_scale = 1\n", + "astra.configuration.common_config.cmd_parameters.injection_scale = 1\n", + "astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False\n", + "\n", + "print(astra.configuration.common_config.cmd_parameters)" + ] + }, + { + "cell_type": "markdown", + "id": "4707a7fa", + "metadata": {}, + "source": [ + "#### Start the simulation by specifying the network backend" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a8b6ecb", + "metadata": {}, + "outputs": [], + "source": [ + "astra.run_simulation(NetworkBackend.NS3)" + ] + }, + { + "cell_type": "markdown", + "id": "fd3caa52", + "metadata": {}, + "source": [ + "##### Download all the configurations as a zip" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12c3c1d4", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "astra.download_configuration()" + ] + }, + { + "cell_type": "markdown", + "id": "b9027fe6", + "metadata": {}, + "source": [ + "##### Read output files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a227cfd", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "df = pd.read_csv(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR, \"fct.csv\"))\n", + "df.head()\n", + "\n", + "df = pd.read_csv(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR, \"flow_stats.csv\"))\n", + "df.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/client-scripts/notebooks/load_existing_et_example.py 
b/client-scripts/notebooks/load_existing_et_example.py deleted file mode 100644 index 7607917..0000000 --- a/client-scripts/notebooks/load_existing_et_example.py +++ /dev/null @@ -1,131 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.19.1 -# kernelspec: -# display_name: Python 3 -# language: python -# name: python3 -# --- - -# %% [markdown] -# ##### Import the required modules and configure the system path to locate them - -# %% -import sys -import os -import pandas as pd -sys.path.append("../utils") -from common import FileFolderUtils -from astra_sim import AstraSim, NetworkBackend -from pathlib import Path - -# %% [markdown] -# ##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs. - -# %% -astra = AstraSim(server_endpoint="172.17.0.2:8989", tag="load_existing_et_example") - -# %% [markdown] -# ##### Add existing workload execution traces by giving the path to the workload with basename included, mandatory for AstraSim workload configuration. 
- -# %% -cwd = os.path.dirname(os.path.abspath(__file__)) if "__file__" in globals() else os.getcwd() -astra.configuration.common_config.workload = os.path.join(cwd, "../resources/example_workload/workload/all_reduce") -print(astra.configuration.common_config.workload) - -# %% [markdown] -# ##### Configure ASTRA-sim system config - -# %% -astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO -astra.configuration.common_config.system.endpoint_delay = 10 -astra.configuration.common_config.system.active_chunks_per_dimension = 1 -astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING] -astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT] -astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.ONERING] -astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE -astra.configuration.common_config.system.local_mem_bw = 1600 -print(astra.configuration.common_config.system) - -# %% [markdown] -# ##### Configure ASTRA-sim remote memory configuration - -# %% -astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION -print(astra.configuration.common_config.remote_memory) - -# %% [markdown] -# ##### Configure the network backend - -# %% -# astra.configuration.network_backend.choice = astra.configuration.network_backend.NS3 -astra.configuration.network_backend.ns3.network.packet_payload_size = int(8192) -astra.configuration.network_backend.ns3.logical_topology.logical_dimensions = [8] -astra.configuration.network_backend.ns3.trace.trace_ids = [0, 1, 2, 3,4 ,5 ,6, 7] -print("network backend choice set to:",astra.configuration.network_backend.ns3.topology.choice) 
-print(astra.configuration.network_backend.ns3.network.packet_payload_size) -print(astra.configuration.network_backend.ns3.logical_topology) -print(astra.configuration.network_backend.ns3.trace) - -# %% [markdown] -# ##### Configure the network topology - -# %% -# astra.configuration.network_backend.ns3.topology.choice = astra.configuration.network_backend.ns3.topology.NC_TOPOLOGY -# the topology configuration will be set automatically if we configure the nc_topology -astra.configuration.network_backend.ns3.topology.nc_topology.total_nodes = 9 -astra.configuration.network_backend.ns3.topology.nc_topology.total_switches = 1 -astra.configuration.network_backend.ns3.topology.nc_topology.total_links = 8 -astra.configuration.network_backend.ns3.topology.nc_topology.switch_ids = [8] -astra.configuration.network_backend.ns3.topology.nc_topology.connections.clear() -astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(0, 8, "100Gbps", "0.005ms", "0") -astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(1, 8, "100Gbps", "0.005ms", "0") -astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(2, 8, "100Gbps", "0.005ms", "0") -astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(3, 8, "100Gbps", "0.005ms", "0") -astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(4, 8, "100Gbps", "0.005ms", "0") -astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(5, 8, "100Gbps", "0.005ms", "0") -astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(6, 8, "100Gbps", "0.005ms", "0") -astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(7, 8, "100Gbps", "0.005ms", "0") -print(astra.configuration.network_backend.ns3.topology.choice) -print(astra.configuration.network_backend.ns3.topology.nc_topology) - - - -# %% [markdown] -# ##### Configure ASTRA-sim cmd parameters - -# %% 
-astra.configuration.common_config.cmd_parameters.comm_scale = 1 -astra.configuration.common_config.cmd_parameters.injection_scale = 1 -astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False - -print(astra.configuration.common_config.cmd_parameters) - -# %% [markdown] -# #### Start the simulation by specifying the network backend - -# %% -astra.run_simulation(NetworkBackend.NS3) - -# %% [markdown] -# ##### Download all the configurations as a zip - -# %% -astra.download_configuration() - - -# %% [markdown] -# ##### Read output files - -# %% - -df = pd.read_csv(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR, "fct.csv")) -df.head() - -df = pd.read_csv(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR, "flow_stats.csv")) -df.head() diff --git a/client-scripts/notebooks/ns3_sample.ipynb b/client-scripts/notebooks/ns3_sample.ipynb new file mode 100644 index 0000000..3d81c62 --- /dev/null +++ b/client-scripts/notebooks/ns3_sample.ipynb @@ -0,0 +1,273 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "cf24eda1", + "metadata": {}, + "source": [ + "##### Import the required modules and configure the system path to locate them" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "046d9232", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"../utils\")\n", + "import os\n", + "import pandas as pd\n", + "from common import FileFolderUtils\n", + "from astra_sim import AstraSim, Collective, NetworkBackend" + ] + }, + { + "cell_type": "markdown", + "id": "8bab4378", + "metadata": {}, + "source": [ + "##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7bc47b4e", + "metadata": {}, + "outputs": [], + "source": [ + "astra = AstraSim(server_endpoint=\"172.17.0.2:8989\", tag=\"ns3_sample\")" + ] + }, + { + "cell_type": "markdown", + "id": "877dfb68", + "metadata": {}, + "source": [ + "##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3580b8b5", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 8 *1024*1024, npu_range=[0,8])\n", + "print(astra.configuration.common_config.workload)" + ] + }, + { + "cell_type": "markdown", + "id": "355542c0", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim system config" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7b0259c1", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO\n", + "astra.configuration.common_config.system.endpoint_delay = 10\n", + "astra.configuration.common_config.system.active_chunks_per_dimension = 1\n", + "astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING]\n", + "astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT]\n", + "astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.ONERING]\n", + "astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE\n", + "astra.configuration.common_config.system.local_mem_bw = 1600\n", + "print(astra.configuration.common_config.system)" + ] + }, + { + "cell_type": "markdown", + "id": 
"5d361352", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim remote memory configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5721dbdc", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION\n", + "print(astra.configuration.common_config.remote_memory)" + ] + }, + { + "cell_type": "markdown", + "id": "bc30b6d8", + "metadata": {}, + "source": [ + "##### Configure the network backend" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "878495ee", + "metadata": {}, + "outputs": [], + "source": [ + "# astra.configuration.network_backend.choice = astra.configuration.network_backend.NS3\n", + "astra.configuration.network_backend.ns3.network.packet_payload_size = int(8192)\n", + "astra.configuration.network_backend.ns3.logical_topology.logical_dimensions = [8]\n", + "astra.configuration.network_backend.ns3.trace.trace_ids = [0, 1, 2, 3,4 ,5 ,6, 7]\n", + "print(\"network backend choice set to:\",astra.configuration.network_backend.ns3.topology.choice)\n", + "print(astra.configuration.network_backend.ns3.network.packet_payload_size)\n", + "print(astra.configuration.network_backend.ns3.logical_topology)\n", + "print(astra.configuration.network_backend.ns3.trace)" + ] + }, + { + "cell_type": "markdown", + "id": "a2b3d18f", + "metadata": {}, + "source": [ + "##### Set up the network topology" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab2c1fe2", + "metadata": {}, + "outputs": [], + "source": [ + "# astra.configuration.network_backend.ns3.topology.choice = astra.configuration.network_backend.ns3.topology.NC_TOPOLOGY\n", + "# the topology configuration will be set automatically if we configure the nc_topology\n", + "astra.configuration.network_backend.ns3.topology.nc_topology.total_nodes = 9\n", + 
"astra.configuration.network_backend.ns3.topology.nc_topology.total_switches = 1\n", + "astra.configuration.network_backend.ns3.topology.nc_topology.total_links = 8\n", + "astra.configuration.network_backend.ns3.topology.nc_topology.switch_ids = [8]\n", + "astra.configuration.network_backend.ns3.topology.nc_topology.connections.clear()\n", + "astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(0, 8, \"100Gbps\", \"0.005ms\", \"0\")\n", + "astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(1, 8, \"100Gbps\", \"0.005ms\", \"0\")\n", + "astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(2, 8, \"100Gbps\", \"0.005ms\", \"0\")\n", + "astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(3, 8, \"100Gbps\", \"0.005ms\", \"0\")\n", + "astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(4, 8, \"100Gbps\", \"0.005ms\", \"0\")\n", + "astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(5, 8, \"100Gbps\", \"0.005ms\", \"0\")\n", + "astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(6, 8, \"100Gbps\", \"0.005ms\", \"0\")\n", + "astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(7, 8, \"100Gbps\", \"0.005ms\", \"0\")\n", + "print(astra.configuration.network_backend.ns3.topology.choice)\n", + "print(astra.configuration.network_backend.ns3.topology.nc_topology)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "3aad02f4", + "metadata": {}, + "source": [ + "##### Configure ASTRA-sim cmd parameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0e32dd9", + "metadata": {}, + "outputs": [], + "source": [ + "astra.configuration.common_config.cmd_parameters.comm_scale = 1\n", + "astra.configuration.common_config.cmd_parameters.injection_scale = 1\n", + "astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False\n", + "\n", + 
"print(astra.configuration.common_config.cmd_parameters)" + ] + }, + { + "cell_type": "markdown", + "id": "667d96a8", + "metadata": {}, + "source": [ + "#### Start the simulation by specifying the network backend" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd777341", + "metadata": {}, + "outputs": [], + "source": [ + "astra.run_simulation(NetworkBackend.NS3)" + ] + }, + { + "cell_type": "markdown", + "id": "ff25abde", + "metadata": {}, + "source": [ + "##### Download all the configurations as a zip" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b7a10140", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "astra.download_configuration()" + ] + }, + { + "cell_type": "markdown", + "id": "5e2a281b", + "metadata": {}, + "source": [ + "##### Read output files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c05384f3", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR, \"fct.csv\"))\n", + "df.head()\n", + "df = pd.read_csv(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR, \"flow_stats.csv\"))\n", + "df.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/client-scripts/notebooks/ns3_sample.py b/client-scripts/notebooks/ns3_sample.py deleted file mode 100644 index 2938a33..0000000 --- a/client-scripts/notebooks/ns3_sample.py +++ /dev/null @@ -1,128 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 
1.19.1 -# kernelspec: -# display_name: Python 3 -# language: python -# name: python3 -# --- - -# %% [markdown] -# ##### Import the required modules and configure the system path to locate them - -# %% -import sys -sys.path.append("../utils") -from astra_sim import AstraSim, Collective, NetworkBackend - -# %% [markdown] -# ##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs. - -# %% -astra = AstraSim(server_endpoint="172.17.0.2:8989", tag="ns3_sample") - -# %% [markdown] -# ##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration - -# %% -astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 8 *1024*1024, npu_range=[0,8]) -print(astra.configuration.common_config.workload) - - -# %% [markdown] -# ##### Configure ASTRA-sim system config - -# %% -astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO -astra.configuration.common_config.system.endpoint_delay = 10 -astra.configuration.common_config.system.active_chunks_per_dimension = 1 -astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING] -astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT] -astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.ONERING] -astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE -astra.configuration.common_config.system.local_mem_bw = 1600 -print(astra.configuration.common_config.system) - -# %% [markdown] -# ##### Configure ASTRA-sim remote memory configuration - -# %% -astra.configuration.common_config.remote_memory.memory_type = 
astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION -print(astra.configuration.common_config.remote_memory) - -# %% [markdown] -# ##### Configure the network backend - -# %% -# astra.configuration.network_backend.choice = astra.configuration.network_backend.NS3 -astra.configuration.network_backend.ns3.network.packet_payload_size = int(8192) -astra.configuration.network_backend.ns3.logical_topology.logical_dimensions = [8] -astra.configuration.network_backend.ns3.trace.trace_ids = [0, 1, 2, 3,4 ,5 ,6, 7] -print("network backend choice set to:",astra.configuration.network_backend.ns3.topology.choice) -print(astra.configuration.network_backend.ns3.network.packet_payload_size) -print(astra.configuration.network_backend.ns3.logical_topology) -print(astra.configuration.network_backend.ns3.trace) - -# %% [markdown] -# ##### Set up the network topology - -# %% -# astra.configuration.network_backend.ns3.topology.choice = astra.configuration.network_backend.ns3.topology.NC_TOPOLOGY -# the topology configuration will be set automatically if we configure the nc_topology -astra.configuration.network_backend.ns3.topology.nc_topology.total_nodes = 9 -astra.configuration.network_backend.ns3.topology.nc_topology.total_switches = 1 -astra.configuration.network_backend.ns3.topology.nc_topology.total_links = 8 -astra.configuration.network_backend.ns3.topology.nc_topology.switch_ids = [8] -astra.configuration.network_backend.ns3.topology.nc_topology.connections.clear() -astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(0, 8, "100Gbps", "0.005ms", "0") -astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(1, 8, "100Gbps", "0.005ms", "0") -astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(2, 8, "100Gbps", "0.005ms", "0") -astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(3, 8, "100Gbps", "0.005ms", "0") 
-astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(4, 8, "100Gbps", "0.005ms", "0") -astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(5, 8, "100Gbps", "0.005ms", "0") -astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(6, 8, "100Gbps", "0.005ms", "0") -astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(7, 8, "100Gbps", "0.005ms", "0") -print(astra.configuration.network_backend.ns3.topology.choice) -print(astra.configuration.network_backend.ns3.topology.nc_topology) - - - -# %% [markdown] -# ##### Configure ASTRA-sim cmd parameters - -# %% -astra.configuration.common_config.cmd_parameters.comm_scale = 1 -astra.configuration.common_config.cmd_parameters.injection_scale = 1 -astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False - -print(astra.configuration.common_config.cmd_parameters) - -# %% [markdown] -# #### Start the simulation by specifying the network backend - -# %% -astra.run_simulation(NetworkBackend.NS3) - -# %% [markdown] -# ##### Download all the configurations as a zip - -# %% -astra.download_configuration() - - -# %% [markdown] -# ##### Read output files - -# %% -import pandas as pd -import os -from common import FileFolderUtils -df = pd.read_csv(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR, "fct.csv")) -df.head() -df = pd.read_csv(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR, "flow_stats.csv")) -df.head() diff --git a/client-scripts/utils/common.py b/client-scripts/utils/common.py index b0b35d3f..a9a8662 100644 --- a/client-scripts/utils/common.py +++ b/client-scripts/utils/common.py @@ -236,7 +236,7 @@ def ns3_flow_statistics(): """ df = pd.read_csv( os.path.join(FileFolderUtils().OUTPUT_DIR, "fct.txt"), - delim_whitespace=True, + sep=r"\s+", header=None, ) @@ -273,7 +273,7 @@ def ns3_fct_csv(): """ df = pd.read_csv( os.path.join(FileFolderUtils().OUTPUT_DIR, "fct.txt"), - delim_whitespace=True, + 
sep=r"\s+", header=None, ) df.columns = [ diff --git a/models/schema/api/api.yaml b/models/schema/api/api.yaml index bd9c906..b0b35d30 100644 --- a/models/schema/api/api.yaml +++ b/models/schema/api/api.yaml @@ -1,7 +1,7 @@ openapi: 3.0.3 info: title: ASTRA-sim APIs - version: 1.2.2 + version: 1.3.0 description: The ASTRA-sim API provides a standardized interface for submitting configurations, executing jobs, checking job status, and retrieving results. license: name: MIT diff --git a/service/Makefile b/service/Makefile index 20705d3..9dd958c 100644 --- a/service/Makefile +++ b/service/Makefile @@ -5,7 +5,6 @@ BRANCH ?= $(shell git rev-parse --abbrev-ref HEAD) PACKAGE_NAME := astra_server ASTRA_SIM_SERVICE_IMAGE := astra_sim_service GENERATED_VERSION := $(shell cat ../.VERSION) -NOTEBOOK_DIR := ../client-scripts/notebooks export PATH:=$(PY_PATHS):$(PATH) help: @@ -60,15 +59,13 @@ version: ## Generate build version .PHONY: pre-test pre-test: rm -rf tests/test-notebook - find $(NOTEBOOK_DIR) -name "*.py" -exec jupytext --to notebook {} \; python3 convert_nb_to_script.py cp -r ../client-scripts/resources tests - rm -f $(NOTEBOOK_DIR)/*.ipynb - rm -f $(NOTEBOOK_DIR)/infragraph/*.ipynb .PHONY: post-test post-test: rm -rf tests/resources + rm -rf tests/test-notebook .PHONY: build build: clean version diff --git a/service/convert_nb_to_script.py b/service/convert_nb_to_script.py index 42124c8..fe5e6d6 100644 --- a/service/convert_nb_to_script.py +++ b/service/convert_nb_to_script.py @@ -5,9 +5,8 @@ import nbformat import textwrap -ignore_notebooks = { - "config_to_schema_sample" -} +ignore_notebooks = {"config_to_schema_sample"} + def wrap_notebook_in_function(input_nb_path: Path, output_py_path: Path, function_name: str): """ @@ -29,6 +28,7 @@ def wrap_notebook_in_function(input_nb_path: Path, output_py_path: Path, functio cleaned_script = "\n".join(cleaned_code) cleaned_script = re.sub(r"\n\s*\n+", "\n\n", cleaned_script) + cleaned_script = re.sub(r"# 
VISUALIZER_START[\s\S]*?# VISUALIZER_END", "", cleaned_script) cleaned_script = re.sub( r"sys\.path\.append\s*\(.*?\)", 'sys.path.append("../client-scripts/utils")\nsys.path.append("../../client-scripts/utils")\nsys.path.append("./client-scripts/utils")', diff --git a/service/tests/test-notebook/test_analytical_congestion_aware_sample.py b/service/tests/test-notebook/test_analytical_congestion_aware_sample.py deleted file mode 100644 index 5df6c53..0000000 --- a/service/tests/test-notebook/test_analytical_congestion_aware_sample.py +++ /dev/null @@ -1,65 +0,0 @@ -def test_analytical_congestion_aware_sample(port_number): - - try: - - # ##### Import the required modules and configure the system path to locate them - - import sys - sys.path.append("../client-scripts/utils") - sys.path.append("../../client-scripts/utils") - sys.path.append("./client-scripts/utils") - from astra_sim import AstraSim, Collective, NetworkBackend - - # ##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs. 
- - astra = AstraSim(f"0.0.0.0:{port_number}", tag = "analytical_congestion_aware_sample") - - # ##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration - - astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLTOALL, coll_size= 8*1024*1024, npu_range=[0,8]) - print(astra.configuration.common_config.workload) - - # ##### Configure ASTRA-sim system config - - astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO - astra.configuration.common_config.system.endpoint_delay = 10 - astra.configuration.common_config.system.active_chunks_per_dimension = 1 - astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING] - astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT] - astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.RING] - astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE - astra.configuration.common_config.system.local_mem_bw = 1600 - print(astra.configuration.common_config.system) - - # ##### Configure ASTRA-sim remote memory configuration - - astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION - print(astra.configuration.common_config.remote_memory) - - # ##### Configure the network backend and topology - - # astra.configuration.network_backend.choice = astra.configuration.network_backend.ANALYTICAL_CONGESTION_AWARE - astra.configuration.network_backend.analytical_congestion_aware.topology.network.clear() - astra.configuration.network_backend.analytical_congestion_aware.topology.network.add("switch", 8, 100, 0.005) # add(type_of_topology, number_of_nodes, bandwidth_in_gbps, latency_in_ns) 
- print("Network backend set to", astra.configuration.network_backend.choice) - print("network backend choice set to:",astra.configuration.network_backend.analytical_congestion_aware.topology.choice) - - # ##### Configure ASTRA-sim cmd parameters - - astra.configuration.common_config.cmd_parameters.comm_scale = 1 - astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False - astra.configuration.common_config.cmd_parameters.injection_scale = 1 - - print(astra.configuration.network_backend.analytical_congestion_aware.topology.network) - - # #### Start the simulation by specifying the network backend - - astra.run_simulation(NetworkBackend.ANALYTICAL_CONGESTION_AWARE) - - # ##### Download all the configurations as a zip - - astra.download_configuration() - - assert True - except Exception as e: - assert False, f'Unexpected exception: {e}' diff --git a/service/tests/test-notebook/test_analytical_congestion_unaware_sample.py b/service/tests/test-notebook/test_analytical_congestion_unaware_sample.py deleted file mode 100644 index d29c07b..0000000 --- a/service/tests/test-notebook/test_analytical_congestion_unaware_sample.py +++ /dev/null @@ -1,63 +0,0 @@ -def test_analytical_congestion_unaware_sample(port_number): - - try: - - # ##### Import the required modules and configure the system path to locate them - - import sys - sys.path.append("../client-scripts/utils") - sys.path.append("../../client-scripts/utils") - sys.path.append("./client-scripts/utils") - from astra_sim import AstraSim, Collective, NetworkBackend - - # ##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs. 
- - astra = AstraSim(f"0.0.0.0:{port_number}", tag = "analytical_congestion_unaware_sample") - - # ##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration - - astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 1024*1024*1024, npu_range=[0,8]) - print(astra.configuration.common_config.workload) - - # ##### Configure ASTRA-sim system config - - astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO - astra.configuration.common_config.system.endpoint_delay = 10 - astra.configuration.common_config.system.active_chunks_per_dimension = 1 - astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING] - astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT] - astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.RING] - astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE - astra.configuration.common_config.system.local_mem_bw = 1600 - print(astra.configuration.common_config.system) - - # ##### Configure ASTRA-sim remote memory configuration - - astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION - print(astra.configuration.common_config.remote_memory) - - # ##### Configure the network backend and topology - - # astra.configuration.network_backend.choice = astra.configuration.network_backend.ANALYTICAL_CONGESTION_UNAWARE - astra.configuration.network_backend.analytical_congestion_aware.topology.network.clear() - astra.configuration.network_backend.analytical_congestion_unaware.topology.network.add("fullyconnected", 8, 100, 0.005) # add(type_of_topology, number_of_nodes, 
bandwidth_in_gbps, latency_in_ns) - print("Network backend set to", astra.configuration.network_backend.choice) - print("network backend choice set to:",astra.configuration.network_backend.analytical_congestion_unaware.topology.choice) - - # ##### Configure ASTRA-sim cmd parameters - - astra.configuration.common_config.cmd_parameters.comm_scale = 1 - astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False - astra.configuration.common_config.cmd_parameters.injection_scale = 1 - - # #### Start the simulation by specifying the network backend - - astra.run_simulation(NetworkBackend.ANALYTICAL_CONGESTION_UNAWARE) - - # ##### Download all the configurations as a zip - - astra.download_configuration() - - assert True - except Exception as e: - assert False, f'Unexpected exception: {e}' diff --git a/service/tests/test-notebook/test_analytical_dgx_device.py b/service/tests/test-notebook/test_analytical_dgx_device.py deleted file mode 100644 index 9fbbf27..0000000 --- a/service/tests/test-notebook/test_analytical_dgx_device.py +++ /dev/null @@ -1,100 +0,0 @@ -def test_analytical_dgx_device(port_number): - - try: - - # ##### Import the required modules and configure the system path to locate them - - import sys - sys.path.append("../client-scripts/utils") - sys.path.append("../../client-scripts/utils") - sys.path.append("./client-scripts/utils") - import networkx - import astra_sim_sdk.astra_sim_sdk as astra_sim_kit - from astra_sim import AstraSim, Collective, NetworkBackend - from infragraph.infragraph_service import InfraGraphService - from infragraph.blueprints.devices.nvidia.dgx import NvidiaDGX, DgxProfile - from infragraph import Infrastructure - - # ##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs - - astra = AstraSim(f"0.0.0.0:{port_number}", tag = "analytical_dgx_device") - - # ##### Get all available DGX 
variants - - from typing import get_args - print(get_args(DgxProfile)) - - # ##### Create a Nvidia DGX device fabric using infragraph device blueprint - - server = NvidiaDGX() - infrastructure = Infrastructure() - infrastructure.devices.append(server) - infrastructure.instances.add(name=server.name, device=server.name, count=1) - astra.configuration.infragraph.infrastructure.deserialize(infrastructure.serialize()) - print(astra.configuration.infragraph.infrastructure) - - # ##### Initialize the Infragraph service, display the fabric topology, and retrieve/set the total number of NPUs to generate the collective - - service = InfraGraphService() - service.set_graph(infrastructure) - total_npus = service.get_component(device=server, type="xpu").count - g = service.get_networkx_graph() - print(networkx.write_network_text(g, vertical_chains=True)) - - # ##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration - - astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 1 *1024*1024, npu_range=[0, total_npus]) - - # ##### Configure ASTRA-sim system config - - astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO - astra.configuration.common_config.system.endpoint_delay = 10 - astra.configuration.common_config.system.active_chunks_per_dimension = 1 - astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING] - astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT] - astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.ONERING] - astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE - astra.configuration.common_config.system.local_mem_bw = 1600 - - # ##### Configure 
ASTRA-sim remote memory configuration - - astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION - print(astra.configuration.common_config.remote_memory) - - # ##### Set ASTRA-sim network backend to ANALYTICAL_CONGESTION_AWARE - - astra.configuration.network_backend.analytical_congestion_aware.topology.choice = astra.configuration.network_backend.analytical_congestion_aware.topology.INFRAGRAPH - - # ##### Adding ASTRA-sim - Infragraph specific annotation - - host_device_spec = astra_sim_kit.AnnotationDeviceSpecifications() - host_device_spec.device_bandwidth_gbps = 100 - host_device_spec.device_latency_ms = 0.05 - host_device_spec.device_name = server.name - host_device_spec.device_type = "host" - astra.configuration.infragraph.annotations.device_specifications.append(host_device_spec) - - # ##### Configure ASTRA-sim cmd parameters - - astra.configuration.common_config.cmd_parameters.comm_scale = 1 - astra.configuration.common_config.cmd_parameters.injection_scale = 1 - astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False - - # #### Start the simulation by specifying the network backend - - astra.run_simulation(NetworkBackend.ANALYTICAL_CONGESTION_AWARE) - - # ##### Save infragraph as a yaml - - import yaml - import os - from common import FileFolderUtils - with open(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"../infrastructure","analytical_dgx_device.yaml"),"w") as f: - data = infrastructure.serialize("dict") - yaml.dump(data, f, default_flow_style=False, indent=4) - - print("saved yaml to:", os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"..","analytical_dgx_device.yaml")) - - assert True - except Exception as e: - assert False, f'Unexpected exception: {e}' diff --git a/service/tests/test-notebook/test_analytical_ironwood_rack.py b/service/tests/test-notebook/test_analytical_ironwood_rack.py deleted file mode 100644 index f685470..0000000 --- 
a/service/tests/test-notebook/test_analytical_ironwood_rack.py +++ /dev/null @@ -1,100 +0,0 @@ -def test_analytical_ironwood_rack(port_number): - - try: - - # ##### Import the required modules and configure the system path to locate them - - import sys - sys.path.append("../client-scripts/utils") - sys.path.append("../../client-scripts/utils") - sys.path.append("./client-scripts/utils") - import networkx - import astra_sim_sdk.astra_sim_sdk as astra_sim_kit - from astra_sim import AstraSim, Collective, NetworkBackend - from infragraph.infragraph_service import InfraGraphService - from infragraph.blueprints.devices.ironwood_rack import IronwoodRack - from infragraph import Infrastructure - - # ##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs - - astra = AstraSim(f"0.0.0.0:{port_number}", tag = "analytical_ironwood_rack") - - # ##### Create a ironwood rack device fabric using infragraph device blueprint - - server = IronwoodRack() - infrastructure = Infrastructure() - infrastructure.devices.append(server) - - infrastructure.instances.add(name=server.name, device=server.name, count=1) - astra.configuration.infragraph.infrastructure.deserialize(infrastructure.serialize()) - print(astra.configuration.infragraph.infrastructure) - - # ##### Initialize the Infragraph service, display the fabric topology, and retrieve/set the total number of NPUs to generate the collective - - service = InfraGraphService() - service.set_graph(infrastructure) - total_npus = service.get_component(device=server, type="xpu").count - g = service.get_networkx_graph() - print(networkx.write_network_text(g, vertical_chains=True)) - - # ##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration - - astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, 
coll_size= 1 *1024*1024, npu_range=[0, total_npus]) - - # ##### Configure ASTRA-sim system config - - astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO - astra.configuration.common_config.system.endpoint_delay = 10 - astra.configuration.common_config.system.active_chunks_per_dimension = 1 - astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING, astra.configuration.common_config.system.RING, astra.configuration.common_config.system.RING] - astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.RING, astra.configuration.common_config.system.RING, astra.configuration.common_config.system.RING] - astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.RING, astra.configuration.common_config.system.RING, astra.configuration.common_config.system.RING] - astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE - astra.configuration.common_config.system.local_mem_bw = 1600 - - # ##### Configure ASTRA-sim remote memory configuration - - astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION - print(astra.configuration.common_config.remote_memory) - - # ##### Configure network backend to ANALYTICAL_CONGESTION_AWARE - - astra.configuration.network_backend.analytical_congestion_unaware.topology.choice = astra.configuration.network_backend.analytical_congestion_unaware.topology.INFRAGRAPH - - # ##### Adding ASTRA-sim - Infragraph specific annotation - - host_device_spec = astra_sim_kit.AnnotationDeviceSpecifications() - host_device_spec.device_bandwidth_gbps = 100 - host_device_spec.device_latency_ms = 0.05 - host_device_spec.device_name = server.name - host_device_spec.device_type = "host" - 
astra.configuration.infragraph.annotations.device_specifications.append(host_device_spec) - - # ##### Configure ASTRA-sim cmd parameters - - astra.configuration.common_config.cmd_parameters.comm_scale = 1 - astra.configuration.common_config.cmd_parameters.injection_scale = 1 - astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False - - # #### Start the simulation by specifying the network backend - - astra.run_simulation(NetworkBackend.ANALYTICAL_CONGESTION_UNAWARE) - - # ##### Download all the configurations as a zip - - astra.download_configuration() - - # ##### Save infragraph as a yaml - - import yaml - import os - from common import FileFolderUtils - with open(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"../infrastructure","analytical_dgx_device.yaml"),"w") as f: - data = infrastructure.serialize("dict") - yaml.dump(data, f, default_flow_style=False, indent=4) - - print("saved yaml to:", os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"..","analytical_dgx_device.yaml")) - - assert True - except Exception as e: - assert False, f'Unexpected exception: {e}' diff --git a/service/tests/test-notebook/test_htsim_clos_fabric_2tier.py b/service/tests/test-notebook/test_htsim_clos_fabric_2tier.py deleted file mode 100644 index 4bb2f43..0000000 --- a/service/tests/test-notebook/test_htsim_clos_fabric_2tier.py +++ /dev/null @@ -1,121 +0,0 @@ -def test_htsim_clos_fabric_2tier(port_number): - - try: - - # ##### Import the required modules and configure the system path to locate them - - import sys - sys.path.append("../client-scripts/utils") - sys.path.append("../../client-scripts/utils") - sys.path.append("./client-scripts/utils") - import networkx - import astra_sim_sdk.astra_sim_sdk as astra_sim_kit - from astra_sim import AstraSim, Collective, NetworkBackend - from infragraph.infragraph_service import InfraGraphService - from infragraph.blueprints.fabrics.clos_fat_tree_fabric import ClosFatTreeFabric - from 
infragraph.blueprints.devices.generic.server import Server - from infragraph.blueprints.devices.generic.generic_switch import Switch - - # ##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs. - - astra = AstraSim(f"0.0.0.0:{port_number}", tag = "htsim_clos_fabric_2tier") - - # ##### Create a two-tier clos fabric using infragraph fabric blueprint - - server = Server() - switch = Switch(port_count=8) - infrastructure = ClosFatTreeFabric(switch, server, 2,[]) - astra.configuration.infragraph.infrastructure.deserialize(infrastructure.serialize()) - print(astra.configuration.infragraph.infrastructure) - - # ##### Initialize the Infragraph service, display the fabric topology, and retrieve/set the total number of NPUs to generate the collective - - service = InfraGraphService() - service.set_graph(infrastructure) - g = service.get_networkx_graph() - print(networkx.write_network_text(g, vertical_chains=True)) - total_npus = 16 - - # ##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration - - astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 1 *1024*1024, npu_range=[0, total_npus]) - - # ##### Configure ASTRA-sim system config - - astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO - astra.configuration.common_config.system.endpoint_delay = 10 - astra.configuration.common_config.system.active_chunks_per_dimension = 1 - astra.configuration.common_config.system.preferred_dataset_splits = 4 - astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING] - astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT] - 
astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.RING] - astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE - astra.configuration.common_config.system.local_mem_bw = 1600 - astra.configuration.common_config.system.peak_perf = 900 - astra.configuration.common_config.system.roofline_enabled = 0 - print(astra.configuration.common_config.system) - - # ##### Configure the remote memory configuration - - astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION - print(astra.configuration.common_config.remote_memory) - - # ##### Configure the selected network backend and the topology (infragraph or network_topology_configuration) - - astra.configuration.network_backend.choice = astra.configuration.network_backend.HTSIM - astra.configuration.network_backend.htsim.topology.choice = astra.configuration.network_backend.htsim.topology.INFRAGRAPH - - # ##### Select htsim protocol - - astra.configuration.network_backend.htsim.htsim_protocol.choice = astra.configuration.network_backend.htsim.htsim_protocol.TCP - print("Network backend set to", astra.configuration.network_backend.choice) - print("network topology choice set to:",astra.configuration.network_backend.htsim.topology.choice) - print("protocol set to", astra.configuration.network_backend.htsim.htsim_protocol) - astra.configuration.network_backend.htsim.htsim_protocol.tcp.nodes = str(total_npus) - - # ##### Adding ASTRA-sim - Infragraph specific annotation - - host_device_spec = astra_sim_kit.AnnotationDeviceSpecifications() - host_device_spec.device_bandwidth_gbps = 1000 - host_device_spec.device_latency_ms = 0.005 - host_device_spec.device_name = "server" - host_device_spec.device_type = "host" - astra.configuration.infragraph.annotations.device_specifications.append(host_device_spec) - - switch_device_spec = 
astra_sim_kit.AnnotationDeviceSpecifications() - switch_device_spec.device_bandwidth_gbps = 1000 - switch_device_spec.device_latency_ms = 0.005 - switch_device_spec.device_name = "switch" - switch_device_spec.device_type = "switch" - astra.configuration.infragraph.annotations.device_specifications.append( - switch_device_spec - ) - - # ##### Configure ASTRA-sim cmd parameters - - astra.configuration.common_config.cmd_parameters.comm_scale = 1 - astra.configuration.common_config.cmd_parameters.injection_scale = 1 - astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False - - # #### Start the simulation by specifying the network backend - - astra.run_simulation(NetworkBackend.HTSIM) - - # ##### Download all the configurations as a zip - - astra.download_configuration() - - # ##### Save infragraph as a yaml - - import yaml - import os - from common import FileFolderUtils - with open(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"../infrastructure","htsim_clos_fabric_2tier.yaml"),"w") as f: - data = infrastructure.serialize("dict") - yaml.dump(data, f, default_flow_style=False, indent=4) - - print("saved yaml to:", os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"..","htsim_clos_fabric_2tier.yaml")) - - assert True - except Exception as e: - assert False, f'Unexpected exception: {e}' diff --git a/service/tests/test-notebook/test_htsim_clos_fabric_3tier.py b/service/tests/test-notebook/test_htsim_clos_fabric_3tier.py deleted file mode 100644 index 91ff932..0000000 --- a/service/tests/test-notebook/test_htsim_clos_fabric_3tier.py +++ /dev/null @@ -1,121 +0,0 @@ -def test_htsim_clos_fabric_3tier(port_number): - - try: - - # ##### Import the required modules and configure the system path to locate them - - import sys - sys.path.append("../client-scripts/utils") - sys.path.append("../../client-scripts/utils") - sys.path.append("./client-scripts/utils") - import networkx - import astra_sim_sdk.astra_sim_sdk as astra_sim_kit - from 
astra_sim import AstraSim, Collective, NetworkBackend - from infragraph.infragraph_service import InfraGraphService - from infragraph.blueprints.fabrics.clos_fat_tree_fabric import ClosFatTreeFabric - from infragraph.blueprints.devices.generic.server import Server - from infragraph.blueprints.devices.generic.generic_switch import Switch - - # ##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs - - astra = AstraSim(f"0.0.0.0:{port_number}", tag = "htsim_clos_fabric_3tier") - - # ##### Create a three-tier clos fabric using infragraph fabric blueprint - - server = Server() - switch = Switch(port_count=8) - infrastructure = ClosFatTreeFabric(switch, server, 3,[]) - astra.configuration.infragraph.infrastructure.deserialize(infrastructure.serialize()) - print(astra.configuration.infragraph.infrastructure) - - # ##### Initialize the Infragraph service, display the fabric topology, and retrieve/set the total number of NPUs to generate the collective - - service = InfraGraphService() - service.set_graph(infrastructure) - g = service.get_networkx_graph() - print(networkx.write_network_text(g, vertical_chains=True)) - total_npus = 64 - - # ##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration - - astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 1 *1024*1024, npu_range=[0, total_npus]) - - # ##### Configure ASTRA-sim system config - - astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO - astra.configuration.common_config.system.endpoint_delay = 10 - astra.configuration.common_config.system.active_chunks_per_dimension = 1 - astra.configuration.common_config.system.preferred_dataset_splits = 4 - astra.configuration.common_config.system.all_gather_implementation = 
[astra.configuration.common_config.system.RING] - astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT] - astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.RING] - astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE - astra.configuration.common_config.system.local_mem_bw = 1600 - astra.configuration.common_config.system.peak_perf = 900 - astra.configuration.common_config.system.roofline_enabled = 0 - print(astra.configuration.common_config.system) - - # ##### Configure ASTRA-sim remote memory configuration - - astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION - print(astra.configuration.common_config.remote_memory) - - # ##### Configure the selected network backend and the topology (infragraph or network_topology_configuration) - - astra.configuration.network_backend.choice = astra.configuration.network_backend.HTSIM - astra.configuration.network_backend.htsim.topology.choice = astra.configuration.network_backend.htsim.topology.INFRAGRAPH - - # ##### Select htsim protocol - - astra.configuration.network_backend.htsim.htsim_protocol.choice = astra.configuration.network_backend.htsim.htsim_protocol.TCP - print("Network backend set to", astra.configuration.network_backend.choice) - print("network topology choice set to:",astra.configuration.network_backend.htsim.topology.choice) - print("protocol set to", astra.configuration.network_backend.htsim.htsim_protocol) - astra.configuration.network_backend.htsim.htsim_protocol.tcp.nodes = str(total_npus) - - # ##### Adding ASTRA-sim - Infragraph specific annotation - - host_device_spec = astra_sim_kit.AnnotationDeviceSpecifications() - host_device_spec.device_bandwidth_gbps = 1000 - host_device_spec.device_latency_ms = 0.005 - host_device_spec.device_name 
= "server" - host_device_spec.device_type = "host" - astra.configuration.infragraph.annotations.device_specifications.append(host_device_spec) - - switch_device_spec = astra_sim_kit.AnnotationDeviceSpecifications() - switch_device_spec.device_bandwidth_gbps = 1000 - switch_device_spec.device_latency_ms = 0.005 - switch_device_spec.device_name = "switch" - switch_device_spec.device_type = "switch" - astra.configuration.infragraph.annotations.device_specifications.append( - switch_device_spec - ) - - # ##### Configure ASTRA-sim cmd parameters - - astra.configuration.common_config.cmd_parameters.comm_scale = 1 - astra.configuration.common_config.cmd_parameters.injection_scale = 1 - astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False - - # #### Start the simulation by specifying the network backend - - astra.run_simulation(NetworkBackend.HTSIM) - - # ##### Download all the configurations as a zip - - astra.download_configuration() - - # ##### Save infragraph as a yaml - - import yaml - import os - from common import FileFolderUtils - with open(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"../infrastructure","htsim_clos_fabric_3tier.yaml"),"w") as f: - data = infrastructure.serialize("dict") - yaml.dump(data, f, default_flow_style=False, indent=4) - - print("saved yaml to:", os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"..","htsim_clos_fabric_3tier.yaml")) - - assert True - except Exception as e: - assert False, f'Unexpected exception: {e}' diff --git a/service/tests/test-notebook/test_htsim_sample.py b/service/tests/test-notebook/test_htsim_sample.py deleted file mode 100644 index 9309d71..0000000 --- a/service/tests/test-notebook/test_htsim_sample.py +++ /dev/null @@ -1,97 +0,0 @@ -def test_htsim_sample(port_number): - - try: - - # ##### Import the required modules and configure the system path to locate them - - import sys - sys.path.append("../client-scripts/utils") - sys.path.append("../../client-scripts/utils") - 
sys.path.append("./client-scripts/utils") - from astra_sim import AstraSim, Collective, NetworkBackend - - # ##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs. - - astra = AstraSim(f"0.0.0.0:{port_number}",tag = "htsim_sample") - - # ##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration - - astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 1 * 1024 * 1024, npu_range=[0, 8]) - print(astra.configuration.common_config.workload) - - # ##### Configure ASTRA-sim system config - - astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO - astra.configuration.common_config.system.endpoint_delay = 10 - astra.configuration.common_config.system.active_chunks_per_dimension = 1 - astra.configuration.common_config.system.preferred_dataset_splits = 4 - astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING] - astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT] - astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.RING] - astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE - astra.configuration.common_config.system.local_mem_bw = 1600 - astra.configuration.common_config.system.peak_perf = 900 - astra.configuration.common_config.system.roofline_enabled = 0 - print(astra.configuration.common_config.system) - - # ##### Configure ASTRA-sim remote memory configuration - - astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION - 
print(astra.configuration.common_config.remote_memory) - - # ##### Configure the network backend, htsim protocol and topology - - astra.configuration.network_backend.htsim.topology.network_topology_configuration.network.clear() - astra.configuration.network_backend.htsim.topology.network_topology_configuration.network.add("ring", 8, 100, 0.005) # add(type_of_topology, number_of_nodes, bandwidth_in_gbps, latency_in_ns) - astra.configuration.network_backend.htsim.htsim_protocol.choice = astra.configuration.network_backend.htsim.htsim_protocol.TCP - print("Network backend set to", astra.configuration.network_backend.choice) - print("network backend choice set to:",astra.configuration.network_backend.htsim.topology.choice) - print("protocol set to", astra.configuration.network_backend.htsim.htsim_protocol.choice) - - # ##### Configure the fat tree topology. - - # Configuring topo file - astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.nodes = 8 - astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.podsize = 4 - astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tiers = 3 - - # Configuring values for each tiers - # Configuring values for tier 0 - astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_0.set( - downlink_speed_gbps=200 - ) - astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_0.radix_down = 2 - astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_0.radix_up = 2 - astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_0.downlink_latency_ns = 1000 - - # Configuring values for tier 1 - 
astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_1.set( - downlink_speed_gbps=200 - ) - astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_1.radix_down = 2 - astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_1.radix_up = 4 - astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_1.downlink_latency_ns = 1000 - astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_1.bundle = 1 - - # Configuring values for tier 2 - astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_2.set( - downlink_speed_gbps=100 - ) - astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_2.radix_down = 4 - astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_2.downlink_latency_ns = 1000 - astra.configuration.network_backend.htsim.topology.network_topology_configuration.htsim_topology.fat_tree.tier_2.bundle = 2 - - astra.configuration.network_backend.htsim.htsim_protocol.tcp.nodes = "8" - print(astra.configuration.network_backend.htsim.topology.network_topology_configuration) - - # #### Start the simulation by specifying the network backend - - astra.run_simulation(NetworkBackend.HTSIM) - - # ##### Download all the configurations as a zip - - astra.download_configuration() - - assert True - except Exception as e: - assert False, f'Unexpected exception: {e}' diff --git a/service/tests/test-notebook/test_load_existing_et_example.py b/service/tests/test-notebook/test_load_existing_et_example.py deleted file mode 100644 index 077a245..0000000 --- a/service/tests/test-notebook/test_load_existing_et_example.py +++ /dev/null @@ -1,101 +0,0 @@ 
-def test_load_existing_et_example(port_number): - - try: - - # ##### Import the required modules and configure the system path to locate them - - import sys - import os - import pandas as pd - sys.path.append("../client-scripts/utils") - sys.path.append("../../client-scripts/utils") - sys.path.append("./client-scripts/utils") - from common import FileFolderUtils - from astra_sim import AstraSim, NetworkBackend - from pathlib import Path - - # ##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs. - - astra = AstraSim(f"0.0.0.0:{port_number}", tag="load_existing_et_example") - - # ##### Add existing workload execution traces by giving the path to the workload with basename included, mandatory for AstraSim workload configuration. - - cwd = os.path.dirname(os.path.abspath(__file__)) if "__file__" in globals() else os.getcwd() - astra.configuration.common_config.workload = os.path.join(cwd, "../resources/example_workload/workload/all_reduce") - print(astra.configuration.common_config.workload) - - # ##### Configure ASTRA-sim system config - - astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO - astra.configuration.common_config.system.endpoint_delay = 10 - astra.configuration.common_config.system.active_chunks_per_dimension = 1 - astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING] - astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT] - astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.ONERING] - astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE - astra.configuration.common_config.system.local_mem_bw = 1600 - 
print(astra.configuration.common_config.system) - - # ##### Configure ASTRA-sim remote memory configuration - - astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION - print(astra.configuration.common_config.remote_memory) - - # ##### Configure the network backend - - # astra.configuration.network_backend.choice = astra.configuration.network_backend.NS3 - astra.configuration.network_backend.ns3.network.packet_payload_size = int(8192) - astra.configuration.network_backend.ns3.logical_topology.logical_dimensions = [8] - astra.configuration.network_backend.ns3.trace.trace_ids = [0, 1, 2, 3,4 ,5 ,6, 7] - print("network backend choice set to:",astra.configuration.network_backend.ns3.topology.choice) - print(astra.configuration.network_backend.ns3.network.packet_payload_size) - print(astra.configuration.network_backend.ns3.logical_topology) - print(astra.configuration.network_backend.ns3.trace) - - # ##### Configure the network topology - - # astra.configuration.network_backend.ns3.topology.choice = astra.configuration.network_backend.ns3.topology.NC_TOPOLOGY - # the topology configuration will be set automatically if we configure the nc_topology - astra.configuration.network_backend.ns3.topology.nc_topology.total_nodes = 9 - astra.configuration.network_backend.ns3.topology.nc_topology.total_switches = 1 - astra.configuration.network_backend.ns3.topology.nc_topology.total_links = 8 - astra.configuration.network_backend.ns3.topology.nc_topology.switch_ids = [8] - astra.configuration.network_backend.ns3.topology.nc_topology.connections.clear() - astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(0, 8, "100Gbps", "0.005ms", "0") - astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(1, 8, "100Gbps", "0.005ms", "0") - astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(2, 8, "100Gbps", "0.005ms", "0") - 
astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(3, 8, "100Gbps", "0.005ms", "0") - astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(4, 8, "100Gbps", "0.005ms", "0") - astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(5, 8, "100Gbps", "0.005ms", "0") - astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(6, 8, "100Gbps", "0.005ms", "0") - astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(7, 8, "100Gbps", "0.005ms", "0") - print(astra.configuration.network_backend.ns3.topology.choice) - print(astra.configuration.network_backend.ns3.topology.nc_topology) - - # ##### Configure ASTRA-sim cmd parameters - - astra.configuration.common_config.cmd_parameters.comm_scale = 1 - astra.configuration.common_config.cmd_parameters.injection_scale = 1 - astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False - - print(astra.configuration.common_config.cmd_parameters) - - # #### Start the simulation by specifying the network backend - - astra.run_simulation(NetworkBackend.NS3) - - # ##### Download all the configurations as a zip - - astra.download_configuration() - - # ##### Read output files - - df = pd.read_csv(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR, "fct.csv")) - df.head() - - df = pd.read_csv(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR, "flow_stats.csv")) - df.head() - - assert True - except Exception as e: - assert False, f'Unexpected exception: {e}' diff --git a/service/tests/test-notebook/test_ns3_clos_fabric_2tier.py b/service/tests/test-notebook/test_ns3_clos_fabric_2tier.py deleted file mode 100644 index ecfb49e..0000000 --- a/service/tests/test-notebook/test_ns3_clos_fabric_2tier.py +++ /dev/null @@ -1,128 +0,0 @@ -def test_ns3_clos_fabric_2tier(port_number): - - try: - - # ##### Import the required modules and configure the system path to locate them - - import sys - 
sys.path.append("../client-scripts/utils") - sys.path.append("../../client-scripts/utils") - sys.path.append("./client-scripts/utils") - import networkx - import astra_sim_sdk.astra_sim_sdk as astra_sim_kit - from astra_sim import AstraSim, Collective, NetworkBackend - from infragraph.infragraph_service import InfraGraphService - from infragraph.blueprints.fabrics.clos_fat_tree_fabric import ClosFatTreeFabric - from infragraph.blueprints.devices.generic.server import Server - from infragraph.blueprints.devices.generic.generic_switch import Switch - - # ##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs - - astra = AstraSim(f"0.0.0.0:{port_number}", tag = "ns3_clos_fabric_2tier") - - # ##### Create a two-tier clos fabric using infragraph fabric blueprint - - server = Server() - switch = Switch(port_count=8) - infrastructure = ClosFatTreeFabric(switch, server, 2,[]) - astra.configuration.infragraph.infrastructure.deserialize(infrastructure.serialize()) - print(astra.configuration.infragraph.infrastructure) - - # ##### Initialize the Infragraph service, display the fabric topology, and retrieve/set the total number of NPUs to generate the collective - - service = InfraGraphService() - service.set_graph(infrastructure) - g = service.get_networkx_graph() - print(networkx.write_network_text(g, vertical_chains=True)) - total_npus = 32 - - # ##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration - - astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 8 *1024*1024, npu_range=[0, total_npus]) - - # ##### Configure ASTRA-sim system config - - astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO - astra.configuration.common_config.system.endpoint_delay = 10 - 
astra.configuration.common_config.system.active_chunks_per_dimension = 1 - astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING] - astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT] - astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.ONERING] - astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE - astra.configuration.common_config.system.local_mem_bw = 1600 - - # ##### Configure ASTRA-sim remote memory configuration - - astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION - print(astra.configuration.common_config.remote_memory) - - # ##### Configure the selected network backend and the topology (infragraph or nc_topology) - - astra.configuration.network_backend.choice = astra.configuration.network_backend.NS3 - astra.configuration.network_backend.ns3.topology.choice = astra.configuration.network_backend.ns3.topology.INFRAGRAPH - astra.configuration.network_backend.ns3.network.packet_payload_size = int(8192) - - # ##### Adding ns3 trace and logical dimension - - astra.configuration.network_backend.ns3.logical_topology.logical_dimensions = [total_npus] - astra.configuration.network_backend.ns3.trace.trace_ids = [] - for i in range(0, total_npus): - astra.configuration.network_backend.ns3.trace.trace_ids.append(i) - - # ##### Adding ASTRA-sim - Infragraph specific annotation - - host_device_spec = astra_sim_kit.AnnotationDeviceSpecifications() - host_device_spec.device_bandwidth_gbps = 100 - host_device_spec.device_latency_ms = 0.05 - host_device_spec.device_name = "server" - host_device_spec.device_type = "host" - astra.configuration.infragraph.annotations.device_specifications.append(host_device_spec) - - switch_device_spec = 
astra_sim_kit.AnnotationDeviceSpecifications() - switch_device_spec.device_bandwidth_gbps = 100 - switch_device_spec.device_latency_ms = 0.05 - switch_device_spec.device_name = "switch" - switch_device_spec.device_type = "switch" - astra.configuration.infragraph.annotations.device_specifications.append( - switch_device_spec - ) - - # ##### Configure ASTRA-sim cmd parameters - - astra.configuration.common_config.cmd_parameters.comm_scale = 1 - astra.configuration.common_config.cmd_parameters.injection_scale = 1 - astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False - - # #### Start the simulation by specifying the network backend - - astra.run_simulation(NetworkBackend.NS3) - - # ##### Download all the configurations as a zip - - astra.download_configuration() - - # ##### Read output files - - import pandas as pd - import os - from common import FileFolderUtils - df = pd.read_csv(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR, "fct.csv")) - df.head() - - df = pd.read_csv(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR, "flow_stats.csv")) - df.head() - - # ##### Save infragraph as a yaml - - import yaml - import os - from common import FileFolderUtils - with open(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"../infrastructure","ns3_clos_fabric_2tier.yaml"),"w") as f: - data = infrastructure.serialize("dict") - yaml.dump(data, f, default_flow_style=False, indent=4) - - print("saved yaml to:", os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"..","ns3_clos_fabric_2tier.yaml")) - - assert True - except Exception as e: - assert False, f'Unexpected exception: {e}' diff --git a/service/tests/test-notebook/test_ns3_clos_fabric_3tier.py b/service/tests/test-notebook/test_ns3_clos_fabric_3tier.py deleted file mode 100644 index 5a7fd93..0000000 --- a/service/tests/test-notebook/test_ns3_clos_fabric_3tier.py +++ /dev/null @@ -1,125 +0,0 @@ -def test_ns3_clos_fabric_3tier(port_number): - - try: - - # ##### Import the 
required modules and configure the system path to locate them - - import sys - sys.path.append("../client-scripts/utils") - sys.path.append("../../client-scripts/utils") - sys.path.append("./client-scripts/utils") - import networkx - import astra_sim_sdk.astra_sim_sdk as astra_sim_kit - from astra_sim import AstraSim, Collective, NetworkBackend - from infragraph.infragraph_service import InfraGraphService - from infragraph.blueprints.fabrics.clos_fat_tree_fabric import ClosFatTreeFabric - from infragraph.blueprints.devices.generic.server import Server - from infragraph.blueprints.devices.generic.generic_switch import Switch - - # ##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs - - astra = AstraSim(f"0.0.0.0:{port_number}", tag = "ns3_clos_fabric_3tier") - - # ##### Create a three-tier clos fabric using infragraph fabric blueprint - - server = Server() - switch = Switch(port_count=4) - infrastructure = ClosFatTreeFabric(switch, server, 3,[]) - astra.configuration.infragraph.infrastructure.deserialize(infrastructure.serialize()) - print(astra.configuration.infragraph.infrastructure) - - # ##### Initialize the Infragraph service, display the fabric topology, and retrieve/set the total number of NPUs to generate the collective - - service = InfraGraphService() - service.set_graph(infrastructure) - g = service.get_networkx_graph() - print(networkx.write_network_text(g, vertical_chains=True)) - total_npus = 16 - - # ##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration - - astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 1 *1024*1024, npu_range=[0, total_npus]) - - # ##### Configure ASTRA-sim system config - - astra.configuration.common_config.system.scheduling_policy = 
astra.configuration.common_config.system.LIFO - astra.configuration.common_config.system.endpoint_delay = 10 - astra.configuration.common_config.system.active_chunks_per_dimension = 1 - astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING] - astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT] - astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.ONERING] - astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE - astra.configuration.common_config.system.local_mem_bw = 1600 - - # ##### Configure ASTRA-sim remote memory configuration - - astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION - print(astra.configuration.common_config.remote_memory) - - # ##### Configure the selected network backend and the topology (infragraph or nc_topology) - - astra.configuration.network_backend.choice = astra.configuration.network_backend.NS3 - astra.configuration.network_backend.ns3.topology.choice = astra.configuration.network_backend.ns3.topology.INFRAGRAPH - astra.configuration.network_backend.ns3.network.packet_payload_size = int(8192) - - # ##### Adding ns3 trace and logical dimension - - astra.configuration.network_backend.ns3.logical_topology.logical_dimensions = [total_npus] - astra.configuration.network_backend.ns3.trace.trace_ids = [] - for i in range(0, total_npus): - astra.configuration.network_backend.ns3.trace.trace_ids.append(i) - - # ##### Adding ASTRA-sim - Infragraph specific annotation - - host_device_spec = astra_sim_kit.AnnotationDeviceSpecifications() - host_device_spec.device_bandwidth_gbps = 100 - host_device_spec.device_latency_ms = 0.05 - host_device_spec.device_name = "server" - host_device_spec.device_type = "host" - 
astra.configuration.infragraph.annotations.device_specifications.append(host_device_spec) - - switch_device_spec = astra_sim_kit.AnnotationDeviceSpecifications() - switch_device_spec.device_bandwidth_gbps = 100 - switch_device_spec.device_latency_ms = 0.05 - switch_device_spec.device_name = "switch" - switch_device_spec.device_type = "switch" - astra.configuration.infragraph.annotations.device_specifications.append( - switch_device_spec - ) - - # ##### Configure ASTRA-sim cmd parameters - - astra.configuration.common_config.cmd_parameters.comm_scale = 1 - astra.configuration.common_config.cmd_parameters.injection_scale = 1 - astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False - - # #### Start the simulation by specifying the network backend - - astra.run_simulation(NetworkBackend.NS3) - - # ##### Download all the configurations as a zip - - astra.download_configuration() - - # ##### Read output files - - import pandas as pd - import os - from common import FileFolderUtils - df = pd.read_csv(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR, "fct.csv")) - df.head() - - # ##### Save infragraph as a yaml - - import yaml - import os - from common import FileFolderUtils - with open(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"../infrastructure","ns3_clos_fabric_3tier"),"w") as f: - data = infrastructure.serialize("dict") - yaml.dump(data, f, default_flow_style=False, indent=4) - - print("saved yaml to:", os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"..","ns3_clos_fabric_3tier.yaml")) - - assert True - except Exception as e: - assert False, f'Unexpected exception: {e}' diff --git a/service/tests/test-notebook/test_ns3_sample.py b/service/tests/test-notebook/test_ns3_sample.py deleted file mode 100644 index bdcdbdd..0000000 --- a/service/tests/test-notebook/test_ns3_sample.py +++ /dev/null @@ -1,98 +0,0 @@ -def test_ns3_sample(port_number): - - try: - - # ##### Import the required modules and configure the system path to 
locate them - - import sys - sys.path.append("../client-scripts/utils") - sys.path.append("../../client-scripts/utils") - sys.path.append("./client-scripts/utils") - from astra_sim import AstraSim, Collective, NetworkBackend - - # ##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs. - - astra = AstraSim(f"0.0.0.0:{port_number}", tag="ns3_sample") - - # ##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration - - astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 8 *1024*1024, npu_range=[0,8]) - print(astra.configuration.common_config.workload) - - # ##### Configure ASTRA-sim system config - - astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO - astra.configuration.common_config.system.endpoint_delay = 10 - astra.configuration.common_config.system.active_chunks_per_dimension = 1 - astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING] - astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT] - astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.ONERING] - astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE - astra.configuration.common_config.system.local_mem_bw = 1600 - print(astra.configuration.common_config.system) - - # ##### Configure ASTRA-sim remote memory configuration - - astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION - print(astra.configuration.common_config.remote_memory) - - # ##### Configure the network backend - - # 
astra.configuration.network_backend.choice = astra.configuration.network_backend.NS3 - astra.configuration.network_backend.ns3.network.packet_payload_size = int(8192) - astra.configuration.network_backend.ns3.logical_topology.logical_dimensions = [8] - astra.configuration.network_backend.ns3.trace.trace_ids = [0, 1, 2, 3,4 ,5 ,6, 7] - print("network backend choice set to:",astra.configuration.network_backend.ns3.topology.choice) - print(astra.configuration.network_backend.ns3.network.packet_payload_size) - print(astra.configuration.network_backend.ns3.logical_topology) - print(astra.configuration.network_backend.ns3.trace) - - # ##### Set up the network topology - - # astra.configuration.network_backend.ns3.topology.choice = astra.configuration.network_backend.ns3.topology.NC_TOPOLOGY - # the topology configuration will be set automatically if we configure the nc_topology - astra.configuration.network_backend.ns3.topology.nc_topology.total_nodes = 9 - astra.configuration.network_backend.ns3.topology.nc_topology.total_switches = 1 - astra.configuration.network_backend.ns3.topology.nc_topology.total_links = 8 - astra.configuration.network_backend.ns3.topology.nc_topology.switch_ids = [8] - astra.configuration.network_backend.ns3.topology.nc_topology.connections.clear() - astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(0, 8, "100Gbps", "0.005ms", "0") - astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(1, 8, "100Gbps", "0.005ms", "0") - astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(2, 8, "100Gbps", "0.005ms", "0") - astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(3, 8, "100Gbps", "0.005ms", "0") - astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(4, 8, "100Gbps", "0.005ms", "0") - astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(5, 8, "100Gbps", "0.005ms", "0") - 
astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(6, 8, "100Gbps", "0.005ms", "0") - astra.configuration.network_backend.ns3.topology.nc_topology.connections.add(7, 8, "100Gbps", "0.005ms", "0") - print(astra.configuration.network_backend.ns3.topology.choice) - print(astra.configuration.network_backend.ns3.topology.nc_topology) - - # ##### Configure ASTRA-sim cmd parameters - - astra.configuration.common_config.cmd_parameters.comm_scale = 1 - astra.configuration.common_config.cmd_parameters.injection_scale = 1 - astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False - - print(astra.configuration.common_config.cmd_parameters) - - # #### Start the simulation by specifying the network backend - - astra.run_simulation(NetworkBackend.NS3) - - # ##### Download all the configurations as a zip - - astra.download_configuration() - - # ##### Read output files - - import pandas as pd - import os - from common import FileFolderUtils - df = pd.read_csv(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR, "fct.csv")) - df.head() - df = pd.read_csv(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR, "flow_stats.csv")) - df.head() - - assert True - except Exception as e: - assert False, f'Unexpected exception: {e}' diff --git a/service/tests/test-notebook/test_ns3_single_dgx_device.py b/service/tests/test-notebook/test_ns3_single_dgx_device.py deleted file mode 100644 index 7e9dbf7..0000000 --- a/service/tests/test-notebook/test_ns3_single_dgx_device.py +++ /dev/null @@ -1,121 +0,0 @@ -def test_ns3_single_dgx_device(port_number): - - try: - - # ##### Import the required modules and configure the system path to locate them - - import sys - sys.path.append("../client-scripts/utils") - sys.path.append("../../client-scripts/utils") - sys.path.append("./client-scripts/utils") - import networkx - import astra_sim_sdk.astra_sim_sdk as astra_sim_kit - from astra_sim import AstraSim, Collective, NetworkBackend - from 
infragraph.infragraph_service import InfraGraphService - from infragraph.blueprints.devices.nvidia.dgx import NvidiaDGX, DgxProfile - from infragraph import Infrastructure - - # ##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs - - astra = AstraSim(f"0.0.0.0:{port_number}", tag = "ns3_single_dgx") - - # ##### Get all available DGX variants - - from typing import get_args - print(get_args(DgxProfile)) - - # ##### Create a Nvidia DGX device fabric using infragraph device blueprint - - server = NvidiaDGX("dgx_h100") - infrastructure = Infrastructure() - infrastructure.devices.append(server) - infrastructure.instances.add(name=server.name, device=server.name, count=1) - astra.configuration.infragraph.infrastructure.deserialize(infrastructure.serialize()) - print(astra.configuration.infragraph.infrastructure) - - # ##### Initialize the Infragraph service, display the fabric topology, and retrieve/set the total number of NPUs to generate the collective - - service = InfraGraphService() - service.set_graph(infrastructure) - total_npus = service.get_component(device=server, type="xpu").count - g = service.get_networkx_graph() - print(networkx.write_network_text(g, vertical_chains=True)) - - # ##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration - - astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 1 *1024*1024, npu_range=[0, total_npus]) - - # ##### Configure ASTRA-sim system config - - astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO - astra.configuration.common_config.system.endpoint_delay = 10 - astra.configuration.common_config.system.active_chunks_per_dimension = 1 - astra.configuration.common_config.system.all_gather_implementation = 
[astra.configuration.common_config.system.RING] - astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT] - astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.ONERING] - astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE - astra.configuration.common_config.system.local_mem_bw = 1600 - - # ##### Configure ASTRA-sim remote memory configuration - - astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION - print(astra.configuration.common_config.remote_memory) - - # ##### Configure the selected network backend and the topology (infragraph or nc_topology) - - astra.configuration.network_backend.choice = astra.configuration.network_backend.NS3 - astra.configuration.network_backend.ns3.topology.choice = astra.configuration.network_backend.ns3.topology.INFRAGRAPH - astra.configuration.network_backend.ns3.network.packet_payload_size = int(8192) - - # ##### Adding ns3 trace and logical dimension - - astra.configuration.network_backend.ns3.logical_topology.logical_dimensions = [total_npus] - astra.configuration.network_backend.ns3.trace.trace_ids = [] - for i in range(0, total_npus): - astra.configuration.network_backend.ns3.trace.trace_ids.append(i) - - # ##### Adding ASTRA-sim - Infragraph specific annotation for Nvidia DGX - - host_device_spec = astra_sim_kit.AnnotationDeviceSpecifications() - host_device_spec.device_bandwidth_gbps = 100 - host_device_spec.device_latency_ms = 0.05 - host_device_spec.device_name = server.name - host_device_spec.device_type = "host" - astra.configuration.infragraph.annotations.device_specifications.append(host_device_spec) - - # ##### Configure ASTRA-sim cmd parameters - - astra.configuration.common_config.cmd_parameters.comm_scale = 1 - 
astra.configuration.common_config.cmd_parameters.injection_scale = 1 - astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False - - # #### Start the simulation by specifying the network backend - - astra.run_simulation(NetworkBackend.NS3) - - # ##### Download all the configurations as a zip - - astra.download_configuration() - - # ##### Read output files - - import pandas as pd - import os - from common import FileFolderUtils - df = pd.read_csv(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR, "fct.csv")) - df.head() - - # ##### Save infragraph as a yaml - - import yaml - import os - from common import FileFolderUtils - with open(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"../infrastructure","ns3_single_dgx"),"w") as f: - data = infrastructure.serialize("dict") - yaml.dump(data, f, default_flow_style=False, indent=4) - - print("saved yaml to:", os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"..","ns3_single_dgx.yaml")) - - assert True - except Exception as e: - assert False, f'Unexpected exception: {e}' diff --git a/service/tests/test-notebook/test_ns3_single_ironwood_device.py b/service/tests/test-notebook/test_ns3_single_ironwood_device.py deleted file mode 100644 index c298204..0000000 --- a/service/tests/test-notebook/test_ns3_single_ironwood_device.py +++ /dev/null @@ -1,117 +0,0 @@ -def test_ns3_single_ironwood_device(port_number): - - try: - - # ##### Import the required modules and configure the system path to locate them - - import sys - sys.path.append("../client-scripts/utils") - sys.path.append("../../client-scripts/utils") - sys.path.append("./client-scripts/utils") - import networkx - import astra_sim_sdk.astra_sim_sdk as astra_sim_kit - from astra_sim import AstraSim, Collective, NetworkBackend - from infragraph.infragraph_service import InfraGraphService - from infragraph.blueprints.devices.ironwood_rack import IronwoodRack - from infragraph import Infrastructure - - # ##### Call the AstraSim client 
helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs - - astra = AstraSim(f"0.0.0.0:{port_number}", tag = "ns3_single_ironwood") - - # ##### Create a ironwood rack device fabric using infragraph device blueprint - - server = IronwoodRack() - infrastructure = Infrastructure() - infrastructure.devices.append(server) - infrastructure.instances.add(name=server.name, device=server.name, count=1) - astra.configuration.infragraph.infrastructure.deserialize(infrastructure.serialize()) - print(astra.configuration.infragraph.infrastructure) - - # ##### Initialize the Infragraph service, display the fabric topology, and retrieve/set the total number of NPUs to generate the collective - - service = InfraGraphService() - service.set_graph(infrastructure) - g = service.get_networkx_graph() - print(networkx.write_network_text(g, vertical_chains=True)) - total_npus = service.get_component(server, "xpu").count - print(total_npus) - - # ##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration - - astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 1 *1024*1024, npu_range=[0, total_npus]) - - # ##### Configure ASTRA-sim system config - - astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO - astra.configuration.common_config.system.endpoint_delay = 10 - astra.configuration.common_config.system.active_chunks_per_dimension = 1 - astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING, astra.configuration.common_config.system.RING, astra.configuration.common_config.system.RING] - astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.RING, astra.configuration.common_config.system.RING, 
astra.configuration.common_config.system.RING] - astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.RING, astra.configuration.common_config.system.RING, astra.configuration.common_config.system.RING] - astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE - astra.configuration.common_config.system.local_mem_bw = 1600 - - # ##### Configure ASTRA-sim remote memory configuration - - astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION - print(astra.configuration.common_config.remote_memory) - - # ##### Configure the selected network backend and the topology (infragraph or nc_topology) - - astra.configuration.network_backend.choice = astra.configuration.network_backend.NS3 - astra.configuration.network_backend.ns3.topology.choice = astra.configuration.network_backend.ns3.topology.INFRAGRAPH - astra.configuration.network_backend.ns3.network.packet_payload_size = int(8192) - - # ##### Adding ns3 trace and logical dimension - - astra.configuration.network_backend.ns3.logical_topology.logical_dimensions = [4, 4, 4] - astra.configuration.network_backend.ns3.trace.trace_ids = [] - for i in range(0, total_npus): - astra.configuration.network_backend.ns3.trace.trace_ids.append(i) - - # ##### Adding ASTRA-sim - Infragraph specific annotation - - host_device_spec = astra_sim_kit.AnnotationDeviceSpecifications() - host_device_spec.device_bandwidth_gbps = 100 - host_device_spec.device_latency_ms = 0.05 - host_device_spec.device_name = server.name - host_device_spec.device_type = "host" - astra.configuration.infragraph.annotations.device_specifications.append(host_device_spec) - - # ##### Configure ASTRA-sim cmd parameters - - astra.configuration.common_config.cmd_parameters.comm_scale = 1 - astra.configuration.common_config.cmd_parameters.injection_scale = 1 - 
astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False - - # #### Start the simulation by specifying the network backend - - astra.run_simulation(NetworkBackend.NS3) - - # ##### Download all the configurations as a zip - - astra.download_configuration() - - # ##### Read output files - - import pandas as pd - import os - from common import FileFolderUtils - df = pd.read_csv(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR, "fct.csv")) - df.head() - - # ##### Save infragraph as a yaml - - import yaml - import os - from common import FileFolderUtils - with open(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"../infrastructure","ns3_single_ironwood"),"w") as f: - data = infrastructure.serialize("dict") - yaml.dump(data, f, default_flow_style=False, indent=4) - - print("saved yaml to:", os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"..","ns3_single_ironwood.yaml")) - - assert True - except Exception as e: - assert False, f'Unexpected exception: {e}' diff --git a/service/tests/test-notebook/test_ns3_single_tier_with_dgx.py b/service/tests/test-notebook/test_ns3_single_tier_with_dgx.py deleted file mode 100644 index 7509ca2..0000000 --- a/service/tests/test-notebook/test_ns3_single_tier_with_dgx.py +++ /dev/null @@ -1,126 +0,0 @@ -def test_ns3_single_tier_with_dgx(port_number): - - try: - - # ##### Import the required modules and configure the system path to locate them - - import sys - sys.path.append("../client-scripts/utils") - sys.path.append("../../client-scripts/utils") - sys.path.append("./client-scripts/utils") - from astra_sim import AstraSim, Collective, NetworkBackend - from infragraph.blueprints.devices.nvidia.dgx import NvidiaDGX - from infragraph.blueprints.fabrics.single_tier_fabric import SingleTierFabric - import networkx - from infragraph.infragraph_service import InfraGraphService - import astra_sim_sdk.astra_sim_sdk as astra_sim_kit - - # ##### Call the AstraSim client helper with the server endpoint and tag 
to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs - - astra = AstraSim(f"0.0.0.0:{port_number}", tag = "ns3_single_tier_with_dgx") - - # ##### Create a single tier rack device with two Nvidia DGX and a single switch using infragraph device, fabric blueprint - - dgx_count = 2 - server = NvidiaDGX() - infrastructure = SingleTierFabric(server, dgx_count) - astra.configuration.infragraph.infrastructure.deserialize(infrastructure.serialize()) - - # ##### Initialize the Infragraph service, display the fabric topology, and retrieve/set the total number of NPUs to generate the collective - - service = InfraGraphService() - service.set_graph(infrastructure) - - g = service.get_networkx_graph() - print(networkx.write_network_text(g, vertical_chains=True)) - - total_npus = service.get_component(server, "xpu").count * dgx_count - print(total_npus) - - # ##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration - - astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 8 *1024*1024, npu_range=[0, total_npus]) - - # ##### Configure ASTRA-sim system config - - astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO - astra.configuration.common_config.system.endpoint_delay = 10 - astra.configuration.common_config.system.active_chunks_per_dimension = 1 - astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING] - astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT] - astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.ONERING] - astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE - 
astra.configuration.common_config.system.local_mem_bw = 1600 - - # ##### Configure ASTRA-sim remote memory configuration - - astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION - print(astra.configuration.common_config.remote_memory) - - # ##### Configure the selected network backend and the topology (infragraph or nc_topology) - - astra.configuration.network_backend.choice = astra.configuration.network_backend.NS3 - astra.configuration.network_backend.ns3.topology.choice = astra.configuration.network_backend.ns3.topology.INFRAGRAPH - astra.configuration.network_backend.ns3.network.packet_payload_size = int(8192) - - # ##### Adding ns3 trace and logical dimension - - astra.configuration.network_backend.ns3.logical_topology.logical_dimensions = [total_npus] - astra.configuration.network_backend.ns3.trace.trace_ids = [] - for i in range(0, total_npus): - astra.configuration.network_backend.ns3.trace.trace_ids.append(i) - - # ##### Adding ASTRA-sim - Infragraph specific annotation - - host_device_spec = astra_sim_kit.AnnotationDeviceSpecifications() - host_device_spec.device_bandwidth_gbps = 100 - host_device_spec.device_latency_ms = 0.05 - host_device_spec.device_name = server.name - host_device_spec.device_type = "host" - astra.configuration.infragraph.annotations.device_specifications.append(host_device_spec) - - switch_device_spec = astra_sim_kit.AnnotationDeviceSpecifications() - switch_device_spec.device_bandwidth_gbps = 100 - switch_device_spec.device_latency_ms = 0.05 - switch_device_spec.device_name = "switch" - switch_device_spec.device_type = "switch" - astra.configuration.infragraph.annotations.device_specifications.append( - switch_device_spec - ) - - # ##### Configure ASTRA-sim cmd parameters - - astra.configuration.common_config.cmd_parameters.comm_scale = 1 - astra.configuration.common_config.cmd_parameters.injection_scale = 1 - 
astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False - - # #### Start the simulation by specifying the network backend - - astra.run_simulation(NetworkBackend.NS3) - - # ##### Download all the configurations as a zip - - astra.download_configuration() - - # ##### Read output files - - import pandas as pd - import os - from common import FileFolderUtils - df = pd.read_csv(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR, "fct.csv")) - df.head() - - # ##### Save infragraph as a yaml - - import yaml - import os - from common import FileFolderUtils - with open(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"../infrastructure","ns3_single_tier_with_dgx"),"w") as f: - data = infrastructure.serialize("dict") - yaml.dump(data, f, default_flow_style=False, indent=4) - - print("saved yaml to:", os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"..","ns3_single_tier_with_dgx.yaml")) - - assert True - except Exception as e: - assert False, f'Unexpected exception: {e}' diff --git a/service/tests/test-notebook/test_ns3_single_tier_with_generic_server.py b/service/tests/test-notebook/test_ns3_single_tier_with_generic_server.py deleted file mode 100644 index 932cf1d..0000000 --- a/service/tests/test-notebook/test_ns3_single_tier_with_generic_server.py +++ /dev/null @@ -1,157 +0,0 @@ -def test_ns3_single_tier_with_generic_server(port_number): - - try: - - # ##### Import the required modules and configure the system path to locate them - - import sys - sys.path.append("../client-scripts/utils") - sys.path.append("../../client-scripts/utils") - sys.path.append("./client-scripts/utils") - from astra_sim import AstraSim, Collective, NetworkBackend - from astra_sim_sdk import Device, Component - from infragraph import Component, InfrastructureEdge - from infragraph.infragraph_service import InfraGraphService - from infragraph.blueprints.devices.generic.server import Server - from infragraph.blueprints.devices.generic.generic_switch import 
Switch - import astra_sim_sdk.astra_sim_sdk as astra_sim_kit - - # ##### Call the AstraSim client helper with the server endpoint and tag to connect to the ASTRA-sim gRPC server, initialize the SDK, and create a tagged folder for configs, results, and logs - - astra = AstraSim(f"0.0.0.0:{port_number}", tag = "ns3_single_tier_with_generic_server") - - # ##### Generate workload execution traces for each rank and set the required data size for AstraSim configuration - - astra.configuration.common_config.workload = astra.generate_collective(collective=Collective.ALLREDUCE, coll_size= 8 *1024*1024, npu_range=[0,8]) - print(astra.configuration.common_config.workload) - - # ##### Configure the ASTRA-sim system config - - astra.configuration.common_config.system.scheduling_policy = astra.configuration.common_config.system.LIFO - astra.configuration.common_config.system.endpoint_delay = 10 - astra.configuration.common_config.system.active_chunks_per_dimension = 1 - astra.configuration.common_config.system.all_gather_implementation = [astra.configuration.common_config.system.RING] - astra.configuration.common_config.system.all_to_all_implementation = [astra.configuration.common_config.system.DIRECT] - astra.configuration.common_config.system.all_reduce_implementation = [astra.configuration.common_config.system.ONERING] - astra.configuration.common_config.system.collective_optimization = astra.configuration.common_config.system.LOCALBWAWARE - astra.configuration.common_config.system.local_mem_bw = 1600 - print(astra.configuration.common_config.system) - - # ##### Configure ASTRA-sim remote memory configuration - - astra.configuration.common_config.remote_memory.memory_type = astra.configuration.common_config.remote_memory.NO_MEMORY_EXPANSION - print(astra.configuration.common_config.remote_memory) - - # ##### Configure the selected network backend and the topology (infragraph or nc_topology) - - # We need to configure the network backend here since we are translating the 
topology from infragraph and not creating it directly from the sdk. - astra.configuration.network_backend.choice = astra.configuration.network_backend.NS3 - astra.configuration.network_backend.ns3.topology.choice = astra.configuration.network_backend.ns3.topology.INFRAGRAPH - astra.configuration.network_backend.ns3.network.packet_payload_size = int(8192) - astra.configuration.network_backend.ns3.logical_topology.logical_dimensions = [8] - astra.configuration.network_backend.ns3.trace.trace_ids = [0, 1, 2, 3, 4, 5, 6, 7] - - # ##### Creating Infrastructure with four host and one rack Device - - astra.configuration.infragraph.infrastructure.name = "1host-4ranks" - - server = Device() - server.deserialize((Server(npu_factor=1).serialize())) - - hosts = astra.configuration.infragraph.infrastructure.instances.add( - name="host", device=server.name, count=4 - ) - switch = Device() - switch.deserialize(Switch(port_count=16).serialize()) - - rack_switch = astra.configuration.infragraph.infrastructure.instances.add( - name="rack_switch", device=switch.name, count=1 - ) - - astra.configuration.infragraph.infrastructure.devices.append(server).append(switch) - - # ##### Creating Links - - rack_link = astra.configuration.infragraph.infrastructure.links.add( - name="rack-link", - description="Link characteristics for connectivity between servers and rack switch", - ) - rack_link.physical.bandwidth.gigabits_per_second = 200 - - # ##### Adding edges and annotations - - host_component = InfraGraphService.get_component(server, Component.NIC) - switch_component = InfraGraphService.get_component(switch, Component.PORT) - # link each host to one leaf switch - for idx in range(hosts.count): - edge = astra.configuration.infragraph.infrastructure.edges.add( - scheme=InfrastructureEdge.ONE2ONE, link=rack_link.name - ) - edge.ep1.instance = f"{hosts.name}[{idx}]" - edge.ep1.component = f"{host_component.name}[0]" - edge.ep2.instance = f"{rack_switch.name}[0]" - edge.ep2.component = 
f"{switch_component.name}[{idx * 2}]" - edge = astra.configuration.infragraph.infrastructure.edges.add( - scheme=InfrastructureEdge.ONE2ONE, link=rack_link.name - ) - edge.ep1.instance = f"{hosts.name}[{idx}]" - edge.ep1.component = f"{host_component.name}[1]" - edge.ep2.instance = f"{rack_switch.name}[0]" - edge.ep2.component = f"{switch_component.name}[{idx * 2 + 1}]" - - # annotation - host_device_spec = astra_sim_kit.AnnotationDeviceSpecifications() - host_device_spec.device_bandwidth_gbps = 200 - host_device_spec.device_latency_ms = 0.05 - host_device_spec.device_name = "server" - host_device_spec.device_type = "host" - astra.configuration.infragraph.annotations.device_specifications.append(host_device_spec) - - switch_device_spec = astra_sim_kit.AnnotationDeviceSpecifications() - switch_device_spec.device_bandwidth_gbps = 200 - switch_device_spec.device_latency_ms = 0.05 - switch_device_spec.device_name = "switch" - switch_device_spec.device_type = "switch" - astra.configuration.infragraph.annotations.device_specifications.append( - switch_device_spec - ) - - # ##### Configure ASTRA-sim cmd parameters - - astra.configuration.common_config.cmd_parameters.comm_scale = 1 - astra.configuration.common_config.cmd_parameters.injection_scale = 1 - astra.configuration.common_config.cmd_parameters.rendezvous_protocol = False - - # #### Start the simulation by specifying the network backend - - astra.run_simulation(NetworkBackend.NS3) - - # ##### Download all the configurations as a zip - - astra.download_configuration() - - # ##### Read output files - - import pandas as pd - import os - from common import FileFolderUtils - df = pd.read_csv(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR, "fct.csv")) - df.head() - - df = pd.read_csv(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR, "flow_stats.csv")) - df.head() - - # ##### Save infragraph as a yaml - - import yaml - import os - from common import FileFolderUtils - with 
open(os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"../infrastructure","ns3_single_tier_with_dgx"),"w") as f: - data = astra.configuration.infragraph.infrastructure.serialize("dict") - yaml.dump(data, f, default_flow_style=False, indent=4) - - print("saved yaml to:", os.path.join(FileFolderUtils.get_instance().OUTPUT_DIR,"..","ns3_single_tier_with_dgx.yaml")) - - assert True - except Exception as e: - assert False, f'Unexpected exception: {e}'