Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions .dockerfile/hoti2024/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
## Use Ubuntu
FROM ubuntu:24.04
FROM ubuntu:22.04
LABEL maintainer="Will Won <william.won@gatech.edu>"


Expand All @@ -13,8 +13,9 @@ RUN apt -y install \
gcc g++ clang-format \
make cmake \
libboost-dev libboost-program-options-dev \
python3.12 python3-pip python3-venv \
graphviz
python3.11 python3-pip python3-venv \
graphviz \
openmpi-bin openmpi-doc libopenmpi-dev

## Create Python venv (originally required for Python 3.12; this image now installs python3.11)
RUN python3 -m venv /opt/venv/astra-sim
Expand Down
18 changes: 18 additions & 0 deletions hoti2024/clone_astra_sim_ns3.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/bin/bash
# Clone ASTRA-sim at the tutorial-hoti2024-ns3 tag next to this script and
# create a `chakra` symlink beside it that points into the checkout.
#
# Usage: ./clone_astra_sim_ns3.sh   (may be invoked from any directory)
set -e

# Path: directory that contains this script. Everything below is anchored to
# it so the result does not depend on the caller's working directory.
SCRIPT_DIR=$(dirname "$(realpath "$0")")

# Clone ASTRA-sim
(
  # Clone into SCRIPT_DIR explicitly; the following `cd` expects the checkout
  # to live at ${SCRIPT_DIR}/astra-sim.
  cd "${SCRIPT_DIR}"
  git clone git@github.com:astra-sim/astra-sim.git
  cd "${SCRIPT_DIR}/astra-sim/"
  git checkout tags/tutorial-hoti2024-ns3
  git submodule update --init --recursive
)

# Create Chakra symlink for easy access
(
  # The link target is relative, so the link must be created in the directory
  # that holds the astra-sim checkout for it to resolve.
  cd "${SCRIPT_DIR}"
  ln -s astra-sim/extern/graph_frontend/chakra .
)
13 changes: 13 additions & 0 deletions hoti2024/clone_collectiveapi.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash
# Clone the astra-sim/collectiveapi repository (with its submodules) next to
# this script.
#
# Usage: ./clone_collectiveapi.sh   (may be invoked from any directory)
set -e

# Path: directory that contains this script.
SCRIPT_DIR=$(dirname "$(realpath "$0")")

# Clone collectiveapi
(
  # Clone into SCRIPT_DIR explicitly; the following `cd` expects the checkout
  # to live at ${SCRIPT_DIR}/collectiveapi.
  cd "${SCRIPT_DIR}"
  git clone git@github.com:astra-sim/collectiveapi.git
  cd "${SCRIPT_DIR}/collectiveapi"
  git submodule update --init --recursive
)

2 changes: 1 addition & 1 deletion hoti2024/compile_astra_sim.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ set -e
# Path
SCRIPT_DIR=$(dirname "$(realpath $0)")

# Install Chakra
# Compile ASTRA-sim with analytical model
(
cd ${SCRIPT_DIR}/astra-sim/build/astra_analytical
./build.sh
Expand Down
19 changes: 19 additions & 0 deletions hoti2024/compile_astra_sim_ns3.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/bin/bash
# Build ASTRA-sim with the ns-3 network backend, then link the ns-3
# configuration and topology files into demo4/inputs for easy inspection.
#
# Usage: ./compile_astra_sim_ns3.sh   (may be invoked from any directory)
set -e

# Path
SCRIPT_DIR=$(dirname "$(realpath "$0")")
NS3_SCRATCH_DIR="${SCRIPT_DIR}/astra-sim/extern/network_backend/ns-3/scratch"
DEMO4_INPUTS_DIR="${SCRIPT_DIR}/demo4/inputs"

# Compile ASTRA-sim with ns3 backend model
(
  cd "${SCRIPT_DIR}/astra-sim"
  bash ./build/astra_ns3/build.sh
)

# Create Symlinks to configuration files
# Symbolic links are only to make it easier to examine.
# Inputs to execution scripts, etc. need to use the actual files.
# Destinations are anchored to SCRIPT_DIR (not the caller's cwd), and -f
# replaces links left by a previous run so a rebuild does not abort under
# `set -e`.
ln -sf "${NS3_SCRATCH_DIR}/config/config.txt" \
  "${DEMO4_INPUTS_DIR}/ns3_config_switch.txt"
ln -sf "${NS3_SCRATCH_DIR}/config/config_clos.txt" \
  "${DEMO4_INPUTS_DIR}/ns3_config_clos.txt"
ln -sf "${NS3_SCRATCH_DIR}/topology/8_nodes_1_switch_topology.txt" \
  "${DEMO4_INPUTS_DIR}/8_nodes_1_switch.txt"
ln -sf "${NS3_SCRATCH_DIR}/topology/128_nodes_16_switch_topology.txt" \
  "${DEMO4_INPUTS_DIR}/128_nodes_16_switch_clos.txt"
47 changes: 47 additions & 0 deletions hoti2024/demo4/generate_all_reduce_128nodes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
"""Generate one Chakra execution trace (.et) per NPU for a 128-NPU All-Reduce.

Each trace holds a GlobalMetadata header followed by a single collective
communication node. Files are written to ``./allreduce_128`` relative to the
current working directory.
"""
import os

from chakra.third_party.utils.protolib import encodeMessage as encode_message
from chakra.et_def.et_def_pb2 import (
    Node as ChakraNode,
    BoolList,
    GlobalMetadata,
    AttributeProto as ChakraAttr,
    COMM_COLL_NODE,
    ALL_REDUCE,
)


def main() -> None:
    """Write ``allreduce.<npu_id>.et`` for every NPU id in ``range(128)``."""
    # create directories (exist_ok avoids the check-then-create race and
    # makes re-running the script harmless)
    output_dir = "./allreduce_128"
    os.makedirs(output_dir, exist_ok=True)

    # metadata
    npus_count = 128  # 128 NPUs
    coll_size = 1_048_576  # 1 MB

    for npu_id in range(npus_count):
        output_filename = os.path.join(output_dir, f"allreduce.{npu_id}.et")
        with open(output_filename, "wb") as et:
            # Chakra Metadata
            encode_message(et, GlobalMetadata(version="0.0.4"))

            # create Chakra Node
            node = ChakraNode()
            node.id = 1
            node.name = "All-Reduce"
            node.type = COMM_COLL_NODE

            # assign attributes
            node.attr.append(ChakraAttr(name="is_cpu_op", bool_val=False))
            node.attr.append(ChakraAttr(name="comm_type", int64_val=ALL_REDUCE))
            node.attr.append(ChakraAttr(name="comm_size", uint64_val=coll_size))
            node.attr.append(ChakraAttr(name="involved_dim", bool_list=BoolList(values=[True])))

            # store Chakra ET file
            encode_message(et, node)


if __name__ == "__main__":
    main()
3 changes: 3 additions & 0 deletions hoti2024/demo4/inputs/RemoteMemory.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"memory-type": "NO_MEMORY_EXPANSION"
}
4 changes: 4 additions & 0 deletions hoti2024/demo4/inputs/Ring_8.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
topology: [ Ring ]
npus_count: [ 8 ]
bandwidth: [ 50.0 ] # GB/s
latency: [ 500.0 ] # ns
4 changes: 4 additions & 0 deletions hoti2024/demo4/inputs/Ring_Ring_Ring.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
topology: [ Ring, Ring, Ring ]
npus_count: [ 4, 2, 2 ]
bandwidth: [ 50.0, 50.0, 50.0 ] # GB/s
latency: [ 500.0, 500.0, 500.0 ] # ns
4 changes: 4 additions & 0 deletions hoti2024/demo4/inputs/Ring_Switch.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
topology: [ Ring, Switch ]
npus_count: [ 4, 2 ]
bandwidth: [ 50.0, 10.0 ] # GB/s
latency: [ 500.0, 2500.0 ] # ns
13 changes: 13 additions & 0 deletions hoti2024/demo4/inputs/Ring_Switch_sys.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"scheduling-policy": "LIFO",
"endpoint-delay": 10,
"active-chunks-per-dimension": 1,
"preferred-dataset-splits": 4,
"all-reduce-implementation": ["ring","ring"],
"all-gather-implementation": ["ring","ring"],
"reduce-scatter-implementation": ["ring","ring"],
"all-to-all-implementation": ["ring","ring"],
"collective-optimization": "localBWAware",
"local-mem-bw": 50,
"boost-mode": 0
}
13 changes: 13 additions & 0 deletions hoti2024/demo4/inputs/Ring_sys.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"scheduling-policy": "LIFO",
"endpoint-delay": 10,
"active-chunks-per-dimension": 1,
"preferred-dataset-splits": 4,
"all-reduce-implementation": ["ring"],
"all-gather-implementation": ["ring"],
"reduce-scatter-implementation": ["ring"],
"all-to-all-implementation": ["ring"],
"collective-optimization": "localBWAware",
"local-mem-bw": 50,
"boost-mode": 0
}
3 changes: 3 additions & 0 deletions hoti2024/demo4/inputs/logical_128nodes_1D.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"logical-dims": ["128"]
}
3 changes: 3 additions & 0 deletions hoti2024/demo4/inputs/logical_128nodes_2D.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"logical-dims": ["16","8"]
}
3 changes: 3 additions & 0 deletions hoti2024/demo4/inputs/logical_8nodes_1D.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"logical-dims": ["8"]
}
15 changes: 15 additions & 0 deletions hoti2024/demo4/run_demo4-1.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash
# Demo 4-1: run the congestion-unaware analytical ASTRA-sim binary on the
# demo1 All-Reduce workload with the Ring_Switch system/network inputs.
#
# Usage: ./run_demo4-1.sh   (may be invoked from any directory)
set -e

# Path (quoted so a checkout path containing spaces still works)
SCRIPT_DIR=$(dirname "$(realpath "$0")")
ASTRA_SIM="${SCRIPT_DIR}/../astra-sim/build/astra_analytical/build/bin/AstraSim_Analytical_Congestion_Unaware"

# Run ASTRA-sim
(
  "${ASTRA_SIM}" \
    --workload-configuration="${SCRIPT_DIR}/../demo1/allreduce/allreduce" \
    --system-configuration="${SCRIPT_DIR}/inputs/Ring_Switch_sys.json" \
    --network-configuration="${SCRIPT_DIR}/inputs/Ring_Switch.yml" \
    --remote-memory-configuration="${SCRIPT_DIR}/inputs/RemoteMemory.json"
)
33 changes: 33 additions & 0 deletions hoti2024/demo4/run_demo4-2.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/bin/bash
# Demo 4-2: run the ASTRA-sim ns-3 backend on the demo1 All-Reduce workload
# with the 8-node 1D logical topology.
#
# The simulator is launched from the ns-3 build/scratch directory. Note that
# ONLY --network-configuration is given as a path relative to that launch
# directory; every other input is an absolute path built from SCRIPT_DIR.
#
# Usage: ./run_demo4-2.sh   (may be invoked from any directory)
set -e

# Path
SCRIPT_DIR=$(dirname "$(realpath "$0")")
ASTRA_SIM_BUILD_DIR="${SCRIPT_DIR}/../astra-sim/extern/network_backend/ns-3/build/scratch/"
ASTRA_SIM=./ns3.42-AstraSimNetwork-default

# Run ASTRA-sim
(
  cd "${ASTRA_SIM_BUILD_DIR}"
  "${ASTRA_SIM}" \
    --workload-configuration="${SCRIPT_DIR}/../demo1/allreduce/allreduce" \
    --system-configuration="${SCRIPT_DIR}/inputs/Ring_sys.json" \
    --remote-memory-configuration="${SCRIPT_DIR}/inputs/RemoteMemory.json" \
    --logical-topology-configuration="${SCRIPT_DIR}/inputs/logical_8nodes_1D.json" \
    --network-configuration=../../../ns-3/scratch/config/config.txt \
    --comm-group-configuration=\"empty\"
)
33 changes: 33 additions & 0 deletions hoti2024/demo4/run_demo4-3.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/bin/bash
# Demo 4-3: run the ASTRA-sim ns-3 backend on the 128-NPU All-Reduce traces
# in demo4/allreduce_128 with the 128-node 1D logical topology and the Clos
# ns-3 configuration.
#
# The simulator is launched from the ns-3 build/scratch directory. Note that
# ONLY --network-configuration is given as a path relative to that launch
# directory; every other input is an absolute path built from SCRIPT_DIR.
#
# Usage: ./run_demo4-3.sh   (may be invoked from any directory)
set -e

# Path
SCRIPT_DIR=$(dirname "$(realpath "$0")")
ASTRA_SIM_BUILD_DIR="${SCRIPT_DIR}/../astra-sim/extern/network_backend/ns-3/build/scratch/"
ASTRA_SIM=./ns3.42-AstraSimNetwork-default

# Run ASTRA-sim
(
  cd "${ASTRA_SIM_BUILD_DIR}"
  "${ASTRA_SIM}" \
    --workload-configuration="${SCRIPT_DIR}/allreduce_128/allreduce" \
    --system-configuration="${SCRIPT_DIR}/inputs/Ring_sys.json" \
    --remote-memory-configuration="${SCRIPT_DIR}/inputs/RemoteMemory.json" \
    --logical-topology-configuration="${SCRIPT_DIR}/inputs/logical_128nodes_1D.json" \
    --network-configuration=../../../ns-3/scratch/config/config_clos.txt \
    --comm-group-configuration=\"empty\"
)
3 changes: 3 additions & 0 deletions hoti2024/demo5/inputs/RemoteMemory.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"memory-type": "NO_MEMORY_EXPANSION"
}
4 changes: 4 additions & 0 deletions hoti2024/demo5/inputs/Ring_8.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
topology: [ Ring ]
npus_count: [ 8 ]
bandwidth: [ 50.0 ] # GB/s
latency: [ 500.0 ] # ns
10 changes: 10 additions & 0 deletions hoti2024/demo5/inputs/Ring_chakra.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"scheduling-policy": "LIFO",
"endpoint-delay": 10,
"active-chunks-per-dimension": 1,
"preferred-dataset-splits": 4,
"all-reduce-implementation-chakra": ["/app/hoti2024/demo5/inputs/custom_ring"],
"collective-optimization": "localBWAware",
"local-mem-bw": 50,
"boost-mode": 0
}
15 changes: 15 additions & 0 deletions hoti2024/demo5/mscclang_to_chakra.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash
# Convert the MSCCLang ring All-Reduce example into Chakra traces:
#   1. run the MSCCLang example program and capture its stdout as MSCCL-IR XML
#   2. feed that XML to collectiveapi's et_converter.py
# Both the intermediate XML and the converter output live in this script's
# inputs/ directory.
#
# Usage: ./mscclang_to_chakra.sh   (may be invoked from any directory)
set -e

# Path
SCRIPT_DIR=$(dirname "$(realpath "$0")")
INPUTS_DIR="${SCRIPT_DIR}/inputs"

# MSCCLang -> MSCCL-IR (XML)
# The XML is written under SCRIPT_DIR (not the caller's cwd) so that the
# converter step below always reads the file produced here.
python3 "${SCRIPT_DIR}/../collectiveapi/msccl-tools/examples/mscclang/allreduce_a100_ring.py" 8 1 1 \
  > "${INPUTS_DIR}/custom_allreduce_ring.xml"

# MSCCL-IR -> Chakra
cd "${SCRIPT_DIR}/../collectiveapi/chakra_converter/"
python3 et_converter.py \
  --input_type msccl \
  --input_filename "${INPUTS_DIR}/custom_allreduce_ring.xml" \
  --output_filename "${INPUTS_DIR}/custom_ring"
15 changes: 15 additions & 0 deletions hoti2024/demo5/run_demo5.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash
# Demo 5: run the congestion-aware analytical ASTRA-sim binary on the demo1
# All-Reduce workload, using the Ring_chakra.json system configuration and
# the 8-NPU ring network.
#
# NOTE(review): inputs/Ring_chakra.json references the custom collective by
# the absolute path /app/hoti2024/demo5/inputs/custom_ring, so this demo
# presumably expects the repository to be mounted at /app and
# mscclang_to_chakra.sh to have been run first — confirm.
#
# Usage: ./run_demo5.sh   (may be invoked from any directory)
set -e

# Path (quoted so a checkout path containing spaces still works)
SCRIPT_DIR=$(dirname "$(realpath "$0")")
ASTRA_SIM="${SCRIPT_DIR}/../astra-sim/build/astra_analytical/build/bin/AstraSim_Analytical_Congestion_Aware"

# Run ASTRA-sim
(
  "${ASTRA_SIM}" \
    --workload-configuration="${SCRIPT_DIR}/../demo1/allreduce/allreduce" \
    --system-configuration="${SCRIPT_DIR}/inputs/Ring_chakra.json" \
    --network-configuration="${SCRIPT_DIR}/inputs/Ring_8.yml" \
    --remote-memory-configuration="${SCRIPT_DIR}/inputs/RemoteMemory.json"
)
12 changes: 12 additions & 0 deletions hoti2024/install_msccl_tools.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/bash
# Install msccl-tools from the collectiveapi checkout located next to this
# script, using whichever `pip` is first on PATH.
#
# Usage: ./install_msccl_tools.sh   (may be invoked from any directory)
set -e

# Path (quoted so a checkout path containing spaces still works)
SCRIPT_DIR=$(dirname "$(realpath "$0")")

# Install MSCCL-tools
(
  cd "${SCRIPT_DIR}/collectiveapi/msccl-tools"
  pip install .
)