
Commit 865af34: "more"

1 parent 424a4b5

File tree

10 files changed: +141 -205 lines changed


examples/64.sh

Lines changed: 0 additions & 64 deletions
This file was deleted.

examples/README.md

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
## Files

- **train.py**: Contains the model definition, data loading, training loop, and other essential components for training a GNN.

- **run_4.sh**: An example shell script for Perlmutter demonstrating how to run a Plexus-parallelized GNN on 4 GPUs. It includes placeholders that should be replaced with appropriate values for a specific experiment, such as the dataset path and output directory. The script can be adapted to run on different numbers of GPUs and with different datasets.

For example, the script can be launched using:

```bash
sbatch run_4.sh 1 1 4 0
```

This executes training with a 3D parallelism configuration of (X, Y, Z) = (1, 1, 4) for trial number 0. The trial number is typically used to differentiate the output files of multiple runs.

- **get_rank.sh**: Sets the ranks of the GPUs involved in distributed training; it also limits the core dump file size to 0.

- **parse_results.py**: Contains the `process_log_file` function, which parses the timing results from the output log file generated by a training run.

- `export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"`: Set this if you see warnings about fragmentation, which can cause GPU out-of-memory (OOM) errors (a Python alternative is sketched below).
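
The allocator option in the last bullet can also be applied from inside Python rather than the shell. This in-script variant is an illustration, not part of this commit; the environment variable itself is standard PyTorch, and it must take effect before the first CUDA allocation, so it is set before `torch` is imported:

```python
import os

# Must be set before PyTorch initializes its CUDA caching allocator,
# i.e., before the first CUDA allocation.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch  # intentionally imported after setting the env var
```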

examples/parse_results.py

Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
import re


def extract_avg_time(line):
    # Pull the floating-point value out of a line like "... Avg Time: 12.34".
    match = re.search(r"Avg Time: ([0-9]*\.?[0-9]+)", line)
    return float(match.group(1)) if match else 0


def process_log_file(filename, warmup):
    """
    Args:
        filename - path to the log file to parse
        warmup - number of epochs to ignore in the calculation

    Returns:
        tuple containing the epoch time and communication time, averaged across non-warmup epochs
    """

    comm_times, epoch_times = [], []
    comm_time, comp_time, cross_time = None, None, None

    with open(filename, "r") as file:
        for line in file:
            line = line.strip()

            # An "epoch" line marks an epoch boundary: record the previous
            # epoch's totals (if any) and reset the accumulators.
            if (
                "epoch " in line
                and comm_time is not None
                and comp_time is not None
                and cross_time is not None
            ):
                epoch_times.append(comp_time + comm_time + cross_time)
                comm_times.append(comm_time)
                comm_time = 0
                comp_time = 0
                cross_time = 0
            elif "epoch " in line:
                comm_time = 0
                comp_time = 0
                cross_time = 0
            # Communication collectives.
            elif comm_time is not None and any(
                keyword in line
                for keyword in ["gather ", "all-reduce ", "reduce-scatter "]
            ):
                comm_time += extract_avg_time(line)
            # Forward/backward matrix products.
            elif comp_time is not None and any(
                keyword in line
                for keyword in [
                    "AGG = A * H ",
                    "OUT = AGG * W ",
                    "GRAD_W = AGG.T * GRAD_OUT ",
                    "GRAD_AGG = GRAD_OUT * W.T ",
                    "GRAD_H = A.T * GRAD_AGG ",
                ]
            ):
                comp_time += extract_avg_time(line)
            # Loss computation.
            elif cross_time is not None and any(
                keyword in line for keyword in ["cross entropy"]
            ):
                cross_time += extract_avg_time(line)

    # Record the final epoch, which has no trailing "epoch" line after it.
    if comm_time is not None and comp_time is not None and cross_time is not None:
        epoch_times.append(comp_time + comm_time + cross_time)
        comm_times.append(comm_time)

    return sum(epoch_times[warmup:]) / (len(epoch_times) - warmup), sum(
        comm_times[warmup:]
    ) / (len(comm_times) - warmup)
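
For reference, a minimal sketch of how `process_log_file` might be called. The log filename is hypothetical (it follows the `<name>_X<x>Y<y>Z<z>_<trial>.txt` output pattern of `run_4.sh`), and the `parse_results` module name assumes the script is run from the `examples` directory:

```python
from parse_results import process_log_file

# Ignore the first 2 epochs as warmup.
epoch_time, comm_time = process_log_file("products_X1Y1Z4_0.txt", warmup=2)
print(f"avg epoch time: {epoch_time}, avg comm time: {comm_time}")
```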

examples/process_comm_model.py

Lines changed: 0 additions & 117 deletions
This file was deleted.

examples/4.sh renamed to examples/run_4.sh

Lines changed: 5 additions & 6 deletions
@@ -1,15 +1,14 @@
 #!/bin/bash
-#SBATCH -q debug
 #SBATCH --time=00:10:00
 #SBATCH --gpus-per-node=4
-#SBATCH -A m2404_g
+#SBATCH -A <account>
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=4
 #SBATCH -C gpu

 module load nccl
 module load cudatoolkit/12.4
-source $SCRATCH/gnn-env/bin/activate
+source <path/to/venv/bin/activate>

 NNODES=$SLURM_JOB_NUM_NODES
 GPUS=$(( NNODES * 4 ))
@@ -44,10 +43,10 @@ G_INTRA_C=$2
 G_INTRA_D=$3
 TRIAL_NUM=$4

-SCRIPT="../../../main/train.py --G_intra_r ${G_INTRA_R} --G_intra_c ${G_INTRA_C} --G_intra_d ${G_INTRA_D} --gpus_per_node ${GPUS_PER_NODE} --num_epochs 10"
-SCRIPT="$SCRIPT --data_dir $SCRATCH/gnn-env/gnn-datasets/partitioned_products"
+SCRIPT="train.py --G_intra_r ${G_INTRA_R} --G_intra_c ${G_INTRA_C} --G_intra_d ${G_INTRA_D} --gpus_per_node ${GPUS_PER_NODE} --num_epochs 10"
+SCRIPT="$SCRIPT --data_dir <path/to/dataset>"

-run_cmd="srun -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 ../.././get_rank.sh python -u $SCRIPT > ../../../results/products/perlmutter/scaling/${GPUS}/${TRIAL_NUM}/products_X${G_INTRA_R}Y${G_INTRA_C}Z${G_INTRA_D}.txt 2>&1"
+run_cmd="srun -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 ./get_rank.sh python -u $SCRIPT > <path/to/save/results/to>/<dataset/output/file/name>_X${G_INTRA_R}Y${G_INTRA_C}Z${G_INTRA_D}_${TRIAL_NUM}.txt 2>&1"

 echo $run_cmd
 eval $run_cmd

performance/README.md

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
## Files

- **comm_model.py**: Models the communication time of different 3D configurations.

- **comp_model.py**: Models the computation time of Sparse Matrix-Matrix Multiplication (SpMM), a core operation in GNN training.

- **mem_model.py**: Models the GPU memory usage of different 3D configurations.

performance/comm_model.py

Lines changed: 12 additions & 4 deletions
@@ -76,6 +76,18 @@ def get_bw(ip, my, GPUS_PER_NODE, version, machine):


 def compute_config_costs(G, N, D_list, version, machine):
+    """
+    Args:
+        G - number of GPUs
+        N - number of nodes in the graph
+        D_list - list of feature dimensions at each layer (e.g., 3 GCN layers with hidden dim 128, input feature size 100, and 60 classes: [100, 128, 128, 60])
+        version - "v1" for placement/bandwidth-agnostic, "v2" for placement-aware with theoretical bandwidths, "v3" for placement-aware with empirical bandwidths
+        machine - currently supports perlmutter and frontier, but bandwidths for other machines can also be added
+
+    Returns:
+        Estimated communication time (ms) for each 3D config
+    """
+
     if machine == "perlmutter":
         GPUS_PER_NODE = 4
     elif machine == "frontier":
@@ -195,7 +207,3 @@ def compute_config_costs(G, N, D_list, version, machine):
     config_to_cost = dict(sorted(config_to_cost.items(), key=lambda item: item[1]))

     return config_to_cost
-
-
-if __name__ == "__main__":
-    print(compute_config_costs(64, 2449029, [100, 128, 128, 47], "v3", "perlmutter"))
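
The removed `__main__` block above doubled as a usage example; an equivalent call is sketched below. The arguments are taken verbatim from that block (64 GPUs, a graph with 2,449,029 nodes, layer dims [100, 128, 128, 47], empirical bandwidths on Perlmutter); only the `comm_model` import path is an assumption:

```python
from comm_model import compute_config_costs

# Returns a dict mapping each (x, y, z) config to its estimated
# communication time (ms), sorted from cheapest to most expensive.
config_to_cost = compute_config_costs(64, 2449029, [100, 128, 128, 47], "v3", "perlmutter")
best = next(iter(config_to_cost))
print(best, config_to_cost[best])
```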

performance/comp_model.py

Lines changed: 19 additions & 9 deletions
@@ -19,7 +19,18 @@ def split_into_three_powers_of_two(G):


 # don't include number of classes in D_list
-def comp_model(N, NNZ, G, D_list):
+def comp_model(N, NNZ, G, D_list, coef=[1, 1, 1]):
+    """
+    Args:
+        N - number of nodes in the graph
+        NNZ - number of nonzeros in the graph's adjacency matrix
+        G - number of GPUs
+        D_list - list of feature dimensions at each layer, excluding the number of classes (e.g., 3 GCN layers with hidden dim 128 and input feature size 100: [100, 128, 128])
+        coef - coefficients that scale the three terms of the model to produce times in ms (the default coefficients do not yield meaningful times, but do preserve the relative ordering of the configs)
+
+    Returns:
+        Estimated SpMM time (ms) for each 3D config
+    """
+
     cost_dict = dict()
     for x, y, z in split_into_three_powers_of_two(G):
         flops_cost, fwd_penalty, bwd_penalty = 0, 0, 0
@@ -36,13 +47,12 @@ def comp_model(N, NNZ, G, D_list):
             D_list[i] * [x, z, y][(i + 1) % 3]
         )

-        curr_cost = flops_cost + fwd_penalty + bwd_penalty
-
-        cost_dict[(x, y, z)] = (curr_cost, flops_cost, fwd_penalty, bwd_penalty)
-    cost_dict = dict(sorted(cost_dict.items(), key=lambda kv: kv[1][0]))
-    return cost_dict
+        cost_dict[(x, y, z)] = (
+            (coef[0] * (flops_cost**0.5))
+            + (coef[1] * (flops_cost**0.5) * fwd_penalty)
+            + (coef[2] * (flops_cost**0.5) * bwd_penalty)
+        )

+    cost_dict = dict(sorted(cost_dict.items(), key=lambda kv: kv[1]))

-if __name__ == "__main__":
-    x = comp_model(2449029, 126167053, 64, [100, 128, 128])
-    print(x.keys())
+    return cost_dict
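
Similarly, the `__main__` block removed from this file showed how `comp_model` is invoked; a sketch with the same arguments (module name assumed):

```python
from comp_model import comp_model

# ~2.45M nodes, ~126M nonzeros, 64 GPUs, per-layer feature dims
# [100, 128, 128] (classes excluded). Configs come back sorted cheapest-first.
cost_dict = comp_model(2449029, 126167053, 64, [100, 128, 128])
print(list(cost_dict)[:3])  # the three cheapest (x, y, z) configs
```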
