Skip to content

Commit c8ff9ec

Browse files
committed
SmartSim-Scaling README additions, plotting edits, data-reading edits, and app edits (#39)
This merge organizes the current README layout, provides new performance results, removes the setting of the model and script in the application, refactors the current plotting code, and adds two additional requirements. [ committed by @amandarichardsonn ] [ reviewed by @billschereriii @ashao ]
1 parent 9e610b1 commit c8ff9ec

File tree

78 files changed

+3391
-3193
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

78 files changed

+3391
-3193
lines changed

PERFORMANCE.md

Lines changed: 356 additions & 0 deletions
Large diffs are not rendered by default.

README.md

Lines changed: 191 additions & 611 deletions
Large diffs are not rendered by default.

batch_scripts/run_aggregation_python_fs_slurm.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
#SBATCH -t 24:00:00
66

77
cd ..
8-
module load slurm
98
python driver.py aggregation_scaling_python_fs --exp_name='aggregation-scaling-py-fs-batch' \
109
--client_nodes=[60] \
1110
--clients_per_node=[48] \

batch_scripts/run_aggregation_python_slurm.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,8 @@
33
#SBATCH -N 93
44
#SBATCH --exclusive
55
#SBATCH -t 24:00:00
6-
6+
echo "Note: The flag net_ifname should be replaced with the appropriate value on the target system"
77
cd ..
8-
module load slurm
98
python driver.py aggregation_scaling_python --exp_name='aggregation-scaling-py-batch' \
109
--client_nodes=[60] \
1110
--clients_per_node=[48] \

batch_scripts/run_aggregation_slurm.sh

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,16 @@
55
#SBATCH -t 12:00:00
66
#SBATCH -C SK48
77
#SBATCH --oversubscribe
8-
8+
echo "Note: The flag net_ifname should be replaced with the appropriate value on the target system"
99
cd ..
10-
module load slurm
11-
python driver.py aggregation_scaling --client_nodes=[60] \
10+
python driver.py aggregation_scaling --exp_name='aggregation-scaling-batch' \
11+
--client_nodes=[60] \
1212
--clients_per_node=[48] \
13-
--db_nodes=[16,32] \
13+
--db_nodes=[16] \
1414
--db_cpus=32 --net_ifname=ipogif0 \
1515
--run_db_as_batch=False \
16-
--tensors_per_dataset=[1,4]
16+
--tensors_per_dataset=[4] \
17+
--tensor_bytes=[1024000] \
18+
--iterations=20 \
19+
--tensors_per_dataset=[4]
1720

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,9 @@
11
#!/bin/bash
22

3-
#SBATCH -N 1
3+
#SBATCH -N 16
4+
#SBATCH -C "P100*16"
45
#SBATCH --exclusive
5-
#SBATCH -p allgriz
6-
#SBATCH -t 1:00:00
7-
8-
module load cudatoolkit/11.7 cudnn PrgEnv-intel
9-
source ~/pyenvs/smartsim-dev/bin/activate
10-
6+
#SBATCH -t 10:00:00
7+
echo "Note: The flag net_ifname should be replaced with the appropriate value on the target system"
118
cd ..
12-
python driver.py inference_colocated --clients_per_node=[12,24,36,60,96] \
13-
--nodes=[1] --db_tpq=[2] \
14-
--db_cpus=[12] --pin_app_cpus=[True] \
15-
--net_type="uds" --node_feature='{}' --languages=['fortran','cpp']
9+
python driver.py inference_colocated --nodes=[4, 8, 12, 16]
Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
#!/bin/bash
22

3-
#SBATCH -N 60
3+
#SBATCH -N 116
4+
#SBATCH -C "[P100*16&SK48*100]"
45
#SBATCH --exclusive
56
#SBATCH -t 10:00:00
6-
7+
echo "Note: The flag net_ifname should be replaced with the appropriate value on the target system"
78
cd ..
8-
module load slurm
9-
python driver.py inference_standard --client_nodes=[20,40,60] \
10-
--db_nodes=[4,8,16] --db_tpq=[1,2,4] \
11-
--db_cpus=[8,16]
9+
python driver.py inference_standard --client_nodes=[25, 50, 75, 100] \
10+
--db_nodes=[4, 8, 16] --db_tpq=[1] \
11+
--db_cpus=[8]

batch_scripts/run_throughput_pbs.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
#PBS -o throughput.out
66
#PBS -N smartsim-throughput
77
#PBS -V
8-
8+
echo "Note: The flag net_ifname should be replaced with the appropriate value on the target system"
99
PYTHON=/lus/snx11242/spartee/miniconda/envs/0.4.0/bin/python
1010
cd $PBS_O_WORKDIR/../
1111
$PYTHON driver.py throughput_standard --client_nodes=[128,256,512] \

batch_scripts/run_throughput_slurm.sh

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,11 @@
55
#SBATCH -t 10:00:00
66
#SBATCH -C SK48
77
#SBATCH --oversubscribe
8-
8+
echo "Note: The flag net_ifname should be replaced with the appropriate value on the target system"
99
cd ..
10-
module load slurm
1110
python driver.py throughput_standard --client_nodes=[60] \
1211
--clients_per_node=[48] \
1312
--db_nodes=[32] \
14-
--db_cpus=32 --net_ifname=ipogif0 \
13+
--db_cpus=[32] --net_ifname=ipogif0 \
1514
--run_db_as_batch=False
1615

cpp-data-aggregation/aggregation_consumer.cpp

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@ void run_aggregation_consumer(std::ofstream& timing_file,
3535
// Allocate arrays to hold timings
3636
std::vector<double> get_list_times;
3737

38+
// Allocate arrays to hold timings
39+
std::vector<double> poll_list_times;
40+
3841
// Retrieve the number of iterations to run
3942
int iterations = get_iterations();
4043
log_data(context, LLDebug, "Running with iterations: " + std::to_string(iterations));
@@ -59,6 +62,7 @@ void run_aggregation_consumer(std::ofstream& timing_file,
5962
log_data(context, LLInfo, "Consuming list " + std::to_string(i));
6063
}
6164

65+
double poll_list_start = MPI_Wtime();
6266
// Have rank 0 check that the aggregation list is full
6367
if(rank == 0) {
6468
bool list_is_ready = client.poll_list_length(list_name,
@@ -73,7 +77,10 @@ void run_aggregation_consumer(std::ofstream& timing_file,
7377
throw std::runtime_error(list_size_error);
7478
}
7579
}
76-
80+
double poll_list_end = MPI_Wtime();
81+
log_data(context, LLDebug, "poll_list completed");
82+
delta_t = poll_list_end - poll_list_start;
83+
poll_list_times.push_back(delta_t);
7784
// Have all ranks wait until the aggregation list is full
7885
MPI_Barrier(MPI_COMM_WORLD);
7986

@@ -104,6 +111,8 @@ void run_aggregation_consumer(std::ofstream& timing_file,
104111
for (int i = 0; i < iterations; i++) {
105112
timing_file << rank << "," << "get_list" << ","
106113
<< get_list_times[i] << "\n";
114+
timing_file << rank << "," << "poll_list" << ","
115+
<< poll_list_times[i] << "\n";
107116
}
108117

109118
// Write loop time to file

0 commit comments

Comments
 (0)