CrayLabs
diff --git a/‎PERFORMANCE.md‎
Lines changed: 356 additions & 0 deletions b/‎PERFORMANCE.md‎
Lines changed: 356 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 191 additions & 611 deletions b/‎README.md‎
Lines changed: 191 additions & 611 deletions
diff --git a/‎batch_scripts/run_aggregation_python_fs_slurm.sh‎
Lines changed: 0 additions & 1 deletion b/‎batch_scripts/run_aggregation_python_fs_slurm.sh‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎batch_scripts/run_aggregation_python_slurm.sh‎
Lines changed: 1 addition & 2 deletions b/‎batch_scripts/run_aggregation_python_slurm.sh‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎batch_scripts/run_aggregation_slurm.sh‎
Lines changed: 8 additions & 5 deletions b/‎batch_scripts/run_aggregation_slurm.sh‎
Lines changed: 8 additions & 5 deletions
diff --git a/‎batch_scripts/run_inference_colo_slurm.sh‎
Lines changed: 5 additions & 11 deletions b/‎batch_scripts/run_inference_colo_slurm.sh‎
Lines changed: 5 additions & 11 deletions
diff --git a/‎batch_scripts/run_inference_standard_slurm.sh‎
Lines changed: 6 additions & 6 deletions b/‎batch_scripts/run_inference_standard_slurm.sh‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎batch_scripts/run_throughput_pbs.sh‎
Lines changed: 1 addition & 1 deletion b/‎batch_scripts/run_throughput_pbs.sh‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎batch_scripts/run_throughput_slurm.sh‎
Lines changed: 2 additions & 3 deletions b/‎batch_scripts/run_throughput_slurm.sh‎
Lines changed: 2 additions & 3 deletions
diff --git a/‎cpp-data-aggregation/aggregation_consumer.cpp‎
Lines changed: 10 additions & 1 deletion b/‎cpp-data-aggregation/aggregation_consumer.cpp‎
Lines changed: 10 additions & 1 deletion
@@ -5,7 +5,6 @@
 #SBATCH -t 24:00:00
 
 cd ..
-module load slurm
 python driver.py aggregation_scaling_python_fs --exp_name='aggregation-scaling-py-fs-batch' \
                                                --client_nodes=[60] \
                                                --clients_per_node=[48] \
 
@@ -3,9 +3,8 @@
 #SBATCH -N 93
 #SBATCH --exclusive
 #SBATCH -t 24:00:00
-
+echo "Note: The flag net_ifname should be replaced with the appropriate value on the target system"
 cd ..
-module load slurm
 python driver.py aggregation_scaling_python --exp_name='aggregation-scaling-py-batch' \
                                             --client_nodes=[60] \
                                             --clients_per_node=[48] \
 
@@ -5,13 +5,16 @@
 #SBATCH -t 12:00:00
 #SBATCH -C SK48
 #SBATCH --oversubscribe
-
+echo "Note: The flag net_ifname should be replaced with the appropriate value on the target system"
 cd ..
-module load slurm
-python driver.py aggregation_scaling --client_nodes=[60] \
+python driver.py aggregation_scaling --exp_name='aggregation-scaling-batch' \
+                                     --client_nodes=[60] \
                                      --clients_per_node=[48] \
-                                     --db_nodes=[16,32] \
+                                     --db_nodes=[16] \
                                      --db_cpus=32 --net_ifname=ipogif0 \
                                      --run_db_as_batch=False \
-                                     --tensors_per_dataset=[1,4]
+                                     --tensors_per_dataset=[4] \
+                                     --tensor_bytes=[1024000] \
+                                     --iterations=20 \
+                                     --tensors_per_dataset=[4]
 
@@ -1,15 +1,9 @@
 #!/bin/bash
 
-#SBATCH -N 1
+#SBATCH -N 16
+#SBATCH -C "P100*16"
 #SBATCH --exclusive
-#SBATCH -p allgriz
-#SBATCH -t 1:00:00
-
-module load cudatoolkit/11.7 cudnn PrgEnv-intel
-source ~/pyenvs/smartsim-dev/bin/activate
-
+#SBATCH -t 10:00:00
+echo "Note: The flag net_ifname should be replaced with the appropriate value on the target system"
 cd ..
-python driver.py inference_colocated --clients_per_node=[12,24,36,60,96] \
-                                     --nodes=[1] --db_tpq=[2] \
-                                     --db_cpus=[12] --pin_app_cpus=[True] \
-                                     --net_type="uds" --node_feature='{}' --languages=['fortran','cpp']
+python driver.py inference_colocated --nodes=[4,8,12,16]
@@ -1,11 +1,11 @@
 #!/bin/bash
 
-#SBATCH -N 60
+#SBATCH -N 116
+#SBATCH -C "[P100*16&SK48*100]"
 #SBATCH --exclusive
 #SBATCH -t 10:00:00
-
+echo "Note: The flag net_ifname should be replaced with the appropriate value on the target system"
 cd ..
-module load slurm
-python driver.py inference_standard --client_nodes=[20,40,60] \
-                                    --db_nodes=[4,8,16] --db_tpq=[1,2,4] \
-                                    --db_cpus=[8,16]
+python driver.py inference_standard --client_nodes=[25,50,75,100] \
+                                    --db_nodes=[4,8,16] --db_tpq=[1] \
+                                    --db_cpus=[8]
@@ -5,7 +5,7 @@
 #PBS -o throughput.out
 #PBS -N smartsim-throughput
 #PBS -V
-
+echo "Note: The flag net_ifname should be replaced with the appropriate value on the target system"
 PYTHON=/lus/snx11242/spartee/miniconda/envs/0.4.0/bin/python
 cd $PBS_O_WORKDIR/../
 $PYTHON driver.py throughput_standard --client_nodes=[128,256,512] \
 
@@ -5,12 +5,11 @@
 #SBATCH -t 10:00:00
 #SBATCH -C SK48
 #SBATCH --oversubscribe
-
+echo "Note: The flag net_ifname should be replaced with the appropriate value on the target system"
 cd ..
-module load slurm
 python driver.py throughput_standard --client_nodes=[60] \
                                     --clients_per_node=[48] \
                                     --db_nodes=[32] \
-                                    --db_cpus=32 --net_ifname=ipogif0 \
+                                    --db_cpus=[32] --net_ifname=ipogif0 \
                                     --run_db_as_batch=False
 
@@ -35,6 +35,9 @@ void run_aggregation_consumer(std::ofstream& timing_file,
     // Allocate arrays to hold timings
     std::vector<double> get_list_times;
 
+    // Allocate array to hold poll_list timings
+    std::vector<double> poll_list_times;
+
     // Retrieve the number of iterations to run
     int iterations = get_iterations();
     log_data(context, LLDebug, "Running with iterations: " + std::to_string(iterations));
@@ -59,6 +62,7 @@ void run_aggregation_consumer(std::ofstream& timing_file,
             log_data(context, LLInfo, "Consuming list " + std::to_string(i));
         }
 
+        double poll_list_start = MPI_Wtime();
         // Have rank 0 check that the aggregation list is full
         if(rank == 0) {
             bool list_is_ready = client.poll_list_length(list_name,
@@ -73,7 +77,10 @@ void run_aggregation_consumer(std::ofstream& timing_file,
                 throw std::runtime_error(list_size_error);
             }
         }
-
+        double poll_list_end = MPI_Wtime();
+        log_data(context, LLDebug, "poll_list completed");
+        delta_t = poll_list_end - poll_list_start;
+        poll_list_times.push_back(delta_t);
         // Have all ranks wait until the aggregation list is full
         MPI_Barrier(MPI_COMM_WORLD);
 
@@ -104,6 +111,8 @@ void run_aggregation_consumer(std::ofstream& timing_file,
     for (int i = 0; i < iterations; i++) {
         timing_file << rank << "," << "get_list" << ","
                     << get_list_times[i] << "\n";
+        timing_file << rank << "," << "poll_list" << ","
+                    << poll_list_times[i] << "\n";
     }
 
     // Write loop time to file