treewalk secondary

astro-YYH · astro-YYH · commit 6602817aa705 · 2024-10-28T22:00:45.000-07:00
diff --git a/libgadget/treewalk.c b/libgadget/treewalk.c
@@ -686,6 +686,68 @@ static struct CommBuffer ev_secondary(struct CommBuffer * imports, struct ImpExp
     return res_imports;
 }
 
+/* GPU variant of ev_secondary: walks the tree for the queries imported from other ranks and
+ * sends each rank's results back as soon as they have been computed.
+ *
+ * imports        - receive buffer: rdata_all holds the pending requests, databuf the imported queries.
+ * counts         - per-rank Import_count / Import_offset tables, the total Nimport and the communicator.
+ * tw             - the treewalk; supplies the query and result element sizes.
+ * TreeParams_ptr - short-range gravity parameters, forwarded to the GPU kernel launcher.
+ *
+ * Returns a CommBuffer holding the result data and one MPI_Isend request per processed rank.
+ * The sends are only started here, not waited on.
+ *
+ * NOTE(review): the result buffer comes from mymalloc2 and is written by the GPU kernel, and the query
+ * buffer is read by it -- this presumably relies on both being device-accessible; confirm. */
+static struct CommBuffer ev_secondary_gpu(struct CommBuffer * imports, struct ImpExpCounts* counts, TreeWalk * tw, const struct gravshort_tree_params * TreeParams_ptr)
+{
+    struct CommBuffer res_imports = {0};
+    alloc_commbuffer(&res_imports, counts->NTask, 1);
+    res_imports.databuf = (char *) mymalloc2("ImportResult", counts->Nimport * tw->result_type_elsize);
+
+    /* One MPI element is one result struct, shipped as raw bytes. */
+    MPI_Datatype type;
+    MPI_Type_contiguous(tw->result_type_elsize, MPI_BYTE, &type);
+    MPI_Type_commit(&type);
+    /* Filled by MPI_Waitsome with the indices of the requests that just completed. */
+    int * complete_array = ta_malloc("completes", int, imports->nrequest_all);
+
+    int tot_completed = 0;
+    /* Handle the import requests in completion order until all of them have been processed. */
+    while(tot_completed < imports->nrequest_all) {
+        int complete_cnt = MPI_UNDEFINED;
+        /* Check for some completed requests: note that cleanup is performed if the requests are complete.
+         * There may be only 1 completed request, and we need to wait again until we have more.*/
+        MPI_Waitsome(imports->nrequest_all, imports->rdata_all, &complete_cnt, complete_array, MPI_STATUSES_IGNORE);
+        /* This happens if all requests are MPI_REQUEST_NULL. It should never be hit*/
+        if (complete_cnt == MPI_UNDEFINED)
+            break;
+        for(int j = 0; j < complete_cnt; j++) {
+            const int i = complete_array[j];
+            /* Note the task number index is not the index in the request array (some tasks were skipped because we have zero exports)! */
+            const int task = imports->rqst_task[i];
+            const int64_t nimports_task = counts->Import_count[task];
+            /* This rank's slice of the query buffer and the matching slice of the result buffer. */
+            char * databufstart = imports->databuf + counts->Import_offset[task] * tw->query_type_elsize;
+            char * dataresultstart = res_imports.databuf + counts->Import_offset[task] * tw->result_type_elsize;
+
+            /* Each rank's imports are processed by their own kernel launch; P is the particle table declared
+             * outside this function. The launcher synchronises with the device before it returns, so the
+             * results are complete when the send below is posted. */
+            run_treewalk_secondary_kernel(tw, P, TreeParams_ptr, databufstart, dataresultstart, nimports_task);
+
+            /* Send the completed data back*/
+            res_imports.rqst_task[res_imports.nrequest_all] = task;
+            MPI_Isend(dataresultstart, nimports_task, type, task, 101923, counts->comm, &res_imports.rdata_all[res_imports.nrequest_all++]);
+            tot_completed++;
+        }
+    }
+
+    myfree(complete_array);
+    MPI_Type_free(&type);
+    return res_imports;
+}
+
 static struct ImpExpCounts
 ev_export_import_counts(TreeWalk * tw, MPI_Comm comm)
 {
@@ -879,12 +941,14 @@ treewalk_run(TreeWalk * tw, int * active_set, size_t size, struct gravshort_tree
                 message(0, "Starting ev_primary (cpu) for %s with %ld particles\n", tw->ev_label, tw->WorkSetSize);
                 ev_primary(tw); // cpu version
 #else
-                if (TreeParams_ptr == NULL)
+                if (TreeParams_ptr == NULL) {
                     message(0, "Starting ev_primary (cpu) for %s with %ld particles\n", tw->ev_label, tw->WorkSetSize);
                     ev_primary(tw); // cpu version still used for FoF now
-                else
+                }
+                else {
                     message(0, "Starting ev_primary (gpu) for %s with %ld particles\n", tw->ev_label, tw->WorkSetSize);
                     ev_primary_gpu(tw, TreeParams_ptr); /* do local particles and prepare export list */
+                }
 #endif
                 message(0, "Finished ev_primary for %s with %ld particles\n", tw->ev_label, tw->WorkSetSize);
             }
@@ -897,7 +961,21 @@ treewalk_run(TreeWalk * tw, int * active_set, size_t size, struct gravshort_tree
             /* Posts recvs to get the export results (which are sent in ev_secondary).*/
             struct CommBuffer res_exports = {0};
             ev_recv_export_result(&res_exports, &counts, tw);
+#ifdef TREE_CPU
+            message(0, "Starting ev_secondary (cpu) for %s with %ld particles\n", tw->ev_label, (long) counts.Nimport); /* %ld needs its argument: the import total */
             struct CommBuffer res_imports = ev_secondary(&imports, &counts, tw);
+#else
+            struct CommBuffer res_imports;
+            if (TreeParams_ptr == NULL) {
+                message(0, "Starting ev_secondary (cpu) for %s with %ld particles\n", tw->ev_label, (long) counts.Nimport); /* %ld needs its argument: the import total */
+                res_imports = ev_secondary(&imports, &counts, tw); // cpu version still used for FoF now
+            }
+            else {
+                message(0, "Starting ev_secondary (gpu) for %s\n", tw->ev_label);
+                res_imports = ev_secondary_gpu(&imports, &counts, tw, TreeParams_ptr);
+            }
+#endif
+            message(0, "Finished ev_secondary for %s\n", tw->ev_label);
             // report_memory_usage(tw->ev_label);
             free_commbuffer(&imports);
             tend = second();
diff --git a/libgadget/treewalk_kernel.cu b/libgadget/treewalk_kernel.cu
@@ -514,3 +514,36 @@ void run_treewalk_kernel(TreeWalk *tw, struct particle_data *particles, const st
     //     message(0, "CUDA error: %s\n", cudaGetErrorString(err));
     // }
 }
+
+/* Secondary (ghost-mode) short-range gravity treewalk on the GPU: each thread handles at most one imported
+ * query, reading element tid of databufstart and writing element tid of dataresultstart; element sizes come
+ * from tw. NOTE(review): tw, particles, TreeParams_ptr and both buffers are dereferenced in device code, so
+ * they are presumably device-accessible allocations -- confirm against the callers. */
+__global__ void treewalk_secondary_kernel(TreeWalk *tw, struct particle_data *particles, const struct gravshort_tree_params * TreeParams_ptr, char* databufstart, char* dataresultstart, const int64_t nimports_task) {
+    LocalTreeWalk lv;
+    ev_init_thread_device(tw, &lv);
+    lv.mode = TREEWALK_GHOSTS;
+
+    /* Widen before multiplying: blockIdx.x * blockDim.x is evaluated in 32-bit unsigned arithmetic and
+     * would wrap before being stored in the 64-bit index. */
+    const int64_t tid = (int64_t) blockIdx.x * blockDim.x + threadIdx.x;
+
+    /* The launch rounds the grid up to whole blocks, so trailing threads have no query to process. */
+    if (tid < nimports_task) {
+        TreeWalkQueryGravShort * input = (TreeWalkQueryGravShort *) (databufstart + tid * tw->query_type_elsize);
+        TreeWalkResultGravShort * output = (TreeWalkResultGravShort *) (dataresultstart + tid * tw->result_type_elsize);
+        treewalk_init_result_device(tw, output, input);
+
+        /* -1: presumably "no local target particle" for an imported query -- confirm. */
+        lv.target = -1;
+        force_treeev_shortrange_device(input, output, &lv, TreeParams_ptr, particles);
+    }
+}
+
+void run_treewalk_secondary_kernel(TreeWalk *tw, struct particle_data *particles, const struct gravshort_tree_params * TreeParams_ptr, char* databufstart, char* dataresultstart, const int64_t nimports_task) {
+    if (nimports_task <= 0) return; /* empty import list: a zero-block launch would be an invalid CUDA configuration */
+    const int threadsPerBlock = 256;
+    const int blocks = (int) ((nimports_task + threadsPerBlock - 1) / threadsPerBlock); /* round up: one thread per imported query */
+    treewalk_secondary_kernel<<<blocks, threadsPerBlock>>>(tw, particles, TreeParams_ptr, databufstart, dataresultstart, nimports_task);
+    cudaDeviceSynchronize(); /* block the host until the kernel has finished */
+}
diff --git a/libgadget/treewalk_kernel.h b/libgadget/treewalk_kernel.h
@@ -13,4 +13,6 @@ void run_treewalk_kernel(TreeWalk *tw, struct particle_data *particles, const st
 
 void run_gravshort_fill_ntab(const enum ShortRangeForceWindowType ShortRangeForceWindowType, const double Asmth);
 
+void run_treewalk_secondary_kernel(TreeWalk *tw, struct particle_data *particles, const struct gravshort_tree_params * TreeParams_ptr, char* databufstart, char* dataresultstart, const int64_t nimports_task); /* launches the secondary treewalk kernel over nimports_task imported queries and waits for it to finish */
+
 #endif  // TREEWALK_KERNEL_H