@@ -26,6 +26,7 @@ AllReduce::AllReduce(
 )
     : reduce_kernel_{std::move(reduce_kernel)},
       finished_callback_{std::move(finished_callback)},
+      nranks_{comm->nranks()},
       gatherer_{
           std::move(comm), std::move(progress_thread), op_id, br, std::move(statistics)
       } {
@@ -37,9 +38,14 @@ AllReduce::AllReduce(
 
 AllReduce::~AllReduce() = default;
 
-void AllReduce::insert(std::uint64_t sequence_number, PackedData&& packed_data) {
-    nlocal_insertions_.fetch_add(1, std::memory_order_relaxed);
-    gatherer_.insert(sequence_number, std::move(packed_data));
+void AllReduce::insert(PackedData&& packed_data) {
+    RAPIDSMPF_EXPECTS(
+        !inserted_,
+        "AllReduce::insert can only be called once per instance",
+        std::runtime_error
+    );
+    inserted_ = true;
+    gatherer_.insert(0, std::move(packed_data));
 }
 
 void AllReduce::insert_finished() {
@@ -50,7 +56,7 @@ bool AllReduce::finished() const noexcept {
     return gatherer_.finished();
 }
 
-std::vector<PackedData> AllReduce::wait_and_extract(std::chrono::milliseconds timeout) {
+PackedData AllReduce::wait_and_extract(std::chrono::milliseconds timeout) {
     // Block until the underlying allgather completes, then perform the reduction locally
     // (exactly once).
     auto gathered =
@@ -62,48 +68,24 @@ bool AllReduce::is_ready() const noexcept {
     return gatherer_.finished();
 }
 
-std::vector<PackedData> AllReduce::reduce_all(std::vector<PackedData>&& gathered) {
+PackedData AllReduce::reduce_all(std::vector<PackedData>&& gathered) {
     auto const total = gathered.size();
 
-    if (total == 0) {
-        return {};
-    }
-
     RAPIDSMPF_EXPECTS(
-        total % nranks_ == 0,
-        "AllReduce expects each rank to contribute the same number of messages",
+        total == static_cast<std::size_t>(nranks_),
+        "AllReduce expects exactly one contribution from each rank",
         std::runtime_error
     );
 
-    auto const n_local =
-        static_cast<std::size_t>(nlocal_insertions_.load(std::memory_order_acquire));
-    auto const n_per_rank = total / nranks_;
-
-    // We allow non-uniform insertion counts across ranks but require that the local
-    // insertion count matches the per-rank contribution implied by the gather.
-    RAPIDSMPF_EXPECTS(
-        n_local == 0 || n_local == n_per_rank,
-        "AllReduce local insertion count does not match gathered contributions per rank",
-        std::runtime_error
-    );
+    // Start with rank 0's contribution as the accumulator.
+    auto accum = std::move(gathered[0]);
 
-    std::vector<PackedData> results;
-    results.reserve(n_per_rank);
-
-    // Conceptually, the k-th insertion on each rank participates in a single
-    // reduction. With ordered allgather results, entries are laid out as:
-    // [rank0:0..n_per_rank-1][rank1:0..n_per_rank-1]...[rankP-1:0..n_per_rank-1]
-    for (std::size_t k = 0; k < n_per_rank; ++k) {
-        // Start from rank 0's contribution for this logical insertion.
-        auto accum = std::move(gathered[k]);
-        for (std::size_t r = 1; r < nranks_; ++r) {
-            auto idx = r * n_per_rank + k;
-            reduce_kernel_(accum, std::move(gathered[idx]));
-        }
-        results.emplace_back(std::move(accum));
+    // Reduce contributions from all other ranks into the accumulator.
+    for (std::size_t r = 1; r < static_cast<std::size_t>(nranks_); ++r) {
+        reduce_kernel_(accum, std::move(gathered[r]));
     }
 
-    return results;
+    return accum;
 }
 
 } // namespace rapidsmpf::coll
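
For reviewers, a minimal usage sketch of the single-shot API after this change. It assumes an already-constructed `AllReduce allreduce` (construction arguments are not shown in this diff, so they are omitted); `make_local_input()` is a hypothetical stand-in for whatever produces the rank-local `PackedData`, and the 1000 ms timeout is illustrative:

    // Each rank contributes exactly one PackedData; a second insert() now throws.
    allreduce.insert(make_local_input());  // make_local_input() is hypothetical
    allreduce.insert_finished();

    // Blocks until the underlying allgather completes, then applies the
    // reduce kernel (signature: void(PackedData& accum, PackedData&& other))
    // over the nranks contributions and returns the single reduced result.
    PackedData result = allreduce.wait_and_extract(std::chrono::milliseconds{1000});

Since the reduction is performed identically on every rank after the allgather, all ranks end up with the same reduced value without a separate broadcast step.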