From f149e25f47f79cd5d9a0b4e908b3b55c4f014984 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Sun, 26 Oct 2025 15:25:34 -0600 Subject: [PATCH] Ifpack2/Tpetra: fix #14529 and re-do of #14337 Work around parallel_scan issue; move BTD symbolic phase to device again. And make Tpetra::Map::lazyPushToHost private again. Signed-off-by: Brian Kelley --- .../src/Ifpack2_BlockTriDiContainer_impl.hpp | 221 +++++++++--------- packages/tpetra/core/src/Tpetra_Map_decl.hpp | 5 - 2 files changed, 109 insertions(+), 117 deletions(-) diff --git a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp index 46485b5fd461..8d13e900173d 100644 --- a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp +++ b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp @@ -50,6 +50,8 @@ #include #include +#include + #include #include "Ifpack2_BlockHelper.hpp" @@ -1948,8 +1950,7 @@ void performSymbolicPhase(const Teuchos::RCP; - using execution_space = typename impl_type::execution_space; - using host_execution_space = typename impl_type::host_execution_space; + using execution_space = typename impl_type::execution_space; using local_ordinal_type = typename impl_type::local_ordinal_type; using global_ordinal_type = typename impl_type::global_ordinal_type; @@ -1958,10 +1959,11 @@ void performSymbolicPhase(const Teuchos::RCP; constexpr int vector_length = impl_type::vector_length; constexpr int internal_vector_length = impl_type::internal_vector_length; @@ -1975,90 +1977,97 @@ void performSymbolicPhase(const Teuchos::RCPgetLocalNumRows() != 0); const local_ordinal_type blocksize = hasBlockCrsMatrix ? A->getBlockSize() : A->getLocalNumRows() / g->getLocalNumRows(); - // mirroring to host - const auto partptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.partptr); - const auto lclrow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.lclrow); - const auto rowidx2part = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.rowidx2part); - const auto part2rowidx0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.part2rowidx0); - const auto packptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.packptr); + const auto partptr = interf.partptr; + const auto lclrow = interf.lclrow; + const auto rowidx2part = interf.rowidx2part; + const auto part2rowidx0 = interf.part2rowidx0; + const auto packptr = interf.packptr; - const local_ordinal_type nrows = partptr(partptr.extent(0) - 1); + // TODO: add nrows as a member of part interface? + const local_ordinal_type nrows = Kokkos::create_mirror_view_and_copy( + Kokkos::HostSpace(), Kokkos::subview(partptr, partptr.extent(0) - 1))(); - Kokkos::View col2row("col2row", A->getLocalNumCols()); + Kokkos::View col2row("col2row", A->getLocalNumCols()); // find column to row map on host - Kokkos::deep_copy(col2row, Teuchos::OrdinalTraits::invalid()); + Kokkos::deep_copy(execution_space(), col2row, Teuchos::OrdinalTraits::invalid()); { - const auto rowmap = g->getRowMap(); - const auto colmap = g->getColMap(); - const auto dommap = g->getDomainMap(); - TEUCHOS_ASSERT(!(rowmap.is_null() || colmap.is_null() || dommap.is_null())); - rowmap->lazyPushToHost(); - colmap->lazyPushToHost(); - dommap->lazyPushToHost(); - -#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__) - const Kokkos::RangePolicy policy(0, nrows); + TEUCHOS_ASSERT(!(g->getRowMap().is_null() || g->getColMap().is_null() || g->getDomainMap().is_null())); +#if defined(BLOCKTRIDICONTAINER_DEBUG) + { + // On host: check that row, col, domain maps are consistent + auto rowmapHost = g->getRowMap(); + auto colmapHost = g->getColMap(); + auto dommapHost = g->getDomainMap(); + for (local_ordinal_type lr = 0; lr < nrows; lr++) { + const global_ordinal_type gid = rowmapHost->getGlobalElement(lr); + TEUCHOS_ASSERT(gid != Teuchos::OrdinalTraits::invalid()); + if (dommapHost->isNodeGlobalElement(gid)) { + const local_ordinal_type lc = colmapHost->getLocalElement(gid); + TEUCHOS_TEST_FOR_EXCEPT_MSG(lc == Teuchos::OrdinalTraits::invalid(), + BlockHelperDetails::get_msg_prefix(comm) << "GID " << gid + << " gives an invalid local column."); + } + } + } +#endif + auto rowmap = g->getRowMap()->getLocalMap(); + auto colmap = g->getColMap()->getLocalMap(); + auto dommap = g->getDomainMap()->getLocalMap(); + + const Kokkos::RangePolicy policy(0, nrows); Kokkos::parallel_for( "performSymbolicPhase::RangePolicy::col2row", policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) { - const global_ordinal_type gid = rowmap->getGlobalElement(lr); - TEUCHOS_ASSERT(gid != Teuchos::OrdinalTraits::invalid()); - if (dommap->isNodeGlobalElement(gid)) { - const local_ordinal_type lc = colmap->getLocalElement(gid); -#if defined(BLOCKTRIDICONTAINER_DEBUG) - TEUCHOS_TEST_FOR_EXCEPT_MSG(lc == Teuchos::OrdinalTraits::invalid(), - BlockHelperDetails::get_msg_prefix(comm) << "GID " << gid - << " gives an invalid local column."); -#endif - col2row(lc) = lr; + const global_ordinal_type gid = rowmap.getGlobalElement(lr); + if (dommap.getLocalElement(gid) != lo_traits::invalid()) { + const local_ordinal_type lc = colmap.getLocalElement(gid); + col2row(lc) = lr; } }); -#endif } // construct the D and R graphs in A = D + R. { - const auto local_graph = g->getLocalGraphHost(); + const auto local_graph = g->getLocalGraphDevice(); const auto local_graph_rowptr = local_graph.row_map; TEUCHOS_ASSERT(local_graph_rowptr.size() == static_cast(nrows + 1)); const auto local_graph_colidx = local_graph.entries; // assume no overlap. - Kokkos::View lclrow2idx("lclrow2idx", nrows); + Kokkos::View lclrow2idx("lclrow2idx", nrows); { - const Kokkos::RangePolicy policy(0, nrows); + const Kokkos::RangePolicy policy(0, nrows); Kokkos::parallel_for( "performSymbolicPhase::RangePolicy::lclrow2idx", policy, KOKKOS_LAMBDA(const local_ordinal_type &i) { - lclrow2idx[lclrow(i)] = i; + lclrow2idx(lclrow(i)) = i; }); } // count (block) nnzs in D and R. - typedef BlockHelperDetails::SumReducer sum_reducer_type; - typename sum_reducer_type::value_type sum_reducer_value; + size_type D_nnz, R_nnz_owned, R_nnz_remote; { - const Kokkos::RangePolicy policy(0, nrows); + const Kokkos::RangePolicy policy(0, nrows); Kokkos::parallel_reduce // profiling interface does not work ( //"performSymbolicPhase::RangePolicy::count_nnz", - policy, KOKKOS_LAMBDA(const local_ordinal_type &lr, typename sum_reducer_type::value_type &update) { + policy, KOKKOS_LAMBDA(const local_ordinal_type &lr, size_type &update_D_nnz, size_type &update_R_nnz_owned, size_type &update_R_nnz_remote) { // LID -> index. - const local_ordinal_type ri0 = lclrow2idx[lr]; + const local_ordinal_type ri0 = lclrow2idx(lr); const local_ordinal_type pi0 = rowidx2part(ri0); for (size_type j = local_graph_rowptr(lr); j < local_graph_rowptr(lr + 1); ++j) { const local_ordinal_type lc = local_graph_colidx(j); - const local_ordinal_type lc2r = col2row[lc]; + const local_ordinal_type lc2r = col2row(lc); bool incr_R = false; do { // breakable if (lc2r == (local_ordinal_type)-1) { incr_R = true; break; } - const local_ordinal_type ri = lclrow2idx[lc2r]; + const local_ordinal_type ri = lclrow2idx(lc2r); const local_ordinal_type pi = rowidx2part(ri); if (pi != pi0) { incr_R = true; @@ -2068,23 +2077,20 @@ void performSymbolicPhase(const Teuchos::RCP= ri && ri0 <= ri + 1) - ++update.v[0]; // D_nnz + ++update_D_nnz; else incr_R = true; } while (0); if (incr_R) { if (lc < nrows) - ++update.v[1]; // R_nnz_owned + ++update_R_nnz_owned; else - ++update.v[2]; // R_nnz_remote + ++update_R_nnz_remote; } } }, - sum_reducer_type(sum_reducer_value)); + D_nnz, R_nnz_owned, R_nnz_remote); } - size_type D_nnz = sum_reducer_value.v[0]; - size_type R_nnz_owned = sum_reducer_value.v[1]; - size_type R_nnz_remote = sum_reducer_value.v[2]; if (!overlap_communication_and_computation) { R_nnz_owned += R_nnz_remote; @@ -2093,10 +2099,10 @@ void performSymbolicPhase(const Teuchos::RCP::invalid()); @@ -2105,9 +2111,9 @@ void performSymbolicPhase(const Teuchos::RCP policy(0, nparts); + const Kokkos::RangePolicy policy(0, nparts); Kokkos::parallel_for( - "performSymbolicPhase::RangePolicy::D_graph", + "performSymbolicPhase::RangePolicy::D_graph", policy, KOKKOS_LAMBDA(const local_ordinal_type &pi0) { const local_ordinal_type part_ri0 = part2rowidx0(pi0); local_ordinal_type offset = 0; @@ -2131,10 +2137,12 @@ void performSymbolicPhase(const Teuchos::RCP::invalid()); + { + auto D_A_colindsub_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), D_A_colindsub); + for (size_t i = 0; i < D_A_colindsub_host.extent(0); ++i) + TEUCHOS_ASSERT(D_A_colindsub_host(i) != Teuchos::OrdinalTraits::invalid()); + } #endif - Kokkos::deep_copy(btdm.A_colindsub, D_A_colindsub); // Allocate values. { @@ -2157,19 +2165,19 @@ void performSymbolicPhase(const Teuchos::RCP policy(0, nrows); + const Kokkos::RangePolicy policy(0, nrows); Kokkos::parallel_for( - "performSymbolicPhase::RangePolicy::R_graph_count", + "performSymbolicPhase::RangePolicy::R_graph_count", policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) { const local_ordinal_type ri0 = lclrow2idx[lr]; const local_ordinal_type pi0 = rowidx2part(ri0); @@ -2193,59 +2201,48 @@ void performSymbolicPhase(const Teuchos::RCP update_type; + // Prefix-sums to finish computing R_rowptr and R_rowptr_remote + Kokkos::Experimental::exclusive_scan(execution_space(), R_rowptr, R_rowptr, size_type(0)); + Kokkos::Experimental::exclusive_scan(execution_space(), R_rowptr_remote, R_rowptr_remote, size_type(0)); { - Kokkos::RangePolicy policy(0, nrows + 1); - Kokkos::parallel_scan( - "performSymbolicPhase::RangePolicy::R_graph_fill", - policy, KOKKOS_LAMBDA(const local_ordinal_type &lr, update_type &update, const bool &final) { - update_type val; - val.v[0] = R_rowptr(lr); - if (overlap_communication_and_computation) - val.v[1] = R_rowptr_remote(lr); - - if (final) { - R_rowptr(lr) = update.v[0]; - if (overlap_communication_and_computation) - R_rowptr_remote(lr) = update.v[1]; - - if (lr < nrows) { - const local_ordinal_type ri0 = lclrow2idx[lr]; - const local_ordinal_type pi0 = rowidx2part(ri0); - - size_type cnt_rowptr = R_rowptr(lr); - size_type cnt_rowptr_remote = overlap_communication_and_computation ? R_rowptr_remote(lr) : 0; // when not overlap_communication_and_computation, this value is garbage - - const size_type j0 = local_graph_rowptr(lr); - for (size_type j = j0; j < local_graph_rowptr(lr + 1); ++j) { - const local_ordinal_type lc = local_graph_colidx(j); - const local_ordinal_type lc2r = col2row[lc]; - if (lc2r != (local_ordinal_type)-1) { - const local_ordinal_type ri = lclrow2idx[lc2r]; - const local_ordinal_type pi = rowidx2part(ri); - if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1) - continue; - } - const local_ordinal_type row_entry = j - j0; - if (!overlap_communication_and_computation || lc < nrows) - R_A_colindsub(cnt_rowptr++) = row_entry; - else - R_A_colindsub_remote(cnt_rowptr_remote++) = row_entry; - } + // Fill R graph entries (R_A_colindsub and R_A_colindsub_remote) + Kokkos::RangePolicy policy(0, nrows); + Kokkos::parallel_for( + "performSymbolicPhase::RangePolicy::R_graph_fill", + policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) { + const local_ordinal_type ri0 = lclrow2idx[lr]; + const local_ordinal_type pi0 = rowidx2part(ri0); + + size_type cnt_rowptr = R_rowptr(lr); + size_type cnt_rowptr_remote = overlap_communication_and_computation ? R_rowptr_remote(lr) : 0; // when not overlap_communication_and_computation, this value is garbage + + const size_type j0 = local_graph_rowptr(lr); + for (size_type j = j0; j < local_graph_rowptr(lr + 1); ++j) { + const local_ordinal_type lc = local_graph_colidx(j); + const local_ordinal_type lc2r = col2row[lc]; + if (lc2r != (local_ordinal_type)-1) { + const local_ordinal_type ri = lclrow2idx[lc2r]; + const local_ordinal_type pi = rowidx2part(ri); + if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1) + continue; } + const local_ordinal_type row_entry = j - j0; + if (!overlap_communication_and_computation || lc < nrows) + R_A_colindsub(cnt_rowptr++) = row_entry; + else + R_A_colindsub_remote(cnt_rowptr_remote++) = row_entry; } - update += val; }); } - TEUCHOS_ASSERT(R_rowptr(nrows) == R_nnz_owned); - Kokkos::deep_copy(amd.rowptr, R_rowptr); - Kokkos::deep_copy(amd.A_colindsub, R_A_colindsub); - if (overlap_communication_and_computation) { - TEUCHOS_ASSERT(R_rowptr_remote(nrows) == R_nnz_remote); - Kokkos::deep_copy(amd.rowptr_remote, R_rowptr_remote); - Kokkos::deep_copy(amd.A_colindsub_remote, R_A_colindsub_remote); + { + // Check that the last elements of R_rowptr (aka amd.rowptr) + // and R_rowptr_remote (aka amd.rowptr_remote) match the expected entry counts + auto r_rowptr_end = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Kokkos::subview(R_rowptr, nrows)); + TEUCHOS_ASSERT(r_rowptr_end() == R_nnz_owned); + if (overlap_communication_and_computation) { + auto r_rowptr_remote_end = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Kokkos::subview(R_rowptr_remote, nrows)); + TEUCHOS_ASSERT(r_rowptr_remote_end() == R_nnz_remote); + } } // Allocate or view values. diff --git a/packages/tpetra/core/src/Tpetra_Map_decl.hpp b/packages/tpetra/core/src/Tpetra_Map_decl.hpp index b93ab7131e15..e0cd1ad35d34 100644 --- a/packages/tpetra/core/src/Tpetra_Map_decl.hpp +++ b/packages/tpetra/core/src/Tpetra_Map_decl.hpp @@ -1092,14 +1092,9 @@ class Map : public Teuchos::Describable { const global_ordinal_type indexBase, const Teuchos::RCP>& comm); - public: /// \brief Push the device data to host, if needed - /// - /// \warning lazyPushToHost is SUBJECT TO CHANGE and is for EXPERT USERS ONLY. - /// We STRONGLY advise against its use. void lazyPushToHost() const; - private: //! The communicator over which this Map is distributed. Teuchos::RCP> comm_;