Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
221 changes: 109 additions & 112 deletions packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@
#include <KokkosBlas1_nrm1.hpp>
#include <KokkosBlas1_nrm2.hpp>

#include <std_algorithms/Kokkos_ExclusiveScan.hpp>

#include <memory>

#include "Ifpack2_BlockHelper.hpp"
Expand Down Expand Up @@ -1948,8 +1950,7 @@ void performSymbolicPhase(const Teuchos::RCP<const typename BlockHelperDetails::

using impl_type = BlockHelperDetails::ImplType<MatrixType>;

using execution_space = typename impl_type::execution_space;
using host_execution_space = typename impl_type::host_execution_space;
using execution_space = typename impl_type::execution_space;

using local_ordinal_type = typename impl_type::local_ordinal_type;
using global_ordinal_type = typename impl_type::global_ordinal_type;
Expand All @@ -1958,10 +1959,11 @@ void performSymbolicPhase(const Teuchos::RCP<const typename BlockHelperDetails::
using size_type_1d_view = typename impl_type::size_type_1d_view;
using vector_type_3d_view = typename impl_type::vector_type_3d_view;
using vector_type_4d_view = typename impl_type::vector_type_4d_view;
using internal_vector_type_3d_view = typename impl_type::internal_vector_type_3d_view;
using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
using internal_vector_type_3d_view = typename impl_type::internal_vector_type_3d_view;
using lo_traits = Tpetra::Details::OrdinalTraits<local_ordinal_type>;

constexpr int vector_length = impl_type::vector_length;
constexpr int internal_vector_length = impl_type::internal_vector_length;
Expand All @@ -1975,90 +1977,97 @@ void performSymbolicPhase(const Teuchos::RCP<const typename BlockHelperDetails::
TEUCHOS_ASSERT(hasBlockCrsMatrix || g->getLocalNumRows() != 0);
const local_ordinal_type blocksize = hasBlockCrsMatrix ? A->getBlockSize() : A->getLocalNumRows() / g->getLocalNumRows();

// mirroring to host
const auto partptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.partptr);
const auto lclrow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.lclrow);
const auto rowidx2part = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.rowidx2part);
const auto part2rowidx0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.part2rowidx0);
const auto packptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.packptr);
const auto partptr = interf.partptr;
const auto lclrow = interf.lclrow;
const auto rowidx2part = interf.rowidx2part;
const auto part2rowidx0 = interf.part2rowidx0;
const auto packptr = interf.packptr;

const local_ordinal_type nrows = partptr(partptr.extent(0) - 1);
// TODO: add nrows as a member of part interface?
const local_ordinal_type nrows = Kokkos::create_mirror_view_and_copy(
Kokkos::HostSpace(), Kokkos::subview(partptr, partptr.extent(0) - 1))();

Kokkos::View<local_ordinal_type *, host_execution_space> col2row("col2row", A->getLocalNumCols());
Kokkos::View<local_ordinal_type *, execution_space> col2row("col2row", A->getLocalNumCols());

// find column to row map on host

Kokkos::deep_copy(col2row, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
Kokkos::deep_copy(execution_space(), col2row, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
{
const auto rowmap = g->getRowMap();
const auto colmap = g->getColMap();
const auto dommap = g->getDomainMap();
TEUCHOS_ASSERT(!(rowmap.is_null() || colmap.is_null() || dommap.is_null()));
rowmap->lazyPushToHost();
colmap->lazyPushToHost();
dommap->lazyPushToHost();

#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__)
const Kokkos::RangePolicy<host_execution_space> policy(0, nrows);
TEUCHOS_ASSERT(!(g->getRowMap().is_null() || g->getColMap().is_null() || g->getDomainMap().is_null()));
#if defined(BLOCKTRIDICONTAINER_DEBUG)
{
// On host: check that row, col, domain maps are consistent
auto rowmapHost = g->getRowMap();
auto colmapHost = g->getColMap();
auto dommapHost = g->getDomainMap();
for (local_ordinal_type lr = 0; lr < nrows; lr++) {
const global_ordinal_type gid = rowmapHost->getGlobalElement(lr);
TEUCHOS_ASSERT(gid != Teuchos::OrdinalTraits<global_ordinal_type>::invalid());
if (dommapHost->isNodeGlobalElement(gid)) {
const local_ordinal_type lc = colmapHost->getLocalElement(gid);
TEUCHOS_TEST_FOR_EXCEPT_MSG(lc == Teuchos::OrdinalTraits<local_ordinal_type>::invalid(),
BlockHelperDetails::get_msg_prefix(comm) << "GID " << gid
<< " gives an invalid local column.");
}
}
}
#endif
auto rowmap = g->getRowMap()->getLocalMap();
auto colmap = g->getColMap()->getLocalMap();
auto dommap = g->getDomainMap()->getLocalMap();

const Kokkos::RangePolicy<execution_space> policy(0, nrows);
Kokkos::parallel_for(
"performSymbolicPhase::RangePolicy::col2row",
policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
const global_ordinal_type gid = rowmap->getGlobalElement(lr);
TEUCHOS_ASSERT(gid != Teuchos::OrdinalTraits<global_ordinal_type>::invalid());
if (dommap->isNodeGlobalElement(gid)) {
const local_ordinal_type lc = colmap->getLocalElement(gid);
#if defined(BLOCKTRIDICONTAINER_DEBUG)
TEUCHOS_TEST_FOR_EXCEPT_MSG(lc == Teuchos::OrdinalTraits<local_ordinal_type>::invalid(),
BlockHelperDetails::get_msg_prefix(comm) << "GID " << gid
<< " gives an invalid local column.");
#endif
col2row(lc) = lr;
const global_ordinal_type gid = rowmap.getGlobalElement(lr);
if (dommap.getLocalElement(gid) != lo_traits::invalid()) {
const local_ordinal_type lc = colmap.getLocalElement(gid);
col2row(lc) = lr;
}
});
#endif
}

// construct the D and R graphs in A = D + R.
{
const auto local_graph = g->getLocalGraphHost();
const auto local_graph = g->getLocalGraphDevice();
const auto local_graph_rowptr = local_graph.row_map;
TEUCHOS_ASSERT(local_graph_rowptr.size() == static_cast<size_t>(nrows + 1));
const auto local_graph_colidx = local_graph.entries;

// assume no overlap.

Kokkos::View<local_ordinal_type *, host_execution_space> lclrow2idx("lclrow2idx", nrows);
Kokkos::View<local_ordinal_type *, execution_space> lclrow2idx("lclrow2idx", nrows);
{
const Kokkos::RangePolicy<host_execution_space> policy(0, nrows);
const Kokkos::RangePolicy<execution_space> policy(0, nrows);
Kokkos::parallel_for(
"performSymbolicPhase::RangePolicy::lclrow2idx",
policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
lclrow2idx[lclrow(i)] = i;
lclrow2idx(lclrow(i)) = i;
});
}

// count (block) nnzs in D and R.
typedef BlockHelperDetails::SumReducer<size_type, 3, host_execution_space> sum_reducer_type;
typename sum_reducer_type::value_type sum_reducer_value;
size_type D_nnz, R_nnz_owned, R_nnz_remote;
{
const Kokkos::RangePolicy<host_execution_space> policy(0, nrows);
const Kokkos::RangePolicy<execution_space> policy(0, nrows);
Kokkos::parallel_reduce
// profiling interface does not work
( //"performSymbolicPhase::RangePolicy::count_nnz",
policy, KOKKOS_LAMBDA(const local_ordinal_type &lr, typename sum_reducer_type::value_type &update) {
policy, KOKKOS_LAMBDA(const local_ordinal_type &lr, size_type &update_D_nnz, size_type &update_R_nnz_owned, size_type &update_R_nnz_remote) {
// LID -> index.
const local_ordinal_type ri0 = lclrow2idx[lr];
const local_ordinal_type ri0 = lclrow2idx(lr);
const local_ordinal_type pi0 = rowidx2part(ri0);
for (size_type j = local_graph_rowptr(lr); j < local_graph_rowptr(lr + 1); ++j) {
const local_ordinal_type lc = local_graph_colidx(j);
const local_ordinal_type lc2r = col2row[lc];
const local_ordinal_type lc2r = col2row(lc);
bool incr_R = false;
do { // breakable
if (lc2r == (local_ordinal_type)-1) {
incr_R = true;
break;
}
const local_ordinal_type ri = lclrow2idx[lc2r];
const local_ordinal_type ri = lclrow2idx(lc2r);
const local_ordinal_type pi = rowidx2part(ri);
if (pi != pi0) {
incr_R = true;
Expand All @@ -2068,23 +2077,20 @@ void performSymbolicPhase(const Teuchos::RCP<const typename BlockHelperDetails::
// LID space, tridiag LIDs in a row are not necessarily related by
// {-1, 0, 1}.
if (ri0 + 1 >= ri && ri0 <= ri + 1)
++update.v[0]; // D_nnz
++update_D_nnz;
else
incr_R = true;
} while (0);
if (incr_R) {
if (lc < nrows)
++update.v[1]; // R_nnz_owned
++update_R_nnz_owned;
else
++update.v[2]; // R_nnz_remote
++update_R_nnz_remote;
}
}
},
sum_reducer_type(sum_reducer_value));
D_nnz, R_nnz_owned, R_nnz_remote);
}
size_type D_nnz = sum_reducer_value.v[0];
size_type R_nnz_owned = sum_reducer_value.v[1];
size_type R_nnz_remote = sum_reducer_value.v[2];

if (!overlap_communication_and_computation) {
R_nnz_owned += R_nnz_remote;
Expand All @@ -2093,10 +2099,10 @@ void performSymbolicPhase(const Teuchos::RCP<const typename BlockHelperDetails::

// construct the D_00 graph.
{
const auto flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
const auto flat_td_ptr = btdm.flat_td_ptr;

btdm.A_colindsub = local_ordinal_type_1d_view("btdm.A_colindsub", D_nnz);
const auto D_A_colindsub = Kokkos::create_mirror_view(btdm.A_colindsub);
const auto D_A_colindsub = btdm.A_colindsub;

#if defined(BLOCKTRIDICONTAINER_DEBUG)
Kokkos::deep_copy(D_A_colindsub, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
Expand All @@ -2105,9 +2111,9 @@ void performSymbolicPhase(const Teuchos::RCP<const typename BlockHelperDetails::
const local_ordinal_type nparts = partptr.extent(0) - 1;

{
const Kokkos::RangePolicy<host_execution_space> policy(0, nparts);
const Kokkos::RangePolicy<execution_space> policy(0, nparts);
Kokkos::parallel_for(
"performSymbolicPhase::RangePolicy<host_execution_space>::D_graph",
"performSymbolicPhase::RangePolicy<execution_space>::D_graph",
policy, KOKKOS_LAMBDA(const local_ordinal_type &pi0) {
const local_ordinal_type part_ri0 = part2rowidx0(pi0);
local_ordinal_type offset = 0;
Expand All @@ -2131,10 +2137,12 @@ void performSymbolicPhase(const Teuchos::RCP<const typename BlockHelperDetails::
});
}
#if defined(BLOCKTRIDICONTAINER_DEBUG)
for (size_t i = 0; i < D_A_colindsub.extent(0); ++i)
TEUCHOS_ASSERT(D_A_colindsub(i) != Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
{
auto D_A_colindsub_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), D_A_colindsub);
for (size_t i = 0; i < D_A_colindsub_host.extent(0); ++i)
TEUCHOS_ASSERT(D_A_colindsub_host(i) != Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
}
#endif
Kokkos::deep_copy(btdm.A_colindsub, D_A_colindsub);

// Allocate values.
{
Expand All @@ -2157,19 +2165,19 @@ void performSymbolicPhase(const Teuchos::RCP<const typename BlockHelperDetails::
amd.rowptr = size_type_1d_view("amd.rowptr", nrows + 1);
amd.A_colindsub = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub"), R_nnz_owned);

const auto R_rowptr = Kokkos::create_mirror_view(amd.rowptr);
const auto R_A_colindsub = Kokkos::create_mirror_view(amd.A_colindsub);
const auto R_rowptr = amd.rowptr;
const auto R_A_colindsub = amd.A_colindsub;

amd.rowptr_remote = size_type_1d_view("amd.rowptr_remote", overlap_communication_and_computation ? nrows + 1 : 0);
amd.A_colindsub_remote = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub_remote"), R_nnz_remote);

const auto R_rowptr_remote = Kokkos::create_mirror_view(amd.rowptr_remote);
const auto R_A_colindsub_remote = Kokkos::create_mirror_view(amd.A_colindsub_remote);
const auto R_rowptr_remote = amd.rowptr_remote;
const auto R_A_colindsub_remote = amd.A_colindsub_remote;

{
const Kokkos::RangePolicy<host_execution_space> policy(0, nrows);
const Kokkos::RangePolicy<execution_space> policy(0, nrows);
Kokkos::parallel_for(
"performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_count",
"performSymbolicPhase::RangePolicy<execution_space>::R_graph_count",
policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
const local_ordinal_type ri0 = lclrow2idx[lr];
const local_ordinal_type pi0 = rowidx2part(ri0);
Expand All @@ -2193,59 +2201,48 @@ void performSymbolicPhase(const Teuchos::RCP<const typename BlockHelperDetails::
}
});
}

// exclusive scan
typedef BlockHelperDetails::ArrayValueType<size_type, 2> update_type;
// Prefix-sums to finish computing R_rowptr and R_rowptr_remote
Kokkos::Experimental::exclusive_scan(execution_space(), R_rowptr, R_rowptr, size_type(0));
Kokkos::Experimental::exclusive_scan(execution_space(), R_rowptr_remote, R_rowptr_remote, size_type(0));
{
Kokkos::RangePolicy<host_execution_space> policy(0, nrows + 1);
Kokkos::parallel_scan(
"performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_fill",
policy, KOKKOS_LAMBDA(const local_ordinal_type &lr, update_type &update, const bool &final) {
update_type val;
val.v[0] = R_rowptr(lr);
if (overlap_communication_and_computation)
val.v[1] = R_rowptr_remote(lr);

if (final) {
R_rowptr(lr) = update.v[0];
if (overlap_communication_and_computation)
R_rowptr_remote(lr) = update.v[1];

if (lr < nrows) {
const local_ordinal_type ri0 = lclrow2idx[lr];
const local_ordinal_type pi0 = rowidx2part(ri0);

size_type cnt_rowptr = R_rowptr(lr);
size_type cnt_rowptr_remote = overlap_communication_and_computation ? R_rowptr_remote(lr) : 0; // when not overlap_communication_and_computation, this value is garbage

const size_type j0 = local_graph_rowptr(lr);
for (size_type j = j0; j < local_graph_rowptr(lr + 1); ++j) {
const local_ordinal_type lc = local_graph_colidx(j);
const local_ordinal_type lc2r = col2row[lc];
if (lc2r != (local_ordinal_type)-1) {
const local_ordinal_type ri = lclrow2idx[lc2r];
const local_ordinal_type pi = rowidx2part(ri);
if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1)
continue;
}
const local_ordinal_type row_entry = j - j0;
if (!overlap_communication_and_computation || lc < nrows)
R_A_colindsub(cnt_rowptr++) = row_entry;
else
R_A_colindsub_remote(cnt_rowptr_remote++) = row_entry;
}
// Fill R graph entries (R_A_colindsub and R_A_colindsub_remote)
Kokkos::RangePolicy<execution_space> policy(0, nrows);
Kokkos::parallel_for(
"performSymbolicPhase::RangePolicy<execution_space>::R_graph_fill",
policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
const local_ordinal_type ri0 = lclrow2idx[lr];
const local_ordinal_type pi0 = rowidx2part(ri0);

size_type cnt_rowptr = R_rowptr(lr);
size_type cnt_rowptr_remote = overlap_communication_and_computation ? R_rowptr_remote(lr) : 0; // when not overlap_communication_and_computation, this value is garbage

const size_type j0 = local_graph_rowptr(lr);
for (size_type j = j0; j < local_graph_rowptr(lr + 1); ++j) {
const local_ordinal_type lc = local_graph_colidx(j);
const local_ordinal_type lc2r = col2row[lc];
if (lc2r != (local_ordinal_type)-1) {
const local_ordinal_type ri = lclrow2idx[lc2r];
const local_ordinal_type pi = rowidx2part(ri);
if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1)
continue;
}
const local_ordinal_type row_entry = j - j0;
if (!overlap_communication_and_computation || lc < nrows)
R_A_colindsub(cnt_rowptr++) = row_entry;
else
R_A_colindsub_remote(cnt_rowptr_remote++) = row_entry;
}
update += val;
});
}
TEUCHOS_ASSERT(R_rowptr(nrows) == R_nnz_owned);
Kokkos::deep_copy(amd.rowptr, R_rowptr);
Kokkos::deep_copy(amd.A_colindsub, R_A_colindsub);
if (overlap_communication_and_computation) {
TEUCHOS_ASSERT(R_rowptr_remote(nrows) == R_nnz_remote);
Kokkos::deep_copy(amd.rowptr_remote, R_rowptr_remote);
Kokkos::deep_copy(amd.A_colindsub_remote, R_A_colindsub_remote);
{
// Check that the last elements of R_rowptr (aka amd.rowptr)
// and R_rowptr_remote (aka amd.rowptr_remote) match the expected entry counts
auto r_rowptr_end = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Kokkos::subview(R_rowptr, nrows));
TEUCHOS_ASSERT(r_rowptr_end() == R_nnz_owned);
if (overlap_communication_and_computation) {
auto r_rowptr_remote_end = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Kokkos::subview(R_rowptr_remote, nrows));
TEUCHOS_ASSERT(r_rowptr_remote_end() == R_nnz_remote);
}
}

// Allocate or view values.
Expand Down
5 changes: 0 additions & 5 deletions packages/tpetra/core/src/Tpetra_Map_decl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1092,14 +1092,9 @@ class Map : public Teuchos::Describable {
const global_ordinal_type indexBase,
const Teuchos::RCP<const Teuchos::Comm<int>>& comm);

public:
/// \brief Push the device data to host, if needed
///
/// \warning lazyPushToHost is SUBJECT TO CHANGE and is for EXPERT USERS ONLY.
/// We STRONGLY advise against its use.
void lazyPushToHost() const;

private:
//! The communicator over which this Map is distributed.
Teuchos::RCP<const Teuchos::Comm<int>> comm_;

Expand Down
Loading