Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ifpack2: add option for RCM reordering in streamed KK SPILUK and KK SPTRSV in RILUK #12798

Merged
4 changes: 4 additions & 0 deletions packages/ifpack2/doc/UsersGuide/options.tex
Original file line number Diff line number Diff line change
Expand Up @@ -502,6 +502,10 @@ \subsection{ILU($k$)}\label{s:ILU}
these streams can run concurrently, the total time can be faster. When
this option is not set (i.e. not using stream), the entire sub-domain is
used instead.}
\ccc{fact: kspiluk reordering in streams}
{bool}
{\false}
{Whether RCM reordering is applied to diagonal blocks in streams.}
% All overlap-related code was removed by M. Hoemmen in
%
% commit 162f64572fbf93e2cac73e3034d76a3db918a494
Expand Down
2 changes: 2 additions & 0 deletions packages/ifpack2/src/Ifpack2_RILUK_decl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -647,6 +647,8 @@ class RILUK:
bool isKokkosKernelsStream_;
int num_streams_;
std::vector<execution_space> exec_space_instances_;
bool hasStreamReordered_;
std::vector<typename lno_nonzero_view_t::non_const_type> perm_v_;
};

// NOTE (mfh 11 Feb 2015) This used to exist in order to deal with
Expand Down
194 changes: 131 additions & 63 deletions packages/ifpack2/src/Ifpack2_RILUK_def.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,8 @@ RILUK<MatrixType>::RILUK (const Teuchos::RCP<const row_matrix_type>& Matrix_in)
Rthresh_ (Teuchos::ScalarTraits<magnitude_type>::one ()),
isKokkosKernelsSpiluk_(false),
isKokkosKernelsStream_(false),
num_streams_(0)
num_streams_(0),
hasStreamReordered_(false)
{
allocateSolvers();
}
Expand All @@ -116,7 +117,8 @@ RILUK<MatrixType>::RILUK (const Teuchos::RCP<const crs_matrix_type>& Matrix_in)
Rthresh_ (Teuchos::ScalarTraits<magnitude_type>::one ()),
isKokkosKernelsSpiluk_(false),
isKokkosKernelsStream_(false),
num_streams_(0)
num_streams_(0),
hasStreamReordered_(false)
{
allocateSolvers();
}
Expand Down Expand Up @@ -412,7 +414,7 @@ setParameters (const Teuchos::ParameterList& params)
getParamTryingTypes<int, int, global_ordinal_type>
(nstreams, params, paramName, prefix);
}

// Forward to trisolvers.
L_solver_->setParameters(params);
U_solver_->setParameters(params);
Expand All @@ -427,6 +429,9 @@ setParameters (const Teuchos::ParameterList& params)

if (num_streams_ >= 1) {
this->isKokkosKernelsStream_ = true;
// Will we do reordering in streams?
if (params.isParameter("fact: kspiluk reordering in streams"))
hasStreamReordered_ = params.get<bool> ("fact: kspiluk reordering in streams");
}
else {
this->isKokkosKernelsStream_ = false;
Expand Down Expand Up @@ -524,7 +529,7 @@ void RILUK<MatrixType>::initialize ()
"matrix until the matrix is fill complete. If your matrix is a "
"Tpetra::CrsMatrix, please call fillComplete on it (with the domain and "
"range Maps, if appropriate) before calling this method.");

Teuchos::Time timer ("RILUK::initialize");
double startTime = timer.wallTime();
{ // Start timing
Expand Down Expand Up @@ -592,8 +597,10 @@ void RILUK<MatrixType>::initialize ()
}
else {
auto lclMtx = A_local_crs->getLocalMatrixDevice();
KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks);

if (!hasStreamReordered_)
KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks);
else
perm_v_ = KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks, true);
for(int i = 0; i < num_streams_; i++) {
Teuchos::RCP<const crs_map_type> A_local_diagblks_RowMap = rcp (new crs_map_type(A_local_diagblks[i].numRows(),
A_local_diagblks[i].numRows(),
Expand Down Expand Up @@ -654,6 +661,7 @@ void RILUK<MatrixType>::initialize ()
#if !defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) || !defined(KOKKOS_ENABLE_CUDA) || (CUDA_VERSION < 11030)
L_solver_->compute ();//NOTE: It makes sense to do compute here because only the nonzero pattern is involved in trisolve compute
#endif

if (!isKokkosKernelsStream_) {
U_solver_->setMatrix (U_);
}
Expand Down Expand Up @@ -1050,7 +1058,11 @@ void RILUK<MatrixType>::compute ()
A_local_values_ = lclMtx.values;
}
else {
KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks);
if (!hasStreamReordered_)
KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks);
else
perm_v_ = KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks, true);

A_local_diagblks_rowmap_v_ = std::vector<lno_row_view_t>(num_streams_);
A_local_diagblks_entries_v_ = std::vector<lno_nonzero_view_t>(num_streams_);
A_local_diagblks_values_v_ = std::vector<scalar_nonzero_view_t>(num_streams_);
Expand Down Expand Up @@ -1198,77 +1210,133 @@ apply (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_t

const scalar_type one = STS::one ();
const scalar_type zero = STS::zero ();

Teuchos::Time timer ("RILUK::apply");
double startTime = timer.wallTime();
{ // Start timing
Teuchos::TimeMonitor timeMon (timer);
if (alpha == one && beta == zero) {
if (mode == Teuchos::NO_TRANS) { // Solve L (D (U Y)) = X for Y.
#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined(KOKKOS_ENABLE_CUDA) && (CUDA_VERSION >= 11030)
//NOTE (Nov-15-2022):
//This is a workaround for Cuda >= 11.3 (using cusparseSpSV)
//since cusparseSpSV_solve() does not support in-place computation
MV Y_tmp (Y.getMap (), Y.getNumVectors ());

// Start by solving L Y_tmp = X for Y_tmp.
L_solver_->apply (X, Y_tmp, mode);

if (!this->isKokkosKernelsSpiluk_) {
// Solve D Y = Y. The operation lets us do this in place in Y, so we can
// write "solve D Y = Y for Y."
Y_tmp.elementWiseMultiply (one, *D_, Y_tmp, zero);
if (isKokkosKernelsSpiluk_ && isKokkosKernelsStream_ && hasStreamReordered_) {
MV ReorderedX (X.getMap(), X.getNumVectors());
MV ReorderedY (Y.getMap(), Y.getNumVectors());
for (size_t j = 0; j < X.getNumVectors(); j++) {
auto X_j = X.getVector(j);
auto ReorderedX_j = ReorderedX.getVectorNonConst(j);
auto X_lcl = X_j->getLocalViewDevice(Tpetra::Access::ReadOnly);
auto ReorderedX_lcl = ReorderedX_j->getLocalViewDevice(Tpetra::Access::ReadWrite);
local_ordinal_type stream_begin = 0;
local_ordinal_type stream_end;
for(int i = 0; i < num_streams_; i++) {
auto perm_i = perm_v_[i];
stream_end = stream_begin + perm_i.extent(0);
auto X_lcl_sub = Kokkos::subview (X_lcl, Kokkos::make_pair(stream_begin, stream_end), 0);
auto ReorderedX_lcl_sub = Kokkos::subview (ReorderedX_lcl, Kokkos::make_pair(stream_begin, stream_end), 0);
Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, static_cast<int>(perm_i.extent(0))), KOKKOS_LAMBDA ( const int& ii ) {
ReorderedX_lcl_sub(perm_i(ii)) = X_lcl_sub(ii);
});
stream_begin = stream_end;
}
}

U_solver_->apply (Y_tmp, Y, mode); // Solve U Y = Y_tmp.
#else
// Start by solving L Y = X for Y.
L_solver_->apply (X, Y, mode);

if (!this->isKokkosKernelsSpiluk_) {
// Solve D Y = Y. The operation lets us do this in place in Y, so we can
// write "solve D Y = Y for Y."
Y.elementWiseMultiply (one, *D_, Y, zero);
Kokkos::fence(); // Make sure X is completely reordered
if (mode == Teuchos::NO_TRANS) { // Solve L (U Y) = X for Y.
// Solve L Y = X for Y.
L_solver_->apply (ReorderedX, Y, mode);
// Solve U Y = Y for Y.
U_solver_->apply (Y, ReorderedY, mode);
}
else { // Solve U^P (L^P Y) = X for Y (where P is * or T).
// Solve U^P Y = X for Y.
U_solver_->apply (ReorderedX, Y, mode);
// Solve L^P Y = Y for Y.
L_solver_->apply (Y, ReorderedY, mode);
}

U_solver_->apply (Y, Y, mode); // Solve U Y = Y.
#endif
for (size_t j = 0; j < Y.getNumVectors(); j++) {
auto Y_j = Y.getVectorNonConst(j);
auto ReorderedY_j = ReorderedY.getVector(j);
auto Y_lcl = Y_j->getLocalViewDevice(Tpetra::Access::ReadWrite);
auto ReorderedY_lcl = ReorderedY_j->getLocalViewDevice(Tpetra::Access::ReadOnly);
local_ordinal_type stream_begin = 0;
local_ordinal_type stream_end;
for(int i = 0; i < num_streams_; i++) {
auto perm_i = perm_v_[i];
stream_end = stream_begin + perm_i.extent(0);
auto Y_lcl_sub = Kokkos::subview (Y_lcl, Kokkos::make_pair(stream_begin, stream_end), 0);
auto ReorderedY_lcl_sub = Kokkos::subview (ReorderedY_lcl, Kokkos::make_pair(stream_begin, stream_end), 0);
Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, static_cast<int>(perm_i.extent(0))), KOKKOS_LAMBDA ( const int& ii ) {
Y_lcl_sub(ii) = ReorderedY_lcl_sub(perm_i(ii));
});
stream_begin = stream_end;
}
}
}
else { // Solve U^P (D^P (L^P Y)) = X for Y (where P is * or T).
else {
if (mode == Teuchos::NO_TRANS) { // Solve L (D (U Y)) = X for Y.
#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined(KOKKOS_ENABLE_CUDA) && (CUDA_VERSION >= 11030)
//NOTE (Nov-15-2022):
//This is a workaround for Cuda >= 11.3 (using cusparseSpSV)
//since cusparseSpSV_solve() does not support in-place computation
MV Y_tmp (Y.getMap (), Y.getNumVectors ());
//NOTE (Nov-15-2022):
brian-kelley marked this conversation as resolved.
Show resolved Hide resolved
//This is a workaround for Cuda >= 11.3 (using cusparseSpSV)
//since cusparseSpSV_solve() does not support in-place computation
MV Y_tmp (Y.getMap (), Y.getNumVectors ());

// Start by solving L Y_tmp = X for Y_tmp.
L_solver_->apply (X, Y_tmp, mode);

if (!this->isKokkosKernelsSpiluk_) {
// Solve D Y = Y. The operation lets us do this in place in Y, so we can
// write "solve D Y = Y for Y."
Y_tmp.elementWiseMultiply (one, *D_, Y_tmp, zero);
}

// Start by solving U^P Y_tmp = X for Y_tmp.
U_solver_->apply (X, Y_tmp, mode);
U_solver_->apply (Y_tmp, Y, mode); // Solve U Y = Y_tmp.
#else
// Start by solving L Y = X for Y.
L_solver_->apply (X, Y, mode);

if (!this->isKokkosKernelsSpiluk_) {
// Solve D^P Y = Y.
//
// FIXME (mfh 24 Jan 2014) If mode = Teuchos::CONJ_TRANS, we
// need to do an elementwise multiply with the conjugate of
// D_, not just with D_ itself.
Y_tmp.elementWiseMultiply (one, *D_, Y_tmp, zero);
}
if (!this->isKokkosKernelsSpiluk_) {
// Solve D Y = Y. The operation lets us do this in place in Y, so we can
// write "solve D Y = Y for Y."
Y.elementWiseMultiply (one, *D_, Y, zero);
}

L_solver_->apply (Y_tmp, Y, mode); // Solve L^P Y = Y_tmp.
U_solver_->apply (Y, Y, mode); // Solve U Y = Y.
#endif
}
else { // Solve U^P (D^P (L^P Y)) = X for Y (where P is * or T).
#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined(KOKKOS_ENABLE_CUDA) && (CUDA_VERSION >= 11030)
//NOTE (Nov-15-2022):
brian-kelley marked this conversation as resolved.
Show resolved Hide resolved
//This is a workaround for Cuda >= 11.3 (using cusparseSpSV)
//since cusparseSpSV_solve() does not support in-place computation
MV Y_tmp (Y.getMap (), Y.getNumVectors ());

// Start by solving U^P Y_tmp = X for Y_tmp.
U_solver_->apply (X, Y_tmp, mode);

if (!this->isKokkosKernelsSpiluk_) {
// Solve D^P Y = Y.
//
// FIXME (mfh 24 Jan 2014) If mode = Teuchos::CONJ_TRANS, we
// need to do an elementwise multiply with the conjugate of
// D_, not just with D_ itself.
Y_tmp.elementWiseMultiply (one, *D_, Y_tmp, zero);
}

L_solver_->apply (Y_tmp, Y, mode); // Solve L^P Y = Y_tmp.
#else
// Start by solving U^P Y = X for Y.
U_solver_->apply (X, Y, mode);

if (!this->isKokkosKernelsSpiluk_) {
// Solve D^P Y = Y.
//
// FIXME (mfh 24 Jan 2014) If mode = Teuchos::CONJ_TRANS, we
// need to do an elementwise multiply with the conjugate of
// D_, not just with D_ itself.
Y.elementWiseMultiply (one, *D_, Y, zero);
}

L_solver_->apply (Y, Y, mode); // Solve L^P Y = Y.
// Start by solving U^P Y = X for Y.
U_solver_->apply (X, Y, mode);

if (!this->isKokkosKernelsSpiluk_) {
// Solve D^P Y = Y.
//
// FIXME (mfh 24 Jan 2014) If mode = Teuchos::CONJ_TRANS, we
// need to do an elementwise multiply with the conjugate of
// D_, not just with D_ itself.
Y.elementWiseMultiply (one, *D_, Y, zero);
}

L_solver_->apply (Y, Y, mode); // Solve L^P Y = Y.
#endif
}
}
}
else { // alpha != 1 or beta != 0
Expand Down
39 changes: 39 additions & 0 deletions packages/ifpack2/test/belos/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -72,11 +72,15 @@ TRIBITS_COPY_FILES_TO_BINARY_DIR(Ifpack2BelosCopyFiles
test_2_RILUK_HTS_nos1_hb.xml
test_2_RILUK_2streams_nos1_hb.xml
test_2_RILUK_4streams_nos1_hb.xml
test_2_RILUK_2streams_rcm_nos1_hb.xml
test_2_RILUK_4streams_rcm_nos1_hb.xml
test_4_ILUT_nos1_hb.xml
test_4_RILUK_nos1_hb.xml
test_4_RILUK_HTS_nos1_hb.xml
test_4_RILUK_2streams_nos1_hb.xml
test_4_RILUK_4streams_nos1_hb.xml
test_4_RILUK_2streams_rcm_nos1_hb.xml
test_4_RILUK_4streams_rcm_nos1_hb.xml
test_SGS_calore1_mm.xml
test_MTSGS_calore1_mm.xml
calore1.mtx
Expand Down Expand Up @@ -403,6 +407,41 @@ IF(Kokkos_ENABLE_CUDA)
NUM_MPI_PROCS 4
STANDARD_PASS_OUTPUT
)
TRIBITS_ADD_TEST(
tif_belos
NAME RILUK_2streams_rcm_hb_belos
ARGS "--xml_file=test_2_RILUK_2streams_rcm_nos1_hb.xml"
COMM serial mpi
NUM_MPI_PROCS 2
STANDARD_PASS_OUTPUT
)

TRIBITS_ADD_TEST(
tif_belos
NAME RILUK_4streams_rcm_hb_belos
ARGS "--xml_file=test_2_RILUK_4streams_rcm_nos1_hb.xml"
COMM serial mpi
NUM_MPI_PROCS 2
STANDARD_PASS_OUTPUT
)

TRIBITS_ADD_TEST(
tif_belos
NAME RILUK_2streams_rcm_hb_belos
ARGS "--xml_file=test_4_RILUK_2streams_rcm_nos1_hb.xml"
COMM serial mpi
NUM_MPI_PROCS 4
STANDARD_PASS_OUTPUT
)

TRIBITS_ADD_TEST(
tif_belos
NAME RILUK_4streams_rcm_hb_belos
ARGS "--xml_file=test_4_RILUK_4streams_rcm_nos1_hb.xml"
COMM serial mpi
NUM_MPI_PROCS 4
STANDARD_PASS_OUTPUT
)
ENDIF()
ENDIF()

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<ParameterList name="test_params">
<Parameter name="hb_file" type="string" value="nos1.rsa"/>

<Parameter name="solver_type" type="string" value="Block Gmres"/>
<ParameterList name="Belos">
<Parameter name="Num Blocks" type="int" value="300"/>
<Parameter name="Verbosity" type="int" value="33"/>
<Parameter name="Output Style" type="int" value="1"/>
<Parameter name="Output Frequency" type="int" value="1"/>
</ParameterList>

<Parameter name="Ifpack2::Preconditioner" type="string" value="RILUK"/>
<ParameterList name="Ifpack2">
<Parameter name="fact: iluk level-of-fill" type="int" value="2"/>
<Parameter name="fact: type" type="string" value="KSPILUK"/>
<Parameter name="fact: kspiluk number-of-streams" type="int" value="2"/>
<Parameter name="fact: kspiluk reordering in streams" type="bool" value="true"/>
<Parameter name="trisolver: type" type="string" value="KSPTRSV"/>
</ParameterList>

<Parameter name="expectNumIters" type="int" value="50"/>
</ParameterList>
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<ParameterList name="test_params">
<Parameter name="hb_file" type="string" value="nos1.rsa"/>

<Parameter name="solver_type" type="string" value="Block Gmres"/>
<ParameterList name="Belos">
<Parameter name="Num Blocks" type="int" value="300"/>
<Parameter name="Verbosity" type="int" value="33"/>
<Parameter name="Output Style" type="int" value="1"/>
<Parameter name="Output Frequency" type="int" value="1"/>
</ParameterList>

<Parameter name="Ifpack2::Preconditioner" type="string" value="RILUK"/>
<ParameterList name="Ifpack2">
<Parameter name="fact: iluk level-of-fill" type="int" value="2"/>
<Parameter name="fact: type" type="string" value="KSPILUK"/>
<Parameter name="fact: kspiluk number-of-streams" type="int" value="4"/>
<Parameter name="fact: kspiluk reordering in streams" type="bool" value="true"/>
<Parameter name="trisolver: type" type="string" value="KSPTRSV"/>
</ParameterList>

<Parameter name="expectNumIters" type="int" value="50"/>
</ParameterList>
Loading
Loading