Skip to content

Commit

Permalink
Merge Pull Request #13017 from vbrunini/Trilinos/team_parallel_scale_local_crs
Browse files Browse the repository at this point in the history

Automatically Merged using Trilinos Pull Request AutoTester
PR Title: Tpetra: Add team parallelism to a couple kernels associated with matrix equilibration
PR Author: vbrunini
  • Loading branch information
trilinos-autotester authored May 29, 2024
2 parents 74d9ce7 + 8a1f84a commit 1176070
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 63 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ class LeftScaleLocalCrsMatrix {
static_assert (ScalingFactorsViewType::rank == 1,
"scalingFactors must be a rank-1 Kokkos::View.");
using device_type = typename LocalSparseMatrixType::device_type;
using LO = typename LocalSparseMatrixType::ordinal_type;
using policy_type = Kokkos::TeamPolicy<typename device_type::execution_space, LO>;

/// \param A_lcl [in/out] The local sparse matrix.
///
Expand All @@ -94,26 +96,26 @@ class LeftScaleLocalCrsMatrix {
{}

KOKKOS_INLINE_FUNCTION void
operator () (const typename LocalSparseMatrixType::ordinal_type lclRow) const
operator () (const typename policy_type::member_type & team) const
{
using LO = typename LocalSparseMatrixType::ordinal_type;
using KAM = Kokkos::ArithTraits<mag_type>;

const LO lclRow = team.league_rank();
const mag_type curRowNorm = scalingFactors_(lclRow);
// Users are responsible for any divisions or multiplications by
// zero.
const mag_type scalingFactor = assumeSymmetric_ ?
KAM::sqrt (curRowNorm) : curRowNorm;
auto curRow = A_lcl_.row (lclRow);
const LO numEnt = curRow.length;
for (LO k = 0; k < numEnt; ++k) {
Kokkos::parallel_for(Kokkos::TeamThreadRange(team, numEnt), [&](const LO k) {
if (divide) { // constexpr, so should get compiled out
curRow.value (k) = curRow.value(k) / scalingFactor;
}
else {
curRow.value (k) = curRow.value(k) * scalingFactor;
}
}
});
}

private:
Expand Down Expand Up @@ -145,7 +147,7 @@ leftScaleLocalCrsMatrix (const LocalSparseMatrixType& A_lcl,
using device_type = typename LocalSparseMatrixType::device_type;
using execution_space = typename device_type::execution_space;
using LO = typename LocalSparseMatrixType::ordinal_type;
using range_type = Kokkos::RangePolicy<execution_space, LO>;
using policy_type = Kokkos::TeamPolicy<execution_space, LO>;

const LO lclNumRows = A_lcl.numRows ();
if (divide) {
Expand All @@ -154,15 +156,15 @@ leftScaleLocalCrsMatrix (const LocalSparseMatrixType& A_lcl,
typename ScalingFactorsViewType::const_type, true>;
functor_type functor (A_lcl, scalingFactors, assumeSymmetric);
Kokkos::parallel_for ("leftScaleLocalCrsMatrix",
range_type (0, lclNumRows), functor);
policy_type (lclNumRows, Kokkos::AUTO), functor);
}
else {
using functor_type =
LeftScaleLocalCrsMatrix<LocalSparseMatrixType,
typename ScalingFactorsViewType::const_type, false>;
functor_type functor (A_lcl, scalingFactors, assumeSymmetric);
Kokkos::parallel_for ("leftScaleLocalCrsMatrix",
range_type (0, lclNumRows), functor);
policy_type (lclNumRows, Kokkos::AUTO), functor);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ class RightScaleLocalCrsMatrix {
static_assert (ScalingFactorsViewType::rank == 1,
"scalingFactors must be a rank-1 Kokkos::View.");
using device_type = typename LocalSparseMatrixType::device_type;
using LO = typename LocalSparseMatrixType::ordinal_type;
using policy_type = Kokkos::TeamPolicy<typename device_type::execution_space, LO>;

/// \param A_lcl [in/out] The local sparse matrix.
///
Expand All @@ -96,14 +98,14 @@ class RightScaleLocalCrsMatrix {
{}

KOKKOS_INLINE_FUNCTION void
operator () (const typename LocalSparseMatrixType::ordinal_type lclRow) const
operator () (const typename policy_type::member_type & team) const
{
using LO = typename LocalSparseMatrixType::ordinal_type;
using KAM = Kokkos::ArithTraits<mag_type>;

const LO lclRow = team.league_rank();
auto curRow = A_lcl_.row (lclRow);
const LO numEnt = curRow.length;
for (LO k = 0; k < numEnt; ++k) {
Kokkos::parallel_for(Kokkos::TeamThreadRange(team, numEnt), [&](const LO k) {
const LO lclColInd = curRow.colidx(k);
const mag_type curColNorm = scalingFactors_(lclColInd);
// Users are responsible for any divisions or multiplications by
Expand All @@ -116,7 +118,7 @@ class RightScaleLocalCrsMatrix {
else {
curRow.value(k) = curRow.value(k) * scalingFactor;
}
}
});
}

private:
Expand Down Expand Up @@ -148,7 +150,7 @@ rightScaleLocalCrsMatrix (const LocalSparseMatrixType& A_lcl,
using device_type = typename LocalSparseMatrixType::device_type;
using execution_space = typename device_type::execution_space;
using LO = typename LocalSparseMatrixType::ordinal_type;
using range_type = Kokkos::RangePolicy<execution_space, LO>;
using policy_type = Kokkos::TeamPolicy<execution_space, LO>;

const LO lclNumRows = A_lcl.numRows ();
if (divide) {
Expand All @@ -157,15 +159,15 @@ rightScaleLocalCrsMatrix (const LocalSparseMatrixType& A_lcl,
typename ScalingFactorsViewType::const_type, true>;
functor_type functor (A_lcl, scalingFactors, assumeSymmetric);
Kokkos::parallel_for ("rightScaleLocalCrsMatrix",
range_type (0, lclNumRows), functor);
policy_type (lclNumRows, Kokkos::AUTO), functor);
}
else {
using functor_type =
RightScaleLocalCrsMatrix<LocalSparseMatrixType,
typename ScalingFactorsViewType::const_type, false>;
functor_type functor (A_lcl, scalingFactors, assumeSymmetric);
Kokkos::parallel_for ("rightScaleLocalCrsMatrix",
range_type (0, lclNumRows), functor);
policy_type (lclNumRows, Kokkos::AUTO), functor);
}
}

Expand Down
112 changes: 63 additions & 49 deletions packages/tpetra/core/src/Tpetra_computeRowAndColumnOneNorms_def.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
/// Tpetra_computeRowAndColumnOneNorms_decl.hpp in this directory.

#include "Tpetra_Details_copyConvert.hpp"
#include "Tpetra_Details_EquilibrationInfo.hpp"
#include "Tpetra_CrsMatrix.hpp"
#include "Tpetra_Export.hpp"
#include "Tpetra_Map.hpp"
Expand Down Expand Up @@ -310,6 +311,7 @@ class ComputeLocalRowScaledColumnNorms {
using val_type = typename Kokkos::ArithTraits<SC>::val_type;
using mag_type = typename Kokkos::ArithTraits<val_type>::mag_type;
using device_type = typename crs_matrix_type::device_type;
using policy_type = Kokkos::TeamPolicy<typename device_type::execution_space, LO>;

ComputeLocalRowScaledColumnNorms (const Kokkos::View<mag_type*, device_type>& rowScaledColNorms,
const Kokkos::View<const mag_type*, device_type>& rowNorms,
Expand All @@ -319,18 +321,19 @@ class ComputeLocalRowScaledColumnNorms {
A_lcl_ (A.getLocalMatrixDevice ())
{}

KOKKOS_INLINE_FUNCTION void operator () (const LO lclRow) const {
KOKKOS_INLINE_FUNCTION void operator () (const typename policy_type::member_type &team) const {
using KAT = Kokkos::ArithTraits<val_type>;

const LO lclRow = team.league_rank();
const auto curRow = A_lcl_.rowConst (lclRow);
const mag_type rowNorm = rowNorms_[lclRow];
const LO numEnt = curRow.length;
for (LO k = 0; k < numEnt; ++k) {
Kokkos::parallel_for(Kokkos::TeamThreadRange(team, numEnt), [&](const LO k) {
const mag_type matrixAbsVal = KAT::abs (curRow.value(k));
const LO lclCol = curRow.colidx(k);

Kokkos::atomic_add (&rowScaledColNorms_[lclCol], matrixAbsVal / rowNorm);
}
});
}

static void
Expand All @@ -339,14 +342,13 @@ class ComputeLocalRowScaledColumnNorms {
const crs_matrix_type& A)
{
using execution_space = typename device_type::execution_space;
using range_type = Kokkos::RangePolicy<execution_space, LO>;
using functor_type = ComputeLocalRowScaledColumnNorms<SC, LO, GO, NT>;

functor_type functor (rowScaledColNorms, rowNorms, A);
const LO lclNumRows =
static_cast<LO> (A.getRowMap ()->getLocalNumElements ());
Kokkos::parallel_for ("computeLocalRowScaledColumnNorms",
range_type (0, lclNumRows), functor);
policy_type (lclNumRows, Kokkos::AUTO), functor);
}

private:
Expand Down Expand Up @@ -409,6 +411,7 @@ class ComputeLocalRowOneNorms {
using local_matrix_device_type =
typename ::Tpetra::CrsMatrix<SC, LO, GO, NT>::local_matrix_device_type;
using local_map_type = typename ::Tpetra::Map<LO, GO, NT>::local_map_type;
using policy_type = Kokkos::TeamPolicy<typename local_matrix_device_type::execution_space, LO>;

ComputeLocalRowOneNorms (const equib_info_type& equib, // in/out
const local_matrix_device_type& A_lcl, // in
Expand Down Expand Up @@ -441,12 +444,13 @@ class ComputeLocalRowOneNorms {
}

KOKKOS_INLINE_FUNCTION void
operator () (const LO lclRow, value_type& dst) const
operator () (const typename policy_type::member_type& team, value_type& dst) const
{
using KAT = Kokkos::ArithTraits<val_type>;
using mag_type = typename KAT::mag_type;
using KAM = Kokkos::ArithTraits<mag_type>;

const LO lclRow = team.league_rank();
const GO gblRow = rowMap_.getGlobalElement (lclRow);
// OK if invalid(); then we simply won't find the diagonal entry.
const GO lclDiagColInd = colMap_.getLocalElement (gblRow);
Expand All @@ -456,33 +460,37 @@ class ComputeLocalRowOneNorms {

mag_type rowNorm {0.0};
val_type diagVal {0.0};
value_type dstThread {0};

for (LO k = 0; k < numEnt; ++k) {
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, numEnt), [&](const LO k, mag_type &normContrib, val_type& diagContrib, value_type& dstContrib) {
const val_type matrixVal = curRow.value (k);
if (KAT::isInf (matrixVal)) {
dst |= 1;
dstContrib |= 1;
}
if (KAT::isNan (matrixVal)) {
dst |= 2;
dstContrib |= 2;
}
const mag_type matrixAbsVal = KAT::abs (matrixVal);
rowNorm += matrixAbsVal;
normContrib += matrixAbsVal;
const LO lclCol = curRow.colidx (k);
if (lclCol == lclDiagColInd) {
diagVal = curRow.value (k); // assume no repeats
diagContrib = curRow.value (k); // assume no repeats
}
} // for each entry in row
}, Kokkos::Sum<mag_type>(rowNorm), Kokkos::Sum<val_type>(diagVal), Kokkos::BOr<value_type>(dstThread)); // for each entry in row

// This is a local result. If the matrix has an overlapping
// row Map, then the global result might differ.
if (diagVal == KAT::zero ()) {
dst |= 4;
}
if (rowNorm == KAM::zero ()) {
dst |= 8;
}
equib_.rowDiagonalEntries[lclRow] = diagVal;
equib_.rowNorms[lclRow] = rowNorm;
Kokkos::single(Kokkos::PerTeam(team), [&](){
dst |= dstThread;
if (diagVal == KAT::zero ()) {
dst |= 4;
}
if (rowNorm == KAM::zero ()) {
dst |= 8;
}
equib_.rowDiagonalEntries[lclRow] = diagVal;
equib_.rowNorms[lclRow] = rowNorm;
});
}

private:
Expand All @@ -501,6 +509,7 @@ class ComputeLocalRowAndColumnOneNorms {
using equib_info_type = EquilibrationInfo<val_type, typename NT::device_type>;
using local_matrix_device_type = typename ::Tpetra::CrsMatrix<SC, LO, GO, NT>::local_matrix_device_type;
using local_map_type = typename ::Tpetra::Map<LO, GO, NT>::local_map_type;
using policy_type = Kokkos::TeamPolicy<typename local_matrix_device_type::execution_space, LO>;

public:
ComputeLocalRowAndColumnOneNorms (const equib_info_type& equib, // in/out
Expand Down Expand Up @@ -534,12 +543,13 @@ class ComputeLocalRowAndColumnOneNorms {
}

KOKKOS_INLINE_FUNCTION void
operator () (const LO lclRow, value_type& dst) const
operator () (const typename policy_type::member_type& team, value_type& dst) const
{
using KAT = Kokkos::ArithTraits<val_type>;
using mag_type = typename KAT::mag_type;
using KAM = Kokkos::ArithTraits<mag_type>;

const LO lclRow = team.league_rank();
const GO gblRow = rowMap_.getGlobalElement (lclRow);
// OK if invalid(); then we simply won't find the diagonal entry.
const GO lclDiagColInd = colMap_.getLocalElement (gblRow);
Expand All @@ -549,46 +559,50 @@ class ComputeLocalRowAndColumnOneNorms {

mag_type rowNorm {0.0};
val_type diagVal {0.0};
value_type dstThread {0};

for (LO k = 0; k < numEnt; ++k) {
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, numEnt), [&](const LO k, mag_type &normContrib, val_type& diagContrib, value_type& dstContrib) {
const val_type matrixVal = curRow.value (k);
if (KAT::isInf (matrixVal)) {
dst |= 1;
dstContrib |= 1;
}
if (KAT::isNan (matrixVal)) {
dst |= 2;
dstContrib |= 2;
}
const mag_type matrixAbsVal = KAT::abs (matrixVal);
rowNorm += matrixAbsVal;
normContrib += matrixAbsVal;
const LO lclCol = curRow.colidx (k);
if (lclCol == lclDiagColInd) {
diagVal = curRow.value (k); // assume no repeats
diagContrib = curRow.value (k); // assume no repeats
}
if (! equib_.assumeSymmetric) {
Kokkos::atomic_add (&(equib_.colNorms[lclCol]), matrixAbsVal);
}
} // for each entry in row
}, Kokkos::Sum<mag_type>(rowNorm), Kokkos::Sum<val_type>(diagVal), Kokkos::BOr<value_type>(dstThread)); // for each entry in row

// This is a local result. If the matrix has an overlapping
// row Map, then the global result might differ.
if (diagVal == KAT::zero ()) {
dst |= 4;
}
if (rowNorm == KAM::zero ()) {
dst |= 8;
}
// NOTE (mfh 24 May 2018) We could actually compute local
// rowScaledColNorms in situ at this point, if ! assumeSymmetric
// and row Map is the same as range Map (so that the local row
// norms are the same as the global row norms).
equib_.rowDiagonalEntries[lclRow] = diagVal;
equib_.rowNorms[lclRow] = rowNorm;
if (! equib_.assumeSymmetric &&
lclDiagColInd != Tpetra::Details::OrdinalTraits<LO>::invalid ()) {
// Don't need an atomic update here, since this lclDiagColInd is
// a one-to-one function of lclRow.
equib_.colDiagonalEntries[lclDiagColInd] += diagVal;
}
Kokkos::single(Kokkos::PerTeam(team), [&](){
dst |= dstThread;
if (diagVal == KAT::zero ()) {
dst |= 4;
}
if (rowNorm == KAM::zero ()) {
dst |= 8;
}
// NOTE (mfh 24 May 2018) We could actually compute local
// rowScaledColNorms in situ at this point, if ! assumeSymmetric
// and row Map is the same as range Map (so that the local row
// norms are the same as the global row norms).
equib_.rowDiagonalEntries[lclRow] = diagVal;
equib_.rowNorms[lclRow] = rowNorm;
if (! equib_.assumeSymmetric &&
lclDiagColInd != Tpetra::Details::OrdinalTraits<LO>::invalid ()) {
// Don't need an atomic update here, since this lclDiagColInd is
// a one-to-one function of lclRow.
equib_.colDiagonalEntries[lclDiagColInd] += diagVal;
}
});
}

private:
Expand All @@ -605,7 +619,7 @@ EquilibrationInfo<typename Kokkos::ArithTraits<SC>::val_type, typename NT::devic
computeLocalRowOneNorms_CrsMatrix (const Tpetra::CrsMatrix<SC, LO, GO, NT>& A)
{
using execution_space = typename NT::device_type::execution_space;
using range_type = Kokkos::RangePolicy<execution_space, LO>;
using policy_type = Kokkos::TeamPolicy<execution_space, LO>;
using functor_type = ComputeLocalRowOneNorms<SC, LO, GO, NT>;
using val_type = typename Kokkos::ArithTraits<SC>::val_type;
using device_type = typename NT::device_type;
Expand All @@ -621,7 +635,7 @@ computeLocalRowOneNorms_CrsMatrix (const Tpetra::CrsMatrix<SC, LO, GO, NT>& A)
A.getColMap ()->getLocalMap ());
int result = 0;
Kokkos::parallel_reduce ("computeLocalRowOneNorms",
range_type (0, lclNumRows), functor,
policy_type (lclNumRows, Kokkos::AUTO), functor,
result);
equib.foundInf = (result & 1) != 0;
equib.foundNan = (result & 2) != 0;
Expand All @@ -638,7 +652,7 @@ computeLocalRowAndColumnOneNorms_CrsMatrix (const Tpetra::CrsMatrix<SC, LO, GO,
const bool assumeSymmetric)
{
using execution_space = typename NT::device_type::execution_space;
using range_type = Kokkos::RangePolicy<execution_space, LO>;
using policy_type = Kokkos::TeamPolicy<execution_space, LO>;
using functor_type = ComputeLocalRowAndColumnOneNorms<SC, LO, GO, NT>;
using val_type = typename Kokkos::ArithTraits<SC>::val_type;
using device_type = typename NT::device_type;
Expand All @@ -653,7 +667,7 @@ computeLocalRowAndColumnOneNorms_CrsMatrix (const Tpetra::CrsMatrix<SC, LO, GO,
A.getColMap ()->getLocalMap ());
int result = 0;
Kokkos::parallel_reduce ("computeLocalRowAndColumnOneNorms",
range_type (0, lclNumRows), functor,
policy_type (lclNumRows, Kokkos::AUTO), functor,
result);
equib.foundInf = (result & 1) != 0;
equib.foundNan = (result & 2) != 0;
Expand Down

0 comments on commit 1176070

Please sign in to comment.