Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tpetra: Add team parallelism to a couple kernels associated with matrix equilibration #13017

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ class LeftScaleLocalCrsMatrix {
static_assert (ScalingFactorsViewType::rank == 1,
"scalingFactors must be a rank-1 Kokkos::View.");
using device_type = typename LocalSparseMatrixType::device_type;
using LO = typename LocalSparseMatrixType::ordinal_type;
using policy_type = Kokkos::TeamPolicy<typename device_type::execution_space, LO>;

/// \param A_lcl [in/out] The local sparse matrix.
///
Expand All @@ -94,26 +96,26 @@ class LeftScaleLocalCrsMatrix {
{}

KOKKOS_INLINE_FUNCTION void
operator () (const typename LocalSparseMatrixType::ordinal_type lclRow) const
operator () (const typename policy_type::member_type & team) const
{
using LO = typename LocalSparseMatrixType::ordinal_type;
using KAM = Kokkos::ArithTraits<mag_type>;

const LO lclRow = team.league_rank();
const mag_type curRowNorm = scalingFactors_(lclRow);
// Users are responsible for any divisions or multiplications by
// zero.
const mag_type scalingFactor = assumeSymmetric_ ?
KAM::sqrt (curRowNorm) : curRowNorm;
auto curRow = A_lcl_.row (lclRow);
const LO numEnt = curRow.length;
for (LO k = 0; k < numEnt; ++k) {
Kokkos::parallel_for(Kokkos::TeamThreadRange(team, numEnt), [&](const LO k) {
if (divide) { // constexpr, so should get compiled out
curRow.value (k) = curRow.value(k) / scalingFactor;
}
else {
curRow.value (k) = curRow.value(k) * scalingFactor;
}
}
});
}

private:
Expand Down Expand Up @@ -145,7 +147,7 @@ leftScaleLocalCrsMatrix (const LocalSparseMatrixType& A_lcl,
using device_type = typename LocalSparseMatrixType::device_type;
using execution_space = typename device_type::execution_space;
using LO = typename LocalSparseMatrixType::ordinal_type;
using range_type = Kokkos::RangePolicy<execution_space, LO>;
using policy_type = Kokkos::TeamPolicy<execution_space, LO>;

const LO lclNumRows = A_lcl.numRows ();
if (divide) {
Expand All @@ -154,15 +156,15 @@ leftScaleLocalCrsMatrix (const LocalSparseMatrixType& A_lcl,
typename ScalingFactorsViewType::const_type, true>;
functor_type functor (A_lcl, scalingFactors, assumeSymmetric);
Kokkos::parallel_for ("leftScaleLocalCrsMatrix",
range_type (0, lclNumRows), functor);
policy_type (lclNumRows, Kokkos::AUTO), functor);
}
else {
using functor_type =
LeftScaleLocalCrsMatrix<LocalSparseMatrixType,
typename ScalingFactorsViewType::const_type, false>;
functor_type functor (A_lcl, scalingFactors, assumeSymmetric);
Kokkos::parallel_for ("leftScaleLocalCrsMatrix",
range_type (0, lclNumRows), functor);
policy_type (lclNumRows, Kokkos::AUTO), functor);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ class RightScaleLocalCrsMatrix {
static_assert (ScalingFactorsViewType::rank == 1,
"scalingFactors must be a rank-1 Kokkos::View.");
using device_type = typename LocalSparseMatrixType::device_type;
using LO = typename LocalSparseMatrixType::ordinal_type;
using policy_type = Kokkos::TeamPolicy<typename device_type::execution_space, LO>;

/// \param A_lcl [in/out] The local sparse matrix.
///
Expand All @@ -96,14 +98,14 @@ class RightScaleLocalCrsMatrix {
{}

KOKKOS_INLINE_FUNCTION void
operator () (const typename LocalSparseMatrixType::ordinal_type lclRow) const
operator () (const typename policy_type::member_type & team) const
{
using LO = typename LocalSparseMatrixType::ordinal_type;
using KAM = Kokkos::ArithTraits<mag_type>;

const LO lclRow = team.league_rank();
auto curRow = A_lcl_.row (lclRow);
const LO numEnt = curRow.length;
for (LO k = 0; k < numEnt; ++k) {
Kokkos::parallel_for(Kokkos::TeamThreadRange(team, numEnt), [&](const LO k) {
const LO lclColInd = curRow.colidx(k);
const mag_type curColNorm = scalingFactors_(lclColInd);
// Users are responsible for any divisions or multiplications by
Expand All @@ -116,7 +118,7 @@ class RightScaleLocalCrsMatrix {
else {
curRow.value(k) = curRow.value(k) * scalingFactor;
}
}
});
}

private:
Expand Down Expand Up @@ -148,7 +150,7 @@ rightScaleLocalCrsMatrix (const LocalSparseMatrixType& A_lcl,
using device_type = typename LocalSparseMatrixType::device_type;
using execution_space = typename device_type::execution_space;
using LO = typename LocalSparseMatrixType::ordinal_type;
using range_type = Kokkos::RangePolicy<execution_space, LO>;
using policy_type = Kokkos::TeamPolicy<execution_space, LO>;

const LO lclNumRows = A_lcl.numRows ();
if (divide) {
Expand All @@ -157,15 +159,15 @@ rightScaleLocalCrsMatrix (const LocalSparseMatrixType& A_lcl,
typename ScalingFactorsViewType::const_type, true>;
functor_type functor (A_lcl, scalingFactors, assumeSymmetric);
Kokkos::parallel_for ("rightScaleLocalCrsMatrix",
range_type (0, lclNumRows), functor);
policy_type (lclNumRows, Kokkos::AUTO), functor);
}
else {
using functor_type =
RightScaleLocalCrsMatrix<LocalSparseMatrixType,
typename ScalingFactorsViewType::const_type, false>;
functor_type functor (A_lcl, scalingFactors, assumeSymmetric);
Kokkos::parallel_for ("rightScaleLocalCrsMatrix",
range_type (0, lclNumRows), functor);
policy_type (lclNumRows, Kokkos::AUTO), functor);
}
}

Expand Down
112 changes: 63 additions & 49 deletions packages/tpetra/core/src/Tpetra_computeRowAndColumnOneNorms_def.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
/// Tpetra_computeRowAndColumnOneNorms_decl.hpp in this directory.

#include "Tpetra_Details_copyConvert.hpp"
#include "Tpetra_Details_EquilibrationInfo.hpp"
#include "Tpetra_CrsMatrix.hpp"
#include "Tpetra_Export.hpp"
#include "Tpetra_Map.hpp"
Expand Down Expand Up @@ -310,6 +311,7 @@ class ComputeLocalRowScaledColumnNorms {
using val_type = typename Kokkos::ArithTraits<SC>::val_type;
using mag_type = typename Kokkos::ArithTraits<val_type>::mag_type;
using device_type = typename crs_matrix_type::device_type;
using policy_type = Kokkos::TeamPolicy<typename device_type::execution_space, LO>;

ComputeLocalRowScaledColumnNorms (const Kokkos::View<mag_type*, device_type>& rowScaledColNorms,
const Kokkos::View<const mag_type*, device_type>& rowNorms,
Expand All @@ -319,18 +321,19 @@ class ComputeLocalRowScaledColumnNorms {
A_lcl_ (A.getLocalMatrixDevice ())
{}

KOKKOS_INLINE_FUNCTION void operator () (const LO lclRow) const {
KOKKOS_INLINE_FUNCTION void operator () (const typename policy_type::member_type &team) const {
using KAT = Kokkos::ArithTraits<val_type>;

const LO lclRow = team.league_rank();
const auto curRow = A_lcl_.rowConst (lclRow);
const mag_type rowNorm = rowNorms_[lclRow];
const LO numEnt = curRow.length;
for (LO k = 0; k < numEnt; ++k) {
Kokkos::parallel_for(Kokkos::TeamThreadRange(team, numEnt), [&](const LO k) {
const mag_type matrixAbsVal = KAT::abs (curRow.value(k));
const LO lclCol = curRow.colidx(k);

Kokkos::atomic_add (&rowScaledColNorms_[lclCol], matrixAbsVal / rowNorm);
}
});
}

static void
Expand All @@ -339,14 +342,13 @@ class ComputeLocalRowScaledColumnNorms {
const crs_matrix_type& A)
{
using execution_space = typename device_type::execution_space;
using range_type = Kokkos::RangePolicy<execution_space, LO>;
using functor_type = ComputeLocalRowScaledColumnNorms<SC, LO, GO, NT>;

functor_type functor (rowScaledColNorms, rowNorms, A);
const LO lclNumRows =
static_cast<LO> (A.getRowMap ()->getLocalNumElements ());
Kokkos::parallel_for ("computeLocalRowScaledColumnNorms",
range_type (0, lclNumRows), functor);
policy_type (lclNumRows, Kokkos::AUTO), functor);
}

private:
Expand Down Expand Up @@ -409,6 +411,7 @@ class ComputeLocalRowOneNorms {
using local_matrix_device_type =
typename ::Tpetra::CrsMatrix<SC, LO, GO, NT>::local_matrix_device_type;
using local_map_type = typename ::Tpetra::Map<LO, GO, NT>::local_map_type;
using policy_type = Kokkos::TeamPolicy<typename local_matrix_device_type::execution_space, LO>;

ComputeLocalRowOneNorms (const equib_info_type& equib, // in/out
const local_matrix_device_type& A_lcl, // in
Expand Down Expand Up @@ -441,12 +444,13 @@ class ComputeLocalRowOneNorms {
}

KOKKOS_INLINE_FUNCTION void
operator () (const LO lclRow, value_type& dst) const
operator () (const typename policy_type::member_type& team, value_type& dst) const
{
using KAT = Kokkos::ArithTraits<val_type>;
using mag_type = typename KAT::mag_type;
using KAM = Kokkos::ArithTraits<mag_type>;

const LO lclRow = team.league_rank();
const GO gblRow = rowMap_.getGlobalElement (lclRow);
// OK if invalid(); then we simply won't find the diagonal entry.
const GO lclDiagColInd = colMap_.getLocalElement (gblRow);
Expand All @@ -456,33 +460,37 @@ class ComputeLocalRowOneNorms {

mag_type rowNorm {0.0};
val_type diagVal {0.0};
value_type dstThread {0};

for (LO k = 0; k < numEnt; ++k) {
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, numEnt), [&](const LO k, mag_type &normContrib, val_type& diagContrib, value_type& dstContrib) {
const val_type matrixVal = curRow.value (k);
if (KAT::isInf (matrixVal)) {
dst |= 1;
dstContrib |= 1;
}
if (KAT::isNan (matrixVal)) {
dst |= 2;
dstContrib |= 2;
}
const mag_type matrixAbsVal = KAT::abs (matrixVal);
rowNorm += matrixAbsVal;
normContrib += matrixAbsVal;
const LO lclCol = curRow.colidx (k);
if (lclCol == lclDiagColInd) {
diagVal = curRow.value (k); // assume no repeats
diagContrib = curRow.value (k); // assume no repeats
}
} // for each entry in row
}, Kokkos::Sum<mag_type>(rowNorm), Kokkos::Sum<val_type>(diagVal), Kokkos::BOr<value_type>(dstThread)); // for each entry in row

// This is a local result. If the matrix has an overlapping
// row Map, then the global result might differ.
if (diagVal == KAT::zero ()) {
dst |= 4;
}
if (rowNorm == KAM::zero ()) {
dst |= 8;
}
equib_.rowDiagonalEntries[lclRow] = diagVal;
equib_.rowNorms[lclRow] = rowNorm;
Kokkos::single(Kokkos::PerTeam(team), [&](){
dst |= dstThread;
if (diagVal == KAT::zero ()) {
dst |= 4;
}
if (rowNorm == KAM::zero ()) {
dst |= 8;
}
equib_.rowDiagonalEntries[lclRow] = diagVal;
equib_.rowNorms[lclRow] = rowNorm;
});
}

private:
Expand All @@ -501,6 +509,7 @@ class ComputeLocalRowAndColumnOneNorms {
using equib_info_type = EquilibrationInfo<val_type, typename NT::device_type>;
using local_matrix_device_type = typename ::Tpetra::CrsMatrix<SC, LO, GO, NT>::local_matrix_device_type;
using local_map_type = typename ::Tpetra::Map<LO, GO, NT>::local_map_type;
using policy_type = Kokkos::TeamPolicy<typename local_matrix_device_type::execution_space, LO>;

public:
ComputeLocalRowAndColumnOneNorms (const equib_info_type& equib, // in/out
Expand Down Expand Up @@ -534,12 +543,13 @@ class ComputeLocalRowAndColumnOneNorms {
}

KOKKOS_INLINE_FUNCTION void
operator () (const LO lclRow, value_type& dst) const
operator () (const typename policy_type::member_type& team, value_type& dst) const
{
using KAT = Kokkos::ArithTraits<val_type>;
using mag_type = typename KAT::mag_type;
using KAM = Kokkos::ArithTraits<mag_type>;

const LO lclRow = team.league_rank();
const GO gblRow = rowMap_.getGlobalElement (lclRow);
// OK if invalid(); then we simply won't find the diagonal entry.
const GO lclDiagColInd = colMap_.getLocalElement (gblRow);
Expand All @@ -549,46 +559,50 @@ class ComputeLocalRowAndColumnOneNorms {

mag_type rowNorm {0.0};
val_type diagVal {0.0};
value_type dstThread {0};

for (LO k = 0; k < numEnt; ++k) {
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, numEnt), [&](const LO k, mag_type &normContrib, val_type& diagContrib, value_type& dstContrib) {
const val_type matrixVal = curRow.value (k);
if (KAT::isInf (matrixVal)) {
dst |= 1;
dstContrib |= 1;
}
if (KAT::isNan (matrixVal)) {
dst |= 2;
dstContrib |= 2;
}
const mag_type matrixAbsVal = KAT::abs (matrixVal);
rowNorm += matrixAbsVal;
normContrib += matrixAbsVal;
const LO lclCol = curRow.colidx (k);
if (lclCol == lclDiagColInd) {
diagVal = curRow.value (k); // assume no repeats
diagContrib = curRow.value (k); // assume no repeats
}
if (! equib_.assumeSymmetric) {
Kokkos::atomic_add (&(equib_.colNorms[lclCol]), matrixAbsVal);
}
} // for each entry in row
}, Kokkos::Sum<mag_type>(rowNorm), Kokkos::Sum<val_type>(diagVal), Kokkos::BOr<value_type>(dstThread)); // for each entry in row

// This is a local result. If the matrix has an overlapping
// row Map, then the global result might differ.
if (diagVal == KAT::zero ()) {
dst |= 4;
}
if (rowNorm == KAM::zero ()) {
dst |= 8;
}
// NOTE (mfh 24 May 2018) We could actually compute local
// rowScaledColNorms in situ at this point, if ! assumeSymmetric
// and row Map is the same as range Map (so that the local row
// norms are the same as the global row norms).
equib_.rowDiagonalEntries[lclRow] = diagVal;
equib_.rowNorms[lclRow] = rowNorm;
if (! equib_.assumeSymmetric &&
lclDiagColInd != Tpetra::Details::OrdinalTraits<LO>::invalid ()) {
// Don't need an atomic update here, since this lclDiagColInd is
// a one-to-one function of lclRow.
equib_.colDiagonalEntries[lclDiagColInd] += diagVal;
}
Kokkos::single(Kokkos::PerTeam(team), [&](){
dst |= dstThread;
if (diagVal == KAT::zero ()) {
dst |= 4;
}
if (rowNorm == KAM::zero ()) {
dst |= 8;
}
// NOTE (mfh 24 May 2018) We could actually compute local
// rowScaledColNorms in situ at this point, if ! assumeSymmetric
// and row Map is the same as range Map (so that the local row
// norms are the same as the global row norms).
equib_.rowDiagonalEntries[lclRow] = diagVal;
equib_.rowNorms[lclRow] = rowNorm;
if (! equib_.assumeSymmetric &&
lclDiagColInd != Tpetra::Details::OrdinalTraits<LO>::invalid ()) {
// Don't need an atomic update here, since this lclDiagColInd is
// a one-to-one function of lclRow.
equib_.colDiagonalEntries[lclDiagColInd] += diagVal;
}
});
}

private:
Expand All @@ -605,7 +619,7 @@ EquilibrationInfo<typename Kokkos::ArithTraits<SC>::val_type, typename NT::devic
computeLocalRowOneNorms_CrsMatrix (const Tpetra::CrsMatrix<SC, LO, GO, NT>& A)
{
using execution_space = typename NT::device_type::execution_space;
using range_type = Kokkos::RangePolicy<execution_space, LO>;
using policy_type = Kokkos::TeamPolicy<execution_space, LO>;
using functor_type = ComputeLocalRowOneNorms<SC, LO, GO, NT>;
using val_type = typename Kokkos::ArithTraits<SC>::val_type;
using device_type = typename NT::device_type;
Expand All @@ -621,7 +635,7 @@ computeLocalRowOneNorms_CrsMatrix (const Tpetra::CrsMatrix<SC, LO, GO, NT>& A)
A.getColMap ()->getLocalMap ());
int result = 0;
Kokkos::parallel_reduce ("computeLocalRowOneNorms",
range_type (0, lclNumRows), functor,
policy_type (lclNumRows, Kokkos::AUTO), functor,
result);
equib.foundInf = (result & 1) != 0;
equib.foundNan = (result & 2) != 0;
Expand All @@ -638,7 +652,7 @@ computeLocalRowAndColumnOneNorms_CrsMatrix (const Tpetra::CrsMatrix<SC, LO, GO,
const bool assumeSymmetric)
{
using execution_space = typename NT::device_type::execution_space;
using range_type = Kokkos::RangePolicy<execution_space, LO>;
using policy_type = Kokkos::TeamPolicy<execution_space, LO>;
using functor_type = ComputeLocalRowAndColumnOneNorms<SC, LO, GO, NT>;
using val_type = typename Kokkos::ArithTraits<SC>::val_type;
using device_type = typename NT::device_type;
Expand All @@ -653,7 +667,7 @@ computeLocalRowAndColumnOneNorms_CrsMatrix (const Tpetra::CrsMatrix<SC, LO, GO,
A.getColMap ()->getLocalMap ());
int result = 0;
Kokkos::parallel_reduce ("computeLocalRowAndColumnOneNorms",
range_type (0, lclNumRows), functor,
policy_type (lclNumRows, Kokkos::AUTO), functor,
result);
equib.foundInf = (result & 1) != 0;
equib.foundNan = (result & 2) != 0;
Expand Down
Loading