diff --git a/packages/tpetra/core/src/Tpetra_Details_leftScaleLocalCrsMatrix.hpp b/packages/tpetra/core/src/Tpetra_Details_leftScaleLocalCrsMatrix.hpp index 302301cd38da..aa6bc75753d0 100644 --- a/packages/tpetra/core/src/Tpetra_Details_leftScaleLocalCrsMatrix.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_leftScaleLocalCrsMatrix.hpp @@ -76,6 +76,8 @@ class LeftScaleLocalCrsMatrix { static_assert (ScalingFactorsViewType::rank == 1, "scalingFactors must be a rank-1 Kokkos::View."); using device_type = typename LocalSparseMatrixType::device_type; + using LO = typename LocalSparseMatrixType::ordinal_type; + using policy_type = Kokkos::TeamPolicy; /// \param A_lcl [in/out] The local sparse matrix. /// @@ -94,11 +96,11 @@ class LeftScaleLocalCrsMatrix { {} KOKKOS_INLINE_FUNCTION void - operator () (const typename LocalSparseMatrixType::ordinal_type lclRow) const + operator () (const typename policy_type::member_type & team) const { - using LO = typename LocalSparseMatrixType::ordinal_type; using KAM = Kokkos::ArithTraits; + const LO lclRow = team.league_rank(); const mag_type curRowNorm = scalingFactors_(lclRow); // Users are responsible for any divisions or multiplications by // zero. @@ -106,14 +108,14 @@ class LeftScaleLocalCrsMatrix { KAM::sqrt (curRowNorm) : curRowNorm; auto curRow = A_lcl_.row (lclRow); const LO numEnt = curRow.length; - for (LO k = 0; k < numEnt; ++k) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, numEnt), [&](const LO k) { if (divide) { // constexpr, so should get compiled out curRow.value (k) = curRow.value(k) / scalingFactor; } else { curRow.value (k) = curRow.value(k) * scalingFactor; } - } + }); } private: @@ -145,7 +147,7 @@ leftScaleLocalCrsMatrix (const LocalSparseMatrixType& A_lcl, using device_type = typename LocalSparseMatrixType::device_type; using execution_space = typename device_type::execution_space; using LO = typename LocalSparseMatrixType::ordinal_type; - using range_type = Kokkos::RangePolicy; + using policy_type = Kokkos::TeamPolicy; const LO lclNumRows = A_lcl.numRows (); if (divide) { @@ -154,7 +156,7 @@ leftScaleLocalCrsMatrix (const LocalSparseMatrixType& A_lcl, typename ScalingFactorsViewType::const_type, true>; functor_type functor (A_lcl, scalingFactors, assumeSymmetric); Kokkos::parallel_for ("leftScaleLocalCrsMatrix", - range_type (0, lclNumRows), functor); + policy_type (lclNumRows, Kokkos::AUTO), functor); } else { using functor_type = @@ -162,7 +164,7 @@ leftScaleLocalCrsMatrix (const LocalSparseMatrixType& A_lcl, typename ScalingFactorsViewType::const_type, false>; functor_type functor (A_lcl, scalingFactors, assumeSymmetric); Kokkos::parallel_for ("leftScaleLocalCrsMatrix", - range_type (0, lclNumRows), functor); + policy_type (lclNumRows, Kokkos::AUTO), functor); } } diff --git a/packages/tpetra/core/src/Tpetra_Details_rightScaleLocalCrsMatrix.hpp b/packages/tpetra/core/src/Tpetra_Details_rightScaleLocalCrsMatrix.hpp index 219888164161..a6a48e8b2f8e 100644 --- a/packages/tpetra/core/src/Tpetra_Details_rightScaleLocalCrsMatrix.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_rightScaleLocalCrsMatrix.hpp @@ -76,6 +76,8 @@ class RightScaleLocalCrsMatrix { static_assert (ScalingFactorsViewType::rank == 1, "scalingFactors must be a rank-1 Kokkos::View."); using device_type = typename LocalSparseMatrixType::device_type; + using LO = typename LocalSparseMatrixType::ordinal_type; + using policy_type = Kokkos::TeamPolicy; /// \param A_lcl [in/out] The local sparse matrix. /// @@ -96,14 +98,14 @@ class RightScaleLocalCrsMatrix { {} KOKKOS_INLINE_FUNCTION void - operator () (const typename LocalSparseMatrixType::ordinal_type lclRow) const + operator () (const typename policy_type::member_type & team) const { - using LO = typename LocalSparseMatrixType::ordinal_type; using KAM = Kokkos::ArithTraits; + const LO lclRow = team.league_rank(); auto curRow = A_lcl_.row (lclRow); const LO numEnt = curRow.length; - for (LO k = 0; k < numEnt; ++k) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, numEnt), [&](const LO k) { const LO lclColInd = curRow.colidx(k); const mag_type curColNorm = scalingFactors_(lclColInd); // Users are responsible for any divisions or multiplications by @@ -116,7 +118,7 @@ class RightScaleLocalCrsMatrix { else { curRow.value(k) = curRow.value(k) * scalingFactor; } - } + }); } private: @@ -148,7 +150,7 @@ rightScaleLocalCrsMatrix (const LocalSparseMatrixType& A_lcl, using device_type = typename LocalSparseMatrixType::device_type; using execution_space = typename device_type::execution_space; using LO = typename LocalSparseMatrixType::ordinal_type; - using range_type = Kokkos::RangePolicy; + using policy_type = Kokkos::TeamPolicy; const LO lclNumRows = A_lcl.numRows (); if (divide) { @@ -157,7 +159,7 @@ rightScaleLocalCrsMatrix (const LocalSparseMatrixType& A_lcl, typename ScalingFactorsViewType::const_type, true>; functor_type functor (A_lcl, scalingFactors, assumeSymmetric); Kokkos::parallel_for ("rightScaleLocalCrsMatrix", - range_type (0, lclNumRows), functor); + policy_type (lclNumRows, Kokkos::AUTO), functor); } else { using functor_type = @@ -165,7 +167,7 @@ rightScaleLocalCrsMatrix (const LocalSparseMatrixType& A_lcl, typename ScalingFactorsViewType::const_type, false>; functor_type functor (A_lcl, scalingFactors, assumeSymmetric); Kokkos::parallel_for ("rightScaleLocalCrsMatrix", - range_type (0, lclNumRows), functor); + policy_type (lclNumRows, Kokkos::AUTO), functor); } } diff --git a/packages/tpetra/core/src/Tpetra_computeRowAndColumnOneNorms_def.hpp b/packages/tpetra/core/src/Tpetra_computeRowAndColumnOneNorms_def.hpp index a526d4438f9e..872e7ea23a6e 100644 --- a/packages/tpetra/core/src/Tpetra_computeRowAndColumnOneNorms_def.hpp +++ b/packages/tpetra/core/src/Tpetra_computeRowAndColumnOneNorms_def.hpp @@ -50,6 +50,7 @@ /// Tpetra_computeRowAndColumnOneNorms_decl.hpp in this directory. #include "Tpetra_Details_copyConvert.hpp" +#include "Tpetra_Details_EquilibrationInfo.hpp" #include "Tpetra_CrsMatrix.hpp" #include "Tpetra_Export.hpp" #include "Tpetra_Map.hpp" @@ -310,6 +311,7 @@ class ComputeLocalRowScaledColumnNorms { using val_type = typename Kokkos::ArithTraits::val_type; using mag_type = typename Kokkos::ArithTraits::mag_type; using device_type = typename crs_matrix_type::device_type; + using policy_type = Kokkos::TeamPolicy; ComputeLocalRowScaledColumnNorms (const Kokkos::View& rowScaledColNorms, const Kokkos::View& rowNorms, @@ -319,18 +321,19 @@ class ComputeLocalRowScaledColumnNorms { A_lcl_ (A.getLocalMatrixDevice ()) {} - KOKKOS_INLINE_FUNCTION void operator () (const LO lclRow) const { + KOKKOS_INLINE_FUNCTION void operator () (const typename policy_type::member_type &team) const { using KAT = Kokkos::ArithTraits; + const LO lclRow = team.league_rank(); const auto curRow = A_lcl_.rowConst (lclRow); const mag_type rowNorm = rowNorms_[lclRow]; const LO numEnt = curRow.length; - for (LO k = 0; k < numEnt; ++k) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, numEnt), [&](const LO k) { const mag_type matrixAbsVal = KAT::abs (curRow.value(k)); const LO lclCol = curRow.colidx(k); Kokkos::atomic_add (&rowScaledColNorms_[lclCol], matrixAbsVal / rowNorm); - } + }); } static void @@ -339,14 +342,13 @@ class ComputeLocalRowScaledColumnNorms { const crs_matrix_type& A) { using execution_space = typename device_type::execution_space; - using range_type = Kokkos::RangePolicy; using functor_type = ComputeLocalRowScaledColumnNorms; functor_type functor (rowScaledColNorms, rowNorms, A); const LO lclNumRows = static_cast (A.getRowMap ()->getLocalNumElements ()); Kokkos::parallel_for ("computeLocalRowScaledColumnNorms", - range_type (0, lclNumRows), functor); + policy_type (lclNumRows, Kokkos::AUTO), functor); } private: @@ -409,6 +411,7 @@ class ComputeLocalRowOneNorms { using local_matrix_device_type = typename ::Tpetra::CrsMatrix::local_matrix_device_type; using local_map_type = typename ::Tpetra::Map::local_map_type; + using policy_type = Kokkos::TeamPolicy; ComputeLocalRowOneNorms (const equib_info_type& equib, // in/out const local_matrix_device_type& A_lcl, // in @@ -441,12 +444,13 @@ class ComputeLocalRowOneNorms { } KOKKOS_INLINE_FUNCTION void - operator () (const LO lclRow, value_type& dst) const + operator () (const typename policy_type::member_type& team, value_type& dst) const { using KAT = Kokkos::ArithTraits; using mag_type = typename KAT::mag_type; using KAM = Kokkos::ArithTraits; + const LO lclRow = team.league_rank(); const GO gblRow = rowMap_.getGlobalElement (lclRow); // OK if invalid(); then we simply won't find the diagonal entry. const GO lclDiagColInd = colMap_.getLocalElement (gblRow); @@ -456,33 +460,37 @@ class ComputeLocalRowOneNorms { mag_type rowNorm {0.0}; val_type diagVal {0.0}; + value_type dstThread {0}; - for (LO k = 0; k < numEnt; ++k) { + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, numEnt), [&](const LO k, mag_type &normContrib, val_type& diagContrib, value_type& dstContrib) { const val_type matrixVal = curRow.value (k); if (KAT::isInf (matrixVal)) { - dst |= 1; + dstContrib |= 1; } if (KAT::isNan (matrixVal)) { - dst |= 2; + dstContrib |= 2; } const mag_type matrixAbsVal = KAT::abs (matrixVal); - rowNorm += matrixAbsVal; + normContrib += matrixAbsVal; const LO lclCol = curRow.colidx (k); if (lclCol == lclDiagColInd) { - diagVal = curRow.value (k); // assume no repeats + diagContrib = curRow.value (k); // assume no repeats } - } // for each entry in row + }, Kokkos::Sum(rowNorm), Kokkos::Sum(diagVal), Kokkos::BOr(dstThread)); // for each entry in row // This is a local result. If the matrix has an overlapping // row Map, then the global result might differ. - if (diagVal == KAT::zero ()) { - dst |= 4; - } - if (rowNorm == KAM::zero ()) { - dst |= 8; - } - equib_.rowDiagonalEntries[lclRow] = diagVal; - equib_.rowNorms[lclRow] = rowNorm; + Kokkos::single(Kokkos::PerTeam(team), [&](){ + dst |= dstThread; + if (diagVal == KAT::zero ()) { + dst |= 4; + } + if (rowNorm == KAM::zero ()) { + dst |= 8; + } + equib_.rowDiagonalEntries[lclRow] = diagVal; + equib_.rowNorms[lclRow] = rowNorm; + }); } private: @@ -501,6 +509,7 @@ class ComputeLocalRowAndColumnOneNorms { using equib_info_type = EquilibrationInfo; using local_matrix_device_type = typename ::Tpetra::CrsMatrix::local_matrix_device_type; using local_map_type = typename ::Tpetra::Map::local_map_type; + using policy_type = Kokkos::TeamPolicy; public: ComputeLocalRowAndColumnOneNorms (const equib_info_type& equib, // in/out @@ -534,12 +543,13 @@ class ComputeLocalRowAndColumnOneNorms { } KOKKOS_INLINE_FUNCTION void - operator () (const LO lclRow, value_type& dst) const + operator () (const typename policy_type::member_type& team, value_type& dst) const { using KAT = Kokkos::ArithTraits; using mag_type = typename KAT::mag_type; using KAM = Kokkos::ArithTraits; + const LO lclRow = team.league_rank(); const GO gblRow = rowMap_.getGlobalElement (lclRow); // OK if invalid(); then we simply won't find the diagonal entry. const GO lclDiagColInd = colMap_.getLocalElement (gblRow); @@ -549,46 +559,50 @@ class ComputeLocalRowAndColumnOneNorms { mag_type rowNorm {0.0}; val_type diagVal {0.0}; + value_type dstThread {0}; - for (LO k = 0; k < numEnt; ++k) { + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, numEnt), [&](const LO k, mag_type &normContrib, val_type& diagContrib, value_type& dstContrib) { const val_type matrixVal = curRow.value (k); if (KAT::isInf (matrixVal)) { - dst |= 1; + dstContrib |= 1; } if (KAT::isNan (matrixVal)) { - dst |= 2; + dstContrib |= 2; } const mag_type matrixAbsVal = KAT::abs (matrixVal); - rowNorm += matrixAbsVal; + normContrib += matrixAbsVal; const LO lclCol = curRow.colidx (k); if (lclCol == lclDiagColInd) { - diagVal = curRow.value (k); // assume no repeats + diagContrib = curRow.value (k); // assume no repeats } if (! equib_.assumeSymmetric) { Kokkos::atomic_add (&(equib_.colNorms[lclCol]), matrixAbsVal); } - } // for each entry in row + }, Kokkos::Sum(rowNorm), Kokkos::Sum(diagVal), Kokkos::BOr(dstThread)); // for each entry in row // This is a local result. If the matrix has an overlapping // row Map, then the global result might differ. - if (diagVal == KAT::zero ()) { - dst |= 4; - } - if (rowNorm == KAM::zero ()) { - dst |= 8; - } - // NOTE (mfh 24 May 2018) We could actually compute local - // rowScaledColNorms in situ at this point, if ! assumeSymmetric - // and row Map is the same as range Map (so that the local row - // norms are the same as the global row norms). - equib_.rowDiagonalEntries[lclRow] = diagVal; - equib_.rowNorms[lclRow] = rowNorm; - if (! equib_.assumeSymmetric && - lclDiagColInd != Tpetra::Details::OrdinalTraits::invalid ()) { - // Don't need an atomic update here, since this lclDiagColInd is - // a one-to-one function of lclRow. - equib_.colDiagonalEntries[lclDiagColInd] += diagVal; - } + Kokkos::single(Kokkos::PerTeam(team), [&](){ + dst |= dstThread; + if (diagVal == KAT::zero ()) { + dst |= 4; + } + if (rowNorm == KAM::zero ()) { + dst |= 8; + } + // NOTE (mfh 24 May 2018) We could actually compute local + // rowScaledColNorms in situ at this point, if ! assumeSymmetric + // and row Map is the same as range Map (so that the local row + // norms are the same as the global row norms). + equib_.rowDiagonalEntries[lclRow] = diagVal; + equib_.rowNorms[lclRow] = rowNorm; + if (! equib_.assumeSymmetric && + lclDiagColInd != Tpetra::Details::OrdinalTraits::invalid ()) { + // Don't need an atomic update here, since this lclDiagColInd is + // a one-to-one function of lclRow. + equib_.colDiagonalEntries[lclDiagColInd] += diagVal; + } + }); } private: @@ -605,7 +619,7 @@ EquilibrationInfo::val_type, typename NT::devic computeLocalRowOneNorms_CrsMatrix (const Tpetra::CrsMatrix& A) { using execution_space = typename NT::device_type::execution_space; - using range_type = Kokkos::RangePolicy; + using policy_type = Kokkos::TeamPolicy; using functor_type = ComputeLocalRowOneNorms; using val_type = typename Kokkos::ArithTraits::val_type; using device_type = typename NT::device_type; @@ -621,7 +635,7 @@ computeLocalRowOneNorms_CrsMatrix (const Tpetra::CrsMatrix& A) A.getColMap ()->getLocalMap ()); int result = 0; Kokkos::parallel_reduce ("computeLocalRowOneNorms", - range_type (0, lclNumRows), functor, + policy_type (lclNumRows, Kokkos::AUTO), functor, result); equib.foundInf = (result & 1) != 0; equib.foundNan = (result & 2) != 0; @@ -638,7 +652,7 @@ computeLocalRowAndColumnOneNorms_CrsMatrix (const Tpetra::CrsMatrix; + using policy_type = Kokkos::TeamPolicy; using functor_type = ComputeLocalRowAndColumnOneNorms; using val_type = typename Kokkos::ArithTraits::val_type; using device_type = typename NT::device_type; @@ -653,7 +667,7 @@ computeLocalRowAndColumnOneNorms_CrsMatrix (const Tpetra::CrsMatrixgetLocalMap ()); int result = 0; Kokkos::parallel_reduce ("computeLocalRowAndColumnOneNorms", - range_type (0, lclNumRows), functor, + policy_type (lclNumRows, Kokkos::AUTO), functor, result); equib.foundInf = (result & 1) != 0; equib.foundNan = (result & 2) != 0;