Skip to content

Commit

Permalink
MueLu: Fix for phase 2b
Browse files Browse the repository at this point in the history
Compute aggWeight locally instead of globally.

Signed-off-by: Christian Glusa <[email protected]>
  • Loading branch information
cgcgcg committed Nov 6, 2024
1 parent d7d7ee2 commit da49512
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 166 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -73,17 +73,13 @@ class AggregationPhase2bAlgorithm : public MueLu::AggregationAlgorithmBase<Local
typename AggregationAlgorithmBase<LocalOrdinal, GlobalOrdinal, Node>::AggStatType& aggStat,
LO& numNonAggregatedNodes) const;

//! @brief Phase 2b expansion, non-deterministic variant.
//!
//! The BuildAggregates dispatcher calls this when "aggregation: deterministic" is false.
//! NOTE(review): this commit also declares the templated BuildAggregates<deterministic>;
//! this declaration appears to be superseded by it - confirm against the repository.
//!
//! @param params                 aggregation parameter list
//! @param graph                  graph whose vertices are being aggregated
//! @param aggregates             aggregate data; updated for every vertex attached by this phase
//! @param aggStat                per-vertex aggregation status
//! @param numNonAggregatedNodes  in/out count, reduced by the number of vertices aggregated here
void BuildAggregatesRandom(const ParameterList& params,
const LWGraph_kokkos& graph,
Aggregates& aggregates,
typename AggregationAlgorithmBase<LocalOrdinal, GlobalOrdinal, Node>::AggStatType& aggStat,
LO& numNonAggregatedNodes) const;

//! @brief Phase 2b expansion, deterministic variant.
//!
//! The BuildAggregates dispatcher calls this when "aggregation: deterministic" is true.
//! Penalty increments are collected in a separate buffer and folded into the aggregate
//! penalties by a follow-up kernel after each expansion pass.
//! NOTE(review): this commit also declares the templated BuildAggregates<deterministic>;
//! this declaration appears to be superseded by it - confirm against the repository.
//!
//! @param params                 aggregation parameter list (reads "aggregation: max agg size")
//! @param graph                  graph whose vertices are being aggregated
//! @param aggregates             aggregate data; updated for every vertex attached by this phase
//! @param aggStat                per-vertex aggregation status
//! @param numNonAggregatedNodes  in/out count, reduced by the number of vertices aggregated here
void BuildAggregatesDeterministic(const ParameterList& params,
const LWGraph_kokkos& graph,
Aggregates& aggregates,
typename AggregationAlgorithmBase<LocalOrdinal, GlobalOrdinal, Node>::AggStatType& aggStat,
LO& numNonAggregatedNodes) const;
//! @brief Phase 2b expansion kernel shared by the deterministic and non-deterministic paths.
//!
//! The non-template BuildAggregates dispatcher instantiates this with
//! deterministic = true when "aggregation: deterministic" is set, and with false otherwise.
//!
//! @tparam deterministic  when true, penalty increments go to a separate update buffer that is
//!                        added to the aggregate penalties after each expansion pass; when false,
//!                        the penalties are incremented atomically inside the expansion kernel
//! @param params                 aggregation parameter list (reads "aggregation: match ML phase2b")
//! @param graph                  graph whose vertices are being aggregated
//! @param aggregates             aggregate data; updated for every vertex attached by this phase
//! @param aggStat                per-vertex aggregation status
//! @param numNonAggregatedNodes  in/out count, reduced by the number of vertices aggregated here
template <bool deterministic>
void BuildAggregates(const ParameterList& params,
const LWGraph_kokkos& graph,
Aggregates& aggregates,
typename AggregationAlgorithmBase<LocalOrdinal, GlobalOrdinal, Node>::AggStatType& aggStat,
LO& numNonAggregatedNodes) const;

//@}

//! Returns the fixed label of this aggregation phase, "Phase 2b (expansion)".
std::string description() const { return "Phase 2b (expansion)"; }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,12 @@ void AggregationPhase2bAlgorithm<LocalOrdinal, GlobalOrdinal, Node>::BuildAggreg

LO numLocalAggregates = aggregates.GetNumAggregates();

const int defaultConnectWeight = 100;
const int penaltyConnectWeight = 10;
const LO defaultConnectWeight = 100;
const LO penaltyConnectWeight = 10;

std::vector<int> aggWeight(numLocalAggregates, 0);
std::vector<int> connectWeight(numRows, defaultConnectWeight);
std::vector<int> aggPenalties(numRows, 0);
std::vector<LO> aggWeight(numLocalAggregates, 0);
std::vector<LO> connectWeight(numRows, defaultConnectWeight);
std::vector<LO> aggPenalties(numRows, 0);

// We do this cycle twice.
// I don't know why, but ML does it too
Expand Down Expand Up @@ -118,24 +118,27 @@ void AggregationPhase2bAlgorithm<LocalOrdinal, GlobalOrdinal, Node>::
LO& numNonAggregatedNodes) const {
if (params.get<bool>("aggregation: deterministic")) {
Monitor m(*this, "BuildAggregatesDeterministic");
BuildAggregatesDeterministic(params, graph, aggregates, aggStat, numNonAggregatedNodes);
BuildAggregates<true>(params, graph, aggregates, aggStat, numNonAggregatedNodes);
} else {
Monitor m(*this, "BuildAggregatesRandom");
BuildAggregatesRandom(params, graph, aggregates, aggStat, numNonAggregatedNodes);
BuildAggregates<false>(params, graph, aggregates, aggStat, numNonAggregatedNodes);
}

} // BuildAggregates

template <class LO, class GO, class Node>
void AggregationPhase2bAlgorithm<LO, GO, Node>::
BuildAggregatesRandom(const ParameterList& params,
const LWGraph_kokkos& graph,
Aggregates& aggregates,
typename AggregationAlgorithmBase<LO, GO, Node>::AggStatType& aggStat,
LO& numNonAggregatedNodes) const {
template <class LocalOrdinal, class GlobalOrdinal, class Node>
template <bool deterministic>
void AggregationPhase2bAlgorithm<LocalOrdinal, GlobalOrdinal, Node>::
BuildAggregates(const ParameterList& params,
const LWGraph_kokkos& graph,
Aggregates& aggregates,
typename AggregationAlgorithmBase<LocalOrdinal, GlobalOrdinal, Node>::AggStatType& aggStat,
LO& numNonAggregatedNodes) const {
using device_type = typename LWGraph_kokkos::device_type;
using execution_space = typename LWGraph_kokkos::execution_space;

bool matchMLbehavior = params.get<bool>("aggregation: match ML phase2b");

const LO numRows = graph.GetNodeNumVertices();
const int myRank = graph.GetComm()->getRank();

Expand All @@ -150,9 +153,11 @@ void AggregationPhase2bAlgorithm<LO, GO, Node>::
const LO defaultConnectWeight = 100;
const LO penaltyConnectWeight = 10;

Kokkos::View<LO*, device_type> aggWeight(Kokkos::ViewAllocateWithoutInitializing("aggWeight"), numLocalAggregates); // This gets re-initialized at the start of each "color" loop
Kokkos::View<LO*, device_type> connectWeight(Kokkos::ViewAllocateWithoutInitializing("connectWeight"), numRows);
Kokkos::View<LO*, device_type> aggPenalties("aggPenalties", numLocalAggregates); // This gets initialized to zero here
Kokkos::View<LO*, device_type> aggPenaltyUpdates;
if constexpr (deterministic)
aggPenaltyUpdates = Kokkos::View<LO*, device_type>("aggPenaltyUpdates", numLocalAggregates);

Kokkos::deep_copy(connectWeight, defaultConnectWeight);

Expand All @@ -170,8 +175,6 @@ void AggregationPhase2bAlgorithm<LO, GO, Node>::
}
for (int iter = 0; iter < maxIters; ++iter) {
for (LO color = 1; color <= numColors; ++color) {
Kokkos::deep_copy(aggWeight, 0);

// the reduce counts how many nodes are aggregated by this phase,
// which will then be subtracted from numNonAggregatedNodes
LO numAggregated = 0;
Expand All @@ -182,143 +185,31 @@ void AggregationPhase2bAlgorithm<LO, GO, Node>::
if (aggStat(i) != READY || colors(i) != color)
return;

auto neighOfINode = lclLWGraph.getNeighborVertices(i);
for (int j = 0; j < neighOfINode.length; j++) {
LO neigh = neighOfINode(j);

// We don't check (neigh != i), as it is covered by checking
// (aggStat[neigh] == AGGREGATED)
if (lclLWGraph.isLocalNeighborVertex(neigh) &&
aggStat(neigh) == AGGREGATED)
Kokkos::atomic_add(&aggWeight(vertex2AggId(neigh, 0)),
connectWeight(neigh));
}

int bestScore = -100000;
int bestAggId = -1;
int bestConnect = -1;

auto neighOfINode = lclLWGraph.getNeighborVertices(i);

for (int j = 0; j < neighOfINode.length; j++) {
LO neigh = neighOfINode(j);

if (lclLWGraph.isLocalNeighborVertex(neigh) &&
aggStat(neigh) == AGGREGATED) {
auto aggId = vertex2AggId(neigh, 0);
int score = aggWeight(aggId) - aggPenalties(aggId);

if (score > bestScore) {
bestAggId = aggId;
bestScore = score;
bestConnect = connectWeight(neigh);

} else if (aggId == bestAggId &&
connectWeight(neigh) > bestConnect) {
bestConnect = connectWeight(neigh);
(aggStat(neigh) == AGGREGATED)) {
auto aggId = vertex2AggId(neigh, 0);
LO aggWeight = 0;
for (int k = 0; k < neighOfINode.length; k++) {
LO neigh2 = neighOfINode(k);
if (lclLWGraph.isLocalNeighborVertex(neigh2) &&
(aggStat(neigh2) == AGGREGATED) &&
(vertex2AggId(neigh2, 0) == aggId))
aggWeight += connectWeight(neigh2);
}
}
}
if (bestScore >= 0) {
aggStat(i) = AGGREGATED;
vertex2AggId(i, 0) = bestAggId;
procWinner(i, 0) = myRank;

Kokkos::atomic_add(&aggPenalties(bestAggId), 1);
connectWeight(i) = bestConnect - penaltyConnectWeight;
tmpNumAggregated++;
}
},
numAggregated); // parallel_for
numNonAggregatedNodes -= numAggregated;
}
} // loop over maxIters

} // BuildAggregatesRandom

template <class LO, class GO, class Node>
void AggregationPhase2bAlgorithm<LO, GO, Node>::
BuildAggregatesDeterministic(const ParameterList& params,
const LWGraph_kokkos& graph,
Aggregates& aggregates,
typename AggregationAlgorithmBase<LO, GO, Node>::AggStatType& aggStat,
LO& numNonAggregatedNodes) const {
using device_type = typename LWGraph_kokkos::device_type;
using execution_space = typename LWGraph_kokkos::execution_space;

const LO numRows = graph.GetNodeNumVertices();
const int myRank = graph.GetComm()->getRank();

auto vertex2AggId = aggregates.GetVertex2AggId()->getDeviceLocalView(Xpetra::Access::ReadWrite);
auto procWinner = aggregates.GetProcWinner()->getDeviceLocalView(Xpetra::Access::ReadWrite);
auto colors = aggregates.GetGraphColors();
const LO numColors = aggregates.GetGraphNumColors();
LO numLocalAggregates = aggregates.GetNumAggregates();

auto lclLWGraph = graph;

const int defaultConnectWeight = 100;
const int penaltyConnectWeight = 10;

Kokkos::View<int*, device_type> connectWeight(Kokkos::ViewAllocateWithoutInitializing("connectWeight"), numRows);
Kokkos::View<int*, device_type> aggWeight(Kokkos::ViewAllocateWithoutInitializing("aggWeight"), numLocalAggregates); // This gets re-initialized at the start of each "color" loop
Kokkos::View<int*, device_type> aggPenaltyUpdates("aggPenaltyUpdates", numLocalAggregates);
Kokkos::View<int*, device_type> aggPenalties("aggPenalties", numLocalAggregates);

Kokkos::deep_copy(connectWeight, defaultConnectWeight);

// We do this cycle twice.
// I don't know why, but ML does it too
// taw: by running the aggregation routine more than once there is a chance that also
// non-aggregated nodes with a node distance of two are added to existing aggregates.
// Assuming that the aggregate size is 3 in each direction running the algorithm only twice
// should be sufficient.
int maxIters = 2;
int maxNodesPerAggregate = params.get<int>("aggregation: max agg size");
if (maxNodesPerAggregate == std::numeric_limits<int>::max()) {
maxIters = 1;
}
for (int iter = 0; iter < maxIters; ++iter) {
for (LO color = 1; color <= numColors; color++) {
Kokkos::deep_copy(aggWeight, 0);

// the reduce counts how many nodes are aggregated by this phase,
// which will then be subtracted from numNonAggregatedNodes
LO numAggregated = 0;
Kokkos::parallel_for(
"Aggregation Phase 2b: updating agg weights",
Kokkos::RangePolicy<execution_space>(0, numRows),
KOKKOS_LAMBDA(const LO i) {
if (aggStat(i) != READY || colors(i) != color)
return;
auto neighOfINode = lclLWGraph.getNeighborVertices(i);
for (int j = 0; j < neighOfINode.length; j++) {
LO neigh = neighOfINode(j);
// We don't check (neigh != i), as it is covered by checking
// (aggStat[neigh] == AGGREGATED)
if (lclLWGraph.isLocalNeighborVertex(neigh) &&
aggStat(neigh) == AGGREGATED)
Kokkos::atomic_add(&aggWeight(vertex2AggId(neigh, 0)),
connectWeight(neigh));
}
});

Kokkos::parallel_reduce(
"Aggregation Phase 2b: aggregates expansion",
Kokkos::RangePolicy<execution_space>(0, numRows),
KOKKOS_LAMBDA(const LO i, LO& tmpNumAggregated) {
if (aggStat(i) != READY || colors(i) != color)
return;
int bestScore = -100000;
int bestAggId = -1;
int bestConnect = -1;

auto neighOfINode = lclLWGraph.getNeighborVertices(i);
for (int j = 0; j < neighOfINode.length; j++) {
LO neigh = neighOfINode(j);
if (matchMLbehavior && (aggWeight == 0))
return;

if (lclLWGraph.isLocalNeighborVertex(neigh) &&
aggStat(neigh) == AGGREGATED) {
auto aggId = vertex2AggId(neigh, 0);
int score = aggWeight(aggId) - aggPenalties(aggId);
int score = aggWeight - aggPenalties(aggId);

if (score > bestScore) {
bestAggId = aggId;
Expand All @@ -336,24 +227,32 @@ void AggregationPhase2bAlgorithm<LO, GO, Node>::
vertex2AggId(i, 0) = bestAggId;
procWinner(i, 0) = myRank;

Kokkos::atomic_add(&aggPenaltyUpdates(bestAggId), 1);
if constexpr (deterministic) {
Kokkos::atomic_add(&aggPenaltyUpdates(bestAggId), 1);
} else {
Kokkos::atomic_add(&aggPenalties(bestAggId), 1);
}
connectWeight(i) = bestConnect - penaltyConnectWeight;
tmpNumAggregated++;
}
},
numAggregated); // parallel_reduce

Kokkos::parallel_for(
"Aggregation Phase 2b: updating agg penalties",
Kokkos::RangePolicy<execution_space>(0, numLocalAggregates),
KOKKOS_LAMBDA(const LO agg) {
aggPenalties(agg) += aggPenaltyUpdates(agg);
aggPenaltyUpdates(agg) = 0;
});
if constexpr (deterministic) {
Kokkos::parallel_for(
"Aggregation Phase 2b: updating agg penalties",
Kokkos::RangePolicy<execution_space>(0, numLocalAggregates),
KOKKOS_LAMBDA(const LO agg) {
aggPenalties(agg) += aggPenaltyUpdates(agg);
aggPenaltyUpdates(agg) = 0;
});
}

numNonAggregatedNodes -= numAggregated;
}
} // loop over k
} // BuildAggregatesDeterministic
} // loop over maxIters

} // BuildAggregates

} // namespace MueLu

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -196,9 +196,8 @@ void UncoupledAggregationFactory<LocalOrdinal, GlobalOrdinal, Node>::Build(Level
runOnHost = false;

TEUCHOS_TEST_FOR_EXCEPTION(pL.get<bool>("aggregation: use interface aggregation"), std::invalid_argument, "Option: 'aggregation: use interface aggregation' is not supported in the Kokkos version of uncoupled aggregation");
// Sanity Checking: match ML behavior is not supported in UncoupledAggregation_Kokkos in Phase 1 or Phase 2b, but is in 2a
// Sanity Checking: match ML behavior is not supported in UncoupledAggregation_Kokkos in Phase 1, but it is in 2a and 2b
TEUCHOS_TEST_FOR_EXCEPTION(pL.get<bool>("aggregation: match ML phase1"), std::invalid_argument, "Option: 'aggregation: match ML phase1' is not supported in the Kokkos version of uncoupled aggregation");
TEUCHOS_TEST_FOR_EXCEPTION(pL.get<bool>("aggregation: match ML phase2b"), std::invalid_argument, "Option: 'aggregation: match ML phase2b' is not supported in the Kokkos version of uncoupled aggregation");
}

// Build
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ void gimmeUncoupledAggregates(const Teuchos::RCP<Xpetra::Matrix<Scalar, LocalOrd
params.set<bool>("aggregation: deterministic", false);

params.set<bool>("aggregation: match ML phase2a", true);
params.set<bool>("aggregation: match ML phase2b", false);
params.set<bool>("aggregation: error on nodes with no on-rank neighbors", false);
params.set<bool>("aggregation: phase3 avoid singletons", false);

Expand Down

0 comments on commit da49512

Please sign in to comment.