Merge pull request #6823 from onflow/yurii/6645-consensus-dkg-symmetric-difference-test
durkmurder authored Jan 6, 2025
2 parents a35632f + 4603a98 commit 2e44aa0
Showing 3 changed files with 186 additions and 24 deletions.
4 changes: 2 additions & 2 deletions cmd/bootstrap/dkg/dkg.go
@@ -26,8 +26,8 @@ func RandomBeaconKG(n int, seed []byte) (model.ThresholdKeySet, error) {
return dkgData, nil
}

skShares, pkShares, pkGroup, err := crypto.BLSThresholdKeyGen(int(n),
signature.RandomBeaconThreshold(int(n)), seed)
skShares, pkShares, pkGroup, err := crypto.BLSThresholdKeyGen(n,
signature.RandomBeaconThreshold(n), seed)
if err != nil {
return model.ThresholdKeySet{}, fmt.Errorf("Beacon KeyGen failed: %w", err)
}
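// For illustration, a minimal call sketch of this key generator; the 48-byte seed length
// and the use of crypto/rand are assumptions, and callers must satisfy the crypto
// library's minimum seed-length requirement:
//
//	seed := make([]byte, 48)
//	if _, err := rand.Read(seed); err != nil {
//		panic(err)
//	}
//	keySet, err := dkg.RandomBeaconKG(4, seed) // model.ThresholdKeySet with group key and key shares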
68 changes: 50 additions & 18 deletions cmd/bootstrap/run/epochs.go
@@ -4,9 +4,9 @@ import (
"encoding/hex"
"fmt"

"github.com/rs/zerolog"

"github.com/onflow/cadence"
"github.com/onflow/crypto"
"github.com/rs/zerolog"

"github.com/onflow/flow-go/cmd/util/cmd/common"
"github.com/onflow/flow-go/fvm/systemcontracts"
@@ -31,6 +31,51 @@ func GenerateRecoverEpochTxArgs(log zerolog.Logger,
recoveryEpochTargetDuration uint64,
unsafeAllowOverWrite bool,
snapshot *inmem.Snapshot,
) ([]cadence.Value, error) {
log.Info().Msg("collecting internal node network and staking keys")
internalNodes, err := common.ReadFullInternalNodeInfos(log, internalNodePrivInfoDir, nodeConfigJson)
if err != nil {
return nil, fmt.Errorf("failed to read full internal node infos: %w", err)
}

epochProtocolState, err := snapshot.EpochProtocolState()
if err != nil {
return nil, fmt.Errorf("failed to get epoch protocol state from snapshot: %w", err)
}
currentEpochCommit := epochProtocolState.EpochCommit()

return GenerateRecoverTxArgsWithDKG(
log,
internalNodes,
collectionClusters,
recoveryEpochCounter,
rootChainID,
numViewsInStakingAuction,
numViewsInEpoch,
recoveryEpochTargetDuration,
unsafeAllowOverWrite,
currentEpochCommit.DKGIndexMap,
currentEpochCommit.DKGParticipantKeys,
currentEpochCommit.DKGGroupKey,
snapshot,
)
}
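// For illustration, a caller with externally sourced DKG data can invoke the new helper
// directly; a sketch mirroring the integration test in this PR (all concrete values are
// placeholders):
//
//	txArgs, err := run.GenerateRecoverTxArgsWithDKG(
//		log,                  // zerolog.Logger
//		internalNodes,        // []bootstrap.NodeInfo
//		1,                    // collectionClusters
//		recoveryEpochCounter, // uint64
//		flow.Localnet,        // rootChainID
//		100, 4000, 3000,      // staking auction views, epoch views, target duration
//		false,                // unsafeAllowOverWrite
//		dkgIndexMap, dkgParticipantKeys, dkgGroupKey,
//		snapshot,
//	)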

// GenerateRecoverTxArgsWithDKG generates the required transaction arguments for the `recoverEpoch` transaction.
// No errors are expected during normal operation.
func GenerateRecoverTxArgsWithDKG(log zerolog.Logger,
internalNodes []bootstrap.NodeInfo,
collectionClusters int,
recoveryEpochCounter uint64,
rootChainID flow.ChainID,
numViewsInStakingAuction uint64,
numViewsInEpoch uint64,
recoveryEpochTargetDuration uint64,
unsafeAllowOverWrite bool,
dkgIndexMap flow.DKGIndexMap,
dkgParticipantKeys []crypto.PublicKey,
dkgGroupKey crypto.PublicKey,
snapshot *inmem.Snapshot,
) ([]cadence.Value, error) {
epoch := snapshot.Epochs().Current()
currentEpochCounter, err := epoch.Counter()
@@ -80,20 +125,13 @@ func GenerateRecoverEpochTxArgs(log zerolog.Logger,
internalCollectors := make(flow.IdentityList, 0)
partnerCollectors := make(flow.IdentityList, 0)

log.Info().Msg("collecting internal node network and staking keys")
internalNodes, err := common.ReadFullInternalNodeInfos(log, internalNodePrivInfoDir, nodeConfigJson)
if err != nil {
return nil, fmt.Errorf("failed to read full internal node infos: %w", err)
}

internalNodesMap := make(map[flow.Identifier]struct{})
for _, node := range internalNodes {
if !currentEpochIdentities.Exists(node.Identity()) {
log.Warn().Msgf("this node (ID %s) is not part of the network according to the bootstrapping data; we might not get any data", node.NodeID)
}
internalNodesMap[node.NodeID] = struct{}{}
}
log.Info().Msg("")

for _, collector := range collectors {
if _, ok := internalNodesMap[collector.NodeID]; ok {
@@ -134,31 +172,25 @@ func GenerateRecoverEpochTxArgs(log zerolog.Logger,
// The EFM Recovery State Machine will heuristically reject recovery attempts (specifically, reject EpochRecover Service
// events) when the intersection between consensus and random beacon committees is too small.
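// Presumably the overlap between the two committees must meet protocol.RandomBeaconSafetyThreshold
// (the safety bound referenced in the integration tests); this is an assumption about the heuristic.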

epochProtocolState, err := snapshot.EpochProtocolState()
if err != nil {
return nil, fmt.Errorf("failed to get epoch protocol state from snapshot: %w", err)
}
currentEpochCommit := epochProtocolState.EpochCommit()

// NOTE: The RecoveryEpoch will re-use the last successful DKG output. This means that the random beacon committee can be
// different from the consensus committee. This could happen if a node was ejected from the consensus committee but still has to be
// included in the DKG committee, since the threshold signature scheme operates on a pre-defined number of participants and cannot be changed.
dkgGroupKeyCdc, cdcErr := cadence.NewString(hex.EncodeToString(currentEpochCommit.DKGGroupKey.Encode()))
dkgGroupKeyCdc, cdcErr := cadence.NewString(hex.EncodeToString(dkgGroupKey.Encode()))
if cdcErr != nil {
return nil, fmt.Errorf("failed to convert Random Beacon group key to cadence representation: %w", cdcErr)
}

// copy DKG index map from the current epoch
dkgIndexMapPairs := make([]cadence.KeyValuePair, 0)
for nodeID, index := range currentEpochCommit.DKGIndexMap {
for nodeID, index := range dkgIndexMap {
dkgIndexMapPairs = append(dkgIndexMapPairs, cadence.KeyValuePair{
Key: cadence.String(nodeID.String()),
Value: cadence.NewInt(index),
})
}
// copy DKG public keys from the current epoch
dkgPubKeys := make([]cadence.Value, 0)
for k, dkgPubKey := range currentEpochCommit.DKGParticipantKeys {
for k, dkgPubKey := range dkgParticipantKeys {
dkgPubKeyCdc, cdcErr := cadence.NewString(hex.EncodeToString(dkgPubKey.Encode()))
if cdcErr != nil {
return nil, fmt.Errorf("failed convert public beacon key of participant %d to cadence representation: %w", k, cdcErr)
138 changes: 134 additions & 4 deletions integration/tests/epochs/cohort2/epoch_recover_from_efm_test.go
@@ -2,7 +2,6 @@ package cohort2

import (
"fmt"

"strings"
"testing"
"time"
@@ -15,11 +14,13 @@ import (
sdk "github.com/onflow/flow-go-sdk"

"github.com/onflow/flow-go/cmd/bootstrap/run"
"github.com/onflow/flow-go/cmd/util/cmd/common"
"github.com/onflow/flow-go/integration/tests/epochs"
"github.com/onflow/flow-go/integration/utils"
"github.com/onflow/flow-go/model/bootstrap"
"github.com/onflow/flow-go/model/flow"
"github.com/onflow/flow-go/model/flow/filter"
"github.com/onflow/flow-go/utils/unittest"
)

func TestRecoverEpoch(t *testing.T) {
@@ -39,9 +40,9 @@ func (s *RecoverEpochSuite) SetupTest() {
s.EpochLen = 150
s.FinalizationSafetyThreshold = 20
s.NumOfCollectionClusters = 1
// we need to use 3 consensus nodes to be able to eject a single node from the consensus committee
// and still have a Random Beacon committee which meets the protocol.RandomBeaconSafetyThreshold
s.NumOfConsensusNodes = 3
// we need to use 4 consensus nodes so that we can eject a single node and still have a super-majority, as well as
// a Random Beacon committee which meets the protocol.RandomBeaconSafetyThreshold.
s.NumOfConsensusNodes = 4

// run the generic setup, which starts up the network
s.BaseSuite.SetupTest()
@@ -290,3 +291,132 @@ func (s *RecoverEpochSuite) TestRecoverEpochNodeEjected() {

s.AssertInEpoch(s.Ctx, 1)
}

// TestRecoverEpochEjectNodeDifferentDKG ensures that the recover epoch governance transaction flow works as expected, and a network that
// enters Epoch Fallback Mode can successfully recover.
// Here, we are testing a scenario where the consensus committee 𝒞 and Random Beacon committee 𝒟 form a symmetric difference of
// cardinality 2. Formally, |𝒞 ∖ 𝒟| = 1 and |𝒟 ∖ 𝒞| = 1. In other words, there is a node which is part of the consensus committee but not
// part of the Random Beacon committee, and another node which is part of the Random Beacon committee but not part of the consensus committee.
// We remove the first consensus node from the Consensus Committee, and the last consensus node from the Random Beacon Committee. For example,
// if the original consensus set is {A, B, C, D} then:
// - the post-recovery consensus committee is {B, C, D}
// - the post-recovery random beacon committee is {A, B, C}
//
// This test will do the following:
// 1. Trigger EFM by turning off the sole collection node before the end of the DKG, forcing the DKG to fail.
// 2. Eject the first consensus node by modifying the epoch snapshot.
// 3. Drop the last consensus node from the Random Beacon committee. This hack works only for threshold systems with an even
//    number of participants, where one node can be removed without changing the threshold (see the worked example in the
//    test body); hence we need to start this test with 4 consensus nodes.
// 4. Generate epoch recover transaction args using the tooling [run.GenerateRecoverTxArgsWithDKG] provided for the governance committee.
// 5. Submit the recover epoch transaction.
// 6. Ensure the expected EpochRecover event is emitted.
// 7. Ensure the network transitions into the recovery epoch and finalizes the first view of the recovery epoch.
func (s *RecoverEpochSuite) TestRecoverEpochEjectNodeDifferentDKG() {
// 1. Trigger EFM by turning off the sole collection node before the end of the DKG, forcing the DKG to fail.

// pause the collection node to trigger EFM by failing DKG
ln := s.GetContainersByRole(flow.RoleCollection)[0]
require.NoError(s.T(), ln.Pause())
s.AwaitFinalizedView(s.Ctx, s.GetDKGEndView(), 2*time.Minute, 500*time.Millisecond)
// start the paused collection node now that we are in EFM
require.NoError(s.T(), ln.Start())

// get final view from the latest snapshot
epoch1FinalView, err := s.Net.BootstrapSnapshot.Epochs().Current().FinalView()
require.NoError(s.T(), err)

// Wait for at least the first view past the current epoch's original FinalView to be finalized.
s.TimedLogf("waiting for epoch transition (finalized view %d)", epoch1FinalView+1)
s.AwaitFinalizedView(s.Ctx, epoch1FinalView+1, 2*time.Minute, 500*time.Millisecond)
s.TimedLogf("observed finalized view %d", epoch1FinalView+1)

// assert that we are in EFM
snapshot, err := s.Client.GetLatestProtocolSnapshot(s.Ctx)
require.NoError(s.T(), err)
epochPhase, err := snapshot.EpochPhase()
require.NoError(s.T(), err)
require.Equal(s.T(), flow.EpochPhaseFallback, epochPhase, "network must enter EFM by this point")

// 2. Eject the FIRST consensus node by modifying the snapshot before generating the recover epoch transaction args.
// By ejecting a node from the consensus committee but keeping it in the Random Beacon committee, we ensure that there is a node
// which is not part of the consensus committee but is part of the Random Beacon committee.
currentIdentityTable := snapshot.Encodable().SealingSegment.LatestProtocolStateEntry().EpochEntry.CurrentEpochIdentityTable
ejectedIdentity := currentIdentityTable.Filter(filter.HasRole[flow.Identity](flow.RoleConsensus))[0]
ejectedIdentity.EpochParticipationStatus = flow.EpochParticipationStatusEjected // writes through to `currentIdentityTable`
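// (The write-through works because flow.IdentityList stores pointers, so the filtered
// slice aliases the entries of `currentIdentityTable`; this is an assumption about the
// identity-list representation, consistent with the comment above.)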

// 3. Modify DKG data by removing the last node of the consensus committee from the DKG committee. This way we ensure that the consensus
// committee has a node which is not part of the Random Beacon committee. For threshold committees of *even size*, we can remove a
// single node without changing the threshold (see [ref. 1] for details). In other words, we can just pretend that there was originally
// one node fewer in the DKG, while the same number of signatures (threshold + 1) remains sufficient to construct a group signature.
//
// [ref. 1] function `RandomBeaconThreshold` for computing the threshold in package module/signature; note
// that for reconstructing the group sig, _strictly more_ than `threshold` sig shares are required.
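//
// Worked example (assuming RandomBeaconThreshold(n) = (n-1)/2 with integer division):
//   n = 4 participants -> threshold t = (4-1)/2 = 1
//   n = 3 participants -> threshold t = (3-1)/2 = 1
// Dropping one participant from the even-sized committee leaves t unchanged, and
// reconstruction still requires strictly more than t shares, i.e. at least 2 of the
// remaining 3 key shares.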
randomBeaconParticipants := currentIdentityTable.Filter(filter.HasRole[flow.Identity](flow.RoleConsensus))
nConsensusNodes := len(randomBeaconParticipants) - 1

// 4. Generate epoch recover transaction args using the tooling [run.GenerateRecoverTxArgsWithDKG] provided for the governance committee.
recoveryDkgIndexMap := make(flow.DKGIndexMap, nConsensusNodes)
for i, participant := range randomBeaconParticipants[:nConsensusNodes] {
recoveryDkgIndexMap[participant.NodeID] = i
}

epochProtocolState, err := snapshot.EpochProtocolState()
require.NoError(s.T(), err)
dkg, err := epochProtocolState.DKG()
require.NoError(s.T(), err)
recoveryThresholdKeyShares := dkg.KeyShares()[:nConsensusNodes]
recoveryThresholdGroupKey := dkg.GroupKey()
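// KeyShares() is assumed to be ordered by DKG index, so truncating to nConsensusNodes
// drops exactly the last consensus node from the Random Beacon committee (step 3 above).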

// read internal node info from one of the consensus nodes
internalNodePrivInfoDir, nodeConfigJson := s.getNodeInfoDirs(flow.RoleConsensus)
internalNodes, err := common.ReadFullInternalNodeInfos(unittest.Logger(), internalNodePrivInfoDir, nodeConfigJson)
require.NoError(s.T(), err)

// At this point we have a node which is part of the consensus committee but not part of the Random Beacon committee and
// another node which is part of the Random Beacon committee but not part of the consensus committee.
collectionClusters := s.NumOfCollectionClusters
recoveryEpochCounter := uint64(1)
txArgs, err := run.GenerateRecoverTxArgsWithDKG(
s.Log,
internalNodes,
collectionClusters,
recoveryEpochCounter,
flow.Localnet,
s.StakingAuctionLen,
s.EpochLen,
3000,
false,
recoveryDkgIndexMap,
recoveryThresholdKeyShares,
recoveryThresholdGroupKey,
snapshot,
)
require.NoError(s.T(), err)

// 5. Submit recover epoch transaction to the network.
env := utils.LocalnetEnv()
result := s.recoverEpoch(env, txArgs)
require.NoError(s.T(), result.Error)
require.Equal(s.T(), result.Status, sdk.TransactionStatusSealed)

// 6. Ensure expected EpochRecover event is emitted.
eventType := ""
for _, evt := range result.Events {
if strings.Contains(evt.Type, "FlowEpoch.EpochRecover") {
eventType = evt.Type
break
}
}
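// (Flow event types are fully qualified, e.g. A.<contract-address>.FlowEpoch.EpochRecover,
// hence the substring match instead of an exact comparison.)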
require.NotEmpty(s.T(), eventType, "expected FlowEpoch.EpochRecover event type")
events, err := s.Client.GetEventsForBlockIDs(s.Ctx, eventType, []sdk.Identifier{result.BlockID})
require.NoError(s.T(), err)
require.Equal(s.T(), events[0].Events[0].Type, eventType)

// 7. Ensure the network transitions into the recovery epoch and finalizes the first view of the recovery epoch.
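// The index below is an assumption about the argument layout produced by
// [run.GenerateRecoverTxArgsWithDKG]: txArgs[1] is taken to be the first view of the recovery epoch.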
startViewOfNextEpoch := uint64(txArgs[1].(cadence.UInt64))
s.TimedLogf("waiting to transition into recovery epoch (finalized view %d)", startViewOfNextEpoch)
s.AwaitFinalizedView(s.Ctx, startViewOfNextEpoch, 2*time.Minute, 500*time.Millisecond)
s.TimedLogf("observed finalized first view of recovery epoch %d", startViewOfNextEpoch)

s.AssertInEpoch(s.Ctx, 1)
}
