Merge #132125

132125: kvserver: allow enabling rac pull mode with a send queue r=sumeerbhola a=kvoli 1-4 commits from #132344 --- By default,`kvadmission.flow_control.mode = "apply_to_elastic"` replication admission control operates in push mode, where Raft has its own flow control machinery in operation and controls sending messages to replicas. In pull mode, which could not be enabled prior to this commit (outside of tests), replication flow control (v2) maintains its own machinery (send queue) and controls when to send a message to a replica. Previously, when `kvadmission.flow_control.mode = "apply_to_all"`, replication admission control would remain in push mode. Allow replication flow control to run in pull mode outside of tests, by setting both: ``` kvadmission.flow_control.enabled = true (default true) kvadmission.flow_control.mode = "apply_to_all" (default "apply_to_elastic") ``` And running a cluster which has finalized its upgrade to `v24.3`. We omit a release note, as this functionality is disabled by default and we wish to retain the ability to selectively suggest enabling pull mode to users, as the feature is (1) newly introduced, (2) on the critical path for all replication and (3) subject to change. Part of: #128040 Resolves: #132115 Release note: None Co-authored-by: Austen McClernon <[email protected]>
cockroachdb · Oct 11, 2024 · 64b297f · 64b297f
2 parents 232a855 + 6a28392
commit 64b297f
Show file tree

Hide file tree

Showing 57 changed files with 4,818 additions and 1,038 deletions.
diff --git a/pkg/kv/kvserver/flow_control_integration_test.go b/pkg/kv/kvserver/flow_control_integration_test.go
diff --git a/pkg/kv/kvserver/kvflowcontrol/BUILD.bazel b/pkg/kv/kvserver/kvflowcontrol/BUILD.bazel
@@ -18,7 +18,6 @@ go_library(
         "//pkg/settings",
         "//pkg/settings/cluster",
         "//pkg/util/admission/admissionpb",
-        "//pkg/util/metamorphic",
         "@com_github_cockroachdb_redact//:redact",
         "@com_github_dustin_go_humanize//:go-humanize",
     ],

diff --git a/pkg/kv/kvserver/kvflowcontrol/kvflowcontrol.go b/pkg/kv/kvserver/kvflowcontrol/kvflowcontrol.go
@@ -19,7 +19,6 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/settings"
 	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
 	"github.com/cockroachdb/cockroach/pkg/util/admission/admissionpb"
-	"github.com/cockroachdb/cockroach/pkg/util/metamorphic"
 	"github.com/cockroachdb/redact"
 	"github.com/dustin/go-humanize"
 )
@@ -38,11 +37,7 @@ var Mode = settings.RegisterEnumSetting(
 	settings.SystemOnly,
 	"kvadmission.flow_control.mode",
 	"determines the 'mode' of flow control we use for replication traffic in KV, if enabled",
-	metamorphic.ConstantWithTestChoice(
-		"kvadmission.flow_control.mode",
-		modeDict[ApplyToElastic], /* default value */
-		modeDict[ApplyToAll],     /* other value */
-	),
+	modeDict[ApplyToElastic], /* default value */
 	modeDict,
 )
 

diff --git a/pkg/kv/kvserver/kvflowcontrol/testing_knobs.go b/pkg/kv/kvserver/kvflowcontrol/testing_knobs.go
@@ -21,6 +21,17 @@ type TestingKnobs struct {
 	// OverrideV2EnabledWhenLeaderLevel is used to override the level at which
 	// RACv2 is enabled when a replica is the leader.
 	OverrideV2EnabledWhenLeaderLevel func() V2EnabledWhenLeaderLevel
+	// OverridePullPushMode is used to override whether the pull mode, or push
+	// mode is enabled.
+	//
+	// - when set to true, pull mode is enabled
+	// - when set to false, push mode is enabled
+	// - when left unset the otherwise set mode is used
+	//
+	// This is used to test the behavior of the flow control in push and pull
+	// mode, while also having the ability to switch between the two
+	// apply_to_(elastic|all) modes.
+	OverridePullPushMode func() bool
 }
 
 // TestingKnobsV1 are the testing knobs that appply to replication flow control

diff --git a/pkg/kv/kvserver/replica_proposal_quota.go b/pkg/kv/kvserver/replica_proposal_quota.go
@@ -109,15 +109,25 @@ func (r *Replica) getQuotaPoolEnabledRLocked(ctx context.Context) bool {
 // admission/flow control should use pull mode, which allows for a send-queue
 // and disabled raft's own flow control.
 func (r *Replica) shouldReplicationAdmissionControlUsePullMode(ctx context.Context) bool {
-	return r.store.cfg.Settings.Version.IsActive(ctx, clusterversion.V24_3_UseRACV2Full) &&
+	if knobs := r.store.cfg.TestingKnobs.FlowControlTestingKnobs; knobs != nil &&
+		knobs.OverridePullPushMode != nil {
+		return knobs.OverridePullPushMode()
+	}
+
+	var versionEnabled bool
+	if knobs := r.store.TestingKnobs().FlowControlTestingKnobs; knobs != nil && knobs.OverrideV2EnabledWhenLeaderLevel != nil {
+		versionEnabled = knobs.OverrideV2EnabledWhenLeaderLevel() == kvflowcontrol.V2EnabledWhenLeaderV2Encoding
+	} else {
+		versionEnabled = r.store.cfg.Settings.Version.IsActive(ctx, clusterversion.V24_3_UseRACV2Full)
+	}
+
+	return (versionEnabled &&
 		kvflowcontrol.Mode.Get(&r.store.cfg.Settings.SV) == kvflowcontrol.ApplyToAll &&
-		kvflowcontrol.Enabled.Get(&r.store.cfg.Settings.SV)
+		kvflowcontrol.Enabled.Get(&r.store.cfg.Settings.SV))
 }
 
 func (r *Replica) replicationAdmissionControlModeToUse(ctx context.Context) rac2.RaftMsgAppMode {
-	// TODO(sumeer): remove the false.
-	usePullMode := r.shouldReplicationAdmissionControlUsePullMode(ctx) && false
-	if usePullMode {
+	if r.shouldReplicationAdmissionControlUsePullMode(ctx) {
 		return rac2.MsgAppPull
 	}
 	return rac2.MsgAppPush

diff --git a/...v2/admission_post_split_merge_v1_encoding → ...post_split_merge_v1_encoding_apply_to_all b/...v2/admission_post_split_merge_v1_encoding → ...post_split_merge_v1_encoding_apply_to_all
@@ -8,7 +8,7 @@
 echo
 ----
 ----
--- Flow token metrics from n1 after issuing a regular 2*1MiB 3x replicated write
+-- Flow token metrics from n1 after issuing a 2*1MiB 3x replicated write
 -- that are yet to get admitted. We see 2*3*1MiB=6MiB deductions of
 -- {regular,elastic} tokens with no corresponding returns. The 2*1MiB writes
 -- happened on what is soon going to be the LHS and RHS of a range being split.
@@ -94,7 +94,7 @@ ORDER BY streams DESC;
 -- (Merging ranges.)
 
 
--- Flow token metrics from n1 after issuing 4MiB of regular replicated writes to
+-- Flow token metrics from n1 after issuing 4MiB of replicated writes to
 -- the post-merged range. We should see 12MiB extra tokens deducted which comes
 -- from 4MiB*3=12MiB. So we stand at 21MiB+12MiB=33MiB tokens deducted now. The
 -- RHS of the range is gone now, and the previously 3*3MiB=9MiB of tokens

diff --git a/...tdata/flow_control_integration_v2/admission_post_split_merge_v1_encoding_apply_to_elastic b/...tdata/flow_control_integration_v2/admission_post_split_merge_v1_encoding_apply_to_elastic
@@ -0,0 +1,181 @@
+# -- When using v1 encoding (V2EnabledWhenLeaderV1Encoding), all entries which
+# -- are subject to admission control are encoded as `raftpb.LowPri`,
+# -- regardless of their original priority; to avoid the overhead of
+# -- deserializing the raft admission metadata. Therefore, as the underlying
+# -- test is shared between the v1 and v2 encoding testdata files, the reader
+# -- should interpret any comments referring to regular tokens as referring to
+# -- elastic token.
+echo
+----
+----
+-- Flow token metrics from n1 after issuing a 2*1MiB 3x replicated write
+-- that are yet to get admitted. We see 2*3*1MiB=6MiB deductions of
+-- {regular,elastic} tokens with no corresponding returns. The 2*1MiB writes
+-- happened on what is soon going to be the LHS and RHS of a range being split.
+SELECT name, crdb_internal.humanize_bytes(value::INT8)
+    FROM crdb_internal.node_metrics
+   WHERE name LIKE '%kvflowcontrol%tokens%'
+ORDER BY name ASC;
+
+  kvflowcontrol.tokens.eval.elastic.available                       | 18 MiB   
+  kvflowcontrol.tokens.eval.elastic.deducted                        | 6.0 MiB  
+  kvflowcontrol.tokens.eval.elastic.returned                        | 0 B      
+  kvflowcontrol.tokens.eval.elastic.returned.disconnect             | 0 B      
+  kvflowcontrol.tokens.eval.elastic.unaccounted                     | 0 B      
+  kvflowcontrol.tokens.eval.regular.available                       | 48 MiB   
+  kvflowcontrol.tokens.eval.regular.deducted                        | 0 B      
+  kvflowcontrol.tokens.eval.regular.returned                        | 0 B      
+  kvflowcontrol.tokens.eval.regular.returned.disconnect             | 0 B      
+  kvflowcontrol.tokens.eval.regular.unaccounted                     | 0 B      
+  kvflowcontrol.tokens.send.elastic.available                       | 18 MiB   
+  kvflowcontrol.tokens.send.elastic.deducted                        | 6.0 MiB  
+  kvflowcontrol.tokens.send.elastic.deducted.force_flush_send_queue | 0 B      
+  kvflowcontrol.tokens.send.elastic.deducted.prevent_send_queue     | 0 B      
+  kvflowcontrol.tokens.send.elastic.returned                        | 0 B      
+  kvflowcontrol.tokens.send.elastic.returned.disconnect             | 0 B      
+  kvflowcontrol.tokens.send.elastic.unaccounted                     | 0 B      
+  kvflowcontrol.tokens.send.regular.available                       | 48 MiB   
+  kvflowcontrol.tokens.send.regular.deducted                        | 0 B      
+  kvflowcontrol.tokens.send.regular.deducted.prevent_send_queue     | 0 B      
+  kvflowcontrol.tokens.send.regular.returned                        | 0 B      
+  kvflowcontrol.tokens.send.regular.returned.disconnect             | 0 B      
+  kvflowcontrol.tokens.send.regular.unaccounted                     | 0 B      
+
+
+-- (Splitting range.)
+
+
+-- Flow token metrics from n1 after further issuing 2MiB and 3MiB writes to
+-- post-split LHS and RHS ranges respectively. We should see 15MiB extra tokens
+-- deducted which comes from (2MiB+3MiB)*3=15MiB. So we stand at
+-- 6MiB+15MiB=21MiB now.
+SELECT name, crdb_internal.humanize_bytes(value::INT8)
+    FROM crdb_internal.node_metrics
+   WHERE name LIKE '%kvflowcontrol%tokens%'
+ORDER BY name ASC;
+
+  kvflowcontrol.tokens.eval.elastic.available                       | 3.0 MiB  
+  kvflowcontrol.tokens.eval.elastic.deducted                        | 21 MiB   
+  kvflowcontrol.tokens.eval.elastic.returned                        | 0 B      
+  kvflowcontrol.tokens.eval.elastic.returned.disconnect             | 0 B      
+  kvflowcontrol.tokens.eval.elastic.unaccounted                     | 0 B      
+  kvflowcontrol.tokens.eval.regular.available                       | 48 MiB   
+  kvflowcontrol.tokens.eval.regular.deducted                        | 0 B      
+  kvflowcontrol.tokens.eval.regular.returned                        | 0 B      
+  kvflowcontrol.tokens.eval.regular.returned.disconnect             | 0 B      
+  kvflowcontrol.tokens.eval.regular.unaccounted                     | 0 B      
+  kvflowcontrol.tokens.send.elastic.available                       | 3.0 MiB  
+  kvflowcontrol.tokens.send.elastic.deducted                        | 21 MiB   
+  kvflowcontrol.tokens.send.elastic.deducted.force_flush_send_queue | 0 B      
+  kvflowcontrol.tokens.send.elastic.deducted.prevent_send_queue     | 0 B      
+  kvflowcontrol.tokens.send.elastic.returned                        | 0 B      
+  kvflowcontrol.tokens.send.elastic.returned.disconnect             | 0 B      
+  kvflowcontrol.tokens.send.elastic.unaccounted                     | 0 B      
+  kvflowcontrol.tokens.send.regular.available                       | 48 MiB   
+  kvflowcontrol.tokens.send.regular.deducted                        | 0 B      
+  kvflowcontrol.tokens.send.regular.deducted.prevent_send_queue     | 0 B      
+  kvflowcontrol.tokens.send.regular.returned                        | 0 B      
+  kvflowcontrol.tokens.send.regular.returned.disconnect             | 0 B      
+  kvflowcontrol.tokens.send.regular.unaccounted                     | 0 B      
+
+
+-- Observe the newly split off replica, with its own three streams.
+SELECT range_id, count(*) AS streams
+    FROM crdb_internal.kv_flow_control_handles_v2
+GROUP BY (range_id)
+ORDER BY streams DESC;
+
+  range_id | stream_count  
+-----------+---------------
+  70       | 3             
+  71       | 3             
+
+
+-- (Merging ranges.)
+
+
+-- Flow token metrics from n1 after issuing 4MiB of replicated writes to
+-- the post-merged range. We should see 12MiB extra tokens deducted which comes
+-- from 4MiB*3=12MiB. So we stand at 21MiB+12MiB=33MiB tokens deducted now. The
+-- RHS of the range is gone now, and the previously 3*3MiB=9MiB of tokens
+-- deducted for it are released at the subsuming LHS leaseholder.
+SELECT name, crdb_internal.humanize_bytes(value::INT8)
+    FROM crdb_internal.node_metrics
+   WHERE name LIKE '%kvflowcontrol%tokens%'
+ORDER BY name ASC;
+
+  kvflowcontrol.tokens.eval.elastic.available                       | 0 B      
+  kvflowcontrol.tokens.eval.elastic.deducted                        | 33 MiB   
+  kvflowcontrol.tokens.eval.elastic.returned                        | 9.0 MiB  
+  kvflowcontrol.tokens.eval.elastic.returned.disconnect             | 9.0 MiB  
+  kvflowcontrol.tokens.eval.elastic.unaccounted                     | 0 B      
+  kvflowcontrol.tokens.eval.regular.available                       | 48 MiB   
+  kvflowcontrol.tokens.eval.regular.deducted                        | 0 B      
+  kvflowcontrol.tokens.eval.regular.returned                        | 0 B      
+  kvflowcontrol.tokens.eval.regular.returned.disconnect             | 0 B      
+  kvflowcontrol.tokens.eval.regular.unaccounted                     | 0 B      
+  kvflowcontrol.tokens.send.elastic.available                       | 0 B      
+  kvflowcontrol.tokens.send.elastic.deducted                        | 33 MiB   
+  kvflowcontrol.tokens.send.elastic.deducted.force_flush_send_queue | 0 B      
+  kvflowcontrol.tokens.send.elastic.deducted.prevent_send_queue     | 0 B      
+  kvflowcontrol.tokens.send.elastic.returned                        | 9.0 MiB  
+  kvflowcontrol.tokens.send.elastic.returned.disconnect             | 9.0 MiB  
+  kvflowcontrol.tokens.send.elastic.unaccounted                     | 0 B      
+  kvflowcontrol.tokens.send.regular.available                       | 48 MiB   
+  kvflowcontrol.tokens.send.regular.deducted                        | 0 B      
+  kvflowcontrol.tokens.send.regular.deducted.prevent_send_queue     | 0 B      
+  kvflowcontrol.tokens.send.regular.returned                        | 0 B      
+  kvflowcontrol.tokens.send.regular.returned.disconnect             | 0 B      
+  kvflowcontrol.tokens.send.regular.unaccounted                     | 0 B      
+
+
+-- Observe only the merged replica with its own three streams.
+SELECT range_id, count(*) AS streams
+    FROM crdb_internal.kv_flow_control_handles_v2
+GROUP BY (range_id)
+ORDER BY streams DESC;
+
+  range_id | stream_count  
+-----------+---------------
+  70       | 3             
+
+
+-- (Allow below-raft admission to proceed.)
+
+
+-- Flow token metrics from n1 after work gets admitted. We see all outstanding
+-- {regular,elastic} tokens returned, including those from:
+-- - the LHS before the merge, and
+-- - the LHS and RHS before the original split.
+SELECT name, crdb_internal.humanize_bytes(value::INT8)
+    FROM crdb_internal.node_metrics
+   WHERE name LIKE '%kvflowcontrol%tokens%'
+ORDER BY name ASC;
+
+  kvflowcontrol.tokens.eval.elastic.available                       | 24 MiB   
+  kvflowcontrol.tokens.eval.elastic.deducted                        | 33 MiB   
+  kvflowcontrol.tokens.eval.elastic.returned                        | 33 MiB   
+  kvflowcontrol.tokens.eval.elastic.returned.disconnect             | 9.0 MiB  
+  kvflowcontrol.tokens.eval.elastic.unaccounted                     | 0 B      
+  kvflowcontrol.tokens.eval.regular.available                       | 48 MiB   
+  kvflowcontrol.tokens.eval.regular.deducted                        | 0 B      
+  kvflowcontrol.tokens.eval.regular.returned                        | 0 B      
+  kvflowcontrol.tokens.eval.regular.returned.disconnect             | 0 B      
+  kvflowcontrol.tokens.eval.regular.unaccounted                     | 0 B      
+  kvflowcontrol.tokens.send.elastic.available                       | 24 MiB   
+  kvflowcontrol.tokens.send.elastic.deducted                        | 33 MiB   
+  kvflowcontrol.tokens.send.elastic.deducted.force_flush_send_queue | 0 B      
+  kvflowcontrol.tokens.send.elastic.deducted.prevent_send_queue     | 0 B      
+  kvflowcontrol.tokens.send.elastic.returned                        | 33 MiB   
+  kvflowcontrol.tokens.send.elastic.returned.disconnect             | 9.0 MiB  
+  kvflowcontrol.tokens.send.elastic.unaccounted                     | 0 B      
+  kvflowcontrol.tokens.send.regular.available                       | 48 MiB   
+  kvflowcontrol.tokens.send.regular.deducted                        | 0 B      
+  kvflowcontrol.tokens.send.regular.deducted.prevent_send_queue     | 0 B      
+  kvflowcontrol.tokens.send.regular.returned                        | 0 B      
+  kvflowcontrol.tokens.send.regular.returned.disconnect             | 0 B      
+  kvflowcontrol.tokens.send.regular.unaccounted                     | 0 B      
+----
+----
+
+# vim:ft=sql
diff --git a/...integration_v2/admission_post_split_merge → ...post_split_merge_v2_encoding_apply_to_all b/...integration_v2/admission_post_split_merge → ...post_split_merge_v2_encoding_apply_to_all
@@ -1,7 +1,7 @@
 echo
 ----
 ----
--- Flow token metrics from n1 after issuing a regular 2*1MiB 3x replicated write
+-- Flow token metrics from n1 after issuing a 2*1MiB 3x replicated write
 -- that are yet to get admitted. We see 2*3*1MiB=6MiB deductions of
 -- {regular,elastic} tokens with no corresponding returns. The 2*1MiB writes
 -- happened on what is soon going to be the LHS and RHS of a range being split.
@@ -87,7 +87,7 @@ ORDER BY streams DESC;
 -- (Merging ranges.)
 
 
--- Flow token metrics from n1 after issuing 4MiB of regular replicated writes to
+-- Flow token metrics from n1 after issuing 4MiB of replicated writes to
 -- the post-merged range. We should see 12MiB extra tokens deducted which comes
 -- from 4MiB*3=12MiB. So we stand at 21MiB+12MiB=33MiB tokens deducted now. The
 -- RHS of the range is gone now, and the previously 3*3MiB=9MiB of tokens