Skip to content

Commit

Permalink
Merge #132125
Browse files Browse the repository at this point in the history
132125: kvserver: allow enabling rac pull mode with a send queue  r=sumeerbhola a=kvoli

1-4 commits from #132344

---

By default,`kvadmission.flow_control.mode = "apply_to_elastic"`
replication admission control operates in push mode, where Raft has its
own flow control machinery in operation and controls sending messages to
replicas.

In pull mode, which could not be enabled prior to this commit (outside
of tests), replication flow control (v2) maintains its own machinery
(send queue) and controls when to send a message to a replica.

Previously, when `kvadmission.flow_control.mode = "apply_to_all"`,
replication admission control would remain in push mode.

Allow replication flow control to run in pull mode outside of tests, by
setting both:

```
kvadmission.flow_control.enabled = true (default true)
kvadmission.flow_control.mode = "apply_to_all" (default "apply_to_elastic")
```

And running a cluster which has finalized its upgrade to `v24.3`.

We omit a release note, as this functionality is disabled by default and
we wish to retain the ability to selectively suggest enabling pull mode
to users, as the feature is (1) newly introduced, (2) on the critical
path for all replication and (3) subject to change.

Part of: #128040
Resolves: #132115
Release note: None

Co-authored-by: Austen McClernon <[email protected]>
  • Loading branch information
craig[bot] and kvoli committed Oct 11, 2024
2 parents 232a855 + 6a28392 commit 64b297f
Show file tree
Hide file tree
Showing 57 changed files with 4,818 additions and 1,038 deletions.
2,108 changes: 1,163 additions & 945 deletions pkg/kv/kvserver/flow_control_integration_test.go

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion pkg/kv/kvserver/kvflowcontrol/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ go_library(
"//pkg/settings",
"//pkg/settings/cluster",
"//pkg/util/admission/admissionpb",
"//pkg/util/metamorphic",
"@com_github_cockroachdb_redact//:redact",
"@com_github_dustin_go_humanize//:go-humanize",
],
Expand Down
7 changes: 1 addition & 6 deletions pkg/kv/kvserver/kvflowcontrol/kvflowcontrol.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ import (
"github.com/cockroachdb/cockroach/pkg/settings"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/util/admission/admissionpb"
"github.com/cockroachdb/cockroach/pkg/util/metamorphic"
"github.com/cockroachdb/redact"
"github.com/dustin/go-humanize"
)
Expand All @@ -38,11 +37,7 @@ var Mode = settings.RegisterEnumSetting(
settings.SystemOnly,
"kvadmission.flow_control.mode",
"determines the 'mode' of flow control we use for replication traffic in KV, if enabled",
metamorphic.ConstantWithTestChoice(
"kvadmission.flow_control.mode",
modeDict[ApplyToElastic], /* default value */
modeDict[ApplyToAll], /* other value */
),
modeDict[ApplyToElastic], /* default value */
modeDict,
)

Expand Down
11 changes: 11 additions & 0 deletions pkg/kv/kvserver/kvflowcontrol/testing_knobs.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,17 @@ type TestingKnobs struct {
// OverrideV2EnabledWhenLeaderLevel is used to override the level at which
// RACv2 is enabled when a replica is the leader.
OverrideV2EnabledWhenLeaderLevel func() V2EnabledWhenLeaderLevel
// OverridePullPushMode is used to override whether the pull mode, or push
// mode is enabled.
//
// - when set to true, pull mode is enabled
// - when set to false, push mode is enabled
// - when left unset the otherwise set mode is used
//
// This is used to test the behavior of the flow control in push and pull
// mode, while also having the ability to switch between the two
// apply_to_(elastic|all) modes.
OverridePullPushMode func() bool
}

// TestingKnobsV1 are the testing knobs that appply to replication flow control
Expand Down
20 changes: 15 additions & 5 deletions pkg/kv/kvserver/replica_proposal_quota.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,15 +109,25 @@ func (r *Replica) getQuotaPoolEnabledRLocked(ctx context.Context) bool {
// admission/flow control should use pull mode, which allows for a send-queue
// and disabled raft's own flow control.
func (r *Replica) shouldReplicationAdmissionControlUsePullMode(ctx context.Context) bool {
return r.store.cfg.Settings.Version.IsActive(ctx, clusterversion.V24_3_UseRACV2Full) &&
if knobs := r.store.cfg.TestingKnobs.FlowControlTestingKnobs; knobs != nil &&
knobs.OverridePullPushMode != nil {
return knobs.OverridePullPushMode()
}

var versionEnabled bool
if knobs := r.store.TestingKnobs().FlowControlTestingKnobs; knobs != nil && knobs.OverrideV2EnabledWhenLeaderLevel != nil {
versionEnabled = knobs.OverrideV2EnabledWhenLeaderLevel() == kvflowcontrol.V2EnabledWhenLeaderV2Encoding
} else {
versionEnabled = r.store.cfg.Settings.Version.IsActive(ctx, clusterversion.V24_3_UseRACV2Full)
}

return (versionEnabled &&
kvflowcontrol.Mode.Get(&r.store.cfg.Settings.SV) == kvflowcontrol.ApplyToAll &&
kvflowcontrol.Enabled.Get(&r.store.cfg.Settings.SV)
kvflowcontrol.Enabled.Get(&r.store.cfg.Settings.SV))
}

func (r *Replica) replicationAdmissionControlModeToUse(ctx context.Context) rac2.RaftMsgAppMode {
// TODO(sumeer): remove the false.
usePullMode := r.shouldReplicationAdmissionControlUsePullMode(ctx) && false
if usePullMode {
if r.shouldReplicationAdmissionControlUsePullMode(ctx) {
return rac2.MsgAppPull
}
return rac2.MsgAppPush
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
echo
----
----
-- Flow token metrics from n1 after issuing a regular 2*1MiB 3x replicated write
-- Flow token metrics from n1 after issuing a 2*1MiB 3x replicated write
-- that are yet to get admitted. We see 2*3*1MiB=6MiB deductions of
-- {regular,elastic} tokens with no corresponding returns. The 2*1MiB writes
-- happened on what is soon going to be the LHS and RHS of a range being split.
Expand Down Expand Up @@ -94,7 +94,7 @@ ORDER BY streams DESC;
-- (Merging ranges.)


-- Flow token metrics from n1 after issuing 4MiB of regular replicated writes to
-- Flow token metrics from n1 after issuing 4MiB of replicated writes to
-- the post-merged range. We should see 12MiB extra tokens deducted which comes
-- from 4MiB*3=12MiB. So we stand at 21MiB+12MiB=33MiB tokens deducted now. The
-- RHS of the range is gone now, and the previously 3*3MiB=9MiB of tokens
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
# -- When using v1 encoding (V2EnabledWhenLeaderV1Encoding), all entries which
# -- are subject to admission control are encoded as `raftpb.LowPri`,
# -- regardless of their original priority; to avoid the overhead of
# -- deserializing the raft admission metadata. Therefore, as the underlying
# -- test is shared between the v1 and v2 encoding testdata files, the reader
# -- should interpret any comments referring to regular tokens as referring to
# -- elastic token.
echo
----
----
-- Flow token metrics from n1 after issuing a 2*1MiB 3x replicated write
-- that are yet to get admitted. We see 2*3*1MiB=6MiB deductions of
-- {regular,elastic} tokens with no corresponding returns. The 2*1MiB writes
-- happened on what is soon going to be the LHS and RHS of a range being split.
SELECT name, crdb_internal.humanize_bytes(value::INT8)
FROM crdb_internal.node_metrics
WHERE name LIKE '%kvflowcontrol%tokens%'
ORDER BY name ASC;

kvflowcontrol.tokens.eval.elastic.available | 18 MiB
kvflowcontrol.tokens.eval.elastic.deducted | 6.0 MiB
kvflowcontrol.tokens.eval.elastic.returned | 0 B
kvflowcontrol.tokens.eval.elastic.returned.disconnect | 0 B
kvflowcontrol.tokens.eval.elastic.unaccounted | 0 B
kvflowcontrol.tokens.eval.regular.available | 48 MiB
kvflowcontrol.tokens.eval.regular.deducted | 0 B
kvflowcontrol.tokens.eval.regular.returned | 0 B
kvflowcontrol.tokens.eval.regular.returned.disconnect | 0 B
kvflowcontrol.tokens.eval.regular.unaccounted | 0 B
kvflowcontrol.tokens.send.elastic.available | 18 MiB
kvflowcontrol.tokens.send.elastic.deducted | 6.0 MiB
kvflowcontrol.tokens.send.elastic.deducted.force_flush_send_queue | 0 B
kvflowcontrol.tokens.send.elastic.deducted.prevent_send_queue | 0 B
kvflowcontrol.tokens.send.elastic.returned | 0 B
kvflowcontrol.tokens.send.elastic.returned.disconnect | 0 B
kvflowcontrol.tokens.send.elastic.unaccounted | 0 B
kvflowcontrol.tokens.send.regular.available | 48 MiB
kvflowcontrol.tokens.send.regular.deducted | 0 B
kvflowcontrol.tokens.send.regular.deducted.prevent_send_queue | 0 B
kvflowcontrol.tokens.send.regular.returned | 0 B
kvflowcontrol.tokens.send.regular.returned.disconnect | 0 B
kvflowcontrol.tokens.send.regular.unaccounted | 0 B


-- (Splitting range.)


-- Flow token metrics from n1 after further issuing 2MiB and 3MiB writes to
-- post-split LHS and RHS ranges respectively. We should see 15MiB extra tokens
-- deducted which comes from (2MiB+3MiB)*3=15MiB. So we stand at
-- 6MiB+15MiB=21MiB now.
SELECT name, crdb_internal.humanize_bytes(value::INT8)
FROM crdb_internal.node_metrics
WHERE name LIKE '%kvflowcontrol%tokens%'
ORDER BY name ASC;

kvflowcontrol.tokens.eval.elastic.available | 3.0 MiB
kvflowcontrol.tokens.eval.elastic.deducted | 21 MiB
kvflowcontrol.tokens.eval.elastic.returned | 0 B
kvflowcontrol.tokens.eval.elastic.returned.disconnect | 0 B
kvflowcontrol.tokens.eval.elastic.unaccounted | 0 B
kvflowcontrol.tokens.eval.regular.available | 48 MiB
kvflowcontrol.tokens.eval.regular.deducted | 0 B
kvflowcontrol.tokens.eval.regular.returned | 0 B
kvflowcontrol.tokens.eval.regular.returned.disconnect | 0 B
kvflowcontrol.tokens.eval.regular.unaccounted | 0 B
kvflowcontrol.tokens.send.elastic.available | 3.0 MiB
kvflowcontrol.tokens.send.elastic.deducted | 21 MiB
kvflowcontrol.tokens.send.elastic.deducted.force_flush_send_queue | 0 B
kvflowcontrol.tokens.send.elastic.deducted.prevent_send_queue | 0 B
kvflowcontrol.tokens.send.elastic.returned | 0 B
kvflowcontrol.tokens.send.elastic.returned.disconnect | 0 B
kvflowcontrol.tokens.send.elastic.unaccounted | 0 B
kvflowcontrol.tokens.send.regular.available | 48 MiB
kvflowcontrol.tokens.send.regular.deducted | 0 B
kvflowcontrol.tokens.send.regular.deducted.prevent_send_queue | 0 B
kvflowcontrol.tokens.send.regular.returned | 0 B
kvflowcontrol.tokens.send.regular.returned.disconnect | 0 B
kvflowcontrol.tokens.send.regular.unaccounted | 0 B


-- Observe the newly split off replica, with its own three streams.
SELECT range_id, count(*) AS streams
FROM crdb_internal.kv_flow_control_handles_v2
GROUP BY (range_id)
ORDER BY streams DESC;

range_id | stream_count
-----------+---------------
70 | 3
71 | 3


-- (Merging ranges.)


-- Flow token metrics from n1 after issuing 4MiB of replicated writes to
-- the post-merged range. We should see 12MiB extra tokens deducted which comes
-- from 4MiB*3=12MiB. So we stand at 21MiB+12MiB=33MiB tokens deducted now. The
-- RHS of the range is gone now, and the previously 3*3MiB=9MiB of tokens
-- deducted for it are released at the subsuming LHS leaseholder.
SELECT name, crdb_internal.humanize_bytes(value::INT8)
FROM crdb_internal.node_metrics
WHERE name LIKE '%kvflowcontrol%tokens%'
ORDER BY name ASC;

kvflowcontrol.tokens.eval.elastic.available | 0 B
kvflowcontrol.tokens.eval.elastic.deducted | 33 MiB
kvflowcontrol.tokens.eval.elastic.returned | 9.0 MiB
kvflowcontrol.tokens.eval.elastic.returned.disconnect | 9.0 MiB
kvflowcontrol.tokens.eval.elastic.unaccounted | 0 B
kvflowcontrol.tokens.eval.regular.available | 48 MiB
kvflowcontrol.tokens.eval.regular.deducted | 0 B
kvflowcontrol.tokens.eval.regular.returned | 0 B
kvflowcontrol.tokens.eval.regular.returned.disconnect | 0 B
kvflowcontrol.tokens.eval.regular.unaccounted | 0 B
kvflowcontrol.tokens.send.elastic.available | 0 B
kvflowcontrol.tokens.send.elastic.deducted | 33 MiB
kvflowcontrol.tokens.send.elastic.deducted.force_flush_send_queue | 0 B
kvflowcontrol.tokens.send.elastic.deducted.prevent_send_queue | 0 B
kvflowcontrol.tokens.send.elastic.returned | 9.0 MiB
kvflowcontrol.tokens.send.elastic.returned.disconnect | 9.0 MiB
kvflowcontrol.tokens.send.elastic.unaccounted | 0 B
kvflowcontrol.tokens.send.regular.available | 48 MiB
kvflowcontrol.tokens.send.regular.deducted | 0 B
kvflowcontrol.tokens.send.regular.deducted.prevent_send_queue | 0 B
kvflowcontrol.tokens.send.regular.returned | 0 B
kvflowcontrol.tokens.send.regular.returned.disconnect | 0 B
kvflowcontrol.tokens.send.regular.unaccounted | 0 B


-- Observe only the merged replica with its own three streams.
SELECT range_id, count(*) AS streams
FROM crdb_internal.kv_flow_control_handles_v2
GROUP BY (range_id)
ORDER BY streams DESC;

range_id | stream_count
-----------+---------------
70 | 3


-- (Allow below-raft admission to proceed.)


-- Flow token metrics from n1 after work gets admitted. We see all outstanding
-- {regular,elastic} tokens returned, including those from:
-- - the LHS before the merge, and
-- - the LHS and RHS before the original split.
SELECT name, crdb_internal.humanize_bytes(value::INT8)
FROM crdb_internal.node_metrics
WHERE name LIKE '%kvflowcontrol%tokens%'
ORDER BY name ASC;

kvflowcontrol.tokens.eval.elastic.available | 24 MiB
kvflowcontrol.tokens.eval.elastic.deducted | 33 MiB
kvflowcontrol.tokens.eval.elastic.returned | 33 MiB
kvflowcontrol.tokens.eval.elastic.returned.disconnect | 9.0 MiB
kvflowcontrol.tokens.eval.elastic.unaccounted | 0 B
kvflowcontrol.tokens.eval.regular.available | 48 MiB
kvflowcontrol.tokens.eval.regular.deducted | 0 B
kvflowcontrol.tokens.eval.regular.returned | 0 B
kvflowcontrol.tokens.eval.regular.returned.disconnect | 0 B
kvflowcontrol.tokens.eval.regular.unaccounted | 0 B
kvflowcontrol.tokens.send.elastic.available | 24 MiB
kvflowcontrol.tokens.send.elastic.deducted | 33 MiB
kvflowcontrol.tokens.send.elastic.deducted.force_flush_send_queue | 0 B
kvflowcontrol.tokens.send.elastic.deducted.prevent_send_queue | 0 B
kvflowcontrol.tokens.send.elastic.returned | 33 MiB
kvflowcontrol.tokens.send.elastic.returned.disconnect | 9.0 MiB
kvflowcontrol.tokens.send.elastic.unaccounted | 0 B
kvflowcontrol.tokens.send.regular.available | 48 MiB
kvflowcontrol.tokens.send.regular.deducted | 0 B
kvflowcontrol.tokens.send.regular.deducted.prevent_send_queue | 0 B
kvflowcontrol.tokens.send.regular.returned | 0 B
kvflowcontrol.tokens.send.regular.returned.disconnect | 0 B
kvflowcontrol.tokens.send.regular.unaccounted | 0 B
----
----

# vim:ft=sql
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
echo
----
----
-- Flow token metrics from n1 after issuing a regular 2*1MiB 3x replicated write
-- Flow token metrics from n1 after issuing a 2*1MiB 3x replicated write
-- that are yet to get admitted. We see 2*3*1MiB=6MiB deductions of
-- {regular,elastic} tokens with no corresponding returns. The 2*1MiB writes
-- happened on what is soon going to be the LHS and RHS of a range being split.
Expand Down Expand Up @@ -87,7 +87,7 @@ ORDER BY streams DESC;
-- (Merging ranges.)


-- Flow token metrics from n1 after issuing 4MiB of regular replicated writes to
-- Flow token metrics from n1 after issuing 4MiB of replicated writes to
-- the post-merged range. We should see 12MiB extra tokens deducted which comes
-- from 4MiB*3=12MiB. So we stand at 21MiB+12MiB=33MiB tokens deducted now. The
-- RHS of the range is gone now, and the previously 3*3MiB=9MiB of tokens
Expand Down
Loading

0 comments on commit 64b297f

Please sign in to comment.