Skip to content

Commit

Permalink
Merge pull request #138755 from cockroachdb/blathers/backport-release…
Browse files Browse the repository at this point in the history
…-24.3-138343

release-24.3: sql: setup http router after version initialization
  • Loading branch information
xinhaoz authored Jan 14, 2025
2 parents 0c830cc + 8468409 commit 6c7b277
Show file tree
Hide file tree
Showing 4 changed files with 139 additions and 44 deletions.
66 changes: 32 additions & 34 deletions pkg/ccl/serverccl/tenant_vars_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,41 +69,40 @@ func TestTenantVars(t *testing.T) {
metrics, err := parser.TextToMetricFamilies(resp.Body)
require.NoError(t, err)

userCPU, found := metrics["sys_cpu_user_ns"]
tenantUserCPU, found := metrics["sys_cpu_user_ns"]
require.True(t, found)
require.Len(t, userCPU.GetMetric(), 1)
require.Equal(t, io_prometheus_client.MetricType_GAUGE, userCPU.GetType())
cpuUserNanos := userCPU.Metric[0].GetGauge().GetValue()
require.Len(t, tenantUserCPU.GetMetric(), 1)
require.Equal(t, io_prometheus_client.MetricType_GAUGE, tenantUserCPU.GetType())
tenantCpuUserNanos := tenantUserCPU.Metric[0].GetGauge().GetValue()

sysCPU, found := metrics["sys_cpu_sys_ns"]
tenantSysCPU, found := metrics["sys_cpu_sys_ns"]
require.True(t, found)
require.Len(t, sysCPU.GetMetric(), 1)
require.Equal(t, io_prometheus_client.MetricType_GAUGE, sysCPU.GetType())
cpuSysNanos := sysCPU.Metric[0].GetGauge().GetValue()
require.Len(t, tenantSysCPU.GetMetric(), 1)
require.Equal(t, io_prometheus_client.MetricType_GAUGE, tenantSysCPU.GetType())
tenantCpuSysNanos := tenantSysCPU.Metric[0].GetGauge().GetValue()

now, found := metrics["sys_cpu_now_ns"]
require.True(t, found)
require.Len(t, now.GetMetric(), 1)
require.Equal(t, io_prometheus_client.MetricType_GAUGE, now.GetType())
nowNanos := now.Metric[0].GetGauge().GetValue()

uptime, found := metrics["sys_uptime"]
tenantUptime, found := metrics["sys_uptime"]
require.True(t, found)
require.Len(t, uptime.GetMetric(), 1)
require.Equal(t, io_prometheus_client.MetricType_GAUGE, uptime.GetType())
uptimeSeconds := uptime.Metric[0].GetGauge().GetValue()
require.Len(t, tenantUptime.GetMetric(), 1)
require.Equal(t, io_prometheus_client.MetricType_GAUGE, tenantUptime.GetType())
uptimeSeconds := tenantUptime.Metric[0].GetGauge().GetValue()

// The values are between zero and whatever User/Sys time is observed after the get.
require.LessOrEqual(t, float64(startNowNanos), nowNanos)
require.LessOrEqual(t, nowNanos, float64(timeutil.Now().UnixNano()))

cpuTime := gosigar.ProcTime{}
require.NoError(t, cpuTime.Get(os.Getpid()))
require.LessOrEqual(t, 0., cpuUserNanos)
require.LessOrEqual(t, cpuUserNanos, float64(cpuTime.User)*1e6)
require.LessOrEqual(t, 0., cpuSysNanos)
require.LessOrEqual(t, cpuSysNanos, float64(cpuTime.Sys)*1e6)

testCpuTime := gosigar.ProcTime{}
require.NoError(t, testCpuTime.Get(os.Getpid()))
require.LessOrEqual(t, 0., tenantCpuUserNanos)
require.LessOrEqual(t, tenantCpuUserNanos, float64(testCpuTime.User)*1e6)
require.LessOrEqual(t, 0., tenantCpuSysNanos)
require.LessOrEqual(t, tenantCpuSysNanos, float64(testCpuTime.Sys)*1e6)
require.LessOrEqual(t, 0., uptimeSeconds)

resp, err = client.Get(url)
Expand All @@ -115,30 +114,29 @@ func TestTenantVars(t *testing.T) {
metrics, err = parser.TextToMetricFamilies(resp.Body)
require.NoError(t, err)

userCPU, found = metrics["sys_cpu_user_ns"]
tenantUserCPU, found = metrics["sys_cpu_user_ns"]
require.True(t, found)
require.Len(t, userCPU.GetMetric(), 1)
require.Equal(t, io_prometheus_client.MetricType_GAUGE, userCPU.GetType())
cpuUserNanos2 := userCPU.Metric[0].GetGauge().GetValue()
require.Len(t, tenantUserCPU.GetMetric(), 1)
require.Equal(t, io_prometheus_client.MetricType_GAUGE, tenantUserCPU.GetType())
tenantCpuUserNanos2 := tenantUserCPU.Metric[0].GetGauge().GetValue()

sysCPU, found = metrics["sys_cpu_sys_ns"]
tenantSysCPU, found = metrics["sys_cpu_sys_ns"]
require.True(t, found)
require.Len(t, sysCPU.GetMetric(), 1)
require.Equal(t, io_prometheus_client.MetricType_GAUGE, sysCPU.GetType())
// cpuSysNanos2 := sysCPU.Metric[0].GetGauge().GetValue()
require.Len(t, tenantSysCPU.GetMetric(), 1)
require.Equal(t, io_prometheus_client.MetricType_GAUGE, tenantSysCPU.GetType())
tenantCpuSysNanos2 := tenantSysCPU.Metric[0].GetGauge().GetValue()

uptime, found = metrics["sys_uptime"]
tenantUptime, found = metrics["sys_uptime"]
require.True(t, found)
require.Len(t, uptime.GetMetric(), 1)
require.Equal(t, io_prometheus_client.MetricType_GAUGE, uptime.GetType())
uptimeSeconds2 := uptime.Metric[0].GetGauge().GetValue()
require.Len(t, tenantUptime.GetMetric(), 1)
require.Equal(t, io_prometheus_client.MetricType_GAUGE, tenantUptime.GetType())
uptimeSeconds2 := tenantUptime.Metric[0].GetGauge().GetValue()

cpuTime2 := gosigar.ProcTime{}
require.NoError(t, cpuTime2.Get(os.Getpid()))

require.LessOrEqual(t, float64(cpuTime2.User-cpuTime.User)*1e6, cpuUserNanos2)
// TODO(#119329): Sometimes our metrics have 0 cpuSysNanos.
// require.LessOrEqual(t, float64(cpuTime2.Sys-cpuTime.Sys)*1e6, cpuSysNanos2)
require.Less(t, tenantCpuUserNanos2, float64(cpuTime2.User)*1e6)
require.LessOrEqual(t, tenantCpuSysNanos2, float64(cpuTime2.Sys)*1e6)
require.LessOrEqual(t, uptimeSeconds, uptimeSeconds2)

_, found = metrics["jobs_running_non_idle"]
Expand Down
1 change: 1 addition & 0 deletions pkg/cmd/roachtest/tests/registry.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ func RegisterTests(r registry.Registry) {
registerHibernate(r, hibernateOpts)
registerHibernate(r, hibernateSpatialOpts)
registerHotSpotSplits(r)
registerHTTPRestart(r)
registerImportCancellation(r)
registerImportDecommissioned(r)
registerImportMixedVersions(r)
Expand Down
93 changes: 93 additions & 0 deletions pkg/cmd/roachtest/tests/versionupgrade.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,16 @@ import (

"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/mixedversion"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
"github.com/cockroachdb/cockroach/pkg/storage"
"github.com/cockroachdb/cockroach/pkg/testutils/release"
"github.com/cockroachdb/cockroach/pkg/ts/tspb"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/cockroach/pkg/util/version"
)
Expand Down Expand Up @@ -116,6 +119,7 @@ func runVersionUpgrade(ctx context.Context, t test.Test, c cluster.Cluster) {
}

mvt := mixedversion.NewTest(testCtx, t, t.L(), c, c.All(), opts...)

mvt.InMixedVersion(
"maybe run backup",
func(ctx context.Context, l *logger.Logger, rng *rand.Rand, h *mixedversion.Helper) error {
Expand Down Expand Up @@ -294,3 +298,92 @@ for i in 1 2 3 4; do
done
`)
}

// This is a regression test for a race detailed in
// https://github.com/cockroachdb/cockroach/issues/138342, where it became
// possible for an HTTP request to cause a fatal error if the sql server
// did not initialize the cluster version in time.
func registerHTTPRestart(r registry.Registry) {
r.Add(registry.TestSpec{
Name: "http-register-routes/mixed-version",
Owner: registry.OwnerObservability,
Cluster: r.MakeClusterSpec(4),
CompatibleClouds: registry.AllClouds,
Suites: registry.Suites(registry.Nightly),
Randomized: true,
Run: runHTTPRestart,
Timeout: 1 * time.Hour,
})
}

func runHTTPRestart(ctx context.Context, t test.Test, c cluster.Cluster) {
mvt := mixedversion.NewTest(ctx, t, t.L(), c,
c.CRDBNodes(),
mixedversion.AlwaysUseLatestPredecessors,
)

// Any http request requiring auth will do.
httpReq := tspb.TimeSeriesQueryRequest{
StartNanos: timeutil.Now().UnixNano() - 10*time.Second.Nanoseconds(),
EndNanos: timeutil.Now().UnixNano(),
// Ask for 10s intervals.
SampleNanos: (10 * time.Second).Nanoseconds(),
Queries: []tspb.Query{{
Name: "cr.node.sql.service.latency-p90",
SourceAggregator: tspb.TimeSeriesQueryAggregator_MAX.Enum(),
}},
}

httpCall := func(ctx context.Context, node int, l *logger.Logger, useSystemTenant bool) {
logEvery := roachtestutil.Every(1 * time.Second)
var clientOpts []func(opts *roachtestutil.RoachtestHTTPOptions)
var urlOpts []option.OptionFunc
if useSystemTenant {
clientOpts = append(clientOpts, roachtestutil.VirtualCluster(install.SystemInterfaceName))
urlOpts = append(urlOpts, option.VirtualClusterName(install.SystemInterfaceName))
}
client := roachtestutil.DefaultHTTPClient(c, l, clientOpts...)
adminUrls, err := c.ExternalAdminUIAddr(ctx, l, c.Node(node), urlOpts...)
if err != nil {
t.Fatal(err)
}
url := "https://" + adminUrls[0] + "/ts/query"
l.Printf("Sending requests to %s", url)

var response tspb.TimeSeriesQueryResponse
// Eventually we should see a successful request.
reqSuccess := false
for {
select {
case <-ctx.Done():
if !reqSuccess {
t.Fatalf("n%d: No successful http requests made.", node)
}
return
default:
}
if err := client.PostProtobuf(ctx, url, &httpReq, &response); err != nil {
if logEvery.ShouldLog() {
l.Printf("n%d: Error posting protobuf: %s", node, err)
}
continue
}
reqSuccess = true
}
}

for _, n := range c.CRDBNodes() {
mvt.BackgroundFunc("HTTP requests to system tenant", func(ctx context.Context, l *logger.Logger, rng *rand.Rand, h *mixedversion.Helper) error {
httpCall(ctx, n, l, true /* useSystemTenant */)
return nil
})
mvt.BackgroundFunc("HTTP requests to secondary tenant", func(ctx context.Context, l *logger.Logger, rng *rand.Rand, h *mixedversion.Helper) error {
if h.DeploymentMode() == mixedversion.SystemOnlyDeployment {
return nil
}
httpCall(ctx, n, l, false /* useSystemTenant */)
return nil
})
}
mvt.Run()
}
23 changes: 13 additions & 10 deletions pkg/server/tenant.go
Original file line number Diff line number Diff line change
Expand Up @@ -799,9 +799,22 @@ func (s *SQLServerWrapper) PreStart(ctx context.Context) error {
return err
}

// Start the SQL subsystem.
if err := s.sqlServer.preStart(
workersCtx,
s.stopper,
s.sqlServer.cfg.TestingKnobs,
orphanedLeasesTimeThresholdNanos,
); err != nil {
return err
}

// Connect the HTTP endpoints. This also wraps the privileged HTTP
// endpoints served by gwMux by the HTTP cookie authentication
// check.
// NB: This must occur after sqlServer.preStart() which initializes
// the cluster version from storage as the http auth server relies on
// the cluster version being initialized.
if err := s.http.setupRoutes(ctx,
s.authentication, /* authnServer */
s.adminAuthzCheck, /* adminAuthzCheck */
Expand All @@ -828,16 +841,6 @@ func (s *SQLServerWrapper) PreStart(ctx context.Context) error {
return err
}

// Start the SQL subsystem.
if err := s.sqlServer.preStart(
workersCtx,
s.stopper,
s.sqlServer.cfg.TestingKnobs,
orphanedLeasesTimeThresholdNanos,
); err != nil {
return err
}

// Initialize the external storage builders configuration params now that the
// engines have been created. The object can be used to create ExternalStorage
// objects hereafter.
Expand Down

0 comments on commit 6c7b277

Please sign in to comment.