From 437513ea725b58636c65246783be4cf958efc995 Mon Sep 17 00:00:00 2001
From: Mateusz Drab
Date: Tue, 28 May 2024 01:17:23 +0100
Subject: [PATCH 1/2] feat: Update docker container handler to include health status in stats

---
 container/docker/handler.go | 20 ++++++++++++++++++--
 info/v1/container.go        |  7 +++++++
 metrics/prometheus.go       | 18 ++++++++++++++++++
 3 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/container/docker/handler.go b/container/docker/handler.go
index fc66641f6f..7ffa78cdad 100644
--- a/container/docker/handler.go
+++ b/container/docker/handler.go
@@ -64,8 +64,9 @@ type dockerContainerHandler struct {
 	creationTime time.Time
 
 	// Metadata associated with the container.
-	envs   map[string]string
-	labels map[string]string
+	envs         map[string]string
+	labels       map[string]string
+	healthStatus string
 
 	// Image name used for this container.
 	image string
@@ -178,6 +179,15 @@ func newDockerContainerHandler(
 		rootfsStorageDir:   rootfsStorageDir,
 		envs:               make(map[string]string),
 		labels:             ctnr.Config.Labels,
 		includedMetrics:    metrics,
 		zfsParent:          zfsParent,
+		// NOTE(review): in the Docker API types State.Health is a pointer
+		// that is unset for containers without a HEALTHCHECK - confirm.
+		// Guard the dereference; such containers keep an empty status.
+		healthStatus: func() string {
+			if ctnr.State == nil || ctnr.State.Health == nil {
+				return ""
+			}
+			return ctnr.State.Health.Status
+		}(),
 	}
@@ -303,6 +313,12 @@ func (h *dockerContainerHandler) GetSpec() (info.ContainerSpec, error) {
 // TODO(vmarmol): Get from libcontainer API instead of cgroup manager when we don't have to support older Dockers.
 func (h *dockerContainerHandler) GetStats() (*info.ContainerStats, error) {
 	stats, err := h.libcontainerHandler.GetStats()
+	// stats may be nil when GetStats fails, so guard the write.
+	// NOTE(review): healthStatus is captured once at handler creation and is
+	// not refreshed here - confirm later health transitions need not show up.
+	if stats != nil {
+		stats.Health.Status = h.healthStatus
+	}
 	if err != nil {
 		return stats, err
 	}
diff --git a/info/v1/container.go b/info/v1/container.go
index efcfd5628e..208aed3d96 100644
--- a/info/v1/container.go
+++ b/info/v1/container.go
@@ -930,6 +930,11 @@ type ProcessStats struct {
 	Ulimits []UlimitSpec `json:"ulimits,omitempty"`
 }
 
+type Health struct {
+	// Health status of the container
+	Status string `json:"status"`
+}
+
 type ContainerStats struct {
 	// The time of this stat point.
Timestamp time.Time `json:"timestamp"`
@@ -969,6 +974,8 @@ type ContainerStats struct {
 	CpuSet CPUSetStats `json:"cpuset,omitempty"`
 
 	OOMEvents uint64 `json:"oom_events,omitempty"`
+
+	Health Health `json:"health,omitempty"`
 }
 
 func timeEq(t1, t2 time.Time, tolerance time.Duration) bool {
diff --git a/metrics/prometheus.go b/metrics/prometheus.go
index a6fc3dff4c..97322fef41 100644
--- a/metrics/prometheus.go
+++ b/metrics/prometheus.go
@@ -130,6 +130,24 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri
 					}}
 				},
 			},
+			{
+				name:      "container_health_state",
+				help:      "The result of the container's health check",
+				valueType: prometheus.GaugeValue,
+				getValues: func(s *info.ContainerStats) metricValues {
+					// Gauge is 1 only when the status is exactly "healthy";
+					// every other value of s.Health.Status, including the
+					// empty string, is exported as 0.
+					var healthy float64
+					if s.Health.Status == "healthy" {
+						healthy = 1
+					}
+					return metricValues{{
+						value:     healthy,
+						timestamp: s.Timestamp,
+					}}
+				},
+			},
 		},
 		includedMetrics: includedMetrics,
 		opts:            opts,

From 353ac2e06fc028cb442925ca34d21b60c7feb82d Mon Sep 17 00:00:00 2001
From: Mateusz Drab
Date: Thu, 2 Jan 2025 22:17:21 +0000
Subject: [PATCH 2/2] add health status tests

---
 docs/storage/prometheus.md                             | 1 +
 metrics/prometheus_fake.go                             | 1 +
 metrics/prometheus_test.go                             | 3 ++-
 metrics/testdata/prometheus_metrics                    | 3 +++
 metrics/testdata/prometheus_metrics_perf_aggregated    | 3 +++
 metrics/testdata/prometheus_metrics_whitelist_filtered | 3 +++
 6 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/docs/storage/prometheus.md b/docs/storage/prometheus.md
index 7bb3c465ea..1edc8e5dbe 100644
--- a/docs/storage/prometheus.md
+++ b/docs/storage/prometheus.md
@@ -47,6 +47,7 @@ Metric name | Type | Description | Unit (where applicable) | option parameter |
 `container_fs_write_seconds_total` | Counter | Cumulative count of seconds spent writing | seconds | diskIO |
 `container_fs_writes_merged_total` | Counter | Cumulative count of writes merged | | diskIO |
`container_fs_writes_total` | Counter | Cumulative count of writes completed | | diskIO | +`container_health_state` | Gauge | Whether the container's health check reports healthy (1) or not (0) | | - | `container_hugetlb_failcnt` | Counter | Number of hugepage usage hits limits | | hugetlb | `container_hugetlb_max_usage_bytes` | Gauge | Maximum hugepage usages recorded | bytes | hugetlb | `container_hugetlb_usage_bytes` | Gauge | Current hugepage usage | bytes | hugetlb | diff --git a/metrics/prometheus_fake.go b/metrics/prometheus_fake.go index 7b7399778d..3d5de40f2b 100644 --- a/metrics/prometheus_fake.go +++ b/metrics/prometheus_fake.go @@ -733,6 +733,7 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req }, }, CpuSet: info.CPUSetStats{MemoryMigrate: 1}, + Health: info.Health{Status: "healthy"}, }, }, }, diff --git a/metrics/prometheus_test.go b/metrics/prometheus_test.go index a9ef6f8899..11c44054e6 100644 --- a/metrics/prometheus_test.go +++ b/metrics/prometheus_test.go @@ -110,12 +110,13 @@ func TestPrometheusCollector_scrapeFailure(t *testing.T) { func TestNewPrometheusCollectorWithPerf(t *testing.T) { c := NewPrometheusCollector(&mockInfoProvider{}, mockLabelFunc, container.MetricSet{container.PerfMetrics: struct{}{}}, now, v2.RequestOptions{}) - assert.Len(t, c.containerMetrics, 5) + assert.Len(t, c.containerMetrics, 6) names := []string{} for _, m := range c.containerMetrics { names = append(names, m.name) } assert.Contains(t, names, "container_last_seen") + assert.Contains(t, names, "container_health_state") assert.Contains(t, names, "container_perf_events_total") assert.Contains(t, names, "container_perf_events_scaling_ratio") assert.Contains(t, names, "container_perf_uncore_events_total") diff --git a/metrics/testdata/prometheus_metrics b/metrics/testdata/prometheus_metrics index 924e58ede2..c1f1b61135 100644 --- a/metrics/testdata/prometheus_metrics +++ b/metrics/testdata/prometheus_metrics @@ -121,6 +121,9 @@ 
container_fs_writes_merged_total{container_env_foo_env="prod",container_label_fo # TYPE container_fs_writes_total counter container_fs_writes_total{container_env_foo_env="prod",container_label_foo_label="bar",device="sda1",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 28 1395066363000 container_fs_writes_total{container_env_foo_env="prod",container_label_foo_label="bar",device="sda2",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 43 1395066363000 +# HELP container_health_state The result of the container's health check +# TYPE container_health_state gauge +container_health_state{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 1 1395066363000 # HELP container_hugetlb_failcnt Number of hugepage usage hits limits # TYPE container_hugetlb_failcnt counter container_hugetlb_failcnt{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",pagesize="1Gi",zone_name="hello"} 0 1395066363000 diff --git a/metrics/testdata/prometheus_metrics_perf_aggregated b/metrics/testdata/prometheus_metrics_perf_aggregated index bee60f5141..fd3647529f 100644 --- a/metrics/testdata/prometheus_metrics_perf_aggregated +++ b/metrics/testdata/prometheus_metrics_perf_aggregated @@ -1,6 +1,9 @@ # HELP cadvisor_version_info A metric with a constant '1' value labeled by kernel version, OS version, docker version, cadvisor version & cadvisor revision. 
# TYPE cadvisor_version_info gauge cadvisor_version_info{cadvisorRevision="abcdef",cadvisorVersion="0.16.0",dockerVersion="1.8.1",kernelVersion="4.1.6-200.fc22.x86_64",osVersion="Fedora 22 (Twenty Two)"} 1 +# HELP container_health_state The result of the container's health check +# TYPE container_health_state gauge +container_health_state{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 1 1395066363000 # HELP container_last_seen Last time a container was seen by the exporter # TYPE container_last_seen gauge container_last_seen{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 1.395066363e+09 1395066363000 diff --git a/metrics/testdata/prometheus_metrics_whitelist_filtered b/metrics/testdata/prometheus_metrics_whitelist_filtered index 1724b3c8ba..d489ccb2ed 100644 --- a/metrics/testdata/prometheus_metrics_whitelist_filtered +++ b/metrics/testdata/prometheus_metrics_whitelist_filtered @@ -121,6 +121,9 @@ container_fs_writes_merged_total{container_env_foo_env="prod",device="sda2",id=" # TYPE container_fs_writes_total counter container_fs_writes_total{container_env_foo_env="prod",device="sda1",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 28 1395066363000 container_fs_writes_total{container_env_foo_env="prod",device="sda2",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 43 1395066363000 +# HELP container_health_state The result of the container's health check +# TYPE container_health_state gauge +container_health_state{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 1 1395066363000 # HELP container_hugetlb_failcnt Number of hugepage usage hits limits # TYPE container_hugetlb_failcnt counter 
container_hugetlb_failcnt{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",pagesize="1Gi",zone_name="hello"} 0 1395066363000