Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add metric for container health check status #3640

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions container/docker/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,9 @@ type dockerContainerHandler struct {
creationTime time.Time

// Metadata associated with the container.
envs map[string]string
labels map[string]string
envs map[string]string
labels map[string]string
healthStatus string

// Image name used for this container.
image string
Expand Down Expand Up @@ -178,6 +179,7 @@ func newDockerContainerHandler(
rootfsStorageDir: rootfsStorageDir,
envs: make(map[string]string),
labels: ctnr.Config.Labels,
healthStatus: ctnr.State.Health.Status,
includedMetrics: metrics,
zfsParent: zfsParent,
}
Expand Down Expand Up @@ -303,6 +305,9 @@ func (h *dockerContainerHandler) GetSpec() (info.ContainerSpec, error) {
// TODO(vmarmol): Get from libcontainer API instead of cgroup manager when we don't have to support older Dockers.
func (h *dockerContainerHandler) GetStats() (*info.ContainerStats, error) {
stats, err := h.libcontainerHandler.GetStats()

stats.Health.Status = h.healthStatus

if err != nil {
return stats, err
}
Expand Down
1 change: 1 addition & 0 deletions docs/storage/prometheus.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ Metric name | Type | Description | Unit (where applicable) | option parameter |
`container_fs_write_seconds_total` | Counter | Cumulative count of seconds spent writing | seconds | diskIO |
`container_fs_writes_merged_total` | Counter | Cumulative count of writes merged | | diskIO |
`container_fs_writes_total` | Counter | Cumulative count of writes completed | | diskIO |
`container_health_state` | Gauge | Result of the container's health check: 1 when the status is `healthy`, 0 otherwise | | - |
`container_hugetlb_failcnt` | Counter | Number of hugepage usage hits limits | | hugetlb |
`container_hugetlb_max_usage_bytes` | Gauge | Maximum hugepage usages recorded | bytes | hugetlb |
`container_hugetlb_usage_bytes` | Gauge | Current hugepage usage | bytes | hugetlb |
Expand Down
7 changes: 7 additions & 0 deletions info/v1/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -940,6 +940,11 @@ type ProcessStats struct {
Ulimits []UlimitSpec `json:"ulimits,omitempty"`
}

// Health describes the health check state recorded for a container.
type Health struct {
// Status is the health status of the container, kept as a plain string and
// serialized under the JSON key "status".
// NOTE(review): presumably one of the runtime's health states (e.g.
// "healthy") — confirm against the handler that populates it.
Status string `json:"status"`
}

type ContainerStats struct {
// The time of this stat point.
Timestamp time.Time `json:"timestamp"`
Expand Down Expand Up @@ -979,6 +984,8 @@ type ContainerStats struct {
CpuSet CPUSetStats `json:"cpuset,omitempty"`

OOMEvents uint64 `json:"oom_events,omitempty"`

Health Health `json:"health,omitempty"`
}

func timeEq(t1, t2 time.Time, tolerance time.Duration) bool {
Expand Down
18 changes: 18 additions & 0 deletions metrics/prometheus.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,24 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri
}}
},
},
{
name: "container_health_state",
help: "The result of the container's health check",
valueType: prometheus.GaugeValue,
getValues: func(s *info.ContainerStats) metricValues {
return metricValues{{
// Report 1 when s.Health.Status is "healthy" and 0 for any other status.
value: func(s *info.ContainerStats) float64 {
if s.Health.Status == "healthy" {
return 1
} else {
return 0
}
}(s),
timestamp: s.Timestamp,
}}
},
},
},
includedMetrics: includedMetrics,
opts: opts,
Expand Down
1 change: 1 addition & 0 deletions metrics/prometheus_fake.go
Original file line number Diff line number Diff line change
Expand Up @@ -736,6 +736,7 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req
},
},
CpuSet: info.CPUSetStats{MemoryMigrate: 1},
Health: info.Health{Status: "healthy"},
},
},
},
Expand Down
3 changes: 2 additions & 1 deletion metrics/prometheus_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,12 +110,13 @@ func TestPrometheusCollector_scrapeFailure(t *testing.T) {

func TestNewPrometheusCollectorWithPerf(t *testing.T) {
c := NewPrometheusCollector(&mockInfoProvider{}, mockLabelFunc, container.MetricSet{container.PerfMetrics: struct{}{}}, now, v2.RequestOptions{})
assert.Len(t, c.containerMetrics, 5)
assert.Len(t, c.containerMetrics, 6)
names := []string{}
for _, m := range c.containerMetrics {
names = append(names, m.name)
}
assert.Contains(t, names, "container_last_seen")
assert.Contains(t, names, "container_health_state")
assert.Contains(t, names, "container_perf_events_total")
assert.Contains(t, names, "container_perf_events_scaling_ratio")
assert.Contains(t, names, "container_perf_uncore_events_total")
Expand Down
3 changes: 3 additions & 0 deletions metrics/testdata/prometheus_metrics
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,9 @@ container_fs_writes_merged_total{container_env_foo_env="prod",container_label_fo
# TYPE container_fs_writes_total counter
container_fs_writes_total{container_env_foo_env="prod",container_label_foo_label="bar",device="sda1",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 28 1395066363000
container_fs_writes_total{container_env_foo_env="prod",container_label_foo_label="bar",device="sda2",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 43 1395066363000
# HELP container_health_state The result of the container's health check
# TYPE container_health_state gauge
container_health_state{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 1 1395066363000
# HELP container_hugetlb_failcnt Number of hugepage usage hits limits
# TYPE container_hugetlb_failcnt counter
container_hugetlb_failcnt{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",pagesize="1Gi",zone_name="hello"} 0 1395066363000
Expand Down
3 changes: 3 additions & 0 deletions metrics/testdata/prometheus_metrics_perf_aggregated
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
# HELP cadvisor_version_info A metric with a constant '1' value labeled by kernel version, OS version, docker version, cadvisor version & cadvisor revision.
# TYPE cadvisor_version_info gauge
cadvisor_version_info{cadvisorRevision="abcdef",cadvisorVersion="0.16.0",dockerVersion="1.8.1",kernelVersion="4.1.6-200.fc22.x86_64",osVersion="Fedora 22 (Twenty Two)"} 1
# HELP container_health_state The result of the container's health check
# TYPE container_health_state gauge
container_health_state{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 1 1395066363000
# HELP container_last_seen Last time a container was seen by the exporter
# TYPE container_last_seen gauge
container_last_seen{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 1.395066363e+09 1395066363000
Expand Down
3 changes: 3 additions & 0 deletions metrics/testdata/prometheus_metrics_whitelist_filtered
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,9 @@ container_fs_writes_merged_total{container_env_foo_env="prod",device="sda2",id="
# TYPE container_fs_writes_total counter
container_fs_writes_total{container_env_foo_env="prod",device="sda1",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 28 1395066363000
container_fs_writes_total{container_env_foo_env="prod",device="sda2",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 43 1395066363000
# HELP container_health_state The result of the container's health check
# TYPE container_health_state gauge
container_health_state{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 1 1395066363000
# HELP container_hugetlb_failcnt Number of hugepage usage hits limits
# TYPE container_hugetlb_failcnt counter
container_hugetlb_failcnt{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",pagesize="1Gi",zone_name="hello"} 0 1395066363000
Expand Down