forked from k8stech/alertmanager-wechatrobot-webhook
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprometheusrule.yaml
151 lines (151 loc) · 7.42 KB
/
prometheusrule.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
---
# PrometheusRule custom resource (monitoring.coreos.com/v1) named
# "prometheus-k8s-prometheus-rules" in the "monitoring" namespace.
# The alerting rule groups are listed under spec.groups.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: prometheus-k8s-prometheus-rules
  namespace: monitoring
  labels:
    app.kubernetes.io/component: prometheus
    app.kubernetes.io/instance: k8s
    app.kubernetes.io/name: prometheus
    app.kubernetes.io/part-of: kube-prometheus
    # Quoted so the version is always read as a string.
    app.kubernetes.io/version: "2.46.0"
    # NOTE(review): these two labels are presumably what the Prometheus
    # resource's ruleSelector matches on - confirm against the Prometheus CR.
    prometheus: k8s
    role: alert-rules
spec:
  groups:
# Rule group "k8s-alert": container, Deployment and Pod level alerts
# (container CPU/memory usage, Deployment availability, Pod phase, restarts).
- name: k8s-alert
rules:
# Container CPU usage (warning): per-container CPU seconds consumed per second
# divided by container_spec_cpu_quota / 100000, expressed as a percentage, in
# the ems-plus-mapai and ems-uat namespaces. Fires above 85% for 5 minutes.
#
# Both sides of the division are aggregated by the SAME label set
# (pod, namespace, instance, container). PromQL's one-to-one vector matching
# only pairs series whose label sets are identical, so grouping one side by
# `instance` and the other by `node` leaves the result empty and the alert
# can never fire.
#
# rate() is used instead of irate(): irate() only looks at the last two
# samples, so a brief dip can reset the `for` clause of an alert.
#
# NOTE(review): dividing the quota by 100000 presumably assumes the default
# 100ms CFS period - confirm for this cluster.
- alert: 容器cpu使用率
  expr: >-
    sum(rate(container_cpu_usage_seconds_total{namespace=~'ems-plus-mapai|ems-uat'}[2m]))
    by (pod, namespace, instance, container)
    /
    (sum(container_spec_cpu_quota{namespace=~'ems-plus-mapai|ems-uat'} / 100000)
    by (pod, namespace, instance, container))
    * 100 > 85
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 容器cpu使用率过高
    description: >-
      集群名称:储能-ems-cn 命名空间:{{$labels.namespace}} 容器:{{$labels.container}} Pod:{{$labels.pod}}
      cpu使用率超过85%,(当前值:{{ printf "%.0f" $value }}%)
# Container memory usage (warning): container_memory_working_set_bytes divided
# by container_spec_memory_limit_bytes, as a percentage, per
# (container, pod, instance, namespace) in the ems-plus-mapai and ems-uat
# namespaces. Series for the care-smart container/pods are excluded.
# The `> 0` filter drops limit series equal to 0 so the division never has a
# zero denominator. Fires above 90% for 5 minutes.
- alert: 容器内存使用率
  expr: >-
    (sum (container_memory_working_set_bytes{namespace=~'ems-plus-mapai|ems-uat',
    container!='care-smart',pod!~'care-smart-.*-.*'})
    by (container,pod,instance,namespace)
    /
    sum(container_spec_memory_limit_bytes{namespace=~'ems-plus-mapai|ems-uat',container!='care-smart',pod!~'care-smart-.*-.*'}
    > 0) by (container,pod,instance,namespace)
    * 100) > 90
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 容器内存使用率过高
    description: >-
      集群名称:储能-ems-cn 命名空间:{{$labels.namespace}} 容器:{{$labels.container}} Pod:{{$labels.pod}}
      内存使用率超过90%, (当前值:{{ printf "%.0f" $value }}%)
# RabbitMQ container memory usage (warning): same working-set / limit ratio as
# the generic container memory rule, restricted to container="rabbitmq" in the
# ems-plus-mapai namespace, with a lower threshold (80%) and a shorter hold
# time (2 minutes). The `> 0` filter drops limit series equal to 0.
- alert: rabbitmq内存使用率
  expr: >-
    (sum by (container, pod, instance, namespace)
    (container_memory_working_set_bytes{container="rabbitmq",namespace="ems-plus-mapai",pod!~"care-smart-.*-.*"})
    /
    sum by (container, pod, instance, namespace)
    (container_spec_memory_limit_bytes{container="rabbitmq",namespace="ems-plus-mapai",pod!~"care-smart-.*-.*"}
    > 0) * 100) > 80
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: rabbitmq内存使用率过高
    description: >-
      集群名称:储能-ems-cn 命名空间:{{$labels.namespace}} 容器:{{$labels.container}} Pod:{{$labels.pod}}
      内存使用率超过80%, (当前值:{{ printf "%.0f" $value }}%)
# Deployment availability (warning): fires when a Deployment has fewer
# available replicas than kube_deployment_status_replicas for 10 minutes.
#
# The expression is written as a difference (replicas - available > 0) rather
# than a ratio (available / replicas < 1). For replicas > 0 both forms are
# true under exactly the same condition, but with the difference $value is
# the number of unavailable pods, which is what the description prints.
# With the ratio, $value would be a fraction such as 0.5.
- alert: Pod可用率
  expr: >-
    kube_deployment_status_replicas - kube_deployment_status_replicas_available
    > 0
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: pod可用率低于100%
    description: >-
      集群名称:储能-ems-cn 命名空间:{{$labels.namespace}} Deployment:{{$labels.deployment}}
      Pod可用率小于100%, (当前不可用Pod数: {{ $value }})
# Abnormal Pod phase (warning): min_over_time over 10 minutes is > 0 only when
# every sample of kube_pod_status_phase for an Unknown/Failed/Pending phase
# was non-zero in that window. The 5-minute `for` clause is applied on top of
# that window before the alert fires.
- alert: Pod状态异常
  expr: >-
    sum(min_over_time(kube_pod_status_phase{phase=~"Unknown|Failed|Pending"}[10m]))
    by (namespace,pod,phase,instance) > 0
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: pod状态异常,请检查
    description: >-
      集群名称:储能-ems-cn 命名空间:{{$labels.namespace}} Pod:{{$labels.pod}}
      处于{{$labels.phase}}状态持续超过10分钟
# Frequent container restarts (warning): fires when
# kube_pod_container_status_restarts_total increased by more than 3 over the
# last 10 minutes and that stays true for 1 minute.
- alert: Pod频繁重启
  expr: >-
    increase(kube_pod_container_status_restarts_total[10m]) > 3
  for: 1m
  labels:
    severity: warning
  annotations:
    summary: pod频繁重启,请检查
    description: >-
      集群名称:储能-ems-cn 命名空间:{{$labels.namespace}} Pod:{{$labels.pod}}
      10分钟内重启超过3次
# Rule group "k8s-node": node level alerts (readiness, CPU, memory, disk).
# CPU, memory and disk each have a warning rule and a critical rule with a
# higher threshold.
- name: k8s-node
rules:
# Node readiness (critical): fires when the kube_node_status_condition series
# for condition="Ready", status="true" is anything other than 1 for 1 minute.
- alert: node_节点就绪状态
  expr: >-
    kube_node_status_condition{status='true',condition='Ready'} != 1
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: node节点就绪状态异常
    description: >-
      储能-ems-cn集群 node节点{{$labels.node}}未就绪,请检查
# Node CPU usage (warning): per-mode rate of node_cpu_seconds_total averaged
# across CPUs, summed over every mode except "idle", as a percentage.
# Fires above 85% for 5 minutes.
- alert: node节点cpu使用率过高
  expr: >-
    sum without(mode) (avg without (cpu)
    (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[2m])))
    * 100 > 85
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: node节点cpu使用率过高
    description: >-
      集群名称:储能-ems-cn node名称:{{$labels.instance}} cpu使用率超过85%,当前值:{{ printf "%.0f" $value }}%
# Node CPU usage (critical): same expression as the warning-level node CPU
# rule, with the threshold raised to 90%. Fires after 5 minutes.
- alert: node_节点cpu使用率过高
  expr: >-
    sum without(mode) (avg without (cpu)
    (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[2m])))
    * 100 > 90
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: node节点cpu使用率过高
    description: >-
      储能-ems-cn集群 node节点{{$labels.instance}}cpu使用率超过90%,当前值:{{ printf "%.0f" $value }}%
# Node memory usage (warning): 100 minus the percentage of
# node_memory_MemAvailable_bytes relative to node_memory_MemTotal_bytes.
# Fires above 90% for 5 minutes.
- alert: node节点内存使用率过高
  expr: >-
    100 - (node_memory_MemAvailable_bytes{job="node-exporter"}
    / node_memory_MemTotal_bytes{job="node-exporter"}
    * 100) > 90
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: node节点内存使用率过高
    description: >-
      集群名称:储能-ems-cn node名称:{{$labels.instance}} 内存使用率超过90%,当前值:{{ printf "%.0f" $value }}%
# Node memory usage (critical): same expression as the warning-level node
# memory rule, with the threshold raised to 95% and a 10-minute hold time.
- alert: node_节点内存使用率过高
  expr: >-
    100 - (node_memory_MemAvailable_bytes{job="node-exporter"}
    / node_memory_MemTotal_bytes{job="node-exporter"}
    * 100) > 95
  for: 10m
  labels:
    severity: critical
  annotations:
    summary: node节点内存使用率过高
    description: >-
      储能-ems-cn集群 node节点{{$labels.instance}}内存使用率超过95%,当前值:{{ printf "%.0f" $value }}%
# Node disk usage (warning): 100 minus the percentage of
# node_filesystem_avail_bytes relative to node_filesystem_size_bytes, for the
# "/" and "/var/lib/kubelet" mountpoints, skipping devices matching "rootfs".
# Fires above 85% for 5 minutes.
- alert: node节点磁盘使用率过高
  expr: >-
    100 - ((node_filesystem_avail_bytes{device!~'rootfs', mountpoint=~'/|(/var/lib/kubelet)'}
    * 100)
    / node_filesystem_size_bytes{device!~'rootfs',mountpoint=~'/|(/var/lib/kubelet)'})
    > 85
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: node节点磁盘使用率过高
    description: >-
      集群名称:储能-ems-cn node名称:{{$labels.instance}} 分区:{{$labels.mountpoint}}
      使用率超过85%,当前值:{{ printf "%.0f" $value }}%
# Node disk usage (critical): same expression as the warning-level node disk
# rule, with the threshold raised to 90% and a 10-minute hold time.
# The description text states 90% so that it agrees with the `> 90`
# threshold in the expression (it previously said 80%).
- alert: node_节点磁盘使用率过高
  expr: >-
    100 - ((node_filesystem_avail_bytes{device!~'rootfs', mountpoint=~'/|(/var/lib/kubelet)'}
    * 100)
    / node_filesystem_size_bytes{device!~'rootfs',mountpoint=~'/|(/var/lib/kubelet)'})
    > 90
  for: 10m
  labels:
    severity: critical
  annotations:
    summary: node节点磁盘使用率过高
    description: >-
      储能-ems-cn集群 node节点{{$labels.instance}} 分区:{{$labels.mountpoint}}
      使用率超过90%,当前值:{{ printf "%.0f" $value }}%