chore(metrics): Aggregate metrics by labels

qonto · Nov 21, 2023 · 4007238 · 4007238
1 parent a9c14f4
commit 4007238
Show file tree

Hide file tree

Showing 4 changed files with 37 additions and 25 deletions.
diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
@@ -25,7 +25,7 @@ jobs:
       group: "pages" # Allow one concurrent deployment
       cancel-in-progress: true
     env:
-      HUGO_VERSION: 0.115.4
+      HUGO_VERSION: 0.120.4
       BASE_URL: https://qonto.github.io/database-monitoring-framework
     environment:
       name: github-pages

diff --git a/README.md b/README.md
@@ -13,3 +13,5 @@ Go to <https://qonto.github.io/database-monitoring-framework/latest/> for more i
 ## Contributing
 
 The project is open to contribution. See [CONTRIBUTING.md](CONTRIBUTING.md)
+
+To report a security issue, please visit [SECURITY.md](SECURITY.md)
diff --git a/charts/prometheus-postgresql-alerts/values.yaml b/charts/prometheus-postgresql-alerts/values.yaml
@@ -25,7 +25,7 @@ rules:
       description: "PostgreSQL exporter last scrape error metric is missing. Either the exporter is down or some metrics are not collected anymore"
 
   PostgreSQLExporterErrors:
-    expr: max(last_over_time(pg_exporter_last_scrape_error[10m])) by (job) > 0
+    expr: max by (job) (last_over_time(pg_exporter_last_scrape_error[10m])) > 0
     for: 5m
     labels:
       severity: critical
@@ -43,7 +43,11 @@ rules:
       description: "{{ $labels.instance }} scraping take long time"
 
   PostgreSQLMaxConnections:
-    expr: sum(pg_stat_connections_count) by (server) * 100 / on (server) pg_settings_max_connections{} > 80
+    expr: |
+      max by (server) (pg_stat_connections_count)
+      * 100
+      / max by (server) (pg_settings_max_connections{})
+      > 80
     for: 10m
     labels:
       severity: warning
@@ -52,16 +56,16 @@ rules:
       description: '{{ $labels.server }} uses {{ printf "%.2g" $value }}% of database connections'
 
   PostgreSQLReplicationSlotStorageLimit:
-    expr: pg_replication_slots_available_storage_percent{} < 20
+    expr: max by (server, slot_name) (pg_replication_slots_available_storage_percent{}) < 20
     for: 5m
     labels:
       severity: warning
     annotations:
-      summary: "{{ $labels.slot_name }} on {{ $labels.dbidentifier }} is close to its storage limit"
+      summary: "{{ $labels.slot_name }} on {{ $labels.server | stripDomain | stripPort }} is close to its storage limit"
       description: '{{ $labels.slot_name }} uses {{ printf "%.2g" $value }}% of its storage limit'
 
   PostgreSQLInactiveLogicalReplicationSlot:
-    expr: max(pg_replication_slots_active{slot_type="logical"} < 1) by (server, slot_name)
+    expr: max by (server, slot_name) (pg_replication_slots_active{slot_type="logical"}) < 1
     for: 10m
     labels:
       severity: warning
@@ -70,7 +74,7 @@ rules:
       description: "{{ $labels.slot_name }} on {{ $labels.server | stripDomain | stripPort }} is inactive"
 
   PostgreSQLInactivePhysicalReplicationSlot:
-    expr: max(pg_replication_slots_active{slot_type="physical"} < 1) by (server, slot_name)
+    expr: max by (server, slot_name) (pg_replication_slots_active{slot_type="physical"}) < 1
     for: 10m
     labels:
       severity: warning
@@ -79,7 +83,7 @@ rules:
       description: "{{ $labels.slot_name }} on {{ $labels.server | stripDomain | stripPort }} is inactive"
 
   PostgreSQLLongRunningQuery:
-    expr: pg_active_backend_duration_minutes{usename!=""} > 30
+    expr: max by (server, datname, usename) (pg_active_backend_duration_minutes{usename!=""}) > 30
     for: 1m
     labels:
       severity: warning
@@ -90,7 +94,7 @@ rules:
       - disable promql/series
 
   PostgreSQLInvalidIndex:
-    expr: max(pg_stat_user_indexes_idx_blks_hit{indisvalid="false"}) by (cluster, datname, relname, indexrelname) >= 0
+    expr: max by (cluster, datname, relname, indexrelname) (pg_stat_user_indexes_idx_blks_hit{indisvalid="false"}) >= 0
     for: 1h
     labels:
       severity: warning

diff --git a/charts/prometheus-rds-alerts/values.yaml b/charts/prometheus-rds-alerts/values.yaml
@@ -36,7 +36,7 @@ rules:
       description: "{{ $labels.instance }} is reporting {{ $value }} errors per minute"
 
   RDSDiskSpaceLimit:
-    expr: 100 - max(100 - rds_free_storage_bytes{} * 100 / rds_allocated_storage_bytes{}) by (dbidentifier) < 10
+    expr: max by (aws_account_id, aws_region, dbidentifier) (rds_free_storage_bytes{} * 100 / rds_allocated_storage_bytes{}) < 10
     for: 15m
     labels:
       severity: warning
@@ -45,7 +45,7 @@ rules:
       description: '{{ $labels.dbidentifier }} has {{ printf "%.2g" $value }}% free disk space'
 
   RDSDiskSpacePrediction:
-    expr: predict_linear(rds_free_storage_bytes{}[30m], 3600 * 4) < 1
+    expr: predict_linear(min(rds_free_storage_bytes{}) by (aws_account_id, aws_region, dbidentifier)[30m:], 3600 * 4) < 1
     for: 15m
     labels:
       severity: critical
@@ -54,7 +54,7 @@ rules:
       description: "{{ $labels.dbidentifier }} will run out of disk space in 4 hours"
 
   RDSPostgreSQLMaximumUsedTransaction:
-    expr: rds_maximum_used_transaction_ids_average > (2^32) * 0.5 # 50% of the max transactions limit
+    expr: max by (aws_account_id, aws_region, dbidentifier) (rds_maximum_used_transaction_ids_average) > (2^32) * 0.5 # 50% of the max transactions limit
     for: 5m
     labels:
       severity: critical
@@ -63,7 +63,7 @@ rules:
       description: "{{ $labels.dbidentifier }} is using {{ $value }} transaction IDs on 4 billions hard limit"
 
   RDSCPUUtilization:
-    expr: rds_cpu_usage_percent_average > 85
+    expr: max by (aws_account_id, aws_region, dbidentifier) (rds_cpu_usage_percent_average) > 85
     for: 10m
     labels:
       severity: warning
@@ -72,7 +72,12 @@ rules:
       description: '{{ $labels.dbidentifier }} has {{ printf "%.2g" $value }}% CPU used'
 
   RDSNonCPUUtilization:
-    expr: rds_dbload_noncpu_average > on(dbidentifier) max(rds_instance_vcpu_average{}) by (instance_class) * on (instance_class) group_right() max(rds_instance_info{}) by (dbidentifier, instance_class) * 4
+    expr: |
+      max by (aws_account_id, aws_region, dbidentifier) (rds_dbload_noncpu_average)
+      > on(aws_account_id, aws_region, dbidentifier) (
+          4 *
+          max by (instance_class) (rds_instance_vcpu_average{}) * on (instance_class) group_right() max by (aws_account_id, aws_region, dbidentifier, instance_class) (rds_instance_info{})
+      )
     for: 10m
     labels:
       severity: critical
@@ -82,11 +87,12 @@ rules:
 
   RDSMemoryUtilization:
     expr: |
-      100 - (
-          (max(rds_freeable_memory_bytes{}) by (dbidentifier))
-          * 100
-          / on(dbidentifier) (max(rds_instance_memory_bytes{}) by (instance_class) * on (instance_class) group_right() max(rds_instance_info{}) by (dbidentifier, instance_class))
-      ) > 80
+      max by (aws_account_id, aws_region, dbidentifier) (rds_freeable_memory_bytes{})
+      * 100
+      / on(aws_account_id, aws_region, dbidentifier) (
+          max by (instance_class) (rds_instance_memory_bytes{}) * on (instance_class) group_right() max by (aws_account_id, aws_region, dbidentifier, instance_class) (rds_instance_info{})
+      )
+      < 20
     for: 10m
     labels:
       severity: warning
@@ -95,16 +101,16 @@ rules:
       description: '{{ $labels.dbidentifier }} used {{ printf "%.2g" $value }}% of its max memory'
 
   RDSSwapUtilization:
-    expr: delta(rds_swap_usage_bytes{}[1h]) / 1024 / 1024 >= 20
+    expr: max by (aws_account_id, aws_region, dbidentifier) (delta(rds_swap_usage_bytes{}[1h])) / 1024 / 1024 >= 20
     for: 2m
     labels:
       severity: warning
     annotations:
       summary: "{{ $labels.dbidentifier }} SWAP utilization is high"
-      description: "{{ $labels.dbidentifier }} use {{ $value }}MB of SWAP"
+      description: "{{ $labels.dbidentifier }} use {{ $value }} MB of SWAP"
 
   RDSIOPSUtilization:
-    expr: (rds_read_iops_average{} + rds_write_iops_average{}) * 100 / rds_max_disk_iops_average{} > 80
+    expr: max by (aws_account_id, aws_region, dbidentifier) ((rds_read_iops_average{} + rds_write_iops_average{}) * 100 / rds_max_disk_iops_average{}) > 80
     for: 10m
     labels:
       severity: warning
@@ -113,7 +119,7 @@ rules:
       description: '{{ $labels.dbidentifier }} uses {{ printf "%.2g" $value }}% of its disk IOPS'
 
   RDSReplicationLag:
-    expr: rds_replica_lag_seconds{} > 60
+    expr: max by (aws_account_id, aws_region, dbidentifier) (rds_replica_lag_seconds{}) > 60
     for: 5m
     labels:
       severity: warning
@@ -156,7 +162,7 @@ rules:
       description: 'Using {{ printf "%.2g" $value }}% of allowed RDS storage in {{ $labels.aws_account_id}}:{{ $labels.aws_region }}'
 
   RDSUnappliedParameters:
-    expr: rds_instance_info{pending_modified_values="true"} > 0
+    expr: max by (aws_account_id, aws_region, dbidentifier) (rds_instance_info{pending_modified_values="true"}) > 0
     for: 1h
     labels:
       severity: warning
@@ -167,7 +173,7 @@ rules:
       - disable promql/series
 
   RDSForcedMaintenance:
-    expr: rds_instance_info{pending_maintenance=~"auto-applied|forced"} > 0
+    expr: max by (aws_account_id, aws_region, dbidentifier) (rds_instance_info{pending_maintenance=~"auto-applied|forced"}) > 0
     for: 1h
     labels:
       severity: warning
Rendered diff view of README.md:
@@ -13,3 +13,5 @@ Go to <https://qonto.github.io/database-monitoring-framework/latest/> for more i
 ## Contributing
 
 The project is open to contribution. See [CONTRIBUTING.md](CONTRIBUTING.md)
+
+To report a security issue, please visit [SECURITY.md](SECURITY.md)