From 72f93e8a4173a84ddf952f3b36a72277c46a6d38 Mon Sep 17 00:00:00 2001
From: Vincent Mercier <vmercier@gmail.com>
Date: Tue, 21 Nov 2023 12:18:52 +0100
Subject: [PATCH 1/2] chore(metrics): Aggregate metrics by labels

---
 .../prometheus-postgresql-alerts/values.yaml  | 20 ++++++----
 charts/prometheus-rds-alerts/values.yaml      | 38 +++++++++++--------
 2 files changed, 34 insertions(+), 24 deletions(-)

diff --git a/charts/prometheus-postgresql-alerts/values.yaml b/charts/prometheus-postgresql-alerts/values.yaml
index b75ae57..85099b5 100644
--- a/charts/prometheus-postgresql-alerts/values.yaml
+++ b/charts/prometheus-postgresql-alerts/values.yaml
@@ -25,7 +25,7 @@ rules:
       description: "PostgreSQL exporter last scrape error metric is missing. Either the exporter is down or some metrics are not collected anymore"
 
   PostgreSQLExporterErrors:
-    expr: max(last_over_time(pg_exporter_last_scrape_error[10m])) by (job) > 0
+    expr: max by (job) (last_over_time(pg_exporter_last_scrape_error[10m])) > 0
     for: 5m
     labels:
       severity: critical
@@ -43,7 +43,11 @@ rules:
       description: "{{ $labels.instance }} scraping take long time"
 
   PostgreSQLMaxConnections:
-    expr: sum(pg_stat_connections_count) by (server) * 100 / on (server) pg_settings_max_connections{} > 80
+    expr: |
+      max by (server) (pg_stat_connections_count)
+      * 100
+      / max by (server) (pg_settings_max_connections{})
+      > 80
     for: 10m
     labels:
       severity: warning
@@ -52,16 +56,16 @@ rules:
       description: '{{ $labels.server }} uses {{ printf "%.2g" $value }}% of database connections'
 
   PostgreSQLReplicationSlotStorageLimit:
-    expr: pg_replication_slots_available_storage_percent{} < 20
+    expr: max by (server, slot_name) (pg_replication_slots_available_storage_percent{}) < 20
     for: 5m
     labels:
       severity: warning
     annotations:
-      summary: "{{ $labels.slot_name }} on {{ $labels.dbidentifier }} is close to its storage limit"
+      summary: "{{ $labels.slot_name }} on {{ $labels.server | stripDomain | stripPort }} is close to its storage limit"
       description: '{{ $labels.slot_name }} uses {{ printf "%.2g" $value }}% of its storage limit'
 
   PostgreSQLInactiveLogicalReplicationSlot:
-    expr: max(pg_replication_slots_active{slot_type="logical"} < 1) by (server, slot_name)
+    expr: max by (server, slot_name) (pg_replication_slots_active{slot_type="logical"}) < 1
     for: 10m
     labels:
       severity: warning
@@ -70,7 +74,7 @@ rules:
       description: "{{ $labels.slot_name }} on {{ $labels.server | stripDomain | stripPort }} is inactive"
 
   PostgreSQLInactivePhysicalReplicationSlot:
-    expr: max(pg_replication_slots_active{slot_type="physical"} < 1) by (server, slot_name)
+    expr: max by (server, slot_name) (pg_replication_slots_active{slot_type="physical"}) < 1
     for: 10m
     labels:
       severity: warning
@@ -79,7 +83,7 @@ rules:
       description: "{{ $labels.slot_name }} on {{ $labels.server | stripDomain | stripPort }} is inactive"
 
   PostgreSQLLongRunningQuery:
-    expr: pg_active_backend_duration_minutes{usename!=""} > 30
+    expr: max by (server, datname, usename) (pg_active_backend_duration_minutes{usename!=""}) > 30
     for: 1m
     labels:
       severity: warning
@@ -90,7 +94,7 @@ rules:
       - disable promql/series
 
   PostgreSQLInvalidIndex:
-    expr: max(pg_stat_user_indexes_idx_blks_hit{indisvalid="false"}) by (cluster, datname, relname, indexrelname) >= 0
+    expr: max by (cluster, datname, relname, indexrelname) (pg_stat_user_indexes_idx_blks_hit{indisvalid="false"}) >= 0
     for: 1h
     labels:
       severity: warning
diff --git a/charts/prometheus-rds-alerts/values.yaml b/charts/prometheus-rds-alerts/values.yaml
index 8f0c2b7..3e6776d 100644
--- a/charts/prometheus-rds-alerts/values.yaml
+++ b/charts/prometheus-rds-alerts/values.yaml
@@ -36,7 +36,7 @@ rules:
       description: "{{ $labels.instance }} is reporting {{ $value }} errors per minute"
 
   RDSDiskSpaceLimit:
-    expr: 100 - max(100 - rds_free_storage_bytes{} * 100 / rds_allocated_storage_bytes{}) by (dbidentifier) < 10
+    expr: min by (aws_account_id, aws_region, dbidentifier) (rds_free_storage_bytes{} * 100 / rds_allocated_storage_bytes{}) < 10
     for: 15m
     labels:
       severity: warning
@@ -45,7 +45,7 @@ rules:
       description: '{{ $labels.dbidentifier }} has {{ printf "%.2g" $value }}% free disk space'
 
   RDSDiskSpacePrediction:
-    expr: predict_linear(rds_free_storage_bytes{}[30m], 3600 * 4) < 1
+    expr: predict_linear(min by (aws_account_id, aws_region, dbidentifier) (rds_free_storage_bytes{})[30m:], 3600 * 4) < 1
     for: 15m
     labels:
       severity: critical
@@ -54,7 +54,7 @@ rules:
       description: "{{ $labels.dbidentifier }} will run out of disk space in 4 hours"
 
   RDSPostgreSQLMaximumUsedTransaction:
-    expr: rds_maximum_used_transaction_ids_average > (2^32) * 0.5 # 50% of the max transactions limit
+    expr: max by (aws_account_id, aws_region, dbidentifier) (rds_maximum_used_transaction_ids_average) > (2^32) * 0.5 # 50% of the max transactions limit
     for: 5m
     labels:
       severity: critical
@@ -63,7 +63,7 @@ rules:
       description: "{{ $labels.dbidentifier }} is using {{ $value }} transaction IDs on 4 billions hard limit"
 
   RDSCPUUtilization:
-    expr: rds_cpu_usage_percent_average > 85
+    expr: max by (aws_account_id, aws_region, dbidentifier) (rds_cpu_usage_percent_average) > 85
     for: 10m
     labels:
       severity: warning
@@ -72,7 +72,12 @@ rules:
       description: '{{ $labels.dbidentifier }} has {{ printf "%.2g" $value }}% CPU used'
 
   RDSNonCPUUtilization:
-    expr: rds_dbload_noncpu_average > on(dbidentifier) max(rds_instance_vcpu_average{}) by (instance_class) * on (instance_class) group_right() max(rds_instance_info{}) by (dbidentifier, instance_class) * 4
+    expr: |
+      max by (aws_account_id, aws_region, dbidentifier) (rds_dbload_noncpu_average)
+      > on(aws_account_id, aws_region, dbidentifier) (
+          4 *
+          max by (instance_class) (rds_instance_vcpu_average{}) * on (instance_class) group_right() max by (aws_account_id, aws_region, dbidentifier, instance_class) (rds_instance_info{})
+      )
     for: 10m
     labels:
       severity: critical
@@ -82,11 +87,12 @@ rules:
 
   RDSMemoryUtilization:
     expr: |
-      100 - (
-          (max(rds_freeable_memory_bytes{}) by (dbidentifier))
-          * 100
-          / on(dbidentifier) (max(rds_instance_memory_bytes{}) by (instance_class) * on (instance_class) group_right() max(rds_instance_info{}) by (dbidentifier, instance_class))
-      ) > 80
+      100 - (
+          max by (aws_account_id, aws_region, dbidentifier) (rds_freeable_memory_bytes{}) * 100
+          / on(aws_account_id, aws_region, dbidentifier) (
+              max by (instance_class) (rds_instance_memory_bytes{}) * on (instance_class) group_right() max by (aws_account_id, aws_region, dbidentifier, instance_class) (rds_instance_info{})
+          )
+      ) > 80
     for: 10m
     labels:
       severity: warning
@@ -95,16 +101,16 @@ rules:
       description: '{{ $labels.dbidentifier }} used {{ printf "%.2g" $value }}% of its max memory'
 
   RDSSwapUtilization:
-    expr: delta(rds_swap_usage_bytes{}[1h]) / 1024 / 1024 >= 20
+    expr: max by (aws_account_id, aws_region, dbidentifier) (delta(rds_swap_usage_bytes{}[1h])) / 1024 / 1024 >= 20
     for: 2m
     labels:
       severity: warning
     annotations:
       summary: "{{ $labels.dbidentifier }} SWAP utilization is high"
-      description: "{{ $labels.dbidentifier }} use {{ $value }}MB of SWAP"
+      description: "{{ $labels.dbidentifier }} use {{ $value }} MB of SWAP"
 
   RDSIOPSUtilization:
-    expr: (rds_read_iops_average{} + rds_write_iops_average{}) * 100 / rds_max_disk_iops_average{} > 80
+    expr: max by (aws_account_id, aws_region, dbidentifier) ((rds_read_iops_average{} + rds_write_iops_average{}) * 100 / rds_max_disk_iops_average{}) > 80
     for: 10m
     labels:
       severity: warning
@@ -113,7 +119,7 @@ rules:
       description: '{{ $labels.dbidentifier }} uses {{ printf "%.2g" $value }}% of its disk IOPS'
 
   RDSReplicationLag:
-    expr: rds_replica_lag_seconds{} > 300
+    expr: max by (aws_account_id, aws_region, dbidentifier) (rds_replica_lag_seconds{}) > 300
     for: 5m
     labels:
       severity: warning
@@ -156,7 +162,7 @@ rules:
       description: 'Using {{ printf "%.2g" $value }}% of allowed RDS storage in {{ $labels.aws_account_id}}:{{ $labels.aws_region }}'
 
   RDSUnappliedParameters:
-    expr: rds_instance_info{pending_modified_values="true"} > 0
+    expr: max by (aws_account_id, aws_region, dbidentifier) (rds_instance_info{pending_modified_values="true"}) > 0
     for: 1h
     labels:
       severity: warning
@@ -167,7 +173,7 @@ rules:
       - disable promql/series
 
   RDSForcedMaintenance:
-    expr: rds_instance_info{pending_maintenance=~"auto-applied|forced"} > 0
+    expr: max by (aws_account_id, aws_region, dbidentifier) (rds_instance_info{pending_maintenance=~"auto-applied|forced"}) > 0
     for: 1h
     labels:
       severity: warning

From ada8719643f8eadc4d8dcb539d0b028be418bffe Mon Sep 17 00:00:00 2001
From: Vincent Mercier <vmercier@gmail.com>
Date: Tue, 21 Nov 2023 22:02:29 +0100
Subject: [PATCH 2/2] chore(linter): Fix Github template

---
 .github/ISSUE_TEMPLATE/changes.md | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/.github/ISSUE_TEMPLATE/changes.md b/.github/ISSUE_TEMPLATE/changes.md
index b4cac27..80123f5 100644
--- a/.github/ISSUE_TEMPLATE/changes.md
+++ b/.github/ISSUE_TEMPLATE/changes.md
@@ -9,15 +9,11 @@ assignees: ''
 
 # Objective
 
-
-
 # Why
 
-
-
 # How
 
-- 
+-
 
 # Release plan