Skip to content

Commit

Permalink
chore(metrics): Aggregate metrics by labels
Browse files Browse the repository at this point in the history
  • Loading branch information
vmercierfr committed Nov 21, 2023
1 parent a9c14f4 commit 4007238
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 25 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ jobs:
group: "pages" # Allow one concurrent deployment
cancel-in-progress: true
env:
HUGO_VERSION: 0.115.4
HUGO_VERSION: 0.120.4
BASE_URL: https://qonto.github.io/database-monitoring-framework
environment:
name: github-pages
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,5 @@ Go to <https://qonto.github.io/database-monitoring-framework/latest/> for more i
## Contributing

The project is open to contribution. See [CONTRIBUTING.md](CONTRIBUTING.md)

To report a security issue, please visit [SECURITY.md](SECURITY.md)
20 changes: 12 additions & 8 deletions charts/prometheus-postgresql-alerts/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ rules:
description: "PostgreSQL exporter last scrape error metric is missing. Either the exporter is down or some metrics are not collected anymore"

PostgreSQLExporterErrors:
expr: max(last_over_time(pg_exporter_last_scrape_error[10m])) by (job) > 0
expr: max by (job) (last_over_time(pg_exporter_last_scrape_error[10m])) > 0
for: 5m
labels:
severity: critical
Expand All @@ -43,7 +43,11 @@ rules:
description: "{{ $labels.instance }} scraping takes a long time"

PostgreSQLMaxConnections:
expr: sum(pg_stat_connections_count) by (server) * 100 / on (server) pg_settings_max_connections{} > 80
expr: |
max by (server) (pg_stat_connections_count)
* 100
/ max by (server) (pg_settings_max_connections{})
> 80
for: 10m
labels:
severity: warning
Expand All @@ -52,16 +56,16 @@ rules:
description: '{{ $labels.server }} uses {{ printf "%.2g" $value }}% of database connections'

PostgreSQLReplicationSlotStorageLimit:
expr: pg_replication_slots_available_storage_percent{} < 20
expr: max by (server, slot_name) (pg_replication_slots_available_storage_percent{}) < 20
for: 5m
labels:
severity: warning
annotations:
summary: "{{ $labels.slot_name }} on {{ $labels.dbidentifier }} is close to its storage limit"
summary: "{{ $labels.slot_name }} on {{ $labels.server | stripDomain | stripPort }} is close to its storage limit"
description: '{{ $labels.slot_name }} has {{ printf "%.2g" $value }}% of its storage limit available'

PostgreSQLInactiveLogicalReplicationSlot:
expr: max(pg_replication_slots_active{slot_type="logical"} < 1) by (server, slot_name)
expr: max by (server, slot_name) (pg_replication_slots_active{slot_type="logical"}) < 1
for: 10m
labels:
severity: warning
Expand All @@ -70,7 +74,7 @@ rules:
description: "{{ $labels.slot_name }} on {{ $labels.server | stripDomain | stripPort }} is inactive"

PostgreSQLInactivePhysicalReplicationSlot:
expr: max(pg_replication_slots_active{slot_type="physical"} < 1) by (server, slot_name)
expr: max by (server, slot_name) (pg_replication_slots_active{slot_type="physical"}) < 1
for: 10m
labels:
severity: warning
Expand All @@ -79,7 +83,7 @@ rules:
description: "{{ $labels.slot_name }} on {{ $labels.server | stripDomain | stripPort }} is inactive"

PostgreSQLLongRunningQuery:
expr: pg_active_backend_duration_minutes{usename!=""} > 30
expr: max by (server, datname, usename) (pg_active_backend_duration_minutes{usename!=""}) > 30
for: 1m
labels:
severity: warning
Expand All @@ -90,7 +94,7 @@ rules:
- disable promql/series

PostgreSQLInvalidIndex:
expr: max(pg_stat_user_indexes_idx_blks_hit{indisvalid="false"}) by (cluster, datname, relname, indexrelname) >= 0
expr: max by (cluster, datname, relname, indexrelname) (pg_stat_user_indexes_idx_blks_hit{indisvalid="false"}) >= 0
for: 1h
labels:
severity: warning
Expand Down
38 changes: 22 additions & 16 deletions charts/prometheus-rds-alerts/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ rules:
description: "{{ $labels.instance }} is reporting {{ $value }} errors per minute"

RDSDiskSpaceLimit:
expr: 100 - max(100 - rds_free_storage_bytes{} * 100 / rds_allocated_storage_bytes{}) by (dbidentifier) < 10
expr: max by (aws_account_id, aws_region, dbidentifier) (rds_free_storage_bytes{} * 100 / rds_allocated_storage_bytes{}) < 10
for: 15m
labels:
severity: warning
Expand All @@ -45,7 +45,7 @@ rules:
description: '{{ $labels.dbidentifier }} has {{ printf "%.2g" $value }}% free disk space'

RDSDiskSpacePrediction:
expr: predict_linear(rds_free_storage_bytes{}[30m], 3600 * 4) < 1
expr: predict_linear(min(rds_free_storage_bytes{}) by (aws_account_id, aws_region, dbidentifier)[30m:], 3600 * 4) < 1
for: 15m
labels:
severity: critical
Expand All @@ -54,7 +54,7 @@ rules:
description: "{{ $labels.dbidentifier }} will run out of disk space in 4 hours"

RDSPostgreSQLMaximumUsedTransaction:
expr: rds_maximum_used_transaction_ids_average > (2^32) * 0.5 # 50% of the max transactions limit
expr: max by (aws_account_id, aws_region, dbidentifier) (rds_maximum_used_transaction_ids_average) > (2^32) * 0.5 # 50% of the max transactions limit
for: 5m
labels:
severity: critical
Expand All @@ -63,7 +63,7 @@ rules:
description: "{{ $labels.dbidentifier }} is using {{ $value }} transaction IDs out of the 4 billion hard limit"

RDSCPUUtilization:
expr: rds_cpu_usage_percent_average > 85
expr: max by (aws_account_id, aws_region, dbidentifier) (rds_cpu_usage_percent_average) > 85
for: 10m
labels:
severity: warning
Expand All @@ -72,7 +72,12 @@ rules:
description: '{{ $labels.dbidentifier }} has {{ printf "%.2g" $value }}% CPU used'

RDSNonCPUUtilization:
expr: rds_dbload_noncpu_average > on(dbidentifier) max(rds_instance_vcpu_average{}) by (instance_class) * on (instance_class) group_right() max(rds_instance_info{}) by (dbidentifier, instance_class) * 4
expr: |
max by (aws_account_id, aws_region, dbidentifier) (rds_dbload_noncpu_average)
> on(aws_account_id, aws_region, dbidentifier) (
4 *
max by (instance_class) (rds_instance_vcpu_average{}) * on (instance_class) group_right() max by (aws_account_id, aws_region, dbidentifier, instance_class) (rds_instance_info{})
)
for: 10m
labels:
severity: critical
Expand All @@ -82,11 +87,12 @@ rules:

RDSMemoryUtilization:
expr: |
100 - (
(max(rds_freeable_memory_bytes{}) by (dbidentifier))
* 100
/ on(dbidentifier) (max(rds_instance_memory_bytes{}) by (instance_class) * on (instance_class) group_right() max(rds_instance_info{}) by (dbidentifier, instance_class))
) > 80
max by (aws_account_id, aws_region, dbidentifier) (rds_freeable_memory_bytes{})
* 100
/ on(aws_account_id, aws_region, dbidentifier) (
max by (instance_class) (rds_instance_memory_bytes{}) * on (instance_class) group_right() max by (aws_account_id, aws_region, dbidentifier, instance_class) (rds_instance_info{})
)
< 20
for: 10m
labels:
severity: warning
Expand All @@ -95,16 +101,16 @@ rules:
description: '{{ $labels.dbidentifier }} has {{ printf "%.2g" $value }}% of its max memory free'

RDSSwapUtilization:
expr: delta(rds_swap_usage_bytes{}[1h]) / 1024 / 1024 >= 20
expr: max by (aws_account_id, aws_region, dbidentifier) (delta(rds_swap_usage_bytes{}[1h])) / 1024 / 1024 >= 20
for: 2m
labels:
severity: warning
annotations:
summary: "{{ $labels.dbidentifier }} SWAP utilization is high"
description: "{{ $labels.dbidentifier }} use {{ $value }}MB of SWAP"
description: "{{ $labels.dbidentifier }} SWAP usage increased by {{ $value }} MB in the last hour"

RDSIOPSUtilization:
expr: (rds_read_iops_average{} + rds_write_iops_average{}) * 100 / rds_max_disk_iops_average{} > 80
expr: max by (aws_account_id, aws_region, dbidentifier) ((rds_read_iops_average{} + rds_write_iops_average{}) * 100 / rds_max_disk_iops_average{}) > 80
for: 10m
labels:
severity: warning
Expand All @@ -113,7 +119,7 @@ rules:
description: '{{ $labels.dbidentifier }} uses {{ printf "%.2g" $value }}% of its disk IOPS'

RDSReplicationLag:
expr: rds_replica_lag_seconds{} > 60
expr: max by (aws_account_id, aws_region, dbidentifier) (rds_replica_lag_seconds{}) > 60
for: 5m
labels:
severity: warning
Expand Down Expand Up @@ -156,7 +162,7 @@ rules:
description: 'Using {{ printf "%.2g" $value }}% of allowed RDS storage in {{ $labels.aws_account_id}}:{{ $labels.aws_region }}'

RDSUnappliedParameters:
expr: rds_instance_info{pending_modified_values="true"} > 0
expr: max by (aws_account_id, aws_region, dbidentifier) (rds_instance_info{pending_modified_values="true"}) > 0
for: 1h
labels:
severity: warning
Expand All @@ -167,7 +173,7 @@ rules:
- disable promql/series

RDSForcedMaintenance:
expr: rds_instance_info{pending_maintenance=~"auto-applied|forced"} > 0
expr: max by (aws_account_id, aws_region, dbidentifier) (rds_instance_info{pending_maintenance=~"auto-applied|forced"}) > 0
for: 1h
labels:
severity: warning
Expand Down

0 comments on commit 4007238

Please sign in to comment.