From e5c23f60114dd0753ed55a0edb508d3ab3f85d6b Mon Sep 17 00:00:00 2001 From: Vincent Mercier Date: Wed, 22 Nov 2023 18:11:18 +0100 Subject: [PATCH] feat(postgresql): Add Prometheus metric tests --- .../prometheus_tests/rules.yml | 122 ------------------ 1 file changed, 122 deletions(-) delete mode 100644 charts/prometheus-postgresql-alerts/prometheus_tests/rules.yml diff --git a/charts/prometheus-postgresql-alerts/prometheus_tests/rules.yml b/charts/prometheus-postgresql-alerts/prometheus_tests/rules.yml deleted file mode 100644 index 09259b6..0000000 --- a/charts/prometheus-postgresql-alerts/prometheus_tests/rules.yml +++ /dev/null @@ -1,122 +0,0 @@ ---- -# Source: prometheus-postgresql-alerts-chart/templates/prometheusConfiguration.yaml -groups: - - name: postgresql.rules - rules: - - alert: "PostgreSQLExporterDown" - expr: | - up{} * on (instance) postgres_exporter_build_info{} < 1 - for: 5m - labels: - severity: critical - annotations: - description: '{{ $labels.instance }} exporter is down' - runbook_url: https://qonto.github.io/database-monitoring-framework/0.0.0/runbooks/postgresql/PostgreSQLExporterDown - summary: Exporter is down - - alert: "PostgreSQLExporterErrors" - expr: | - max by (job) (last_over_time(pg_exporter_last_scrape_error[10m])) > 0 - for: 5m - labels: - severity: critical - annotations: - description: '{{ $labels.job }} is reporting scraping errors. Some metrics are not - collected anymore' - runbook_url: https://qonto.github.io/database-monitoring-framework/0.0.0/runbooks/postgresql/PostgreSQLExporterErrors - summary: Exporter is reporting scraping errors - - alert: "PostgreSQLExporterMissingScrapeErrorMetric" - expr: | - absent(pg_exporter_last_scrape_error) - for: 5m - labels: - severity: critical - annotations: - description: PostgreSQL exporter last scrape error metric is missing. Either the exporter - is down or some metrics are not collected anymore - runbook_url: https://qonto.github.io/database-monitoring-framework/0.0.0/runbooks/postgresql/PostgreSQLExporterMissingScrapeErrorMetric - summary: PostgreSQL exporter last scrape error metric is missing - - alert: "PostgreSQLExporterScrapingLimit" - expr: | - avg_over_time(pg_exporter_last_scrape_duration_seconds{}[10m]) > 30 - for: 5m - labels: - severity: warning - annotations: - description: '{{ $labels.instance }} scraping take long time' - runbook_url: https://qonto.github.io/database-monitoring-framework/0.0.0/runbooks/postgresql/PostgreSQLExporterScrapingLimit - summary: Exporter scraping take long time - - alert: "PostgreSQLInactiveLogicalReplicationSlot" - expr: | - max by (server, slot_name) (pg_replication_slots_active{slot_type="logical"}) < 1 - for: 10m - labels: - severity: warning - annotations: - description: '{{ $labels.slot_name }} on {{ $labels.server | stripDomain | stripPort - }} is inactive' - runbook_url: https://qonto.github.io/database-monitoring-framework/0.0.0/runbooks/postgresql/PostgreSQLInactiveLogicalReplicationSlot - summary: Logical replication slot is inactive - - alert: "PostgreSQLInactivePhysicalReplicationSlot" - expr: | - max by (server, slot_name) (pg_replication_slots_active{slot_type="physical"}) < 1 - for: 10m - labels: - severity: warning - annotations: - description: '{{ $labels.slot_name }} on {{ $labels.server | stripDomain | stripPort - }} is inactive' - runbook_url: https://qonto.github.io/database-monitoring-framework/0.0.0/runbooks/postgresql/PostgreSQLInactivePhysicalReplicationSlot - summary: Physical replication slot is inactive - - alert: "PostgreSQLInvalidIndex" - # pint disable promql/series - expr: | - max by (cluster, datname, relname, indexrelname) (pg_stat_user_indexes_idx_blks_hit{indisvalid="false"}) >= 0 - for: 1h - labels: - severity: warning - annotations: - description: '{{ $labels.indexrelname }} of {{ $labels.relname }} table on {{ $labels.datname - }} database on {{ $labels.cluster }} is invalid' - runbook_url: https://qonto.github.io/database-monitoring-framework/0.0.0/runbooks/postgresql/PostgreSQLInvalidIndex - summary: '{{ $labels.indexrelname }} is invalid' - - alert: "PostgreSQLLongRunningQuery" - # pint disable promql/series - expr: | - max by (server, datname, usename, pid) (pg_active_backend_duration_minutes{usename!=""}) > 30 - for: 1m - labels: - severity: warning - annotations: - description: '{{ $labels.usename }} is running a long query on {{ $labels.datname - }} of {{ $labels.server | stripDomain | stripPort }} with pid {{ $labels.pid }}' - runbook_url: https://qonto.github.io/database-monitoring-framework/0.0.0/runbooks/postgresql/PostgreSQLLongRunningQuery - summary: Long running query on {{ $labels.datname }} of {{ $labels.server | stripDomain - | stripPort }} - - alert: "PostgreSQLMaxConnections" - expr: | - max by (server) (pg_stat_connections_count) - * 100 - / max by (server) (pg_settings_max_connections{}) - > 80 - - for: 10m - labels: - severity: warning - annotations: - description: '{{ $labels.server | stripDomain | stripPort }} uses {{ printf "%.2g" - $value }}% of the maximum database connections' - runbook_url: https://qonto.github.io/database-monitoring-framework/0.0.0/runbooks/postgresql/PostgreSQLMaxConnections - summary: '{{ $labels.server | stripDomain | stripPort }} is close from the maximum - database connections' - - alert: "PostgreSQLReplicationSlotStorageLimit" - expr: | - max by (server, slot_name) (pg_replication_slots_available_storage_percent{}) < 20 - for: 5m - labels: - severity: warning - annotations: - description: '{{ $labels.slot_name }} has {{ printf "%.2g" $value }}% free disk storage - space' - runbook_url: https://qonto.github.io/database-monitoring-framework/0.0.0/runbooks/postgresql/PostgreSQLReplicationSlotStorageLimit - summary: '{{ $labels.slot_name }} on {{ $labels.server | stripDomain | stripPort }} - is close to its storage limit'