From ad43a8b493d10962313f6aafa081d641a55fb3d2 Mon Sep 17 00:00:00 2001 From: Muhammad Shahzeb Date: Tue, 20 Aug 2024 05:07:42 +0500 Subject: [PATCH 1/6] Add vars for grouping --- postgres_mixin/alerts/postgres.libsonnet | 42 +++++++++++++----------- postgres_mixin/config.libsonnet | 3 ++ 2 files changed, 25 insertions(+), 20 deletions(-) diff --git a/postgres_mixin/alerts/postgres.libsonnet b/postgres_mixin/alerts/postgres.libsonnet index 16f11fe09..ba624429c 100644 --- a/postgres_mixin/alerts/postgres.libsonnet +++ b/postgres_mixin/alerts/postgres.libsonnet @@ -11,12 +11,12 @@ summary: 'Postgres connections count is over the maximum amount.', }, expr: ||| - sum by (instance) (pg_stat_activity_count{%(postgresExporterSelector)s}) + sum by (%(agg)s) (pg_stat_activity_count{%(postgresExporterSelector)s}) >= sum by (instance) (pg_settings_max_connections{%(postgresExporterSelector)s}) - - sum by (instance) (pg_settings_superuser_reserved_connections{%(postgresExporterSelector)s}) - ||| % $._config, + sum by (%(agg)s) (pg_settings_superuser_reserved_connections{%(postgresExporterSelector)s}) + ||| % $._config { agg: std.join(',', $._config.groupLabels + $._config.instanceLabels) }, 'for': '1m', labels: { severity: 'warning', @@ -29,14 +29,14 @@ summary: 'Postgres connections count is over 80% of maximum amount.', }, expr: ||| - sum by (instance) (pg_stat_activity_count{%(postgresExporterSelector)s}) + sum by (%(agg)s) (pg_stat_activity_count{%(postgresExporterSelector)s}) > ( - sum by (instance) (pg_settings_max_connections{%(postgresExporterSelector)s}) + sum by (%(agg)s) (pg_settings_max_connections{%(postgresExporterSelector)s}) - - sum by (instance) (pg_settings_superuser_reserved_connections{%(postgresExporterSelector)s}) + sum by (%(agg)s) (pg_settings_superuser_reserved_connections{%(postgresExporterSelector)s}) ) * 0.8 - ||| % $._config, + ||| % $._config { agg: std.join(',', $._config.groupLabels + $._config.instanceLabels) }, 'for': '10m', labels: { severity: 'warning', @@ -61,12 +61,12 @@ summary: 'PostgreSQL high number of slow queries.', }, expr: ||| - avg by (datname) ( + avg by (%(agg)s) ( rate ( pg_stat_activity_max_tx_duration{%(dbNameFilter)s,%(postgresExporterSelector)s}[2m] ) ) > 2 * 60 - ||| % $._config, + ||| % $._config { agg: std.join(',', $._config.groupLabels + $._config.instanceLabels) }, 'for': '2m', labels: { severity: 'warning', @@ -79,7 +79,7 @@ summary: 'PostgreSQL high number of queries per second.', }, expr: ||| - avg by (datname) ( + avg by (datname, %(agg)s) ( irate( pg_stat_database_xact_commit{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m] ) @@ -88,7 +88,7 @@ pg_stat_database_xact_rollback{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m] ) ) > 10000 - ||| % $._config, + ||| % $._config { agg: std.join(',', $._config.groupLabels + $._config.instanceLabels) }, 'for': '5m', labels: { severity: 'warning', @@ -101,7 +101,7 @@ summary: 'PostgreSQL low cache hit rate.', }, expr: ||| - avg by (datname) ( + avg by (datname, %(agg)s) ( rate(pg_stat_database_blks_hit{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m]) / ( @@ -114,7 +114,7 @@ ) ) ) < 0.98 - ||| % $._config, + ||| % $._config { agg: std.join(',', $._config.groupLabels + $._config.instanceLabels) }, 'for': '5m', labels: { severity: 'warning', @@ -157,9 +157,9 @@ summary: 'PostgreSQL has high number of acquired locks.', }, expr: ||| - max by( server, job, datname, namespace) ((pg_locks_count{%(dbNameFilter)s}) / - on(instance, namespace) group_left(server) (pg_settings_max_locks_per_transaction{} * pg_settings_max_connections{})) > 0.20 - ||| % $._config, + max by(datname, %(agg)s) ((pg_locks_count{%(dbNameFilter)s}) / + on(%(agg)s) group_left(server) (pg_settings_max_locks_per_transaction{} * pg_settings_max_connections{})) > 0.20 + ||| % $._config { agg: std.join(',', $._config.groupLabels + $._config.instanceLabels) }, 'for': '5m', labels: { severity: 'warning', @@ -171,7 +171,9 @@ description: '{{ $labels.instance }} replication lag exceeds 1 hour. Check for network issues or load imbalances.', summary: 'PostgreSQL replication lagging more than 1 hour.', }, - expr: '(pg_replication_lag{} > 3600) and on (instance) (pg_replication_is_replica{} == 1)', + expr: ||| + (pg_replication_lag{} > 3600) and on (%(agg)s) (pg_replication_is_replica{} == 1)' + ||| % $._config { agg: std.join(',', $._config.groupLabels + $._config.instanceLabels) }, 'for': '5m', labels: { severity: 'warning', @@ -223,12 +225,12 @@ timestamp( pg_stat_user_tables_n_dead_tup{} > pg_stat_user_tables_n_live_tup{} - * on(namespace, job, service, instance, server) group_left pg_settings_autovacuum_vacuum_scale_factor{} - + on(namespace, job, service, instance, server) group_left pg_settings_autovacuum_vacuum_threshold{} + * on(%(agg)s) group_left pg_settings_autovacuum_vacuum_scale_factor{} + + on(%(agg)s) group_left pg_settings_autovacuum_vacuum_threshold{} ) < time() - 36000 ) - |||, + ||| % $._config { agg: std.join(',', $._config.groupLabels + $._config.instanceLabels) }, 'for': '30m', labels: { severity: 'critical', diff --git a/postgres_mixin/config.libsonnet b/postgres_mixin/config.libsonnet index 9d2d3cfd8..4ea3b7e52 100644 --- a/postgres_mixin/config.libsonnet +++ b/postgres_mixin/config.libsonnet @@ -2,5 +2,8 @@ _config+:: { dbNameFilter: 'datname!~"template.*"', postgresExporterSelector: '', + groupLabels: if self.enableMultiCluster then ['job', 'cluster'] else ['job'], + instanceLabels: ['instance', 'server'], + enableMultiCluster: false, }, } From f069d64302156e0577bb1b3a7c5e943b659deb59 Mon Sep 17 00:00:00 2001 From: Muhammad Shahzeb Date: Tue, 20 Aug 2024 05:17:34 +0500 Subject: [PATCH 2/6] Fixes --- postgres_mixin/alerts/postgres.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/postgres_mixin/alerts/postgres.libsonnet b/postgres_mixin/alerts/postgres.libsonnet index ba624429c..ca2d468a7 100644 --- a/postgres_mixin/alerts/postgres.libsonnet +++ b/postgres_mixin/alerts/postgres.libsonnet @@ -7,7 +7,7 @@ { alert: 'PostgreSQLMaxConnectionsReached', annotations: { - description: '{{ $labels.instance }} is exceeding the currently configured maximum Postgres connection limit (current value: {{ $value }}s). Services may be degraded - please take immediate action (you probably need to increase max_connections in the Docker image and re-deploy.', + description: '{{ $labels.instance }} is exceeding the currently configured maximum Postgres connection limit (current value: {{ $value }}s). Services may be degraded - please take immediate action (you probably need to increase max_connections in the Docker image and re-deploy).', summary: 'Postgres connections count is over the maximum amount.', }, expr: ||| @@ -172,7 +172,7 @@ summary: 'PostgreSQL replication lagging more than 1 hour.', }, expr: ||| - (pg_replication_lag{} > 3600) and on (%(agg)s) (pg_replication_is_replica{} == 1)' + (pg_replication_lag{} > 3600) and on (%(agg)s) (pg_replication_is_replica{} == 1) ||| % $._config { agg: std.join(',', $._config.groupLabels + $._config.instanceLabels) }, 'for': '5m', labels: { From 517da2c6119496be78e3444a91cd66fd1ec0d048 Mon Sep 17 00:00:00 2001 From: Muhammad Shahzeb Date: Tue, 20 Aug 2024 05:26:37 +0500 Subject: [PATCH 3/6] Minor updates --- postgres_mixin/alerts/postgres.libsonnet | 30 ++++++++++++------------ postgres_mixin/config.libsonnet | 2 +- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/postgres_mixin/alerts/postgres.libsonnet b/postgres_mixin/alerts/postgres.libsonnet index ca2d468a7..92ef0b327 100644 --- a/postgres_mixin/alerts/postgres.libsonnet +++ b/postgres_mixin/alerts/postgres.libsonnet @@ -13,10 +13,10 @@ expr: ||| sum by (%(agg)s) (pg_stat_activity_count{%(postgresExporterSelector)s}) >= - sum by (instance) (pg_settings_max_connections{%(postgresExporterSelector)s}) + sum by (%(agg)s) (pg_settings_max_connections{%(postgresExporterSelector)s}) - sum by (%(agg)s) (pg_settings_superuser_reserved_connections{%(postgresExporterSelector)s}) - ||| % $._config { agg: std.join(',', $._config.groupLabels + $._config.instanceLabels) }, + ||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels) }, 'for': '1m', labels: { severity: 'warning', @@ -36,7 +36,7 @@ - sum by (%(agg)s) (pg_settings_superuser_reserved_connections{%(postgresExporterSelector)s}) ) * 0.8 - ||| % $._config { agg: std.join(',', $._config.groupLabels + $._config.instanceLabels) }, + ||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels) }, 'for': '10m', labels: { severity: 'warning', @@ -63,10 +63,10 @@ expr: ||| avg by (%(agg)s) ( rate ( - pg_stat_activity_max_tx_duration{%(dbNameFilter)s,%(postgresExporterSelector)s}[2m] + pg_stat_activity_max_tx_duration{%(dbNameFilter)s, %(postgresExporterSelector)s}[2m] ) ) > 2 * 60 - ||| % $._config { agg: std.join(',', $._config.groupLabels + $._config.instanceLabels) }, + ||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels) }, 'for': '2m', labels: { severity: 'warning', @@ -81,14 +81,14 @@ expr: ||| avg by (datname, %(agg)s) ( irate( - pg_stat_database_xact_commit{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m] + pg_stat_database_xact_commit{%(dbNameFilter)s, %(postgresExporterSelector)s}[5m] ) + irate( - pg_stat_database_xact_rollback{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m] + pg_stat_database_xact_rollback{%(dbNameFilter)s, %(postgresExporterSelector)s}[5m] ) ) > 10000 - ||| % $._config { agg: std.join(',', $._config.groupLabels + $._config.instanceLabels) }, + ||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels) }, 'for': '5m', labels: { severity: 'warning', @@ -102,19 +102,19 @@ }, expr: ||| avg by (datname, %(agg)s) ( - rate(pg_stat_database_blks_hit{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m]) + rate(pg_stat_database_blks_hit{%(dbNameFilter)s, %(postgresExporterSelector)s}[5m]) / ( rate( - pg_stat_database_blks_hit{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m] + pg_stat_database_blks_hit{%(dbNameFilter)s, %(postgresExporterSelector)s}[5m] ) + rate( - pg_stat_database_blks_read{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m] + pg_stat_database_blks_read{%(dbNameFilter)s, %(postgresExporterSelector)s}[5m] ) ) ) < 0.98 - ||| % $._config { agg: std.join(',', $._config.groupLabels + $._config.instanceLabels) }, + ||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels) }, 'for': '5m', labels: { severity: 'warning', @@ -159,7 +159,7 @@ expr: ||| max by(datname, %(agg)s) ((pg_locks_count{%(dbNameFilter)s}) / on(%(agg)s) group_left(server) (pg_settings_max_locks_per_transaction{} * pg_settings_max_connections{})) > 0.20 - ||| % $._config { agg: std.join(',', $._config.groupLabels + $._config.instanceLabels) }, + ||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels) }, 'for': '5m', labels: { severity: 'warning', @@ -173,7 +173,7 @@ }, expr: ||| (pg_replication_lag{} > 3600) and on (%(agg)s) (pg_replication_is_replica{} == 1) - ||| % $._config { agg: std.join(',', $._config.groupLabels + $._config.instanceLabels) }, + ||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels) }, 'for': '5m', labels: { severity: 'warning', @@ -230,7 +230,7 @@ ) < time() - 36000 ) - ||| % $._config { agg: std.join(',', $._config.groupLabels + $._config.instanceLabels) }, + ||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels) }, 'for': '30m', labels: { severity: 'critical', diff --git a/postgres_mixin/config.libsonnet b/postgres_mixin/config.libsonnet index 4ea3b7e52..d44830f87 100644 --- a/postgres_mixin/config.libsonnet +++ b/postgres_mixin/config.libsonnet @@ -1,7 +1,7 @@ { _config+:: { dbNameFilter: 'datname!~"template.*"', - postgresExporterSelector: '', + postgresExporterSelector: 'job="integrations/postgres_exporter"', groupLabels: if self.enableMultiCluster then ['job', 'cluster'] else ['job'], instanceLabels: ['instance', 'server'], enableMultiCluster: false, From 528cdb6b5844e498b10e688c72b2f41b0d017b47 Mon Sep 17 00:00:00 2001 From: Muhammad Shahzeb Date: Tue, 20 Aug 2024 05:39:30 +0500 Subject: [PATCH 4/6] Fix PostgresAcquiredTooManyLocks --- postgres_mixin/alerts/postgres.libsonnet | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/postgres_mixin/alerts/postgres.libsonnet b/postgres_mixin/alerts/postgres.libsonnet index 92ef0b327..2df5cec7f 100644 --- a/postgres_mixin/alerts/postgres.libsonnet +++ b/postgres_mixin/alerts/postgres.libsonnet @@ -157,9 +157,14 @@ summary: 'PostgreSQL has high number of acquired locks.', }, expr: ||| - max by(datname, %(agg)s) ((pg_locks_count{%(dbNameFilter)s}) / - on(%(agg)s) group_left(server) (pg_settings_max_locks_per_transaction{} * pg_settings_max_connections{})) > 0.20 - ||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels) }, + max by(datname, %(agg)s) ( + (pg_locks_count{%(dbNameFilter)s}) + / + on(%(aggWithoutServer)s) group_left(server) ( + pg_settings_max_locks_per_transaction{} * pg_settings_max_connections{} + ) + ) > 0.20 + ||| % $._config { agg: std.join(',', $._config.groupLabels + $._config.instanceLabels), aggWithoutServer: std.join(',', std.filter(function(x) x != "server", $._config.groupLabels + $._config.instanceLabels)) }, 'for': '5m', labels: { severity: 'warning', From 4fe5f5760528db772d4bbf0b02f9e58eb0c14e58 Mon Sep 17 00:00:00 2001 From: Muhammad Shahzeb Date: Tue, 20 Aug 2024 13:34:04 +0500 Subject: [PATCH 5/6] Update dashboard name --- postgres_mixin/dashboards/dashboards.libsonnet | 2 +- .../{postgres-overview.json => postgresql-overview.json} | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename postgres_mixin/dashboards/{postgres-overview.json => postgresql-overview.json} (99%) diff --git a/postgres_mixin/dashboards/dashboards.libsonnet b/postgres_mixin/dashboards/dashboards.libsonnet index d55f1ef5c..35413ce7f 100644 --- a/postgres_mixin/dashboards/dashboards.libsonnet +++ b/postgres_mixin/dashboards/dashboards.libsonnet @@ -1,5 +1,5 @@ { grafanaDashboards+:: { - 'postgres-overview.json': (import 'postgres-overview.json'), + 'postgresql-overview.json': (import 'postgresql-overview.json'), }, } diff --git a/postgres_mixin/dashboards/postgres-overview.json b/postgres_mixin/dashboards/postgresql-overview.json similarity index 99% rename from postgres_mixin/dashboards/postgres-overview.json rename to postgres_mixin/dashboards/postgresql-overview.json index 88d742d00..8baf6fbb3 100644 --- a/postgres_mixin/dashboards/postgres-overview.json +++ b/postgres_mixin/dashboards/postgresql-overview.json @@ -45,7 +45,7 @@ "repeatRowId": null, "showTitle": true, "span": 4, - "title": "Postgres Overview", + "title": "PostgreSQL overview", "titleSize": "h6", "type": "row" }, From 330253569cdb2804ceac4db1f4fa55f7ddcb7d1c Mon Sep 17 00:00:00 2001 From: Muhammad Shahzeb Date: Wed, 21 Aug 2024 14:34:58 +0500 Subject: [PATCH 6/6] Add datname in alert group by --- postgres_mixin/alerts/postgres.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/postgres_mixin/alerts/postgres.libsonnet b/postgres_mixin/alerts/postgres.libsonnet index 2df5cec7f..015e06a9d 100644 --- a/postgres_mixin/alerts/postgres.libsonnet +++ b/postgres_mixin/alerts/postgres.libsonnet @@ -61,7 +61,7 @@ summary: 'PostgreSQL high number of slow queries.', }, expr: ||| - avg by (%(agg)s) ( + avg by (datname, %(agg)s) ( rate ( pg_stat_activity_max_tx_duration{%(dbNameFilter)s, %(postgresExporterSelector)s}[2m] )