-
Notifications
You must be signed in to change notification settings - Fork 1
Add alerts #21
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add alerts #21
Changes from 3 commits
3a5d179
5103808
a295184
69af704
1a2bb63
44ecfb7
cc5c1f4
2d554de
5324eaa
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -120,6 +120,207 @@ | |
| severity: 'warning', | ||
| }, | ||
| }, | ||
| { | ||
| // Warning rule: fires when rollbacks / (commits + rollbacks), computed from 5m rates and averaged with the pod and instance labels removed, stays above 0.10 for 5m. | ||
| // Series whose db_name is empty or starts with "template" are excluded by the selectors. | ||
| // NOTE(review): the description reads $labels.cluster and $labels.datname, while the selectors filter on db_name - confirm those labels exist on the result. | ||
| // NOTE(review): the text block contains no format placeholders, so `% $._config` appears to substitute nothing - confirm. | ||
| alert: 'PostgresHasTooManyRollbacks', | ||
| annotations: { | ||
| description: 'PostgreSQL has too many rollbacks on {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }}', | ||
| summary: 'PostgreSQL has too many rollbacks.', | ||
| }, | ||
| expr: ||| | ||
| avg without(pod, instance) | ||
| (rate(pg_stat_database_xact_rollback{db_name!~"template.*|^$"}[5m]) / | ||
Dasomeone marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| (rate(pg_stat_database_xact_commit{db_name!~"template.*|^$"}[5m])+ rate(pg_stat_database_xact_rollback{db_name!~"template.*|^$"}[5m]))) > 0.10 | ||
|
||
| ||| % $._config, | ||
| 'for': '5m', | ||
| labels: { | ||
| severity: 'warning', | ||
| }, | ||
| }, | ||
| { | ||
| // Warning rule: takes the 5m rate of pg_stat_database_deadlocks, multiplies it by 60 (per-second rate scaled to per-minute), keeps the max with pod and instance removed, and fires when that stays above 5 for 5m. | ||
| // Series whose db_name is empty or starts with "template" are excluded by the selector. | ||
| // NOTE(review): the description reads $labels.cluster and $labels.datname, while the selector filters on db_name - confirm those labels exist on the result. | ||
| alert: 'PostgresHasHighDeadLocks', | ||
| annotations: { | ||
| description: 'PostgreSQL has too high deadlocks on {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }}', | ||
| summary: 'PostgreSQL has high number of deadlocks.', | ||
| }, | ||
| expr: ||| | ||
| max without(pod, instance) (rate(pg_stat_database_deadlocks{db_name!~"template.*|^$"}[5m]) * 60) > 5 | ||
Dasomeone marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| ||| % $._config, | ||
| 'for': '5m', | ||
| labels: { | ||
| severity: 'warning', | ||
| }, | ||
| }, | ||
| { | ||
| // Warning rule: fires when pg_locks_count divided by (pg_settings_max_locks_per_transaction * pg_settings_max_connections) stays above 0.20 for 5m. | ||
| // The settings product is joined on (instance, asserts_env, asserts_site, namespace) and contributes its server label via group_left. | ||
| // The result is aggregated with max by(server, job, db_name, asserts_env, asserts_site, namespace), so only those labels | ||
| // survive; the description therefore references server and db_name rather than labels the aggregation discards. | ||
| alert: 'PostgresAcquiredTooManyLocks', | ||
| annotations: { | ||
| description: 'PostgreSQL has acquired too many locks on {{ $labels.server }} for database {{ $labels.db_name }} with a value of {{ $value }}', | ||
| summary: 'PostgreSQL has high number of acquired locks.', | ||
| }, | ||
| expr: ||| | ||
| max by( server, job, db_name, asserts_env, asserts_site, namespace) ((pg_locks_count{db_name!~"template.*|^$"}) / | ||
| on(instance, asserts_env, asserts_site, namespace) group_left(server) (pg_settings_max_locks_per_transaction{} * pg_settings_max_connections{})) > 0.20 | ||
| ||| % $._config, | ||
| 'for': '5m', | ||
| labels: { | ||
| severity: 'warning', | ||
| }, | ||
| }, | ||
| { | ||
| // Critical rule: fires when the 5m rate of pg_xlog_position_bytes (series with a non-empty asserts_env) stays below 200000 for 5m. | ||
| // NOTE(review): unlike PostgresXLOGConsumptionVeryHigh, this expression has no pg_replication_is_replica filter - confirm it is meant to apply to replicas too. | ||
| alert: 'PostgresXLOGConsumptionVeryLow', | ||
| annotations: { | ||
| description: 'PostgreSQL instance {{ $labels.instance }} has a very low XLOG consumption rate.', | ||
| summary: 'PostgreSQL XLOG consumption is very low.', | ||
| }, | ||
| expr: 'rate(pg_xlog_position_bytes{asserts_env!=""}[5m]) < 200000', | ||
Dasomeone marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| 'for': '5m', | ||
| labels: { | ||
| severity: 'critical', | ||
| }, | ||
| }, | ||
| { | ||
| // Critical rule: fires when the 2m rate of pg_xlog_position_bytes stays above 36700160 (35 * 1024 * 1024) for 10m. | ||
| // The `and on (instance, asserts_env, asserts_site)` clause keeps only series that have a matching pg_replication_is_replica sample equal to 0. | ||
| alert: 'PostgresXLOGConsumptionVeryHigh', | ||
| annotations: { | ||
| description: '{{ $labels.instance }} is experiencing very high XLOG consumption rate, which might indicate excessive write operations.', | ||
| summary: 'PostgreSQL very high XLOG consumption rate.', | ||
| }, | ||
| expr: 'rate(pg_xlog_position_bytes{asserts_env!=""}[2m]) > 36700160 and on (instance, asserts_env, asserts_site) (pg_replication_is_replica{asserts_env!=""} == 0)', | ||
Dasomeone marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| 'for': '10m', | ||
| labels: { | ||
| severity: 'critical', | ||
| }, | ||
| }, | ||
| { | ||
| // Critical rule: fires when pg_stat_replication_pg_xlog_location_diff (series with a non-empty asserts_env) is non-zero for 5m. | ||
| // NOTE(review): the expression tests for any non-zero diff, while the annotations say replication has stopped - confirm this condition matches the intent. | ||
| alert: 'PostgresReplicationStopped', | ||
| annotations: { | ||
| description: 'PostgreSQL instance {{ $labels.instance }} has stopped replication.', | ||
| summary: 'PostgreSQL replication has stopped.', | ||
| }, | ||
| expr: 'pg_stat_replication_pg_xlog_location_diff{asserts_env!=""} != 0', | ||
| 'for': '5m', | ||
| labels: { | ||
| severity: 'critical', | ||
| }, | ||
| }, | ||
| { | ||
| // Warning rule: fires when pg_replication_lag stays above 3600 for 5m on an instance that also reports pg_replication_is_replica == 1. | ||
| // The two sides are matched on the instance label only. | ||
| alert: 'PostgresReplicationLaggingMore1Hour', | ||
| annotations: { | ||
| description: '{{ $labels.instance }} replication lag exceeds 1 hour. Check for network issues or load imbalances.', | ||
| summary: 'PostgreSQL replication lagging more than 1 hour.', | ||
| }, | ||
| expr: '(pg_replication_lag{asserts_env!=""} > 3600) and on (instance) (pg_replication_is_replica{asserts_env!=""} == 1)', | ||
| 'for': '5m', | ||
| labels: { | ||
| severity: 'warning', | ||
| }, | ||
| }, | ||
| { | ||
| // Critical rule: subtracts pg_xlog_position_bytes of replica series (is_replica == 1) from that of non-replica series (is_replica == 0) | ||
| // and fires when the difference stays above 1e+09 for 5m. | ||
| // The subtraction matches on (job, service, asserts_env, asserts_site) with group_right(instance), i.e. the replica side is the "many" side. | ||
| // NOTE(review): group_right(instance) names a label that the right-hand series already carry - confirm the resulting instance label is the one the description should show. | ||
| alert: 'PostgresReplicationLagBytesAreTooLarge', | ||
| annotations: { | ||
| description: '{{ $labels.instance }} replication lag in bytes is too large, which might indicate replication issues or network bottlenecks.', | ||
| summary: 'PostgreSQL replication lag in bytes too large.', | ||
| }, | ||
| expr: '(pg_xlog_position_bytes{asserts_env!=""} and pg_replication_is_replica{asserts_env!=""} == 0) - on (job, service, asserts_env, asserts_site) group_right(instance) (pg_xlog_position_bytes{asserts_env!=""} and pg_replication_is_replica{asserts_env!=""} == 1) > 1e+09', | ||
| 'for': '5m', | ||
| labels: { | ||
| severity: 'critical', | ||
| }, | ||
| }, | ||
| { | ||
| // Critical rule: fires when pg_replication_slots_active (series with a non-empty asserts_env) equals 0 for 30m. | ||
| // NOTE(review): the alert name says "SlotUsed", but the expression and both annotations describe slots that are NOT in use - consider renaming before this ships. | ||
| alert: 'PostgresHasReplicationSlotUsed', | ||
| annotations: { | ||
| description: '{{ $labels.instance }} has replication slots that are not used, which might lead to replication lag or data inconsistency.', | ||
| summary: 'PostgreSQL has unused replication slots.', | ||
| }, | ||
| expr: 'pg_replication_slots_active{asserts_env!=""} == 0', | ||
| 'for': '30m', | ||
| labels: { | ||
| severity: 'critical', | ||
| }, | ||
| }, | ||
| { | ||
| // Critical rule: fires when pg_replication_slots_xmin_age stays above 20000 for 30m. | ||
| // Only slots whose slot_name matches ^repmgr_slot_[0-9]+ (the prefix "repmgr_slot_" followed by digits) are considered. | ||
| alert: 'PostgresReplicationIsStale', | ||
| annotations: { | ||
| description: '{{ $labels.instance }} replication slots have not been updated for a significant period, indicating potential issues with replication.', | ||
| summary: 'PostgreSQL replication slots are stale.', | ||
| }, | ||
| expr: 'pg_replication_slots_xmin_age{asserts_env!="", slot_name =~ "^repmgr_slot_[0-9]+"} > 20000', | ||
| 'for': '30m', | ||
| labels: { | ||
| severity: 'critical', | ||
| }, | ||
| }, | ||
| { | ||
| // Warning rule: fires for any pg_replication_is_replica series whose value changed at least once within the last 1m. | ||
| // No 'for' field is set on this rule, so no pending duration is configured here. | ||
| alert: 'PostgresReplicationRoleChanged', | ||
| annotations: { | ||
| description: '{{ $labels.instance }} replication role has changed. Verify if this is expected or if it indicates a failover.', | ||
| summary: 'PostgreSQL replication role change detected.', | ||
| }, | ||
| expr: 'pg_replication_is_replica{asserts_env!=""} and changes(pg_replication_is_replica{asserts_env!=""}[1m]) > 0', | ||
| labels: { | ||
| severity: 'warning', | ||
| }, | ||
| }, | ||
| { | ||
| // Critical rule: fires when pg_exporter_last_scrape_error (series with a non-empty asserts_env) stays above 0 for 30m. | ||
| alert: 'PostgresHasExporterErrors', | ||
| annotations: { | ||
| description: '{{ $labels.instance }} exporter is experiencing errors. Verify exporter health and configuration.', | ||
| summary: 'PostgreSQL exporter errors detected.', | ||
| }, | ||
| expr: 'pg_exporter_last_scrape_error{asserts_env!=""} > 0', | ||
| 'for': '30m', | ||
| labels: { | ||
| severity: 'critical', | ||
| }, | ||
| }, | ||
| { | ||
| // Warning rule: sums dead and live tuple counts across tables (relname removed) and fires when, for 5m, | ||
| // the dead-tuple sum is above 10000 and dead / (live + dead) is at least 0.1. | ||
| // The `> 0` filter on the denominator drops series where live + dead is zero before dividing. | ||
| // Series whose db_name is empty or starts with "template" are excluded. | ||
| // The trailing `unless on(instance, asserts_env, asserts_site)` removes results for instances reporting pg_replication_is_replica == 1. | ||
| alert: 'PostgresHasTooManyDeadTuples', | ||
| annotations: { | ||
| description: '{{ $labels.instance }} has too many dead tuples, which may lead to inefficient query performance. Consider vacuuming the database.', | ||
| summary: 'PostgreSQL has too many dead tuples.', | ||
| }, | ||
| expr: '(sum without(relname) (pg_stat_user_tables_n_dead_tup{asserts_env!="", db_name!~"template.*|^$"}) > 10000) / ((sum without(relname) (pg_stat_user_tables_n_live_tup{asserts_env!="", db_name!~"template.*|^$"}) + sum without(relname)(pg_stat_user_tables_n_dead_tup{asserts_env!="", db_name!~"template.*|^$"})) > 0) >= 0.1 unless on(instance, asserts_env, asserts_site) (pg_replication_is_replica{asserts_env!=""} == 1)', | ||
| 'for': '5m', | ||
| labels: { | ||
| severity: 'warning', | ||
| }, | ||
| }, | ||
| { | ||
| // Critical rule: for tables whose dead tuples exceed live_tup * autovacuum_vacuum_scale_factor + autovacuum_vacuum_threshold, | ||
| // fires when the sample timestamp minus pg_stat_user_tables_last_autovacuum stays above 36000 seconds (10 hours) for 30m. | ||
| // The age comparison sits inside group(...), mirroring PostgresTableNotAnalyzed: group() always yields 1, so comparing its | ||
| // output against time() - 36000 would be true for every series and the 10-hour condition would never be evaluated. | ||
| // The aggregation removes pod and instance, so the description identifies the table by relname and db_name instead. | ||
| // NOTE(review): assumes pg_stat_user_tables_last_autovacuum is exported with the same labels as the other pg_stat_user_tables_* series - confirm. | ||
| // NOTE(review): the alert name misspells "Vacuumed"; it is left unchanged here because routing or silences may already reference it. | ||
| alert: 'PostgresTablesNotVaccumed', | ||
| annotations: { | ||
| description: 'Table {{ $labels.relname }} in database {{ $labels.db_name }} has not been vacuumed recently, which may lead to performance degradation.', | ||
| summary: 'PostgreSQL tables not vacuumed.', | ||
| }, | ||
| expr: 'group without(pod, instance)(timestamp(pg_stat_user_tables_n_dead_tup{asserts_env!=""} > pg_stat_user_tables_n_live_tup{asserts_env!=""} * on(asserts_env, asserts_site, namespace, job, service, instance, server) group_left pg_settings_autovacuum_vacuum_scale_factor{asserts_env!=""} + on(asserts_env, asserts_site, namespace, job, service, instance, server) group_left pg_settings_autovacuum_vacuum_threshold{asserts_env!=""}) - pg_stat_user_tables_last_autovacuum{asserts_env!=""} > 36000)', | ||
| 'for': '30m', | ||
| labels: { | ||
| severity: 'critical', | ||
| }, | ||
| }, | ||
| { | ||
| // Warning rule: for tables whose dead tuples exceed live_tup * autovacuum_analyze_scale_factor + autovacuum_analyze_threshold, | ||
| // fires when the sample timestamp minus pg_stat_user_tables_last_autoanalyze exceeds 24 * 60 * 60 seconds. | ||
| // The result is aggregated with group without(pod, instance), which removes the instance label, so the description | ||
| // identifies the table by relname and db_name instead of an instance value that would render empty. | ||
| // No 'for' field is set on this rule, so no pending duration is configured here. | ||
| alert: 'PostgresTableNotAnalyzed', | ||
| annotations: { | ||
| description: 'Table {{ $labels.relname }} in database {{ $labels.db_name }} has not been analyzed recently, which might lead to inefficient query planning.', | ||
| summary: 'PostgreSQL table not analyzed.', | ||
| }, | ||
| expr: '\n group without(pod, instance)(\n timestamp(\n pg_stat_user_tables_n_dead_tup{asserts_env!=""} >\n pg_stat_user_tables_n_live_tup{asserts_env!=""}\n * on(asserts_env, asserts_site, namespace, job, service, instance, server) group_left pg_settings_autovacuum_analyze_scale_factor{asserts_env!=""}\n + on(asserts_env, asserts_site, namespace, job, service, instance, server) group_left pg_settings_autovacuum_analyze_threshold{asserts_env!=""}\n )\n -\n pg_stat_user_tables_last_autoanalyze{asserts_env!=""}\n > 24 * 60 * 60\n )', | ||
| labels: { | ||
| severity: 'warning', | ||
| asserts_entity_type: 'DataSource', | ||
| asserts_alert_category: 'failure', | ||
| }, | ||
| }, | ||
| { | ||
| // Warning rule: computes timed / (timed + requested) from the 5m rates of the bgwriter checkpoint counters | ||
| // and fires when that fraction stays below 0.5 for 5m, i.e. when requested checkpoints outnumber timed ones. | ||
| alert: 'PostgresTooManyCheckpointsRequested', | ||
| annotations: { | ||
| description: '{{ $labels.instance }} is requesting too many checkpoints, which may lead to performance degradation.', | ||
| summary: 'PostgreSQL too many checkpoints requested.', | ||
| }, | ||
| expr: '\n rate(pg_stat_bgwriter_checkpoints_timed_total{asserts_env!=""}[5m]) /\n (rate(pg_stat_bgwriter_checkpoints_timed_total{asserts_env!=""}[5m]) + rate(pg_stat_bgwriter_checkpoints_req_total{asserts_env!=""}[5m]))\n < 0.5', | ||
| 'for': '5m', | ||
| labels: { | ||
| severity: 'warning', | ||
| }, | ||
| }, | ||
| ], | ||
| }, | ||
| ], | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,4 +2,4 @@ | |
| grafanaDashboards+:: { | ||
| 'postgres-overview.json': (import 'postgres-overview.json'), | ||
| }, | ||
| } | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.