diff --git a/CHANGELOG.md b/CHANGELOG.md index 6bea7065fa3..1e3f954b8db 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -49,6 +49,7 @@ - `MimirContinuousTestNotRunningOnWrites` - `MimirContinuousTestNotRunningOnReads` - `MimirContinuousTestFailed` +* [ENHANCEMENT] Added `per_cluster_label` support to allow to change the label name used to differentiate between Kubernetes clusters. #1651 * [BUGFIX] Dashboards: Fix "Failed evaluation rate" panel on Tenants dashboard. #1629 ### Jsonnet diff --git a/docs/sources/operators-guide/visualizing-metrics/requirements.md b/docs/sources/operators-guide/visualizing-metrics/requirements.md index dbe82e44323..3eb9def1920 100644 --- a/docs/sources/operators-guide/visualizing-metrics/requirements.md +++ b/docs/sources/operators-guide/visualizing-metrics/requirements.md @@ -13,7 +13,7 @@ The following table shows the required label names and whether they can be custo | Label name | Configurable | Description | | :---------- | :----------- | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `cluster` | No | The Kubernetes cluster or datacenter where the Mimir cluster is running. | +| `cluster` | Yes | The Kubernetes cluster or datacenter where the Mimir cluster is running. The cluster label can be configured with the `per_cluster_label` field in the mixin config. | | `namespace` | No | The Kubernetes namespace where the Mimir cluster is running. | | `job` | Partially | The Kubernetes namespace and Mimir component in the format `/`. When running in monolithic mode, the `` should be `mimir`. When running in microservices mode, the `` should be the name of the specific Mimir component (singular), like `distributor`, `ingester` or `store-gateway`. The label name can't be configured, while the regular expressions used to match components can be configured with the `job_names` field in the mixin config. | | `pod` | Yes | The unique identifier of a Mimir replica (eg. Pod ID when running on Kubernetes). The label name can be configured with the `per_instance_label` field in the mixin config. | diff --git a/operations/mimir-mixin/alerts/alerts.libsonnet b/operations/mimir-mixin/alerts/alerts.libsonnet index 27bfd013097..db5fe341f61 100644 --- a/operations/mimir-mixin/alerts/alerts.libsonnet +++ b/operations/mimir-mixin/alerts/alerts.libsonnet @@ -435,7 +435,7 @@ alert: $.alertName('ProvisioningTooManyWrites'), // 80k writes / s per ingester max. expr: ||| - avg by (%(alert_aggregation_labels)s) (cluster_namespace_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m) > 80e3 + avg by (%(alert_aggregation_labels)s) (%(alert_aggregation_rule_prefix)s_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m) > 80e3 ||| % $._config, 'for': '15m', labels: { diff --git a/operations/mimir-mixin/alerts/blocks.libsonnet b/operations/mimir-mixin/alerts/blocks.libsonnet index e54776cda4c..1feba86fdae 100644 --- a/operations/mimir-mixin/alerts/blocks.libsonnet +++ b/operations/mimir-mixin/alerts/blocks.libsonnet @@ -14,13 +14,13 @@ (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 0) and # Only if the ingester has ingested samples over the last 4h. - (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(cluster_namespace_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0) + (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(alert_aggregation_rule_prefix)s_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0) and # Only if the ingester was ingesting samples 4h ago. This protects from the case the ingester instance # had ingested samples in the past, then no traffic was received for a long period and then it starts # receiving samples again. Without this check, the alert would fire as soon as it gets back receiving # samples, while the a block shipping is expected within the next 4h. - (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(cluster_namespace_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[1h] offset 4h)) > 0) + (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(alert_aggregation_rule_prefix)s_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[1h] offset 4h)) > 0) ||| % $._config, labels: { severity: 'critical', @@ -37,7 +37,7 @@ expr: ||| (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) == 0) and - (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(cluster_namespace_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0) + (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(alert_aggregation_rule_prefix)s_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0) ||| % $._config, labels: { severity: 'critical', diff --git a/operations/mimir-mixin/config.libsonnet b/operations/mimir-mixin/config.libsonnet index 936bd0f67c1..f212383cacd 100644 --- a/operations/mimir-mixin/config.libsonnet +++ b/operations/mimir-mixin/config.libsonnet @@ -35,9 +35,12 @@ overrides_exporter: 'overrides-exporter', }, + // The label used to differentiate between different Kubernetes clusters. + per_cluster_label: 'cluster', + // Grouping labels, to uniquely identify and group by {jobs, clusters} - job_labels: ['cluster', 'namespace', 'job'], - cluster_labels: ['cluster', 'namespace'], + job_labels: [$._config.per_cluster_label, 'namespace', 'job'], + cluster_labels: [$._config.per_cluster_label, 'namespace'], cortex_p99_latency_threshold_seconds: 2.5, diff --git a/operations/mimir-mixin/dashboards/alertmanager.libsonnet b/operations/mimir-mixin/dashboards/alertmanager.libsonnet index 015b9ef437d..e65c15efecd 100644 --- a/operations/mimir-mixin/dashboards/alertmanager.libsonnet +++ b/operations/mimir-mixin/dashboards/alertmanager.libsonnet @@ -11,11 +11,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; }) .addPanel( $.panel('Total alerts') + - $.statPanel('sum(cluster_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], format='short') + $.statPanel('sum(%s_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_cluster_label, $._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], format='short') ) .addPanel( $.panel('Total silences') + - $.statPanel('sum(cluster_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], format='short') + $.statPanel('sum(%s_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_cluster_label, $._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], format='short') ) .addPanel( $.panel('Tenants') + @@ -29,11 +29,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel( [ ||| - sum(cluster_job:cortex_alertmanager_alerts_received_total:rate5m{%s}) + sum(%s_job:cortex_alertmanager_alerts_received_total:rate5m{%s}) - - sum(cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m{%s}) - ||| % [$.jobMatcher($._config.job_names.alertmanager), $.jobMatcher($._config.job_names.alertmanager)], - 'sum(cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m{%s})' % $.jobMatcher($._config.job_names.alertmanager), + sum(%s_job:cortex_alertmanager_alerts_invalid_total:rate5m{%s}) + ||| % [$._config.per_cluster_label, $.jobMatcher($._config.job_names.alertmanager), $._config.per_cluster_label, $.jobMatcher($._config.job_names.alertmanager)], + 'sum(%s_job:cortex_alertmanager_alerts_invalid_total:rate5m{%s})' % [$._config.per_cluster_label, $.jobMatcher($._config.job_names.alertmanager)], ], ['success', 'failed'] ) @@ -46,11 +46,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel( [ ||| - sum(cluster_job_integration:cortex_alertmanager_notifications_total:rate5m{%s}) + sum(%s_job_integration:cortex_alertmanager_notifications_total:rate5m{%s}) - - sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s}) - ||| % [$.jobMatcher($._config.job_names.alertmanager), $.jobMatcher($._config.job_names.alertmanager)], - 'sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s})' % $.jobMatcher($._config.job_names.alertmanager), + sum(%s_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s}) + ||| % [$._config.per_cluster_label, $.jobMatcher($._config.job_names.alertmanager), $._config.per_cluster_label, $.jobMatcher($._config.job_names.alertmanager)], + 'sum(%s_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s})' % [$._config.per_cluster_label, $.jobMatcher($._config.job_names.alertmanager)], ], ['success', 'failed'] ) @@ -61,13 +61,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; [ ||| ( - sum(cluster_job_integration:cortex_alertmanager_notifications_total:rate5m{%s}) by(integration) + sum(%s_job_integration:cortex_alertmanager_notifications_total:rate5m{%s}) by(integration) - - sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s}) by(integration) + sum(%s_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s}) by(integration) ) > 0 or on () vector(0) - ||| % [$.jobMatcher($._config.job_names.alertmanager), $.jobMatcher($._config.job_names.alertmanager)], - 'sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s}) by(integration)' % $.jobMatcher($._config.job_names.alertmanager), + ||| % [$._config.per_cluster_label, $.jobMatcher($._config.job_names.alertmanager), $._config.per_cluster_label, $.jobMatcher($._config.job_names.alertmanager)], + 'sum(%s_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s}) by(integration)' % [$._config.per_cluster_label, $.jobMatcher($._config.job_names.alertmanager)], ], ['success - {{ integration }}', 'failed - {{ integration }}'] ) @@ -104,7 +104,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Per %s alerts' % $._config.per_instance_label) + $.queryPanel( - 'sum by(%s) (cluster_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_instance_label, $._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], + 'sum by(%s) (%s_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_instance_label, $._config.per_cluster_label, $._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], '{{%s}}' % $._config.per_instance_label ) + $.stack @@ -112,7 +112,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Per %s silences' % $._config.per_instance_label) + $.queryPanel( - 'sum by(%s) (cluster_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_instance_label, $._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], + 'sum by(%s) (%s_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_instance_label, $._config.per_cluster_label, $._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], '{{%s}}' % $._config.per_instance_label ) + $.stack @@ -205,11 +205,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel( [ ||| - sum(cluster_job:cortex_alertmanager_state_replication_total:rate5m{%s}) + sum(%s_job:cortex_alertmanager_state_replication_total:rate5m{%s}) - - sum(cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m{%s}) - ||| % [$.jobMatcher($._config.job_names.alertmanager), $.jobMatcher($._config.job_names.alertmanager)], - 'sum(cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m{%s})' % $.jobMatcher($._config.job_names.alertmanager), + sum(%s_job:cortex_alertmanager_state_replication_failed_total:rate5m{%s}) + ||| % [$._config.per_cluster_label, $.jobMatcher($._config.job_names.alertmanager), $._config.per_cluster_label, $.jobMatcher($._config.job_names.alertmanager)], + 'sum(%s_job:cortex_alertmanager_state_replication_failed_total:rate5m{%s})' % [$._config.per_cluster_label, $.jobMatcher($._config.job_names.alertmanager)], ], ['success', 'failed'] ) @@ -219,11 +219,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel( [ ||| - sum(cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m{%s}) + sum(%s_job:cortex_alertmanager_partial_state_merges_total:rate5m{%s}) - - sum(cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m{%s}) - ||| % [$.jobMatcher($._config.job_names.alertmanager), $.jobMatcher($._config.job_names.alertmanager)], - 'sum(cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m{%s})' % $.jobMatcher($._config.job_names.alertmanager), + sum(%s_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m{%s}) + ||| % [$._config.per_cluster_label, $.jobMatcher($._config.job_names.alertmanager), $._config.per_cluster_label, $.jobMatcher($._config.job_names.alertmanager)], + 'sum(%s_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m{%s})' % [$._config.per_cluster_label, $.jobMatcher($._config.job_names.alertmanager)], ], ['success', 'failed'] ) diff --git a/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet b/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet index 3fdcc72baf2..680e0eb5fee 100644 --- a/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -54,17 +54,17 @@ local utils = import 'mixin-utils/utils.libsonnet'; if $._config.singleBinary then d.addMultiTemplate('job', 'cortex_build_info', 'job') else d - .addMultiTemplate('cluster', 'cortex_build_info', 'cluster') - .addMultiTemplate('namespace', 'cortex_build_info{cluster=~"$cluster"}', 'namespace') + .addMultiTemplate('cluster', 'cortex_build_info', '%s' % $._config.per_cluster_label) + .addMultiTemplate('namespace', 'cortex_build_info{%s=~"$cluster"}' % $._config.per_cluster_label, 'namespace') else if $._config.singleBinary then d.addTemplate('job', 'cortex_build_info', 'job') else d - .addTemplate('cluster', 'cortex_build_info', 'cluster') - .addTemplate('namespace', 'cortex_build_info{cluster=~"$cluster"}', 'namespace'), + .addTemplate('cluster', 'cortex_build_info', '%s' % $._config.per_cluster_label) + .addTemplate('namespace', 'cortex_build_info{%s=~"$cluster"}' % $._config.per_cluster_label, 'namespace'), addActiveUserSelectorTemplates():: - self.addTemplate('user', 'cortex_ingester_active_series{cluster=~"$cluster", namespace=~"$namespace"}', 'user'), + self.addTemplate('user', 'cortex_ingester_active_series{%s=~"$cluster", namespace=~"$namespace"}' % $._config.per_cluster_label, 'user'), addCustomTemplate(name, values, defaultIndex=0):: self { templating+: { @@ -99,17 +99,17 @@ local utils = import 'mixin-utils/utils.libsonnet'; jobMatcher(job):: if $._config.singleBinary then 'job=~"$job"' - else 'cluster=~"$cluster", job=~"($namespace)/(%s)"' % job, + else '%s=~"$cluster", job=~"($namespace)/(%s)"' % [$._config.per_cluster_label, job], namespaceMatcher():: if $._config.singleBinary then 'job=~"$job"' - else 'cluster=~"$cluster", namespace=~"$namespace"', + else '%s=~"$cluster", namespace=~"$namespace"' % $._config.per_cluster_label, jobSelector(job):: if $._config.singleBinary - then [utils.selector.noop('cluster'), utils.selector.re('job', '$job')] - else [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/(%s)' % job)], + then [utils.selector.noop('%s' % $._config.per_cluster_label), utils.selector.re('job', '$job')] + else [utils.selector.re('%s' % $._config.per_cluster_label, '$cluster'), utils.selector.re('job', '($namespace)/(%s)' % job)], queryPanel(queries, legends, legendLink=null):: super.queryPanel(queries, legends, legendLink) + { diff --git a/operations/mimir-mixin/dashboards/overrides.libsonnet b/operations/mimir-mixin/dashboards/overrides.libsonnet index 29fb3749812..980ed068e38 100644 --- a/operations/mimir-mixin/dashboards/overrides.libsonnet +++ b/operations/mimir-mixin/dashboards/overrides.libsonnet @@ -13,7 +13,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; datasource: '${datasource}', targets: [ { - expr: 'max by(limit_name) (cortex_limits_defaults{cluster=~"$cluster",namespace=~"$namespace"})', + expr: 'max by(limit_name) (cortex_limits_defaults{%s=~"$cluster",namespace=~"$namespace"})' % $._config.per_cluster_label, instant: true, legendFormat: '', refId: 'A', @@ -69,7 +69,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; datasource: '${datasource}', targets: [ { - expr: 'max by(user, limit_name) (cortex_limits_overrides{cluster=~"$cluster",namespace=~"$namespace",user=~"${tenant_id}"})', + expr: 'max by(user, limit_name) (cortex_limits_overrides{%s=~"$cluster",namespace=~"$namespace",user=~"${tenant_id}"})' % $._config.per_cluster_label, instant: true, legendFormat: '', refId: 'A', diff --git a/operations/mimir-mixin/dashboards/rollout-progress.libsonnet b/operations/mimir-mixin/dashboards/rollout-progress.libsonnet index fcefd096f38..d5c57a3c056 100644 --- a/operations/mimir-mixin/dashboards/rollout-progress.libsonnet +++ b/operations/mimir-mixin/dashboards/rollout-progress.libsonnet @@ -3,6 +3,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; (import 'dashboard-utils.libsonnet') { local config = { namespace_matcher: $.namespaceMatcher(), + per_cluster_label: $._config.per_cluster_label, gateway_job_matcher: $.jobMatcher($._config.job_names.gateway), gateway_write_routes_regex: 'api_(v1|prom)_push', gateway_read_routes_regex: '(prometheus|api_prom)_api_v1_.+', @@ -127,7 +128,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Writes 99th latency') + $.newStatPanel(||| - histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"})) + histogram_quantile(0.99, sum by (le) (%(per_cluster_label)s_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"})) ||| % config, unit='s', thresholds=[ { color: 'green', value: null }, { color: 'orange', value: 0.2 }, @@ -178,7 +179,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Reads 99th latency') + $.newStatPanel(||| - histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"})) + histogram_quantile(0.99, sum by (le) (%(per_cluster_label)s_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"})) ||| % config, unit='s', thresholds=[ { color: 'green', value: null }, { color: 'orange', value: 1 }, @@ -283,15 +284,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Latency vs 24h ago') + $.queryPanel([||| 1 - ( - avg_over_time(histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"} offset 24h))[1h:]) + avg_over_time(histogram_quantile(0.99, sum by (le) (%(per_cluster_label)s_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"} offset 24h))[1h:]) / - avg_over_time(histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"}))[1h:]) + avg_over_time(histogram_quantile(0.99, sum by (le) (%(per_cluster_label)s_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"}))[1h:]) ) ||| % config, ||| 1 - ( - avg_over_time(histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"} offset 24h))[1h:]) + avg_over_time(histogram_quantile(0.99, sum by (le) (%(per_cluster_label)s_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"} offset 24h))[1h:]) / - avg_over_time(histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"}))[1h:]) + avg_over_time(histogram_quantile(0.99, sum by (le) (%(per_cluster_label)s_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"}))[1h:]) ) ||| % config], ['writes', 'reads']) + { yaxes: $.yaxes({ diff --git a/operations/mimir-mixin/dashboards/ruler.libsonnet b/operations/mimir-mixin/dashboards/ruler.libsonnet index 6206bfe64a1..7bf6bb292a5 100644 --- a/operations/mimir-mixin/dashboards/ruler.libsonnet +++ b/operations/mimir-mixin/dashboards/ruler.libsonnet @@ -116,7 +116,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Per route p99 latency') + $.queryPanel( - 'histogram_quantile(0.99, sum by (route, le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%s, route=~"%s"}))' % [$.jobMatcher($._config.job_names.gateway), ruler_config_api_routes_re], + 'histogram_quantile(0.99, sum by (route, le) (%s_job_route:cortex_request_duration_seconds_bucket:sum_rate{%s, route=~"%s"}))' % [$._config.per_cluster_label, $.jobMatcher($._config.job_names.gateway), ruler_config_api_routes_re], '{{ route }}' ) + { yaxes: $.yaxes('s') } diff --git a/operations/mimir-mixin/dashboards/scaling.libsonnet b/operations/mimir-mixin/dashboards/scaling.libsonnet index 36dead45cf6..67980d16ff0 100644 --- a/operations/mimir-mixin/dashboards/scaling.libsonnet +++ b/operations/mimir-mixin/dashboards/scaling.libsonnet @@ -41,11 +41,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.tablePanel([ ||| sort_desc( - cluster_namespace_deployment_reason:required_replicas:count{cluster=~"$cluster", namespace=~"$namespace"} + %s_deployment_reason:required_replicas:count{%s} > ignoring(reason) group_left - cluster_namespace_deployment:actual_replicas:count{cluster=~"$cluster", namespace=~"$namespace"} + %s_deployment:actual_replicas:count{%s} ) - |||, + ||| % [$._config.alert_aggregation_rule_prefix, $.namespaceMatcher(), $._config.alert_aggregation_rule_prefix, $.namespaceMatcher()], ], { __name__: { alias: 'Cluster', type: 'hidden' }, cluster: { alias: 'Cluster' }, diff --git a/operations/mimir-mixin/dashboards/slow-queries.libsonnet b/operations/mimir-mixin/dashboards/slow-queries.libsonnet index f28805b8960..f11cdd8f4d0 100644 --- a/operations/mimir-mixin/dashboards/slow-queries.libsonnet +++ b/operations/mimir-mixin/dashboards/slow-queries.libsonnet @@ -16,7 +16,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; targets: [ { // Filter out the remote read endpoint. - expr: '{cluster=~"$cluster",namespace=~"$namespace",name=~"query-frontend.*"} |= "query stats" != "/api/v1/read" | logfmt | org_id=~"${tenant_id}" | response_time > ${min_duration}', + expr: '{%s=~"$cluster",namespace=~"$namespace",name=~"query-frontend.*"} |= "query stats" != "/api/v1/read" | logfmt | org_id=~"${tenant_id}" | response_time > ${min_duration}' % $._config.per_cluster_label, instant: false, legendFormat: '', range: true, diff --git a/operations/mimir-mixin/groups.libsonnet b/operations/mimir-mixin/groups.libsonnet index c2c35f90d21..a88b7e4c42f 100644 --- a/operations/mimir-mixin/groups.libsonnet +++ b/operations/mimir-mixin/groups.libsonnet @@ -58,5 +58,14 @@ ), ), ), + alert_aggregation_rule_prefix: + std.join( + '_', + // Split the configured labels by comma and remove whitespaces. + std.map( + function(l) std.strReplace(l, ' ', ''), + std.split($._config.alert_aggregation_labels, ',') + ), + ), }, } diff --git a/operations/mimir-mixin/recording_rules.libsonnet b/operations/mimir-mixin/recording_rules.libsonnet index 40e8e6cf59d..b2878681447 100644 --- a/operations/mimir-mixin/recording_rules.libsonnet +++ b/operations/mimir-mixin/recording_rules.libsonnet @@ -12,45 +12,45 @@ local utils = import 'mixin-utils/utils.libsonnet'; { name: 'mimir_api_1', rules: - utils.histogramRules('cortex_request_duration_seconds', ['cluster', 'job']), + utils.histogramRules('cortex_request_duration_seconds', [$._config.per_cluster_label, 'job']), }, { name: 'mimir_api_2', rules: - utils.histogramRules('cortex_request_duration_seconds', ['cluster', 'job', 'route']), + utils.histogramRules('cortex_request_duration_seconds', [$._config.per_cluster_label, 'job', 'route']), }, { name: 'mimir_api_3', rules: - utils.histogramRules('cortex_request_duration_seconds', ['cluster', 'namespace', 'job', 'route']), + utils.histogramRules('cortex_request_duration_seconds', $._config.job_labels + ['route']), }, { name: 'mimir_querier_api', rules: - utils.histogramRules('cortex_querier_request_duration_seconds', ['cluster', 'job']) + - utils.histogramRules('cortex_querier_request_duration_seconds', ['cluster', 'job', 'route']) + - utils.histogramRules('cortex_querier_request_duration_seconds', ['cluster', 'namespace', 'job', 'route']), + utils.histogramRules('cortex_querier_request_duration_seconds', [$._config.per_cluster_label, 'job']) + + utils.histogramRules('cortex_querier_request_duration_seconds', [$._config.per_cluster_label, 'job', 'route']) + + utils.histogramRules('cortex_querier_request_duration_seconds', $._config.job_labels + ['route']), }, { name: 'mimir_cache', rules: - utils.histogramRules('cortex_memcache_request_duration_seconds', ['cluster', 'job', 'method']) + - utils.histogramRules('cortex_cache_request_duration_seconds', ['cluster', 'job']) + - utils.histogramRules('cortex_cache_request_duration_seconds', ['cluster', 'job', 'method']), + utils.histogramRules('cortex_memcache_request_duration_seconds', [$._config.per_cluster_label, 'job', 'method']) + + utils.histogramRules('cortex_cache_request_duration_seconds', [$._config.per_cluster_label, 'job']) + + utils.histogramRules('cortex_cache_request_duration_seconds', [$._config.per_cluster_label, 'job', 'method']), }, { name: 'mimir_storage', rules: - utils.histogramRules('cortex_kv_request_duration_seconds', ['cluster', 'job']), + utils.histogramRules('cortex_kv_request_duration_seconds', [$._config.per_cluster_label, 'job']), }, { name: 'mimir_queries', rules: - utils.histogramRules('cortex_query_frontend_retries', ['cluster', 'job']) + - utils.histogramRules('cortex_query_frontend_queue_duration_seconds', ['cluster', 'job']) + - utils.histogramRules('cortex_ingester_queried_series', ['cluster', 'job']) + - utils.histogramRules('cortex_ingester_queried_samples', ['cluster', 'job']) + - utils.histogramRules('cortex_ingester_queried_exemplars', ['cluster', 'job']), + utils.histogramRules('cortex_query_frontend_retries', [$._config.per_cluster_label, 'job']) + + utils.histogramRules('cortex_query_frontend_queue_duration_seconds', [$._config.per_cluster_label, 'job']) + + utils.histogramRules('cortex_ingester_queried_series', [$._config.per_cluster_label, 'job']) + + utils.histogramRules('cortex_ingester_queried_samples', [$._config.per_cluster_label, 'job']) + + utils.histogramRules('cortex_ingester_queried_exemplars', [$._config.per_cluster_label, 'job']), }, { name: 'mimir_received_samples', @@ -113,9 +113,9 @@ local utils = import 'mixin-utils/utils.libsonnet'; { // Convenience rule to get the number of replicas for both a deployment and a statefulset. // Multi-zone deployments are grouped together removing the "zone-X" suffix. - record: 'cluster_namespace_deployment:actual_replicas:count', + record: '%(alert_aggregation_rule_prefix)s_deployment:actual_replicas:count' % _config, expr: ||| - sum by (cluster, namespace, deployment) ( + sum by (%(alert_aggregation_labels)s, deployment) ( label_replace( kube_deployment_spec_replicas, # The question mark in "(.*?)" is used to make it non-greedy, otherwise it @@ -124,14 +124,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) or - sum by (cluster, namespace, deployment) ( + sum by (%(alert_aggregation_labels)s, deployment) ( label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?") ) - |||, + ||| % _config, }, { // Distributors should be able to deal with 240k samples/s. - record: 'cluster_namespace_deployment_reason:required_replicas:count', + record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config, labels: { deployment: 'distributor', reason: 'sample_rate', @@ -139,7 +139,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; expr: ||| ceil( quantile_over_time(0.99, - sum by (cluster, namespace) ( + sum by (%(alert_aggregation_labels)s) ( %(group_prefix_jobs)s:cortex_distributor_received_samples:rate5m )[24h:] ) @@ -150,14 +150,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; { // We should be about to cover 80% of our limits, // and ingester can have 80k samples/s. - record: 'cluster_namespace_deployment_reason:required_replicas:count', + record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config, labels: { deployment: 'distributor', reason: 'sample_rate_limits', }, expr: ||| ceil( - sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"}) + sum by (%(alert_aggregation_labels)s) (cortex_limits_overrides{limit_name="ingestion_rate"}) * %(limit_utilisation_target)s / %(max_samples_per_sec_per_distributor)s ) ||| % _config, @@ -165,7 +165,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; { // We want ingesters each ingester to deal with 80k samples/s. // NB we measure this at the distributors and multiple by RF (3). - record: 'cluster_namespace_deployment_reason:required_replicas:count', + record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config, labels: { deployment: 'ingester', reason: 'sample_rate', @@ -173,7 +173,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; expr: ||| ceil( quantile_over_time(0.99, - sum by (cluster, namespace) ( + sum by (%(alert_aggregation_labels)s) ( %(group_prefix_jobs)s:cortex_distributor_received_samples:rate5m )[24h:] ) @@ -183,7 +183,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, { // Ingester should have 1.5M series in memory - record: 'cluster_namespace_deployment_reason:required_replicas:count', + record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config, labels: { deployment: 'ingester', reason: 'active_series', @@ -191,7 +191,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; expr: ||| ceil( quantile_over_time(0.99, - sum by(cluster, namespace) ( + sum by(%(alert_aggregation_labels)s) ( cortex_ingester_memory_series )[24h:] ) @@ -202,14 +202,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; { // We should be about to cover 60% of our limits, // and ingester can have 1.5M series in memory - record: 'cluster_namespace_deployment_reason:required_replicas:count', + record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config, labels: { deployment: 'ingester', reason: 'active_series_limits', }, expr: ||| ceil( - sum by (cluster, namespace) (cortex_limits_overrides{limit_name="max_global_series_per_user"}) + sum by (%(alert_aggregation_labels)s) (cortex_limits_overrides{limit_name="max_global_series_per_user"}) * 3 * %(limit_utilisation_target)s / %(max_series_per_ingester)s ) ||| % _config, @@ -217,43 +217,43 @@ local utils = import 'mixin-utils/utils.libsonnet'; { // We should be about to cover 60% of our limits, // and ingester can have 80k samples/s. - record: 'cluster_namespace_deployment_reason:required_replicas:count', + record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config, labels: { deployment: 'ingester', reason: 'sample_rate_limits', }, expr: ||| ceil( - sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"}) + sum by (%(alert_aggregation_labels)s) (cortex_limits_overrides{limit_name="ingestion_rate"}) * %(limit_utilisation_target)s / %(max_samples_per_sec_per_ingester)s ) ||| % _config, }, { // Ingesters store 96h of data on disk - we want memcached to store 1/4 of that. - record: 'cluster_namespace_deployment_reason:required_replicas:count', + record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config, labels: { deployment: 'memcached', reason: 'active_series', }, expr: ||| ceil( - (sum by (cluster, namespace) ( + (sum by (%(alert_aggregation_labels)s) ( cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester.*"} ) / 4) / - avg by (cluster, namespace) ( + avg by (%(alert_aggregation_labels)s) ( memcached_limit_bytes{job=~".+/memcached"} ) ) - |||, + ||| % _config, }, { // Convenience rule to get the CPU utilization for both a deployment and a statefulset. // Multi-zone deployments are grouped together removing the "zone-X" suffix. - record: 'cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate', + record: '%(alert_aggregation_rule_prefix)s_deployment:container_cpu_usage_seconds_total:sum_rate' % _config, expr: ||| - sum by (cluster, namespace, deployment) ( + sum by (%(alert_aggregation_labels)s, deployment) ( label_replace( label_replace( node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate, @@ -264,12 +264,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" ) ) - |||, + ||| % _config, }, { // Convenience rule to get the CPU request for both a deployment and a statefulset. // Multi-zone deployments are grouped together removing the "zone-X" suffix. - record: 'cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum', + record: '%(alert_aggregation_rule_prefix)s_deployment:kube_pod_container_resource_requests_cpu_cores:sum' % _config, expr: ||| # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2 # that remove resource metrics, ref: @@ -279,7 +279,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; # This is the old expression, compatible with kube-state-metrics < v2.0.0, # where kube_pod_container_resource_requests_cpu_cores was removed: ( - sum by (cluster, namespace, deployment) ( + sum by (%(alert_aggregation_labels)s, deployment) ( label_replace( label_replace( kube_pod_container_resource_requests_cpu_cores, @@ -295,7 +295,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; # This expression is compatible with kube-state-metrics >= v1.4.0, # where kube_pod_container_resource_requests was introduced. ( - sum by (cluster, namespace, deployment) ( + sum by (%(alert_aggregation_labels)s, deployment) ( label_replace( label_replace( kube_pod_container_resource_requests{resource="cpu"}, @@ -307,32 +307,32 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) ) - |||, + ||| % _config, }, { // Jobs should be sized to their CPU usage. // We do this by comparing 99th percentile usage over the last 24hrs to // their current provisioned #replicas and resource requests. - record: 'cluster_namespace_deployment_reason:required_replicas:count', + record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config, labels: { reason: 'cpu_usage', }, expr: ||| ceil( - cluster_namespace_deployment:actual_replicas:count + %(alert_aggregation_rule_prefix)s_deployment:actual_replicas:count * - quantile_over_time(0.99, cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate[24h]) + quantile_over_time(0.99, %(alert_aggregation_rule_prefix)s_deployment:container_cpu_usage_seconds_total:sum_rate[24h]) / - cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum + %(alert_aggregation_rule_prefix)s_deployment:kube_pod_container_resource_requests_cpu_cores:sum ) - |||, + ||| % _config, }, { // Convenience rule to get the Memory utilization for both a deployment and a statefulset. // Multi-zone deployments are grouped together removing the "zone-X" suffix. - record: 'cluster_namespace_deployment:container_memory_usage_bytes:sum', + record: '%(alert_aggregation_rule_prefix)s_deployment:container_memory_usage_bytes:sum' % _config, expr: ||| - sum by (cluster, namespace, deployment) ( + sum by (%(alert_aggregation_labels)s, deployment) ( label_replace( label_replace( container_memory_usage_bytes, @@ -343,12 +343,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" ) ) - |||, + ||| % _config, }, { // Convenience rule to get the Memory request for both a deployment and a statefulset. // Multi-zone deployments are grouped together removing the "zone-X" suffix. - record: 'cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum', + record: '%(alert_aggregation_rule_prefix)s_deployment:kube_pod_container_resource_requests_memory_bytes:sum' % _config, expr: ||| # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2 # that remove resource metrics, ref: @@ -358,7 +358,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; # This is the old expression, compatible with kube-state-metrics < v2.0.0, # where kube_pod_container_resource_requests_memory_bytes was removed: ( - sum by (cluster, namespace, deployment) ( + sum by (%(alert_aggregation_labels)s, deployment) ( label_replace( label_replace( kube_pod_container_resource_requests_memory_bytes, @@ -374,7 +374,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; # This expression is compatible with kube-state-metrics >= v1.4.0, # where kube_pod_container_resource_requests was introduced. ( - sum by (cluster, namespace, deployment) ( + sum by (%(alert_aggregation_labels)s, deployment) ( label_replace( label_replace( kube_pod_container_resource_requests{resource="memory"}, @@ -386,25 +386,25 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) ) - |||, + ||| % _config, }, { // Jobs should be sized to their Memory usage. // We do this by comparing 99th percentile usage over the last 24hrs to // their current provisioned #replicas and resource requests. - record: 'cluster_namespace_deployment_reason:required_replicas:count', + record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config, labels: { reason: 'memory_usage', }, expr: ||| ceil( - cluster_namespace_deployment:actual_replicas:count + %(alert_aggregation_rule_prefix)s_deployment:actual_replicas:count * - quantile_over_time(0.99, cluster_namespace_deployment:container_memory_usage_bytes:sum[24h]) + quantile_over_time(0.99, %(alert_aggregation_rule_prefix)s_deployment:container_memory_usage_bytes:sum[24h]) / - cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum + %(alert_aggregation_rule_prefix)s_deployment:kube_pod_container_resource_requests_memory_bytes:sum ) - |||, + ||| % _config, }, ], }, @@ -413,64 +413,64 @@ local utils = import 'mixin-utils/utils.libsonnet'; rules: [ // Aggregations of per-user Alertmanager metrics used in dashboards. { - record: 'cluster_job_%s:cortex_alertmanager_alerts:sum' % $._config.per_instance_label, + record: '%s_job_%s:cortex_alertmanager_alerts:sum' % [$._config.per_cluster_label, $._config.per_instance_label], expr: ||| - sum by (cluster, job, %s) (cortex_alertmanager_alerts) - ||| % $._config.per_instance_label, + sum by (%s, job, %s) (cortex_alertmanager_alerts) + ||| % [$._config.per_cluster_label, $._config.per_instance_label], }, { - record: 'cluster_job_%s:cortex_alertmanager_silences:sum' % $._config.per_instance_label, + record: '%s_job_%s:cortex_alertmanager_silences:sum' % [$._config.per_cluster_label, $._config.per_instance_label], expr: ||| - sum by (cluster, job, %s) (cortex_alertmanager_silences) - ||| % $._config.per_instance_label, + sum by (%s, job, %s) (cortex_alertmanager_silences) + ||| % [$._config.per_cluster_label, $._config.per_instance_label], }, { - record: 'cluster_job:cortex_alertmanager_alerts_received_total:rate5m', + record: '%s_job:cortex_alertmanager_alerts_received_total:rate5m' % $._config.per_cluster_label, expr: ||| - sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m])) - |||, + sum by (%(per_cluster_label)s, job) (rate(cortex_alertmanager_alerts_received_total[5m])) + ||| % _config, }, { - record: 'cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m', + record: '%s_job:cortex_alertmanager_alerts_invalid_total:rate5m' % $._config.per_cluster_label, expr: ||| - sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m])) - |||, + sum by (%(per_cluster_label)s, job) (rate(cortex_alertmanager_alerts_invalid_total[5m])) + ||| % _config, }, { - record: 'cluster_job_integration:cortex_alertmanager_notifications_total:rate5m', + record: '%s_job_integration:cortex_alertmanager_notifications_total:rate5m' % $._config.per_cluster_label, expr: ||| - sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m])) - |||, + sum by (%(per_cluster_label)s, job, integration) (rate(cortex_alertmanager_notifications_total[5m])) + ||| % _config, }, { - record: 'cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m', + record: '%s_job_integration:cortex_alertmanager_notifications_failed_total:rate5m' % $._config.per_cluster_label, expr: ||| - sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m])) - |||, + sum by (%(per_cluster_label)s, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m])) + ||| % _config, }, { - record: 'cluster_job:cortex_alertmanager_state_replication_total:rate5m', + record: '%s_job:cortex_alertmanager_state_replication_total:rate5m' % $._config.per_cluster_label, expr: ||| - sum by (cluster, job) (rate(cortex_alertmanager_state_replication_total[5m])) - |||, + sum by (%(per_cluster_label)s, job) (rate(cortex_alertmanager_state_replication_total[5m])) + ||| % _config, }, { - record: 'cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m', + record: '%s_job:cortex_alertmanager_state_replication_failed_total:rate5m' % $._config.per_cluster_label, expr: ||| - sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m])) - |||, + sum by (%(per_cluster_label)s, job) (rate(cortex_alertmanager_state_replication_failed_total[5m])) + ||| % _config, }, { - record: 'cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m', + record: '%s_job:cortex_alertmanager_partial_state_merges_total:rate5m' % $._config.per_cluster_label, expr: ||| - sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_total[5m])) - |||, + sum by (%(per_cluster_label)s, job) (rate(cortex_alertmanager_partial_state_merges_total[5m])) + ||| % _config, }, { - record: 'cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m', + record: '%s_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m' % $._config.per_cluster_label, expr: ||| - sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m])) - |||, + sum by (%(per_cluster_label)s, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m])) + ||| % _config, }, ], }, @@ -479,7 +479,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; rules: [ { // cortex_ingester_ingested_samples_total is per user, in this rule we want to see the sum per cluster/namespace/instance - record: 'cluster_namespace_%s:cortex_ingester_ingested_samples_total:rate1m' % $._config.per_instance_label, + record: '%s_%s:cortex_ingester_ingested_samples_total:rate1m' % [$._config.alert_aggregation_rule_prefix, $._config.per_instance_label], expr: ||| sum by(%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingester_ingested_samples_total[1m])) ||| % $._config,