
Commit 7b32d0a

Use alert_aggregation at the obvious locations
Signed-off-by: Whyeasy <[email protected]>
1 parent ca1b6d6 commit 7b32d0a

5 files changed, +52 −43 lines


operations/mimir-mixin/alerts/alerts.libsonnet

Lines changed: 1 addition & 1 deletion
@@ -435,7 +435,7 @@
 alert: $.alertName('ProvisioningTooManyWrites'),
 // 80k writes / s per ingester max.
 expr: |||
-  avg by (%(alert_aggregation_labels)s) (%(clusterLabel)s_namespace_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m) > 80e3
+  avg by (%(alert_aggregation_labels)s) (%(alert_aggregation_rule_prefix)s_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m) > 80e3
 ||| % $._config,
 'for': '15m',
 labels: {
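
For orientation, a minimal sketch (not part of this commit) of how the renamed template renders. The config values below are assumptions, not taken from the diff; the prefix derivation itself is added in groups.libsonnet further down.

// Sketch only: assumed config values, rendered with Jsonnet's % formatting.
'avg by (%(alert_aggregation_labels)s) (%(alert_aggregation_rule_prefix)s_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m) > 80e3' % {
  alert_aggregation_labels: 'cluster, namespace',      // assumed
  alert_aggregation_rule_prefix: 'cluster_namespace',  // derived from the labels (see groups.libsonnet)
  per_instance_label: 'pod',                           // assumed
}
// Renders to:
// avg by (cluster, namespace) (cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m) > 80e3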

operations/mimir-mixin/alerts/blocks.libsonnet

Lines changed: 3 additions & 3 deletions
@@ -14,13 +14,13 @@
   (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 0)
   and
   # Only if the ingester has ingested samples over the last 4h.
-  (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(clusterLabel)s_namespace_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
+  (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(alert_aggregation_rule_prefix)s_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
   and
   # Only if the ingester was ingesting samples 4h ago. This protects from the case the ingester instance
   # had ingested samples in the past, then no traffic was received for a long period and then it starts
   # receiving samples again. Without this check, the alert would fire as soon as it gets back receiving
   # samples, while the a block shipping is expected within the next 4h.
-  (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(clusterLabel)s_namespace_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[1h] offset 4h)) > 0)
+  (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(alert_aggregation_rule_prefix)s_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[1h] offset 4h)) > 0)
 ||| % $._config,
 labels: {
   severity: 'critical',
@@ -37,7 +37,7 @@
 expr: |||
   (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) == 0)
   and
-  (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(clusterLabel)s_namespace_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
+  (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(alert_aggregation_rule_prefix)s_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
 ||| % $._config,
 labels: {
   severity: 'critical',

operations/mimir-mixin/dashboards/scaling.libsonnet

Lines changed: 2 additions & 2 deletions
@@ -41,9 +41,9 @@ local utils = import 'mixin-utils/utils.libsonnet';
 $.tablePanel([
   |||
     sort_desc(
-      %(clusterLabel)s_namespace_deployment_reason:required_replicas:count{%(clusterLabel)s=~"$cluster", namespace=~"$namespace"}
+      %(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count{%(clusterLabel)s=~"$cluster", namespace=~"$namespace"}
       > ignoring(reason) group_left
-      %(clusterLabel)s_namespace_deployment:actual_replicas:count{%(clusterLabel)s=~"$cluster", namespace=~"$namespace"}
+      %(alert_aggregation_rule_prefix)s_deployment:actual_replicas:count{%(clusterLabel)s=~"$cluster", namespace=~"$namespace"}
     )
   ||| % $._config,
 ], {
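
Only the recorded metric name changes in this panel; the {%(clusterLabel)s=~"$cluster", namespace=~"$namespace"} selectors are untouched. A hedged rendering sketch, again with assumed values:

// Sketch only; 'cluster_namespace' and 'cluster' are assumed values, not defaults stated in this diff.
'%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count{%(clusterLabel)s=~"$cluster", namespace=~"$namespace"}' % {
  alert_aggregation_rule_prefix: 'cluster_namespace',  // assumed derivation result
  clusterLabel: 'cluster',                             // assumed
}
// Renders to:
// cluster_namespace_deployment_reason:required_replicas:count{cluster=~"$cluster", namespace=~"$namespace"}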

operations/mimir-mixin/groups.libsonnet

Lines changed: 9 additions & 0 deletions
@@ -58,5 +58,14 @@
       ),
     ),
   ),
+  alert_aggregation_rule_prefix:
+    std.join(
+      '_',
+      // Split the configured labels by comma and remove whitespaces.
+      std.map(
+        function(l) std.strReplace(l, ' ', ''),
+        std.split($._config.alert_aggregation_labels, ',')
+      ),
+    ),
 },
}
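
To see what the new helper evaluates to, here is a standalone sketch; the input string is an assumed example, not necessarily the mixin's default:

// Standalone sketch of the derivation above (evaluate with `jsonnet`).
local alert_aggregation_labels = 'cluster, namespace';  // assumed example value

std.join(
  '_',
  // Split the configured labels by comma and remove whitespace.
  std.map(
    function(l) std.strReplace(l, ' ', ''),
    std.split(alert_aggregation_labels, ',')
  ),
)
// Evaluates to: "cluster_namespace"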

operations/mimir-mixin/recording_rules.libsonnet

Lines changed: 37 additions & 37 deletions
@@ -113,9 +113,9 @@ local utils = import 'mixin-utils/utils.libsonnet';
 {
   // Convenience rule to get the number of replicas for both a deployment and a statefulset.
   // Multi-zone deployments are grouped together removing the "zone-X" suffix.
-  record: '%(clusterLabel)s_namespace_deployment:actual_replicas:count' % _config,
+  record: '%(alert_aggregation_rule_prefix)s_deployment:actual_replicas:count' % _config,
   expr: |||
-    sum by (%(clusterLabel)s, namespace, deployment) (
+    sum by (%(alert_aggregation_labels)s, deployment) (
       label_replace(
         kube_deployment_spec_replicas,
         # The question mark in "(.*?)" is used to make it non-greedy, otherwise it
@@ -124,22 +124,22 @@ local utils = import 'mixin-utils/utils.libsonnet';
       )
     )
     or
-    sum by (%(clusterLabel)s, namespace, deployment) (
+    sum by (%(alert_aggregation_labels)s, deployment) (
       label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")
     )
   ||| % _config,
 },
 {
   // Distributors should be able to deal with 240k samples/s.
-  record: '%(clusterLabel)s_namespace_deployment_reason:required_replicas:count' % _config,
+  record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config,
   labels: {
     deployment: 'distributor',
     reason: 'sample_rate',
   },
   expr: |||
     ceil(
       quantile_over_time(0.99,
-        sum by (%(clusterLabel)s, namespace) (
+        sum by (%(alert_aggregation_labels)s) (
           %(group_prefix_jobs)s:cortex_distributor_received_samples:rate5m
         )[24h:]
       )
@@ -150,30 +150,30 @@ local utils = import 'mixin-utils/utils.libsonnet';
 {
   // We should be about to cover 80% of our limits,
   // and ingester can have 80k samples/s.
-  record: '%(clusterLabel)s_namespace_deployment_reason:required_replicas:count' % _config,
+  record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config,
   labels: {
     deployment: 'distributor',
     reason: 'sample_rate_limits',
   },
   expr: |||
     ceil(
-      sum by (%(clusterLabel)s, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"})
+      sum by (%(alert_aggregation_labels)s) (cortex_limits_overrides{limit_name="ingestion_rate"})
       * %(limit_utilisation_target)s / %(max_samples_per_sec_per_distributor)s
     )
   ||| % _config,
 },
 {
   // We want ingesters each ingester to deal with 80k samples/s.
   // NB we measure this at the distributors and multiple by RF (3).
-  record: '%(clusterLabel)s_namespace_deployment_reason:required_replicas:count' % _config,
+  record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config,
   labels: {
     deployment: 'ingester',
     reason: 'sample_rate',
   },
   expr: |||
     ceil(
       quantile_over_time(0.99,
-        sum by (%(clusterLabel)s, namespace) (
+        sum by (%(alert_aggregation_labels)s) (
          %(group_prefix_jobs)s:cortex_distributor_received_samples:rate5m
        )[24h:]
      )
@@ -183,15 +183,15 @@ local utils = import 'mixin-utils/utils.libsonnet';
 },
 {
   // Ingester should have 1.5M series in memory
-  record: '%(clusterLabel)s_namespace_deployment_reason:required_replicas:count' % _config,
+  record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config,
   labels: {
     deployment: 'ingester',
     reason: 'active_series',
   },
   expr: |||
     ceil(
       quantile_over_time(0.99,
-        sum by(%(clusterLabel)s, namespace) (
+        sum by(%(alert_aggregation_labels)s) (
          cortex_ingester_memory_series
        )[24h:]
      )
@@ -202,47 +202,47 @@ local utils = import 'mixin-utils/utils.libsonnet';
 {
   // We should be about to cover 60% of our limits,
   // and ingester can have 1.5M series in memory
-  record: '%(clusterLabel)s_namespace_deployment_reason:required_replicas:count' % _config,
+  record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config,
   labels: {
     deployment: 'ingester',
     reason: 'active_series_limits',
   },
   expr: |||
     ceil(
-      sum by (%(clusterLabel)s, namespace) (cortex_limits_overrides{limit_name="max_global_series_per_user"})
+      sum by (%(alert_aggregation_labels)s) (cortex_limits_overrides{limit_name="max_global_series_per_user"})
       * 3 * %(limit_utilisation_target)s / %(max_series_per_ingester)s
     )
   ||| % _config,
 },
 {
   // We should be about to cover 60% of our limits,
   // and ingester can have 80k samples/s.
-  record: '%(clusterLabel)s_namespace_deployment_reason:required_replicas:count' % _config,
+  record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config,
   labels: {
     deployment: 'ingester',
     reason: 'sample_rate_limits',
   },
   expr: |||
     ceil(
-      sum by (%(clusterLabel)s, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"})
+      sum by (%(alert_aggregation_labels)s) (cortex_limits_overrides{limit_name="ingestion_rate"})
       * %(limit_utilisation_target)s / %(max_samples_per_sec_per_ingester)s
     )
   ||| % _config,
 },
 {
   // Ingesters store 96h of data on disk - we want memcached to store 1/4 of that.
-  record: '%(clusterLabel)s_namespace_deployment_reason:required_replicas:count' % _config,
+  record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config,
   labels: {
     deployment: 'memcached',
     reason: 'active_series',
   },
   expr: |||
     ceil(
-      (sum by (%(clusterLabel)s, namespace) (
+      (sum by (%(alert_aggregation_labels)s) (
        cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester.*"}
      ) / 4)
      /
-      avg by (%(clusterLabel)s, namespace) (
+      avg by (%(alert_aggregation_labels)s) (
       memcached_limit_bytes{job=~".+/memcached"}
      )
    )
@@ -251,9 +251,9 @@ local utils = import 'mixin-utils/utils.libsonnet';
 {
   // Convenience rule to get the CPU utilization for both a deployment and a statefulset.
   // Multi-zone deployments are grouped together removing the "zone-X" suffix.
-  record: '%(clusterLabel)s_namespace_deployment:container_cpu_usage_seconds_total:sum_rate' % _config,
+  record: '%(alert_aggregation_rule_prefix)s_deployment:container_cpu_usage_seconds_total:sum_rate' % _config,
   expr: |||
-    sum by (%(clusterLabel)s, namespace, deployment) (
+    sum by (%(alert_aggregation_labels)s, deployment) (
       label_replace(
         label_replace(
           node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate,
@@ -269,7 +269,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
 {
   // Convenience rule to get the CPU request for both a deployment and a statefulset.
   // Multi-zone deployments are grouped together removing the "zone-X" suffix.
-  record: '%(clusterLabel)s_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum' % _config,
+  record: '%(alert_aggregation_rule_prefix)s_deployment:kube_pod_container_resource_requests_cpu_cores:sum' % _config,
   expr: |||
     # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
     # that remove resource metrics, ref:
@@ -279,7 +279,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
     # This is the old expression, compatible with kube-state-metrics < v2.0.0,
     # where kube_pod_container_resource_requests_cpu_cores was removed:
     (
-      sum by (%(clusterLabel)s, namespace, deployment) (
+      sum by (%(alert_aggregation_labels)s, deployment) (
        label_replace(
          label_replace(
            kube_pod_container_resource_requests_cpu_cores,
@@ -295,7 +295,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
     # This expression is compatible with kube-state-metrics >= v1.4.0,
     # where kube_pod_container_resource_requests was introduced.
     (
-      sum by (%(clusterLabel)s, namespace, deployment) (
+      sum by (%(alert_aggregation_labels)s, deployment) (
        label_replace(
          label_replace(
            kube_pod_container_resource_requests{resource="cpu"},
@@ -313,26 +313,26 @@ local utils = import 'mixin-utils/utils.libsonnet';
   // Jobs should be sized to their CPU usage.
   // We do this by comparing 99th percentile usage over the last 24hrs to
   // their current provisioned #replicas and resource requests.
-  record: '%(clusterLabel)s_namespace_deployment_reason:required_replicas:count' % _config,
+  record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config,
   labels: {
     reason: 'cpu_usage',
   },
   expr: |||
     ceil(
-      %(clusterLabel)s_namespace_deployment:actual_replicas:count
+      %(alert_aggregation_rule_prefix)s_deployment:actual_replicas:count
       *
-      quantile_over_time(0.99, %(clusterLabel)s_namespace_deployment:container_cpu_usage_seconds_total:sum_rate[24h])
+      quantile_over_time(0.99, %(alert_aggregation_rule_prefix)s_deployment:container_cpu_usage_seconds_total:sum_rate[24h])
       /
-      %(clusterLabel)s_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum
+      %(alert_aggregation_rule_prefix)s_deployment:kube_pod_container_resource_requests_cpu_cores:sum
     )
   ||| % _config,
 },
 {
   // Convenience rule to get the Memory utilization for both a deployment and a statefulset.
   // Multi-zone deployments are grouped together removing the "zone-X" suffix.
-  record: '%(clusterLabel)s_namespace_deployment:container_memory_usage_bytes:sum' % _config,
+  record: '%(alert_aggregation_rule_prefix)s_deployment:container_memory_usage_bytes:sum' % _config,
   expr: |||
-    sum by (%(clusterLabel)s, namespace, deployment) (
+    sum by (%(alert_aggregation_labels)s, deployment) (
       label_replace(
         label_replace(
           container_memory_usage_bytes,
@@ -348,7 +348,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
 {
   // Convenience rule to get the Memory request for both a deployment and a statefulset.
   // Multi-zone deployments are grouped together removing the "zone-X" suffix.
-  record: '%(clusterLabel)s_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum' % _config,
+  record: '%(alert_aggregation_rule_prefix)s_deployment:kube_pod_container_resource_requests_memory_bytes:sum' % _config,
   expr: |||
     # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
     # that remove resource metrics, ref:
@@ -358,7 +358,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
     # This is the old expression, compatible with kube-state-metrics < v2.0.0,
     # where kube_pod_container_resource_requests_memory_bytes was removed:
     (
-      sum by (%(clusterLabel)s, namespace, deployment) (
+      sum by (%(alert_aggregation_labels)s, deployment) (
        label_replace(
          label_replace(
            kube_pod_container_resource_requests_memory_bytes,
@@ -374,7 +374,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
     # This expression is compatible with kube-state-metrics >= v1.4.0,
     # where kube_pod_container_resource_requests was introduced.
     (
-      sum by (%(clusterLabel)s, namespace, deployment) (
+      sum by (%(alert_aggregation_labels)s, deployment) (
        label_replace(
          label_replace(
            kube_pod_container_resource_requests{resource="memory"},
@@ -392,17 +392,17 @@ local utils = import 'mixin-utils/utils.libsonnet';
   // Jobs should be sized to their Memory usage.
   // We do this by comparing 99th percentile usage over the last 24hrs to
   // their current provisioned #replicas and resource requests.
-  record: '%(clusterLabel)s_namespace_deployment_reason:required_replicas:count' % _config,
+  record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config,
   labels: {
     reason: 'memory_usage',
   },
   expr: |||
     ceil(
-      %(clusterLabel)s_namespace_deployment:actual_replicas:count
+      %(alert_aggregation_rule_prefix)s_deployment:actual_replicas:count
      *
-      quantile_over_time(0.99, %(clusterLabel)s_namespace_deployment:container_memory_usage_bytes:sum[24h])
+      quantile_over_time(0.99, %(alert_aggregation_rule_prefix)s_deployment:container_memory_usage_bytes:sum[24h])
      /
-      %(clusterLabel)s_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum
+      %(alert_aggregation_rule_prefix)s_deployment:kube_pod_container_resource_requests_memory_bytes:sum
     )
   ||| % _config,
 },
@@ -479,7 +479,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
 rules: [
   {
     // cortex_ingester_ingested_samples_total is per user, in this rule we want to see the sum per cluster/namespace/instance
-    record: '%s_namespace_%s:cortex_ingester_ingested_samples_total:rate1m' % [$._config.clusterLabel, $._config.per_instance_label],
+    record: '%s_%s:cortex_ingester_ingested_samples_total:rate1m' % [$._config.alert_aggregation_rule_prefix, $._config.per_instance_label],
     expr: |||
       sum by(%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingester_ingested_samples_total[1m]))
     ||| % $._config,
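
Under the same assumed values, the renamed per-instance recording rule resolves to a record name like the following sketch, which is the series name the updated expressions in alerts.libsonnet and blocks.libsonnet now reference.

// Sketch only; both array elements are assumed values, not defaults stated in this diff.
'%s_%s:cortex_ingester_ingested_samples_total:rate1m' % ['cluster_namespace', 'pod']
// == 'cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m'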

0 commit comments
