@@ -113,9 +113,9 @@ local utils = import 'mixin-utils/utils.libsonnet';
113113 {
114114 // Convenience rule to get the number of replicas for both a deployment and a statefulset.
115115 // Multi-zone deployments are grouped together removing the "zone-X" suffix.
116- record: '%(clusterLabel)s_namespace_deployment:actual_replicas:count' % _config,
116+ record: '%(alert_aggregation_rule_prefix)s_deployment:actual_replicas:count' % _config,
117117 expr: |||
118- sum by (%(clusterLabel)s, namespace, deployment) (
118+ sum by (%(alert_aggregation_labels)s, deployment) (
119119 label_replace(
120120 kube_deployment_spec_replicas,
121121 # The question mark in "(.*?)" is used to make it non-greedy, otherwise it
@@ -124,22 +124,22 @@ local utils = import 'mixin-utils/utils.libsonnet';
124124 )
125125 )
126126 or
127- sum by (%(clusterLabel)s, namespace, deployment) (
127+ sum by (%(alert_aggregation_labels)s, deployment) (
128128 label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")
129129 )
130130 ||| % _config,
131131 },
132132 {
133133 // Distributors should be able to deal with 240k samples/s.
134- record: '%(clusterLabel)s_namespace_deployment_reason:required_replicas:count' % _config,
134+ record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config,
135135 labels: {
136136 deployment: 'distributor',
137137 reason: 'sample_rate',
138138 },
139139 expr: |||
140140 ceil(
141141 quantile_over_time(0.99,
142- sum by (%(clusterLabel)s, namespace) (
142+ sum by (%(alert_aggregation_labels)s) (
143143 %(group_prefix_jobs)s:cortex_distributor_received_samples:rate5m
144144 )[24h:]
145145 )
@@ -150,30 +150,30 @@ local utils = import 'mixin-utils/utils.libsonnet';
150150 {
151151 // We should be about to cover 80% of our limits,
152152 // and ingester can have 80k samples/s.
153- record: '%(clusterLabel)s_namespace_deployment_reason:required_replicas:count' % _config,
153+ record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config,
154154 labels: {
155155 deployment: 'distributor',
156156 reason: 'sample_rate_limits',
157157 },
158158 expr: |||
159159 ceil(
160- sum by (%(clusterLabel)s, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"})
160+ sum by (%(alert_aggregation_labels)s) (cortex_limits_overrides{limit_name="ingestion_rate"})
161161 * %(limit_utilisation_target)s / %(max_samples_per_sec_per_distributor)s
162162 )
163163 ||| % _config,
164164 },
165165 {
166166 // We want ingesters each ingester to deal with 80k samples/s.
167167 // NB we measure this at the distributors and multiple by RF (3).
168- record: '%(clusterLabel)s_namespace_deployment_reason:required_replicas:count' % _config,
168+ record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config,
169169 labels: {
170170 deployment: 'ingester',
171171 reason: 'sample_rate',
172172 },
173173 expr: |||
174174 ceil(
175175 quantile_over_time(0.99,
176- sum by (%(clusterLabel)s, namespace) (
176+ sum by (%(alert_aggregation_labels)s) (
177177 %(group_prefix_jobs)s:cortex_distributor_received_samples:rate5m
178178 )[24h:]
179179 )
@@ -183,15 +183,15 @@ local utils = import 'mixin-utils/utils.libsonnet';
183183 },
184184 {
185185 // Ingester should have 1.5M series in memory
186- record: '%(clusterLabel)s_namespace_deployment_reason:required_replicas:count' % _config,
186+ record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config,
187187 labels: {
188188 deployment: 'ingester',
189189 reason: 'active_series',
190190 },
191191 expr: |||
192192 ceil(
193193 quantile_over_time(0.99,
194- sum by(%(clusterLabel)s, namespace) (
194+ sum by(%(alert_aggregation_labels)s) (
195195 cortex_ingester_memory_series
196196 )[24h:]
197197 )
@@ -202,47 +202,47 @@ local utils = import 'mixin-utils/utils.libsonnet';
202202 {
203203 // We should be about to cover 60% of our limits,
204204 // and ingester can have 1.5M series in memory
205- record: '%(clusterLabel)s_namespace_deployment_reason:required_replicas:count' % _config,
205+ record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config,
206206 labels: {
207207 deployment: 'ingester',
208208 reason: 'active_series_limits',
209209 },
210210 expr: |||
211211 ceil(
212- sum by (%(clusterLabel)s, namespace) (cortex_limits_overrides{limit_name="max_global_series_per_user"})
212+ sum by (%(alert_aggregation_labels)s) (cortex_limits_overrides{limit_name="max_global_series_per_user"})
213213 * 3 * %(limit_utilisation_target)s / %(max_series_per_ingester)s
214214 )
215215 ||| % _config,
216216 },
217217 {
218218 // We should be about to cover 60% of our limits,
219219 // and ingester can have 80k samples/s.
220- record: '%(clusterLabel)s_namespace_deployment_reason:required_replicas:count' % _config,
220+ record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config,
221221 labels: {
222222 deployment: 'ingester',
223223 reason: 'sample_rate_limits',
224224 },
225225 expr: |||
226226 ceil(
227- sum by (%(clusterLabel)s, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"})
227+ sum by (%(alert_aggregation_labels)s) (cortex_limits_overrides{limit_name="ingestion_rate"})
228228 * %(limit_utilisation_target)s / %(max_samples_per_sec_per_ingester)s
229229 )
230230 ||| % _config,
231231 },
232232 {
233233 // Ingesters store 96h of data on disk - we want memcached to store 1/4 of that.
234- record: '%(clusterLabel)s_namespace_deployment_reason:required_replicas:count' % _config,
234+ record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config,
235235 labels: {
236236 deployment: 'memcached',
237237 reason: 'active_series',
238238 },
239239 expr: |||
240240 ceil(
241- (sum by (%(clusterLabel)s, namespace) (
241+ (sum by (%(alert_aggregation_labels)s) (
242242 cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester.*"}
243243 ) / 4)
244244 /
245- avg by (%(clusterLabel)s, namespace) (
245+ avg by (%(alert_aggregation_labels)s) (
246246 memcached_limit_bytes{job=~".+/memcached"}
247247 )
248248 )
@@ -251,9 +251,9 @@ local utils = import 'mixin-utils/utils.libsonnet';
251251 {
252252 // Convenience rule to get the CPU utilization for both a deployment and a statefulset.
253253 // Multi-zone deployments are grouped together removing the "zone-X" suffix.
254- record: '%(clusterLabel)s_namespace_deployment:container_cpu_usage_seconds_total:sum_rate' % _config,
254+ record: '%(alert_aggregation_rule_prefix)s_deployment:container_cpu_usage_seconds_total:sum_rate' % _config,
255255 expr: |||
256- sum by (%(clusterLabel)s, namespace, deployment) (
256+ sum by (%(alert_aggregation_labels)s, deployment) (
257257 label_replace(
258258 label_replace(
259259 node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate,
@@ -269,7 +269,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
269269 {
270270 // Convenience rule to get the CPU request for both a deployment and a statefulset.
271271 // Multi-zone deployments are grouped together removing the "zone-X" suffix.
272- record: '%(clusterLabel)s_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum' % _config,
272+ record: '%(alert_aggregation_rule_prefix)s_deployment:kube_pod_container_resource_requests_cpu_cores:sum' % _config,
273273 expr: |||
274274 # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
275275 # that remove resource metrics, ref:
@@ -279,7 +279,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
279279 # This is the old expression, compatible with kube-state-metrics < v2.0.0,
280280 # where kube_pod_container_resource_requests_cpu_cores was removed:
281281 (
282- sum by (%(clusterLabel)s, namespace, deployment) (
282+ sum by (%(alert_aggregation_labels)s, deployment) (
283283 label_replace(
284284 label_replace(
285285 kube_pod_container_resource_requests_cpu_cores,
@@ -295,7 +295,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
295295 # This expression is compatible with kube-state-metrics >= v1.4.0,
296296 # where kube_pod_container_resource_requests was introduced.
297297 (
298- sum by (%(clusterLabel)s, namespace, deployment) (
298+ sum by (%(alert_aggregation_labels)s, deployment) (
299299 label_replace(
300300 label_replace(
301301 kube_pod_container_resource_requests{resource="cpu"},
@@ -313,26 +313,26 @@ local utils = import 'mixin-utils/utils.libsonnet';
313313 // Jobs should be sized to their CPU usage.
314314 // We do this by comparing 99th percentile usage over the last 24hrs to
315315 // their current provisioned #replicas and resource requests.
316- record: '%(clusterLabel)s_namespace_deployment_reason:required_replicas:count' % _config,
316+ record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config,
317317 labels: {
318318 reason: 'cpu_usage',
319319 },
320320 expr: |||
321321 ceil(
322- %(clusterLabel)s_namespace_deployment:actual_replicas:count
322+ %(alert_aggregation_rule_prefix)s_deployment:actual_replicas:count
323323 *
324- quantile_over_time(0.99, %(clusterLabel)s_namespace_deployment:container_cpu_usage_seconds_total:sum_rate[24h])
324+ quantile_over_time(0.99, %(alert_aggregation_rule_prefix)s_deployment:container_cpu_usage_seconds_total:sum_rate[24h])
325325 /
326- %(clusterLabel)s_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum
326+ %(alert_aggregation_rule_prefix)s_deployment:kube_pod_container_resource_requests_cpu_cores:sum
327327 )
328328 ||| % _config,
329329 },
330330 {
331331 // Convenience rule to get the Memory utilization for both a deployment and a statefulset.
332332 // Multi-zone deployments are grouped together removing the "zone-X" suffix.
333- record: '%(clusterLabel)s_namespace_deployment:container_memory_usage_bytes:sum' % _config,
333+ record: '%(alert_aggregation_rule_prefix)s_deployment:container_memory_usage_bytes:sum' % _config,
334334 expr: |||
335- sum by (%(clusterLabel)s, namespace, deployment) (
335+ sum by (%(alert_aggregation_labels)s, deployment) (
336336 label_replace(
337337 label_replace(
338338 container_memory_usage_bytes,
@@ -348,7 +348,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
348348 {
349349 // Convenience rule to get the Memory request for both a deployment and a statefulset.
350350 // Multi-zone deployments are grouped together removing the "zone-X" suffix.
351- record: '%(clusterLabel)s_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum' % _config,
351+ record: '%(alert_aggregation_rule_prefix)s_deployment:kube_pod_container_resource_requests_memory_bytes:sum' % _config,
352352 expr: |||
353353 # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
354354 # that remove resource metrics, ref:
@@ -358,7 +358,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
358358 # This is the old expression, compatible with kube-state-metrics < v2.0.0,
359359 # where kube_pod_container_resource_requests_memory_bytes was removed:
360360 (
361- sum by (%(clusterLabel)s, namespace, deployment) (
361+ sum by (%(alert_aggregation_labels)s, deployment) (
362362 label_replace(
363363 label_replace(
364364 kube_pod_container_resource_requests_memory_bytes,
@@ -374,7 +374,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
374374 # This expression is compatible with kube-state-metrics >= v1.4.0,
375375 # where kube_pod_container_resource_requests was introduced.
376376 (
377- sum by (%(clusterLabel)s, namespace, deployment) (
377+ sum by (%(alert_aggregation_labels)s, deployment) (
378378 label_replace(
379379 label_replace(
380380 kube_pod_container_resource_requests{resource="memory"},
@@ -392,17 +392,17 @@ local utils = import 'mixin-utils/utils.libsonnet';
392392 // Jobs should be sized to their Memory usage.
393393 // We do this by comparing 99th percentile usage over the last 24hrs to
394394 // their current provisioned #replicas and resource requests.
395- record: '%(clusterLabel)s_namespace_deployment_reason:required_replicas:count' % _config,
395+ record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config,
396396 labels: {
397397 reason: 'memory_usage',
398398 },
399399 expr: |||
400400 ceil(
401- %(clusterLabel)s_namespace_deployment:actual_replicas:count
401+ %(alert_aggregation_rule_prefix)s_deployment:actual_replicas:count
402402 *
403- quantile_over_time(0.99, %(clusterLabel)s_namespace_deployment:container_memory_usage_bytes:sum[24h])
403+ quantile_over_time(0.99, %(alert_aggregation_rule_prefix)s_deployment:container_memory_usage_bytes:sum[24h])
404404 /
405- %(clusterLabel)s_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum
405+ %(alert_aggregation_rule_prefix)s_deployment:kube_pod_container_resource_requests_memory_bytes:sum
406406 )
407407 ||| % _config,
408408 },
@@ -479,7 +479,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
479479 rules: [
480480 {
481481 // cortex_ingester_ingested_samples_total is per user, in this rule we want to see the sum per cluster/namespace/instance
482- record: '%s_namespace_%s:cortex_ingester_ingested_samples_total:rate1m' % [$._config.clusterLabel, $._config.per_instance_label],
482+ record: '%s_%s:cortex_ingester_ingested_samples_total:rate1m' % [$._config.alert_aggregation_rule_prefix, $._config.per_instance_label],
483483 expr: |||
484484 sum by(%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingester_ingested_samples_total[1m]))
485485 ||| % $._config,
0 commit comments