
Commit 7c1bd9a

feat(base-cluster/tracing): add gateway to enable tail sampling (#1736)
This way, traces with errors are always stored, as are traces that take longer than 200 ms, plus a random 0.1% of all traces. Adjust your clients to simply sample 100% of spans/traces.

## Summary by CodeRabbit

* **New Features**
  * Added a telemetry gateway for distributed tracing with tail-sampling policies and OTLP endpoints.
  * Enabled JSON-formatted logs in the telemetry collector.
  * Added a load-balanced OTLP exporter with Kubernetes service discovery and zstd compression.
* **Refactor**
  * Simplified telemetry enablement across components (no longer tied to Prometheus).
  * Standardized OTLP endpoint handling to explicit host:port.
  * Moved descheduler tracing options to a unified telemetry command-options block.
  * Enriched Kubernetes metadata on traces and increased default trace sampling to 100%.
1 parent 303b0b6 commit 7c1bd9a
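Since the sampling decision now happens at the gateway, head-based sampling in client SDKs only loses data. A minimal sketch of what "sample 100% at the client" might look like, using the standard OpenTelemetry SDK environment variables; the workload name and collector endpoint below are illustrative, not part of this chart:

```yaml
# Illustrative client pod spec fragment: sample everything in the SDK and
# let the telemetry gateway's tail-sampling policies decide what to keep.
containers:
  - name: my-app                         # hypothetical workload
    env:
      - name: OTEL_TRACES_SAMPLER
        value: parentbased_always_on     # i.e. sample 100% of traces
      - name: OTEL_EXPORTER_OTLP_ENDPOINT
        value: http://grafana-alloy.monitoring:4317   # assumed collector address
```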

7 files changed

Lines changed: 184 additions & 27 deletions


charts/base-cluster/templates/descheduler/descheduler.yaml

Lines changed: 9 additions & 9 deletions
```diff
@@ -38,15 +38,15 @@ spec:
       additionalLabels:
         monitoring/provisioned-by: base-cluster
     {{- end }}
-    deschedulerPolicy: {{- $telemetryConf := include "common.telemetry.conf" (dict "protocol" "otlp") | fromYaml }}
-    {{- if and $telemetryConf.enabled .Values.monitoring.prometheus.enabled }}
-    tracing:
-      collectorEndpoint: {{ printf "%s:%d" $telemetryConf.host (int64 $telemetryConf.port) | quote }}
-      serviceName: descheduler
-      serviceNamespace: kube-system
-      sampleRate: 1.0
-      fallbackToNoOpProviderOnError: true
-    {{- end }}
+    {{- $telemetryConf := include "common.telemetry.conf" (dict "protocol" "otlp") | fromYaml }}
+    {{- if $telemetryConf.enabled }}
+    cmdOptions:
+      otel-collector-endpoint: {{ printf "%s:%d" $telemetryConf.host (int64 $telemetryConf.port) | quote }}
+      otel-service-name: descheduler
+      otel-trace-namespace: kube-system
+      otel-sample-rate: 1.0
+    {{- end }}
+    deschedulerPolicy:
       profiles:
         - name: default
           {{- toYaml .Values.descheduler.profile | nindent 10 }}
```
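Assuming the upstream descheduler chart renders each cmdOptions key as a `--key=value` flag, the block above should produce container arguments roughly like the sketch below; the binary path and endpoint value are illustrative, the actual endpoint comes from `common.telemetry.conf`:

```yaml
# Hypothetical rendered descheduler container args.
command:
  - /bin/descheduler
args:
  - --otel-collector-endpoint=grafana-alloy.monitoring:4317
  - --otel-service-name=descheduler
  - --otel-trace-namespace=kube-system
  - --otel-sample-rate=1.0
```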

charts/base-cluster/templates/ingress/nginx.yaml

Lines changed: 1 addition & 1 deletion
```diff
@@ -29,7 +29,7 @@ spec:
       additionalLabels:
         monitoring/provisioned-by: base-cluster
     {{- $telemetryConf := include "common.telemetry.conf" (dict "protocol" "otlp") | fromYaml }}
-    {{- if and $telemetryConf.enabled .Values.monitoring.prometheus.enabled }}
+    {{- if $telemetryConf.enabled }}
     opentelemetry:
       enabled: true
       {{- if and .Values.global.imageRegistry false }}
```

charts/base-cluster/templates/ingress/traefik.yaml

Lines changed: 2 additions & 1 deletion
```diff
@@ -85,7 +85,8 @@ spec:
         otlp:
           enabled: true
           grpc:
-            endpoint: {{ $telemetryConf.endpoint }}
+            enabled: true
+            endpoint: {{ printf "%s:%d" $telemetryConf.host (int64 $telemetryConf.port) | quote }}
             {{- if $telemetryConf.insecure }}
             insecure: true
             {{- end }}
```

charts/base-cluster/templates/kyverno/kyverno.yaml

Lines changed: 4 additions & 5 deletions
```diff
@@ -64,26 +64,25 @@ spec:
     admissionController:
       replicas: 3
       {{- $telemetryConf := include "common.telemetry.conf" (dict "protocol" "jaeger" "serviceProtocol" "grpc") | fromYaml -}}
-      {{- $telemetryEnabled := and $telemetryConf.enabled .Values.monitoring.prometheus.enabled -}}
-      {{- if $telemetryEnabled }}
+      {{- if $telemetryConf.enabled }}
       tracing: &tracingConfig
         enabled: true
         address: {{ $telemetryConf.host }}
         port: {{ int64 $telemetryConf.port }}
       {{- end }}
     backgroundController:
       replicas: 2
-      {{- if $telemetryEnabled }}
+      {{- if $telemetryConf.enabled }}
       tracing: *tracingConfig
       {{- end }}
     reportsController:
       replicas: 2
-      {{- if $telemetryEnabled }}
+      {{- if $telemetryConf.enabled }}
       tracing: *tracingConfig
       {{- end }}
     cleanupController:
       replicas: 2
-      {{- if $telemetryEnabled }}
+      {{- if $telemetryConf.enabled }}
       tracing: *tracingConfig
       {{- end }}
     podDisruptionBudget:
```

charts/base-cluster/templates/monitoring/alloy.yaml renamed to charts/base-cluster/templates/monitoring/alloy-collector.yaml

Lines changed: 37 additions & 10 deletions
```diff
@@ -41,6 +41,10 @@ spec:
             type: RuntimeDefault
       configMap:
         content: |
+          logging {
+            format = "json"
+          }
+
           {{- if .Values.monitoring.loki.enabled }}
           discovery.kubernetes "pods" {
             role = "pod"
@@ -189,11 +193,26 @@ spec:
             }
           }

-          otelcol.processor.k8sattributes "default" {
-            auth_type = "serviceAccount"
+          tracing {
+            sampling_fraction = 1.0
+            write_to = [otelcol.processor.k8sattributes.default.input]
+          }

+          otelcol.processor.k8sattributes "default" {
             extract {
-              metadata = ["k8s.namespace.name", "k8s.deployment.name", "k8s.statefulset.name", "k8s.daemonset.name", "k8s.cronjob.name", "k8s.job.name", "k8s.node.name", "k8s.pod.name", "k8s.pod.uid", "k8s.pod.start_time"]
+              metadata = [
+                "k8s.namespace.name",
+                "k8s.deployment.name",
+                "k8s.replicaset.name", "k8s.replicaset.uid",
+                "k8s.statefulset.name", "k8s.statefulset.uid",
+                "k8s.daemonset.name", "k8s.daemonset.uid",
+                "k8s.cronjob.name",
+                "k8s.job.name", "k8s.job.uid",
+                "k8s.node.name",
+                "k8s.pod.name", "k8s.pod.uid", "k8s.pod.start_time",
+                "k8s.container.name", "container.id",
+                "container.image.name", "container.image.tag",
+              ]
             }

             pod_association {
@@ -223,16 +242,24 @@ spec:

           otelcol.processor.batch "default" {
             output {
-              traces = [otelcol.connector.servicegraph.default.input, otelcol.exporter.otlp.tempo.input]
+              traces = [otelcol.connector.servicegraph.default.input, otelcol.exporter.loadbalancing.gateway.input]
             }
           }

-          otelcol.exporter.otlp "tempo" {
-            client {
-              endpoint = "grafana-tempo-distributor:4317"
-
-              tls {
-                insecure = true
+          otelcol.exporter.loadbalancing "gateway" {
+            resolver {
+              kubernetes {
+                service = "telemetry-gateway"
+              }
+            }
+            protocol {
+              otlp {
+                client {
+                  compression = "zstd"
+                  tls {
+                    insecure = true
+                  }
+                }
               }
             }
           }
```
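The kubernetes resolver of otelcol.exporter.loadbalancing watches the Endpoints of the telemetry-gateway Service so that all spans of one trace reach the same gateway replica (a prerequisite for correct tail sampling). That watch typically needs RBAC the collector may not already have; a hedged sketch of the kind of Role involved, with illustrative names, since the Alloy chart may already grant equivalent permissions:

```yaml
# Illustrative RBAC for the kubernetes resolver; verify against the
# permissions the Alloy chart already creates before adding it.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: telemetry-gateway-endpoints-reader   # hypothetical name
  namespace: monitoring
rules:
  - apiGroups: [""]
    resources: ["endpoints"]
    verbs: ["get", "list", "watch"]
```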
Lines changed: 130 additions & 0 deletions
```diff
@@ -0,0 +1,130 @@
+{{- if and .Values.monitoring.prometheus.enabled .Values.monitoring.tracing.enabled -}}
+apiVersion: helm.toolkit.fluxcd.io/v2
+kind: HelmRelease
+metadata:
+  name: telemetry-gateway
+  namespace: monitoring
+  labels: {{- include "common.labels.standard" $ | nindent 4 }}
+    app.kubernetes.io/component: alloy
+    app.kubernetes.io/part-of: monitoring
+spec:
+  chart:
+    spec: {{- include "base-cluster.helm.chartSpec" (dict "repo" "grafana" "chart" "alloy" "context" $) | nindent 6 }}
+  interval: 1h
+  driftDetection:
+    mode: enabled
+  install:
+    timeout: 10m0s
+    crds: Skip
+  upgrade:
+    timeout: 10m0s
+    crds: Skip
+  dependsOn:
+    - name: kube-prometheus-stack
+      namespace: monitoring
+  values:
+    fullnameOverride: telemetry-gateway
+    {{- if .Values.global.imageRegistry }}
+    global:
+      image:
+        registry: {{ $.Values.global.imageRegistry }}
+    {{- end }}
+    alloy:
+      enableReporting: false
+      resources: {{- include "common.resources" .Values.monitoring.loki.promtail | nindent 8 }}
+      {{- if .Values.monitoring.loki.enabled }}
+      mounts:
+        varlog: true
+      {{- end }}
+      securityContext:
+        seccompProfile:
+          type: RuntimeDefault
+      configMap:
+        content: |
+          logging {
+            format = "json"
+          }
+
+          otelcol.receiver.otlp "default" {
+            grpc {}
+
+            output {
+              traces = [otelcol.processor.tail_sampling.policies.input]
+            }
+          }
+
+          tracing {
+            sampling_fraction = 1.0
+            write_to = [otelcol.processor.tail_sampling.policies.input]
+          }
+
+          otelcol.processor.tail_sampling "policies" {
+            decision_wait = "30s"
+
+            policy {
+              name = "sample-erroring-traces"
+              type = "status_code"
+              status_code {
+                status_codes = [ "ERROR" ]
+              }
+            }
+
+            policy {
+              name = "sample-long-traces"
+              type = "latency"
+              latency {
+                threshold_ms = 200
+              }
+            }
+
+            policy {
+              name = "sample-random"
+              type = "probabilistic"
+              probabilistic {
+                sampling_percentage = 0.1
+              }
+            }
+
+            output {
+              traces = [otelcol.processor.batch.default.input]
+            }
+          }
+
+          otelcol.processor.batch "default" {
+            output {
+              traces = [otelcol.exporter.otlp.tempo.input]
+            }
+          }
+
+          otelcol.exporter.otlp "tempo" {
+            client {
+              endpoint = "grafana-tempo-distributor:4317"
+
+              tls {
+                insecure = true
+              }
+            }
+          }
+      extraPorts:
+        - name: metrics
+          port: 8888
+          protocol: TCP
+          targetPort: 8888
+        - name: otlp
+          port: 4317
+          appProtocol: grpc
+          protocol: TCP
+          targetPort: 4317
+    crds:
+      create: false
+    controller:
+      type: deployment
+      autoscaling:
+        enabled: true
+        minReplicas: 2
+      priorityClassName: monitoring-components
+    serviceMonitor:
+      enabled: true
+      additionalLabels:
+        monitoring/provisioned-by: base-cluster
+{{- end -}}
```
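The three tail-sampling policies combine with OR semantics: after the 30 s decision window, a trace is kept if it contains an error, took longer than 200 ms, or falls into the 0.1% random sample; everything else is dropped. To smoke-test the pipeline end to end, one option is to push synthetic traces at the gateway's OTLP port, sketched here with the community telemetrygen tool (the image tag and flag values are assumptions, not part of this chart):

```yaml
# Hypothetical smoke test: generate a handful of traces against the gateway.
apiVersion: batch/v1
kind: Job
metadata:
  name: trace-smoke-test   # hypothetical name
  namespace: monitoring
spec:
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: telemetrygen
          image: ghcr.io/open-telemetry/opentelemetry-collector-contrib/telemetrygen:0.104.0
          args:
            - traces
            - --otlp-endpoint=telemetry-gateway.monitoring:4317
            - --otlp-insecure
            - --traces=10
```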

charts/base-cluster/templates/monitoring/kube-prometheus-stack/_prometheus_config.yaml

Lines changed: 1 addition & 1 deletion
```diff
@@ -62,7 +62,7 @@ prometheusSpec:
   {{- if $telemetryConf.enabled }}
   tracingConfig:
     clientType: grpc
-    samplingFraction: "0.1"
+    samplingFraction: "1.0"
     insecure: true
     endpoint: {{ printf "%s:%d" $telemetryConf.host (int64 $telemetryConf.port) | quote }}
   {{- end }}
```
