diff --git a/scripts/purge-docker-registry/docker-registry-curl.bash b/scripts/purge-docker-registry/docker-registry-curl.bash index 6653b8f5d..dd3900d99 100755 --- a/scripts/purge-docker-registry/docker-registry-curl.bash +++ b/scripts/purge-docker-registry/docker-registry-curl.bash @@ -29,7 +29,7 @@ main() { console "${REGISTRY_HOST}" console "${WWW_AUTHENTICATE}" - if [ "x${WWW_AUTHENTICATE}" != "x" ];then + if [ "${WWW_AUTHENTICATE}" != "" ];then # we need to get a token DOCKER_AUTH_TYPE=$(echo "${WWW_AUTHENTICATE}" | cut --delimiter=" " --fields=1) DETAILS=$(echo "${WWW_AUTHENTICATE}" | cut --delimiter=" " --fields=2-) @@ -42,7 +42,7 @@ main() { SCOPE=$(echo "${DETAILS}" | cut --delimiter=',' --fields=3 | cut --delimiter="=" --fields=2 | tr --delete '"') if [ -v DOCKER_AUTH ];then : - elif [[ "x${DOCKER_USERNAME}" != "x" && "x${DOCKER_PASSWORD}" != "x" ]];then + elif [[ "${DOCKER_USERNAME}" != "" && "${DOCKER_PASSWORD}" != "" ]];then DOCKER_AUTH="${DOCKER_USERNAME}:${DOCKER_PASSWORD}" elif [ -e ~/.docker/config.json ];then DOCKER_AUTH=$(jq -r ".[\"auths\"][\"${REGISTRY_HOST}\"][\"auth\"]" ~/.docker/config.json | base64 -d) diff --git a/services/jaeger/docker-compose.yml.j2 b/services/jaeger/docker-compose.yml.j2 index a04a36498..e554825cb 100644 --- a/services/jaeger/docker-compose.yml.j2 +++ b/services/jaeger/docker-compose.yml.j2 @@ -41,6 +41,9 @@ services: command: - "--config=/etc/otel/config.yaml" deploy: + labels: + - prometheus-job=otel-collector + - prometheus-port=8888 placement: constraints: - node.labels.ops==true diff --git a/services/jaeger/opentelemetry-collector-config.yaml b/services/jaeger/opentelemetry-collector-config.yaml index 729af7727..1064db36c 100644 --- a/services/jaeger/opentelemetry-collector-config.yaml +++ b/services/jaeger/opentelemetry-collector-config.yaml @@ -19,6 +19,14 @@ service: exporters: [otlphttp,otlp] processors: [batch,filter/drop_healthcheck] telemetry: + metrics: + readers: + - pull: + exporter: + prometheus: + 
host: '0.0.0.0' + port: 8888 + logs: level: ${TRACING_OPENTELEMETRY_COLLECTOR_SERVICE_TELEMETRY_LOG_LEVEL} processors: diff --git a/services/logging/docker-compose.yml.j2 b/services/logging/docker-compose.yml.j2 index bcc031c16..d208620a1 100644 --- a/services/logging/docker-compose.yml.j2 +++ b/services/logging/docker-compose.yml.j2 @@ -118,18 +118,21 @@ services: - VECTOR_CONFIG=/etc/vector/vector.yaml - VECTOR_LOG=info - VECTOR_LOG_DESTINATION=${VECTOR_LOG_DESTINATION} + - PROMETHEUS_SCRAPE_INTERVAL=${PROMETHEUS_SCRAPE_INTERVAL} configs: - source: vector_config target: /etc/vector/vector.yaml deploy: replicas: 1 + labels: + - prometheus-job=vector + - prometheus-port=9598 resources: limits: cpus: "1.0" memory: 512M reservations: memory: 256M - labels: [] networks: logging: @@ -153,6 +156,9 @@ services: - S3_ENDPOINT_LOKI=${S3_ENDPOINT_LOKI} - LOKI_RETENTION_PERIOD=${LOKI_RETENTION_PERIOD} deploy: + labels: + - prometheus-job=loki + - prometheus-port=3100 placement: constraints: [] replicas: 1 diff --git a/services/logging/template.env b/services/logging/template.env index 3878dc1da..ecdcffee9 100644 --- a/services/logging/template.env +++ b/services/logging/template.env @@ -25,3 +25,4 @@ S3_REGION_LOKI=${S3_REGION_LOKI} S3_SECRET_KEY_LOKI=${S3_SECRET_KEY_LOKI} STORAGE_DOMAIN=${STORAGE_DOMAIN} VECTOR_LOG_DESTINATION=${VECTOR_LOG_DESTINATION} +PROMETHEUS_SCRAPE_INTERVAL=${PROMETHEUS_SCRAPE_INTERVAL} diff --git a/services/logging/vector.yaml b/services/logging/vector.yaml index bfa9ecaf7..71c40ecf6 100644 --- a/services/logging/vector.yaml +++ b/services/logging/vector.yaml @@ -2,6 +2,9 @@ sources: # Receive GELF messages from Docker containers via UDP + vector_metrics: + type: internal_metrics + scrape_interval_secs: ${PROMETHEUS_SCRAPE_INTERVAL} docker_gelf: type: socket address: "0.0.0.0:12201" @@ -115,7 +118,11 @@ sinks: healthcheck: enabled: true - + prometheus_exporter: + type: prometheus_exporter + inputs: + - vector_metrics + address: "0.0.0.0:9598" # Send 
to Graylog via GELF over TCP graylog: type: socket diff --git a/services/monitoring/Makefile b/services/monitoring/Makefile index 567c139e9..395b29827 100644 --- a/services/monitoring/Makefile +++ b/services/monitoring/Makefile @@ -132,6 +132,16 @@ config.prometheus.simcore: ${REPO_CONFIG_LOCATION} venv envsubst < prometheus/prometheus.yml > prometheus/prometheus.temp.yml; \ mv prometheus/prometheus.temp.yml prometheus/prometheus.yml +.PHONY: config.prometheus.federation +config.prometheus.federation: ${REPO_CONFIG_LOCATION} venv + @set -o allexport; \ + source $(REPO_CONFIG_LOCATION); \ + set +o allexport; \ + envsubst < prometheus/prometheus-federation.template.yml > prometheus/prometheus-federation.yml + +.PHONY: prometheus/prometheus-federation.yml +prometheus/prometheus-federation.yml: config.prometheus.federation + .PHONY: config.prometheus.simcore.aws config.prometheus.simcore.aws: ${REPO_CONFIG_LOCATION} venv @set -o allexport; \ diff --git a/services/monitoring/docker-compose.yml.j2 b/services/monitoring/docker-compose.yml.j2 index 7d46a0c4d..12c9be240 100644 --- a/services/monitoring/docker-compose.yml.j2 +++ b/services/monitoring/docker-compose.yml.j2 @@ -230,12 +230,11 @@ services: - monitored # needed to access postgres - public deploy: - #restart_policy: - # condition: on-failure labels: + - prometheus-job=grafana + - prometheus-port=3000 - traefik.enable=true - traefik.swarm.network=${PUBLIC_NETWORK} - # direct access through port - traefik.http.services.grafana.loadbalancer.server.port=3000 - traefik.http.routers.grafana.rule=Host(`${MONITORING_DOMAIN}`) && PathPrefix(`/grafana`) - traefik.http.routers.grafana.entrypoints=https @@ -391,6 +390,8 @@ services: - monitored deploy: labels: + - prometheus-job=tempo + - prometheus-port=3200 - traefik.enable=true - traefik.swarm.network=${PUBLIC_NETWORK} - traefik.http.services.tempo.loadbalancer.server.port=9095 diff --git a/services/monitoring/prometheus/.gitignore 
b/services/monitoring/prometheus/.gitignore index 366042e4a..3748ce8f6 100644 --- a/services/monitoring/prometheus/.gitignore +++ b/services/monitoring/prometheus/.gitignore @@ -1,2 +1,3 @@ prometheus-ceph.yml prometheus.yml +prometheus-federation.yml diff --git a/services/monitoring/prometheus/prometheus-base.yml b/services/monitoring/prometheus/prometheus-base.yml index 31705a2db..3c4bce154 100644 --- a/services/monitoring/prometheus/prometheus-base.yml +++ b/services/monitoring/prometheus/prometheus-base.yml @@ -1,9 +1,9 @@ # global config # DOLLAR SIGNS NEED TO BE EXCAPED (see https://stackoverflow.com/a/61259844/10198629) global: - scrape_interval: 15s # By default, scrape targets every 15 seconds. - evaluation_interval: 15s # By default, scrape targets every 15 seconds. - # scrape_timeout global default would be (10s). + scrape_interval: ${PROMETHEUS_SCRAPE_INTERVAL}s + evaluation_interval: ${PROMETHEUS_SCRAPE_INTERVAL}s # Evaluate rules at the same cadence as scraping. + scrape_timeout: ${PROMETHEUS_SCRAPE_TIMEOUT}s # Attach these labels to any time series or alerts when communicating with # external systems (federation, remote storage, Alertmanager). 
diff --git a/services/monitoring/prometheus/prometheus-ceph.yml.j2 b/services/monitoring/prometheus/prometheus-ceph.yml.j2 index 5181bed24..de4a31321 100644 --- a/services/monitoring/prometheus/prometheus-ceph.yml.j2 +++ b/services/monitoring/prometheus/prometheus-ceph.yml.j2 @@ -2,8 +2,8 @@ scrape_configs: - job_name: ceph-production honor_labels: true honor_timestamps: true - scrape_interval: 30s - scrape_timeout: 30s + scrape_interval: {{PROMETHEUS_SCRAPE_INTERVAL}}s + scrape_timeout: {{PROMETHEUS_SCRAPE_TIMEOUT}}s metrics_path: /metrics scheme: http static_configs: diff --git a/services/monitoring/prometheus/prometheus-federation.yml b/services/monitoring/prometheus/prometheus-federation.template.yml similarity index 71% rename from services/monitoring/prometheus/prometheus-federation.yml rename to services/monitoring/prometheus/prometheus-federation.template.yml index 424916a7f..e5868f455 100644 --- a/services/monitoring/prometheus/prometheus-federation.yml +++ b/services/monitoring/prometheus/prometheus-federation.template.yml @@ -1,10 +1,12 @@ global: - scrape_interval: 29s # Set the scrape interval to every 29 seconds. Default is every 1 minute. - evaluation_interval: 29s # Evaluate rules every 29 seconds. The default is every 1 minute. + scrape_interval: ${PROMETHEUS_SCRAPE_INTERVAL}s + evaluation_interval: ${PROMETHEUS_SCRAPE_INTERVAL}s + scrape_timeout: ${PROMETHEUS_SCRAPE_TIMEOUT}s scrape_configs: - job_name: 'federate' # A job defines a series of targets and parameters describing how to scrape them. - scrape_interval: 29s # Overwrite the global scrape interval for this job, set to every 29 seconds. + scrape_interval: ${PROMETHEUS_SCRAPE_INTERVAL}s # Overwrite the global scrape interval for this job. + scrape_timeout: ${PROMETHEUS_SCRAPE_TIMEOUT}s # Overwrite the global scrape timeout for this job. honor_labels: true # Do not overwrite labels in scraped data. 
scheme: http metrics_path: '/federate' # Path to fetch the metrics from, '/federate' is for federation. diff --git a/services/monitoring/prometheus/prometheus-simcore.yml b/services/monitoring/prometheus/prometheus-simcore.yml index 23e6f6834..657339ba8 100644 --- a/services/monitoring/prometheus/prometheus-simcore.yml +++ b/services/monitoring/prometheus/prometheus-simcore.yml @@ -1,7 +1,8 @@ scrape_configs: # SIMCORE ------------------------------------------------------------------- - job_name: "simcore" - scrape_interval: 15s + scrape_interval: ${PROMETHEUS_SCRAPE_INTERVAL}s + scrape_timeout: ${PROMETHEUS_SCRAPE_TIMEOUT}s relabel_configs: - source_labels: [__meta_dns_name] separator: ; diff --git a/services/monitoring/template.env b/services/monitoring/template.env index 5bbc1f7f7..137163802 100644 --- a/services/monitoring/template.env +++ b/services/monitoring/template.env @@ -25,6 +25,8 @@ MONITORED_NETWORK=${MONITORED_NETWORK} TEMPO_S3_BUCKET=${TEMPO_S3_BUCKET} STORAGE_DOMAIN=${STORAGE_DOMAIN} S3_REGION=${S3_REGION} +PROMETHEUS_SCRAPE_INTERVAL=${PROMETHEUS_SCRAPE_INTERVAL} +PROMETHEUS_SCRAPE_TIMEOUT=${PROMETHEUS_SCRAPE_TIMEOUT} S3_ACCESS_KEY=${S3_ACCESS_KEY} S3_SECRET_KEY=${S3_SECRET_KEY} TF_VAR_PROMETHEUS_CATCHALL_URL=${TF_VAR_PROMETHEUS_CATCHALL_URL} diff --git a/services/monitoring/tempo_config.yaml.j2 b/services/monitoring/tempo_config.yaml.j2 index c28c41425..58b416833 100644 --- a/services/monitoring/tempo_config.yaml.j2 +++ b/services/monitoring/tempo_config.yaml.j2 @@ -1,4 +1,5 @@ server: + http_listen_address: 0.0.0.0 http_listen_port: 3200 distributor: @@ -70,3 +71,5 @@ overrides: rate_limit_bytes: 30000000 burst_size_bytes: 40000000 max_traces_per_user: 10000 +usage_report: + reporting_enabled: false