From a4a8fb71f633296d6aec372892d2dc1e7c9b8d92 Mon Sep 17 00:00:00 2001 From: Feng Pan Date: Tue, 9 Sep 2025 11:49:48 +0000 Subject: [PATCH 1/3] Add sonic-telemetry-sidecar container --- .../docker-telemetry-sidecar/Dockerfile.j2 | 33 +++ .../docker-telemetry-sidecar/supervisord.conf | 37 ++++ .../systemd_scripts/telemetry.sh | 100 +++++++++ .../docker-telemetry-sidecar/systemd_stub.py | 203 ++++++++++++++++++ files/image_config/monit/container_checker | 12 +- rules/docker-telemetry-sidecar.dep | 10 + rules/docker-telemetry-sidecar.mk | 29 +++ 7 files changed, 419 insertions(+), 5 deletions(-) create mode 100644 dockers/docker-telemetry-sidecar/Dockerfile.j2 create mode 100644 dockers/docker-telemetry-sidecar/supervisord.conf create mode 100644 dockers/docker-telemetry-sidecar/systemd_scripts/telemetry.sh create mode 100644 dockers/docker-telemetry-sidecar/systemd_stub.py create mode 100644 rules/docker-telemetry-sidecar.dep create mode 100644 rules/docker-telemetry-sidecar.mk diff --git a/dockers/docker-telemetry-sidecar/Dockerfile.j2 b/dockers/docker-telemetry-sidecar/Dockerfile.j2 new file mode 100644 index 00000000000..f2e6535186c --- /dev/null +++ b/dockers/docker-telemetry-sidecar/Dockerfile.j2 @@ -0,0 +1,33 @@ +{% from "dockers/dockerfile-macros.j2" import install_debian_packages, install_python_wheels, copy_files %} +ARG BASE=docker-config-engine-bookworm-{{DOCKER_USERNAME}}:{{DOCKER_USERTAG}} + +FROM $BASE AS base + +ARG docker_container_name +ARG image_version +RUN [ -f /etc/rsyslog.conf ] && sed -ri "s/%syslogtag%/$docker_container_name#%syslogtag%/;" /etc/rsyslog.conf + +# Make apt-get non-interactive +ENV DEBIAN_FRONTEND=noninteractive + +# Pass the image_version to container +ENV IMAGE_VERSION=$image_version + +COPY ["systemd_stub.py", "/usr/bin/"] +COPY ["systemd_scripts/", "/usr/share/sonic/systemd_scripts/"] +COPY ["files/image_config/monit/container_checker", "/usr/share/sonic/systemd_scripts/container_checker"] +COPY ["supervisord.conf", "/etc/supervisor/conf.d/"] + +RUN chmod +x /usr/bin/systemd_stub.py + +FROM $BASE + +RUN --mount=type=bind,from=base,target=/changes-to-image rsync -axAX --no-D --exclude=/sys --exclude=/proc --exclude=/dev --exclude=resolv.conf /changes-to-image/ / + +# Make apt-get non-interactive +ENV DEBIAN_FRONTEND=noninteractive + +# Pass the image_version to container +ENV IMAGE_VERSION=$image_version + +ENTRYPOINT ["/usr/local/bin/supervisord"] diff --git a/dockers/docker-telemetry-sidecar/supervisord.conf b/dockers/docker-telemetry-sidecar/supervisord.conf new file mode 100644 index 00000000000..25363a69ee5 --- /dev/null +++ b/dockers/docker-telemetry-sidecar/supervisord.conf @@ -0,0 +1,37 @@ +[supervisord] +logfile_maxbytes=1MB +logfile_backups=2 +nodaemon=true + +[eventlistener:dependent-startup] +command=python3 -m supervisord_dependent_startup +autostart=true +autorestart=unexpected +startretries=0 +exitcodes=0,3 +events=PROCESS_STATE +buffer_size=1024 + +[program:rsyslogd] +command=/usr/sbin/rsyslogd -n -iNONE +priority=1 +autostart=false +autorestart=unexpected +stdout_logfile=NONE +stdout_syslog=true +stderr_logfile=NONE +stderr_syslog=true +dependent_startup=true + +[program:systemd_stub] +command=python3 /usr/bin/systemd_stub.py +priority=3 +autostart=false +autorestart=false +startsecs=0 +stdout_logfile=NONE +stdout_syslog=true +stderr_logfile=NONE +stderr_syslog=true +dependent_startup=true +dependent_startup_wait_for=rsyslogd:running diff --git a/dockers/docker-telemetry-sidecar/systemd_scripts/telemetry.sh b/dockers/docker-telemetry-sidecar/systemd_scripts/telemetry.sh new file mode 100644 index 00000000000..1954bcfe01a --- /dev/null +++ b/dockers/docker-telemetry-sidecar/systemd_scripts/telemetry.sh @@ -0,0 +1,100 @@ +#!/bin/bash +set -euo pipefail + +SERVICE="telemetry" +NS="${NS:-sonic}" # k8s namespace +LABEL="raw_container_name=${SERVICE}" # selector used by DaemonSet +KUBECTL_BIN="${KUBECTL_BIN:-kubectl}" +NODE_NAME="${NODE_NAME:-$(hostname)}" +DEV="${2:-}" # accepted for compatibility; unused (single-ASIC) + +log() { /usr/bin/logger -t "${SERVICE}#system" "$*"; } + +require_kubectl() { + if ! command -v "${KUBECTL_BIN}" >/dev/null 2>&1; then + echo "ERROR: kubectl not found (KUBECTL_BIN=${KUBECTL_BIN})." >&2 + exit 127 + fi + # Try a sensible default if KUBECONFIG isn’t set + if [[ -z "${KUBECONFIG:-}" && -r /etc/kubernetes/kubelet.conf ]]; then + export KUBECONFIG=/etc/kubernetes/kubelet.conf + fi +} + +pods_on_node() { + # Prints: " " per line for this node + "${KUBECTL_BIN}" -n "${NS}" get pods \ + -l "${LABEL}" \ + --field-selector "spec.nodeName=${NODE_NAME}" \ + -o jsonpath='{range .items[*]}{.metadata.name}{" "}{.status.phase}{"\n"}{end}' 2>/dev/null || true +} + +kill_pods() { + require_kubectl + local found=0 + while read -r name phase; do + [[ -z "${name}" ]] && continue + found=1 + log "Deleting ${SERVICE} pod ${name} (phase=${phase}) on node ${NODE_NAME}" + # Force/instant delete to emulate “kill”; DaemonSet will recreate + "${KUBECTL_BIN}" -n "${NS}" delete pod "${name}" --grace-period=0 --force >/dev/null 2>&1 || true + done < <(pods_on_node) + if [[ "${found}" -eq 0 ]]; then + log "No ${SERVICE} pods found on node ${NODE_NAME} (namespace=${NS}, label=${LABEL})." + fi +} + +cmd_start() { kill_pods; } # start == kill (DS restarts) +cmd_stop() { kill_pods; } +cmd_restart() { kill_pods; sleep 1; kill_pods; } + +cmd_status() { + require_kubectl + local out; out="$(pods_on_node)" + if [[ -z "${out}" ]]; then + echo "${SERVICE}: NOT RUNNING (no pod on node ${NODE_NAME})" + exit 3 + fi + echo "${out}" | while read -r name phase; do + [[ -z "${name}" ]] && continue + echo "${SERVICE} pod ${name}: ${phase}" + done + # Exit 0 if at least one Running, 1 otherwise + if echo "${out}" | awk '$2=="Running"{found=1} END{exit found?0:1}'; then + exit 0 + else + exit 1 + fi +} + +cmd_wait() { + require_kubectl + log "Waiting on ${SERVICE} pods (ns=${NS}, label=${LABEL}) on node ${NODE_NAME}..." + # Keep the systemd service 'active' as long as at least one pod exists for this node. + while true; do + local out; out="$(pods_on_node)" + if [[ -z "${out}" ]]; then + # no pod presently; keep waiting (DaemonSet may bring it up) + sleep 5 + continue + fi + # If at least one is Running, sleep longer; otherwise poll faster + if echo "${out}" | awk '$2=="Running"{found=1} END{exit found?0:1}'; then + sleep 60 + else + sleep 5 + fi + done +} + +case "${1:-}" in + start) cmd_start ;; + stop) cmd_stop ;; + restart) cmd_restart ;; + wait) cmd_wait ;; + status) cmd_status ;; + *) + echo "Usage: $0 {start|stop|restart|wait|status} [asic-id(optional, ignored)]" >&2 + exit 2 + ;; +esac diff --git a/dockers/docker-telemetry-sidecar/systemd_stub.py b/dockers/docker-telemetry-sidecar/systemd_stub.py new file mode 100644 index 00000000000..a889ba0c7e0 --- /dev/null +++ b/dockers/docker-telemetry-sidecar/systemd_stub.py @@ -0,0 +1,203 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import os +import sys +import time +import argparse +import hashlib +import shlex +import subprocess +from dataclasses import dataclass +from typing import List, Optional, Tuple + +from sonic_py_common import logger as log +logger = log.Logger() + +# ───────────── Config ───────────── +SYNC_INTERVAL_S = int(os.environ.get("SYNC_INTERVAL_S", "900")) # seconds +NSENTER_BASE = ["nsenter", "--target", "1", "--pid", "--mount", "--uts", "--ipc", "--net"] + +@dataclass(frozen=True) +class SyncItem: + src_in_container: str + dst_on_host: str + mode: int = 0o755 + +SYNC_ITEMS: List[SyncItem] = [ + SyncItem("/usr/share/sonic/systemd_scripts/telemetry.sh", "/usr/local/bin/telemetry.sh"), + SyncItem("/usr/share/sonic/systemd_scripts/container_checker", "/bin/container_checker"), +] + +POST_COPY_ACTIONS = { + "/usr/local/bin/telemetry.sh": [ + ["sudo", "docker", "stop", "telemetry"], + ["sudo", "docker", "rm", "telemetry"], + ["sudo", "systemctl", "daemon-reload"], + ["sudo", "systemctl", "restart", "telemetry"], + ], + "/bin/container_checker": [ + ["sudo", "systemctl", "daemon-reload"], + ["sudo", "systemctl", "restart", "monit"], + ], +} + + +def run(args: List[str], *, text: bool = True, input_bytes: Optional[bytes] = None) -> Tuple[int, str | bytes, str | bytes]: + logger.log_debug("Running: " + " ".join(args)) + p = subprocess.Popen( + args, + text=text, + stdin=subprocess.PIPE if input_bytes is not None else None, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + out, err = p.communicate(input=input_bytes if input_bytes is not None else None) + return p.returncode, out, err + + +def run_nsenter(args: List[str], *, text: bool = True, input_bytes: Optional[bytes] = None) -> Tuple[int, str | bytes, str | bytes]: + return run(NSENTER_BASE + args, text=text, input_bytes=input_bytes) + + +def file_bytes_local(path: str) -> Optional[bytes]: + try: + with open(path, "rb") as f: + return f.read() + except Exception as e: + logger.log_error(f"read failed for {path}: {e}") + return None + + +# ───────────── Host file ops via nsenter ───────────── +def host_read_bytes(path_on_host: str) -> Optional[bytes]: + # Use /bin/cat in host namespace + rc, out, _ = run_nsenter(["/bin/cat", path_on_host], text=False) + if rc != 0: + return None + return out + +def host_write_atomic(dst_on_host: str, data: bytes, mode: int) -> bool: + tmp_path = f"/tmp/{os.path.basename(dst_on_host)}.tmp" + + # 1) write bytes to host tmp via stdin + rc, _, err = run_nsenter(["/bin/sh", "-lc", f"cat > {shlex.quote(tmp_path)}"], text=False, input_bytes=data) + if rc != 0: + emsg = err.decode(errors="ignore") if isinstance(err, (bytes, bytearray)) else str(err) + logger.log_error(f"host write tmp failed: {emsg.strip()}") + return False + + # 2) chmod tmp on host + rc, _, err = run_nsenter(["/bin/chmod", f"{mode:o}", tmp_path], text=True) + if rc != 0: + logger.log_error(f"host chmod failed: {str(err).strip()}") + run_nsenter(["/bin/rm", "-f", tmp_path], text=True) + return False + + # 3) ensure parent dir exists on host + parent = os.path.dirname(dst_on_host) or "/" + rc, _, err = run_nsenter(["/bin/mkdir", "-p", parent], text=True) + if rc != 0: + logger.log_error(f"host mkdir failed for {parent}: {str(err).strip()}") + run_nsenter(["/bin/rm", "-f", tmp_path], text=True) + return False + + # 4) atomic replace on host + rc, _, err = run_nsenter(["/bin/mv", "-f", tmp_path, dst_on_host], text=True) + if rc != 0: + logger.log_error(f"host mv failed to {dst_on_host}: {str(err).strip()}") + run_nsenter(["/bin/rm", "-f", tmp_path], text=True) + return False + + return True + +def run_host_actions_for(path_on_host: str) -> None: + actions = POST_COPY_ACTIONS.get(path_on_host, []) + for cmd in actions: + rc, _, err = run_nsenter(cmd, text=True) + if rc == 0: + logger.log_info(f"Post-copy action succeeded: {' '.join(cmd)}") + else: + logger.log_error(f"Post-copy action FAILED (rc={rc}): {' '.join(cmd)}; stderr={str(err).strip()}") + + +# ───────────── file Sync logic ───────────── +def sha256_bytes(b: Optional[bytes]) -> str: + if b is None: + return "" + h = hashlib.sha256() + h.update(b) + return h.hexdigest() + +def sync_items(items: List[SyncItem]) -> bool: + all_ok = True + for item in items: + src_bytes = file_bytes_local(item.src_in_container) + if src_bytes is None: + logger.log_error(f"Cannot read {item.src_in_container} in this container") + all_ok = False + continue + + cont_sha = sha256_bytes(src_bytes) + host_bytes = host_read_bytes(item.dst_on_host) + host_sha = sha256_bytes(host_bytes) + + if host_sha == cont_sha: + logger.log_info(f"{os.path.basename(item.dst_on_host)} up-to-date (sha256={host_sha})") + continue + + logger.log_info( + f"{os.path.basename(item.dst_on_host)} differs " + f"(container {cont_sha} vs host {host_sha or 'missing'}), updating…" + ) + if not host_write_atomic(item.dst_on_host, src_bytes, item.mode): + logger.log_error(f"Copy/update failed for {item.dst_on_host}") + all_ok = False + continue + + # verify + new_host_bytes = host_read_bytes(item.dst_on_host) + new_sha = sha256_bytes(new_host_bytes) + if new_sha != cont_sha: + logger.log_error( + f"Post-copy SHA mismatch for {item.dst_on_host}: host {new_sha or 'read-failed'} vs container {cont_sha}" + ) + all_ok = False + else: + logger.log_info(f"Sync complete for {item.dst_on_host} (sha256={new_sha})") + run_host_actions_for(item.dst_on_host) + + return all_ok + +def ensure_sync() -> bool: + return sync_items(SYNC_ITEMS) + + +def parse_args() -> argparse.Namespace: + p = argparse.ArgumentParser(description="Sync host scripts from this container to the host via nsenter (syslog logging).") + p.add_argument("--once", action="store_true", help="Run one sync pass and exit") + p.add_argument("--interval", type=int, default=SYNC_INTERVAL_S, help=f"Loop interval seconds (default: {SYNC_INTERVAL_S})") + p.add_argument("--no-post-actions", action="store_true", help="(Optional) Skip host systemctl actions (for debugging)") + return p.parse_args() + +def main() -> int: + args = parse_args() + + if args.no_post_actions: + POST_COPY_ACTIONS.clear() + logger.log_info("Post-copy host actions DISABLED for this run") + + ok = ensure_sync() + if args.once: + return 0 if ok else 1 + + while True: + time.sleep(args.interval) + ok = ensure_sync() and ok + +if __name__ == "__main__": + try: + sys.exit(main()) + except KeyboardInterrupt: + logger.log_info("Interrupted by user") + sys.exit(130) diff --git a/files/image_config/monit/container_checker b/files/image_config/monit/container_checker index 769755b7c66..bbbc5695d8a 100755 --- a/files/image_config/monit/container_checker +++ b/files/image_config/monit/container_checker @@ -158,7 +158,6 @@ def get_current_running_from_DB(always_running_containers): return running_containers - def get_current_running_from_dockers(): """ @summary: This function will get all running containers from @@ -172,12 +171,15 @@ def get_current_running_from_dockers(): try: lst = ctrs.list(filters={"status": "running"}) for ctr in lst: - running_containers.add(ctr.name) + # Prefer raw_container_name label over actual name + if ctr.labels and "raw_container_name" in ctr.labels: + running_containers.add(ctr.labels["raw_container_name"]) + else: + running_containers.add(ctr.name) except docker.errors.APIError as err: - print("Failed to retrieve the running container list. Error: '{}'".format(err)) - pass - return running_containers + print(f"Failed to retrieve the running container list. Error: '{err}'") + return running_containers def get_current_running_containers(always_running_containers): """ diff --git a/rules/docker-telemetry-sidecar.dep b/rules/docker-telemetry-sidecar.dep new file mode 100644 index 00000000000..90a59fd1fa7 --- /dev/null +++ b/rules/docker-telemetry-sidecar.dep @@ -0,0 +1,10 @@ +DPATH := $($(DOCKER_TELEMETRY_SIDECAR)_PATH) +DEP_FILES := $(SONIC_COMMON_FILES_LIST) rules/docker-telemetry-sidecar.mk rules/docker-telemetry-sidecar.dep +DEP_FILES += $(SONIC_COMMON_BASE_FILES_LIST) +DEP_FILES += $(shell git ls-files $(DPATH)) + +$(DOCKER_TELEMETRY_SIDECAR)_CACHE_MODE := GIT_CONTENT_SHA +$(DOCKER_TELEMETRY_SIDECAR)_DEP_FLAGS := $(SONIC_COMMON_FLAGS_LIST) +$(DOCKER_TELEMETRY_SIDECAR)_DEP_FILES := $(DEP_FILES) + +$(eval $(call add_dbg_docker,$(DOCKER_TELEMETRY_SIDECAR),$(DOCKER_TELEMETRY_SIDECAR_DBG))) diff --git a/rules/docker-telemetry-sidecar.mk b/rules/docker-telemetry-sidecar.mk new file mode 100644 index 00000000000..eb1cd9acc63 --- /dev/null +++ b/rules/docker-telemetry-sidecar.mk @@ -0,0 +1,29 @@ +# docker image for docker-telemetry-sidecar + +DOCKER_TELEMETRY_SIDECAR_STEM = docker-telemetry-sidecar +DOCKER_TELEMETRY_SIDECAR = $(DOCKER_TELEMETRY_SIDECAR_STEM).gz +DOCKER_TELEMETRY_SIDECAR_DBG = $(DOCKER_TELEMETRY_SIDECAR_STEM)-$(DBG_IMAGE_MARK).gz + +$(DOCKER_TELEMETRY_SIDECAR)_LOAD_DOCKERS = $(DOCKER_CONFIG_ENGINE_BOOKWORM) + +$(DOCKER_TELEMETRY_SIDECAR)_PATH = $(DOCKERS_PATH)/$(DOCKER_TELEMETRY_SIDECAR_STEM) + +$(DOCKER_TELEMETRY_SIDECAR)_VERSION = 1.0.0 +$(DOCKER_TELEMETRY_SIDECAR)_PACKAGE_NAME = telemetry-sidecar + +SONIC_DOCKER_IMAGES += $(DOCKER_TELEMETRY_SIDECAR) +SONIC_BOOKWORM_DOCKERS += $(DOCKER_TELEMETRY_SIDECAR) +SONIC_INSTALL_DOCKER_IMAGES += $(DOCKER_TELEMETRY_SIDECAR) + +SONIC_DOCKER_DBG_IMAGES += $(DOCKER_TELEMETRY_SIDECAR_DBG) +SONIC_BOOKWORM_DBG_DOCKERS += $(DOCKER_TELEMETRY_SIDECAR_DBG) +SONIC_INSTALL_DOCKER_DBG_IMAGES += $(DOCKER_TELEMETRY_SIDECAR_DBG) + +$(DOCKER_TELEMETRY_SIDECAR)_CONTAINER_NAME = telemetry-sidecar +$(DOCKER_TELEMETRY_SIDECAR)_RUN_OPT += -t --privileged --pid=host +$(DOCKER_TELEMETRY_SIDECAR)_RUN_OPT += -v /lib/systemd/system:/lib/systemd/system:rw +$(DOCKER_TELEMETRY_SIDECAR)_RUN_OPT += -v /etc/audit:/etc/audit:rw +$(DOCKER_TELEMETRY_SIDECAR)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro +$(DOCKER_TELEMETRY_SIDECAR)_RUN_OPT += -v /etc/localtime:/etc/localtime:ro + +$(DOCKER_TELEMETRY_SIDECAR)_FILES += files/image_config/monit/container_checker \ No newline at end of file From 3c1062627463e98a36157bb48e9a6d1f77caf09d Mon Sep 17 00:00:00 2001 From: Feng Pan Date: Tue, 9 Sep 2025 11:49:48 +0000 Subject: [PATCH 2/3] Add sonic-telemetry-sidecar container --- .../docker-telemetry-sidecar/Dockerfile.j2 | 33 +++ .../docker-telemetry-sidecar/supervisord.conf | 37 ++++ .../systemd_scripts/telemetry.sh | 100 +++++++++ .../docker-telemetry-sidecar/systemd_stub.py | 203 ++++++++++++++++++ files/image_config/monit/container_checker | 10 +- rules/docker-telemetry-sidecar.dep | 10 + rules/docker-telemetry-sidecar.mk | 27 +++ 7 files changed, 416 insertions(+), 4 deletions(-) create mode 100644 dockers/docker-telemetry-sidecar/Dockerfile.j2 create mode 100644 dockers/docker-telemetry-sidecar/supervisord.conf create mode 100644 dockers/docker-telemetry-sidecar/systemd_scripts/telemetry.sh create mode 100644 dockers/docker-telemetry-sidecar/systemd_stub.py create mode 100644 rules/docker-telemetry-sidecar.dep create mode 100644 rules/docker-telemetry-sidecar.mk diff --git a/dockers/docker-telemetry-sidecar/Dockerfile.j2 b/dockers/docker-telemetry-sidecar/Dockerfile.j2 new file mode 100644 index 00000000000..f2e6535186c --- /dev/null +++ b/dockers/docker-telemetry-sidecar/Dockerfile.j2 @@ -0,0 +1,33 @@ +{% from "dockers/dockerfile-macros.j2" import install_debian_packages, install_python_wheels, copy_files %} +ARG BASE=docker-config-engine-bookworm-{{DOCKER_USERNAME}}:{{DOCKER_USERTAG}} + +FROM $BASE AS base + +ARG docker_container_name +ARG image_version +RUN [ -f /etc/rsyslog.conf ] && sed -ri "s/%syslogtag%/$docker_container_name#%syslogtag%/;" /etc/rsyslog.conf + +# Make apt-get non-interactive +ENV DEBIAN_FRONTEND=noninteractive + +# Pass the image_version to container +ENV IMAGE_VERSION=$image_version + +COPY ["systemd_stub.py", "/usr/bin/"] +COPY ["systemd_scripts/", "/usr/share/sonic/systemd_scripts/"] +COPY ["files/image_config/monit/container_checker", "/usr/share/sonic/systemd_scripts/container_checker"] +COPY ["supervisord.conf", "/etc/supervisor/conf.d/"] + +RUN chmod +x /usr/bin/systemd_stub.py + +FROM $BASE + +RUN --mount=type=bind,from=base,target=/changes-to-image rsync -axAX --no-D --exclude=/sys --exclude=/proc --exclude=/dev --exclude=resolv.conf /changes-to-image/ / + +# Make apt-get non-interactive +ENV DEBIAN_FRONTEND=noninteractive + +# Pass the image_version to container +ENV IMAGE_VERSION=$image_version + +ENTRYPOINT ["/usr/local/bin/supervisord"] diff --git a/dockers/docker-telemetry-sidecar/supervisord.conf b/dockers/docker-telemetry-sidecar/supervisord.conf new file mode 100644 index 00000000000..fb1bd129c13 --- /dev/null +++ b/dockers/docker-telemetry-sidecar/supervisord.conf @@ -0,0 +1,37 @@ +[supervisord] +logfile_maxbytes=1MB +logfile_backups=2 +nodaemon=true + +[eventlistener:dependent-startup] +command=python3 -m supervisord_dependent_startup +autostart=true +autorestart=unexpected +startretries=0 +exitcodes=0,3 +events=PROCESS_STATE +buffer_size=1024 + +[program:rsyslogd] +command=/usr/sbin/rsyslogd -n -iNONE +priority=1 +autostart=false +autorestart=unexpected +stdout_logfile=NONE +stdout_syslog=true +stderr_logfile=NONE +stderr_syslog=true +dependent_startup=true + +[program:systemd_stub] +command=python3 /usr/bin/systemd_stub.py +priority=3 +autostart=true +autorestart=true +startsecs=0 +stdout_logfile=NONE +stdout_syslog=true +stderr_logfile=NONE +stderr_syslog=true +dependent_startup=true +dependent_startup_wait_for=rsyslogd:running diff --git a/dockers/docker-telemetry-sidecar/systemd_scripts/telemetry.sh b/dockers/docker-telemetry-sidecar/systemd_scripts/telemetry.sh new file mode 100644 index 00000000000..c196f8c4a1e --- /dev/null +++ b/dockers/docker-telemetry-sidecar/systemd_scripts/telemetry.sh @@ -0,0 +1,100 @@ +#!/bin/bash +set -euo pipefail + +SERVICE="telemetry" +NS="${NS:-sonic}" # k8s namespace +LABEL="raw_container_name=${SERVICE}" # selector used by DaemonSet +KUBECTL_BIN="${KUBECTL_BIN:-kubectl}" +NODE_NAME="${NODE_NAME:-$(hostname)}" +DEV="${2:-}" # accepted for compatibility; unused (single-ASIC) + +log() { /usr/bin/logger -t "${SERVICE}#system" "$*"; } + +require_kubectl() { + if ! command -v "${KUBECTL_BIN}" >/dev/null 2>&1; then + echo "ERROR: kubectl not found (KUBECTL_BIN=${KUBECTL_BIN})." >&2 + exit 127 + fi + # Try a sensible default if KUBECONFIG isn’t set + if [[ -z "${KUBECONFIG:-}" && -r /etc/kubernetes/kubelet.conf ]]; then + export KUBECONFIG=/etc/kubernetes/kubelet.conf + fi +} + +pods_on_node() { + # Prints: " " per line for this node + "${KUBECTL_BIN}" -n "${NS}" get pods \ + -l "${LABEL}" \ + --field-selector "spec.nodeName=${NODE_NAME}" \ + -o jsonpath='{range .items[*]}{.metadata.name}{" "}{.status.phase}{"\n"}{end}' 2>/dev/null || true +} + +kill_pods() { + require_kubectl + local found=0 + while read -r name phase; do + [[ -z "${name}" ]] && continue + found=1 + log "Deleting ${SERVICE} pod ${name} (phase=${phase}) on node ${NODE_NAME}" + # Force/instant delete to emulate “kill”; DaemonSet will recreate + "${KUBECTL_BIN}" -n "${NS}" delete pod "${name}" --grace-period=0 --force >/dev/null 2>&1 || true + done < <(pods_on_node) + if [[ "${found}" -eq 0 ]]; then + log "No ${SERVICE} pods found on node ${NODE_NAME} (namespace=${NS}, label=${LABEL})." + fi +} + +cmd_start() { kill_pods; } # start == kill (DS restarts) +cmd_stop() { kill_pods; } +cmd_restart() { kill_pods; } + +cmd_status() { + require_kubectl + local out; out="$(pods_on_node)" + if [[ -z "${out}" ]]; then + echo "${SERVICE}: NOT RUNNING (no pod on node ${NODE_NAME})" + exit 3 + fi + echo "${out}" | while read -r name phase; do + [[ -z "${name}" ]] && continue + echo "${SERVICE} pod ${name}: ${phase}" + done + # Exit 0 if at least one Running, 1 otherwise + if echo "${out}" | awk '$2=="Running"{found=1} END{exit found?0:1}'; then + exit 0 + else + exit 1 + fi +} + +cmd_wait() { + require_kubectl + log "Waiting on ${SERVICE} pods (ns=${NS}, label=${LABEL}) on node ${NODE_NAME}..." + # Keep the systemd service 'active' as long as at least one pod exists for this node. + while true; do + local out; out="$(pods_on_node)" + if [[ -z "${out}" ]]; then + # no pod presently; keep waiting (DaemonSet may bring it up) + sleep 5 + continue + fi + # If at least one is Running, sleep longer; otherwise poll faster + if echo "${out}" | awk '$2=="Running"{found=1} END{exit found?0:1}'; then + sleep 60 + else + sleep 5 + fi + done +} + +case "${1:-}" in + start) cmd_start ;; + stop) cmd_stop ;; + restart) cmd_restart ;; + wait) cmd_wait ;; + status) cmd_status ;; + *) + echo "Usage: $0 {start|stop|restart|wait|status} [asic-id(optional, ignored)]" >&2 + exit 2 + ;; +esac diff --git a/dockers/docker-telemetry-sidecar/systemd_stub.py b/dockers/docker-telemetry-sidecar/systemd_stub.py new file mode 100644 index 00000000000..b2195b0f851 --- /dev/null +++ b/dockers/docker-telemetry-sidecar/systemd_stub.py @@ -0,0 +1,203 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import os +import sys +import time +import argparse +import hashlib +import shlex +import subprocess +from dataclasses import dataclass +from typing import List, Optional, Tuple + +from sonic_py_common import logger as log +logger = log.Logger() + +# ───────────── Config ───────────── +SYNC_INTERVAL_S = int(os.environ.get("SYNC_INTERVAL_S", "900")) # seconds +NSENTER_BASE = ["nsenter", "--target", "1", "--pid", "--mount", "--uts", "--ipc", "--net"] + +@dataclass(frozen=True) +class SyncItem: + src_in_container: str + dst_on_host: str + mode: int = 0o755 + +SYNC_ITEMS: List[SyncItem] = [ + SyncItem("/usr/share/sonic/systemd_scripts/telemetry.sh", "/usr/local/bin/telemetry.sh"), + SyncItem("/usr/share/sonic/systemd_scripts/container_checker", "/bin/container_checker"), +] + +POST_COPY_ACTIONS = { + "/usr/local/bin/telemetry.sh": [ + ["sudo", "docker", "stop", "telemetry"], + ["sudo", "docker", "rm", "telemetry"], + ["sudo", "systemctl", "daemon-reload"], + ["sudo", "systemctl", "restart", "telemetry"], + ], + "/bin/container_checker": [ + ["sudo", "systemctl", "daemon-reload"], + ["sudo", "systemctl", "restart", "monit"], + ], +} + + +def run(args: List[str], *, text: bool = True, input_bytes: Optional[bytes] = None) -> Tuple[int, str | bytes, str | bytes]: + logger.log_debug("Running: " + " ".join(args)) + p = subprocess.Popen( + args, + text=text, + stdin=subprocess.PIPE if input_bytes is not None else None, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + out, err = p.communicate(input=input_bytes if input_bytes is not None else None) + return p.returncode, out, err + + +def run_nsenter(args: List[str], *, text: bool = True, input_bytes: Optional[bytes] = None) -> Tuple[int, str | bytes, str | bytes]: + return run(NSENTER_BASE + args, text=text, input_bytes=input_bytes) + + +def read_file_bytes_local(path: str) -> Optional[bytes]: + try: + with open(path, "rb") as f: + return f.read() + except Exception as e: + logger.log_error(f"read failed for {path}: {e}") + return None + + +# ───────────── Host file ops via nsenter ───────────── +def host_read_bytes(path_on_host: str) -> Optional[bytes]: + # Use /bin/cat in host namespace + rc, out, _ = run_nsenter(["/bin/cat", path_on_host], text=False) + if rc != 0: + return None + return out + +def host_write_atomic(dst_on_host: str, data: bytes, mode: int) -> bool: + tmp_path = f"/tmp/{os.path.basename(dst_on_host)}.tmp" + + # 1) write bytes to host tmp via stdin + rc, _, err = run_nsenter(["/bin/sh", "-lc", f"cat > {shlex.quote(tmp_path)}"], text=False, input_bytes=data) + if rc != 0: + emsg = err.decode(errors="ignore") if isinstance(err, (bytes, bytearray)) else str(err) + logger.log_error(f"host write tmp failed: {emsg.strip()}") + return False + + # 2) chmod tmp on host + rc, _, err = run_nsenter(["/bin/chmod", f"{mode:o}", tmp_path], text=True) + if rc != 0: + logger.log_error(f"host chmod failed: {str(err).strip()}") + run_nsenter(["/bin/rm", "-f", tmp_path], text=True) + return False + + # 3) ensure parent dir exists on host + parent = os.path.dirname(dst_on_host) or "/" + rc, _, err = run_nsenter(["/bin/mkdir", "-p", parent], text=True) + if rc != 0: + logger.log_error(f"host mkdir failed for {parent}: {str(err).strip()}") + run_nsenter(["/bin/rm", "-f", tmp_path], text=True) + return False + + # 4) atomic replace on host + rc, _, err = run_nsenter(["/bin/mv", "-f", tmp_path, dst_on_host], text=True) + if rc != 0: + logger.log_error(f"host mv failed to {dst_on_host}: {str(err).strip()}") + run_nsenter(["/bin/rm", "-f", tmp_path], text=True) + return False + + return True + +def run_host_actions_for(path_on_host: str) -> None: + actions = POST_COPY_ACTIONS.get(path_on_host, []) + for cmd in actions: + rc, _, err = run_nsenter(cmd, text=True) + if rc == 0: + logger.log_info(f"Post-copy action succeeded: {' '.join(cmd)}") + else: + logger.log_error(f"Post-copy action FAILED (rc={rc}): {' '.join(cmd)}; stderr={str(err).strip()}") + + +# ───────────── file Sync logic ───────────── +def sha256_bytes(b: Optional[bytes]) -> str: + if b is None: + return "" + h = hashlib.sha256() + h.update(b) + return h.hexdigest() + +def sync_items(items: List[SyncItem]) -> bool: + all_ok = True + for item in items: + src_bytes = read_file_bytes_local(item.src_in_container) + if src_bytes is None: + logger.log_error(f"Cannot read {item.src_in_container} in this container") + all_ok = False + continue + + container_file_sha = sha256_bytes(src_bytes) + host_bytes = host_read_bytes(item.dst_on_host) + host_sha = sha256_bytes(host_bytes) + + if host_sha == container_file_sha: + logger.log_info(f"{os.path.basename(item.dst_on_host)} up-to-date (sha256={host_sha})") + continue + + logger.log_info( + f"{os.path.basename(item.dst_on_host)} differs " + f"(container {container_file_sha} vs host {host_sha or 'missing'}), updating…" + ) + if not host_write_atomic(item.dst_on_host, src_bytes, item.mode): + logger.log_error(f"Copy/update failed for {item.dst_on_host}") + all_ok = False + continue + + # verify + new_host_bytes = host_read_bytes(item.dst_on_host) + new_sha = sha256_bytes(new_host_bytes) + if new_sha != container_file_sha: + logger.log_error( + f"Post-copy SHA mismatch for {item.dst_on_host}: host {new_sha or 'read-failed'} vs container {container_file_sha}" + ) + all_ok = False + else: + logger.log_info(f"Sync complete for {item.dst_on_host} (sha256={new_sha})") + run_host_actions_for(item.dst_on_host) + + return all_ok + +def ensure_sync() -> bool: + return sync_items(SYNC_ITEMS) + + +def parse_args() -> argparse.Namespace: + p = argparse.ArgumentParser(description="Sync host scripts from this container to the host via nsenter (syslog logging).") + p.add_argument("--once", action="store_true", help="Run one sync pass and exit") + p.add_argument("--interval", type=int, default=SYNC_INTERVAL_S, help=f"Loop interval seconds (default: {SYNC_INTERVAL_S})") + p.add_argument("--no-post-actions", action="store_true", help="(Optional) Skip host systemctl actions (for debugging)") + return p.parse_args() + +def main() -> int: + args = parse_args() + + if args.no_post_actions: + POST_COPY_ACTIONS.clear() + logger.log_info("Post-copy host actions DISABLED for this run") + + ok = ensure_sync() + if args.once: + return 0 if ok else 1 + + while True: + time.sleep(args.interval) + ok = ensure_sync() and ok + +if __name__ == "__main__": + try: + sys.exit(main()) + except KeyboardInterrupt: + logger.log_info("Interrupted by user") + sys.exit(130) diff --git a/files/image_config/monit/container_checker b/files/image_config/monit/container_checker index 769755b7c66..a1e24eca9cb 100755 --- a/files/image_config/monit/container_checker +++ b/files/image_config/monit/container_checker @@ -158,7 +158,6 @@ def get_current_running_from_DB(always_running_containers): return running_containers - def get_current_running_from_dockers(): """ @summary: This function will get all running containers from @@ -172,13 +171,16 @@ def get_current_running_from_dockers(): try: lst = ctrs.list(filters={"status": "running"}) for ctr in lst: - running_containers.add(ctr.name) + # Prefer raw_container_name label over actual name + if ctr.labels and "raw_container_name" in ctr.labels: + running_containers.add(ctr.labels["raw_container_name"]) + else: + running_containers.add(ctr.name) except docker.errors.APIError as err: - print("Failed to retrieve the running container list. Error: '{}'".format(err)) + print(f"Failed to retrieve the running container list. Error: '{err}'") pass return running_containers - def get_current_running_containers(always_running_containers): """ @summary: This function will get the list of currently running containers. diff --git a/rules/docker-telemetry-sidecar.dep b/rules/docker-telemetry-sidecar.dep new file mode 100644 index 00000000000..90a59fd1fa7 --- /dev/null +++ b/rules/docker-telemetry-sidecar.dep @@ -0,0 +1,10 @@ +DPATH := $($(DOCKER_TELEMETRY_SIDECAR)_PATH) +DEP_FILES := $(SONIC_COMMON_FILES_LIST) rules/docker-telemetry-sidecar.mk rules/docker-telemetry-sidecar.dep +DEP_FILES += $(SONIC_COMMON_BASE_FILES_LIST) +DEP_FILES += $(shell git ls-files $(DPATH)) + +$(DOCKER_TELEMETRY_SIDECAR)_CACHE_MODE := GIT_CONTENT_SHA +$(DOCKER_TELEMETRY_SIDECAR)_DEP_FLAGS := $(SONIC_COMMON_FLAGS_LIST) +$(DOCKER_TELEMETRY_SIDECAR)_DEP_FILES := $(DEP_FILES) + +$(eval $(call add_dbg_docker,$(DOCKER_TELEMETRY_SIDECAR),$(DOCKER_TELEMETRY_SIDECAR_DBG))) diff --git a/rules/docker-telemetry-sidecar.mk b/rules/docker-telemetry-sidecar.mk new file mode 100644 index 00000000000..0e419b15935 --- /dev/null +++ b/rules/docker-telemetry-sidecar.mk @@ -0,0 +1,27 @@ +# docker image for docker-telemetry-sidecar + +DOCKER_TELEMETRY_SIDECAR_STEM = docker-telemetry-sidecar +DOCKER_TELEMETRY_SIDECAR = $(DOCKER_TELEMETRY_SIDECAR_STEM).gz +DOCKER_TELEMETRY_SIDECAR_DBG = $(DOCKER_TELEMETRY_SIDECAR_STEM)-$(DBG_IMAGE_MARK).gz + +$(DOCKER_TELEMETRY_SIDECAR)_LOAD_DOCKERS = $(DOCKER_CONFIG_ENGINE_BOOKWORM) + +$(DOCKER_TELEMETRY_SIDECAR)_PATH = $(DOCKERS_PATH)/$(DOCKER_TELEMETRY_SIDECAR_STEM) + +$(DOCKER_TELEMETRY_SIDECAR)_VERSION = 1.0.0 +$(DOCKER_TELEMETRY_SIDECAR)_PACKAGE_NAME = telemetry-sidecar + +SONIC_DOCKER_IMAGES += $(DOCKER_TELEMETRY_SIDECAR) +SONIC_BOOKWORM_DOCKERS += $(DOCKER_TELEMETRY_SIDECAR) +SONIC_INSTALL_DOCKER_IMAGES += $(DOCKER_TELEMETRY_SIDECAR) + +SONIC_DOCKER_DBG_IMAGES += $(DOCKER_TELEMETRY_SIDECAR_DBG) +SONIC_BOOKWORM_DBG_DOCKERS += $(DOCKER_TELEMETRY_SIDECAR_DBG) +SONIC_INSTALL_DOCKER_DBG_IMAGES += $(DOCKER_TELEMETRY_SIDECAR_DBG) + +$(DOCKER_TELEMETRY_SIDECAR)_CONTAINER_NAME = telemetry-sidecar +$(DOCKER_TELEMETRY_SIDECAR)_RUN_OPT += -t --privileged --pid=host +$(DOCKER_TELEMETRY_SIDECAR)_RUN_OPT += -v /lib/systemd/system:/lib/systemd/system:rw +$(DOCKER_TELEMETRY_SIDECAR)_RUN_OPT += -v /etc/audit:/etc/audit:rw +$(DOCKER_TELEMETRY_SIDECAR)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro +$(DOCKER_TELEMETRY_SIDECAR)_RUN_OPT += -v /etc/localtime:/etc/localtime:ro From d7dede1efe350e5c1f6e04cc76762526f17fb967 Mon Sep 17 00:00:00 2001 From: Qi Luo Date: Sat, 20 Sep 2025 23:14:13 -0700 Subject: [PATCH 3/3] Update dockers/docker-telemetry-sidecar/systemd_stub.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- dockers/docker-telemetry-sidecar/systemd_stub.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dockers/docker-telemetry-sidecar/systemd_stub.py b/dockers/docker-telemetry-sidecar/systemd_stub.py index 2365ce45f9c..43a424abc58 100644 --- a/dockers/docker-telemetry-sidecar/systemd_stub.py +++ b/dockers/docker-telemetry-sidecar/systemd_stub.py @@ -64,7 +64,7 @@ def read_file_bytes_local(path: str) -> Optional[bytes]: try: with open(path, "rb") as f: return f.read() - except OSError as e: #covers file-related errors incl. ENOENT, EACCES, EISDIR, etc. + except OSError as e: # covers file-related errors incl. ENOENT, EACCES, EISDIR, etc. logger.log_error(f"read failed for {path}: {e}") return None