Skip to content

Commit 7e5197a

Browse files
FengPan-Frankqiluo-msft
authored and committed
Add sonic-telemetry-sidecar container (sonic-net#23936)
Why I did it: Create a sidecar container that stubs the systemd script, so that a telemetry container rolled out via k8s can still be managed by the systemd script despite its name change, and so that the container checker logic keeps working. Work item tracking Microsoft ADO (number only): How I did it: Added the systemd_stub service, which regularly checks the sha256 of each stub file and overwrites the target. (1) Stub into /usr/local/bin/telemetry.sh, so that the systemd commands stay compatible while the implementation changes to kubectl. (2) Stub into the monit container_checker: the container name changes when it is rolled out via k8s, but the container carries the label "raw_container_name=telemetry", so the update is in get_current_running_from_dockers. (3) Added the env IS_V1_ENABLED to control the telemetry.sh operation: IS_V1_ENABLED=true restores the system's original telemetry.sh, which is used for the rollback case; IS_V1_ENABLED=false stubs the k8s-supported telemetry.sh and moves on with the upgrade.
1 parent 1b28fb7 commit 7e5197a

9 files changed

Lines changed: 667 additions & 4 deletions

File tree

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
{% from "dockers/dockerfile-macros.j2" import install_debian_packages, install_python_wheels, copy_files %}
ARG BASE=docker-config-engine-bookworm-{{DOCKER_USERNAME}}:{{DOCKER_USERTAG}}

# ---------------------------------------------------------------------------
# Stage "base": apply every filesystem change on top of $BASE. The final stage
# below copies only the resulting filesystem delta out of this stage.
# ---------------------------------------------------------------------------
FROM $BASE AS base

ARG docker_container_name
ARG image_version
RUN [ -f /etc/rsyslog.conf ] && sed -ri "s/%syslogtag%/$docker_container_name#%syslogtag%/;" /etc/rsyslog.conf

# Make apt-get non-interactive
ENV DEBIAN_FRONTEND=noninteractive

# Pass the image_version to container
ENV IMAGE_VERSION=$image_version

# K8s will override this
ENV IS_V1_ENABLED=false

COPY ["systemd_stub.py", "/usr/bin/"]
COPY ["systemd_scripts/", "/usr/share/sonic/systemd_scripts/"]
COPY ["files/container_checker", "/usr/share/sonic/systemd_scripts/container_checker"]
COPY ["files/telemetry.sh", "/usr/share/sonic/systemd_scripts/telemetry_v1.sh"]
COPY ["supervisord.conf", "/etc/supervisor/conf.d/"]

RUN chmod +x /usr/bin/systemd_stub.py

# ---------------------------------------------------------------------------
# Final stage: start again from $BASE and rsync the files changed in "base".
# Only files are transferred; ARG/ENV metadata of the "base" stage is not, so
# everything the running container needs must be declared again here.
# ---------------------------------------------------------------------------
FROM $BASE

# An ARG goes out of scope at the end of the stage that declared it; without
# this re-declaration $image_version below would expand to an empty string.
ARG image_version

RUN --mount=type=bind,from=base,target=/changes-to-image rsync -axAX --no-D --exclude=/sys --exclude=/proc --exclude=/dev --exclude=resolv.conf /changes-to-image/ /

# Make apt-get non-interactive
ENV DEBIAN_FRONTEND=noninteractive

# Pass the image_version to container
ENV IMAGE_VERSION=$image_version

# Default for the rollback switch; K8s will override this. It has to exist in
# the final image because supervisord.conf expands %(ENV_IS_V1_ENABLED)s and
# that expansion fails when the variable is not set.
ENV IS_V1_ENABLED=false

ENTRYPOINT ["/usr/local/bin/supervisord"]
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
; Supervisor configuration for the telemetry sidecar container.
[supervisord]
logfile_maxbytes=1MB
logfile_backups=2
nodaemon=true

; Event listener that starts the programs flagged dependent_startup=true
; once the states named in their dependent_startup_wait_for are reached.
[eventlistener:dependent-startup]
command=python3 -m supervisord_dependent_startup
autostart=true
autorestart=unexpected
startretries=0
exitcodes=0,3
events=PROCESS_STATE
buffer_size=1024

[program:rsyslogd]
command=/usr/sbin/rsyslogd -n -iNONE
priority=1
autostart=false
autorestart=unexpected
stdout_logfile=NONE
stdout_syslog=true
stderr_logfile=NONE
stderr_syslog=true
dependent_startup=true

; Runs the stub synchronizer; it is started after rsyslogd is running.
[program:systemd_stub]
command=python3 /usr/bin/systemd_stub.py
priority=3
autostart=true
autorestart=true
startsecs=0
stdout_logfile=NONE
stdout_syslog=true
stderr_logfile=NONE
stderr_syslog=true
dependent_startup=true
dependent_startup_wait_for=rsyslogd:running
; The value is quoted so that an injected value containing non-alphanumeric
; characters cannot break parsing of the KEY="val" list.
environment=IS_V1_ENABLED="%(ENV_IS_V1_ENABLED)s"
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
#!/bin/bash
#
# systemd-facing control script for the telemetry container when it is rolled
# out through Kubernetes: the start/stop/restart/wait/status verbs are mapped
# onto kubectl operations against the pods selected by LABEL on this node.
#
# Environment (all optional):
#   NS           k8s namespace (default: sonic)
#   KUBECTL_BIN  kubectl executable (default: kubectl)
#   NODE_NAME    node whose pods are managed (default: this host's name)
set -euo pipefail

SERVICE="telemetry"
NS="${NS:-sonic}"                        # k8s namespace
LABEL="raw_container_name=${SERVICE}"    # selector used by DaemonSet
KUBECTL_BIN="${KUBECTL_BIN:-kubectl}"
# Prefer bash's built-in HOSTNAME so a missing `hostname` binary cannot abort
# the script under `set -e`; `hostname` is only the last resort.
NODE_NAME="${NODE_NAME:-${HOSTNAME:-$(hostname)}}"
# shellcheck disable=SC2034  # accepted for compatibility; unused (single-ASIC)
DEV="${2:-}"

#######################################
# Send a message to syslog, tagged "<SERVICE>#system".
# Logging is best-effort: if logger is missing or fails, the message goes to
# stderr instead, so that `set -e` does not abort the caller's real work.
# Globals:   SERVICE (read)
# Arguments: message words, joined with spaces
# Returns:   0 always
#######################################
log() {
  if ! /usr/bin/logger -t "${SERVICE}#system" "$*" 2>/dev/null; then
    printf '%s: %s\n' "${SERVICE}#system" "$*" >&2 || true
  fi
}
12+
13+
#######################################
# Make sure kubectl can be invoked and pick a default kubeconfig.
# Globals:   KUBECTL_BIN (read), KUBECONFIG (read, possibly exported)
# Outputs:   error message on stderr when kubectl is missing
# Returns:   exits the script with 127 when KUBECTL_BIN cannot be resolved
#######################################
require_kubectl() {
  command -v "${KUBECTL_BIN}" >/dev/null 2>&1 || {
    echo "ERROR: kubectl not found (KUBECTL_BIN=${KUBECTL_BIN})." >&2
    exit 127
  }

  # An explicit KUBECONFIG from the caller always wins.
  if [[ -n "${KUBECONFIG:-}" ]]; then
    return 0
  fi

  # Otherwise fall back to the kubelet's config when it is readable.
  if [[ -r /etc/kubernetes/kubelet.conf ]]; then
    export KUBECONFIG=/etc/kubernetes/kubelet.conf
  fi
}
23+
24+
#######################################
# List the service's pods scheduled on this node.
# Globals:   KUBECTL_BIN, NS, LABEL, NODE_NAME (read)
# Outputs:   one "<name> <phase>" line per pod on stdout
# Returns:   0 always; kubectl's stderr and failures are deliberately
#            discarded, so a failed query looks like "no pods"
#######################################
pods_on_node() {
  local -a query=(
    -n "${NS}" get pods
    -l "${LABEL}"
    --field-selector "spec.nodeName=${NODE_NAME}"
    -o 'jsonpath={range .items[*]}{.metadata.name}{" "}{.status.phase}{"\n"}{end}'
  )
  "${KUBECTL_BIN}" "${query[@]}" 2>/dev/null || true
}
31+
32+
#######################################
# Force-delete every pod of the service on this node.
# Deletion stays best-effort (a failed delete never aborts the script), but
# each failure is now logged instead of being silently discarded.
# Globals:   KUBECTL_BIN, NS, LABEL, NODE_NAME, SERVICE (read)
# Outputs:   progress and failures through log
# Returns:   0
#######################################
kill_pods() {
  require_kubectl
  local found=0
  local name phase
  while read -r name phase; do
    [[ -n "${name}" ]] || continue
    found=1
    log "Deleting ${SERVICE} pod ${name} (phase=${phase}) on node ${NODE_NAME}"
    # Force/instant delete to emulate "kill"; DaemonSet will recreate.
    # stdin is detached so kubectl can never consume the pod list that the
    # surrounding loop is still reading.
    if ! "${KUBECTL_BIN}" -n "${NS}" delete pod "${name}" \
      --grace-period=0 --force </dev/null >/dev/null 2>&1; then
      log "Failed to delete ${SERVICE} pod ${name} on node ${NODE_NAME}; continuing"
    fi
  done < <(pods_on_node)
  if [[ "${found}" -eq 0 ]]; then
    log "No ${SERVICE} pods found on node ${NODE_NAME} (namespace=${NS}, label=${LABEL})."
  fi
}
46+
47+
# All three lifecycle verbs perform the same action: delete the node's pods.
# "start" is a delete too, because the DaemonSet (re)creates the pod.
cmd_start() {
  kill_pods
}

cmd_stop() {
  kill_pods
}

cmd_restart() {
  kill_pods
}
50+
51+
#######################################
# Report the phase of each service pod on this node and exit accordingly.
# Globals:   SERVICE, NODE_NAME (read)
# Outputs:   one status line per pod, or a NOT RUNNING line, on stdout
# Returns:   exits 3 when no pod is listed, 0 when at least one pod is in
#            phase Running, 1 otherwise
#######################################
cmd_status() {
  require_kubectl

  local listing
  listing="$(pods_on_node)"
  if [[ -z "${listing}" ]]; then
    echo "${SERVICE}: NOT RUNNING (no pod on node ${NODE_NAME})"
    exit 3
  fi

  local pod_name pod_phase
  while read -r pod_name pod_phase; do
    if [[ -n "${pod_name}" ]]; then
      echo "${SERVICE} pod ${pod_name}: ${pod_phase}"
    fi
  done <<<"${listing}"

  # Any failure of the awk check maps to exit status 1.
  local verdict=0
  awk '$2=="Running"{found=1} END{exit found?0:1}' <<<"${listing}" || verdict=1
  exit "${verdict}"
}
69+
70+
#######################################
# Block forever while polling the service's pods on this node.
# The loop has no exit condition: it polls every 60 seconds while at least one
# pod is in phase Running, and every 5 seconds while no pod is listed or none
# is Running (the DaemonSet may still be bringing one up).
# Globals:   SERVICE, NS, LABEL, NODE_NAME (read)
# Returns:   never returns on its own
#######################################
cmd_wait() {
  require_kubectl
  log "Waiting on ${SERVICE} pods (ns=${NS}, label=${LABEL}) on node ${NODE_NAME}..."

  local snapshot delay
  while :; do
    snapshot="$(pods_on_node)"
    delay=5
    if [[ -n "${snapshot}" ]] \
      && awk '$2=="Running"{found=1} END{exit found?0:1}' <<<"${snapshot}"; then
      delay=60
    fi
    sleep "${delay}"
  done
}
89+
90+
# Dispatch on the first argument; each known verb maps to cmd_<verb>.
# Anything else (including no argument) prints usage and exits 2.
action="${1:-}"
case "${action}" in
  start | stop | restart | wait | status)
    "cmd_${action}"
    ;;
  *)
    echo "Usage: $0 {start|stop|restart|wait|status} [asic-id(optional, ignored)]" >&2
    exit 2
    ;;
esac
Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
# tests/test_systemd_stub.py
2+
import sys
3+
import types
4+
import importlib
5+
from pathlib import Path
6+
7+
import pytest
8+
9+
10+
@pytest.fixture(scope="session", autouse=True)
def fake_logger_module():
    """Install a fake ``sonic_py_common.logger`` module for the whole session.

    The fake ``Logger`` records every call as a ``(level, msg)`` tuple in
    ``self.messages``. Whatever was registered in ``sys.modules`` under the
    two names before the session is restored on teardown, so the fake does
    not leak into code that runs after these tests.
    """
    pkg = types.ModuleType("sonic_py_common")
    logger_mod = types.ModuleType("sonic_py_common.logger")

    class _Logger:
        def __init__(self, *args, **kwargs):
            # Constructor arguments are accepted and ignored so the fake can
            # be instantiated with or without them.
            self.messages = []

        def _log(self, level, msg):
            self.messages.append((level, msg))

        def log_debug(self, msg): self._log("DEBUG", msg)
        def log_info(self, msg): self._log("INFO", msg)
        def log_error(self, msg): self._log("ERROR", msg)
        def log_notice(self, msg): self._log("NOTICE", msg)
        def log_warning(self, msg): self._log("WARNING", msg)
        def log_critical(self, msg): self._log("CRITICAL", msg)

    logger_mod.Logger = _Logger
    pkg.logger = logger_mod

    replacements = {
        "sonic_py_common": pkg,
        "sonic_py_common.logger": logger_mod,
    }
    # Remember the previous entries (None means "was not imported").
    previous = {name: sys.modules.get(name) for name in replacements}
    sys.modules.update(replacements)
    try:
        yield
    finally:
        for name, module in previous.items():
            if module is None:
                sys.modules.pop(name, None)
            else:
                sys.modules[name] = module
34+
35+
36+
@pytest.fixture
def ss(tmp_path, monkeypatch):
    """
    Import systemd_stub fresh for every test, and provide fakes:
      - run_nsenter: simulates a host FS + systemctl/docker calls
      - container_fs: dict for "container" files
      - host_fs: dict for "host" files

    Returns the tuple ``(module, container_fs, host_fs, commands)``, where
    ``commands`` collects ``("nsenter", args_tuple)`` for every faked call.
    ``tmp_path`` is requested but not used by the fixture body.
    """
    if "systemd_stub" in sys.modules:
        del sys.modules["systemd_stub"]
    ss = importlib.import_module("systemd_stub")

    # Fake host filesystem and command recorder
    host_fs = {}
    commands = []

    # Fake run_nsenter; returns (returncode, stdout, stderr)
    def fake_run_nsenter(args, *, text=True, input_bytes=None):
        commands.append(("nsenter", tuple(args)))

        def render(data):
            # stdout and stderr are str in text mode and bytes otherwise,
            # consistently for every branch below.
            return data.decode("utf-8", "ignore") if text else data

        empty = render(b"")
        success = (0, empty, empty)

        # /bin/cat <path>
        if args[:1] == ["/bin/cat"] and len(args) == 2:
            path = args[1]
            if path in host_fs:
                return 0, render(host_fs[path]), empty
            return 1, empty, render(b"No such file")
        # /bin/sh -lc "cat > /tmp/xxx"
        if args[:2] == ["/bin/sh", "-lc"] and len(args) == 3 and args[2].startswith("cat > "):
            staged_path = args[2].split("cat > ", 1)[1].strip()
            host_fs[staged_path] = input_bytes or b""
            return success
        # chmod / mkdir / mv / rm
        if args[:1] == ["/bin/chmod"]:
            return success
        if args[:1] == ["/bin/mkdir"]:
            return success
        if args[:1] == ["/bin/mv"] and len(args) == 4:
            src, dst = args[2], args[3]
            host_fs[dst] = host_fs.get(src, b"")
            host_fs.pop(src, None)
            return success
        if args[:1] == ["/bin/rm"]:
            host_fs.pop(args[-1], None)
            return success
        # sudo ...
        if args[:1] == ["sudo"]:
            return success
        return 1, empty, render(b"unsupported")

    monkeypatch.setattr(ss, "run_nsenter", fake_run_nsenter, raising=True)

    # Fake container FS
    container_fs = {}

    def fake_read_file_bytes_local(path: str):
        return container_fs.get(path, None)

    monkeypatch.setattr(ss, "read_file_bytes_local", fake_read_file_bytes_local, raising=True)

    # Isolate POST_COPY_ACTIONS
    monkeypatch.setattr(ss, "POST_COPY_ACTIONS", {}, raising=True)

    return ss, container_fs, host_fs, commands
99+
100+
101+
def test_sha256_bytes_basic():
    """sha256_bytes returns the hex digest for bytes and "" for None."""
    sys.modules.pop("systemd_stub", None)
    stub = importlib.import_module("systemd_stub")

    cases = [
        (b"", "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"),
        (None, ""),
        (b"abc", "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad"),
    ]
    for payload, digest in cases:
        assert stub.sha256_bytes(payload) == digest
108+
109+
110+
def test_host_write_atomic_and_read(ss):
    """A host write can be read back and issues sh, chmod, mkdir and mv."""
    stub, _container_fs, _host_fs, recorded = ss

    assert stub.host_write_atomic("/etc/testfile", b"hello", 0o755)
    assert stub.host_read_bytes("/etc/testfile") == b"hello"

    invoked = [entry[1][0] for entry in recorded]
    for binary in ("/bin/sh", "/bin/chmod", "/bin/mkdir", "/bin/mv"):
        assert binary in invoked
121+
122+
123+
def test_sync_no_change_fast_path(ss):
    """Identical source and destination content triggers no `sh -lc` write."""
    stub, container_fs, host_fs, recorded = ss
    entry = stub.SyncItem("/container/telemetry.sh", "/host/telemetry.sh", 0o755)
    container_fs[entry.src_in_container] = b"same"
    host_fs[entry.dst_on_host] = b"same"
    stub.SYNC_ITEMS[:] = [entry]

    assert stub.ensure_sync() is True
    assert all(
        args[0] != "/bin/sh" or "-lc" not in args
        for _, args in recorded
    )
133+
134+
135+
def test_sync_updates_and_post_actions(ss):
    """Differing content is copied to the host and post-copy actions run."""
    stub, container_fs, host_fs, recorded = ss
    entry = stub.SyncItem("/container/container_checker", "/bin/container_checker", 0o755)
    container_fs[entry.src_in_container] = b"NEW"
    host_fs[entry.dst_on_host] = b"OLD"
    stub.SYNC_ITEMS[:] = [entry]

    reload_cmd = ["sudo", "systemctl", "daemon-reload"]
    restart_cmd = ["sudo", "systemctl", "restart", "monit"]
    stub.POST_COPY_ACTIONS[entry.dst_on_host] = [reload_cmd, restart_cmd]

    assert stub.ensure_sync() is True
    assert host_fs[entry.dst_on_host] == b"NEW"

    sudo_calls = [args for _, args in recorded if args and args[0] == "sudo"]
    assert tuple(reload_cmd) in sudo_calls
    assert tuple(restart_cmd) in sudo_calls
154+
155+
156+
def test_sync_missing_src_returns_false(ss):
    """ensure_sync reports False when the source file is not in the container."""
    stub, _container_fs, _host_fs, _recorded = ss
    missing = stub.SyncItem("/container/missing.sh", "/usr/local/bin/telemetry.sh", 0o755)
    stub.SYNC_ITEMS[:] = [missing]

    assert stub.ensure_sync() is False
162+
163+
164+
def test_main_once_exits_zero_and_disables_post_actions(monkeypatch):
    """`--once --no-post-actions` returns 0 and empties POST_COPY_ACTIONS."""
    sys.modules.pop("systemd_stub", None)
    stub = importlib.import_module("systemd_stub")

    stub.POST_COPY_ACTIONS["/bin/container_checker"] = [["sudo", "echo", "hi"]]
    monkeypatch.setattr(stub, "ensure_sync", lambda: True, raising=True)
    cli = ["systemd_stub.py", "--once", "--no-post-actions"]
    monkeypatch.setattr(sys, "argv", cli)

    exit_code = stub.main()

    assert exit_code == 0
    assert stub.POST_COPY_ACTIONS == {}
176+
177+
178+
def test_main_once_exits_nonzero_when_sync_fails(monkeypatch):
    """`--once` returns 1 when ensure_sync reports failure."""
    sys.modules.pop("systemd_stub", None)
    stub = importlib.import_module("systemd_stub")

    monkeypatch.setattr(stub, "ensure_sync", lambda: False, raising=True)
    cli = ["systemd_stub.py", "--once"]
    monkeypatch.setattr(sys, "argv", cli)

    exit_code = stub.main()

    assert exit_code == 1
186+
187+
188+
def test_env_controls_telemetry_src_true(monkeypatch):
    """IS_V1_ENABLED=true at import time selects the telemetry_v1.sh source."""
    sys.modules.pop("systemd_stub", None)
    # The variable must be set before the import below.
    monkeypatch.setenv("IS_V1_ENABLED", "true")

    stub = importlib.import_module("systemd_stub")

    assert stub.IS_V1_ENABLED is True
    assert stub._TELEMETRY_SRC.endswith("telemetry_v1.sh")
196+
197+
198+
def test_env_controls_telemetry_src_false(monkeypatch):
    """IS_V1_ENABLED=false at import time selects the telemetry.sh source."""
    sys.modules.pop("systemd_stub", None)
    # The variable must be set before the import below.
    monkeypatch.setenv("IS_V1_ENABLED", "false")

    stub = importlib.import_module("systemd_stub")

    assert stub.IS_V1_ENABLED is False
    assert stub._TELEMETRY_SRC.endswith("telemetry.sh")
206+
207+
208+
def test_env_controls_telemetry_src_default(monkeypatch):
    """With IS_V1_ENABLED unset, the module behaves as if it were false."""
    sys.modules.pop("systemd_stub", None)
    # Make sure the variable is absent before the import below.
    monkeypatch.delenv("IS_V1_ENABLED", raising=False)

    stub = importlib.import_module("systemd_stub")

    assert stub.IS_V1_ENABLED is False
    assert stub._TELEMETRY_SRC.endswith("telemetry.sh")

0 commit comments

Comments
 (0)