diff --git a/dockers/docker-telemetry-sidecar/cli-plugin-tests/test_systemd_stub.py b/dockers/docker-telemetry-sidecar/cli-plugin-tests/test_systemd_stub.py index 237a6df6496..ac5662926b0 100644 --- a/dockers/docker-telemetry-sidecar/cli-plugin-tests/test_systemd_stub.py +++ b/dockers/docker-telemetry-sidecar/cli-plugin-tests/test_systemd_stub.py @@ -336,31 +336,143 @@ def test_reconcile_enables_user_auth_and_cname(ss): ss, container_fs, host_fs, commands, config_db = ss # Set module-level flags directly (they're read inside reconcile) ss.GNMI_VERIFY_ENABLED = True - ss.GNMI_CLIENT_CNAME = "AME Infra CA o6" + ss.GNMI_CLIENT_CERTS = [{"cname": "fake-infra-ca.test.example.com", "role": "gnmi_show_readonly"}] # Precondition: empty DB assert config_db == {} ss.reconcile_config_db_once() - # user_auth must be set to 'cert' - assert config_db.get("GNMI|gnmi", {}).get("user_auth") == "cert" - # CNAME hash must exist with role=gnmi_show_readonly (default GNMI_CLIENT_ROLE) - cname_key = f"GNMI_CLIENT_CERT|{ss.GNMI_CLIENT_CNAME}" - assert config_db.get(cname_key, {}).get("role") == "gnmi_show_readonly" + assert config_db.get("TELEMETRY|gnmi", {}).get("user_auth") == "cert" + # CNAME hash must exist with role=gnmi_show_readonly + assert config_db.get("GNMI_CLIENT_CERT|fake-infra-ca.test.example.com", {}).get("role") == "gnmi_show_readonly" def test_reconcile_disabled_removes_cname(ss): ss, container_fs, host_fs, commands, config_db = ss ss.GNMI_VERIFY_ENABLED = False - ss.GNMI_CLIENT_CNAME = "AME Infra CA o6" + ss.GNMI_CLIENT_CERTS = [{"cname": "fake-infra-ca.test.example.com", "role": "gnmi_show_readonly"}] # Seed an existing entry to be removed - config_db[f"GNMI_CLIENT_CERT|{ss.GNMI_CLIENT_CNAME}"] = {"role": "gnmi_show_readonly"} + config_db["GNMI_CLIENT_CERT|fake-infra-ca.test.example.com"] = {"role": "gnmi_show_readonly"} ss.reconcile_config_db_once() - assert f"GNMI_CLIENT_CERT|{ss.GNMI_CLIENT_CNAME}" not in config_db + assert "GNMI_CLIENT_CERT|fake-infra-ca.test.example.com" not in config_db + +def test_reconcile_multiple_cnames(ss): + ss, container_fs, host_fs, commands, config_db = ss + ss.GNMI_VERIFY_ENABLED = True + ss.GNMI_CLIENT_CERTS = [ + {"cname": "fake-client.test.example.com", "role": "admin"}, + {"cname": "fake-server.test.example.com", "role": '["gnmi_show_readonly","admin"]'}, + ] + assert config_db == {} + ss.reconcile_config_db_once() + + assert config_db.get("TELEMETRY|gnmi", {}).get("user_auth") == "cert" + assert config_db.get("GNMI_CLIENT_CERT|fake-client.test.example.com", {}).get("role") == "admin" + assert config_db.get("GNMI_CLIENT_CERT|fake-server.test.example.com", {}).get("role") == '["gnmi_show_readonly","admin"]' + +# ─────────────────────────── Tests for _parse_client_certs ─────────────────────────── + +class TestParseClientCerts: + """Tests for _parse_client_certs() env-var parsing.""" + + @pytest.fixture(autouse=True) + def _fresh_module(self, monkeypatch): + if "systemd_stub" in sys.modules: + del sys.modules["systemd_stub"] + self.monkeypatch = monkeypatch + + def _import_with_env(self, env_vars): + """Set env vars, re-import systemd_stub, and return the parsed GNMI_CLIENT_CERTS.""" + for k, v in env_vars.items(): + if v is None: + self.monkeypatch.delenv(k, raising=False) + else: + self.monkeypatch.setenv(k, v) + # Clear stale env vars not in the dict + for k in ("GNMI_CLIENT_CERTS", "TELEMETRY_CLIENT_CNAME", "GNMI_CLIENT_ROLE"): + if k not in env_vars: + self.monkeypatch.delenv(k, raising=False) + if "systemd_stub" in sys.modules: + del sys.modules["systemd_stub"] + ss = importlib.import_module("systemd_stub") + return ss.GNMI_CLIENT_CERTS + + def test_valid_json_array(self): + certs = self._import_with_env({ + "GNMI_CLIENT_CERTS": '[{"cname": "client.gbl", "role": "admin"}]' + }) + assert certs == [{"cname": "client.gbl", "role": "admin"}] + + def test_valid_json_multiple_entries(self): + certs = self._import_with_env({ + "GNMI_CLIENT_CERTS": '[{"cname": "a.gbl", "role": "admin"}, {"cname": "b.gbl", "role": "readonly"}]' + }) + assert len(certs) == 2 + assert certs[0] == {"cname": "a.gbl", "role": "admin"} + assert certs[1] == {"cname": "b.gbl", "role": "readonly"} + + def test_non_array_json_falls_back_to_legacy(self): + certs = self._import_with_env({ + "GNMI_CLIENT_CERTS": '{"cname": "c.gbl", "role": "admin"}', + "TELEMETRY_CLIENT_CNAME": "legacy.gbl", + "GNMI_CLIENT_ROLE": "readonly", + }) + assert certs == [{"cname": "legacy.gbl", "role": "readonly"}] + + def test_invalid_json_falls_back_to_legacy(self): + certs = self._import_with_env({ + "GNMI_CLIENT_CERTS": "not-json!", + "TELEMETRY_CLIENT_CNAME": "fallback.gbl", + }) + assert certs == [{"cname": "fallback.gbl", "role": "gnmi_show_readonly"}] + + def test_entry_not_dict_falls_back(self): + certs = self._import_with_env({ + "GNMI_CLIENT_CERTS": '["not-a-dict"]', + "TELEMETRY_CLIENT_CNAME": "fb.gbl", + }) + assert certs == [{"cname": "fb.gbl", "role": "gnmi_show_readonly"}] + + def test_entry_missing_role_falls_back(self): + certs = self._import_with_env({ + "GNMI_CLIENT_CERTS": '[{"cname": "x.gbl"}]', + "TELEMETRY_CLIENT_CNAME": "fb.gbl", + }) + assert certs == [{"cname": "fb.gbl", "role": "gnmi_show_readonly"}] + + def test_entry_empty_cname_falls_back(self): + certs = self._import_with_env({ + "GNMI_CLIENT_CERTS": '[{"cname": " ", "role": "admin"}]', + "TELEMETRY_CLIENT_CNAME": "fb.gbl", + }) + assert certs == [{"cname": "fb.gbl", "role": "gnmi_show_readonly"}] + + def test_legacy_single_entry(self): + certs = self._import_with_env({ + "TELEMETRY_CLIENT_CNAME": "legacy.gbl", + "GNMI_CLIENT_ROLE": "admin", + }) + assert certs == [{"cname": "legacy.gbl", "role": "admin"}] + + def test_legacy_default_role(self): + certs = self._import_with_env({ + "TELEMETRY_CLIENT_CNAME": "legacy.gbl", + }) + assert certs == [{"cname": "legacy.gbl", "role": "gnmi_show_readonly"}] + + def test_no_env_returns_empty(self): + certs = self._import_with_env({}) + assert certs == [] + + def test_whitespace_stripped(self): + certs = self._import_with_env({ + "GNMI_CLIENT_CERTS": '[{"cname": " client.gbl ", "role": " admin "}]' + }) + assert certs == [{"cname": "client.gbl", "role": "admin"}] # ─────────────────────────── Tests for _get_branch_name ─────────────────────────── diff --git a/dockers/docker-telemetry-sidecar/systemd_stub.py b/dockers/docker-telemetry-sidecar/systemd_stub.py index 3ce0247e559..904d4a719b2 100644 --- a/dockers/docker-telemetry-sidecar/systemd_stub.py +++ b/dockers/docker-telemetry-sidecar/systemd_stub.py @@ -1,12 +1,13 @@ #!/usr/bin/env python3 from __future__ import annotations +import json import os import re import subprocess import time import argparse -from typing import List +from typing import Dict, List from sonic_py_common.sidecar_common import ( get_bool_env_var, logger, SyncItem, @@ -21,11 +22,46 @@ # CONFIG_DB reconcile env GNMI_VERIFY_ENABLED = get_bool_env_var("TELEMETRY_CLIENT_CERT_VERIFY_ENABLED", default=False) -GNMI_CLIENT_CNAME = os.getenv("TELEMETRY_CLIENT_CNAME", "") -GNMI_CLIENT_ROLE = os.getenv("GNMI_CLIENT_ROLE", "gnmi_show_readonly") +def _parse_client_certs() -> List[Dict[str, str]]: + """ + Build the list of GNMI client cert entries from env vars. + + Preferred: GNMI_CLIENT_CERTS (JSON array of {"cname": ..., "role": ...}) + Fallback: TELEMETRY_CLIENT_CNAME / GNMI_CLIENT_ROLE (single entry, backward-compat) + """ + raw = os.getenv("GNMI_CLIENT_CERTS", "").strip() + if raw: + try: + entries = json.loads(raw) + if not isinstance(entries, list): + raise ValueError("GNMI_CLIENT_CERTS must be a JSON array") + normalized: List[Dict[str, str]] = [] + for e in entries: + if not isinstance(e, dict): + raise ValueError(f"Each entry must be an object: {e!r}") + if "cname" not in e or "role" not in e: + raise ValueError(f"Each entry needs 'cname' and 'role': {e}") + cname = str(e.get("cname", "")).strip() + role = str(e.get("role", "")).strip() + if not cname or not role: + raise ValueError(f"'cname' and 'role' must be non-empty strings: {e}") + normalized.append({"cname": cname, "role": role}) + return normalized + except (json.JSONDecodeError, ValueError) as exc: + logger.log_error(f"Bad GNMI_CLIENT_CERTS env var: {exc}; falling back to legacy") + + # Legacy single-entry env vars + cname = os.getenv("TELEMETRY_CLIENT_CNAME", "").strip() + role = os.getenv("GNMI_CLIENT_ROLE", "gnmi_show_readonly").strip() + if cname: + return [{"cname": cname, "role": role}] + return [] + + +GNMI_CLIENT_CERTS: List[Dict[str, str]] = _parse_client_certs() logger.log_notice(f"IS_V1_ENABLED={IS_V1_ENABLED}") -logger.log_notice(f"GNMI_CLIENT_ROLE={GNMI_CLIENT_ROLE}") +logger.log_notice(f"GNMI_CLIENT_CERTS={GNMI_CLIENT_CERTS}") _TELEMETRY_SRC = ( "/usr/share/sonic/systemd_scripts/telemetry_v1.sh" @@ -131,31 +167,25 @@ def _get_branch_name() -> str: def _ensure_user_auth_cert() -> None: - cur = db_hget("GNMI|gnmi", "user_auth") + cur = db_hget("TELEMETRY|gnmi", "user_auth") if cur != "cert": - if db_hset("GNMI|gnmi", "user_auth", "cert"): - logger.log_notice(f"Set GNMI|gnmi.user_auth=cert (was: {cur or ''})") + if db_hset("TELEMETRY|gnmi", "user_auth", "cert"): + logger.log_notice(f"Set TELEMETRY|gnmi.user_auth=cert (was: {cur or ''})") else: - logger.log_error("Failed to set GNMI|gnmi.user_auth=cert") + logger.log_error("Failed to set TELEMETRY|gnmi.user_auth=cert") -def _ensure_cname_present(cname: str) -> None: - if not cname: - logger.log_warning("TELEMETRY_CLIENT_CNAME not set; skip CNAME creation") - return - +def _ensure_cname_present(cname: str, role: str) -> None: key = f"GNMI_CLIENT_CERT|{cname}" entry = db_hgetall(key) if not entry: - if db_hset(key, "role", GNMI_CLIENT_ROLE): - logger.log_notice(f"Created {key} with role={GNMI_CLIENT_ROLE}") + if db_hset(key, "role", role): + logger.log_notice(f"Created {key} with role={role}") else: logger.log_error(f"Failed to create {key}") def _ensure_cname_absent(cname: str) -> None: - if not cname: - return key = f"GNMI_CLIENT_CERT|{cname}" if db_hgetall(key): if db_del(key): @@ -163,20 +193,21 @@ def _ensure_cname_absent(cname: str) -> None: else: logger.log_error(f"Failed to remove {key}") - def reconcile_config_db_once() -> None: """ Idempotent drift-correction for CONFIG_DB: - When TELEMETRY_CLIENT_CERT_VERIFY_ENABLED=true: - * Ensure GNMI|gnmi.user_auth=cert - * Ensure GNMI_CLIENT_CERT| exists with role= - - When false: ensure the CNAME row is absent + * Ensure TELEMETRY|gnmi.user_auth=cert + * Ensure every GNMI_CLIENT_CERT| entry exists with its role + - When false: ensure all CNAME rows are absent """ if GNMI_VERIFY_ENABLED: _ensure_user_auth_cert() - _ensure_cname_present(GNMI_CLIENT_CNAME) + for entry in GNMI_CLIENT_CERTS: + _ensure_cname_present(entry["cname"], entry["role"]) else: - _ensure_cname_absent(GNMI_CLIENT_CNAME) + for entry in GNMI_CLIENT_CERTS: + _ensure_cname_absent(entry["cname"]) def ensure_sync() -> bool: branch_name = _get_branch_name() diff --git a/dockers/docker-telemetry-watchdog/watchdog/src/main.rs b/dockers/docker-telemetry-watchdog/watchdog/src/main.rs index bb2f11f622b..c736e17d279 100644 --- a/dockers/docker-telemetry-watchdog/watchdog/src/main.rs +++ b/dockers/docker-telemetry-watchdog/watchdog/src/main.rs @@ -36,14 +36,40 @@ const GNMI_BASE_CMD: &str = "gnmi_get"; // assumed in PATH const SHOW_API_PROBE_ENV_VAR: &str = "TELEMETRY_WATCHDOG_SHOW_API_PROBE_ENABLED"; // optional: set to "true" to enable serial number probing const SERIALNUMBER_PROBE_ENV_VAR: &str = "TELEMETRY_WATCHDOG_SERIALNUMBER_PROBE_ENABLED"; -const TARGET_NAME_ENV_VAR: &str = "TELEMETRY_WATCHDOG_TARGET_NAME"; // optional override for target_name -const DEFAULT_TARGET_NAME: &str = "server.ndastreaming.ap.gbl"; -const DEFAULT_CA_CRT: &str = "/etc/sonic/telemetry/dsmsroot.cer"; -const DEFAULT_SERVER_CRT: &str = "/etc/sonic/telemetry/streamingtelemetryserver.cer"; -const DEFAULT_SERVER_KEY: &str = "/etc/sonic/telemetry/streamingtelemetryserver.key"; +const TARGET_NAME_ENV_VAR: &str = "TELEMETRY_WATCHDOG_TARGET_NAME"; +const CA_CRT_ENV_VAR: &str = "TELEMETRY_WATCHDOG_CA_CRT"; +const SERVER_CRT_ENV_VAR: &str = "TELEMETRY_WATCHDOG_SERVER_CRT"; +const SERVER_KEY_ENV_VAR: &str = "TELEMETRY_WATCHDOG_SERVER_KEY"; +const BAD_CA_ENV_VAR: &str = "TELEMETRY_WATCHDOG_BAD_CA"; +const BAD_CERT_ENV_VAR: &str = "TELEMETRY_WATCHDOG_BAD_CERT"; +const BAD_KEY_ENV_VAR: &str = "TELEMETRY_WATCHDOG_BAD_KEY"; +const BAD_CNAME_ENV_VAR: &str = "TELEMETRY_WATCHDOG_BAD_CNAME"; +const GOOD_CA_ENV_VAR: &str = "TELEMETRY_WATCHDOG_GOOD_CA"; +const GOOD_CERT_ENV_VAR: &str = "TELEMETRY_WATCHDOG_GOOD_CERT"; +const GOOD_KEY_ENV_VAR: &str = "TELEMETRY_WATCHDOG_GOOD_KEY"; +const GOOD_CNAME_ENV_VAR: &str = "TELEMETRY_WATCHDOG_GOOD_CNAME"; + +const DEFAULT_TARGET_NAME: &str = "default-target-name"; +const DEFAULT_CA_CRT: &str = "/path/to/ca.crt"; +const DEFAULT_SERVER_CRT: &str = "/path/to/server.crt"; +const DEFAULT_SERVER_KEY: &str = "/path/to/server.key"; // Max stderr we keep per gnmi_get (bytes) before truncation. const STDERR_TRUNCATE_LIMIT: usize = 16 * 1024; // 16KB +const CERT_PROBE_ENV_VAR: &str = "TELEMETRY_WATCHDOG_CERT_PROBE_ENABLED"; + +// BAD (expected fail) probe +const DEFAULT_BAD_CA: &str = "/path/to/bad-ca.crt"; +const DEFAULT_BAD_CERT: &str = "/path/to/bad-cert.crt"; +const DEFAULT_BAD_KEY: &str = "/path/to/bad-cert.key"; +const DEFAULT_BAD_CNAME: &str = "bad-cname.example.com"; + +// GOOD (expected success) probe +const DEFAULT_GOOD_CA: &str = "/path/to/good-ca.crt"; +const DEFAULT_GOOD_CERT: &str = "/path/to/good-cert.crt"; +const DEFAULT_GOOD_KEY: &str = "/path/to/good-cert.key"; +const DEFAULT_GOOD_CNAME: &str = "good-cname.example.com"; + // Configuration: // 1. JSON file (/cmd_list.json) optional. Format: // { @@ -61,6 +87,13 @@ const STDERR_TRUNCATE_LIMIT: usize = 16 * 1024; // 16KB // Any failure (spawn error / non-zero exit) sets HTTP 500; body lists per-xpath results. // SHOW probe control: env TELEMETRY_WATCHDOG_SHOW_API_PROBE="disable" skips gnmi_get xpaths (default enabled). +fn env_or_default(env_var: &str, default: &str) -> String { + match env::var(env_var) { + Ok(v) if !v.trim().is_empty() => v.trim().to_string(), + _ => default.to_string(), + } +} + fn load_xpath_list() -> (Vec, Vec) { let mut set: HashSet = HashSet::new(); let mut errors: Vec = Vec::new(); @@ -259,13 +292,13 @@ fn get_security_config() -> TelemetrySecurityConfig { // TELEMETRY|gnmi: client_auth let ca_crt = redis_hget("TELEMETRY|certs", "ca_crt") .filter(|v| !v.trim().is_empty()) - .unwrap_or_else(|| DEFAULT_CA_CRT.to_string()); + .unwrap_or_else(|| env_or_default(CA_CRT_ENV_VAR, DEFAULT_CA_CRT)); let server_crt = redis_hget("TELEMETRY|certs", "server_crt") .filter(|v| !v.trim().is_empty()) - .unwrap_or_else(|| DEFAULT_SERVER_CRT.to_string()); + .unwrap_or_else(|| env_or_default(SERVER_CRT_ENV_VAR, DEFAULT_SERVER_CRT)); let server_key = redis_hget("TELEMETRY|certs", "server_key") .filter(|v| !v.trim().is_empty()) - .unwrap_or_else(|| DEFAULT_SERVER_KEY.to_string()); + .unwrap_or_else(|| env_or_default(SERVER_KEY_ENV_VAR, DEFAULT_SERVER_KEY)); let client_auth_opt = redis_hget("TELEMETRY|gnmi", "client_auth"); let use_client_auth = matches!(client_auth_opt.as_ref(), Some(v) if v.eq_ignore_ascii_case("true")); TelemetrySecurityConfig { use_client_auth, ca_crt, server_crt, server_key } @@ -303,6 +336,13 @@ fn is_serialnumber_probe_enabled() -> bool { } } +fn is_cert_probe_enabled() -> bool { + match env::var(CERT_PROBE_ENV_VAR) { + Ok(v) if v.eq_ignore_ascii_case("true") => true, + _ => false, // default disabled + } +} + fn get_gnmi_port() -> u16 { match redis_hget("TELEMETRY|gnmi", "port") { Some(p) => p.parse::().unwrap_or_else(|_| { @@ -382,6 +422,45 @@ fn main() { let timeout = read_timeout(); let target_name = get_target_name(); + // Certificate probes on reboot-cause/history API + // 1) BAD cert: expect failure + // 2) GOOD cert: expect success + // Only run cert probes when client_auth is actually enabled; + // otherwise the probes would always fail against an insecure server. + if is_cert_probe_enabled() && sec_cfg.use_client_auth { + let xpath_rc = "reboot-cause/history"; + + let bad_sec = TelemetrySecurityConfig { + use_client_auth: true, + ca_crt: env_or_default(BAD_CA_ENV_VAR, DEFAULT_BAD_CA), + server_crt: env_or_default(BAD_CERT_ENV_VAR, DEFAULT_BAD_CERT), + server_key: env_or_default(BAD_KEY_ENV_VAR, DEFAULT_BAD_KEY), + }; + let bad_cname = env_or_default(BAD_CNAME_ENV_VAR, DEFAULT_BAD_CNAME); + let mut res_bad = run_gnmi_for_xpath(&xpath_rc, port, &bad_sec, &bad_cname, timeout, "SHOW"); + if res_bad.success { + res_bad.success = false; + let msg = "Expected FAILURE with BAD cert but command SUCCEEDED".to_string(); + res_bad.error = Some(match res_bad.error.take() { + Some(existing) => format!("{existing}; {msg}"), + None => msg, + }); + http_status = "HTTP/1.1 500 Internal Server Error"; + } + cmd_results.push(res_bad); + + let good_sec = TelemetrySecurityConfig { + use_client_auth: true, + ca_crt: env_or_default(GOOD_CA_ENV_VAR, DEFAULT_GOOD_CA), + server_crt: env_or_default(GOOD_CERT_ENV_VAR, DEFAULT_GOOD_CERT), + server_key: env_or_default(GOOD_KEY_ENV_VAR, DEFAULT_GOOD_KEY), + }; + let good_cname = env_or_default(GOOD_CNAME_ENV_VAR, DEFAULT_GOOD_CNAME); + let res_good = run_gnmi_for_xpath(&xpath_rc, port, &good_sec, &good_cname, timeout, "SHOW"); + if !res_good.success { http_status = "HTTP/1.1 500 Internal Server Error"; } + cmd_results.push(res_good); + } + // Check Serial Number if is_serialnumber_probe_enabled() { let xpath_sn = "DEVICE_METADATA/localhost/chassis_serial_number";