diff --git a/files/build_templates/docker_image_ctl.j2 b/files/build_templates/docker_image_ctl.j2 index 95c2e1fa52a..9114f146461 100644 --- a/files/build_templates/docker_image_ctl.j2 +++ b/files/build_templates/docker_image_ctl.j2 @@ -108,6 +108,37 @@ function preStartAction() fi {%- elif docker_container_name == "snmp" %} $SONIC_DB_CLI STATE_DB HSET 'DEVICE_METADATA|localhost' chassis_serial_number $(decode-syseeprom -s) + + # Wait for system ready. Continue after timeout if we won't hear anything + # from DB. Read timeout from config file and add two extra minutes on top of + # it. + CONFFILE=system_health_monitoring_config.json + PLATFORM=$(sonic-cfggen -d -v "DEVICE_METADATA['localhost']['platform']") + TIMEOUT=$(cat /usr/share/sonic/device/$PLATFORM/$CONFFILE | jq ".timeout") + SYSTEM_READY_EVENT="SYSTEM_READY|SYSTEM_STATE" + CHECK_SR_EVENT="sonic-db-dump -n STATE_DB -k $SYSTEM_READY_EVENT" + + # Set default if not found + if [[ -z $TIMEOUT ]]; then + MESSAGE="Failed to read timeout from config file, assuming default is \ + 10 minutes" + echo $MESSAGE + TIMEOUT=10 + fi + + # Add to extra minutes and convert to seconds and + TIMEOUT=$(((TIMEOUT + 2) * 60)) + + # Waiting for event + echo "Waiting for $SYSTEM_READY_EVENT event" + while [[ -z "$($CHECK_SR_EVENT | jq '.[]["value"]["Status"]')" ]]; do + UPTIME=$(awk -F. '{print $1}' /proc/uptime) + if [[ $UPTIME -gt $TIMEOUT ]]; then + echo "Got uptime timeout - starting snmp..." + break + fi + sleep 15 + done {%- else %} : # nothing {%- endif %} diff --git a/files/build_templates/init_cfg.json.j2 b/files/build_templates/init_cfg.json.j2 index 9c02de60207..c1e7a308a54 100644 --- a/files/build_templates/init_cfg.json.j2 +++ b/files/build_templates/init_cfg.json.j2 @@ -79,11 +79,17 @@ "auto_restart": "{{autorestart}}", "support_syslog_rate_limit" : "true", {# Set check_up_status to true here when app readiness will be marked in state db #} +{%- if feature in ["swss", "syncd", "pmon"] %} + "check_up_status" : "true", +{%- endif %} {# For now, to support the infrastrucure, setting the check_up_status to false for bgp,swss,pmon #} {# Once apps like bgp,synd supports app readiness, then bgp,syncd can set check_up_status to true #} -{%- if feature in ["bgp", "swss", "pmon"] %} +{%- if feature in ["bgp"] %} "check_up_status" : "false", {%- endif %} +{%- if feature in ["ib-utils", "snmp"] %} + "irrel_for_sysready" : "true", +{%- endif %} {%- if include_kubernetes == "y" %} {%- if feature in ["lldp", "pmon", "radv", "eventd", "snmp", "telemetry", "gnmi"] %} "set_owner": "kube", {% else %} diff --git a/files/scripts/syncd.sh b/files/scripts/syncd.sh index e9c2cfd494b..2115198dd27 100755 --- a/files/scripts/syncd.sh +++ b/files/scripts/syncd.sh @@ -44,6 +44,8 @@ function startplatform() { /usr/bin/mlnx-fw-upgrade.sh -v if [[ "$?" -ne "${EXIT_SUCCESS}" ]]; then debug "Failed to upgrade fw. " "$?" "Restart syncd" + sonic-db-cli STATE_DB HSET "FEATURE|$DEV_SRV" fail_reason \ + "ASIC FW update failed" up_status false exit 1 fi /etc/init.d/sxdkernel restart @@ -86,6 +88,9 @@ function startplatform() { /etc/init.d/xpnet.sh start fi fi + + sonic-db-cli STATE_DB HDEL "FEATURE|$DEV_SRV" fail_reason + sonic-db-cli STATE_DB HSET "FEATURE|$DEV_SRV" up_status true } function waitplatform() { @@ -169,9 +174,11 @@ LOCKFILE="/tmp/swss-syncd-lock$DEV" NAMESPACE_PREFIX="asic" if [ "$DEV" ]; then NET_NS="$NAMESPACE_PREFIX$DEV" #name of the network namespace + DEV_SRV="$SERVICE@$DEV" SONIC_DB_CLI="sonic-db-cli -n $NET_NS" else NET_NS="" + DEV_SRV="$SERVICE" SONIC_DB_CLI="sonic-db-cli" fi diff --git a/src/sonic-yang-models/doc/Configuration.md b/src/sonic-yang-models/doc/Configuration.md index 269a60f6739..84bd1456ce4 100644 --- a/src/sonic-yang-models/doc/Configuration.md +++ b/src/sonic-yang-models/doc/Configuration.md @@ -938,31 +938,31 @@ deployment envionment id and deployment type are specified. BGP local AS number is also specified in this table as current only single BGP instance is supported in SONiC. -``` +```json { -"DEVICE_METADATA": { + "DEVICE_METADATA": { "localhost": { - "hwsku": "Force10-S6100", - "default_bgp_status": "up", - "docker_routing_config_mode": "unified", - "hostname": "sonic-s6100-01", - "platform": "x86_64-dell_s6100_c2538-r0", - "mac": "4c:76:25:f4:70:82", - "default_pfcwd_status": "disable", - "bgp_asn": "65100", - "deployment_id": "1", - "type": "ToRRouter", - "bgp_adv_lo_prefix_as_128" : "true", - "buffer_model": "traditional", - "yang_config_validation": "disable", - "rack_mgmt_map": "dummy_value", - "timezome": "Europe/Kiev" + "hwsku": "Force10-S6100", + "default_bgp_status": "up", + "docker_routing_config_mode": "unified", + "hostname": "sonic-s6100-01", + "platform": "x86_64-dell_s6100_c2538-r0", + "mac": "4c:76:25:f4:70:82", + "default_pfcwd_status": "disable", + "bgp_asn": "65100", + "deployment_id": "1", + "type": "ToRRouter", + "bgp_adv_lo_prefix_as_128" : "true", + "buffer_model": "traditional", + "yang_config_validation": "disable", + "rack_mgmt_map": "dummy_value", + "timezome": "Europe/Kiev", + "sysready_state": "enabled" + } } - } } - ``` - +* `sysready_state` - System-ready feature configuration: `{enabled,disabled}` ### Device neighbor metada diff --git a/src/sonic-yang-models/tests/yang_model_tests/tests/device_metadata.json b/src/sonic-yang-models/tests/yang_model_tests/tests/device_metadata.json index 141d8b5c823..c3ef45958cf 100644 --- a/src/sonic-yang-models/tests/yang_model_tests/tests/device_metadata.json +++ b/src/sonic-yang-models/tests/yang_model_tests/tests/device_metadata.json @@ -156,5 +156,12 @@ "DEVICE_METADATA_INVALID_CREATE_ONLY_CONFIG_DB_BUFFERS": { "desc": "Verifying invalid create_only_config_db_buffers value", "eStrKey": "InvalidValue" + }, + "DEVICE_METADATA_VALID_SYSREADY": { + "desc": "Verifying valid system-ready state" + }, + "DEVICE_METADATA_INVALID_SYSREADY": { + "desc": "Verifying invalid system-ready state", + "eStrKey": "InvalidValue" } } diff --git a/src/sonic-yang-models/tests/yang_model_tests/tests/feature.json b/src/sonic-yang-models/tests/yang_model_tests/tests/feature.json index 52dce58a2cf..2e8400a2319 100644 --- a/src/sonic-yang-models/tests/yang_model_tests/tests/feature.json +++ b/src/sonic-yang-models/tests/yang_model_tests/tests/feature.json @@ -14,5 +14,10 @@ }, "FEATURE_WITH_NO_OWNER" : { "desc": "Config feature table without set_owner" + }, + "FEATURE_INVALID_IRRELEVANT_FOR_SYSTEM_READY" : { + "desc": "Invalid value for 'irrelevant for system-ready' parameter", + "eStrKey": "Pattern", + "eStr": ["false|true|False|True"] } } diff --git a/src/sonic-yang-models/tests/yang_model_tests/tests_config/device_metadata.json b/src/sonic-yang-models/tests/yang_model_tests/tests_config/device_metadata.json index 941077205df..526ca8abb85 100644 --- a/src/sonic-yang-models/tests/yang_model_tests/tests_config/device_metadata.json +++ b/src/sonic-yang-models/tests/yang_model_tests/tests_config/device_metadata.json @@ -416,5 +416,23 @@ } } } + }, + "DEVICE_METADATA_VALID_SYSREADY": { + "sonic-device_metadata:sonic-device_metadata": { + "sonic-device_metadata:DEVICE_METADATA": { + "sonic-device_metadata:localhost": { + "sysready_state": "disabled" + } + } + } + }, + "DEVICE_METADATA_INVALID_SYSREADY": { + "sonic-device_metadata:sonic-device_metadata": { + "sonic-device_metadata:DEVICE_METADATA": { + "sonic-device_metadata:localhost": { + "sysready_state": "enable" + } + } + } } } diff --git a/src/sonic-yang-models/tests/yang_model_tests/tests_config/feature.json b/src/sonic-yang-models/tests/yang_model_tests/tests_config/feature.json index 12e1e07173c..a50f071b3d0 100644 --- a/src/sonic-yang-models/tests/yang_model_tests/tests_config/feature.json +++ b/src/sonic-yang-models/tests/yang_model_tests/tests_config/feature.json @@ -57,6 +57,7 @@ "has_per_asic_scope": "{% if not DEVICE_RUNTIME_METADATA['ETHERNET_PORTS_PRESENT'] or ('CHASSIS_METADATA' in DEVICE_RUNTIME_METADATA and DEVICE_RUNTIME_METADATA['CHASSIS_METADATA']['module_type'] in ['supervisor']) %}False{% else %}True{% endif %}", "set_owner": "kube", "check_up_status": "false", + "irrel_for_sysready": "true", "support_syslog_rate_limit": "true" }, { @@ -68,6 +69,7 @@ "has_per_asic_scope": "true", "set_owner": "kube", "check_up_status": "false", + "irrel_for_sysready": "false", "support_syslog_rate_limit": "true" } ] @@ -129,5 +131,17 @@ ] } } + }, + "FEATURE_INVALID_IRRELEVANT_FOR_SYSTEM_READY": { + "sonic-feature:sonic-feature": { + "sonic-feature:FEATURE": { + "FEATURE_LIST": [ + { + "name": "swss", + "irrel_for_sysready": "tr" + } + ] + } + } } } diff --git a/src/sonic-yang-models/yang-models/sonic-device_metadata.yang b/src/sonic-yang-models/yang-models/sonic-device_metadata.yang index 7226b02e53e..42422e97d3e 100644 --- a/src/sonic-yang-models/yang-models/sonic-device_metadata.yang +++ b/src/sonic-yang-models/yang-models/sonic-device_metadata.yang @@ -19,6 +19,10 @@ module sonic-device_metadata { description "DEVICE_METADATA YANG Module for SONiC OS"; + revision 2023-05-10 { + description "Add system-ready feature configuration"; + } + revision 2021-02-27 { description "Added frr_mgmt_framework_config field to handle BGP config DB schema events to configure FRR protocols."; @@ -242,6 +246,12 @@ module sonic-device_metadata { otherwise the maximum available buffers (which are read from SAI) will be created, regardless of the CONFIG_DB buffers configuration."; } + + leaf sysready_state { + description "System-ready feature state"; + type stypes:admin_mode; + default enabled; + } } /* end of container localhost */ } diff --git a/src/sonic-yang-models/yang-models/sonic-feature.yang b/src/sonic-yang-models/yang-models/sonic-feature.yang index 6f23135b609..68556276895 100644 --- a/src/sonic-yang-models/yang-models/sonic-feature.yang +++ b/src/sonic-yang-models/yang-models/sonic-feature.yang @@ -11,6 +11,10 @@ module sonic-feature{ description "Feature Table yang Module for SONiC"; + revision 2023-05-10 { + description "Add system-ready feature configuration"; + } + typedef feature-state { description "configuration to set the feature running state"; type string; @@ -112,6 +116,12 @@ module sonic-feature{ type stypes:boolean_type; default "false"; } + + leaf irrel_for_sysready { + description "Readiness of this feature is irrelevant for system-ready state"; + type stypes:boolean_type; + default "false"; + } } } } diff --git a/src/system-health/health_checker/config.py b/src/system-health/health_checker/config.py index 63af22db4ad..ffc676d1a26 100644 --- a/src/system-health/health_checker/config.py +++ b/src/system-health/health_checker/config.py @@ -32,6 +32,9 @@ class Config(object): # Monit service start delay configuration entry MONIT_START_DELAY_CONFIG = 'with start delay' + # Default system ready timeout + DEFAULT_SYSREADY_TIMEOUT = 10 + def __init__(self): """ Constructor. Initialize all configuration entry to default value in case there is no configuration file. @@ -44,6 +47,9 @@ def __init__(self): self.ignore_services = None self.ignore_devices = None self.user_defined_checkers = None + self.wait_services = None + self.services_to_report_app_status = None + self.timeout = Config.DEFAULT_SYSREADY_TIMEOUT def config_file_exists(self): return os.path.exists(self._config_file) @@ -70,9 +76,13 @@ def load_config(self): self.interval = self.config_data.get('polling_interval', Config.DEFAULT_INTERVAL) self.ignore_services = self._get_list_data('services_to_ignore') + self.wait_services = self._get_list_data('services_to_wait') + self.services_to_report_app_status = self._get_list_data('services_to_report_app_status') + self.timeout = self.config_data.get('timeout', Config.DEFAULT_SYSREADY_TIMEOUT) self.ignore_devices = self._get_list_data('devices_to_ignore') self.user_defined_checkers = self._get_list_data('user_defined_checkers') except Exception as e: + # TODO: Add log here. Unexpected fail is not visible self._reset() def _reset(self): diff --git a/src/system-health/health_checker/sysmonitor.py b/src/system-health/health_checker/sysmonitor.py index 3a2840e11a6..2cbf90aa6cf 100755 --- a/src/system-health/health_checker/sysmonitor.py +++ b/src/system-health/health_checker/sysmonitor.py @@ -1,28 +1,34 @@ #!/usr/bin/python3 +import glob +import multiprocessing import os import sys import time -import glob -import multiprocessing +from copy import copy from datetime import datetime -from swsscommon import swsscommon + from sonic_py_common.logger import Logger -from . import utils from sonic_py_common.task_base import ProcessTaskBase +from swsscommon import swsscommon + +from . import utils from .config import Config import signal SYSLOG_IDENTIFIER = "system#monitor" REDIS_TIMEOUT_MS = 0 -system_allsrv_state = "DOWN" -spl_srv_list = ['database-chassis', 'gbsyncd'] +spl_srv_list = ['database-chassis', 'gbsyncd', 'e2scrub_reap'] SELECT_TIMEOUT_MSECS = 1000 QUEUE_TIMEOUT = 15 TASK_STOP_TIMEOUT = 10 logger = Logger(log_identifier=SYSLOG_IDENTIFIER) +def get_timestamp(): + return '{}'.format(datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")) + + #Subprocess which subscribes to STATE_DB FEATURE table for any update #and push service events to main process via queue class MonitorStateDbTask(ProcessTaskBase): @@ -34,8 +40,10 @@ def __init__(self,myQ): def subscribe_statedb(self): state_db = swsscommon.DBConnector("STATE_DB", REDIS_TIMEOUT_MS, False) sel = swsscommon.Select() - cst = swsscommon.SubscriberStateTable(state_db, "FEATURE") - sel.addSelectable(cst) + feature_sub = swsscommon.SubscriberStateTable(state_db, "FEATURE") + service_sub = swsscommon.SubscriberStateTable(state_db, "SERVICE_APP") + sel.addSelectable(feature_sub) + sel.addSelectable(service_sub) while not self.task_stopping_event.is_set(): (state, c) = sel.select(SELECT_TIMEOUT_MSECS) @@ -44,11 +52,22 @@ def subscribe_statedb(self): if state != swsscommon.Select.OBJECT: logger.log_warning("sel.select() did not return swsscommon.Select.OBJECT") continue - (key, op, cfvs) = cst.pop() + + fd = c.getFd() + if fd == feature_sub.getFd(): + (key, op, cfvs) = feature_sub.pop() + source = 'feature' + elif fd == service_sub.getFd(): + (key, op, cfvs) = service_sub.pop() + source = 'service_app' + else: + logger.log_warning("sel.select() returned unexpected selectable OBJECT") + continue + key_ext = key + ".service" - timestamp = "{}".format(datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")) - msg={"unit": key_ext, "evt_src":"feature", "time":timestamp} - self.task_notify(msg) + self.task_notify({"unit": key_ext, + "evt_src": source, + "time": get_timestamp()}) def task_worker(self): @@ -83,8 +102,8 @@ def on_job_removed(self, id, job, unit, result): #Function for listening the systemd event on dbus def subscribe_sysbus(self): import dbus - from gi.repository import GLib from dbus.mainloop.glib import DBusGMainLoop + from gi.repository import GLib DBusGMainLoop(set_as_default=True) bus = dbus.SystemBus() @@ -107,17 +126,72 @@ def task_notify(self, msg): return self.task_queue.put(msg) + +# Subprocess to monitor system ready timeout. If timeout will be exceeded, +# send a message to queue and exit +class MonitorTimeout(ProcessTaskBase): + def __init__(self, myQ, config): + ProcessTaskBase.__init__(self) + self._stop_timeout_secs = TASK_STOP_TIMEOUT + self.task_queue = myQ + + if config and config.timeout: + self.timeout = config.timeout + else: + self.timeout = 10 + logger.log_warning("Can't get timeout configuration, assuming 10 " + "minutes") + + def task_worker(self): + if self.task_stopping_event.is_set(): + return + + logger.log_info(f'Set system ready timeout task for {self.timeout} ' + 'minutes') + + # Repeat until timeout or canceling + while not self.task_stopping_event.is_set(): + if utils.get_uptime() > self.timeout * 60: + logger.log_warning('Got system ready timeout. Notifying') + self.task_notify({ + 'unit': 'timeout', + 'evt_src': 'timeout', + 'time': get_timestamp() + }) + break + + time.sleep(15) + + logger.log_info('System ready timeout task finished') + + def task_notify(self, msg): + if self.task_stopping_event.is_set(): + return + self.task_queue.put(msg) + + #Mainprocess which launches 2 subtasks - systembus task and statedb task #and on receiving events, checks and updates the system ready status to state db class Sysmonitor(ProcessTaskBase): def __init__(self): ProcessTaskBase.__init__(self) + # Initialize the timeout to stop this task once "task_stop" event is + # received self._stop_timeout_secs = TASK_STOP_TIMEOUT - self.dnsrvs_name = set() + self.state_db = None self.config_db = None + self.monitor_timeout = None + + # List of services in "DOWN" state + self.dnsrvs_name = set() + # Last reported system state + self.last_reported_state = 'INIT' + + # Load system health monitoring configuration self.config = Config() + self.config.load_config() self.mpmgr = multiprocessing.Manager() self.myQ = self.mpmgr.Queue() @@ -141,25 +215,84 @@ def get_all_service_list(self): self.config_db = swsscommon.ConfigDBConnector(use_unix_socket_path=True) self.config_db.connect() - dir_list = [] + dir_set = set() + srvs_files = set() + #add the services from the below targets - targets= ["/etc/systemd/system/multi-user.target.wants", "/etc/systemd/system/sonic.target.wants"] + targets= [ + "/etc/systemd/system/multi-user.target.wants", + "/etc/systemd/system/sonic.target.wants", + ] for path in targets: - dir_list += [os.path.basename(i) for i in glob.glob('{}/*.service'.format(path))] - - #add the enabled docker services from config db feature table - self.get_service_from_feature_table(dir_list) - - self.config.load_config() + srvs_files_list = glob.glob('{}/*.service'.format(path)) + srvs_files |= {os.path.basename(i) for i in srvs_files_list} + + # By default, wait for all + dir_set = copy(srvs_files) + + # Want only services: means all multi-user and sonic targets, except + # services defined in FEATURE table. Feature table is expandable, so + # it may and should control services which will have effect on + # system-ready state by itself. + # Leave only services we want to wait for + if self.config and self.config.wait_services: + # srv_file example - swss@1.service, syncd@0.service + # full_srv_name example - swss@1, syncd@0 + # srv_name example - swss, syncd + dir_set = set() + for srv_name in self.config.wait_services: + for srv_file in srvs_files: + full_srv_name = srv_file.removesuffix('.service') + if (full_srv_name == srv_name or + full_srv_name.startswith(srv_name + '@')): + dir_set.add(srv_file) + + # add the enabled docker services from config db feature table. + # It will respect irrel_for_sysready flag. + feature_table_srvs = self.get_service_from_feature_table() + # srv_file example - swss@1.service, syncd@0.service + # full_srv_name example - swss@1, syncd@0 + # srv_name example - swss, syncd + for srv_name in feature_table_srvs: + for srv_file in srvs_files: + full_srv_name = srv_file.removesuffix('.service') + if (full_srv_name == srv_name or + full_srv_name.startswith(srv_name + '@')): + dir_set.add(srv_file) + + # Ignore what we don't need (we are using here the list of services to + # ignore for system health), of course if some service doesn't affect + # system health, so why it should affect on system ready? Well, it + # shouldn't. + # So that last filter filters again all services we don't want. if self.config and self.config.ignore_services: - for srv in self.config.ignore_services: - if srv in dir_list: - dir_list.remove(srv) + # srv_file example - swss@1.service, syncd@0.service + # srv_name example - swss, syncd + for srv_file in copy(dir_set): + srv_name = srv_file.removesuffix('.service').split('@')[0] + if srv_name in self.config.ignore_services: + dir_set.remove(srv_file) + + # Feature is controllable by sysready_state parameter of + # DEVICE_METADATA table. Setting it to "disable" will just simulate + # "PortInitDone" behavior. + device_metadata = self.config_db.get_table( + swsscommon.CFG_DEVICE_METADATA_TABLE_NAME).get('localhost', {}) + feature_state = device_metadata.get('sysready_state', 'enabled') + + # Handle feature state. + # Use only swss.service to rely on ready state + if feature_state == 'disabled': + # Order matters + desired_srvs = ['swss.service'] + for srv in desired_srvs: + if srv in dir_set: + dir_set = {srv} + break - dir_list.sort() - return dir_list + return sorted(dir_set) - def get_service_from_feature_table(self, dir_list): + def get_service_from_feature_table(self): """Get service from CONFIG DB FEATURE table. During "config reload" command, filling FEATURE table is not an atomic operation, sonic-cfggen do it with two steps: 1. Add an empty table entry to CONFIG DB @@ -168,13 +301,15 @@ def get_service_from_feature_table(self, dir_list): So, if system health read db on middle of step 1 and step 2, it might read invalid data. A retry mechanism is here to avoid such issue. - Args: - dir_list (list): service list + Return: + srvs_list: list: tracked services list """ max_retry = 3 retry_delay = 1 success = True + srvs_list = [] + while max_retry > 0: success = True feature_table = self.config_db.get_table("FEATURE") @@ -184,9 +319,9 @@ def get_service_from_feature_table(self, dir_list): logger.log_warning("FEATURE table is not fully ready: {}, retrying".format(feature_table)) break if fields["state"] not in ["disabled", "always_disabled"]: - srvext = srv + ".service" - if srvext not in dir_list: - dir_list.append(srvext) + if fields.get('irrel_for_sysready', '').lower() != 'true': + if srv not in srvs_list: + srvs_list.append(srv) if not success: max_retry -= 1 time.sleep(retry_delay) @@ -195,9 +330,11 @@ def get_service_from_feature_table(self, dir_list): if not success: logger.log_error("FEATURE table is not fully ready: {}, max retry reached".format(feature_table)) - #Checks FEATURE table from config db for the service' check_up_status flag - #if marked to true, then read the service up_status from FEATURE table of state db. - #else, just return Up + return srvs_list + + # Checks FEATURE table from config db or data from json file for the + # service' check_up_status flag if marked to true, then read the service + # up_status from FEATURE table of state db. Else, just return Up def get_app_ready_status(self, service): if not self.state_db: self.state_db = swsscommon.SonicV2Connector(use_unix_socket_path=True) @@ -212,26 +349,59 @@ def get_app_ready_status(self, service): configdb_feature_table = self.config_db.get_table('FEATURE') update_time = "-" - if service not in configdb_feature_table.keys(): + app_up_status_cfg = set() + if self.config and self.config.services_to_report_app_status: + app_up_status_cfg = self.config.services_to_report_app_status + + # Config DB will have a configuration for non parametrized service + # But if there is a parameter, the service should handle the right + # reporting itself. + service_noparam = service.split('@')[0] + if service_noparam not in (app_up_status_cfg | + configdb_feature_table.keys()): + pstate = "Up" + return pstate,fail_reason,update_time + + if service_noparam in configdb_feature_table.keys(): + table = 'FEATURE' + check_app_up_status = \ + configdb_feature_table[service_noparam].get('check_up_status') + else: + table = 'SERVICE_APP' + + srvc_table = '{}|{}'.format(table, service) + + # Handle feature's application up status + if table == 'FEATURE' and (not check_app_up_status or + check_app_up_status.lower() == 'false'): + # Don't check application up status if check_app_up_status flag is + # not present or set to false pstate = "Up" else: - check_app_up_status = configdb_feature_table[service].get('check_up_status') - if check_app_up_status is not None and (check_app_up_status.lower()) == "true": - up_status_flag = self.state_db.get(self.state_db.STATE_DB, 'FEATURE|{}'.format(service), 'up_status') - if up_status_flag is not None and (up_status_flag.lower()) == "true": - pstate = "Up" + up_status_flag = self.state_db.get(self.state_db.STATE_DB, + srvc_table, 'up_status') + if up_status_flag is not None and up_status_flag.lower() == "true": + pstate = "Up" + else: + if (up_status_flag is not None and + up_status_flag.lower() == "false"): + # The application is in failed state if status flag set to + # false + pstate = "Failed" else: - fail_reason = self.state_db.get(self.state_db.STATE_DB, 'FEATURE|{}'.format(service), 'fail_reason') - if fail_reason is None: - fail_reason = "NA" + # In that case it is just in init state pstate = "Down" - update_time = self.state_db.get(self.state_db.STATE_DB, 'FEATURE|{}'.format(service), 'update_time') - if update_time is None: - update_time = "-" - else: - #Either check_up_status marked False or entry does not exist - pstate = "Up" + # Try to get fail reason anyway + fail_reason = self.state_db.get(self.state_db.STATE_DB, + srvc_table, 'fail_reason') + if fail_reason is None: + fail_reason = "NA" + + update_time = self.state_db.get(self.state_db.STATE_DB, srvc_table, + 'update_time') + if update_time is None: + update_time = "-" return pstate,fail_reason,update_time @@ -254,6 +424,27 @@ def post_unit_status(self, srv_name, srv_status, app_status, fail_reason, update self.state_db = swsscommon.SonicV2Connector(use_unix_socket_path=True) self.state_db.connect(self.state_db.STATE_DB) + # Log message with proper level + logf = logger.log_notice + msg = f'{srv_name} service status: {srv_status}, ' + msg += f'app status: {app_status}' + + if srv_status != 'OK': + msg += f', fail reason: {fail_reason}' + + # So far good down reasons for each service status + good_dr = { + 'Down': ['Inactive'], + 'Starting': ['-'], + 'Stopping': ['-'] + } + + # Only allowed combinations + if fail_reason not in good_dr.get(srv_status, []): + logf = logger.log_error + + logf(msg) + key = 'ALL_SERVICE_STATUS|{}'.format(srv_name) statusvalue = {} statusvalue['service_status'] = srv_status @@ -300,6 +491,9 @@ def get_unit_status(self, event): if init_state == "Up": service_up_status = "OK" unit_status = "OK" + elif init_state == "Failed": + fail_reason = app_fail_reason + unit_status = "FAILED" else: fail_reason = app_fail_reason unit_status = "NOT OK" @@ -313,10 +507,20 @@ def get_unit_status(self, event): service_status = "Stopping" service_up_status = "Stopping" elif active_state == "inactive": - if srv_type == "oneshot" or service_name in spl_srv_list: + if service_name in spl_srv_list: service_status = "OK" service_up_status = "OK" unit_status = "OK" + elif srv_type == "oneshot": + # If oneshot service is + if sub_state == "exited": + service_status = "OK" + service_up_status = "OK" + unit_status = "OK" + else: + unit_status = "NOT OK" + if fail_reason == "-": + fail_reason = "Inactive" else: unit_status = "NOT OK" if fail_reason == "-": @@ -344,11 +548,13 @@ def get_all_system_status(self): if ustate == "NOT OK": if service not in self.dnsrvs_name: self.dnsrvs_name.add(service) + elif ustate == "FAILED": + return "FAILED" if len(self.dnsrvs_name) == 0: return "UP" else: - return "DOWN" + return "INIT" #Displays the system ready status message on console def print_console_message(self, message): @@ -356,51 +562,65 @@ def print_console_message(self, message): console.write("\n{} {}\n".format(datetime.now().strftime("%b %d %H:%M:%S.%f"), message)) #Publish the system ready status message on logger,console and state db - def publish_system_status(self, astate): - global system_allsrv_state - if system_allsrv_state != astate: - system_allsrv_state = astate - if astate == "DOWN": - msg = "System is not ready - one or more services are not up" - elif astate == "UP": - msg = "System is ready" - logger.log_notice(msg) - self.print_console_message(msg) - self.post_system_status(astate) - - #Checks all the services and updates the current system status - def update_system_status(self): - try: - astate = self.get_all_system_status() - self.publish_system_status(astate) + def publish_system_status(self, new_state): + # Allowed transitions + # 'from': ['to'] + transitions = { + 'INIT': ['UP', 'DOWN', 'FAILED'], + 'DOWN': ['FAILED'] + } + + # Check if state transition is allowed. If status update not allowed, + # don't perform it + if new_state not in transitions.get(self.last_reported_state, []): + return - except Exception as e: - logger.log_error("update system status exception:{}".format(str(e))) + if new_state == "DOWN": + msg = "System is not ready - one or more services are not up" + elif new_state == "FAILED": + msg = "System is not ready - one or more services failed" + else: + msg = "System is ready" + + # Broadcast new system status + logger.log_notice(msg) + self.print_console_message(msg) + self.post_system_status(new_state) + self.last_reported_state = new_state + + # We don't need timeout anymore + self.monitor_timeout.task_stop() #Checks a service status and updates the system status def check_unit_status(self, event): - #global dnsrvs_name if not self.state_db: self.state_db = swsscommon.SonicV2Connector(use_unix_socket_path=True) self.state_db.connect(self.state_db.STATE_DB) - astate = "DOWN" + astate = 'INIT' full_srv_list = self.get_all_service_list() if event in full_srv_list: ustate = self.get_unit_status(event) - if ustate == "OK" and system_allsrv_state == "UP": - astate = "UP" - elif ustate == "OK" and system_allsrv_state == "DOWN": + if ustate == 'OK': + # If that unit was down/inactive, and now it is up, remove it + # from the set of down units if event in self.dnsrvs_name: self.dnsrvs_name.remove(event) - if len(self.dnsrvs_name) == 0: - astate = "UP" - else: - astate = "DOWN" - else: + # If now the set of down units is empty - we are good to report + # "OK" system status + if len(self.dnsrvs_name) == 0: + astate = 'UP' + elif ustate == 'FAILED': + # If unit reported failed state, the system should report + # "NOT OK" status + astate = 'FAILED' + elif ustate == 'NOT OK': if event not in self.dnsrvs_name: self.dnsrvs_name.add(event) - astate = "DOWN" + else: + logger.log_warning(f'Got wrong {event} unit state: {ustate}. ' + 'Nothing to do') + return self.publish_system_status(astate) else: @@ -411,9 +631,7 @@ def check_unit_status(self, event): if len(self.dnsrvs_name) == 0: astate = "UP" - else: - astate = "DOWN" - self.publish_system_status(astate) + self.publish_system_status(astate) srv_name,last = event.split('.') # stop on service maybe propagated to timers and in that case, @@ -424,7 +642,7 @@ def check_unit_status(self, event): if key_exists == 1: self.state_db.delete(self.state_db.STATE_DB, key) - return 0 + return def system_service(self): if not self.state_db: @@ -438,12 +656,18 @@ def system_service(self): monitor_statedb_table = MonitorStateDbTask(self.myQ) monitor_statedb_table.task_run() + self.monitor_timeout = MonitorTimeout(self.myQ, self.config) + self.monitor_timeout.task_run() except Exception as e: logger.log_error("SubProcess-{}".format(str(e))) sys.exit(1) - - self.update_system_status() + try: + # Maybe everything is ready now, so let's try + astate = self.get_all_system_status() + self.publish_system_status(astate) + except Exception as e: + logger.log_error(f"Failed to update initial system status: {e}") from queue import Empty # Queue to receive the STATEDB and Systemd state change event @@ -455,18 +679,34 @@ def system_service(self): event_time = msg["time"] logger.log_debug("Main process- received event:{} from source:{} time:{}".format(event,event_src,event_time)) logger.log_info("check_unit_status for [ "+event+" ] ") - self.check_unit_status(event) + if event_src == 'timeout': + # If got timeout, so not all daemons/apps reported their + # ready status. It mean the system is in "NOT OK" state. + # But we can't assume it is failed, so it is just down so + # far. + self.publish_system_status('DOWN') + else: + self.check_unit_status(event) + + # Recheck the down services, cause they may not update dbus + for service in self.dnsrvs_name.copy(): + self.check_unit_status(service) except Empty: pass except Exception as e: logger.log_error("system_service"+str(e)) #cleanup tables "'ALL_SERVICE_STATUS*', 'SYSTEM_READY*'" from statedb - self.state_db.delete_all_by_pattern(self.state_db.STATE_DB, "ALL_SERVICE_STATUS|*") - self.state_db.delete_all_by_pattern(self.state_db.STATE_DB, "SYSTEM_READY|*") + self.state_db.delete_all_by_pattern(self.state_db.STATE_DB, + "ALL_SERVICE_STATUS|*") + self.state_db.delete_all_by_pattern(self.state_db.STATE_DB, + "SYSTEM_READY|*") + self.state_db.delete_all_by_pattern(self.state_db.STATE_DB, + "SERVICE_APP|*") monitor_system_bus.task_stop() monitor_statedb_table.task_stop() + self.monitor_timeout.task_stop() def task_worker(self): if self.task_stopping_event.is_set(): @@ -492,5 +732,3 @@ def task_stop(self): return False return True - - diff --git a/src/system-health/tests/test_system_health.py b/src/system-health/tests/test_system_health.py index 67f819ecc5f..19a5724af52 100644 --- a/src/system-health/tests/test_system_health.py +++ b/src/system-health/tests/test_system_health.py @@ -37,6 +37,7 @@ from health_checker.sysmonitor import Sysmonitor from health_checker.sysmonitor import MonitorStateDbTask from health_checker.sysmonitor import MonitorSystemBusTask +from health_checker.sysmonitor import MonitorTimeout load_source('healthd', os.path.join(scripts_path, 'healthd')) from healthd import HealthDaemon @@ -46,6 +47,14 @@ snmp-subagent EXITED Oct 19 01:53 AM """ device_info.get_platform = MagicMock(return_value='unittest') +stub_glob = [ + '/some/systemd/path/radv.service', + '/some/systemd/path/bgp.service', + '/some/systemd/path/pmon.service', + '/some/systemd/path/swss.service', + '/some/systemd/path/swss@1.service', + '/some/systemd/path/ntp-config.service', +] def setup(): @@ -579,10 +588,12 @@ def test_utils(): @patch('swsscommon.swsscommon.ConfigDBConnector.connect', MagicMock()) -@patch('sonic_py_common.multi_asic.is_multi_asic', MagicMock(return_value=False)) +@patch('sonic_py_common.multi_asic.is_multi_asic', + MagicMock(return_value=False)) @patch('docker.DockerClient') @patch('health_checker.utils.run_command') @patch('swsscommon.swsscommon.ConfigDBConnector') +@patch('glob.glob', MagicMock(return_value=stub_glob)) def test_get_all_service_list(mock_config_db, mock_run, mock_docker_client): mock_db_data = MagicMock() mock_get_table = MagicMock() @@ -598,6 +609,7 @@ def test_get_all_service_list(mock_config_db, mock_run, mock_docker_client): 'state': 'enabled', 'has_global_scope': 'True', 'has_per_asic_scope': 'False', + 'irrel_for_sysready': 'True' }, 'pmon': { 'state': 'disabled', @@ -605,16 +617,31 @@ def test_get_all_service_list(mock_config_db, mock_run, mock_docker_client): 'has_per_asic_scope': 'False', } } + sysmon = Sysmonitor() - print("mock get table:{}".format(mock_get_table.return_value)) + sysmon.config.wait_services = ['ntp-config'] + sysmon.config.ignore_services = ['swss'] result = sysmon.get_all_service_list() - print("result get all service list:{}".format(result)) + assert 'radv.service' in result + assert 'bgp.service' not in result assert 'pmon.service' not in result + # Test with feature disabled. The swss is expected only + mock_get_table.return_value = { + 'localhost': { + 'sysready_state': 'disabled' + } + } + sysmon = Sysmonitor() + result = sysmon.get_all_service_list() + assert len(result) == 1 + assert 'swss.service' in result + @patch('swsscommon.swsscommon.ConfigDBConnector.connect', MagicMock()) -@patch('sonic_py_common.multi_asic.is_multi_asic', MagicMock(return_value=False)) +@patch('sonic_py_common.multi_asic.is_multi_asic', + MagicMock(return_value=False)) @patch('docker.DockerClient') @patch('health_checker.utils.run_command') @patch('swsscommon.swsscommon.ConfigDBConnector') @@ -657,14 +684,12 @@ def test_get_app_ready_status(mock_config_db, mock_run, mock_docker_client): }}) sysmon = Sysmonitor() + sysmon.config.services_to_report_app_status = ['rsyslog'] result = sysmon.get_app_ready_status('radv') - print(result) assert 'Up' in result result = sysmon.get_app_ready_status('bgp') - print(result) - assert 'Down' in result + assert 'Failed' in result result = sysmon.get_app_ready_status('snmp') - print(result) assert 'Up' in result @@ -694,13 +719,15 @@ def test_check_unit_status(): @patch('health_checker.sysmonitor.Sysmonitor.print_console_message', MagicMock()) def test_system_status_up_after_service_removed(): sysmon = Sysmonitor() - sysmon.publish_system_status('UP') + sysmon.monitor_timeout = MagicMock() + # sysmon.publish_system_status('UP') + # By default it is in init state. And should remain in init sysmon.check_unit_status('mock_bgp.service') assert 'mock_bgp.service' in sysmon.dnsrvs_name - result = swsscommon.SonicV2Connector.get(MockConnector, 0, "SYSTEM_READY|SYSTEM_STATE", 'Status') + result = swsscommon.SonicV2Connector.exists(MockConnector, 0, "SYSTEM_READY|SYSTEM_STATE") print("system status result before service was removed from system: {}".format(result)) - assert result == "DOWN" + assert result == False sysmon.check_unit_status('mock_bgp.service') assert 'mock_bgp.service' not in sysmon.dnsrvs_name @@ -712,6 +739,7 @@ def test_system_status_up_after_service_removed(): @patch('health_checker.sysmonitor.Sysmonitor.get_all_service_list', MagicMock(return_value=['mock_snmp.service'])) def test_check_unit_status_timer(): sysmon = Sysmonitor() + sysmon.monitor_timeout = MagicMock() sysmon.state_db = MagicMock() sysmon.state_db.exists = MagicMock(return_value=1) sysmon.state_db.delete = MagicMock() @@ -760,7 +788,24 @@ def test_get_all_system_status_not_ok(): sysmon = Sysmonitor() result = sysmon.get_all_system_status() print("result:{}".format(result)) - assert result == 'DOWN' + assert result == 'INIT' + + +@patch('health_checker.sysmonitor.Sysmonitor.get_all_service_list', + MagicMock(return_value=['mock_snmp.service', 'mock_ns.service'])) +@patch('health_checker.sysmonitor.Sysmonitor.get_unit_status', + MagicMock(return_value= 'FAILED')) +@patch('health_checker.sysmonitor.Sysmonitor.publish_system_status', + MagicMock()) +@patch('health_checker.sysmonitor.Sysmonitor.post_unit_status', MagicMock()) +@patch('health_checker.sysmonitor.Sysmonitor.get_app_ready_status', + MagicMock(return_value='Up')) +def test_get_all_system_status_failed(): + sysmon = Sysmonitor() + result = sysmon.get_all_system_status() + print("result:{}".format(result)) + assert result == 'FAILED' + def test_post_unit_status(): sysmon = Sysmonitor() @@ -771,10 +816,20 @@ def test_post_unit_status(): assert result['app_ready_status'] == 'Down' assert result['fail_reason'] == 'mock reason' + # Test reporting failed + sysmon.post_unit_status("mock_ntp", 'Failed', 'Down', 'mock reason', '-') + result = swsscommon.SonicV2Connector.get_all(MockConnector, 0, + 'ALL_SERVICE_STATUS|mock_ntp') + assert result['service_status'] == 'Failed' + assert result['app_ready_status'] == 'Down' + assert result['fail_reason'] == 'mock reason' + + def test_post_system_status(): sysmon = Sysmonitor() sysmon.post_system_status("UP") - result = swsscommon.SonicV2Connector.get(MockConnector, 0, "SYSTEM_READY|SYSTEM_STATE", 'Status') + result = swsscommon.SonicV2Connector.get( + MockConnector, 0, "SYSTEM_READY|SYSTEM_STATE", 'Status') print("post system status result:{}".format(result)) assert result == "UP" @@ -783,10 +838,15 @@ def test_post_system_status(): print("post system status result:{}".format(result)) assert result == "DOWN" + # Simulate fail. Non expected exceptions is fail + sysmon.state_db.set = MagicMock(side_effect=Exception) + sysmon.post_system_status("UP") + @patch('health_checker.sysmonitor.Sysmonitor.print_console_message', MagicMock()) @patch('health_checker.sysmonitor.Sysmonitor.post_system_status', MagicMock()) def test_publish_system_status_allowed_status(): sysmon = Sysmonitor() + sysmon.monitor_timeout = MagicMock() sysmon.publish_system_status('UP') sysmon.publish_system_status('DOWN') @@ -797,20 +857,16 @@ def test_publish_system_status_allowed_status(): for call_args in sysmon.post_system_status.call_args_list: assert call_args in expected_calls -@patch('health_checker.sysmonitor.Sysmonitor.print_console_message', MagicMock()) +# @patch('health_checker.sysmonitor.Sysmonitor.post_system_status', MagicMock()) +@patch('health_checker.sysmonitor.Sysmonitor.print_console_message', + MagicMock()) def test_publish_system_status(): sysmon = Sysmonitor() + sysmon.monitor_timeout = MagicMock() sysmon.publish_system_status('UP') result = swsscommon.SonicV2Connector.get(MockConnector, 0, "SYSTEM_READY|SYSTEM_STATE", 'Status') assert result == "UP" -@patch('health_checker.sysmonitor.Sysmonitor.get_all_system_status', test_get_all_system_status_ok()) -@patch('health_checker.sysmonitor.Sysmonitor.publish_system_status', test_publish_system_status()) -def test_update_system_status(): - sysmon = Sysmonitor() - sysmon.update_system_status() - result = swsscommon.SonicV2Connector.get(MockConnector, 0, "SYSTEM_READY|SYSTEM_STATE", 'Status') - assert result == "UP" from sonic_py_common.task_base import ProcessTaskBase import multiprocessing @@ -824,6 +880,7 @@ def test_monitor_statedb_task(): assert sysmon._task_process is not None sysmon.task_stop() + @patch('health_checker.sysmonitor.MonitorSystemBusTask.subscribe_sysbus', MagicMock()) def test_monitor_sysbus_task(): sysmon = MonitorSystemBusTask(myQ) @@ -832,6 +889,24 @@ def test_monitor_sysbus_task(): assert sysmon._task_process is not None sysmon.task_stop() + +def test_monitor_timeout_task(): + config = lambda: None + config.timeout = 10 + monitor = MonitorTimeout(myQ, config) + monitor.task_run() + monitor.task_notify('Test') + assert monitor._task_process is not None + monitor.task_stop() + + # Make sure not failing and increase coverage + monitor = MonitorTimeout(myQ, {}) + monitor.task_run() + assert monitor._task_process is not None + monitor.task_stop() + monitor.task_notify('Test') + + @patch('health_checker.sysmonitor.MonitorSystemBusTask.subscribe_sysbus', MagicMock()) @patch('health_checker.sysmonitor.MonitorStateDbTask.subscribe_statedb', MagicMock()) def test_system_service(): @@ -855,11 +930,9 @@ def test_get_service_from_feature_table(): 'swss': {'state': 'disabled'} } ] - dir_list = [] - sysmon.get_service_from_feature_table(dir_list) - assert 'bgp.service' in dir_list - assert 'swss.service' not in dir_list - + dir_list = sysmon.get_service_from_feature_table() + assert 'bgp' in dir_list + assert 'swss' not in dir_list @patch('healthd.time.time') def test_healthd_check_interval(mock_time):