Skip to content
Closed
31 changes: 31 additions & 0 deletions files/build_templates/docker_image_ctl.j2
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,37 @@ function preStartAction()
fi
{%- elif docker_container_name == "snmp" %}
$SONIC_DB_CLI STATE_DB HSET 'DEVICE_METADATA|localhost' chassis_serial_number $(decode-syseeprom -s)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you enabling this feature by default for snmp? Shouldn't it be based on configuration?

# Wait for system ready. Continue after timeout if we won't hear anything
# from DB. Read timeout from config file and add two extra minutes on top of
# it.
CONFFILE=system_health_monitoring_config.json
PLATFORM=$(sonic-cfggen -d -v "DEVICE_METADATA['localhost']['platform']")

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Try to use cache rather than using sonic-cfggen as sonic-cfggen is a costly operation.

Please refer to https://github.com/sonic-net/sonic-buildimage/pull/17343

TIMEOUT=$(cat /usr/share/sonic/device/$PLATFORM/$CONFFILE | jq ".timeout")
SYSTEM_READY_EVENT="SYSTEM_READY|SYSTEM_STATE"
CHECK_SR_EVENT="sonic-db-dump -n STATE_DB -k $SYSTEM_READY_EVENT"

# Set default if not found
if [[ -z $TIMEOUT ]]; then
MESSAGE="Failed to read timeout from config file, assuming default is \
10 minutes"
echo $MESSAGE
TIMEOUT=10
fi

# Add to extra minutes and convert to seconds and

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the rationale behind adding extra two minutes?

TIMEOUT=$(((TIMEOUT + 2) * 60))

# Waiting for event
echo "Waiting for $SYSTEM_READY_EVENT event"
while [[ -z "$($CHECK_SR_EVENT | jq '.[]["value"]["Status"]')" ]]; do
UPTIME=$(awk -F. '{print $1}' /proc/uptime)
if [[ $UPTIME -gt $TIMEOUT ]]; then
echo "Got uptime timeout - starting snmp..."
break
fi
sleep 15
done
{%- else %}
: # nothing
{%- endif %}
Expand Down
8 changes: 7 additions & 1 deletion files/build_templates/init_cfg.json.j2
Original file line number Diff line number Diff line change
Expand Up @@ -79,11 +79,17 @@
"auto_restart": "{{autorestart}}",
"support_syslog_rate_limit" : "true",
{# Set check_up_status to true here when app readiness will be marked in state db #}
{%- if feature in ["swss", "syncd", "pmon"] %}
"check_up_status" : "true",
{%- endif %}
{# For now, to support the infrastrucure, setting the check_up_status to false for bgp,swss,pmon #}
{# Once apps like bgp,synd supports app readiness, then bgp,syncd can set check_up_status to true #}
{%- if feature in ["bgp", "swss", "pmon"] %}
{%- if feature in ["bgp"] %}
"check_up_status" : "false",
{%- endif %}
{%- if feature in ["ib-utils", "snmp"] %}
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove ib-utils

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please remove ib-utils as you commented

"irrel_for_sysready" : "true",
{%- endif %}
{%- if include_kubernetes == "y" %}
{%- if feature in ["lldp", "pmon", "radv", "eventd", "snmp", "telemetry", "gnmi"] %}
"set_owner": "kube", {% else %}
Expand Down
7 changes: 7 additions & 0 deletions files/scripts/syncd.sh
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ function startplatform() {
/usr/bin/mlnx-fw-upgrade.sh -v
if [[ "$?" -ne "${EXIT_SUCCESS}" ]]; then
debug "Failed to upgrade fw. " "$?" "Restart syncd"
sonic-db-cli STATE_DB HSET "FEATURE|$DEV_SRV" fail_reason \

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are we considering just the asic firmware update as sysready indication for syncd. Shouldn't it be the create switch success?

"ASIC FW update failed" up_status false
exit 1
fi
/etc/init.d/sxdkernel restart
Expand Down Expand Up @@ -86,6 +88,9 @@ function startplatform() {
/etc/init.d/xpnet.sh start
fi
fi

sonic-db-cli STATE_DB HDEL "FEATURE|$DEV_SRV" fail_reason
sonic-db-cli STATE_DB HSET "FEATURE|$DEV_SRV" up_status true
}

function waitplatform() {
Expand Down Expand Up @@ -169,9 +174,11 @@ LOCKFILE="/tmp/swss-syncd-lock$DEV"
NAMESPACE_PREFIX="asic"
if [ "$DEV" ]; then
NET_NS="$NAMESPACE_PREFIX$DEV" #name of the network namespace
DEV_SRV="$SERVICE@$DEV"
SONIC_DB_CLI="sonic-db-cli -n $NET_NS"
else
NET_NS=""
DEV_SRV="$SERVICE"
SONIC_DB_CLI="sonic-db-cli"
fi

Expand Down
40 changes: 20 additions & 20 deletions src/sonic-yang-models/doc/Configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -938,31 +938,31 @@ deployment envionment id and deployment type are specified. BGP local AS
number is also specified in this table as current only single BGP
instance is supported in SONiC.

```
```json
{
"DEVICE_METADATA": {
"DEVICE_METADATA": {
"localhost": {
"hwsku": "Force10-S6100",
"default_bgp_status": "up",
"docker_routing_config_mode": "unified",
"hostname": "sonic-s6100-01",
"platform": "x86_64-dell_s6100_c2538-r0",
"mac": "4c:76:25:f4:70:82",
"default_pfcwd_status": "disable",
"bgp_asn": "65100",
"deployment_id": "1",
"type": "ToRRouter",
"bgp_adv_lo_prefix_as_128" : "true",
"buffer_model": "traditional",
"yang_config_validation": "disable",
"rack_mgmt_map": "dummy_value",
"timezome": "Europe/Kiev"
"hwsku": "Force10-S6100",
"default_bgp_status": "up",
"docker_routing_config_mode": "unified",
"hostname": "sonic-s6100-01",
"platform": "x86_64-dell_s6100_c2538-r0",
"mac": "4c:76:25:f4:70:82",
"default_pfcwd_status": "disable",
"bgp_asn": "65100",
"deployment_id": "1",
"type": "ToRRouter",
"bgp_adv_lo_prefix_as_128" : "true",
"buffer_model": "traditional",
"yang_config_validation": "disable",
"rack_mgmt_map": "dummy_value",
"timezome": "Europe/Kiev",
"sysready_state": "enabled"
}
}
}
}

```

* `sysready_state` - System-ready feature configuration: `{enabled,disabled}`

### Device neighbor metada

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -156,5 +156,12 @@
"DEVICE_METADATA_INVALID_CREATE_ONLY_CONFIG_DB_BUFFERS": {
"desc": "Verifying invalid create_only_config_db_buffers value",
"eStrKey": "InvalidValue"
},
"DEVICE_METADATA_VALID_SYSREADY": {
"desc": "Verifying valid system-ready state"
},
"DEVICE_METADATA_INVALID_SYSREADY": {
"desc": "Verifying invalid system-ready state",
"eStrKey": "InvalidValue"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,10 @@
},
"FEATURE_WITH_NO_OWNER" : {
"desc": "Config feature table without set_owner"
},
"FEATURE_INVALID_IRRELEVANT_FOR_SYSTEM_READY" : {
"desc": "Invalid value for 'irrelevant for system-ready' parameter",
"eStrKey": "Pattern",
"eStr": ["false|true|False|True"]
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -416,5 +416,23 @@
}
}
}
},
"DEVICE_METADATA_VALID_SYSREADY": {
"sonic-device_metadata:sonic-device_metadata": {
"sonic-device_metadata:DEVICE_METADATA": {
"sonic-device_metadata:localhost": {
"sysready_state": "disabled"
}
}
}
},
"DEVICE_METADATA_INVALID_SYSREADY": {
"sonic-device_metadata:sonic-device_metadata": {
"sonic-device_metadata:DEVICE_METADATA": {
"sonic-device_metadata:localhost": {
"sysready_state": "enable"
}
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
"has_per_asic_scope": "{% if not DEVICE_RUNTIME_METADATA['ETHERNET_PORTS_PRESENT'] or ('CHASSIS_METADATA' in DEVICE_RUNTIME_METADATA and DEVICE_RUNTIME_METADATA['CHASSIS_METADATA']['module_type'] in ['supervisor']) %}False{% else %}True{% endif %}",
"set_owner": "kube",
"check_up_status": "false",
"irrel_for_sysready": "true",
"support_syslog_rate_limit": "true"
},
{
Expand All @@ -68,6 +69,7 @@
"has_per_asic_scope": "true",
"set_owner": "kube",
"check_up_status": "false",
"irrel_for_sysready": "false",
"support_syslog_rate_limit": "true"
}
]
Expand Down Expand Up @@ -129,5 +131,17 @@
]
}
}
},
"FEATURE_INVALID_IRRELEVANT_FOR_SYSTEM_READY": {
"sonic-feature:sonic-feature": {
"sonic-feature:FEATURE": {
"FEATURE_LIST": [
{
"name": "swss",
"irrel_for_sysready": "tr"
}
]
}
}
}
}
10 changes: 10 additions & 0 deletions src/sonic-yang-models/yang-models/sonic-device_metadata.yang
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ module sonic-device_metadata {

description "DEVICE_METADATA YANG Module for SONiC OS";

revision 2023-05-10 {
description "Add system-ready feature configuration";
}

revision 2021-02-27 {
description "Added frr_mgmt_framework_config field to handle BGP
config DB schema events to configure FRR protocols.";
Expand Down Expand Up @@ -242,6 +246,12 @@ module sonic-device_metadata {
otherwise the maximum available buffers (which are read from SAI) will be
created, regardless of the CONFIG_DB buffers configuration.";
}

leaf sysready_state {
description "System-ready feature state";
type stypes:admin_mode;
default enabled;
}
}
/* end of container localhost */
}
Expand Down
10 changes: 10 additions & 0 deletions src/sonic-yang-models/yang-models/sonic-feature.yang
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ module sonic-feature{

description "Feature Table yang Module for SONiC";

revision 2023-05-10 {
description "Add system-ready feature configuration";
}

typedef feature-state {
description "configuration to set the feature running state";
type string;
Expand Down Expand Up @@ -112,6 +116,12 @@ module sonic-feature{
type stypes:boolean_type;
default "false";
}

leaf irrel_for_sysready {
description "Readiness of this feature is irrelevant for system-ready state";
type stypes:boolean_type;
default "false";
}
}
}
}
Expand Down
10 changes: 10 additions & 0 deletions src/system-health/health_checker/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ class Config(object):
# Monit service start delay configuration entry
MONIT_START_DELAY_CONFIG = 'with start delay'

# Default system ready timeout
DEFAULT_SYSREADY_TIMEOUT = 10

def __init__(self):
"""
Constructor. Initialize all configuration entry to default value in case there is no configuration file.
Expand All @@ -44,6 +47,9 @@ def __init__(self):
self.ignore_services = None
self.ignore_devices = None
self.user_defined_checkers = None
self.wait_services = None
self.services_to_report_app_status = None
self.timeout = Config.DEFAULT_SYSREADY_TIMEOUT

def config_file_exists(self):
return os.path.exists(self._config_file)
Expand All @@ -70,9 +76,13 @@ def load_config(self):

self.interval = self.config_data.get('polling_interval', Config.DEFAULT_INTERVAL)
self.ignore_services = self._get_list_data('services_to_ignore')
self.wait_services = self._get_list_data('services_to_wait')
self.services_to_report_app_status = self._get_list_data('services_to_report_app_status')
self.timeout = self.config_data.get('timeout', Config.DEFAULT_SYSREADY_TIMEOUT)
self.ignore_devices = self._get_list_data('devices_to_ignore')
self.user_defined_checkers = self._get_list_data('user_defined_checkers')
except Exception as e:
# TODO: Add log here. Unexpected fail is not visible
self._reset()

def _reset(self):
Expand Down
Loading