Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
160 changes: 110 additions & 50 deletions scripts/hostcfgd
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import syslog
import signal
import re
import jinja2
import threading
from sonic_py_common import device_info
from sonic_py_common.general import check_output_pipe
from swsscommon.swsscommon import ConfigDBConnector, DBConnector, Table, SonicDBConfig
Expand Down Expand Up @@ -61,6 +62,7 @@ CFG_DB = "CONFIG_DB"
STATE_DB = "STATE_DB"
HOSTCFGD_MAX_PRI = 10 # Used to enforce ordering b/w daemons under Hostcfgd
DEFAULT_SELECT_TIMEOUT = 1000
PORT_INIT_TIMEOUT_SEC = 180


def safe_eval(val, default_value=False):
Expand Down Expand Up @@ -156,11 +158,15 @@ class Feature(object):
feature_cfg (dict): Feature CONFIG_DB configuration
deviec_config (dict): DEVICE_METADATA section of CONFIG_DB
"""
if 'has_timer' in feature_cfg:
err_str = "Invalid obsolete field 'has_timer' in FEATURE table. Please update configuration schema version"
syslog.syslog(syslog.LOG_ERR, err_str)
raise ValueError(err_str)

self.name = feature_name
self.state = self._get_feature_table_key_render_value(feature_cfg.get('state'), device_config or {}, ['enabled', 'disabled', 'always_enabled', 'always_disabled'])
self.auto_restart = feature_cfg.get('auto_restart', 'disabled')
self.has_timer = safe_eval(feature_cfg.get('has_timer', 'False'))
self.delayed = safe_eval(feature_cfg.get('delayed', 'False'))
self.has_global_scope = safe_eval(feature_cfg.get('has_global_scope', 'True'))
self.has_per_asic_scope = safe_eval(self._get_feature_table_key_render_value(feature_cfg.get('has_per_asic_scope', 'False'), device_config or {}, ['True', 'False']))

Expand Down Expand Up @@ -204,12 +210,18 @@ class FeatureHandler(object):
FEATURE_STATE_DISABLED = "disabled"
FEATURE_STATE_FAILED = "failed"

def __init__(self, config_db, feature_state_table, device_config):
def __init__(self, config_db, feature_state_table, device_config, is_advanced_boot):
self._config_db = config_db
self._feature_state_table = feature_state_table
self._device_config = device_config
self._cached_config = {}
self.is_multi_npu = device_info.is_multi_npu()
self.enable_delayed_service = False
self.is_advanced_boot = is_advanced_boot
self.lock = threading.Lock()
self.port_listener_thread = threading.Thread(target=self._portListener, name='port_listener_thread')
self.port_listener_thread.daemon = True
self.port_listener_thread.start()
self._device_running_config = device_info.get_device_runtime_metadata()
self.ns_cfg_db = {}
self.ns_feature_state_tbl = {}
Expand All @@ -227,58 +239,101 @@ class FeatureHandler(object):
db_conn = DBConnector(STATE_DB, 0, False, ns);
self.ns_feature_state_tbl[ns] = Table(db_conn, 'FEATURE')

def _enableDelayedServices(self):
with self.lock:
self.enable_delayed_service = True
for feature_name in self._cached_config:
if self._cached_config[feature_name].delayed:
self.update_feature_state(self._cached_config[feature_name])

def _portListener(self):
syslog.syslog(syslog.LOG_INFO, "Starting port listener")
appl_db_connector = DBConnector("APPL_DB", 0)
subscribe_port = swsscommon.SubscriberStateTable(appl_db_connector, "PORT_TABLE")
sel = swsscommon.Select()
sel.addSelectable(subscribe_port)

if self.is_advanced_boot:
syslog.syslog(syslog.LOG_INFO, "Updating delayed features after warm/fast boot")
self._enableDelayedServices()

while True:
(state, selectableObj) = sel.select(PORT_INIT_TIMEOUT_SEC*1000)
# Continue if select is timeout or selectable object is not return
if state == swsscommon.Select.ERROR:
if not self.enable_delayed_service:
syslog.syslog(syslog.LOG_ERR, "Received unexpected error in waiting for port init. Restarting services")
self._enableDelayedServices()
continue
if state != swsscommon.Select.OBJECT:
if not self.enable_delayed_service:
syslog.syslog(syslog.LOG_INFO, "Updating delayed features after timeout")
self._enableDelayedServices()
continue

key, op, fvs = subscribe_port.pop()
if not key:
break
if op == 'SET' and key == 'PortInitDone':
syslog.syslog(syslog.LOG_INFO, "Updating delayed features after port initialization")
self._enableDelayedServices()

def handler(self, feature_name, op, feature_cfg):
if not feature_cfg:
syslog.syslog(syslog.LOG_INFO, "Deregistering feature {}".format(feature_name))
self._cached_config.pop(feature_name, None)
self._feature_state_table._del(feature_name)
return
with self.lock:
if not feature_cfg:
syslog.syslog(syslog.LOG_INFO, "Deregistering feature {}".format(feature_name))
self._cached_config.pop(feature_name, None)
self._feature_state_table._del(feature_name)
return

device_config = {}
device_config.update(self._device_config)
device_config.update(self._device_running_config)

feature = Feature(feature_name, feature_cfg, device_config)
self._cached_config.setdefault(feature_name, Feature(feature_name, {}))

# Change auto-restart configuration first.
# If service reached failed state before this configuration applies (e.g. on boot)
# the next called self.update_feature_state will start it again. If it will fail
# again the auto restart will kick-in. Another order may leave it in failed state
# and not auto restart.
if self._cached_config[feature_name].auto_restart != feature.auto_restart:
syslog.syslog(syslog.LOG_INFO, "Auto-restart status of feature '{}' is changed from '{}' to '{}' ..."
.format(feature_name, self._cached_config[feature_name].auto_restart, feature.auto_restart))
self.update_systemd_config(feature)
self._cached_config[feature_name].auto_restart = feature.auto_restart

# Enable/disable the container service if the feature state was changed from its previous state.
if self._cached_config[feature_name].state != feature.state:
if self.update_feature_state(feature):
self._cached_config[feature_name].state = feature.state
else:
self.resync_feature_state(self._cached_config[feature_name])
device_config = {}
device_config.update(self._device_config)
device_config.update(self._device_running_config)

feature = Feature(feature_name, feature_cfg, device_config)
self._cached_config.setdefault(feature_name, Feature(feature_name, {}))

# Change auto-restart configuration first.
# If service reached failed state before this configuration applies (e.g. on boot)
# the next called self.update_feature_state will start it again. If it will fail
# again the auto restart will kick-in. Another order may leave it in failed state
# and not auto restart.
if self._cached_config[feature_name].auto_restart != feature.auto_restart:
syslog.syslog(syslog.LOG_INFO, "Auto-restart status of feature '{}' is changed from '{}' to '{}' ..."
.format(feature_name, self._cached_config[feature_name].auto_restart, feature.auto_restart))
self.update_systemd_config(feature)
self._cached_config[feature_name].auto_restart = feature.auto_restart

# Enable/disable the container service if the feature state was changed from its previous state.
if self._cached_config[feature_name].state != feature.state:
if self.update_feature_state(feature):
self.sync_feature_asic_scope(feature)
self._cached_config[feature_name] = feature
else:
self.resync_feature_state(self._cached_config[feature_name])

def sync_state_field(self, feature_table):
"""
Summary:
Updates the state field in the FEATURE|* tables as the state field
might have to be rendered based on DEVICE_METADATA table and generated Device Running Metadata
"""
for feature_name in feature_table.keys():
if not feature_name:
syslog.syslog(syslog.LOG_WARNING, "Feature is None")
continue
with self.lock:
for feature_name in feature_table.keys():
if not feature_name:
syslog.syslog(syslog.LOG_WARNING, "Feature is None")
continue

device_config = {}
device_config.update(self._device_config)
device_config.update(self._device_running_config)
feature = Feature(feature_name, feature_table[feature_name], device_config)
device_config = {}
device_config.update(self._device_config)
device_config.update(self._device_running_config)
feature = Feature(feature_name, feature_table[feature_name], device_config)

self._cached_config.setdefault(feature_name, feature)
self.update_systemd_config(feature)
self.update_feature_state(feature)
self.resync_feature_state(feature)
self._cached_config.setdefault(feature_name, feature)
self.update_systemd_config(feature)
self.update_feature_state(feature)
self.sync_feature_asic_scope(feature)
self.resync_feature_state(feature)

def update_feature_state(self, feature):
cached_feature = self._cached_config[feature.name]
Expand Down Expand Up @@ -311,6 +366,10 @@ class FeatureHandler(object):
.format(feature.state, feature.name))
return False

if feature.delayed and not self.enable_delayed_service:
syslog.syslog(syslog.LOG_INFO, "Feature is {} delayed for port init".format(feature.name))
return True

if enable:
self.enable_feature(feature)
syslog.syslog(syslog.LOG_INFO, "Feature {} is enabled and started".format(feature.name))
Expand All @@ -319,8 +378,6 @@ class FeatureHandler(object):
self.disable_feature(feature)
syslog.syslog(syslog.LOG_INFO, "Feature {} is stopped and disabled".format(feature.name))

self.sync_feature_asic_scope(feature)

return True

def sync_feature_asic_scope(self, feature_config):
Expand Down Expand Up @@ -393,7 +450,7 @@ class FeatureHandler(object):
with open(feature_systemd_config_file_path, 'w') as feature_systemd_config_file_handler:
feature_systemd_config_file_handler.write(feature_systemd_config)

syslog.syslog(syslog.LOG_INFO, "Feautre '{}' systemd config file related to auto-restart is updated!"
syslog.syslog(syslog.LOG_INFO, "Feature '{}' systemd config file related to auto-restart is updated!"
.format(feature_name))

try:
Expand All @@ -415,7 +472,7 @@ class FeatureHandler(object):
syslog.syslog(syslog.LOG_ERR, "Feature '{}' service not available"
.format(feature.name))

feature_suffixes = ["service"] + (["timer"] if feature.has_timer else [])
feature_suffixes = ["service"]

return feature_names, feature_suffixes

Expand Down Expand Up @@ -1550,9 +1607,13 @@ class SyslogCfg:
return interval, burst
return interval, burst


class HostConfigDaemon:
def __init__(self):
self.state_db_conn = DBConnector(STATE_DB, 0)
self.advanced_boot = False
if swsscommon.RestartWaiter.isAdvancedBootInProgress(self.state_db_conn):
self.advanced_boot = True
swsscommon.RestartWaiter.waitAdvancedBootDone()
# Just a sanity check to verify if the CONFIG_DB has been initialized
# before moving forward
self.config_db = ConfigDBConnector()
Expand All @@ -1564,7 +1625,6 @@ class HostConfigDaemon:
self.device_config['DEVICE_METADATA'] = self.config_db.get_table('DEVICE_METADATA')

# Load feature state table
self.state_db_conn = DBConnector(STATE_DB, 0)
feature_state_table = Table(self.state_db_conn, 'FEATURE')

# Initialize KDump Config and set the config to default if nothing is provided
Expand All @@ -1574,7 +1634,7 @@ class HostConfigDaemon:
self.iptables = Iptables()

# Intialize Feature Handler
self.feature_handler = FeatureHandler(self.config_db, feature_state_table, self.device_config)
self.feature_handler = FeatureHandler(self.config_db, feature_state_table, self.device_config, self.advanced_boot)

# Initialize Ntp Config Handler
self.ntpcfg = NtpCfg()
Expand Down
74 changes: 74 additions & 0 deletions tests/common/mock_configdb.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import time
class MockConfigDb(object):
"""
Mock Config DB which responds to data tables requests and store updates to the data table
Expand Down Expand Up @@ -55,6 +56,79 @@ def listen(self, init_data_handler=None):
for e in MockConfigDb.event_queue:
self.handlers[e[0]](e[0], e[1], self.get_entry(e[0], e[1]))

class MockSelect():

event_queue = []
OBJECT = "OBJECT"
ERROR = ""

@staticmethod
def set_event_queue(Q):
MockSelect.event_queue = Q

@staticmethod
def get_event_queue():
return MockSelect.event_queue

@staticmethod
def reset_event_queue():
MockSelect.event_queue = []

def __init__(self):
self.sub_map = {}
self.TIMEOUT = "TIMEOUT"
self.ERROR = "ERROR"

def addSelectable(self, subscriber):
self.sub_map[subscriber.table] = subscriber

def select(self, TIMEOUT):
if not MockSelect.get_event_queue():
time.sleep(TIMEOUT/1000)
return "TIMEOUT", {}
table, key = MockSelect.get_event_queue().pop(0)
self.sub_map[table].nextKey(key)
self.reset_event_queue()
return "OBJECT", self.sub_map[table]


class MockSubscriberStateTable():

FD_INIT = 0

@staticmethod
def generate_fd():
curr = MockSubscriberStateTable.FD_INIT
MockSubscriberStateTable.FD_INIT = curr + 1
return curr

@staticmethod
def reset_fd():
MockSubscriberStateTable.FD_INIT = 0

def __init__(self, conn, table, pop=None, pri=None):
self.fd = MockSubscriberStateTable.generate_fd()
self.next_key = ''
self.table = table

def getFd(self):
return self.fd

def nextKey(self, key):
print("next key")
self.next_key = key

def pop(self):
table = MockConfigDb.CONFIG_DB.get(self.table, {})
print(self.next_key)
if self.next_key not in table:
op = "DEL"
fvs = {}
else:
op = "SET"
fvs = table.get(self.next_key, {})
return self.next_key, op, fvs


class MockDBConnector():
def __init__(self, db, val, tcpFlag=False, name=None):
Expand Down
25 changes: 25 additions & 0 deletions tests/common/mock_restart_waiter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
class MockRestartWaiter(object):
advancedReboot = False
"""
Mock Config DB which responds to data tables requests and store updates to the data table
"""
def waitAdvancedBootDone(maxWaitSec=180, dbTimeout=0, isTcpConn=False):
return True

def waitWarmBootDone(maxWaitSec=180, dbTimeout=0, isTcpConn=False):
return False

def waitFastBootDone(maxWaitSec=180, dbTimeout=0, isTcpConn=False):
return False

def isAdvancedBootInProgress(stateDb):
return MockRestartWaiter.advancedReboot

def isFastBootInProgress(stateDb):
return False

def isWarmBootInProgress(stateDb):
return False

def __init__(self):
pass
Loading