Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 2 additions & 22 deletions src/system-health/health_checker/manager.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from . import utils
from .config import Config
from .health_checker import HealthChecker
from .service_checker import ServiceChecker
Expand All @@ -10,14 +9,10 @@ class HealthCheckerManager(object):
"""
Manage all system health checkers and system health configuration.
"""
STATE_BOOTING = 'booting'
STATE_RUNNING = 'running'
boot_timeout = None

def __init__(self):
self._checkers = []
self._state = self.STATE_BOOTING

self.config = Config()
self.initialize()

Expand All @@ -33,17 +28,11 @@ def check(self, chassis):
"""
Load new configuration if any and perform the system health check for all existing checkers.
:param chassis: A chassis object.
:return: A tuple. The first element indicate the status of the checker; the second element is a dictionary that
contains the status for all objects that was checked.
:return: A dictionary that contains the status for all objects that was checked.
"""
HealthChecker.summary = HealthChecker.STATUS_OK
stats = {}
self.config.load_config()
# check state first to avoid user change boot timeout in configuration file
# after finishing system boot
if self._state == self.STATE_BOOTING and self._is_system_booting():
self._set_system_led(chassis, self.config, 'booting')
return self._state, stats

for checker in self._checkers:
self._do_check(checker, stats)
Expand All @@ -56,7 +45,7 @@ def check(self, chassis):
led_status = 'normal' if HealthChecker.summary == HealthChecker.STATUS_OK else 'fault'
self._set_system_led(chassis, self.config, led_status)

return self._state, stats
return stats

def _do_check(self, checker, stats):
"""
Expand Down Expand Up @@ -86,15 +75,6 @@ def _do_check(self, checker, stats):
else:
stats['Internal'].update(entry)

def _is_system_booting(self):
uptime = utils.get_uptime()
if not self.boot_timeout:
self.boot_timeout = self.config.get_bootup_timeout()
booting = uptime < self.boot_timeout
if not booting:
self._state = self.STATE_RUNNING
return booting

def _set_system_led(self, chassis, config, status):
try:
chassis.set_status_led(config.get_led_color(status))
Expand Down
2 changes: 1 addition & 1 deletion src/system-health/health_checker/service_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ def check_by_monit(self, config):
output = utils.run_command(ServiceChecker.CHECK_CMD)
lines = output.splitlines()
if not lines or len(lines) < ServiceChecker.MIN_CHECK_CMD_LINES:
self.set_object_not_ok('Service', 'monit', 'output of \"monit summary -B\" is invalid or incompatible')
self.set_object_not_ok('Service', 'monit', 'monit service is not ready')
return

status_begin = lines[1].find('Status')
Expand Down
2 changes: 1 addition & 1 deletion src/system-health/health_checker/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ def run_command(command):
:return: Output of the shell command.
"""
try:
process = subprocess.Popen(command, shell=True, universal_newlines=True, stdout=subprocess.PIPE)
process = subprocess.Popen(command, shell=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
return process.communicate()[0]
except Exception:
return None
Expand Down
11 changes: 5 additions & 6 deletions src/system-health/scripts/healthd
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ SYSLOG_IDENTIFIER = 'healthd'

class HealthDaemon(DaemonBase):
"""
A daemon that run as a service to perform system health checker with a configurable interval. Also set system LED
A daemon that run as a service to perform system health checker with a configurable interval. Also set system LED
according to the check result and store the check result to redis.
"""
SYSTEM_HEALTH_TABLE_NAME = 'SYSTEM_HEALTH_INFO'
Expand All @@ -35,7 +35,7 @@ class HealthDaemon(DaemonBase):
def deinit(self):
"""
Destructor. Remove all entries in $SYSTEM_HEALTH_TABLE_NAME table.
:return:
:return:
"""
self._clear_system_health_table()

Expand Down Expand Up @@ -64,7 +64,7 @@ class HealthDaemon(DaemonBase):
def run(self):
"""
Check system health in an infinite loop.
:return:
:return:
"""
self.log_notice("Starting up...")

Expand All @@ -76,9 +76,8 @@ class HealthDaemon(DaemonBase):
self.log_warning("System health configuration file not found, exit...")
return
while 1:
state, stat = manager.check(chassis)
if state == HealthCheckerManager.STATE_RUNNING:
self._process_stat(chassis, manager.config, stat)
stat = manager.check(chassis)
self._process_stat(chassis, manager.config, stat)

if self.stop_event.wait(manager.config.interval):
break
Expand Down
22 changes: 3 additions & 19 deletions src/system-health/tests/test_system_health.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,26 +439,14 @@ def test_config():
@patch('health_checker.user_defined_checker.UserDefinedChecker.get_info')
@patch('health_checker.service_checker.ServiceChecker.get_info')
@patch('health_checker.hardware_checker.HardwareChecker.get_info')
@patch('health_checker.utils.get_uptime')
def test_manager(mock_uptime, mock_hw_info, mock_service_info, mock_udc_info):
def test_manager(mock_hw_info, mock_service_info, mock_udc_info):
chassis = MagicMock()
chassis.set_status_led = MagicMock()

manager = HealthCheckerManager()
manager.config.user_defined_checkers = ['some check']
assert manager._state == HealthCheckerManager.STATE_BOOTING
assert len(manager._checkers) == 2

mock_uptime.return_value = 200
assert manager._is_system_booting()
state, stat = manager.check(chassis)
assert state == HealthCheckerManager.STATE_BOOTING
assert len(stat) == 0
chassis.set_status_led.assert_called_with('orange_blink')

mock_uptime.return_value = 500
assert not manager._is_system_booting()
assert manager._state == HealthCheckerManager.STATE_RUNNING
mock_hw_info.return_value = {
'ASIC': {
'type': 'ASIC',
Expand All @@ -485,8 +473,7 @@ def test_manager(mock_uptime, mock_hw_info, mock_service_info, mock_udc_info):
'status': 'OK'
}
}
state, stat = manager.check(chassis)
assert state == HealthCheckerManager.STATE_RUNNING
stat = manager.check(chassis)
assert 'Services' in stat
assert stat['Services']['snmp:snmpd']['status'] == 'OK'

Expand All @@ -500,7 +487,7 @@ def test_manager(mock_uptime, mock_hw_info, mock_service_info, mock_udc_info):
mock_hw_info.side_effect = RuntimeError()
mock_service_info.side_effect = RuntimeError()
mock_udc_info.side_effect = RuntimeError()
state, stat = manager.check(chassis)
stat = manager.check(chassis)
assert 'Internal' in stat
assert stat['Internal']['ServiceChecker']['status'] == 'Not OK'
assert stat['Internal']['HardwareChecker']['status'] == 'Not OK'
Expand All @@ -518,6 +505,3 @@ def test_utils():

output = utils.run_command('ls')
assert output

uptime = utils.get_uptime()
assert uptime > 0