Skip to content
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
202 changes: 170 additions & 32 deletions tests/process_monitoring/test_critical_process_monitoring.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""
Test the feature of monitoring critical processes by Supervisord.
Test the feature of monitoring critical processes on 201911 image (Monit)
and 202012 image (Supervisor)
"""
from collections import defaultdict
import logging
Expand Down Expand Up @@ -48,7 +49,7 @@ def disable_and_enable_autorestart(duthost):
disabled_autorestart_containers = []

for container_name, state in containers_autorestart_states.items():
if state == "enabled":
if "enabled" in state:
logger.info("Disabling the autorestart of container '{}'.".format(container_name))
command_disable_autorestart = "sudo config feature autorestart {} disabled".format(container_name)
command_output = duthost.shell(command_disable_autorestart)
Expand Down Expand Up @@ -77,8 +78,43 @@ def check_image_version(duthost):
Returns:
None.
"""
pytest_require(parse_version(duthost.kernel_version) > parse_version("4.9.0"),
"Test was not supported for 201911 and older image versions!")
pytest_require(("20191130" in duthost.os_version and parse_version(duthost.os_version) > parse_version("20191130.70"))
or parse_version(duthost.kernel_version) > parse_version("4.9.0"),
"Test was not supported for 20191130.70 and older image versions!")


def update_monit_config_and_restart(duthost):
"""Decrease the monitoring interval of Monit from 1 minute to 10 seconds and restart
Monit service without delaying.

Args:
duthost: Hostname of DuT.

Returns:
None.
"""
logger.info("Back up Monit configuration files.")
duthost.shell("sudo cp -f /etc/monit/monitrc /tmp/")

duthost.shell("sudo sed -i 's/set daemon 60/set daemon 10/' /etc/monit/monitrc")
duthost.shell("sudo sed -i '/with start delay 300/s/^./#/' /etc/monit/monitrc")
logger.info("Restart the Monit service without delaying to monitor.")
duthost.shell("sudo systemctl restart monit")


def restore_monit_config_and_restart(duthost):
"""Restore the Monit configuration and restart Monit service.

Args:
duthost: Hostname of DuT.

Returns:
None.
"""
logger.info("Roll back the Monit configuration of container checker.")
duthost.shell("sudo mv -f /tmp/monitrc /etc/monit/")
logger.info("Restart the Monit service and delay monitoring for 5 minutes.")
duthost.shell("sudo systemctl restart monit")


def check_all_critical_processes_running(duthost):
Expand Down Expand Up @@ -133,45 +169,127 @@ def postcheck_critical_processes_status(duthost, up_bgp_neighbors):
post_test_check, duthost, up_bgp_neighbors)


def get_expected_alerting_messages(duthost, containers_in_namespaces):
"""Generates the regex of expected alerting messages for the critical processes in each namespace.
def get_critical_process_from_monit(duthost, container_name):
"""Gets command lines of critical processes by parsing the Monit configration file
of each container.

Args:
duthost: Hostname of DuT.
container_name: Name of a container.

Returns:
A list contains command lines of critical processes. Bool varaible indicates
whether the operation is successful or not.
"""
critical_process_list = []
succeeded = True

monit_config_file_name = "monit_" + container_name
file_content = duthost.shell("bash -c '[ -f /etc/monit/conf.d/{0} ] && cat /etc/monit/conf.d/{0}'".format(monit_config_file_name))
if file_content["rc"] != 0:
succeeded = False
return critical_process_list, succeeded

for line in file_content["stdout_lines"]:
if "check program" in line:
command_line = line.split(" {} ".format(container_name))[1].strip(" \n\"")
critical_process_list.append(command_line)

return critical_process_list, succeeded


def get_expected_alerting_messages_monit(duthost, containers_in_namespaces):
"""Generates the regex of expected alerting messages for critical processes of all containers
in each namespace by parsing the Monit configuration files.

Args:
duthost: Hostname of DUT.
containers_in_namespaces: A dictionary where keys are container names and
values are lists which contains ids of namespaces this container should reside in.

Returns:
None.
A list contains the regex of alerting messages.
"""
expected_alerting_messages = []

logger.info("Generating the expected alerting messages ...")
for container_name in containers_in_namespaces.keys():
logger.info("Generating the expected alerting messages for container '{}'...".format(container_name))
critical_group_list, critical_process_list, succeeded = duthost.get_critical_group_and_process_lists(container_name)
pytest_assert(succeeded, "Failed to get critical group and process lists of container '{}'".format(container_name))
namespace_ids = containers_in_namespaces[container_name]

critical_process_list, succeeded = get_critical_process_from_monit(duthost, container_name)
pytest_assert(succeeded, "Failed to get critical processes of container '{}' from Monit config file".format(container_name))

for namespace_id in namespace_ids:
namespace_name = "host"
container_name_in_namespace = container_name
if namespace_id != DEFAULT_ASIC_ID:
namespace_name = NAMESPACE_PREFIX + namespace_id
container_name_in_namespace += namespace_id

logger.info("Generating the expected alerting messages for container '{}'...".format(container_name_in_namespace))
for critical_process in critical_process_list:
# Skip 'dsserve' process since it was not managed by supervisord
# TODO: Should remove the following two lines once the issue was solved in the image.
if "syncd" in container_name_in_namespace and "/usr/bin/dsserve" not in critical_process:
continue

logger.info("Generating the expected alerting message for process '{}' in container '{}'".format(critical_process, container_name_in_namespace))
if namespace_id != DEFAULT_ASIC_ID:
expected_alerting_messages.append(".*'{}' is not running.*in namespace.*{}".format(critical_process, namespace_name))
else:
expected_alerting_messages.append(".*'{}' is not running.*in {}".format(critical_process, namespace_name))

logger.info("Generating the expected alerting messages for container '{}' was done!".format(container_name_in_namespace))

return expected_alerting_messages


def get_expected_alerting_messages_supervisor(duthost, containers_in_namespaces):
"""Generates the regex of expected alerting messages for the critical processes in each namespace.

Args:
duthost: Hostname of DUT.
containers_in_namespaces: A dictionary where keys are container names and
values are lists which contains ids of namespaces this container should reside in.

Returns:
A list contains the regex of alerting messages.
"""
expected_alerting_messages = []

logger.info("Generating the expected alerting messages ...")
for container_name in containers_in_namespaces.keys():
namespace_ids = containers_in_namespaces[container_name]
container_name_in_namespace = container_name
if len(namespace_ids) > 2:
container_name_in_namespace += namespace_ids[1]

critical_group_list, critical_process_list, succeeded = duthost.get_critical_group_and_process_lists(container_name_in_namespace)
pytest_assert(succeeded, "Failed to get critical group and process lists of container '{}'".format(container_name_in_namespace))

for namespace_id in namespace_ids:
namespace_name = "host"
container_name_in_namespace = container_name
if namespace_id != DEFAULT_ASIC_ID:
namespace_name = NAMESPACE_PREFIX + namespace_id
container_name_in_namespace += namespace_id

logger.info("Generating the expected alerting messages for container '{}'...".format(container_name_in_namespace))
for critical_process in critical_process_list:
# Skip 'dsserve' process since it was not managed by supervisord
# TODO: Should remove the following two lines once the issue was solved in the image.
if container_name == "syncd" and critical_process == "dsserve":
if "syncd" in container_name_in_namespace and critical_process == "dsserve":
continue
logger.info("Generating the expected alerting message for process '{}'".format(critical_process))
logger.info("Generating the expected alerting message for process '{}' in container '{}'".format(critical_process, container_name_in_namespace))
expected_alerting_messages.append(".*Process '{}' is not running in namespace '{}'.*".format(critical_process, namespace_name))

for critical_group in critical_group_list:
group_program_info = get_group_program_info(duthost, container_name, critical_group)
group_program_info = get_group_program_info(duthost, container_name_in_namespace, critical_group)
for program_name in group_program_info:
logger.info("Generating the expected alerting message for process '{}'".format(program_name))
logger.info("Generating the expected alerting message for process '{}' in container '{}'".format(program_name, container_name_in_namespace))
expected_alerting_messages.append(".*Process '{}' is not running in namespace '{}'.*".format(program_name, namespace_name))

logger.info("Generating the expected alerting messages for container '{}' was done!".format(container_name))
logger.info("Generating the expected alerting messages for container '{}' was done!".format(container_name_in_namespace))

return expected_alerting_messages

Expand All @@ -191,13 +309,13 @@ def get_containers_namespace_ids(duthost, skip_containers):
containers_in_namespaces = defaultdict(list)

logger.info("Getting the namespace ids for each container...")
containers_states, succeed = duthost.get_feature_status()
pytest_assert(succeed, "Failed to get feature status of containers!")
containers_states, succeeded = duthost.get_feature_status()
pytest_assert(succeeded, "Failed to get feature status of containers!")

for container_name, state in containers_states.items():
if container_name not in skip_containers and state == "enabled":
namespace_ids, succeed = duthost.get_namespace_ids(container_name)
pytest_assert(succeed, "Failed to get namespace ids of container '{}'".format(container_name))
if container_name not in skip_containers and state not in ["disabled", "always_disabled"]:
namespace_ids, succeeded = duthost.get_namespace_ids(container_name)
pytest_assert(succeeded, "Failed to get namespace ids of container '{}'".format(container_name))
containers_in_namespaces[container_name] = namespace_ids

logger.info("Getting the namespace ids for each container was done!")
Expand Down Expand Up @@ -262,10 +380,14 @@ def stop_critical_processes(duthost, containers_in_namespaces):
None.
"""
for container_name in containers_in_namespaces.keys():
critical_group_list, critical_process_list, succeeded = duthost.get_critical_group_and_process_lists(container_name)
pytest_assert(succeeded, "Failed to get critical group and process lists of container '{}'".format(container_name))

namespace_ids = containers_in_namespaces[container_name]
container_name_in_namespace = container_name
if len(namespace_ids) >= 2:
container_name_in_namespace += namespace_ids[1]

critical_group_list, critical_process_list, succeeded = duthost.get_critical_group_and_process_lists(container_name_in_namespace)
pytest_assert(succeeded, "Failed to get critical group and process lists of container '{}'".format(container_name_in_namespace))

for namespace_id in namespace_ids:
container_name_in_namespace = container_name
if namespace_id != DEFAULT_ASIC_ID:
Expand All @@ -274,14 +396,14 @@ def stop_critical_processes(duthost, containers_in_namespaces):
for critical_process in critical_process_list:
# Skip 'dsserve' process since it was not managed by supervisord
# TODO: Should remove the following two lines once the issue was solved in the image.
if container_name_in_namespace == "syncd" and critical_process == "dsserve":
if "syncd" in container_name_in_namespace and critical_process == "dsserve":
continue

program_status, program_pid = get_program_info(duthost, container_name_in_namespace, critical_process)
check_and_kill_process(duthost, container_name_in_namespace, critical_process, program_status, program_pid)

for critical_group in critical_group_list:
group_program_info = get_group_program_info(duthost, container_name, critical_group)
group_program_info = get_group_program_info(duthost, container_name_in_namespace, critical_group)
for program_name in group_program_info:
check_and_kill_process(duthost, container_name_in_namespace, program_name,
group_program_info[program_name][0],
Expand Down Expand Up @@ -324,10 +446,14 @@ def ensure_all_critical_processes_running(duthost, containers_in_namespaces):
None.
"""
for container_name in containers_in_namespaces.keys():
critical_group_list, critical_process_list, succeeded = duthost.get_critical_group_and_process_lists(container_name)
pytest_assert(succeeded, "Failed to get critical group and process lists of container '{}'".format(container_name))

namespace_ids = containers_in_namespaces[container_name]
container_name_in_namespace = container_name
if len(namespace_ids) >= 2:
container_name_in_namespace += namespace_ids[1]

critical_group_list, critical_process_list, succeeded = duthost.get_critical_group_and_process_lists(container_name_in_namespace)
pytest_assert(succeeded, "Failed to get critical group and process lists of container '{}'".format(container_name_in_namespace))

for namespace_id in namespace_ids:
container_name_in_namespace = container_name
if namespace_id != DEFAULT_ASIC_ID:
Expand All @@ -336,7 +462,7 @@ def ensure_all_critical_processes_running(duthost, containers_in_namespaces):
for critical_process in critical_process_list:
# Skip 'dsserve' process since it was not managed by supervisord
# TODO: Should remove the following two lines once the issue was solved in the image.
if container_name_in_namespace == "syncd" and critical_process == "dsserve":
if "syncd" in container_name_in_namespace and critical_process == "dsserve":
continue

ensure_process_is_running(duthost, container_name_in_namespace, critical_process)
Expand All @@ -363,6 +489,10 @@ def test_monitoring_critical_processes(duthosts, rand_one_dut_hostname, tbinfo):
None.
"""
duthost = duthosts[rand_one_dut_hostname]

if "20191130" in duthost.os_version:
update_monit_config_and_restart(duthost)

loganalyzer = LogAnalyzer(ansible_host=duthost, marker_prefix="monitoring_critical_processes")
loganalyzer.expect_regex = []
bgp_neighbors = duthost.get_bgp_neighbors()
Expand All @@ -371,20 +501,25 @@ def test_monitoring_critical_processes(duthosts, rand_one_dut_hostname, tbinfo):
skip_containers = []
skip_containers.append("database")
skip_containers.append("gbsyncd")
skip_containers.append("acms")
# Skip 'radv' container on devices whose role is not T0.
if tbinfo["topo"]["type"] != "t0":
skip_containers.append("radv")

containers_in_namespaces = get_containers_namespace_ids(duthost, skip_containers)

expected_alerting_messages = get_expected_alerting_messages(duthost, containers_in_namespaces)
if "20191130" in duthost.os_version:
expected_alerting_messages = get_expected_alerting_messages_monit(duthost, containers_in_namespaces)
else:
expected_alerting_messages = get_expected_alerting_messages_supervisor(duthost, containers_in_namespaces)

loganalyzer.expect_regex.extend(expected_alerting_messages)
marker = loganalyzer.init()

stop_critical_processes(duthost, containers_in_namespaces)

# Wait for 70 seconds such that Supervisord has a chance to write alerting message into syslog.
logger.info("Sleep 70 seconds to wait for the alerting message...")
# Wait for 70 seconds such that Supervisord/Monit has a chance to write alerting message into syslog.
logger.info("Sleep 70 seconds to wait for the alerting messages in syslog...")
time.sleep(70)

logger.info("Checking the alerting messages from syslog...")
Expand All @@ -397,6 +532,9 @@ def test_monitoring_critical_processes(duthosts, rand_one_dut_hostname, tbinfo):

ensure_all_critical_processes_running(duthost, containers_in_namespaces)

if "20191130" in duthost.os_version:
restore_monit_config_and_restart(duthost)

if not postcheck_critical_processes_status(duthost, up_bgp_neighbors):
pytest.fail("Post-check failed after testing the container checker!")
logger.info("Post-checking status of critical processes and BGP sessions was done!")