sonic-net · yxieca · Jul 29, 2020 · Jul 29, 2020 · neethajohn · Jul 29, 2020
diff --git a/tests/common/platform/processes_utils.py b/tests/common/platform/processes_utils.py
@@ -0,0 +1,49 @@
+"""
+Helper script for checking status of critical processes
+
+This module contains reusable functions for checking the status of critical processes.
+"""
+import logging
+import time
+
+from tests.common.helpers.assertions import pytest_assert
+from tests.common.utilities import wait_until
+
+
+def _get_critical_processes_status(dut):
+    """Return (all_healthy, status) where status is dut.all_critical_process_status()."""
+    processes_status = dut.all_critical_process_status()
+    # An entry is unhealthy if 'status' is falsy or 'exited_critical_process' is non-empty.
+    healthy = all(v['status'] and not v['exited_critical_process']
+                  for v in processes_status.values())
+    return healthy, processes_status
+
+def _all_critical_processes_healthy(dut):
+    """Predicate form of _get_critical_processes_status: return only the healthy flag."""
+    logging.info("Check critical processes status")
+    return _get_critical_processes_status(dut)[0]
+
+def check_critical_processes(dut, watch_secs=0):
+    """
+    @summary: assert all critical processes are healthy, then keep re-checking every
+              5 seconds (sooner for the last check) until watch_secs seconds have passed.
+    @param dut: The AnsibleHost object of DUT. For interacting with DUT.
+    @param watch_secs: all processes should remain healthy for watch_secs seconds.
+    """
+    logging.info("Check all critical processes are healthy for {} seconds".format(watch_secs))
+    while watch_secs >= 0:
+        status, details = _get_critical_processes_status(dut)
+        pytest_assert(status, "Not all critical processes are healthy: {}".format(details))
+        if watch_secs > 0:
+            time.sleep(min(5, watch_secs))
+        watch_secs = max(watch_secs - 5, 0) if watch_secs > 0 else -1  # clamp at 0: last check
+
+def wait_critical_processes(dut):
+    """
+    @summary: poll via wait_until for all critical processes to be healthy; assert on failure.
+    @param dut: The AnsibleHost object of DUT. For interacting with DUT.
+    """
+    logging.info("Wait until all critical processes are healthy")
+    became_healthy = wait_until(300, 20, _all_critical_processes_healthy, dut)
+    pytest_assert(became_healthy, "Not all critical processes are healthy")
+
diff --git a/tests/platform_tests/check_critical_services.py b/tests/platform_tests/check_critical_services.py
@@ -6,6 +6,7 @@
 import time
 import logging
 
+from tests.common.helpers.assertions import pytest_assert
 from tests.common.utilities import wait_until
 
 
@@ -33,5 +34,6 @@ def check_critical_services(dut):
     @param dut: The AnsibleHost object of DUT. For interacting with DUT.
     """
     logging.info("Wait until all critical services are fully started")
-    assert wait_until(300, 20, _all_critical_services_fully_started, dut), "Not all critical services are fully started"
+    pytest_assert(wait_until(300, 20, _all_critical_services_fully_started, dut),
+                  "Not all critical services are fully started")
 
diff --git a/tests/platform_tests/test_sequential_restart.py b/tests/platform_tests/test_sequential_restart.py
@@ -9,6 +9,8 @@
 import pytest
 
 from tests.common.fixtures.conn_graph_facts import conn_graph_facts
+from tests.common.helpers.assertions import pytest_assert
+from tests.common.platform.processes_utils import check_critical_processes
 from tests.common.utilities import wait_until
 from check_critical_services import check_critical_services
 from check_transceiver_status import check_transceiver_basic
@@ -31,8 +33,8 @@ def restart_service_and_check(localhost, dut, service, interfaces):
     check_critical_services(dut)
 
     logging.info("Wait some time for all the transceivers to be detected")
-    assert wait_until(300, 20, check_interface_information, dut, interfaces), \
-        "Not all interface information are detected within 300 seconds"
+    pytest_assert(wait_until(300, 20, check_interface_information, dut, interfaces),
+                  "Not all interface information are detected within 300 seconds")
 
     logging.info("Check transceiver status")
     check_transceiver_basic(dut, interfaces)
@@ -48,6 +50,9 @@ def restart_service_and_check(localhost, dut, service, interfaces):
         logging.info("Check sysfs")
         check_sysfs(dut)
 
+    logging.info("Check that critical processes are healthy for 60 seconds")
+    check_critical_processes(dut, 60)
+
 
 def test_restart_swss(duthost, localhost, conn_graph_facts):
     """