sonic-net · yxieca · Apr 21, 2020 · Apr 17, 2020 · Apr 17, 2020 · Apr 17, 2020
diff --git a/tests/common/devices.py b/tests/common/devices.py
@@ -350,6 +350,24 @@ def get_image_info(self):
     def get_asic_type(self):
         return self.facts["asic_type"]
 
+    def shutdown(self, portname):
+        """
+            Shut down an interface via 'config interface shutdown' (parity function as EosHost)
+
+            Args:
+                portname: name of the interface to shut down; inserted verbatim into the command
+        """
+        return self.command("sudo config interface shutdown {}".format(portname))
+
+    def no_shutdown(self, portname):
+        """
+            Bring up an interface via 'config interface startup' (parity function as EosHost)
+
+            Args:
+                portname: name of the interface to bring up; inserted verbatim into the command
+        """
+        return self.command("sudo config interface startup {}".format(portname))
+
 
 class EosHost(AnsibleHostBase):
     """

diff --git a/tests/common/plugins/sanity_check/__init__.py b/tests/common/plugins/sanity_check/__init__.py
@@ -49,15 +49,15 @@ def _update_check_items(old_items, new_items, supported_items):
 
 
 @pytest.fixture(scope="module", autouse=True)
-def sanity_check(testbed_devices, request):
+def sanity_check(testbed_devices, request, fanouthosts):
     logger.info("Start pre-test sanity check")
 
     dut = testbed_devices["dut"]
     localhost = testbed_devices["localhost"]
 
     skip_sanity = False
     allow_recover = False
-    recover_method = "config_reload"
+    recover_method = "adaptive"
     check_items = set(copy.deepcopy(constants.SUPPORTED_CHECK_ITEMS))  # Default check items
     post_check = False
 
@@ -72,7 +72,7 @@ def sanity_check(testbed_devices, request):
         logger.info("Process marker %s in script. m.args=%s, m.kwargs=%s" % (m.name, str(m.args), str(m.kwargs)))
         skip_sanity = customized_sanity_check.kwargs.get("skip_sanity", False)
         allow_recover = customized_sanity_check.kwargs.get("allow_recover", False)
-        recover_method = customized_sanity_check.kwargs.get("recover_method", "config_reload")
+        recover_method = customized_sanity_check.kwargs.get("recover_method", "adaptive")
         if allow_recover and recover_method not in constants.RECOVER_METHODS:
             pytest.warning("Unsupported recover method")
             logger.info("Fall back to use default recover method 'config_reload'")
@@ -108,11 +108,11 @@ def sanity_check(testbed_devices, request):
                 json.dumps(check_results, indent=4))
     if any([result["failed"] for result in check_results]):
         if not allow_recover:
-            pytest.fail("Pre-test sanity check failed, allow_recover=False")
+            pytest.fail("Pre-test sanity check failed, allow_recover=False {}".format(check_results))
             return
 
         logger.info("Pre-test sanity check failed, try to recover, recover_method=%s" % recover_method)
-        recover(dut, localhost, recover_method)
+        recover(dut, localhost, fanouthosts, check_results, recover_method)
         logger.info("Run sanity check again after recovery")
         new_check_results = do_checks(dut, check_items)
         logger.info("!!!!!!!!!!!!!!!! Pre-test sanity check after recovery results: !!!!!!!!!!!!!!!!\n%s" % \

diff --git a/tests/common/plugins/sanity_check/constants.py b/tests/common/plugins/sanity_check/constants.py
@@ -12,11 +12,12 @@
 
 # Recover related definitions
 RECOVER_METHODS = {
-    "config_reload": {"cmd": "config reload -y", "reboot": False},
-    "load_minigraph": {"cmd": "config load_minigraph -y", "reboot": False},
-    "reboot": {"cmd": "reboot", "reboot": True},
-    "warm_reboot": {"cmd": "warm-reboot", "reboot": True},
-    "fast_reboot": {"cmd": "fast_reboot", "reboot": True}
+    "config_reload": {"cmd": "config reload -y", "reboot": False, "adaptive": False},
+    "load_minigraph": {"cmd": "config load_minigraph -y", "reboot": False, "adaptive": False},
+    "reboot": {"cmd": "reboot", "reboot": True, "adaptive": False},
+    "warm_reboot": {"cmd": "warm-reboot", "reboot": True, "adaptive": False},
+    "fast_reboot": {"cmd": "fast_reboot", "reboot": True, "adaptive": False},
+    "adaptive": {"cmd": None, "reboot": False, "adaptive": True},  # no fixed cmd: recover() hands off to adaptive_recover()
 }       # All supported recover methods
 
 SUPPORTED_CHECK_ITEMS = ["services", "interfaces", "dbmemory"]          # Supported checks
diff --git a/tests/common/plugins/sanity_check/recover.py b/tests/common/plugins/sanity_check/recover.py
@@ -6,6 +6,7 @@
 
 from common.utilities import wait, wait_until
 from common.errors import RunAnsibleModuleFail
+from common.platform.device_utils import fanout_switch_port_lookup
 
 logger = logging.getLogger(__name__)
 
@@ -26,13 +27,54 @@ def reboot_dut(dut, localhost, cmd):
         assert False, "Failed to reboot the DUT"
 
     localhost.wait_for(host=dut.hostname, port=22, state="started", delay=10, timeout=300)
-    wait(30, msg="Wait 30 seconds for system to be stable.")
+    wait(120, msg="Wait 120 seconds for system to be stable.")
 
 
-def recover(dut, localhost, recover_method):
+def __recover_interfaces(dut, fanouthosts, result):
+    for port in result['down_ports']:  # re-enable each port listed as down in the check result
+        logger.info("Restoring port {}".format(port))
+        fanout, fanout_port = fanout_switch_port_lookup(fanouthosts, port)
+        if fanout and fanout_port:  # only touch the fanout side when the lookup returned both values
+            fanout.no_shutdown(fanout_port)
+        dut.no_shutdown(port)
+    wait(30, msg="Wait 30 seconds for interface(s) to restore.")
+
+
+def __recover_services(dut, result):
+    status = result['services_status']
+    services = [x for x in status if not status[x]]  # names whose status value is falsy
+    logger.info("Service(s) down: {}".format(services))
+    return 'reboot' if 'database' in services else 'config_reload'  # a key of constants.RECOVER_METHODS
+
+
+def adaptive_recover(dut, localhost, fanouthosts, check_results):
+    outstanding_action = None  # key of constants.RECOVER_METHODS to run after the per-item handling
+    for result in check_results:
+        if result['failed']:
+            logger.info("Restoring {}".format(result))
+            if result['check_item'] == 'interfaces':
+                __recover_interfaces(dut, fanouthosts, result)
+            elif result['check_item'] == 'services':
+                outstanding_action = __recover_services(dut, result)
+            else:  # any other failed check item falls back to a reboot
+                outstanding_action = 'reboot'  # NOTE(review): a later 'services' result can overwrite this - confirm
+
+    if outstanding_action:
+        method = constants.RECOVER_METHODS[outstanding_action]
+        if method["reboot"]:
+            reboot_dut(dut, localhost, method["cmd"])  # use the chosen action; recover_method is not defined here
+        else:
+            dut.command(method["cmd"])
+            wait(60, msg="Wait 60 seconds for system to be stable.")
+
+
+def recover(dut, localhost, fanouthosts, check_results, recover_method):
     logger.info("Try to recover %s using method %s" % (dut.hostname, recover_method))
-    if constants.RECOVER_METHODS[recover_method]["reboot"]:
+    method = constants.RECOVER_METHODS[recover_method]  # raises KeyError for a name not in RECOVER_METHODS
+    if method["adaptive"]:  # choose the action from the failed check results instead of a fixed command
+        adaptive_recover(dut, localhost, fanouthosts, check_results)
+    elif method["reboot"]:
         reboot_dut(dut, localhost, constants.RECOVER_METHODS[recover_method]["cmd"])
     else:
-        dut.command(constants.RECOVER_METHODS[recover_method]["cmd"])
-        wait(30, msg="Wait 30 seconds for system to be stable.")
+        dut.command(method["cmd"])  # non-reboot method: run its CLI command on the DUT
+        wait(60, msg="Wait 60 seconds for system to be stable.")