From e59b40c55e2e24ead5016c04433f58ff0e5c0de9 Mon Sep 17 00:00:00 2001 From: Ying Xie Date: Thu, 10 Jan 2019 22:03:32 +0000 Subject: [PATCH 1/3] [neighsyncd] increase neighbor syncd restore timeout to 120 seconds Neighbor syncd is restoring important information for teamd and BGP. our timeout should not be shorter than the down stream service. Signed-off-by: Ying Xie --- neighsyncd/neighsync.h | 4 ++-- neighsyncd/restore_neighbors.py | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/neighsyncd/neighsync.h b/neighsyncd/neighsync.h index 9360a18713e..647b70aa107 100644 --- a/neighsyncd/neighsync.h +++ b/neighsyncd/neighsync.h @@ -11,10 +11,10 @@ /* * This is the timer value (in seconds) that the neighsyncd waits for restore_neighbors - * service to finish, should be longer than the restore_neighbors timeout value (60) + * service to finish, should be longer than the restore_neighbors timeout value (120) * This should not happen, if happens, system is in a unknown state, we should exit. */ -#define RESTORE_NEIGH_WAIT_TIME_OUT 70 +#define RESTORE_NEIGH_WAIT_TIME_OUT 130 namespace swss { diff --git a/neighsyncd/restore_neighbors.py b/neighsyncd/restore_neighbors.py index 387723dfe9e..541abe2199e 100755 --- a/neighsyncd/restore_neighbors.py +++ b/neighsyncd/restore_neighbors.py @@ -33,8 +33,9 @@ # timeout the restore process in 1 min if not finished # This is mostly to wait for interfaces to be created and up after system warm-reboot # and this process is started by supervisord in swss docker. -# It would be good to keep that time below routing reconciliation time-out. -TIME_OUT = 60 +# It would be good to keep that time consistent routing reconciliation time-out, here +# we are upstream, we shouldn't give up before down stream. +TIME_OUT = 120 # every 5 seconds to check interfaces states CHECK_INTERVAL = 5 From 904f652d0a5979a2b11f41c50ecf1fff7403634e Mon Sep 17 00:00:00 2001 From: Ying Xie Date: Tue, 5 Feb 2019 21:32:37 +0000 Subject: [PATCH 2/3] [restore_neighbor] improve restore neighbor timeouts Try to get the bgp timeout and use it for restoring neighbor timeout. When unavailable, use default 110 seconds. Signed-off-by: Ying Xie --- neighsyncd/neighsync.h | 2 +- neighsyncd/restore_neighbors.py | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/neighsyncd/neighsync.h b/neighsyncd/neighsync.h index 647b70aa107..bffa6283e44 100644 --- a/neighsyncd/neighsync.h +++ b/neighsyncd/neighsync.h @@ -14,7 +14,7 @@ * service to finish, should be longer than the restore_neighbors timeout value (120) * This should not happen, if happens, system is in a unknown state, we should exit. */ -#define RESTORE_NEIGH_WAIT_TIME_OUT 130 +#define RESTORE_NEIGH_WAIT_TIME_OUT 180 namespace swss { diff --git a/neighsyncd/restore_neighbors.py b/neighsyncd/restore_neighbors.py index 541abe2199e..375bae4a665 100755 --- a/neighsyncd/restore_neighbors.py +++ b/neighsyncd/restore_neighbors.py @@ -35,7 +35,7 @@ # and this process is started by supervisord in swss docker. # It would be good to keep that time consistent routing reconciliation time-out, here # we are upstream, we shouldn't give up before down stream. -TIME_OUT = 120 +TIME_OUT = 110 # every 5 seconds to check interfaces states CHECK_INTERVAL = 5 @@ -191,12 +191,12 @@ def set_statedb_neigh_restore_done(): # The interfaces' states were checked in a loop with an interval (CHECK_INTERVAL) # The function will timeout in case interfaces' states never meet the condition # after some time (TIME_OUT). -def restore_update_kernel_neighbors(intf_neigh_map): +def restore_update_kernel_neighbors(intf_neigh_map, timeout=TIME_OUT): # create object for netlink calls to kernel ipclass = IPRoute() mtime = monotonic.time.time start_time = mtime() - while (mtime() - start_time) < TIME_OUT: + while (mtime() - start_time) < timeout: for intf, family_neigh_map in intf_neigh_map.items(): # only try to restore to kernel when link is up if is_intf_oper_state_up(intf): @@ -240,6 +240,10 @@ def main(): warmstart.initialize("neighsyncd", "swss") warmstart.checkWarmStart("neighsyncd", "swss", False) + timeout = warmstart.getWarmStartTimer("bgp", "bgp") + if timeout <= 0: + timeout = TIME_OUT + # if swss or system warm reboot not enabled, don't run if not warmstart.isWarmStart(): print "restore_neighbors service is skipped as warm restart not enabled" @@ -259,7 +263,7 @@ def main(): sys.exit(1) try: - restore_update_kernel_neighbors(intf_neigh_map) + restore_update_kernel_neighbors(intf_neigh_map, timeout) except Exception as e: logger.exception(str(e)) sys.exit(1) From c0df948f54027a730894445cea4360c98d2f31e9 Mon Sep 17 00:00:00 2001 From: Ying Xie Date: Wed, 6 Feb 2019 20:43:51 +0000 Subject: [PATCH 3/3] Set default values according group discussion result - restore_neighbors.py timeout at 110 seconds due to observed requirement of greater than 70 seconds. - neighbor syncd timeout at 120 seconds (longer than 110 seconds). Signed-off-by: Ying Xie --- neighsyncd/neighsync.h | 4 ++-- neighsyncd/restore_neighbors.py | 18 +++++++----------- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/neighsyncd/neighsync.h b/neighsyncd/neighsync.h index bffa6283e44..66fd1c2645b 100644 --- a/neighsyncd/neighsync.h +++ b/neighsyncd/neighsync.h @@ -11,10 +11,10 @@ /* * This is the timer value (in seconds) that the neighsyncd waits for restore_neighbors - * service to finish, should be longer than the restore_neighbors timeout value (120) + * service to finish, should be longer than the restore_neighbors timeout value (110) * This should not happen, if happens, system is in a unknown state, we should exit. */ -#define RESTORE_NEIGH_WAIT_TIME_OUT 180 +#define RESTORE_NEIGH_WAIT_TIME_OUT 120 namespace swss { diff --git a/neighsyncd/restore_neighbors.py b/neighsyncd/restore_neighbors.py index 375bae4a665..e0a0eea9434 100755 --- a/neighsyncd/restore_neighbors.py +++ b/neighsyncd/restore_neighbors.py @@ -30,12 +30,12 @@ logger.setLevel(logging.WARNING) logger.addHandler(logging.NullHandler()) -# timeout the restore process in 1 min if not finished +# timeout the restore process in 110 seconds if not finished # This is mostly to wait for interfaces to be created and up after system warm-reboot # and this process is started by supervisord in swss docker. -# It would be good to keep that time consistent routing reconciliation time-out, here -# we are upstream, we shouldn't give up before down stream. -TIME_OUT = 110 +# There had been devices taking close to 70 seconds to complete restoration, setting +# default timeout to 110 seconds. +DEF_TIME_OUT = 110 # every 5 seconds to check interfaces states CHECK_INTERVAL = 5 @@ -190,8 +190,8 @@ def set_statedb_neigh_restore_done(): # Once all the entries are restored, this function is returned. # The interfaces' states were checked in a loop with an interval (CHECK_INTERVAL) # The function will timeout in case interfaces' states never meet the condition -# after some time (TIME_OUT). -def restore_update_kernel_neighbors(intf_neigh_map, timeout=TIME_OUT): +# after some time (DEF_TIME_OUT). +def restore_update_kernel_neighbors(intf_neigh_map, timeout=DEF_TIME_OUT): # create object for netlink calls to kernel ipclass = IPRoute() mtime = monotonic.time.time @@ -240,10 +240,6 @@ def main(): warmstart.initialize("neighsyncd", "swss") warmstart.checkWarmStart("neighsyncd", "swss", False) - timeout = warmstart.getWarmStartTimer("bgp", "bgp") - if timeout <= 0: - timeout = TIME_OUT - # if swss or system warm reboot not enabled, don't run if not warmstart.isWarmStart(): print "restore_neighbors service is skipped as warm restart not enabled" @@ -263,7 +259,7 @@ def main(): sys.exit(1) try: - restore_update_kernel_neighbors(intf_neigh_map, timeout) + restore_update_kernel_neighbors(intf_neigh_map) except Exception as e: logger.exception(str(e)) sys.exit(1)