From 2030ad0966860b2c81caa83b95ccc0558d14c661 Mon Sep 17 00:00:00 2001 From: Sonic Build Admin Date: Wed, 26 Feb 2025 20:37:19 +0000 Subject: [PATCH] Fix TSA-TSB race condition on multi-asic platforms #### Why I did it Fixes https://github.com/sonic-net/sonic-buildimage/issues/21816 ##### Work item tracking - Microsoft ADO **31499777**: #### How I did it Set the STATE_DB ALL_SERVICE_STATUS|tsa_tsb_service flag first as part of the startup_tsa_tsb service, and only then configure TSA. In addition, when tsa_ena is False (genuinely, or due to the race condition), we explicitly call TSA again to ensure all asics go to TSA state. #### How to verify it Reboot the multi-asic linecard, and validate that all asics are in TSA state and the TSA-TSB timer is running config_reload Tested the following scenarios: 1. reboot multi-asic linecard 2. config reload 3. execute TSA while the service is running 4. TSA, config save and then config_reload 5. execute TSB while the service is running #### Which release branch to backport (provide reason below if selected) - [ ] 201811 - [ ] 201911 - [ ] 202006 - [ ] 202012 - [ ] 202106 - [ ] 202111 - [ ] 202205 - [ ] 202211 - [ ] 202305 #### Tested branch (Please provide the tested image version) 20240532.08 - [ ] - [ ] #### Description for the changelog #### Link to config_db schema for YANG module changes #### A picture of a cute animal (not mandatory but encouraged) --- files/scripts/startup_tsa_tsb.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/files/scripts/startup_tsa_tsb.py b/files/scripts/startup_tsa_tsb.py index 6b975f49aa..dfbc66d8fd 100644 --- a/files/scripts/startup_tsa_tsb.py +++ b/files/scripts/startup_tsa_tsb.py @@ -67,12 +67,12 @@ def config_tsa(): num_asics = multi_asic.get_num_asics() tsa_ena = get_tsa_status(num_asics) if tsa_ena == True: - logger.log_info("Configuring TSA") - subprocess.check_output(['TSA']).strip() logger.log_info("Setting TSA-TSB service field in STATE_DB") 
subprocess.check_output([ 'sonic-db-cli', 'STATE_DB', 'HSET', 'ALL_SERVICE_STATUS|tsa_tsb_service', 'running', 'OK' ]).strip() + logger.log_info("Configuring TSA") + subprocess.check_output(['TSA']).strip() else: #check if tsa_tsb service is already running, restart the timer try: @@ -84,6 +84,8 @@ def config_tsa(): if startup_tsa_tsb_service_status == 'OK': logger.log_info("TSA-TSB service is already running, just restart the timer") + # execute TSA again: this is to overcome race condition where in its previous run, TSA configuration didnt complete on all asics + subprocess.check_output(['TSA']).strip() return True else: if num_asics > 1: