Skip to content

Commit 639854a

Browse files
authored
Workaround for "dead worker" after docker-sonic-mgmt upgrade (#21407)
After the docker-sonic-mgmt image was upgraded, some tests fail while creating the `nbrhosts` fixture, with the error below:

```
self = <ansible.plugins.strategy.linear.StrategyModule object at 0x7596c07986e0>
iterator = <ansible.executor.play_iterator.PlayIterator object at 0x7596c09b2a80>

    def _wait_on_pending_results(self, iterator):
        '''
        Wait for the shared counter to drop to zero, using a short sleep
        between checks to ensure we don't spin lock
        '''
        ret_results = []
        display.debug("waiting for pending results...")
        while self._pending_results > 0 and not self._tqm._terminated:
            if self._tqm.has_dead_workers():
>               raise AnsibleError("A worker was found in a dead state")
E               ansible.errors.AnsibleError: A worker was found in a dead state
```

The new docker-sonic-mgmt image upgrades Ansible from 2.13 to 2.18.

While creating the `nbrhosts` fixture, a thread pool is used to speed up the initialization of a large number of neighbors. For the t0-sonic topology, SONiC VMs are used as neighbors, so the `nbrhosts` fixture has to initialize multiple `SonicHost` objects. In `SonicHost.__init__`, parallel threads are used again to speed up running multiple commands on the device to gather various facts.

Ansible 2.18 is not able to handle this nested-threading scenario properly. Possibly a task queue manager is checking the state of workers that belong to other task queue managers created by other threads.

Because of this issue, PR testing fails easily. To stop the bleeding and unblock PR testing, this change adds a threading lock around the initialization of neighbor hosts, so that neighbor hosts are initialized one at a time even though the thread pool is still in place.

Signed-off-by: Xin Wang <[email protected]>
1 parent a7aaab1 commit 639854a

1 file changed

Lines changed: 45 additions & 43 deletions

File tree

tests/conftest.py

Lines changed: 45 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@
8989

9090
logger = logging.getLogger(__name__)
9191
cache = FactsCache()
92+
_ansible_tqm_lock = threading.Lock()
9293

9394
DUTHOSTS_FIXTURE_FAILED_RC = 15
9495
CUSTOM_MSG_PREFIX = "sonic_custom_msg"
@@ -858,49 +859,50 @@ def nbrhosts(enhance_inventory, ansible_adhoc, tbinfo, creds, request):
858859
return devices
859860

860861
def initial_neighbor(neighbor_name, vm_name):
861-
logger.info(f"nbrhosts started: {neighbor_name}_{vm_name}")
862-
if neighbor_type == "eos":
863-
device = NeighborDevice(
864-
{
865-
'host': EosHost(
866-
ansible_adhoc,
867-
vm_name,
868-
creds['eos_login'],
869-
creds['eos_password'],
870-
shell_user=creds['eos_root_user'] if 'eos_root_user' in creds else None,
871-
shell_passwd=creds['eos_root_password'] if 'eos_root_password' in creds else None
872-
),
873-
'conf': tbinfo['topo']['properties']['configuration'][neighbor_name]
874-
}
875-
)
876-
elif neighbor_type == "sonic":
877-
device = NeighborDevice(
878-
{
879-
'host': SonicHost(
880-
ansible_adhoc,
881-
vm_name,
882-
ssh_user=creds['sonic_login'] if 'sonic_login' in creds else None,
883-
ssh_passwd=creds['sonic_password'] if 'sonic_password' in creds else None
884-
),
885-
'conf': tbinfo['topo']['properties']['configuration'][neighbor_name]
886-
}
887-
)
888-
elif neighbor_type == "cisco":
889-
device = NeighborDevice(
890-
{
891-
'host': CiscoHost(
892-
ansible_adhoc,
893-
vm_name,
894-
creds['cisco_login'],
895-
creds['cisco_password'],
896-
),
897-
'conf': tbinfo['topo']['properties']['configuration'][neighbor_name]
898-
}
899-
)
900-
else:
901-
raise ValueError("Unknown neighbor type %s" % (neighbor_type,))
902-
devices[neighbor_name] = device
903-
logger.info(f"nbrhosts finished: {neighbor_name}_{vm_name}")
862+
with _ansible_tqm_lock:
863+
logger.info(f"nbrhosts started: {neighbor_name}_{vm_name}")
864+
if neighbor_type == "eos":
865+
device = NeighborDevice(
866+
{
867+
'host': EosHost(
868+
ansible_adhoc,
869+
vm_name,
870+
creds['eos_login'],
871+
creds['eos_password'],
872+
shell_user=creds['eos_root_user'] if 'eos_root_user' in creds else None,
873+
shell_passwd=creds['eos_root_password'] if 'eos_root_password' in creds else None
874+
),
875+
'conf': tbinfo['topo']['properties']['configuration'][neighbor_name]
876+
}
877+
)
878+
elif neighbor_type == "sonic":
879+
device = NeighborDevice(
880+
{
881+
'host': SonicHost(
882+
ansible_adhoc,
883+
vm_name,
884+
ssh_user=creds['sonic_login'] if 'sonic_login' in creds else None,
885+
ssh_passwd=creds['sonic_password'] if 'sonic_password' in creds else None
886+
),
887+
'conf': tbinfo['topo']['properties']['configuration'][neighbor_name]
888+
}
889+
)
890+
elif neighbor_type == "cisco":
891+
device = NeighborDevice(
892+
{
893+
'host': CiscoHost(
894+
ansible_adhoc,
895+
vm_name,
896+
creds['cisco_login'],
897+
creds['cisco_password'],
898+
),
899+
'conf': tbinfo['topo']['properties']['configuration'][neighbor_name]
900+
}
901+
)
902+
else:
903+
raise ValueError("Unknown neighbor type %s" % (neighbor_type,))
904+
devices[neighbor_name] = device
905+
logger.info(f"nbrhosts finished: {neighbor_name}_{vm_name}")
904906

905907
executor = concurrent.futures.ThreadPoolExecutor(max_workers=8)
906908
futures = []

0 commit comments

Comments
 (0)