Skip to content

Commit 110257c

Browse files
wangxinmssonicbld
authored and committed
Improve the cleanup of processes and interfaces before stopping PTF container (#10069)
What is the motivation for this PR? We still observe issues with "testbed-cli.sh remove-topo" and "testbed-cli.sh restart-ptf": the server may crash or run into a CPU soft lockup issue, and some exabgp processes cannot be fully stopped, so "restart-ptf" may fail. The expectation is that remove-topo and restart-ptf always succeed and, of course, never crash the server. Possible reasons for the server crash: (1) some exabgp processes are still running in the PTF container while we remove the container, which could cause a server crash; (2) some network interfaces are still in the PTF container's network namespace while we remove the container. How did you do it? Added a customized module "ptf_control" to stop and kill processes running in the PTF container in a more aggressive and reliable way. Improved the vm_topology module to remove network interfaces from the PTF container in the "unbind" procedure. Added a vm_topology "unbind" step to the "testbed-cli.sh restart-ptf" procedure. Updated some "ip link" commands to be fully compliant with the syntax shown in "ip link help". How did you verify/test it? Tested add-topo/remove-topo on both physical and KVM testbeds. Tested restart-ptf on a physical testbed.
1 parent cbc4498 commit 110257c

5 files changed

Lines changed: 235 additions & 132 deletions

File tree

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
#!/usr/bin/python
2+
3+
import json
4+
import logging
5+
import traceback
6+
7+
import docker
8+
9+
from ansible.module_utils.debug_utils import config_module_logging
10+
from ansible.module_utils.basic import AnsibleModule
11+
12+
# Ansible module documentation. The parameters are described under the
# standard ``options`` key (rather than a free-form "Parameters" list) so that
# tooling which parses module documentation can pick them up.
DOCUMENTATION = '''
---
module: ptf_control
version_added: "0.1"
author: Xin Wang ([email protected])
short_description: Control PTF container
description: For controlling PTF container, for example killing processes running in PTF container before stopping it.
options:
    ctn_name:
        description: Name of the PTF container
        required: true
        type: str
    command:
        description: Command to run, currently only support "kill"
        required: true
        type: str
'''

# Usage example shown to users of the module.
EXAMPLES = '''
- name: Kill exabgp and ptf_nn_agent processes in PTF container
  ptf_control:
    ctn_name: "ptf_vms6-1"
    command: kill
'''
32+
33+
34+
class PtfControl(object):
    """Control a PTF container through ``docker exec`` commands.

    The only operation implemented is stopping/killing the processes that run
    inside the container (see ``kill_processes``).
    """

    # Command-line patterns (matched with ``pgrep -f``) of the processes that
    # are force-killed after supervisord has been asked to stop them.
    _KILL_PATTERNS = (
        "/usr/share/exabgp/http_api.py",
        "/usr/local/bin/exabgp",
        "ptf_nn_agent.py",
    )

    def __init__(self, module, ctn_name):
        """Store the Ansible module and container name and look up the container PID.

        Args:
            module: Object providing ``run_command`` (the AnsibleModule instance).
            ctn_name: Name of the PTF container.
        """
        self.module = module
        self.ctn_name = ctn_name

        self.pid = PtfControl.get_pid(self.ctn_name)

    def cmd(self, cmdline, use_unsafe_shell=False, ignore_failure=False, verbose=True):
        """Run a command through ``module.run_command`` and return its result.

        Args:
            cmdline: Command line to execute.
            use_unsafe_shell: Passed through to ``module.run_command``.
            ignore_failure: When False, a non-zero return code raises.
            verbose: When True, the command and its result are logged at debug level.

        Returns:
            Tuple ``(rc, out, err)`` as returned by ``module.run_command``.

        Raises:
            Exception: If the return code is non-zero and ``ignore_failure`` is False.
        """
        rc, out, err = self.module.run_command(cmdline, use_unsafe_shell=use_unsafe_shell)
        if verbose:
            msg = {
                'cmd': cmdline,
                'rc': rc,
                'stdout_lines': out.splitlines(),
                'stderr_lines': err.splitlines()
            }
            logging.debug('***** RUN CMD:\n%s', json.dumps(msg, indent=2))

        if rc != 0 and not ignore_failure:
            raise Exception("Failed to run command: %s, rc=%d, out=%s, err=%s" % (cmdline, rc, out, err))
        return rc, out, err

    @staticmethod
    def get_pid(ctn_name):
        """Return the PID of the container's main process, or None.

        None is returned when the container cannot be looked up, and also when
        the container exists but is not running: Docker reports ``Pid`` 0 in
        that case, and there is nothing to ``docker exec`` into.
        """
        cli = docker.from_env()
        try:
            ctn = cli.containers.get(ctn_name)
        except Exception:
            return None

        pid = ctn.attrs['State']['Pid']
        return pid if pid else None

    def get_process_pids(self, process):
        """Return the PIDs (inside the container) whose command line matches ``process``.

        The command is run with ``ignore_failure=True``, so the output may hold
        blank lines or error text instead of PIDs (for example when nothing
        matches or ``docker exec`` itself fails). Only purely numeric lines are
        converted; everything else is skipped instead of raising ValueError.
        """
        cmd = 'docker exec -t {} bash -c "pgrep -f \'{}\'"'.format(self.ctn_name, process)
        _, out, _ = self.cmd(cmd, ignore_failure=True)
        pids = []
        for line in out.splitlines():
            token = line.strip()
            if token.isdigit():
                pids.append(int(token))
        return pids

    def get_supervisord_processes(self):
        """Return the names listed by ``supervisorctl status``, excluding sshd.

        The name is the first whitespace-separated field of each output line.
        Blank lines are skipped so that they cannot raise IndexError.
        """
        _, out, _ = self.cmd(
            'docker exec -t {} bash -c "supervisorctl status"'.format(self.ctn_name), ignore_failure=True
        )
        processes = []
        for line in out.splitlines():
            fields = line.split()
            if not fields or "sshd" in line:
                continue
            processes.append(fields[0])
        return processes

    def kill_process(self, pid):
        """Send SIGKILL to ``pid`` inside the container; failures are ignored."""
        self.cmd('docker exec -t {} bash -c "kill -9 {}"'.format(self.ctn_name, pid), ignore_failure=True)

    def kill_processes(self):
        """Stop supervisord-managed processes and force-kill known leftovers.

        Runs three passes. Each pass asks supervisorctl to stop every process
        found by ``get_supervisord_processes`` and then ``kill -9``s whatever
        still matches ``_KILL_PATTERNS``. ``ps -ef`` is run before, during and
        after so that the process list ends up in the debug log.

        Raises:
            Exception: If one of the ``ps -ef`` commands fails (they are not
                run with ``ignore_failure``).
        """
        supervisord_processes = self.get_supervisord_processes()
        self.cmd('docker exec -t {} bash -c "ps -ef"'.format(self.ctn_name))
        for i in range(3):
            logging.info("=== Attempt %d ===", i + 1)
            logging.info("=== Use supervisorctl to stop processes ===")
            for process in supervisord_processes:
                self.cmd(
                    'docker exec -t {} bash -c "supervisorctl stop {}"'.format(self.ctn_name, process),
                    ignore_failure=True
                )
            self.cmd(
                'docker exec -t {} bash -c "ps -ef"'.format(self.ctn_name)
            )

            for pattern in self._KILL_PATTERNS:
                logging.info("=== Kill process %s ===", pattern)
                for pid in self.get_process_pids(pattern):
                    self.kill_process(pid)

        self.cmd('docker exec -t {} bash -c "ps -ef"'.format(self.ctn_name))
109+
110+
111+
def main():
    """Ansible entry point: parse the arguments and run the requested command.

    Fails the module when ``command`` is not supported or when running it
    raises; otherwise exits reporting ``changed=True``.
    """
    supported_commands = ['kill']
    arg_spec = dict(
        ctn_name=dict(required=True, type='str'),
        command=dict(required=True, type='str')
    )
    module = AnsibleModule(argument_spec=arg_spec, supports_check_mode=False)

    params = module.params
    ctn_name = params['ctn_name']
    command = params['command']
    if command not in supported_commands:
        module.fail_json(msg="command %s is not supported" % command)

    config_module_logging('ptf_control_' + ctn_name)

    try:
        ptf = PtfControl(module, ctn_name)
        # Killing is only attempted when a PID was found for the container.
        if command == "kill" and ptf.pid is not None:
            ptf.kill_processes()
    except Exception as error:
        logging.error(traceback.format_exc())
        module.fail_json(msg=str(error))

    module.exit_json(changed=True)


if __name__ == "__main__":
    main()

ansible/roles/vm_set/library/vm_topology.py

Lines changed: 61 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -512,10 +512,8 @@ def add_br_if_to_docker(self, bridge, ext_if, int_if):
512512
VMTopology.iface_up(ext_if)
513513

514514
if VMTopology.intf_exists(tmp_int_if) and VMTopology.intf_not_exists(tmp_int_if, pid=self.pid):
515-
VMTopology.cmd("ip link set netns %s dev %s" %
516-
(self.pid, tmp_int_if))
517-
VMTopology.cmd("nsenter -t %s -n ip link set dev %s name %s" %
518-
(self.pid, tmp_int_if, int_if))
515+
VMTopology.cmd("ip link set dev %s netns %s " % (tmp_int_if, self.pid))
516+
VMTopology.cmd("nsenter -t %s -n ip link set dev %s name %s" % (self.pid, tmp_int_if, int_if))
519517

520518
VMTopology.iface_up(int_if, pid=self.pid)
521519

@@ -537,10 +535,8 @@ def add_br_if_to_netns(self, bridge, ext_if, int_if):
537535
VMTopology.iface_up(ext_if)
538536

539537
if VMTopology.intf_exists(tmp_int_if) and VMTopology.intf_not_exists(tmp_int_if, netns=self.netns):
540-
VMTopology.cmd("ip link set netns %s dev %s" %
541-
(self.netns, tmp_int_if))
542-
VMTopology.cmd("ip netns exec %s ip link set dev %s name %s" % (
543-
self.netns, tmp_int_if, int_if))
538+
VMTopology.cmd("ip link set dev %s netns %s" % (tmp_int_if, self.netns))
539+
VMTopology.cmd("ip netns exec %s ip link set dev %s name %s" % (self.netns, tmp_int_if, int_if))
544540

545541
VMTopology.iface_up(int_if, netns=self.netns)
546542

@@ -606,12 +602,10 @@ def add_dut_if_to_docker(self, iface_name, dut_iface):
606602
if VMTopology.intf_exists(dut_iface) \
607603
and VMTopology.intf_not_exists(dut_iface, pid=self.pid) \
608604
and VMTopology.intf_not_exists(iface_name, pid=self.pid):
609-
VMTopology.cmd("ip link set netns %s dev %s" %
610-
(self.pid, dut_iface))
605+
VMTopology.cmd("ip link set dev %s netns %s" % (dut_iface, self.pid))
611606

612607
if VMTopology.intf_exists(dut_iface, pid=self.pid) and VMTopology.intf_not_exists(iface_name, pid=self.pid):
613-
VMTopology.cmd("nsenter -t %s -n ip link set dev %s name %s" %
614-
(self.pid, dut_iface, iface_name))
608+
VMTopology.cmd("nsenter -t %s -n ip link set dev %s name %s" % (self.pid, dut_iface, iface_name))
615609

616610
VMTopology.iface_up(iface_name, pid=self.pid)
617611

@@ -626,7 +620,7 @@ def add_dut_vlan_subif_to_docker(self, iface_name, vlan_separator, vlan_id):
626620
(self.pid, vlan_sub_iface_name))
627621

628622
def remove_dut_if_from_docker(self, iface_name, dut_iface):
629-
623+
logging.info("=== Restore docker interface %s as dut interface %s ===" % (iface_name, dut_iface))
630624
if self.pid is None:
631625
return
632626

@@ -639,7 +633,7 @@ def remove_dut_if_from_docker(self, iface_name, dut_iface):
639633

640634
if VMTopology.intf_not_exists(dut_iface) and VMTopology.intf_exists(dut_iface, pid=self.pid):
641635
VMTopology.cmd(
642-
"nsenter -t %s -n ip link set netns 1 dev %s" % (self.pid, dut_iface))
636+
"nsenter -t %s -n ip link set dev %s netns 1" % (self.pid, dut_iface))
643637

644638
def remove_dut_vlan_subif_from_docker(self, iface_name, vlan_separator, vlan_id):
645639
"""Remove the vlan sub interface created for the ptf interface."""
@@ -648,6 +642,7 @@ def remove_dut_vlan_subif_from_docker(self, iface_name, vlan_separator, vlan_id)
648642

649643
vlan_sub_iface_name = iface_name + vlan_separator + vlan_id
650644
if VMTopology.intf_exists(vlan_sub_iface_name, pid=self.pid):
645+
VMTopology.iface_down(vlan_sub_iface_name, pid=self.pid)
651646
VMTopology.cmd("nsenter -t %s -n ip link del %s" %
652647
(self.pid, vlan_sub_iface_name))
653648

@@ -708,14 +703,14 @@ def add_veth_if_to_docker(self, ext_if, int_if, create_vlan_subintf=False, **kwa
708703
if VMTopology.intf_exists(t_int_if) \
709704
and VMTopology.intf_not_exists(t_int_if, pid=self.pid) \
710705
and VMTopology.intf_not_exists(int_if, pid=self.pid):
711-
VMTopology.cmd("ip link set netns %s dev %s" %
712-
(self.pid, t_int_if))
706+
VMTopology.cmd("ip link set dev %s netns %s" %
707+
(t_int_if, self.pid))
713708
if create_vlan_subintf \
714709
and VMTopology.intf_exists(t_int_sub_if) \
715710
and VMTopology.intf_not_exists(t_int_sub_if, pid=self.pid) \
716711
and VMTopology.intf_not_exists(int_sub_if, pid=self.pid):
717-
VMTopology.cmd("ip link set netns %s dev %s" %
718-
(self.pid, t_int_sub_if))
712+
VMTopology.cmd("ip link set dev %s netns %s" %
713+
(t_int_sub_if, self.pid))
719714

720715
if VMTopology.intf_exists(t_int_if, pid=self.pid) and VMTopology.intf_not_exists(int_if, pid=self.pid):
721716
VMTopology.cmd("nsenter -t %s -n ip link set dev %s name %s" %
@@ -761,8 +756,8 @@ def add_veth_if_to_netns(self, ext_if, int_if):
761756
if VMTopology.intf_exists(t_int_if) \
762757
and VMTopology.intf_not_exists(t_int_if, netns=self.netns) \
763758
and VMTopology.intf_not_exists(int_if, netns=self.netns):
764-
VMTopology.cmd("ip link set netns %s dev %s" %
765-
(self.netns, t_int_if))
759+
VMTopology.cmd("ip link set dev %s netns %s" %
760+
(t_int_if, self.netns))
766761

767762
if VMTopology.intf_exists(t_int_if, netns=self.netns) and VMTopology.intf_not_exists(int_if, netns=self.netns):
768763
VMTopology.cmd("ip netns exec %s ip link set dev %s name %s" % (
@@ -869,6 +864,7 @@ def bind_fp_ports(self, disconnect_vm=False):
869864
VS_CHASSIS_MIDPLANE_BRIDGE_NAME, self.topo['DUT']['vs_chassis']['midplane_port'])
870865

871866
def unbind_fp_ports(self):
867+
logging.info("=== unbind front panel ports ===")
872868
for attr in self.VMs.values():
873869
for vlan_num, vlan in enumerate(attr['vlans']):
874870
br_name = adaptive_name(
@@ -1266,6 +1262,7 @@ def remove_host_ports(self):
12661262
"""
12671263
remove dut port from the ptf docker
12681264
"""
1265+
logging.info("=== Remove host ports ===")
12691266
for i, intf in enumerate(self.host_interfaces):
12701267
if self._is_multi_duts:
12711268
if isinstance(intf, list):
@@ -1290,6 +1287,45 @@ def remove_host_ports(self):
12901287
self.remove_dut_vlan_subif_from_docker(
12911288
ptf_if, vlan_separator, vlan_id)
12921289

1290+
def remove_veth_if_from_docker(self, ext_if, int_if, tmp_name):
    """
    Remove veth interface from docker

    If ``int_if`` exists in the namespace of ``self.pid`` it is brought down,
    renamed to ``tmp_name`` and moved to the namespace of PID 1. Then
    ``ext_if`` is deleted from the default namespace if it exists there.

    Args:
        ext_if: Name of the interface in the default namespace.
        int_if: Name of the interface inside the PTF container.
        tmp_name: Temporary name given to ``int_if`` before it is moved out.
    """
    # Values are passed in the same order as the labels in the message.
    logging.info("=== Cleanup port, int_if: %s, ext_if: %s, tmp_name: %s ===" % (int_if, ext_if, tmp_name))
    # Without a container PID there is no namespace to enter, and the nsenter
    # commands below would be built with "-t None". Other removal helpers in
    # this file (e.g. remove_dut_if_from_docker) skip their work in that case.
    # NOTE(review): presumably intf_exists(..., pid=None) checks the default
    # namespace - confirm against its definition.
    if self.pid is not None and VMTopology.intf_exists(int_if, pid=self.pid):
        # Name it back to temp name in PTF container to avoid potential conflicts
        VMTopology.iface_down(int_if, pid=self.pid)
        VMTopology.cmd("nsenter -t %s -n ip link set dev %s name %s" % (self.pid, int_if, tmp_name))
        # Set it to default namespace
        VMTopology.cmd("nsenter -t %s -n ip link set dev %s netns 1" % (self.pid, tmp_name))

    # Delete its peer in default namespace
    if VMTopology.intf_exists(ext_if):
        VMTopology.cmd("ip link delete dev %s" % ext_if)
1305+
1306+
def remove_ptf_mgmt_port(self):
    """Remove the PTF management port via remove_veth_if_from_docker.

    The external interface name comes from PTF_MGMT_IF_TEMPLATE and the VM set
    name; the temporary name is MGMT_PORT_NAME followed by a fingerprint of the
    external name, sized so the result fits within MAX_INTF_LEN.
    """
    host_side_if = PTF_MGMT_IF_TEMPLATE % self.vm_set_name
    fingerprint_len = MAX_INTF_LEN - len(MGMT_PORT_NAME)
    fingerprint = VMTopology._generate_fingerprint(host_side_if, fingerprint_len)
    self.remove_veth_if_from_docker(host_side_if, MGMT_PORT_NAME, MGMT_PORT_NAME + fingerprint)
1310+
1311+
def remove_ptf_backplane_port(self):
    """Remove the PTF backplane port via remove_veth_if_from_docker.

    The external interface name comes from PTF_BP_IF_TEMPLATE and the VM set
    name; the temporary name is BP_PORT_NAME followed by a fingerprint of the
    external name, sized so the result fits within MAX_INTF_LEN.
    """
    host_side_if = PTF_BP_IF_TEMPLATE % self.vm_set_name
    fingerprint_len = MAX_INTF_LEN - len(BP_PORT_NAME)
    fingerprint = VMTopology._generate_fingerprint(host_side_if, fingerprint_len)
    self.remove_veth_if_from_docker(host_side_if, BP_PORT_NAME, BP_PORT_NAME + fingerprint)
1315+
1316+
def remove_injected_fp_ports_from_docker(self):
    """Remove the injected front panel ports from the PTF container.

    Walks every vlan of every VM in ``self.injected_fp_ports`` and calls
    remove_veth_if_from_docker for it. VMs whose ``device_type`` property is
    BACKEND_TOR_TYPE or BACKEND_LEAF_TYPE are skipped.
    """
    for vm_name, vlan_list in self.injected_fp_ports.items():
        for vlan in vlan_list:
            # Only the third element of the parsed tuple (the PTF index) is used.
            _unused_a, _unused_b, ptf_index = VMTopology.parse_vm_vlan_port(vlan)
            host_side_if = adaptive_name(INJECTED_INTERFACES_TEMPLATE, self.vm_set_name, ptf_index)
            ptf_side_if = PTF_FP_IFACE_TEMPLATE % ptf_index
            device_type = self.vm_properties.get(vm_name, {}).get('device_type')
            if device_type in (BACKEND_TOR_TYPE, BACKEND_LEAF_TYPE):
                continue
            fingerprint = VMTopology._generate_fingerprint(host_side_if, MAX_INTF_LEN - len(ptf_side_if))
            self.remove_veth_if_from_docker(host_side_if, ptf_side_if, ptf_side_if + fingerprint)
1328+
12931329
@staticmethod
12941330
def _generate_fingerprint(name, digit=6):
12951331
"""
@@ -1867,10 +1903,14 @@ def main():
18671903
if vms_exists:
18681904
net.unbind_vm_backplane()
18691905
net.unbind_fp_ports()
1906+
net.remove_injected_fp_ports_from_docker()
18701907

18711908
if hostif_exists:
18721909
net.remove_host_ports()
18731910

1911+
net.remove_ptf_mgmt_port()
1912+
net.remove_ptf_backplane_port()
1913+
18741914
if net.netns:
18751915
net.unbind_mgmt_port(NETNS_MGMT_IF_TEMPLATE % net.vm_set_name)
18761916
net.delete_network_namespace()
@@ -1935,6 +1975,7 @@ def main():
19351975
net.unbind_fp_ports()
19361976
net.add_injected_fp_ports_to_docker()
19371977
net.bind_fp_ports()
1978+
net.bind_vm_backplane()
19381979
net.add_bp_port_to_docker(ptf_bp_ip_addr, ptf_bp_ipv6_addr)
19391980

19401981
if net.netns:

0 commit comments

Comments
 (0)