-
Notifications
You must be signed in to change notification settings - Fork 816
[201911] [multi_asic] Script to monitor errors on internal links #2966
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,243 @@ | ||
| #!/usr/bin/env python2.7 | ||
| # -*- coding: utf-8 -*- | ||
|
|
||
| import syslog | ||
|
|
||
|
|
||
| from swsssdk import SonicV2Connector | ||
| from sonic_py_common import multi_asic | ||
|
|
||
| # Global variables | ||
| MIN_SCAN_INTERVAL = 60 # Every minute | ||
| MAX_SCAN_INTERVAL = 3600 # An hour | ||
|
|
||
class InternalLinkErrMontoring():
    '''Monitor error counters on the internal links of one namespace.

    The set of ports to watch is resolved once, at construction time, from
    the port-name map stored in COUNTERS_DB.
    '''

    def __init__(self, namespace, port_name_map, filter_for_backend_ports=None):
        '''Remember the namespace and resolve the ports to monitor.

        Args:
            namespace: Namespace whose COUNTERS_DB is read.
            port_name_map: COUNTERS_DB key holding the port name -> OID map.
            filter_for_backend_ports: Optional callable taking
                (namespace, port_map) and returning the subset of port_map
                to monitor. When omitted, every port in the map is monitored.
        '''
        self.namespace = namespace
        self.port_name_map = port_name_map
        self.filter_for_backend_ports = filter_for_backend_ports
        self._counter_db = None
        self.port_map = self.get_port_map()

    @property
    def counter_db(self):
        '''Return the COUNTERS_DB connector, connecting on first use only.'''
        # getattr keeps this safe for instances built without __init__.
        db = getattr(self, '_counter_db', None)
        if db is None:
            db = SonicV2Connector(namespace=self.namespace)
            db.connect('COUNTERS_DB')
            # Reuse one connection instead of reconnecting for every port.
            self._counter_db = db
        return db

    def get_port_map(self):
        '''Return the port name -> OID map read from COUNTERS_DB.

        When a filter callable was supplied it is applied to the map;
        otherwise the complete map is returned.
        '''
        ports = self.counter_db.get_all('COUNTERS_DB', self.port_name_map) or {}
        if self.filter_for_backend_ports:
            return self.filter_for_backend_ports(self.namespace, ports)
        return ports

    @property
    def port_counters(self):
        '''Return {port name: counters} for every port in port_map.'''
        db = self.counter_db
        port_counters = {}
        for port_name, port_oid in self.port_map.items():
            port_counters[port_name] = db.get_all(
                'COUNTERS_DB', 'COUNTERS:{}'.format(port_oid))
        return port_counters

    def get_ports_error_above_threshold(self, counter_name, threshold):
        '''Return the ports whose counter_name value is above threshold.

        Ports for which the counter is missing or not an integer are
        reported through syslog and left out of the result.
        '''
        limit = int(threshold)
        err_ports = []
        for port_name, port_counter in self.port_counters.items():
            # An absent counter row may come back as None or empty.
            raw_value = (port_counter or {}).get(counter_name)
            try:
                value = int(raw_value)
            except (TypeError, ValueError):
                syslog.syslog(
                    syslog.LOG_WARNING,
                    'Counter {} unavailable for port {} in {}'.format(
                        counter_name, port_name, self.namespace))
                continue
            if value > limit:
                err_ports.append(port_name)
        return err_ports

    def monitor(self, namespace, error_counter_names, threshold):
        '''Log and return the ports exceeding threshold on any counter.

        Args:
            namespace: Namespace name used in the syslog message.
            error_counter_names: Counter names to check.
            threshold: Value a counter must exceed to be reported.

        Returns:
            List of port names; each port appears once even when several
            of its counters are above the threshold.
        '''
        err_ports = []
        for counter_name in error_counter_names:
            err_ports_per_counter = self.get_ports_error_above_threshold(
                counter_name, threshold)
            if not err_ports_per_counter:
                continue
            syslog.syslog(
                syslog.LOG_CRIT,
                ' {} error above threshold on internal port {} in {} '.format(
                    counter_name, err_ports_per_counter, namespace))
            for port_name in err_ports_per_counter:
                if port_name not in err_ports:
                    err_ports.append(port_name)
        return err_ports
|
|
||
|
|
||
| class PacketChassisInternalLinkMontoring(): | ||
| '''A class for monitoring internal links in a Packet Chassis. | ||
|
|
||
| Attributes: | ||
| link_monitor (dict): A dictionary of LinkMonitor objects, keyed by namespace. | ||
| error_counter_names (list): A list of error counter names to monitor. | ||
| threshold (int): The threshold for error counters. | ||
| ''' | ||
|
|
||
| def __init__(self): | ||
| '''Initializes a PacketChassisInternalLinkMontoring object. | ||
|
|
||
| Args: | ||
| link_monitor (dict): A dictionary of LinkMonitor objects, keyed by namespace. | ||
| error_counter_names (list): A list of error counter names to monitor. | ||
| threshold (int): The threshold for error counters. | ||
|
|
||
| Returns: | ||
| None | ||
| ''' | ||
| self.link_monitor = {} | ||
| self.appdb = {} | ||
| self.configdb = {} | ||
| for namespace in self.namespaces: | ||
| self.appdb[namespace] = self.appl_db(namespace) | ||
| self.configdb[namespace] = self.config_db(namespace) | ||
| self.link_monitor[namespace] = InternalLinkErrMontoring( | ||
| namespace=namespace, port_name_map='COUNTERS_PORT_NAME_MAP', | ||
| filter_for_backend_ports=self.filter_for_backend_ports) | ||
| self.error_counter_names = ['SAI_PORT_STAT_IF_IN_ERRORS', | ||
| 'SAI_PORT_STAT_IF_OUT_ERRORS'] | ||
| # Using fixed values for 201911 | ||
| self.threshold = 0 | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is the comment above applicable? Is the threshold configurable?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It is fixed for 201911, but it will be read from CONFIG_DB on master and newer branches. |
||
| self.mitigationActionEnabled = True | ||
|
|
||
@property
def namespaces(self):
    '''Return the front-end namespaces followed by the back-end namespaces.'''
    ns_by_role = multi_asic.get_all_namespaces()
    return ns_by_role['front_ns'] + ns_by_role['back_ns']
|
|
||
def config_db(self, namespace):
    '''Return a connector for the given namespace, connected to CONFIG_DB.'''
    connector = SonicV2Connector(namespace=namespace)
    connector.connect('CONFIG_DB')
    return connector
|
|
||
def appl_db(self, namespace):
    '''Return a connector for the given namespace, connected to APPL_DB.'''
    connector = SonicV2Connector(namespace=namespace)
    connector.connect('APPL_DB')
    return connector
|
|
||
def get_port_status(self, namespace, port_name):
    ''' Return the operational status of the given port.

    When a port is admin shut in CONFIG_DB, it might take some time for it
    to become oper down in APPL_DB. As such, a port that is not admin up in
    CONFIG_DB is reported as down without consulting APPL_DB.

    Returns:
        The 'oper_status' value from APPL_DB for an admin-up port, or
        'down' when the port is unknown, not admin up, or has no
        oper_status recorded.
    '''
    port_cfg = self.configdb[namespace].get_all('CONFIG_DB', 'PORT|{}'.format(port_name))
    # .get() so an entry without admin_status is treated as down, not a crash.
    if not port_cfg or port_cfg.get('admin_status') != 'up':
        return 'down'
    port_info = self.appdb[namespace].get_all('APPL_DB', 'PORT_TABLE:{}'.format(port_name))
    if not port_info:
        return 'down'
    return port_info.get('oper_status', 'down')
|
|
||
def filter_for_backend_ports(self, namespace, port_map):
    ''' Return the entries of port_map that should be monitored.

    An entry is kept when its name starts with 'Ethernet-BP' (internal
    backend port) and get_port_status() reports the port as 'up'.
    '''
    return {
        name: oid
        for name, oid in port_map.items()
        if name.startswith('Ethernet-BP')
        and self.get_port_status(namespace, name) == 'up'
    }
|
|
||
def get_lag_name_for_port(self, namespace, port_name):
    ''' Return the name of the portchannel in which the given port is a member.

    Returns:
        The portchannel name taken from the first matching
        LAG_MEMBER_TABLE key in APPL_DB, or '' when the port is not a
        member of any portchannel.
    '''
    lag_member = self.appdb[namespace].keys('APPL_DB', 'LAG_MEMBER_TABLE:*:{}'.format(port_name))
    # Covers both an empty list and a None result for "no matching key".
    if not lag_member:
        return ''
    # Key layout: LAG_MEMBER_TABLE:<portchannel>:<port>
    fields = lag_member[0].split(':')
    if len(fields) < 3:
        return ''
    return fields[1]
|
|
||
def get_min_links_for_lag(self, namespace, lag_name):
    ''' Return the min_links setting of the given portchannel as an int.

    The value is read from the portchannel's PORTCHANNEL entry in CONFIG_DB.
    '''
    lag_key = 'PORTCHANNEL|{}'.format(lag_name)
    lag_cfg = self.configdb[namespace].get_all('CONFIG_DB', lag_key)
    min_links = lag_cfg['min_links']
    return int(min_links)
|
|
||
| def get_active_lag_member_count(self, namespace, lag_name): | ||
| ''' Returns number of member ports that are operationally up in the given portchannel ''' | ||
| lag_members = self.appdb[namespace].keys('APPL_DB', 'LAG_MEMBER_TABLE:{}:*'.format(lag_name)) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can we use CONFIG_DB instead of APPL_DB?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Since we are looking for active (operationally up) LAG members, does it matter if we check in APPL_DB? |
||
| active_count = 0 | ||
| for member in lag_members: | ||
| port = member.rsplit(':', 1)[1] | ||
| if self.get_port_status(namespace, port) == 'up': | ||
| active_count += 1 | ||
| return active_count | ||
|
|
||
def isolate_lag_member(self, namespace, port_name):
    ''' Set admin_status of the given port to down in CONFIG_DB.

    The outcome is written to syslog at critical level: a return code of 0
    from the database write is logged as a successful shutdown, any other
    value as a failure.
    '''
    port_key = 'PORT|{}'.format(port_name)
    ret = self.configdb[namespace].set('CONFIG_DB', port_key, 'admin_status', 'down')
    if ret != 0:
        message = 'Unable to shutdown internal port {}, return code {}'.format(port_name, ret)
    else:
        message = 'Internal port {} has been shutdown to mitigate errors'.format(port_name)
    syslog.syslog(syslog.LOG_CRIT, message)
def attempt_to_mitigate_ports(self, namespace, err_port_per_ns):
    ''' Attempt to isolate each port in the given list.

    For every port:
    1. If the port is already down, skip mitigation.
    2. If the port is not a member of any portchannel, skip mitigation
       (there is no min_links setting to check against).
    3. Compare the number of active links in the port's portchannel with
       its min_links:
       a. If active links is greater, shut the port down.
       b. If not, leave the port up and generate a syslog.

    The active count is re-read for every port, so ports shut down earlier
    in the same call are taken into account.
    '''
    for port_name in err_port_per_ns:
        if self.get_port_status(namespace, port_name) == 'down':
            syslog.syslog(syslog.LOG_INFO, "Port {} is down, skipping mitigation".format(port_name))
            continue

        syslog.syslog(syslog.LOG_INFO, "Attempting mitigation of port {}".format(port_name))
        lag_name = self.get_lag_name_for_port(namespace, port_name)
        if not lag_name:
            # Without a portchannel the lookups below would query an
            # empty name and fail, aborting mitigation for later ports.
            syslog.syslog(syslog.LOG_CRIT,
                          "Port {} is not a member of any portchannel, skipping mitigation".format(port_name))
            continue

        active_count = self.get_active_lag_member_count(namespace, lag_name)
        min_links = self.get_min_links_for_lag(namespace, lag_name)

        if active_count > min_links:
            syslog.syslog(syslog.LOG_INFO,
                          "Active port count {} in {} sufficient to take mitigate {} (min_links {})".format(
                              active_count, lag_name, port_name, min_links))
            self.isolate_lag_member(namespace, port_name)
        else:
            syslog.syslog(syslog.LOG_CRIT,
                          "Active port count {} in {} insufficient to take mitigate {} (min_links {})".format(
                              active_count, lag_name, port_name, min_links))
|
|
||
def monitor(self):
    '''
    Check the error counters of the internal ports in every namespace.

    For each namespace whose monitor reports ports above the threshold, a
    critical syslog message is generated and, when mitigation is enabled,
    an attempt is made to isolate those ports.

    Returns:
        Total number of ports reported across all namespaces.
    '''
    error_port_total = 0
    for namespace, link_mon in self.link_monitor.items():
        err_port_per_ns = link_mon.monitor(namespace, self.error_counter_names, self.threshold)
        if not err_port_per_ns:
            continue
        ns_error_count = len(err_port_per_ns)
        error_port_total += ns_error_count
        syslog.syslog(syslog.LOG_CRIT,
                      '{} internal ports in {} have errors above threshold'.format(ns_error_count, namespace))
        if self.mitigationActionEnabled:
            self.attempt_to_mitigate_ports(namespace, err_port_per_ns)
    return error_port_total
|
|
||
def main():
    '''Run one monitoring pass on a multi-ASIC system.

    Returns:
        -1 when at least one internal port has errors above the threshold,
        0 otherwise (including when the system is not multi-ASIC).
    '''
    if not multi_asic.is_multi_asic():
        return 0

    chassis_monitor = PacketChassisInternalLinkMontoring()
    num_of_err_ports = chassis_monitor.monitor()
    if not num_of_err_ports:
        return 0

    syslog.syslog(syslog.LOG_CRIT, '{} internal ports have errors above threshold'.format(num_of_err_ports))
    return -1
tjchadaga marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
|
|
||
|
|
||
| if __name__ == '__main__': | ||
| main() | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,85 @@ | ||
| import imp | ||
| import os | ||
| import importlib | ||
| import mock_tables.dbconnector | ||
| import mock_tables.mock_multi_asic | ||
|
|
||
| int_errors = importlib.import_module("scripts.internal_links_monitor") | ||
|
|
||
class TestPacketChassisInternalLinkMontoring(object):
    '''Tests for PacketChassisInternalLinkMontoring against the mock multi-ASIC DBs.'''

    NAMESPACE = 'asic0'
    LAG_NAME = 'PortChannel4001'

    @classmethod
    def setup_class(cls):
        # Reload the multi-ASIC mock and namespace config before any test runs.
        imp.reload(mock_tables.mock_multi_asic)
        mock_tables.dbconnector.load_namespace_config()
        os.environ['UTILITIES_UNIT_TESTING'] = "2"

    @classmethod
    def teardown_class(cls):
        os.environ['UTILITIES_UNIT_TESTING'] = "0"

    def test_get_port_status(self):
        chassis = int_errors.PacketChassisInternalLinkMontoring()
        assert chassis.get_port_status(self.NAMESPACE, 'Ethernet-BP16') == 'up'

    def test_isolate_lag_member(self):
        member = 'Ethernet-BP4'
        chassis = int_errors.PacketChassisInternalLinkMontoring()
        assert chassis.get_port_status(self.NAMESPACE, member) == 'up'
        chassis.isolate_lag_member(self.NAMESPACE, member)
        assert chassis.get_port_status(self.NAMESPACE, member) == 'down'

    def test_get_lag_name_for_port(self):
        chassis = int_errors.PacketChassisInternalLinkMontoring()
        assert chassis.get_lag_name_for_port(self.NAMESPACE, 'Ethernet-BP4') == 'PortChannel4001'

    def test_get_active_lag_member_count(self):
        chassis = int_errors.PacketChassisInternalLinkMontoring()
        assert chassis.get_active_lag_member_count(self.NAMESPACE, self.LAG_NAME) == 4

    def test_mem_down_get_active_lag_member_count(self):
        chassis = int_errors.PacketChassisInternalLinkMontoring()
        chassis.isolate_lag_member(self.NAMESPACE, 'Ethernet-BP4')
        assert chassis.get_active_lag_member_count(self.NAMESPACE, self.LAG_NAME) == 3

    def test_get_min_links_for_lag(self):
        chassis = int_errors.PacketChassisInternalLinkMontoring()
        assert chassis.get_min_links_for_lag(self.NAMESPACE, self.LAG_NAME) == 3

    def test_attempt_to_mitigate_ports(self):
        chassis = int_errors.PacketChassisInternalLinkMontoring()
        chassis.attempt_to_mitigate_ports(self.NAMESPACE, ['Ethernet-BP0', 'Ethernet-BP4'])

        # Only one port should be brought down due to min_links check
        assert chassis.get_active_lag_member_count(self.NAMESPACE, self.LAG_NAME) == 3

    def test_monitor(self):
        chassis = int_errors.PacketChassisInternalLinkMontoring()
        chassis.monitor()

        expected_status = [
            # Validation on asic0
            ('asic0', 'Ethernet-BP0', 'up'),
            ('asic0', 'Ethernet-BP4', 'up'),
            ('asic0', 'Ethernet-BP8', 'down'),
            # Action should be skipped due to min_links check
            ('asic0', 'Ethernet-BP16', 'up'),
            # Validation on asic1
            ('asic1', 'Ethernet-BP256', 'down'),
            ('asic1', 'Ethernet-BP260', 'up'),
            # Action should be skipped due to min_links check
            ('asic1', 'Ethernet-BP264', 'up'),
        ]
        for namespace, port_name, status in expected_status:
            assert chassis.get_port_status(namespace, port_name) == status
Uh oh!
There was an error while loading. Please reload this page.