-
Notifications
You must be signed in to change notification settings - Fork 216
[Smartswitch] Add module specific pcie attach/detach functions for smartswitch platforms #557
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
cb310a0
6117834
8beb2d6
76c45f6
451bdfa
1708c0f
e4d32a9
6cfe8e9
e3c1654
57ca44e
74380d1
d4ffc7c
07808d8
e65cb1b
6849579
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,8 +6,18 @@ | |
| """ | ||
|
|
||
| import sys | ||
| import os | ||
| import fcntl | ||
| from . import device_base | ||
| import json | ||
| import threading | ||
| import contextlib | ||
| import shutil | ||
|
|
||
| # PCI state database constants | ||
| # Table name; pci_entry_state_db() builds each entry key as "<table>|<bus string>" | ||
| PCIE_DETACH_INFO_TABLE = "PCIE_DETACH_INFO" | ||
| # Operation passed by handle_pci_removal(); stored in the entry's "dpu_state" field | ||
| PCIE_OPERATION_DETACHING = "detaching" | ||
| # Operation passed by handle_pci_rescan(); makes pci_entry_state_db() delete the entry | ||
| PCIE_OPERATION_ATTACHING = "attaching" | ||
|
|
||
| class ModuleBase(device_base.DeviceBase): | ||
| """ | ||
|
|
@@ -16,6 +26,7 @@ class ModuleBase(device_base.DeviceBase): | |
| """ | ||
| # Device type definition. Note, this is a constant. | ||
| DEVICE_TYPE = "module" | ||
| PCI_OPERATION_LOCK_FILE_PATH = "/var/lock/{}_pci.lock" | ||
|
|
||
| # Possible card types for modular chassis | ||
| MODULE_TYPE_SUPERVISOR = "SUPERVISOR" | ||
|
|
@@ -73,6 +84,8 @@ def __init__(self): | |
| self._thermal_list = [] | ||
| self._voltage_sensor_list = [] | ||
| self._current_sensor_list = [] | ||
| self.state_db_connector = None | ||
| self.pci_bus_info = None | ||
|
|
||
| # List of SfpBase-derived objects representing all sfps | ||
| # available on the module | ||
|
|
@@ -81,6 +94,17 @@ def __init__(self): | |
| # List of ASIC-derived objects representing all ASICs | ||
| # visible in PCI domain on the module | ||
| self._asic_list = [] | ||
|
|
||
@contextlib.contextmanager
def _pci_operation_lock(self):
    """
    Context manager guarding PCI operations with an exclusive flock.

    The lock file path is PCI_OPERATION_LOCK_FILE_PATH formatted with the
    value returned by get_name(). The file is opened for writing, an
    exclusive lock is requested before the guarded body runs, and an unlock
    is issued afterwards, before the file is closed.
    """
    path = self.PCI_OPERATION_LOCK_FILE_PATH.format(self.get_name())
    with open(path, 'w') as lock_file:
        fd = lock_file.fileno()
        try:
            fcntl.flock(fd, fcntl.LOCK_EX)
            yield
        finally:
            # Issue the unlock even when the guarded body raised.
            fcntl.flock(fd, fcntl.LOCK_UN)
|
|
||
| def get_base_mac(self): | ||
| """ | ||
|
|
@@ -271,10 +295,70 @@ def get_pci_bus_info(self): | |
| Retrieves the bus information. | ||
|
|
||
| Returns: | ||
| Returns the PCI bus information in BDF format like "[DDDD:]BB:SS:F" | ||
| Returns the PCI bus information in list of BDF format like "[DDDD:]BB:SS:F" | ||
| """ | ||
| raise NotImplementedError | ||
|
|
||
def handle_pci_removal(self):
    """
    Marks every PCI bus of the module as detaching in the state database,
    then detaches the device.

    The bus list is read from get_pci_bus_info(); the state database update
    and the detach are both performed while holding the PCI operation lock.

    Returns:
        bool: The result of pci_detach(), or False if any step raised
    """
    try:
        buses = self.get_pci_bus_info()
        with self._pci_operation_lock():
            for bdf in buses:
                self.pci_entry_state_db(bdf, PCIE_OPERATION_DETACHING)
            detached = self.pci_detach()
        return detached
    except Exception as err:
        sys.stderr.write("Failed to handle PCI removal: {}\n".format(str(err)))
        return False
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you add an error message in case of failure?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
|
|
||
def pci_entry_state_db(self, pcie_string, operation):
    """
    Adds or removes the PCIE_DETACH_INFO entry of one PCI bus in the state database.

    When the operation is PCIE_OPERATION_ATTACHING the entry is deleted;
    for any other operation the "bus_info" and "dpu_state" fields of the
    entry are written. The STATE_DB connector is created on first use and
    kept in self.state_db_connector.

    Args:
        pcie_string (str): The PCI bus string, used as key suffix and as "bus_info" value
        operation (str): The operation being performed ("detaching" or "attaching")

    Note:
        Exceptions (including a failed import or connection) are not
        propagated; they are reported on stderr.
    """
    try:
        # Imported here so that swsscommon is only needed when this method runs
        import swsscommon
        entry_key = PCIE_DETACH_INFO_TABLE+"|"+pcie_string
        # NOTE(review): a reviewer asked whether the STATE_DB connection
        # should be explicitly checked for success after it is created.
        if not self.state_db_connector:
            self.state_db_connector = swsscommon.swsscommon.DBConnector("STATE_DB", 0)
        state_db = self.state_db_connector

        if operation != PCIE_OPERATION_ATTACHING:
            state_db.hset(entry_key, "bus_info", pcie_string)
            state_db.hset(entry_key, "dpu_state", operation)
        else:
            state_db.delete(entry_key)
    except Exception as err:
        sys.stderr.write("Failed to write pcie bus info to state database: {}\n".format(str(err)))
|
|
||
def handle_pci_rescan(self):
    """
    Reattaches the PCI device and then clears the module's entries from the
    state database.

    The bus list is read from get_pci_bus_info(); the reattach and the state
    database update are both performed while holding the PCI operation lock.

    Returns:
        bool: The result of pci_reattach(), or False if any step raised
    """
    try:
        buses = self.get_pci_bus_info()
        with self._pci_operation_lock():
            # Reattach first and only then drop the entries; this ordering
            # was requested during review "just to be safe".
            reattached = self.pci_reattach()
            for bdf in buses:
                self.pci_entry_state_db(bdf, PCIE_OPERATION_ATTACHING)
        return reattached
    except Exception as err:
        sys.stderr.write("Failed to handle PCI rescan: {}\n".format(str(err)))
        return False
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you add an error message in case of failure?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
|
|
||
| def pci_detach(self): | ||
| """ | ||
| Detaches the PCI device. | ||
|
|
@@ -687,3 +771,81 @@ def get_all_asics(self): | |
| And '0000:05:00.0' is its PCI address. | ||
| """ | ||
| return self._asic_list | ||
|
|
||
def handle_sensor_removal(self):
    """
    Installs the module's sensor ignore configuration, if the platform ships one.

    The file ignore_sensors_<module name>.conf is copied from the platform's
    module_sensors_ignore_conf folder into /etc/sensors.d and sensord is
    restarted. When the platform has no such file nothing is done.

    Returns:
        bool: True if no step raised (including when there is no file to
              copy), False otherwise
    """
    try:
        module_name = self.get_name()
        ignore_conf_src = f"/usr/share/sonic/platform/module_sensors_ignore_conf/ignore_sensors_{module_name}.conf"
        ignore_conf_dst = f"/etc/sensors.d/ignore_sensors_{module_name}.conf"

        # Copy and restart only when the platform provides an ignore file
        if os.path.exists(ignore_conf_src):
            shutil.copy2(ignore_conf_src, ignore_conf_dst)

            # NOTE(review): the exit status of the restart command is not inspected
            os.system("service sensord restart")

        return True
    except Exception as err:
        sys.stderr.write("Failed to handle sensor removal: {}\n".format(str(err)))
        return False
|
|
||
def handle_sensor_addition(self):
    """
    Removes the module's sensor ignore configuration, if one is installed.

    The file /etc/sensors.d/ignore_sensors_<module name>.conf is deleted and
    sensord is restarted. When the file is not present nothing is done.

    Returns:
        bool: True if no step raised (including when there is no file to
              remove), False otherwise
    """
    try:
        module_name = self.get_name()
        ignore_conf = f"/etc/sensors.d/ignore_sensors_{module_name}.conf"

        # Delete and restart only when an ignore file is actually installed
        if os.path.exists(ignore_conf):
            os.remove(ignore_conf)

            # NOTE(review): the exit status of the restart command is not inspected
            os.system("service sensord restart")

        return True
    except Exception as err:
        sys.stderr.write("Failed to handle sensor addition: {}\n".format(str(err)))
        return False
|
|
||
def module_pre_shutdown(self):
    """
    Runs the pre-shutdown steps of a module: sensor removal handling first,
    then PCI removal handling. Both steps always run.
    This function should be called before shutting down a module.

    Returns:
        bool: True if all operations were successful, False otherwise
    """
    sensors_ok = self.handle_sensor_removal()
    pci_ok = self.handle_pci_removal()
    # Same value as "pci_ok and sensors_ok"
    if not pci_ok:
        return pci_ok
    return sensors_ok
|
|
||
def module_post_startup(self):
    """
    Runs the post-startup steps of a module: PCI rescan handling first,
    then sensor addition handling. Both steps always run.
    This function should be called after a module has started up.

    Returns:
        bool: True if all operations were successful, False otherwise
    """
    pci_ok = self.handle_pci_rescan()
    sensors_ok = self.handle_sensor_addition()
    # Same value as "pci_ok and sensors_ok"
    if not pci_ok:
        return pci_ok
    return sensors_ok
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Query: Is this file separate for each module?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
the lock should be valid across all modules, so we are using a generic lock file
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As discussed offline, could you check if this lock is still required?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Clarification, File lock is required, and the lock is applicable per module, just to prevent reattach of the module while we are removing the module