diff --git a/platform/mellanox/files/mlnx-fw-manager.service b/platform/mellanox/files/mlnx-fw-manager.service index a97d240edc3..76b9be128a2 100644 --- a/platform/mellanox/files/mlnx-fw-manager.service +++ b/platform/mellanox/files/mlnx-fw-manager.service @@ -22,9 +22,8 @@ Description=Mellanox Firmware Manager Service [Service] Type=oneshot RemainAfterExit=yes -ExecStartPre=/usr/bin/mst start --with_i2cdev +ExecCondition=/bin/grep -qv '\' /proc/cmdline ExecStart=/usr/local/bin/mlnx-fw-manager --clear-semaphore --verbose -ExecStop=/usr/bin/mst stop TimeoutSec=300 User=root diff --git a/platform/mellanox/fw-manager/mellanox_fw_manager/firmware_coordinator.py b/platform/mellanox/fw-manager/mellanox_fw_manager/firmware_coordinator.py index 5fd3f5beef8..d91c8e9b94a 100644 --- a/platform/mellanox/fw-manager/mellanox_fw_manager/firmware_coordinator.py +++ b/platform/mellanox/fw-manager/mellanox_fw_manager/firmware_coordinator.py @@ -40,7 +40,8 @@ class FirmwareCoordinator: """Main coordinator class that manages multiple ASIC firmware processes.""" - def __init__(self, verbose: bool = False, from_image: bool = False, clear_semaphore: bool = False): + def __init__(self, verbose: bool = False, from_image: bool = False, clear_semaphore: bool = False, + ignore_mst_start_failure: bool = False): """ Initialize the firmware coordinator. @@ -52,6 +53,7 @@ def __init__(self, verbose: bool = False, from_image: bool = False, clear_semaph self.verbose = verbose self.from_image = from_image self.clear_semaphore = clear_semaphore + self.ignore_mst_start_failure = ignore_mst_start_failure self.logger = logging.getLogger() try: @@ -98,14 +100,58 @@ def __init__(self, verbose: bool = False, from_image: bool = False, clear_semaph self.logger.info(f"Initialized firmware coordinator with {len(self.managers)} ASIC(s) and image from {self.fw_bin_path}") + def _start_mst(self) -> bool: + """ + Start the MST driver. + + Returns: + True if MST was successfully started, False if startup failed and + ignore_mst_start_failure is set. + + Raises: + FirmwareManagerError: If MST start fails and ignore_mst_start_failure is False. + """ + self.logger.info("Starting MST with i2cdev") + result = run_command(['/usr/bin/mst', 'start', '--with_i2cdev'], + logger=self.logger, capture_output=True, text=True) + if result.returncode != 0: + msg = f"MST start failed (rc={result.returncode}): {result.stderr}" + if self.ignore_mst_start_failure: + self.logger.warning(f"{msg} - continuing (ignore-mst-start-failure set)") + return False + raise FirmwareManagerError(msg) + return True + + def _stop_mst(self) -> None: + """Stop the MST driver. Logs errors but does not raise.""" + self.logger.info("Stopping MST service") + result = run_command(['/usr/bin/mst', 'stop'], + logger=self.logger, capture_output=True, text=True) + if result.returncode != 0: + self.logger.warning(f"MST stop failed (rc={result.returncode}): {result.stderr}") + def upgrade_firmware(self) -> None: """ Upgrade firmware for all ASICs using separate processes. + Starts the MST driver before upgrading and stops it on exit (whether + successful or not). MST start failure aborts the upgrade unless + ignore_mst_start_failure is set. + Raises: + FirmwareManagerError: If MST start fails and ignore_mst_start_failure is False. FirmwareUpgradeError: If all ASIC upgrades fail FirmwareUpgradePartialError: If some ASIC upgrades fail """ + mst_started = self._start_mst() + try: + self._upgrade_firmware_impl() + finally: + if mst_started: + self._stop_mst() + + def _upgrade_firmware_impl(self) -> None: + """Internal upgrade implementation, called after MST has been started.""" num_asics = len(self.managers) self.logger.info(f"Starting firmware upgrade for {num_asics} ASIC(s) from {self.fw_bin_path}") diff --git a/platform/mellanox/fw-manager/mellanox_fw_manager/main.py b/platform/mellanox/fw-manager/mellanox_fw_manager/main.py index 75828ce34ef..32ca9686166 100644 --- a/platform/mellanox/fw-manager/mellanox_fw_manager/main.py +++ b/platform/mellanox/fw-manager/mellanox_fw_manager/main.py @@ -227,7 +227,8 @@ def handle_dry_run(verbose: bool, upgrade: bool) -> int: return EXIT_FAILURE -def handle_upgrade(verbose: bool, upgrade: bool, clear_semaphore: bool) -> int: +def handle_upgrade(verbose: bool, upgrade: bool, clear_semaphore: bool, + ignore_mst_start_failure: bool = False) -> int: """ Handle firmware upgrade operation. @@ -235,6 +236,7 @@ def handle_upgrade(verbose: bool, upgrade: bool, clear_semaphore: bool) -> int: verbose: Enable verbose logging upgrade: Use firmware from next SONiC image clear_semaphore: Clear hardware semaphore before upgrade + ignore_mst_start_failure: Continue upgrade even if MST driver fails to start Returns: Exit code @@ -243,7 +245,8 @@ def handle_upgrade(verbose: bool, upgrade: bool, clear_semaphore: bool) -> int: fw_coordinator = FirmwareCoordinator( verbose=verbose, from_image=upgrade, - clear_semaphore=clear_semaphore + clear_semaphore=clear_semaphore, + ignore_mst_start_failure=ignore_mst_start_failure ) if not fw_coordinator.check_upgrade_required(): @@ -278,11 +281,14 @@ def handle_upgrade(verbose: bool, upgrade: bool, clear_semaphore: bool) -> int: help='Clear hw semaphore before firmware upgrade') @click.option('-r', '--reset', is_flag=True, help='Reset firmware configuration (NVIDIA BlueField platform only)') +@click.option('-m', '--ignore-mst-start-failure', is_flag=True, + help='Continue firmware upgrade even if MST driver fails to start ' + '(useful on SmartSwitch when DPUs are powered off)') @click.option('--nosyslog', is_flag=True, help='Disable syslog and log to console only') @click.option('--status', 'status', type=str, default=None, is_flag=False, flag_value='__flag__', metavar='[ASIC_ID|all]', help='Show firmware version status. Single-ASIC: use as flag. Multi-ASIC: specify ASIC ID or "all".') -def main(upgrade, verbose, dry_run, clear_semaphore, reset, nosyslog, status): +def main(upgrade, verbose, dry_run, clear_semaphore, reset, ignore_mst_start_failure, nosyslog, status): """ Mellanox Firmware Manager @@ -333,7 +339,7 @@ def main(upgrade, verbose, dry_run, clear_semaphore, reset, nosyslog, status): elif dry_run: exit_code = handle_dry_run(verbose, upgrade) else: - exit_code = handle_upgrade(verbose, upgrade, clear_semaphore) + exit_code = handle_upgrade(verbose, upgrade, clear_semaphore, ignore_mst_start_failure) logger.info(f"Mellanox Firmware Manager finished with exit code {exit_code}") sys.exit(exit_code) diff --git a/platform/mellanox/fw-manager/tests/test_firmware_coordinator.py b/platform/mellanox/fw-manager/tests/test_firmware_coordinator.py index bc7db1a618f..e0e35f4daa2 100644 --- a/platform/mellanox/fw-manager/tests/test_firmware_coordinator.py +++ b/platform/mellanox/fw-manager/tests/test_firmware_coordinator.py @@ -228,7 +228,9 @@ def test_upgrade_firmware_queue_processing_exception(self, mock_create_manager, coordinator = FirmwareCoordinator() - with patch('mellanox_fw_manager.firmware_coordinator.Queue') as mock_queue: + with patch.object(coordinator, '_start_mst', return_value=True), \ + patch.object(coordinator, '_stop_mst'), \ + patch('mellanox_fw_manager.firmware_coordinator.Queue') as mock_queue: mock_queue_instance = MagicMock() mock_queue.return_value = mock_queue_instance @@ -255,7 +257,9 @@ def test_upgrade_firmware_all_failures(self, mock_create_manager, mock_asic_mana coordinator = FirmwareCoordinator() - with patch('mellanox_fw_manager.firmware_coordinator.Queue') as mock_queue: + with patch.object(coordinator, '_start_mst', return_value=True), \ + patch.object(coordinator, '_stop_mst'), \ + patch('mellanox_fw_manager.firmware_coordinator.Queue') as mock_queue: mock_queue_instance = MagicMock() mock_queue.return_value = mock_queue_instance @@ -295,7 +299,9 @@ def test_upgrade_firmware_partial_failures(self, mock_create_manager, mock_asic_ coordinator = FirmwareCoordinator() - with patch('mellanox_fw_manager.firmware_coordinator.Queue') as mock_queue: + with patch.object(coordinator, '_start_mst', return_value=True), \ + patch.object(coordinator, '_stop_mst'), \ + patch('mellanox_fw_manager.firmware_coordinator.Queue') as mock_queue: mock_queue_instance = MagicMock() mock_queue.return_value = mock_queue_instance @@ -599,7 +605,9 @@ def test_upgrade_firmware_4_asics_all_success(self, mock_create_manager, mock_as coordinator = FirmwareCoordinator() - with patch('mellanox_fw_manager.firmware_coordinator.Queue') as mock_queue_class: + with patch.object(coordinator, '_start_mst', return_value=True), \ + patch.object(coordinator, '_stop_mst'), \ + patch('mellanox_fw_manager.firmware_coordinator.Queue') as mock_queue_class: mock_queue_instance = MagicMock() mock_queue_class.return_value = mock_queue_instance @@ -664,7 +672,9 @@ def track_join(timeout=None): coordinator = FirmwareCoordinator() - with patch('mellanox_fw_manager.firmware_coordinator.Queue') as mock_queue_class: + with patch.object(coordinator, '_start_mst', return_value=True), \ + patch.object(coordinator, '_stop_mst'), \ + patch('mellanox_fw_manager.firmware_coordinator.Queue') as mock_queue_class: mock_queue_instance = MagicMock() mock_queue_class.return_value = mock_queue_instance diff --git a/platform/mellanox/fw-manager/tests/test_firmware_upgrade.py b/platform/mellanox/fw-manager/tests/test_firmware_upgrade.py index abc3bbda56d..f5ffee007e7 100644 --- a/platform/mellanox/fw-manager/tests/test_firmware_upgrade.py +++ b/platform/mellanox/fw-manager/tests/test_firmware_upgrade.py @@ -368,7 +368,9 @@ def test_coordinator_firmware_upgrade_success(self, mock_create_manager, mock_create_manager.side_effect = mock_managers - with patch('mellanox_fw_manager.firmware_coordinator.Queue') as mock_queue_class: + with patch.object(FirmwareCoordinator, '_start_mst', return_value=True), \ + patch.object(FirmwareCoordinator, '_stop_mst'), \ + patch('mellanox_fw_manager.firmware_coordinator.Queue') as mock_queue_class: mock_queue = MagicMock() mock_queue.empty.side_effect = [False, False, True] # Two items, then empty mock_queue.get_nowait.side_effect = [ @@ -417,7 +419,9 @@ def test_coordinator_partial_failure(self, mock_create_manager, mock_create_manager.side_effect = mock_managers - with patch('mellanox_fw_manager.firmware_coordinator.Queue') as mock_queue_class: + with patch.object(FirmwareCoordinator, '_start_mst', return_value=True), \ + patch.object(FirmwareCoordinator, '_stop_mst'), \ + patch('mellanox_fw_manager.firmware_coordinator.Queue') as mock_queue_class: mock_queue = MagicMock() mock_queue.empty.side_effect = [False, False, True] # Two items, then empty mock_queue.get_nowait.side_effect = [ diff --git a/platform/mellanox/fw-manager/tests/test_main.py b/platform/mellanox/fw-manager/tests/test_main.py index e2291b65f6d..a982e98fa56 100644 --- a/platform/mellanox/fw-manager/tests/test_main.py +++ b/platform/mellanox/fw-manager/tests/test_main.py @@ -390,7 +390,7 @@ def test_upgrade_with_flags_cli(self, mock_handle_upgrade, mock_lock, mock_exit_ result = runner.invoke(main, ['--upgrade', '--clear-semaphore']) - mock_handle_upgrade.assert_called_once_with(False, True, True) + mock_handle_upgrade.assert_called_once_with(False, True, True, False) def mock_open(): diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/component.py b/platform/mellanox/mlnx-platform-api/sonic_platform/component.py index 0874497efdf..3ae2dceee6b 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/component.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/component.py @@ -1,6 +1,6 @@ # # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2019-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -23,6 +23,7 @@ try: + import contextlib import os import io import re @@ -763,26 +764,21 @@ def __init__(self, idx): self.image_ext_name = self.COMPONENT_FIRMWARE_EXTENSION def __get_mst_device(self): - if not os.path.exists(self.MST_DEVICE_PATH): - print("ERROR: mst driver is not loaded") - return None - - pattern = os.path.join(self.MST_DEVICE_PATH, self.MST_DEVICE_PATTERN) - - mst_dev_list = glob.glob(pattern) - if not mst_dev_list or len(mst_dev_list) != 1: - devices = str(os.listdir(self.MST_DEVICE_PATH)) - print("ERROR: Failed to get mst device: pattern={}, devices={}".format(pattern, devices)) - return None - - return mst_dev_list[0] + output = None + try: + output = subprocess.check_output(['/usr/bin/asic_detect/asic_detect.sh', '-p']).decode('utf-8').strip() + except subprocess.CalledProcessError as e: + raise RuntimeError("Failed to get {} mst device: {}".format(self.name, str(e))) + return output def _install_firmware(self, image_path): if not self._check_file_validity(image_path): return False - mst_dev = self.__get_mst_device() - if mst_dev is None: + try: + mst_dev = self.__get_mst_device() + except RuntimeError as e: + print("ERROR: {}".format(e)) return False self.CPLD_FIRMWARE_UPDATE_COMMAND[2] = mst_dev self.CPLD_FIRMWARE_UPDATE_COMMAND[4] = image_path @@ -1027,12 +1023,27 @@ class ComponenetFPGADPU(ComponentCPLD): CPLD_FIRMWARE_UPDATE_COMMAND = ['cpldupdate', '--cpld_chain', '2', '--gpio', '--print-progress', ''] + @contextlib.contextmanager + def _mst_context(self): + try: + subprocess.check_call(['/usr/bin/mst', 'start'], universal_newlines=True) + yield + except subprocess.CalledProcessError as e: + logger.log_error("Failed to manage {} mst: {}".format(self.name, str(e))) + raise + finally: + try: + subprocess.check_call(['/usr/bin/mst', 'stop'], universal_newlines=True) + except subprocess.CalledProcessError as e: + logger.log_error("Failed to stop {} mst: {}".format(self.name, str(e))) + def _install_firmware(self, image_path): self.CPLD_FIRMWARE_UPDATE_COMMAND[5] = image_path try: print("INFO: Installing {} firmware update: path={}".format(self.name, image_path)) - subprocess.check_call(self.CPLD_FIRMWARE_UPDATE_COMMAND, universal_newlines=True) + with self._mst_context(): + subprocess.check_call(self.CPLD_FIRMWARE_UPDATE_COMMAND, universal_newlines=True) except subprocess.CalledProcessError as e: print("ERROR: Failed to update {} firmware: {}".format(self.name, str(e))) return False diff --git a/platform/mellanox/mlnx-platform-api/tests/test_component.py b/platform/mellanox/mlnx-platform-api/tests/test_component.py index 9c0649374da..f9255aebeba 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_component.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_component.py @@ -1,5 +1,6 @@ # -# Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2023-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -235,7 +236,7 @@ def test_cpld_component(self, mock_exists, mock_get_meta_data, mock_get_path, mo c._check_file_validity = mock.MagicMock(return_value=False) assert not c._install_firmware('') c._check_file_validity = mock.MagicMock(return_value=True) - c._ComponentCPLD__get_mst_device = mock.MagicMock(return_value=None) + c._ComponentCPLD__get_mst_device = mock.MagicMock(side_effect=RuntimeError('no device')) assert not c._install_firmware('') c._ComponentCPLD__get_mst_device = mock.MagicMock(return_value='some dev') assert c._install_firmware('') @@ -295,15 +296,18 @@ def test_cpld_get_component_list_dpu(self): for index, item in enumerate(component_list): assert item.name == 'DPU{}_FPGA'.format(index + 1) - def test_cpld_get_mst_device(self): + @mock.patch('sonic_platform.component.subprocess.check_output') + def test_cpld_get_mst_device(self, mock_check_output): ComponentCPLD.MST_DEVICE_PATH = '/tmp/mst' os.system('rm -rf /tmp/mst') c = ComponentCPLD(1) - assert c._ComponentCPLD__get_mst_device() is None + mock_check_output.return_value = b'' + assert c._ComponentCPLD__get_mst_device() == '' os.makedirs(ComponentCPLD.MST_DEVICE_PATH) - assert c._ComponentCPLD__get_mst_device() is None + assert c._ComponentCPLD__get_mst_device() == '' with open('/tmp/mst/mt0_pci_cr0', 'w+') as f: f.write('dummy') + mock_check_output.return_value = b'/tmp/mst/mt0_pci_cr0' assert c._ComponentCPLD__get_mst_device() == '/tmp/mst/mt0_pci_cr0' @mock.patch('sonic_platform.component.subprocess.check_call') diff --git a/platform/nvidia-bluefield/installer/install.sh.j2 b/platform/nvidia-bluefield/installer/install.sh.j2 index faf8c5755d8..16fe6d06930 100755 --- a/platform/nvidia-bluefield/installer/install.sh.j2 +++ b/platform/nvidia-bluefield/installer/install.sh.j2 @@ -264,11 +264,10 @@ if [[ $SKIP_FIRMWARE_UPGRADE != "true" ]]; then bfb_pre_fw_install fi - ex chroot $sonic_fs_mountpoint mst start - ex chroot $sonic_fs_mountpoint /usr/local/bin/mlnx-fw-manager --nosyslog --verbose + ex chroot $sonic_fs_mountpoint /usr/local/bin/mlnx-fw-manager -m --nosyslog --verbose if [[ $FORCE_FW_CONFIG_RESET == "true" ]]; then - ex chroot $sonic_fs_mountpoint /usr/local/bin/mlnx-fw-manager --reset --nosyslog --verbose + ex chroot $sonic_fs_mountpoint /usr/local/bin/mlnx-fw-manager -m --reset --nosyslog --verbose fi ex umount $sonic_fs_mountpoint/host