Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions platform/mellanox/files/mlnx-fw-manager.service
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,8 @@ Description=Mellanox Firmware Manager Service
[Service]
Type=oneshot
RemainAfterExit=yes
ExecStartPre=/usr/bin/mst start --with_i2cdev
ExecCondition=/bin/grep -qv '\<SONIC_BOOT_TYPE=fastfast\>' /proc/cmdline
ExecStart=/usr/local/bin/mlnx-fw-manager --clear-semaphore --verbose
ExecStop=/usr/bin/mst stop
TimeoutSec=300
Copy link

Copilot AI Apr 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TimeoutSec=300 may be too low for firmware upgrades on multi-ASIC systems. The firmware coordinator uses a 600s-per-ASIC join timeout, so systemd can terminate mlnx-fw-manager before it finishes, potentially leaving the system mid-upgrade (and MST cleanup in the Python finally block won’t run if the process is SIGKILLed). Consider increasing TimeoutSec or setting it based on expected worst-case upgrade time.

Suggested change
TimeoutSec=300
TimeoutSec=infinity

Copilot uses AI. Check for mistakes.
User=root

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@
class FirmwareCoordinator:
"""Main coordinator class that manages multiple ASIC firmware processes."""

def __init__(self, verbose: bool = False, from_image: bool = False, clear_semaphore: bool = False):
def __init__(self, verbose: bool = False, from_image: bool = False, clear_semaphore: bool = False,
ignore_mst_start_failure: bool = False):
"""
Initialize the firmware coordinator.

Expand All @@ -52,6 +53,7 @@ def __init__(self, verbose: bool = False, from_image: bool = False, clear_semaph
self.verbose = verbose
self.from_image = from_image
self.clear_semaphore = clear_semaphore
self.ignore_mst_start_failure = ignore_mst_start_failure
self.logger = logging.getLogger()

try:
Expand Down Expand Up @@ -98,14 +100,58 @@ def __init__(self, verbose: bool = False, from_image: bool = False, clear_semaph

self.logger.info(f"Initialized firmware coordinator with {len(self.managers)} ASIC(s) and image from {self.fw_bin_path}")

def _start_mst(self) -> bool:
"""
Start the MST driver.

Returns:
True if MST was successfully started, False if startup failed and
ignore_mst_start_failure is set.

Raises:
FirmwareManagerError: If MST start fails and ignore_mst_start_failure is False.
"""
self.logger.info("Starting MST with i2cdev")
result = run_command(['/usr/bin/mst', 'start', '--with_i2cdev'],
logger=self.logger, capture_output=True, text=True)
if result.returncode != 0:
msg = f"MST start failed (rc={result.returncode}): {result.stderr}"
if self.ignore_mst_start_failure:
self.logger.warning(f"{msg} - continuing (ignore-mst-start-failure set)")
return False
raise FirmwareManagerError(msg)
return True

def _stop_mst(self) -> None:
"""Stop the MST driver. Logs errors but does not raise."""
self.logger.info("Stopping MST service")
result = run_command(['/usr/bin/mst', 'stop'],
logger=self.logger, capture_output=True, text=True)
if result.returncode != 0:
self.logger.warning(f"MST stop failed (rc={result.returncode}): {result.stderr}")

def upgrade_firmware(self) -> None:
"""
Upgrade firmware for all ASICs using separate processes.

Starts the MST driver before upgrading and stops it on exit (whether
successful or not). MST start failure aborts the upgrade unless
ignore_mst_start_failure is set.

Raises:
FirmwareManagerError: If MST start fails and ignore_mst_start_failure is False.
FirmwareUpgradeError: If all ASIC upgrades fail
FirmwareUpgradePartialError: If some ASIC upgrades fail
"""
mst_started = self._start_mst()
try:
self._upgrade_firmware_impl()
finally:
if mst_started:
self._stop_mst()

def _upgrade_firmware_impl(self) -> None:
"""Internal upgrade implementation, called after MST has been started."""
num_asics = len(self.managers)

self.logger.info(f"Starting firmware upgrade for {num_asics} ASIC(s) from {self.fw_bin_path}")
Expand Down
14 changes: 10 additions & 4 deletions platform/mellanox/fw-manager/mellanox_fw_manager/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,14 +227,16 @@ def handle_dry_run(verbose: bool, upgrade: bool) -> int:
return EXIT_FAILURE


def handle_upgrade(verbose: bool, upgrade: bool, clear_semaphore: bool) -> int:
def handle_upgrade(verbose: bool, upgrade: bool, clear_semaphore: bool,
ignore_mst_start_failure: bool = False) -> int:
"""
Handle firmware upgrade operation.

Args:
verbose: Enable verbose logging
upgrade: Use firmware from next SONiC image
clear_semaphore: Clear hardware semaphore before upgrade
ignore_mst_start_failure: Continue upgrade even if MST driver fails to start

Returns:
Exit code
Expand All @@ -243,7 +245,8 @@ def handle_upgrade(verbose: bool, upgrade: bool, clear_semaphore: bool) -> int:
fw_coordinator = FirmwareCoordinator(
verbose=verbose,
from_image=upgrade,
clear_semaphore=clear_semaphore
clear_semaphore=clear_semaphore,
ignore_mst_start_failure=ignore_mst_start_failure
)

if not fw_coordinator.check_upgrade_required():
Expand Down Expand Up @@ -278,11 +281,14 @@ def handle_upgrade(verbose: bool, upgrade: bool, clear_semaphore: bool) -> int:
help='Clear hw semaphore before firmware upgrade')
@click.option('-r', '--reset', is_flag=True,
help='Reset firmware configuration (NVIDIA BlueField platform only)')
@click.option('-m', '--ignore-mst-start-failure', is_flag=True,
help='Continue firmware upgrade even if MST driver fails to start '
'(useful on SmartSwitch when DPUs are powered off)')
@click.option('--nosyslog', is_flag=True,
help='Disable syslog and log to console only')
@click.option('--status', 'status', type=str, default=None, is_flag=False, flag_value='__flag__', metavar='[ASIC_ID|all]',
help='Show firmware version status. Single-ASIC: use as flag. Multi-ASIC: specify ASIC ID or "all".')
def main(upgrade, verbose, dry_run, clear_semaphore, reset, nosyslog, status):
def main(upgrade, verbose, dry_run, clear_semaphore, reset, ignore_mst_start_failure, nosyslog, status):
"""
Mellanox Firmware Manager

Expand Down Expand Up @@ -333,7 +339,7 @@ def main(upgrade, verbose, dry_run, clear_semaphore, reset, nosyslog, status):
elif dry_run:
exit_code = handle_dry_run(verbose, upgrade)
else:
exit_code = handle_upgrade(verbose, upgrade, clear_semaphore)
exit_code = handle_upgrade(verbose, upgrade, clear_semaphore, ignore_mst_start_failure)

logger.info(f"Mellanox Firmware Manager finished with exit code {exit_code}")
sys.exit(exit_code)
Expand Down
20 changes: 15 additions & 5 deletions platform/mellanox/fw-manager/tests/test_firmware_coordinator.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,9 @@ def test_upgrade_firmware_queue_processing_exception(self, mock_create_manager,

coordinator = FirmwareCoordinator()

with patch('mellanox_fw_manager.firmware_coordinator.Queue') as mock_queue:
with patch.object(coordinator, '_start_mst', return_value=True), \
patch.object(coordinator, '_stop_mst'), \
patch('mellanox_fw_manager.firmware_coordinator.Queue') as mock_queue:
mock_queue_instance = MagicMock()
mock_queue.return_value = mock_queue_instance

Expand All @@ -255,7 +257,9 @@ def test_upgrade_firmware_all_failures(self, mock_create_manager, mock_asic_mana

coordinator = FirmwareCoordinator()

with patch('mellanox_fw_manager.firmware_coordinator.Queue') as mock_queue:
with patch.object(coordinator, '_start_mst', return_value=True), \
patch.object(coordinator, '_stop_mst'), \
patch('mellanox_fw_manager.firmware_coordinator.Queue') as mock_queue:
mock_queue_instance = MagicMock()
mock_queue.return_value = mock_queue_instance

Expand Down Expand Up @@ -295,7 +299,9 @@ def test_upgrade_firmware_partial_failures(self, mock_create_manager, mock_asic_

coordinator = FirmwareCoordinator()

with patch('mellanox_fw_manager.firmware_coordinator.Queue') as mock_queue:
with patch.object(coordinator, '_start_mst', return_value=True), \
patch.object(coordinator, '_stop_mst'), \
patch('mellanox_fw_manager.firmware_coordinator.Queue') as mock_queue:
mock_queue_instance = MagicMock()
mock_queue.return_value = mock_queue_instance

Expand Down Expand Up @@ -599,7 +605,9 @@ def test_upgrade_firmware_4_asics_all_success(self, mock_create_manager, mock_as

coordinator = FirmwareCoordinator()

with patch('mellanox_fw_manager.firmware_coordinator.Queue') as mock_queue_class:
with patch.object(coordinator, '_start_mst', return_value=True), \
patch.object(coordinator, '_stop_mst'), \
patch('mellanox_fw_manager.firmware_coordinator.Queue') as mock_queue_class:
mock_queue_instance = MagicMock()
mock_queue_class.return_value = mock_queue_instance

Expand Down Expand Up @@ -664,7 +672,9 @@ def track_join(timeout=None):

coordinator = FirmwareCoordinator()

with patch('mellanox_fw_manager.firmware_coordinator.Queue') as mock_queue_class:
with patch.object(coordinator, '_start_mst', return_value=True), \
patch.object(coordinator, '_stop_mst'), \
patch('mellanox_fw_manager.firmware_coordinator.Queue') as mock_queue_class:
mock_queue_instance = MagicMock()
mock_queue_class.return_value = mock_queue_instance

Expand Down
8 changes: 6 additions & 2 deletions platform/mellanox/fw-manager/tests/test_firmware_upgrade.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,7 +368,9 @@ def test_coordinator_firmware_upgrade_success(self, mock_create_manager,

mock_create_manager.side_effect = mock_managers

with patch('mellanox_fw_manager.firmware_coordinator.Queue') as mock_queue_class:
with patch.object(FirmwareCoordinator, '_start_mst', return_value=True), \
patch.object(FirmwareCoordinator, '_stop_mst'), \
patch('mellanox_fw_manager.firmware_coordinator.Queue') as mock_queue_class:
mock_queue = MagicMock()
mock_queue.empty.side_effect = [False, False, True] # Two items, then empty
mock_queue.get_nowait.side_effect = [
Expand Down Expand Up @@ -417,7 +419,9 @@ def test_coordinator_partial_failure(self, mock_create_manager,

mock_create_manager.side_effect = mock_managers

with patch('mellanox_fw_manager.firmware_coordinator.Queue') as mock_queue_class:
with patch.object(FirmwareCoordinator, '_start_mst', return_value=True), \
patch.object(FirmwareCoordinator, '_stop_mst'), \
patch('mellanox_fw_manager.firmware_coordinator.Queue') as mock_queue_class:
mock_queue = MagicMock()
mock_queue.empty.side_effect = [False, False, True] # Two items, then empty
mock_queue.get_nowait.side_effect = [
Expand Down
2 changes: 1 addition & 1 deletion platform/mellanox/fw-manager/tests/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,7 +390,7 @@ def test_upgrade_with_flags_cli(self, mock_handle_upgrade, mock_lock, mock_exit_

result = runner.invoke(main, ['--upgrade', '--clear-semaphore'])

mock_handle_upgrade.assert_called_once_with(False, True, True)
mock_handle_upgrade.assert_called_once_with(False, True, True, False)


def mock_open():
Expand Down
45 changes: 28 additions & 17 deletions platform/mellanox/mlnx-platform-api/sonic_platform/component.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright (c) 2019-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand All @@ -23,6 +23,7 @@


try:
import contextlib
import os
import io
import re
Expand Down Expand Up @@ -763,26 +764,21 @@ def __init__(self, idx):
self.image_ext_name = self.COMPONENT_FIRMWARE_EXTENSION

def __get_mst_device(self):
if not os.path.exists(self.MST_DEVICE_PATH):
print("ERROR: mst driver is not loaded")
return None

pattern = os.path.join(self.MST_DEVICE_PATH, self.MST_DEVICE_PATTERN)

mst_dev_list = glob.glob(pattern)
if not mst_dev_list or len(mst_dev_list) != 1:
devices = str(os.listdir(self.MST_DEVICE_PATH))
print("ERROR: Failed to get mst device: pattern={}, devices={}".format(pattern, devices))
return None

return mst_dev_list[0]
output = None
try:
output = subprocess.check_output(['/usr/bin/asic_detect/asic_detect.sh', '-p']).decode('utf-8').strip()
except subprocess.CalledProcessError as e:
raise RuntimeError("Failed to get {} mst device: {}".format(self.name, str(e)))
Comment on lines +767 to +771
Copy link

Copilot AI Apr 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

__get_mst_device() can return an empty string (e.g., when asic_detect.sh -p outputs nothing). That empty value is then used as the --dev argument for cpldupdate, which will fail with a less clear error and makes it hard to distinguish “no MST device” from other failures. Consider treating empty output as an error (raise RuntimeError with a clear message) and also handle OSError/FileNotFoundError from check_output so _install_firmware() can fail gracefully instead of potentially crashing the caller.

Suggested change
output = None
try:
output = subprocess.check_output(['/usr/bin/asic_detect/asic_detect.sh', '-p']).decode('utf-8').strip()
except subprocess.CalledProcessError as e:
raise RuntimeError("Failed to get {} mst device: {}".format(self.name, str(e)))
try:
output = subprocess.check_output(
['/usr/bin/asic_detect/asic_detect.sh', '-p']
).decode('utf-8').strip()
except subprocess.CalledProcessError as e:
raise RuntimeError("Failed to get {} mst device: {}".format(self.name, str(e)))
except OSError as e:
raise RuntimeError("Failed to execute mst device detection for {}: {}".format(self.name, str(e)))
if not output:
raise RuntimeError("Failed to get {} mst device: empty output from asic_detect.sh".format(self.name))

Copilot uses AI. Check for mistakes.
return output

def _install_firmware(self, image_path):
if not self._check_file_validity(image_path):
return False

mst_dev = self.__get_mst_device()
if mst_dev is None:
try:
mst_dev = self.__get_mst_device()
except RuntimeError as e:
print("ERROR: {}".format(e))
return False
self.CPLD_FIRMWARE_UPDATE_COMMAND[2] = mst_dev
self.CPLD_FIRMWARE_UPDATE_COMMAND[4] = image_path
Expand Down Expand Up @@ -1027,12 +1023,27 @@ class ComponenetFPGADPU(ComponentCPLD):

CPLD_FIRMWARE_UPDATE_COMMAND = ['cpldupdate', '--cpld_chain', '2', '--gpio', '--print-progress', '']

@contextlib.contextmanager
def _mst_context(self):
try:
subprocess.check_call(['/usr/bin/mst', 'start'], universal_newlines=True)
Copy link

Copilot AI Apr 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ComponenetFPGADPU._mst_context() starts MST without --with_i2cdev, while the firmware manager path starts MST with --with_i2cdev (see mellanox_fw_manager/firmware_coordinator.py). If i2cdev support is required for these updates, this inconsistency can cause failures. Consider using the same mst start --with_i2cdev invocation here (or document why the plain mst start is sufficient for DPU FPGA upgrades).

Suggested change
subprocess.check_call(['/usr/bin/mst', 'start'], universal_newlines=True)
subprocess.check_call(['/usr/bin/mst', 'start', '--with_i2cdev'], universal_newlines=True)

Copilot uses AI. Check for mistakes.
yield
except subprocess.CalledProcessError as e:
logger.log_error("Failed to manage {} mst: {}".format(self.name, str(e)))
raise
finally:
try:
subprocess.check_call(['/usr/bin/mst', 'stop'], universal_newlines=True)
except subprocess.CalledProcessError as e:
logger.log_error("Failed to stop {} mst: {}".format(self.name, str(e)))

def _install_firmware(self, image_path):
self.CPLD_FIRMWARE_UPDATE_COMMAND[5] = image_path

try:
print("INFO: Installing {} firmware update: path={}".format(self.name, image_path))
subprocess.check_call(self.CPLD_FIRMWARE_UPDATE_COMMAND, universal_newlines=True)
with self._mst_context():
subprocess.check_call(self.CPLD_FIRMWARE_UPDATE_COMMAND, universal_newlines=True)
except subprocess.CalledProcessError as e:
print("ERROR: Failed to update {} firmware: {}".format(self.name, str(e)))
return False
Expand Down
14 changes: 9 additions & 5 deletions platform/mellanox/mlnx-platform-api/tests/test_component.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#
# Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2023-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down Expand Up @@ -235,7 +236,7 @@ def test_cpld_component(self, mock_exists, mock_get_meta_data, mock_get_path, mo
c._check_file_validity = mock.MagicMock(return_value=False)
assert not c._install_firmware('')
c._check_file_validity = mock.MagicMock(return_value=True)
c._ComponentCPLD__get_mst_device = mock.MagicMock(return_value=None)
c._ComponentCPLD__get_mst_device = mock.MagicMock(side_effect=RuntimeError('no device'))
assert not c._install_firmware('')
c._ComponentCPLD__get_mst_device = mock.MagicMock(return_value='some dev')
assert c._install_firmware('')
Expand Down Expand Up @@ -295,15 +296,18 @@ def test_cpld_get_component_list_dpu(self):
for index, item in enumerate(component_list):
assert item.name == 'DPU{}_FPGA'.format(index + 1)

def test_cpld_get_mst_device(self):
@mock.patch('sonic_platform.component.subprocess.check_output')
def test_cpld_get_mst_device(self, mock_check_output):
ComponentCPLD.MST_DEVICE_PATH = '/tmp/mst'
os.system('rm -rf /tmp/mst')
c = ComponentCPLD(1)
assert c._ComponentCPLD__get_mst_device() is None
mock_check_output.return_value = b''
assert c._ComponentCPLD__get_mst_device() == ''
os.makedirs(ComponentCPLD.MST_DEVICE_PATH)
assert c._ComponentCPLD__get_mst_device() is None
assert c._ComponentCPLD__get_mst_device() == ''
with open('/tmp/mst/mt0_pci_cr0', 'w+') as f:
f.write('dummy')
mock_check_output.return_value = b'/tmp/mst/mt0_pci_cr0'
assert c._ComponentCPLD__get_mst_device() == '/tmp/mst/mt0_pci_cr0'

@mock.patch('sonic_platform.component.subprocess.check_call')
Expand Down
5 changes: 2 additions & 3 deletions platform/nvidia-bluefield/installer/install.sh.j2
Original file line number Diff line number Diff line change
Expand Up @@ -264,11 +264,10 @@ if [[ $SKIP_FIRMWARE_UPGRADE != "true" ]]; then
bfb_pre_fw_install
fi

ex chroot $sonic_fs_mountpoint mst start
ex chroot $sonic_fs_mountpoint /usr/local/bin/mlnx-fw-manager --nosyslog --verbose
ex chroot $sonic_fs_mountpoint /usr/local/bin/mlnx-fw-manager -m --nosyslog --verbose

if [[ $FORCE_FW_CONFIG_RESET == "true" ]]; then
ex chroot $sonic_fs_mountpoint /usr/local/bin/mlnx-fw-manager --reset --nosyslog --verbose
ex chroot $sonic_fs_mountpoint /usr/local/bin/mlnx-fw-manager -m --reset --nosyslog --verbose
fi

ex umount $sonic_fs_mountpoint/host
Expand Down
Loading