Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion device/mellanox/x86_64-nvidia_sn4280-r0/installer.conf
Original file line number Diff line number Diff line change
@@ -1 +1 @@
ONIE_PLATFORM_EXTRA_CMDLINE_LINUX="libata.force=noncq module_blacklist=mlx5_ib,mlx5_core ima_hash=sha384 amd_iommu=off cpufreq.default_governor=performance"
ONIE_PLATFORM_EXTRA_CMDLINE_LINUX="libata.force=noncq ima_hash=sha384 amd_iommu=off cpufreq.default_governor=performance"
4 changes: 4 additions & 0 deletions device/mellanox/x86_64-nvidia_sn4280-r0/platform.json
Original file line number Diff line number Diff line change
Expand Up @@ -658,6 +658,7 @@
"baud-rate": "115200"
},
"rshim_info": "rshim0",
"rshim_bus_info": "0000:08:00.1",
"bus_info": "0000:08:00.0"
},
"dpu1": {
Expand All @@ -670,6 +671,7 @@
"baud-rate": "115200"
},
"rshim_info": "rshim1",
"rshim_bus_info": "0000:07:00.1",
"bus_info": "0000:07:00.0"
},
"dpu2": {
Expand All @@ -682,6 +684,7 @@
"baud-rate": "115200"
},
"rshim_info": "rshim2",
"rshim_bus_info": "0000:01:00.1",
"bus_info": "0000:01:00.0"
},
"dpu3": {
Expand All @@ -694,6 +697,7 @@
"baud-rate": "115200"
},
"rshim_info": "rshim3",
"rshim_bus_info": "0000:02:00.1",
"bus_info": "0000:02:00.0"
}
}
Expand Down
18 changes: 11 additions & 7 deletions files/build_templates/sonic_debian_extension.j2
Original file line number Diff line number Diff line change
Expand Up @@ -1114,20 +1114,24 @@ sudo LANG=C DEBIAN_FRONTEND=noninteractive chroot $FILESYSTEM_ROOT apt-get -y in
sudo mkdir -p $FILESYSTEM_ROOT/etc/minicom/
sudo cp platform/mellanox/minirc.dfl $FILESYSTEM_ROOT/etc/minicom/

# Install rshim script
sudo install -m 755 platform/mellanox/rshim/files/rshim.sh $FILESYSTEM_ROOT/usr/bin/rshim.sh

# Install rshim services
sudo cp platform/mellanox/rshim/files/rshim@.service $FILESYSTEM_ROOT_USR_LIB_SYSTEMD_SYSTEM/
sudo cp platform/mellanox/rshim/files/rshim-manager.service $FILESYSTEM_ROOT_USR_LIB_SYSTEMD_SYSTEM/

# Install dpuctl script and config file
sudo install -m 755 platform/mellanox/smartswitch/dpuctl/dpuctl.sh $FILESYSTEM_ROOT/usr/bin/dpuctl.sh
sudo install -m 755 platform/mellanox/smartswitch/dpuctl/dpu.conf $FILESYSTEM_ROOT_ETC/mlnx/

# Install dpuctl services
sudo cp platform/mellanox/smartswitch/dpuctl/dpuctl.service $FILESYSTEM_ROOT_USR_LIB_SYSTEMD_SYSTEM/

# Install dpumap script
sudo install -m 755 platform/mellanox/smartswitch/dpumap.sh $FILESYSTEM_ROOT/usr/bin/dpumap.sh

# Install dpu-udev-manager script
sudo install -m 755 platform/mellanox/smartswitch/dpu-udev-manager/dpu-udev-manager.sh $FILESYSTEM_ROOT/usr/bin/dpu-udev-manager.sh

# Install dpu-udev-manager service
sudo install -m 644 platform/mellanox/smartswitch/dpu-udev-manager/dpu-udev-manager.service $FILESYSTEM_ROOT_USR_LIB_SYSTEMD_SYSTEM/
sudo LANG=C chroot $FILESYSTEM_ROOT systemctl enable dpu-udev-manager


{% endif %}

{% if sonic_asic_platform == "nvidia-bluefield" %}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ class DpuInterfaceEnum(Enum):
MIDPLANE_INT = "midplane_interface"
RSHIM_INT = "rshim_info"
PCIE_INT = "bus_info"
RSHIM_PCIE_INT = "rshim_bus_info"


dpu_interface_values = [item.value for item in DpuInterfaceEnum]
Expand Down
85 changes: 19 additions & 66 deletions platform/mellanox/mlnx-platform-api/sonic_platform/dpuctlplat.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@

WAIT_FOR_SHTDN = 120
WAIT_FOR_DPU_READY = 180
WAIT_FOR_PCI_DEV = 60


class OperationType(Enum):
Expand Down Expand Up @@ -104,10 +103,7 @@ def __init__(self, dpu_name):
self.shtdn_state = None
self.dpu_ready_state = None
self.setup_logger()
self.pci_dev_path = None
self.rshim_interface = None
# Use systemd dbus to execute start and stop rshim service
os.environ['DBUS_SESSION_BUS_ADDRESS'] = 'unix:path=/run/dbus/system_bus_socket'
self.pci_dev_path = []
self.verbosity = False

def setup_logger(self, use_print=False):
Expand Down Expand Up @@ -141,45 +137,11 @@ def run_cmd_output(self, cmd, raise_exception=True):

def dpu_pre_shutdown(self):
"""Method to execute shutdown activities for the DPU"""
rshim_op = self.dpu_rshim_service_control("stop")
pci_rem_op = self.dpu_pci_remove()
return rshim_op and pci_rem_op
return self.dpu_pci_remove()

def dpu_post_startup(self):
"""Method to execute all post startup activities for the DPU"""
pci_scan_op = self.dpu_pci_scan()
rshim_op = None
if self.wait_for_pci():
rshim_op = self.dpu_rshim_service_control("start")
if rshim_op and pci_scan_op:
return True
return False

def get_rshim_interface(self):
"""Parse the rshim interface from platform.json, raise Runtime error if the device id is not available"""
if not self.rshim_interface:
interface_name = DeviceDataManager.get_dpu_interface(self.dpu_name, DpuInterfaceEnum.RSHIM_INT.value)
if not interface_name:
raise RuntimeError(f"Unable to Parse rshim information for {self.dpu_name} from Platform.json")
# rshim1 -> rshim@1
self.rshim_interface = interface_name[:5] + "@" + interface_name[5:]
return self.rshim_interface

def dpu_rshim_service_control(self, op):
"""Start/Stop the RSHIM service for the current DPU"""
try:
rshim_cmd = ["dbus-send", "--dest=org.freedesktop.systemd1", "--type=method_call",
"--print-reply", "--reply-timeout=2000",
"/org/freedesktop/systemd1",
f"org.freedesktop.systemd1.Manager.{op.capitalize()}Unit",
f"string:{self.get_rshim_interface()}.service",
"string:replace"]
self.run_cmd_output(rshim_cmd)
# If command fails execution exception is raised , return true if control is still in try block
return True
except Exception as e:
self.log_error(f"Failed to {op} rshim!: {e}")
return False
return self.dpu_pci_scan()

@contextmanager
def get_open_fd(self, path, flag):
Expand All @@ -190,31 +152,19 @@ def get_open_fd(self, path, flag):
os.close(fd)

def get_pci_dev_path(self):
"""Parse the PCIE device ID from platform.json, raise Runtime error if the device id is not available"""
if not self.pci_dev_path:
pci_dev_id = DeviceDataManager.get_dpu_interface(self.dpu_name, DpuInterfaceEnum.PCIE_INT.value)
if not pci_dev_id:
raise RuntimeError(f"Unable to obtain pci device id for {self.dpu_name} from platform.json")
self.pci_dev_path = os.path.join(PCI_DEV_BASE, pci_dev_id, "remove")
return self.pci_dev_path
"""Parse the PCIE devices ID from platform.json, raise Runtime error if the device id is not available"""
if self.pci_dev_path:
return self.pci_dev_path

pci_dev_id = DeviceDataManager.get_dpu_interface(self.dpu_name, DpuInterfaceEnum.PCIE_INT.value)
rshim_pci_dev_id = DeviceDataManager.get_dpu_interface(self.dpu_name, DpuInterfaceEnum.RSHIM_PCIE_INT.value)
if not pci_dev_id or not rshim_pci_dev_id:
raise RuntimeError(f"Unable to obtain PCI device IDs for {self.dpu_name} from platform.json")

self.pci_dev_path = [os.path.join(PCI_DEV_BASE, pci_dev_id),
os.path.join(PCI_DEV_BASE, rshim_pci_dev_id)]

def wait_for_pci(self):
"""Wait for the PCI device folder in the PCI Path, required before starting rshim"""
try:
with self.get_open_fd(PCI_DEV_BASE, os.O_RDONLY) as dir_fd:
if os.path.exists(os.path.dirname(self.get_pci_dev_path())):
return True
poll_obj = poll()
poll_obj.register(dir_fd, POLLIN)
start = time.monotonic()
while (time.monotonic() - start) < WAIT_FOR_PCI_DEV:
events = poll_obj.poll(WAIT_FOR_PCI_DEV * 1000)
if events:
if os.path.exists(os.path.dirname(self.get_pci_dev_path())):
return True
return os.path.exists(os.path.dirname(self.get_pci_dev_path()))
except Exception as e:
self.log_error(f"Unable to wait for PCI device:{e}")
return self.pci_dev_path

def write_file(self, file_name, content_towrite):
"""Write given value to file only if file exists"""
Expand Down Expand Up @@ -297,7 +247,10 @@ def _power_on(self):
def dpu_pci_remove(self):
"""Per DPU PCI remove API"""
try:
self.write_file(self.get_pci_dev_path(), OperationType.SET.value)
for pci_dev_path in self.get_pci_dev_path():
remove_path = os.path.join(pci_dev_path, "remove")
if os.path.exists(remove_path):
self.write_file(remove_path, OperationType.SET.value)
return True
except Exception:
self.log_info(f"Failed PCI Removal!")
Expand Down
12 changes: 8 additions & 4 deletions platform/mellanox/mlnx-platform-api/tests/test_device_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,31 +120,35 @@ def test_dpu_interface_data(self, mock_load_json):
"Ethernet224": "Ethernet0"
},
"rshim_info": "rshim0",
"bus_info": "0000:08:00.0"
"bus_info": "0000:08:00.0",
"rshim_bus_info": "0000:08:00.1"
},
"dpu1": {
"midplane_interface": "dpu1",
"interface": {
"Ethernet232": "Ethernet0"
},
"rshim_info": "rshim1",
"bus_info": "0000:07:00.0"
"bus_info": "0000:07:00.0",
"rshim_bus_info": "0000:07:00.1"
},
"dpu2": {
"midplane_interface": "dpu2",
"interface": {
"Ethernet240": "Ethernet0"
},
"rshim_info": "rshim2",
"bus_info": "0000:01:00.0"
"bus_info": "0000:01:00.0",
"rshim_bus_info": "0000:01:00.1"
},
"dpu3": {
"midplane_interface": "dpu3",
"interface": {
"Ethernet248": "Ethernet0"
},
"rshim_info": "rshim3",
"bus_info": "0000:02:00.0"
"bus_info": "0000:02:00.0",
"rshim_bus_info": "0000:02:00.1"
}
}
mock_load_json.return_value = mock_value
Expand Down
Loading
Loading