Skip to content

Commit 3d5bef9

Browse files
[multi-asic][Mellanox] Add multi-ASIC support for generate_dump and update FW upgrade script (#4192)
- What I did:
  Added multi-ASIC support for generate_dump and updated the FW upgrade script.
- How I did it:
  1. Refactored collect_mellanox() to support multi-ASIC architecture.
  2. Added the collect_mellanox_sai_sdk_dump() function to collect SAI SDK dumps per ASIC.
  3. Process CMIS host management files for each ASIC instance separately.
  4. Collect SAI SDK dumps in parallel for all ASICs using background processes.
  5. Updated fast-reboot to use mlnx-fw-manager instead of mlnx-fw-upgrade.sh.
  6. Fixed file paths to be relative to the SKU folder for multi-ASIC setups.
  7. Support namespace-aware command execution for multi-ASIC environments.
- How to verify it:
  Run regression tests.

Signed-off-by: Oleksandr Ivantsiv <oivantsiv@nvidia.com>
1 parent 8451f01 commit 3d5bef9

2 files changed

Lines changed: 77 additions & 36 deletions

File tree

scripts/fast-reboot

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -757,7 +757,7 @@ if [[ "$sonic_asic_type" == "mellanox" ]]; then
757757
MLNX_EXIT_FW_ERROR=100
758758
MLNX_EXIT_FFB_FAILURE=101
759759
760-
MLNX_FW_UPGRADE_SCRIPT="/usr/bin/mlnx-fw-upgrade.sh"
760+
MLNX_FW_UPGRADE_SCRIPT="/usr/local/bin/mlnx-fw-manager"
761761
762762
763763
if [[ "$REBOOT_TYPE" = "fastfast-reboot" ]]; then

scripts/generate_dump

Lines changed: 76 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1428,6 +1428,43 @@ save_symlink() {
14281428
echo "[ save_symlink:$filename] : $(($end_t-$start_t)) msec" >> $TECHSUPPORT_TIME_INFO
14291429
}
14301430

1431+
###############################################################################
# Collect the Mellanox SAI SDK dump of one syncd container.
# Globals:
#  CMD_PREFIX
# Arguments:
#  $1 - namespace option string handed to sonic-db-cli; the multi-ASIC caller
#       passes "-n asic<N>", the single-ASIC caller passes an empty string
#  $2 - ASIC suffix appended to the dump folder and dump file names
#       (e.g. "_asic0"); may be empty
#  $3 - name of the syncd container to run saisdkdump in
# Outputs:
#  Prints "Failed to collect saisdkdump." when saisdkdump exits non-zero
# Returns:
#  0 when the container is not running or port init is not done; otherwise
#  the status of the final cleanup command
###############################################################################
collect_mellanox_sai_sdk_dump() {
    local namespace=$1
    local asic_id=$2
    local container_name=$3

    local sai_dump_folder="/tmp/saisdkdump${asic_id}"
    local sai_dump_filename
    sai_dump_filename="${sai_dump_folder}/sai_sdk_dump${asic_id}_$(date +"%m_%d_%Y_%I_%M_%p")"

    # Nothing to collect unless the syncd container is up.
    if [[ "$(docker container inspect -f '{{.State.Running}}' "$container_name")" != "true" ]]; then
        return 0
    fi

    # Run saisdkdump only after the create_switch is known to be successful.
    # $namespace is deliberately unquoted: it carries zero or two words
    # ("-n asic<N>") that must reach sonic-db-cli as separate arguments.
    # shellcheck disable=SC2086
    if [[ "$(sonic-db-cli ${namespace} APPL_DB EXISTS PORT_TABLE:PortInitDone)" != "1" ]]; then
        return 0
    fi

    ${CMD_PREFIX}docker exec "$container_name" mkdir -p "$sai_dump_folder"
    ${CMD_PREFIX}docker exec "$container_name" saisdkdump -f "$sai_dump_filename"
    local dump_rc=$?

    # A failed dump is reported but does not stop collection of whatever
    # files were produced.
    if (( dump_rc != 0 )); then
        echo "Failed to collect saisdkdump."
    fi

    copy_from_docker "$container_name" "$sai_dump_folder" "$sai_dump_folder"

    # Glob instead of parsing `ls` so names with whitespace stay intact.
    local file
    for file in "$sai_dump_folder"/*; do
        # An unmatched glob stays literal; skip it when the folder is empty.
        [[ -e "$file" ]] || continue
        save_file "$file" sai_sdk_dump true
    done

    # Remove the temporary dump folder on the host and inside the container.
    ${CMD_PREFIX}rm -rf "$sai_dump_folder"
    ${CMD_PREFIX}docker exec "$container_name" rm -rf "$sai_dump_folder"
}
1467+
14311468
###############################################################################
14321469
# Collect Mellanox specific information
14331470
# Globals:
@@ -1440,40 +1477,57 @@ save_symlink() {
14401477
collect_mellanox() {
14411478
trap 'handle_error $? $LINENO' ERR
14421479
local timeout_cmd="timeout --foreground ${TIMEOUT_MIN}m"
1443-
local sai_dump_folder="/tmp/saisdkdump"
1444-
local sai_dump_filename="${sai_dump_folder}/sai_sdk_dump_$(date +"%m_%d_%Y_%I_%M_%p")"
1480+
14451481
local platform=$(python3 -c "from sonic_py_common import device_info; print(device_info.get_platform())")
14461482
local platform_folder="/usr/share/sonic/device/${platform}"
14471483
local hwsku=$(python3 -c "from sonic_py_common import device_info; print(device_info.get_hwsku())")
14481484
local is_smartswitch=$(python3 -c "from sonic_py_common import device_info; print(device_info.is_smartswitch())")
14491485
local sku_folder="/usr/share/sonic/device/${platform}/${hwsku}"
14501486
local cmis_host_mgmt_files=(
1451-
"/tmp/nv-syncd-shared/sai.profile"
1452-
"${sku_folder}/pmon_daemon_control.json"
1453-
"${sku_folder}/media_settings.json"
1454-
"${sku_folder}/optics_si_settings.json"
1487+
"pmon_daemon_control.json"
1488+
"media_settings.json"
1489+
"optics_si_settings.json"
14551490
)
14561491

1457-
if [[ "$( docker container inspect -f '{{.State.Running}}' syncd )" == "true" ]]; then
1458-
if [[ x"$(sonic-db-cli APPL_DB EXISTS PORT_TABLE:PortInitDone)" == x"1" ]]; then
1459-
# Run saisdkdump only after the create_switch is known to be successful
1460-
${CMD_PREFIX}docker exec syncd mkdir -p $sai_dump_folder
1461-
${CMD_PREFIX}docker exec syncd saisdkdump -f $sai_dump_filename
1492+
local pids=()
1493+
for ((i=0; i<NUM_ASICS; i++)); do
1494+
local namespace=""
1495+
local asic_id=
1496+
local container_name="syncd"
14621497

1463-
if [ $? != 0 ]; then
1464-
echo "Failed to collect saisdkdump."
1465-
fi
1498+
local cmis_host_mgmt_path="cmis-host-mgmt"
1499+
local sku_folder_path="${sku_folder}"
14661500

1467-
copy_from_docker syncd $sai_dump_folder $sai_dump_folder
1468-
echo "$sai_dump_folder"
1469-
for file in `ls $sai_dump_folder`; do
1470-
save_file ${sai_dump_folder}/${file} sai_sdk_dump true
1471-
done
1501+
if [[ "$NUM_ASICS" > 1 ]]; then
1502+
namespace="-n asic$i"
1503+
asic_id="_asic${i}"
1504+
container_name="syncd$i"
14721505

1473-
${CMD_PREFIX}rm -rf $sai_dump_folder
1474-
${CMD_PREFIX}docker exec syncd rm -rf $sai_dump_folder
1506+
cmis_host_mgmt_path="cmis-host-mgmt/asic${i}"
1507+
sku_folder_path="${sku_folder}/$i"
1508+
fi
1509+
1510+
${CMD_PREFIX}save_file "/tmp/nv-syncd-shared/sai.profile" "$cmis_host_mgmt_path" false true
1511+
1512+
if [[ ! -f "${sku_folder_path}/pmon_daemon_control.json" && -f "${platform_folder}/pmon_daemon_control.json" ]]; then
1513+
${CMD_PREFIX}save_file "${platform_folder}/pmon_daemon_control.json" "$cmis_host_mgmt_path" false true
14751514
fi
1476-
fi
1515+
1516+
for file in "${cmis_host_mgmt_files[@]}"; do
1517+
if [[ -f "${sku_folder_path}/${file}" ]]; then
1518+
${CMD_PREFIX}save_file "${sku_folder_path}/${file}" "$cmis_host_mgmt_path" false true
1519+
fi
1520+
done
1521+
1522+
collect_mellanox_sai_sdk_dump "$namespace" "$asic_id" "$container_name" &
1523+
pids+=($!)
1524+
1525+
done
1526+
1527+
# Wait for all background processes to complete
1528+
for pid in "${pids[@]}"; do
1529+
wait $pid
1530+
done
14771531

14781532
# collect the sdk dump
14791533
local sdk_dbg_folder="/var/log/sdk_dbg"
@@ -1517,19 +1571,6 @@ collect_mellanox() {
15171571

15181572
save_sdk_sysfs "$sdk_sysfs_src_path" "$sdk_sysfs_dest_path" "${excludes_sysfs_files[@]}" &
15191573

1520-
# Save CMIS-host-management related files
1521-
local cmis_host_mgmt_path="cmis-host-mgmt"
1522-
1523-
for file in "${cmis_host_mgmt_files[@]}"; do
1524-
if [[ -f "${file}" ]]; then
1525-
${CMD_PREFIX}save_file "${file}" "$cmis_host_mgmt_path" false true
1526-
fi
1527-
done
1528-
1529-
if [[ ! -f "${sku_folder}/pmon_daemon_control.json" && -f "${platform_folder}/pmon_daemon_control.json" ]]; then
1530-
${CMD_PREFIX}save_file "${platform_folder}/pmon_daemon_control.json" "$cmis_host_mgmt_path" false true
1531-
fi
1532-
15331574
save_cmd "show interfaces autoneg status" "autoneg.status"
15341575

15351576
wait

0 commit comments

Comments
 (0)