Skip to content

Commit 8ec48b6

Browse files
ggoklaniGaurav Goklani
andauthored
rdma naming infra changes (#67)
Co-authored-by: Gaurav Goklani <ggoklani@ggoklani-thinkpadt14gen4.punetw6.csb>
1 parent 7897f66 commit 8ec48b6

11 files changed

Lines changed: 255 additions & 0 deletions

README.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,20 @@ Default: `true`
105105

106106
Type: `bool`
107107

108+
### hpc_enable_azure_persistent_rdma_naming
109+
110+
Whether to configure a persistent RDMA device naming scheme on Azure:
111+
112+
* Installs `/usr/sbin/azure_persistent_rdma_naming.sh`
113+
* Installs and enables `azure_persistent_rdma_naming.service`
114+
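* Installs a udev rule that triggers the service on InfiniBand device add/change events
* Installs `/usr/sbin/azure_persistent_rdma_naming_monitor.sh` and enables `azure_persistent_rdma_naming_monitor.service`, which periodically re-runs the naming script when a device does not carry an `an`/`ib` name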
115+
116+
This is automatically skipped on non-Azure systems, i.e. when the `system_vendor` fact is not `Microsoft Corporation`.
117+
118+
Default: `true`
119+
120+
Type: `bool`
121+
108122
### hpc_install_system_openmpi
109123

110124
Whether to install OpenMPI that comes from AppStream repositories and does not have Nvidia GPU support.

defaults/main.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ hpc_install_cuda_toolkit: true
2121
hpc_install_hpc_nvidia_nccl: true
2222
hpc_install_nvidia_fabric_manager: true
2323
hpc_install_rdma: true
24+
# Whether to configure persistent RDMA device naming; the tasks additionally
# require the system_vendor fact to be 'Microsoft Corporation'.
hpc_enable_azure_persistent_rdma_naming: true
2425
hpc_install_system_openmpi: true
2526
hpc_build_openmpi_w_nvidia_gpu_support: true
2627
hpc_install_moneo: true

handlers/main.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,18 @@
88
name: waagent
99
state: restarted
1010

11+
# Handler: runs `dnf clean all`; always reported as unchanged.
- name: Clean dnf metadata
  command:
    cmd: dnf clean all
  changed_when: false
14+
1115
# Handler: runs `udevadm control --reload`; always reported as changed.
- name: Reload udev
  command:
    cmd: udevadm control --reload
  changed_when: true
1418

19+
# Handler: runs `udevadm trigger` restricted to the infiniband subsystem;
# always reported as changed.
- name: Trigger udev for infiniband
  command:
    cmd: udevadm trigger --subsystem-match=infiniband
  changed_when: true
22+
1523
# Handler: runs `sysctl --system`; always reported as changed.
- name: Reload sysctl
  command:
    cmd: sysctl --system
  changed_when: true

tasks/main.yml

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -460,6 +460,117 @@
460460
mode: "0644"
461461
notify: Restart waagent
462462

463+
# Installs the DOCA host repository RPM (only when the package facts do not
# list 'doca-host') and then the packages listed in __hpc_doca_packages.
# Skipped when __hpc_server_is_ostree is true or __hpc_is_rh_distro is false.
# NOTE(review): the task name says "RHEL 9 x86_64", but no version or
# architecture condition is visible here - confirm it is enforced elsewhere.
- name: Install DOCA host RPM and minimal DOCA packages (RHEL 9 x86_64)
  when:
    - not ((__hpc_server_is_ostree | d(false)) | bool)
    - __hpc_is_rh_distro
  block:
    - name: Install DOCA host repo RPM (adds doca repositories)
      when: ansible_facts.packages['doca-host'] is not defined
      block:
        - name: Create a temporary directory for DOCA host RPM download
          tempfile:
            state: directory
            prefix: doca_host_rpm_
          register: __hpc_doca_rpm_tempdir

        - name: Download DOCA host RPM
          get_url:
            url: "{{ __hpc_doca_host_rpm_url }}"
            dest: "{{ __hpc_doca_rpm_tempdir.path }}/{{ __hpc_doca_host_rpm_url | basename }}"
            mode: "0644"

        # The key is imported before the RPM is installed from the local file.
        - name: Import DOCA host RPM GPG key
          rpm_key:
            key: "{{ __hpc_doca_host_rpm_gpg_key_url }}"
            state: present

        - name: Install DOCA host RPM
          package:
            name: "{{ __hpc_doca_rpm_tempdir.path }}/{{ __hpc_doca_host_rpm_url | basename }}"
            state: present
            use: dnf
          register: __hpc_doca_host_rpm_install
          # NOTE(review): handlers normally run at the end of the play, i.e.
          # after the DOCA package installation below - confirm this timing
          # is what is wanted for the dnf cache cleanup.
          notify: Clean dnf metadata
      always:
        # Runs even when the download, key import or install above fails, so
        # the temporary directory is never left behind. Skipped when the
        # directory was never created.
        - name: Remove temporary DOCA host RPM download directory
          when: __hpc_doca_rpm_tempdir.path is defined
          file:
            path: "{{ __hpc_doca_rpm_tempdir.path }}"
            state: absent
          changed_when: false

    # Retried until the install succeeds (default retry count and delay).
    - name: Install minimal DOCA packages for RDMA
      package:
        name: "{{ __hpc_doca_packages }}"
        state: present
        use: dnf
      register: __hpc_doca_packages_install
      until: __hpc_doca_packages_install is success
509+
510+
# Azure-only setup of persistent RDMA device names: a rename script with its
# systemd unit, a udev rule that requests that unit, and a monitor script with
# its own unit. Runs only when hpc_enable_azure_persistent_rdma_naming is set
# and the system_vendor fact is 'Microsoft Corporation'.
- name: Configure Azure persistent RDMA naming (systemd + udev)
  when:
    - hpc_enable_azure_persistent_rdma_naming
    - ansible_facts['system_vendor'] == 'Microsoft Corporation'
  block:
    # --- rename script and its unit ---
    - name: Install Azure persistent RDMA naming script
      template:
        dest: /usr/sbin/azure_persistent_rdma_naming.sh
        src: rdma/azure_persistent_rdma_naming.sh.j2
        mode: "0755"
        owner: root
        group: root

    - name: Install systemd service for Azure persistent RDMA naming
      template:
        dest: /etc/systemd/system/azure_persistent_rdma_naming.service
        src: rdma/azure_persistent_rdma_naming.service.j2
        mode: "0644"
        owner: root
        group: root
      register: __hpc_azure_persistent_rdma_naming_unit

    # A changed rule file makes udev reload its rules and replay infiniband
    # events through the two notified handlers.
    - name: Install udev rule to trigger persistent naming on IB device changes
      template:
        dest: /etc/udev/rules.d/99-azure-persistent-rdma-naming.rules
        src: rdma/99-azure-persistent-rdma-naming.rules.j2
        mode: "0644"
        owner: root
        group: root
      notify:
        - Reload udev
        - Trigger udev for infiniband

    # systemd is asked to reload its unit files only when the unit file
    # written above actually changed.
    - name: Enable and start Azure persistent RDMA naming service
      systemd:
        daemon_reload: "{{ __hpc_azure_persistent_rdma_naming_unit.changed | d(false) }}"
        name: azure_persistent_rdma_naming.service
        state: started
        enabled: true

    # --- monitor script and its unit ---
    - name: Install Azure persistent RDMA naming monitor script
      template:
        dest: /usr/sbin/azure_persistent_rdma_naming_monitor.sh
        src: rdma/azure_persistent_rdma_naming_monitor.sh.j2
        mode: "0755"
        owner: root
        group: root

    - name: Install systemd service for Azure persistent RDMA naming monitor
      template:
        dest: /etc/systemd/system/azure_persistent_rdma_naming_monitor.service
        src: rdma/azure_persistent_rdma_naming_monitor.service.j2
        mode: "0644"
        owner: root
        group: root
      register: __hpc_azure_persistent_rdma_naming_monitor_unit

    # Same conditional daemon reload as for the naming service above.
    - name: Enable and start Azure persistent RDMA naming monitor service
      systemd:
        daemon_reload: "{{ __hpc_azure_persistent_rdma_naming_monitor_unit.changed | d(false) }}"
        name: azure_persistent_rdma_naming_monitor.service
        state: started
        enabled: true
573+
463574
- name: Install common OpenMPI packages
464575
when: hpc_install_system_openmpi or hpc_build_openmpi_w_nvidia_gpu_support
465576
package:
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
{{ ansible_managed | comment }}
2+
{{ "system_role:hpc" | comment(prefix="", postfix="") }}
3+
# Trigger persistent naming on infiniband device changes.
4+
# Use systemd activation instead of direct udev RUN= to keep things reliable.
# The rule matches add and change events in the infiniband subsystem, tags the
# device for systemd and requests azure_persistent_rdma_naming.service through
# SYSTEMD_WANTS.
# NOTE(review): systemd.device(5) describes Wants= dependencies added via
# SYSTEMD_WANTS as being acted on when a device first becomes active - confirm
# that "change" events on an already-active device start the service as
# intended.
5+
SUBSYSTEM=="infiniband", ACTION=="add|change", TAG+="systemd", ENV{SYSTEMD_WANTS}="azure_persistent_rdma_naming.service"
6+
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
{{ ansible_managed | comment }}
2+
{{ "system_role:hpc" | comment(prefix="", postfix="") }}
3+
# Unit that runs the Azure persistent RDMA naming script once per start.
[Unit]
4+
Description=Azure persistent RDMA naming
5+
# Ordered after network.target.
After=network.target
6+
7+
[Service]
8+
# oneshot: the unit consists of a single run of the script below.
Type=oneshot
9+
ExecStart=/usr/sbin/azure_persistent_rdma_naming.sh
10+
# Script output is sent to the journal.
StandardOutput=journal
11+
12+
[Install]
13+
# Enabling the unit hooks it into multi-user.target.
WantedBy=multi-user.target
14+
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
#!/usr/bin/env bash
# These are templates, not actual shell scripts, so tell shellcheck to
# ignore the templated parts
# shellcheck disable=all
{{ ansible_managed | comment }}
{{ "system_role:hpc" | comment(prefix="", postfix="") }}
# shellcheck enable=all
set -euo pipefail

# Gives every RDMA device listed by ibdev2netdev a fixed name based on its
# link layer: InfiniBand devices become mlx5_ib<N> and Ethernet devices become
# mlx5_an<N>, each numbered from 0 in the (sorted) order the devices are
# listed. Devices with any other link layer are reported and left untouched.
# Exits 0 without renaming anything when a required tool is missing.

# Path of the rename helper, filled in when the template is rendered.
rdma_rename="{{ __hpc_rdma_rename_path }}"

# Next free index for Ethernet ("an") and InfiniBand ("ib") device names.
an_index=0
ib_index=0

if ! command -v ibdev2netdev >/dev/null 2>&1; then
    echo "ibdev2netdev not found; ensure RDMA tools are installed."
    # Explicit success status, matching the ibv_devinfo check below.
    exit 0
fi

if ! command -v ibv_devinfo >/dev/null 2>&1; then
    echo "ibv_devinfo not found; ensure libibverbs-utils is installed."
    exit 0
fi

# The second space-separated field of each ibdev2netdev line is used as the
# current device name.
for old_device in $(ibdev2netdev -v | sort -n | cut -f2 -d' '); do
    # Pull the value of the "link_layer:" line out of the ibv_devinfo output.
    link_layer=$(ibv_devinfo -d "$old_device" | sed -n 's/^[ \t]*link_layer:[ \t]*\([a-zA-Z]*\)$/\1/p')

    if [ "$link_layer" = "InfiniBand" ]; then
        "$rdma_rename" "$old_device" NAME_FIXED "mlx5_ib${ib_index}"
        ib_index=$((ib_index + 1))
    elif [ "$link_layer" = "Ethernet" ]; then
        "$rdma_rename" "$old_device" NAME_FIXED "mlx5_an${an_index}"
        an_index=$((an_index + 1))
    else
        echo "Unknown device type for $old_device - $link_layer."
    fi
done
38+
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
{{ ansible_managed | comment }}
2+
{{ "system_role:hpc" | comment(prefix="", postfix="") }}
3+
# Long-running unit that keeps the RDMA naming monitor script alive.
[Unit]
4+
Description=Azure persistent RDMA naming Monitor
5+
# Ordered after network.target.
After=network.target
6+
7+
[Service]
8+
Type=simple
9+
ExecStart=/usr/sbin/azure_persistent_rdma_naming_monitor.sh
10+
# The monitor is restarted whenever it exits, after a 60 second pause.
Restart=always
11+
RestartSec=60
12+
# Script output is sent to the journal.
StandardOutput=journal
13+
14+
[Install]
15+
# Enabling the unit hooks it into multi-user.target.
WantedBy=multi-user.target
16+
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#!/usr/bin/env bash
# These are templates, not actual shell scripts, so tell shellcheck to
# ignore the templated parts
# shellcheck disable=all
{{ ansible_managed | comment }}
{{ "system_role:hpc" | comment(prefix="", postfix="") }}
# shellcheck enable=all
set -euo pipefail

# Watchdog for RDMA device names: polls the device list forever and re-runs
# azure_persistent_rdma_naming.sh as soon as it sees a device whose name
# contains neither "an" nor "ib".

# Both tools must be present; otherwise report which one is missing and stop
# with a success status.
for required_tool in ibdev2netdev ibv_devinfo; do
    if ! command -v "${required_tool}" >/dev/null 2>&1; then
        echo "${required_tool} not found; skipping RDMA naming monitor."
        exit 0
    fi
done

while :; do
    for hca_id in $(ibdev2netdev -v | sort -n | cut -d' ' -f2); do
        case "${hca_id}" in
            *an* | *ib*)
                # Name already follows the persistent scheme.
                ;;
            *)
                # One badly named device is enough: redo the renaming
                # (failures are ignored), wait, and rescan from scratch.
                /usr/sbin/azure_persistent_rdma_naming.sh >/dev/null 2>&1 || true
                sleep 60
                break
                ;;
        esac
    done
    sleep 60
done
33+

vars/RedHat_9.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,10 @@ __hpc_rhui_azure_rhel_9_eus_repo:
2222
key: file:///etc/pki/rpm-gpg/RPM-GPG-KEY-microsoft-azure-release
2323
baseurl: https://rhui4-1.microsoft.com/pulp/repos/unprotected/microsoft-azure-rhel9-eus
2424

25+
# RHEL 9 specific RDMA/DOCA settings
# URL of the DOCA host repository RPM that the role downloads and installs.
26+
__hpc_doca_host_rpm_url: "https://www.mellanox.com/downloads/DOCA/DOCA_v3.2.1/host/doca-host-3.2.1-044000_25.10_rhel9.x86_64.rpm"
# URL of the GPG key the role imports before installing the DOCA host RPM.
27+
__hpc_doca_host_rpm_gpg_key_url: "https://www.mellanox.com/downloads/ofed/RPM-GPG-KEY-Mellanox"
28+
2529
# Vars related to RPMs
2630
__hpc_nvidia_driver_stream: 580-dkms
2731
__hpc_nvidia_driver_module: nvidia-driver:{{ __hpc_nvidia_driver_stream }}

0 commit comments

Comments
 (0)