@@ -20,7 +20,7 @@ vars:
project_id: # supply project ID
region: # supply region
zone: # supply zone
a3u_cluster_size: # supply cluster size
cluster_size: # supply cluster size
# Image settings
base_image:
project: ubuntu-os-accelerator-images
@@ -44,9 +44,9 @@ vars:
base_network_name: $(vars.deployment_name)

#Provisioning models (set to true or fill in reservation name, pick only one)
a3u_reservation_name: "" # supply reservation name
a3u_dws_flex_enabled: false
a3u_enable_spot_vm: false
reservation_name: "" # supply reservation name
dws_flex_enabled: false
enable_spot_vm: false
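The comment above asks for exactly one provisioning model to be active. As a minimal sketch (variable names follow the renamed, un-prefixed form introduced in this change; the reservation name is a placeholder, not a real reservation), a reservation-backed deployment would fill these in as:

    # Sketch only: exactly one provisioning model is active at a time.
    reservation_name: "my-a3u-reservation"  # placeholder, supply your own reservation
    dws_flex_enabled: false                 # leave the other two models off
    enable_spot_vm: false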

deployment_groups:
- group: image-env
@@ -120,7 +120,7 @@ deployment_groups:
destination: install_slurm.sh
content: |
#!/bin/bash
set -e -o pipefail
set -ex -o pipefail
ansible-pull \
-U https://github.com/GoogleCloudPlatform/slurm-gcp -C $(vars.build_slurm_from_git_ref) \
-i localhost, --limit localhost --connection=local \
@@ -140,16 +140,17 @@ deployment_groups:
destination: install-cuda-toolkit.sh
content: |
#!/bin/bash
set -e -o pipefail
set -ex -o pipefail
add-nvidia-repositories -y
apt update -y
apt install -y cuda-toolkit-12-8
apt install -y datacenter-gpu-manager-4-cuda12
apt install -y datacenter-gpu-manager-4-dev
- type: ansible-local
destination: settings_nvidia_dcgm.yml
content: |
---
- name: Nvidia and DGMA settings
- name: Nvidia CUDA and DGMA settings
hosts: all
become: true
tasks:
@@ -217,7 +218,7 @@ deployment_groups:

- group: image
modules:
- id: slurm-a3ultra-image
- id: slurm-image
source: modules/packer/custom-image
kind: packer
settings:
@@ -253,7 +254,7 @@ deployment_groups:

- group: cluster-env
modules:
- id: a3ultra-slurm-net-0
- id: slurm-net-0
source: modules/network/vpc
settings:
network_name: $(vars.base_network_name)-net-0
@@ -271,7 +272,7 @@ deployment_groups:
- protocol: udp
- protocol: icmp

- id: a3ultra-slurm-net-1
- id: slurm-net-1
source: modules/network/vpc
settings:
network_name: $(vars.base_network_name)-net-1
@@ -289,7 +290,7 @@ deployment_groups:
- protocol: udp
- protocol: icmp

- id: a3ultra-slurm-rdma-net
- id: slurm-rdma-net
source: modules/network/gpu-rdma-vpc
settings:
network_name: $(vars.base_network_name)-rdma-net
@@ -304,7 +305,7 @@ deployment_groups:
- id: homefs
source: modules/file-system/filestore
use:
- a3ultra-slurm-net-0
- slurm-net-0
settings:
filestore_tier: HIGH_SCALE_SSD
size_gb: 10240
@@ -318,7 +319,7 @@ deployment_groups:

# - id: private_service_access
# source: community/modules/network/private-service-access
# use: [a3ultra-slurm-net-0]
# use: [slurm-net-0]

# To use Managed Lustre for the shared /home directory:
# 1. Comment out the filestore block above and the `filestore_ip_range` line in the vars block.
@@ -327,7 +328,7 @@ deployment_groups:
# - id: homefs
# source: modules/file-system/managed-lustre
# use:
# - a3ultra-slurm-net-0
# - slurm-net-0
# - private_service_access
# settings:
# size_gib: 36000
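Pulling the commented fragments above together, a sketch of the uncommented Managed Lustre configuration (module sources and the size_gib value are taken directly from the comments; treat this as a sketch, not a drop-in replacement) would look roughly like:

    - id: private_service_access
      source: community/modules/network/private-service-access
      use: [slurm-net-0]

    - id: homefs
      source: modules/file-system/managed-lustre
      use:
      - slurm-net-0
      - private_service_access
      settings:
        size_gib: 36000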
@@ -431,7 +432,7 @@ deployment_groups:

- group: cluster
modules:
- id: a3ultra_startup
- id: startup
source: modules/scripts/startup-script
settings:
local_ssd_filesystem:
@@ -541,9 +542,9 @@ deployment_groups:
fi
mode: '0644'
handlers:
- name: Reload SystemD
ansible.builtin.systemd:
daemon_reload: true
- name: Reload SystemD
ansible.builtin.systemd:
daemon_reload: true
Comment on lines +545 to +547 (contributor review, severity: critical):

The indentation of this handler definition is incorrect, which will cause a YAML parsing error. The list item (- name: ...) under handlers must be indented. It seems to have been incorrectly de-indented.

              - name: Reload SystemD
                ansible.builtin.systemd:
                  daemon_reload: true
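For context on the suggested fix, handlers in an Ansible play form a list nested under the handlers: key, at the same nesting depth as the entries under tasks:. A minimal, self-contained sketch of the expected shape (play and task names are illustrative, not taken from the blueprint):

    - name: Illustrative play showing handler indentation
      hosts: all
      become: true
      tasks:
        - name: Install an example unit file
          ansible.builtin.copy:
            dest: /etc/systemd/system/example.service
            content: |
              [Unit]
              Description=Example service
          notify: Reload SystemD
      handlers:
        - name: Reload SystemD
          ansible.builtin.systemd:
            daemon_reload: true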


- type: ansible-local
destination: enable_dcgm.yml
@@ -596,25 +597,24 @@ deployment_groups:
state: "{{ 'started' if enable_nvidia_persistenced else 'stopped' }}"
enabled: "{{ enable_nvidia_persistenced }}"

- id: a3_ultra_nodeset
- id: nodeset
source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
use: [a3ultra-slurm-net-0, a3ultra_startup]
use: [slurm-net-0, startup]
settings:
bandwidth_tier: gvnic_enabled
machine_type: a3-ultragpu-8g

enable_public_ips: true
node_count_static: $(vars.a3u_cluster_size)
node_count_static: $(vars.cluster_size)
node_count_dynamic_max: 0
enable_placement: false
disk_type: hyperdisk-balanced
on_host_maintenance: TERMINATE

#Provisioning models
reservation_name: $(vars.a3u_reservation_name)
enable_spot_vm: $(vars.a3u_enable_spot_vm)
reservation_name: $(vars.reservation_name)
enable_spot_vm: $(vars.enable_spot_vm)
dws_flex:
enabled: $(vars.a3u_dws_flex_enabled)
enabled: $(vars.dws_flex_enabled)

advanced_machine_features:
threads_per_core: null # Use platform default value
@@ -626,7 +626,7 @@ deployment_groups:
$(concat(
[{
network=null,
subnetwork=a3ultra-slurm-net-1.subnetwork_self_link,
subnetwork=slurm-net-1.subnetwork_self_link,
subnetwork_project=vars.project_id,
nic_type="GVNIC",
queue_count=null,
@@ -636,13 +636,13 @@ deployment_groups:
ipv6_access_config=[],
alias_ip_range=[]
}],
a3ultra-slurm-rdma-net.subnetwork_interfaces
slurm-rdma-net.subnetwork_interfaces
))
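The concat() expression above assembles the nodeset's additional network interfaces from two pieces: a single GVNIC interface on slurm-net-1, followed by the interface list exported by the slurm-rdma-net module. Conceptually, the result has roughly this shape (an illustrative expansion only; the real subnetwork links and the RDMA interface list are resolved from module outputs at deploy time):

    # Illustrative shape only; values come from module outputs at deploy time.
    additional_networks:
    - network: null
      subnetwork: <slurm-net-1 subnetwork self link>
      subnetwork_project: <project_id>
      nic_type: GVNIC
      queue_count: null
      ipv6_access_config: []
      alias_ip_range: []
      # (remaining interface fields from the expression above are omitted in this sketch)
    # ...plus every interface exported by slurm-rdma-net.subnetwork_interfaces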

- id: a3_ultra_partition
- id: partition
source: community/modules/compute/schedmd-slurm-gcp-v6-partition
use:
- a3_ultra_nodeset
- nodeset
settings:
exclusive: false
partition_name: a3ultra
@@ -654,7 +654,7 @@ deployment_groups:

- id: slurm_login
source: community/modules/scheduler/schedmd-slurm-gcp-v6-login
use: [a3ultra-slurm-net-0]
use: [slurm-net-0]
settings:
disk_size_gb: 300
enable_login_public_ips: true
@@ -669,7 +669,7 @@ deployment_groups:
content: |
#!/bin/bash
SLURM_ROOT=/opt/apps/adm/slurm
PARTITION_NAME=$(a3_ultra_partition.partitions[0].partition_name)
PARTITION_NAME=$(partition.partitions[0].partition_name)
mkdir -m 0755 -p "${SLURM_ROOT}/scripts"
# enable a GPU health check that runs at the completion of all jobs on A3U nodes
mkdir -p "${SLURM_ROOT}/partition-${PARTITION_NAME}-epilog_slurmd.d"
@@ -688,8 +688,8 @@ deployment_groups:
- id: slurm_controller
source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller
use:
- a3ultra-slurm-net-0
- a3_ultra_partition
- slurm-net-0
- partition
- slurm_login
- homefs
- gcs_bucket