Skip to content

Commit c0b7ace

Browse files
committed
Adding necessary test files and changes to test using spot VMs
1 parent 5b89bdc commit c0b7ace

File tree

7 files changed

+389
-1
lines changed

7 files changed

+389
-1
lines changed

tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@
209209
- name: Wait until Slurm key exists
210210
ansible.builtin.wait_for:
211211
path: /etc/slurm/slurm.key
212-
timeout: 600 # Waits for up to 10 minutes for the file to appear
212+
timeout: 600 # Waits for up to 10 minutes for the file to appear
213213
when: key_type == 'slurm'
214214

215215
- name: Count Slurm nodes
@@ -232,6 +232,23 @@
232232
loop_control:
233233
loop_var: test
234234

235+
rescue:
# These tasks run only when a task in the guarded block has failed.
- name: Display test failure message
  ansible.builtin.debug:
    msg: "A task within the integration tests failed. Conditional preemption check will run for a3-megagpu-8g spot instances."

# The preemption check is limited to spot runs on a3-megagpu-8g.
# custom_vars is deliberately NOT re-declared through `vars:` here: a task
# variable written as `custom_vars: "{{ custom_vars }}"` refers to itself and
# Ansible rejects it as a recursive template. The `when:` clause below already
# reads custom_vars, so it is in scope for this task; presumably the included
# file sees it the same way (verify against how the test vars file is loaded).
- name: Check for recent preemptions for a3-megagpu-8g
  ansible.builtin.include_tasks: test-validation/test-a3m-preemption.yml
  when:
  - custom_vars.enable_spot | default(false) | bool
  - custom_vars.machine_type | default('') == 'a3-megagpu-8g'

# A completed rescue section clears the failed state, so fail explicitly to
# keep the overall run red after the diagnostics above have been collected.
- name: Propagate original failure after rescue tasks
  ansible.builtin.fail:
    msg: "Integration tests failed. Rescue tasks were executed."
251+
235252
## Always cleanup, even on failure
236253
always:
237254
- name: Ensure all nodes are powered down
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
---
15+
# Validates custom_vars.instance_labels, then applies those labels to every
# instance reported by tasks/get_instance_ids.yml.
- name: Check if instance_labels are provided and are a dictionary
  ansible.builtin.assert:
    that:
    - custom_vars.instance_labels is defined
    - custom_vars.instance_labels is mapping
    - custom_vars.instance_labels | length > 0
    fail_msg: "custom_vars.instance_labels must be a non-empty dictionary."

# Creates label string of the form "key1=value1,key2=value2" to add to the instances created during the test.
# Also, transforms key and value to lowercase to be compliant with the gcloud command.
- name: Prepare label string
  ansible.builtin.set_fact:
    labels_string: "{{ custom_vars.instance_labels.items() | map(attribute=0) | map('string') | map('lower') | zip(custom_vars.instance_labels.items() | map(attribute=1) | map('string') | map('lower')) | map('join', '=') | join(',') }}"

- name: Gather instance information
  ansible.builtin.import_tasks: tasks/get_instance_ids.yml

# `instances` is consumed as a registered result whose JSON payload is in
# `.stdout` (presumably registered by tasks/get_instance_ids.yml, which is not
# visible here). Parse it before any emptiness check: the length of the
# result mapping itself is never 0. `default('[]', true)` also covers an
# empty stdout, which from_json would otherwise reject.
- name: Set up variable for instances found
  ansible.builtin.set_fact:
    gce_instances_list: "{{ instances.stdout | default('[]', true) | from_json }}"

- name: Report if no instances found
  ansible.builtin.debug:
    msg: "No instances found with label ghpc_deployment={{ deployment_name }} in project {{ project }}."
  when: gce_instances_list | length == 0

# Updates labels on instances created during the build to monitor metrics such as cost, etc.
- name: Add spot label to each node
  ansible.builtin.command: >-
    gcloud compute instances update {{ item.name }}
    --project={{ project }}
    --zone={{ item.zone | basename }}
    --update-labels={{ labels_string }}
  loop: "{{ gce_instances_list }}"
  loop_control:
    label: "{{ item.name }} ({{ item.zone | basename }})"
  when: gce_instances_list | length > 0
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
---
16+
# Looks up the deployment's instances and reports any preemption events that
# Cloud Audit Logs recorded for them within `query_window`.
- name: Gather instance information
  ansible.builtin.import_tasks: tasks/get_instance_ids.yml

# `instances` is consumed as a registered result whose JSON payload is in
# `.stdout` (presumably registered by tasks/get_instance_ids.yml, which is not
# visible here). It is parsed first so that the emptiness check below looks at
# the instance list rather than at the result mapping, whose length is never 0.
# `default('[]', true)` also covers an empty stdout.
- name: Set up variables for preemption query
  ansible.builtin.set_fact:
    gce_instances_list: "{{ instances.stdout | default('[]', true) | from_json }}"
    query_window: "4h"  # How far back to look for preemptions

- name: Report if no instances found
  ansible.builtin.debug:
    msg: "No instances found with label ghpc_deployment={{ deployment_name }} in project {{ project }}."
  when: gce_instances_list | length == 0

# Creating a filter string of the form "resource.labels.instance_id=id1 OR resource.labels.instance_id=id2"
- name: Build instance ID filter string
  ansible.builtin.set_fact:
    instance_id_filter: "{{ gce_instances_list | map(attribute='id') | map('regex_replace', '^(.*)$', 'resource.labels.instance_id=\"\\1\"') | join(' OR ') }}"
  when: gce_instances_list | length > 0

# The query only reads logs, so it never counts as a change.
- name: Query Cloud Audit Logs for any preemption events for these nodes
  ansible.builtin.shell: |
    gcloud logging read '
    resource.type="gce_instance"
    AND protoPayload.methodName="compute.instances.preempted"
    AND log_id("cloudaudit.googleapis.com/system_event")
    AND ({{ instance_id_filter }})
    ' --project={{ project }} --freshness={{ query_window }} --format="json"
  register: preemption_logs
  changed_when: false
  when: instance_id_filter is defined and instance_id_filter != ""

# When the query was skipped there is no stdout; fall back to an empty JSON
# list in that case and when stdout is an empty string.
- name: Parse preemption logs
  ansible.builtin.set_fact:
    preemption_logs_parsed: "{{ (preemption_logs | default({})).get('stdout', '[]') | default('[]', true) | from_json }}"

- name: Display Instance Names being checked
  ansible.builtin.debug:
    msg: "Instance Names being checked for preemption in {{ deployment_name }}: {{ gce_instances_list | map(attribute='name') | list }}"
  when: gce_instances_list | length > 0

# Displays preempted nodes if any, else, gives message that no preemption events found
- name: Report preemption status
  ansible.builtin.debug:
    msg: >-
      {% if preemption_logs_parsed | length > 0 %}
      Preemption Events Found in the last {{ query_window }}: {{ preemption_logs_parsed }}
      {% else %}
      No preemption events found for the checked instances in {{ deployment_name }} in the last {{ query_window }}.
      {% endif %}
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
---
16+
# Build tags attached to this Cloud Build configuration.
tags:
- m.custom-image
- m.startup-script
- slurm6
- m.filestore
- m.schedmd-slurm-gcp-v6-controller
- m.schedmd-slurm-gcp-v6-login
- m.schedmd-slurm-gcp-v6-nodeset
- m.schedmd-slurm-gcp-v6-partition
- m.vpc
- m.cloud-storage-bucket
- m.multivpc
- m.private-service-access

# Overall build timeout: 14400 seconds (4 hours).
timeout: 14400s
steps:
# Step 1: invoke check_running_build.sh with the path of the on-spot build
# config (presumably this file; its own filename is not visible here).
- id: check_for_running_build
  name: gcr.io/cloud-builders/gcloud
  script: "/workspace/tools/cloud-build/check_running_build.sh /workspace/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-onspot-slurm-ubuntu.yaml"

# Step 2: run test_spot_vm.sh in the test-runner image with the environment
# below; `set -x` traces each shell command in the build log.
- id: ml-a3-megagpu-slurm
  name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner
  entrypoint: /bin/bash
  env:
  # Ansible settings
  - "ANSIBLE_HOST_KEY_CHECKING=false"
  - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg"

  # Inputs for the A3-MegaGPU test run
  - "MACHINE_TYPE=a3-megagpu-8g"
  - "IMAGE_FAMILY=slurm-a3mega"
  - "NUM_NODES=4"
  - "BLUEPRINT_PATH=/workspace/examples/machine-learning/a3-megagpu-8g/a3mega-slurm-blueprint.yaml"
  - "TEST_VARS_FILE=@tools/cloud-build/daily-tests/tests/ml-a3-megagpu-onspot-slurm-ubuntu.yml"
  - "INSTANCE_PREFIX=a3msp"
  - "PROJECT_ID=$PROJECT_ID"
  - "BUILD_ID=$BUILD_ID"
  args:
  - -c
  - |
    set -x
    /workspace/tools/cloud-build/test_spot_vm.sh
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
---
16+
17+
# region, zone must be defined in build file with --extra-vars flag!
18+
# Test variables for the A3 Mega Slurm integration test on spot VMs.
test_name: a3m-onspot-slurm
# Templated values are quoted throughout so YAML always reads them as strings.
deployment_name: "a3m-{{ build }}-onspot-slurm"
# Cluster name is the fixed prefix plus the first five characters of `build`.
slurm_cluster_name: "a3msp{{ build[0:5] }}"
workspace: /workspace
blueprint_yaml: "{{ workspace }}/examples/machine-learning/a3-megagpu-8g/a3mega-slurm-blueprint.yaml"
login_node: "{{ slurm_cluster_name }}-login-*"
controller_node: "{{ slurm_cluster_name }}-controller"
network: "{{ deployment_name }}-net-0"
sub_network: "{{ deployment_name }}-sub-net-0"
# Task files listed for the post-deploy phase; add-labels.yml comes first.
post_deploy_tests:
- tasks/add-labels.yml
- test-validation/test-mounts.yml
- test-validation/test-partitions.yml
- test-validation/test-enroot.yml
- test-validation/test-gpus-slurm.yml
post_destroy_tasks:
- post-destroy-tasks/delete-image.yml
custom_vars:
  gpu_count: 8
  gpu_partition: a3mega
  test_persistenced: true
  partitions:
  - a3mega
  - debug
  mounts:
  - /home
  # Labels that tasks/add-labels.yml applies to the test instances.
  instance_labels:
    a3mega_onspot: true
  # enable_spot and machine_type gate the preemption check in the playbook's
  # rescue section.
  enable_spot: true
  machine_type: "a3-megagpu-8g"
cli_deployment_vars:
  network_name_system: "{{ network }}"
  subnetwork_name_system: "{{ sub_network }}"
  region: "{{ region }}"
  zone: "{{ zone }}"
  slurm_cluster_name: "{{ slurm_cluster_name }}"
  disk_size_gb: 200
  a3mega_cluster_size: 2
  enable_ops_agent: "true"
  enable_nvidia_dcgm: "true"
  enable_nvidia_persistenced: true
  final_image_family: "{{ deployment_name }}-u22"
  a3mega_enable_spot_vm: true

tools/cloud-build/daily-tests/tests/ml-a3-megagpu-slurm-ubuntu.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ network: "{{ deployment_name }}-net-0"
2828
nccl_test_path: "examples/machine-learning/a3-megagpu-8g/nccl-tests"
2929
sub_network: "{{ deployment_name }}-sub-net-0"
3030
post_deploy_tests:
31+
- tasks/add-labels.yml
3132
- test-validation/test-mounts.yml
3233
- test-validation/test-partitions.yml
3334
- test-validation/test-default-partition.yml
@@ -45,6 +46,8 @@ custom_vars:
4546
- debug
4647
mounts:
4748
- /home
49+
instance_labels:
50+
a3mega_onspot: false
4851
cli_deployment_vars:
4952
network_name_system: "{{ network }}"
5053
subnetwork_name_system: "{{ sub_network}}"

0 commit comments

Comments
 (0)