Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions cmd/ec2geninfo/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"fmt"
"os"
"regexp"
"text/template"

"github.com/aws/aws-sdk-go-v2/aws"
Expand Down Expand Up @@ -120,6 +121,7 @@ func getEC2Instances(region string, instances map[string]InstanceInfo) (map[stri
}

paginator := ec2.NewDescribeInstanceTypesPaginator(client, input)
unsupportedRegexp, _ := regexp.Compile("^(p2).*")

for paginator.HasMorePages() {
page, err := paginator.NextPage(context.TODO())
Expand All @@ -130,6 +132,10 @@ func getEC2Instances(region string, instances map[string]InstanceInfo) (map[stri
for _, inst := range page.InstanceTypes {
itype := string(inst.InstanceType)

if unsupportedRegexp.MatchString(itype) {
continue
}

efaSupported := inst.NetworkInfo != nil && inst.NetworkInfo.EfaSupported != nil && *inst.NetworkInfo.EfaSupported

nvidiaGPUSupported := false
Expand Down
46 changes: 23 additions & 23 deletions examples/23-kubeflow-spot-instance.yaml
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
# Cost-Optimized EKS cluster for Kubeflow with spot GPU instances and node scale down to zero
# Built in an effort to reduce the training costs of ML workloads.
# Supporting tutorial can be found at the following link:
# Supporting tutorial can be found at the following link:
# https://blog.gofynd.com/how-we-reduced-our-ml-training-costs-by-78-a33805cb00cf
# This spec creates a cluster on EKS with the following active nodes
# This spec creates a cluster on EKS with the following active nodes
# - 2x m5a.2xlarge - Accommodates all pods of Kubeflow
# It also creates the following nodegroups with 0 nodes running unless a pod comes along and requests for the node to get spun up
# - m5a.2xlarge -- Max Allowed 10 worker nodes
# - p2.xlarge -- Max Allowed 10 worker nodes
# - g5.xlarge -- Max Allowed 10 worker nodes
# - p3.2xlarge -- Max Allowed 10 worker nodes
# - p3.8xlarge -- Max Allowed 04 worker nodes
# - p3dn.24xlarge -- Max Allowed 01 worker nodes
Expand All @@ -16,7 +16,7 @@ kind: ClusterConfig

metadata:
# Name of your cluster, change to whatever you find fit.
# If changed, make sure to change all nodegroup tags from
# If changed, make sure to change all nodegroup tags from
# 'k8s.io/cluster-autoscaler/cluster-23: "owned"' --> 'k8s.io/cluster-autoscaler/your-new-name: "owned"'
name: cluster-23
# choose your region wisely, this will significantly impact the cost incurred
Expand All @@ -27,7 +27,7 @@ metadata:
# Add more cloud tags if needed for billing
environment: staging

# Add all possible AZs to ensure nodes can be spun up in any AZ later on.
# Add all possible AZs to ensure nodes can be spun up in any AZ later on.
# THIS CAN'T BE CHANGED LATER. YOU WILL HAVE TO CREATE A NEW CLUSTER TO ADD NEW AZ SUPPORT.
# This list applies to the whole cluster and isn't specific to nodegroups
availabilityZones: ["us-east-1a", "us-east-1b", "us-east-1d", "us-east-1f"]
Expand All @@ -37,8 +37,8 @@ nodeGroups:
desiredCapacity: 2
minSize: 0
maxSize: 3
# Set one nodegroup with 100GB volumes for Kubeflow to get deployed.
# Kubeflow requirement states 1-2 Nodes with 100GB volume attached to the node.
# Set one nodegroup with 100GB volumes for Kubeflow to get deployed.
# Kubeflow requirement states 1-2 Nodes with 100GB volume attached to the node.
volumeSize: 100
volumeType: gp2
instanceType: m5a.2xlarge
Expand Down Expand Up @@ -78,23 +78,23 @@ nodeGroups:
autoScaler: true
cloudWatch: true

- name: 1-gpu-spot-p2-xlarge
- name: 1-gpu-spot-g5-xlarge
minSize: 0
maxSize: 10
instancesDistribution:
# set your own max price. AWS spot instance prices no longer cross OnDemand price.
# Comment out the field to default to OnDemand as max price.
# set your own max price. AWS spot instance prices no longer cross OnDemand price.
# Comment out the field to default to OnDemand as max price.
maxPrice: 1.2
instanceTypes: ["p2.xlarge"]
instanceTypes: ["g5.xlarge"]
onDemandBaseCapacity: 0
onDemandPercentageAboveBaseCapacity: 0
spotAllocationStrategy: capacity-optimized
labels:
lifecycle: Ec2Spot
aws.amazon.com/spot: "true"
gpu-count: "1"
# Stick to one AZ for all GPU nodes.
# In case of termination, this will prevent volumes from being unavailable
# Stick to one AZ for all GPU nodes.
# In case of termination, this will prevent volumes from being unavailable
# if the new instance got spun up in another AZ.
availabilityZones: ["us-east-1a"]
taints:
Expand All @@ -118,8 +118,8 @@ nodeGroups:
minSize: 0
maxSize: 10
instancesDistribution:
# set your own max price. AWS spot instance prices no longer cross OnDemand price.
# Comment out the field to default to OnDemand as max price.
# set your own max price. AWS spot instance prices no longer cross OnDemand price.
# Comment out the field to default to OnDemand as max price.
maxPrice: 1.2
instanceTypes: ["p3.2xlarge"]
onDemandBaseCapacity: 0
Expand All @@ -129,8 +129,8 @@ nodeGroups:
lifecycle: Ec2Spot
aws.amazon.com/spot: "true"
gpu-count: "1"
# Stick to one AZ for all GPU nodes.
# In case of termination, this will prevent volumes from being unavailable
# Stick to one AZ for all GPU nodes.
# In case of termination, this will prevent volumes from being unavailable
# if the new instance got spun up in another AZ.
availabilityZones: ["us-east-1a"]
taints:
Expand All @@ -154,8 +154,8 @@ nodeGroups:
minSize: 0
maxSize: 4
instancesDistribution:
# set your own max price. AWS spot instance prices no longer cross OnDemand price.
# Comment out the field to default to OnDemand as max price.
# set your own max price. AWS spot instance prices no longer cross OnDemand price.
# Comment out the field to default to OnDemand as max price.
# maxPrice: 4.4
instanceTypes: ["p3.8xlarge"]
onDemandBaseCapacity: 0
Expand All @@ -165,8 +165,8 @@ nodeGroups:
lifecycle: Ec2Spot
aws.amazon.com/spot: "true"
gpu-count: "4"
# Stick to one AZ for all GPU nodes.
# In case of termination, this will prevent volumes from being unavailable
# Stick to one AZ for all GPU nodes.
# In case of termination, this will prevent volumes from being unavailable
# if the new instance got spun up in another AZ.
availabilityZones: ["us-east-1a"]
taints:
Expand All @@ -190,8 +190,8 @@ nodeGroups:
minSize: 0
maxSize: 1
instancesDistribution:
# set your own max price. AWS spot instance prices no longer cross OnDemand price.
# Comment out the field to default to OnDemand as max price.
# set your own max price. AWS spot instance prices no longer cross OnDemand price.
# Comment out the field to default to OnDemand as max price.
maxPrice: 11
instanceTypes: ["p3dn.24xlarge"]
onDemandBaseCapacity: 0
Expand Down
2 changes: 1 addition & 1 deletion pkg/ami/auto_resolver_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ var _ = Describe("AMI Auto Resolution", func() {

Context("and gpu instance type", func() {
BeforeEach(func() {
instanceType = "p2.xlarge"
instanceType = "g5.xlarge"
})

Context("and ami is available", func() {
Expand Down
2 changes: 1 addition & 1 deletion pkg/ami/ssm_resolver_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ var _ = Describe("AMI Auto Resolution", func() {

Context("and gpu instance type", func() {
BeforeEach(func() {
instanceType = "p2.xlarge"
instanceType = "g5.xlarge"
})

Context("and ami is available", func() {
Expand Down
6 changes: 3 additions & 3 deletions pkg/cfn/builder/managed_nodegroup_ami_type_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ var _ = DescribeTable("Managed Nodegroup AMI type", func(e amiTypeEntry) {
[]string{}, // local zones
[]ec2types.InstanceType{
ec2types.InstanceTypeM5Large,
ec2types.InstanceTypeP2Xlarge,
ec2types.InstanceTypeG5Xlarge,
ec2types.InstanceTypeA12xlarge,
ec2types.InstanceTypeG5gXlarge,
ec2types.InstanceTypeG4dnXlarge,
Expand Down Expand Up @@ -81,7 +81,7 @@ var _ = DescribeTable("Managed Nodegroup AMI type", func(e amiTypeEntry) {
nodeGroup: &api.ManagedNodeGroup{
NodeGroupBase: &api.NodeGroupBase{
Name: "test",
InstanceType: "p2.8xlarge",
InstanceType: "g5.8xlarge",
},
},
expectedAMIType: "AL2023_x86_64_NVIDIA",
Expand All @@ -102,7 +102,7 @@ var _ = DescribeTable("Managed Nodegroup AMI type", func(e amiTypeEntry) {
NodeGroupBase: &api.NodeGroupBase{
Name: "test",
AMIFamily: api.NodeImageFamilyAmazonLinux2,
InstanceType: "p2.xlarge",
InstanceType: "g5.xlarge",
},
},
expectedAMIType: "AL2_x86_64_GPU",
Expand Down
2 changes: 1 addition & 1 deletion pkg/eks/api_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,7 @@ var _ = Describe("eksctl API", func() {

It("should retrieve the AMI from EC2 when AMI is auto", func() {
ng.AMI = "auto"
ng.InstanceType = "p2.xlarge"
ng.InstanceType = "g5.xlarge"
mockDescribeImages(provider, "ami-auto", func(input *ec2.DescribeImagesInput) bool {
return len(input.ImageIds) == 0
})
Expand Down
4 changes: 2 additions & 2 deletions pkg/eks/instance_selection_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,9 @@ var _ = DescribeTable("Instance type selection", func(t instanceTypeCase) {
}),

Entry("all GPU instances", instanceTypeCase{
instanceTypes: []string{"p2.8xlarge", "p3.8xlarge", "g4dn.xlarge"},
instanceTypes: []string{"g5.8xlarge", "p3.8xlarge", "g4dn.xlarge"},

expectedInstanceType: "p2.8xlarge",
expectedInstanceType: "g5.8xlarge",
}),

Entry("single instance type", instanceTypeCase{
Expand Down
8 changes: 4 additions & 4 deletions pkg/eks/tasks_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ var _ = Describe("ClusterTasksForNodeGroups", func() {
nodeGroups: []*v1alpha5.NodeGroup{
{
NodeGroupBase: &v1alpha5.NodeGroupBase{
InstanceType: "p2.xlarge",
InstanceType: "g5.xlarge",
AMIFamily: v1alpha5.NodeImageFamilyAmazonLinux2,
},
},
Expand All @@ -46,7 +46,7 @@ var _ = Describe("ClusterTasksForNodeGroups", func() {
nodeGroups: []*v1alpha5.NodeGroup{
{
NodeGroupBase: &v1alpha5.NodeGroupBase{
InstanceType: "p2.xlarge",
InstanceType: "g5.xlarge",
AMIFamily: v1alpha5.NodeImageFamilyAmazonLinux2,
},
},
Expand Down Expand Up @@ -117,7 +117,7 @@ var _ = Describe("ClusterTasksForNodeGroups", func() {
nodeGroups: []*v1alpha5.NodeGroup{
{
NodeGroupBase: &v1alpha5.NodeGroupBase{
InstanceType: "p2.xlarge",
InstanceType: "g5.xlarge",
AMIFamily: v1alpha5.NodeImageFamilyAmazonLinux2023,
},
},
Expand All @@ -140,7 +140,7 @@ var _ = Describe("ClusterTasksForNodeGroups", func() {
nodeGroups: []*v1alpha5.NodeGroup{
{
NodeGroupBase: &v1alpha5.NodeGroupBase{
InstanceType: "p2.xlarge",
InstanceType: "g5.xlarge",
AMIFamily: v1alpha5.NodeImageFamilyAmazonLinux2023,
},
},
Expand Down
33 changes: 0 additions & 33 deletions pkg/utils/instance/instance_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -5160,39 +5160,6 @@ var InstanceTypes = []InstanceInfo{
CBRSupported: false,
CPUArch: "arm64",
},
{
InstanceType: "p2.16xlarge",
InstanceStorageSupported: false,
EFASupported: false,
NvidiaGPUSupported: true,
NvidiaGPUType: "K80",
NeuronSupported: false,
NeuronDeviceType: "",
CBRSupported: false,
CPUArch: "x86-64",
},
{
InstanceType: "p2.8xlarge",
InstanceStorageSupported: false,
EFASupported: false,
NvidiaGPUSupported: true,
NvidiaGPUType: "K80",
NeuronSupported: false,
NeuronDeviceType: "",
CBRSupported: false,
CPUArch: "x86-64",
},
{
InstanceType: "p2.xlarge",
InstanceStorageSupported: false,
EFASupported: false,
NvidiaGPUSupported: true,
NvidiaGPUType: "K80",
NeuronSupported: false,
NeuronDeviceType: "",
CBRSupported: false,
CPUArch: "x86-64",
},
{
InstanceType: "p3.16xlarge",
InstanceStorageSupported: false,
Expand Down
2 changes: 1 addition & 1 deletion userdocs/src/usage/custom-ami-support.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ Config file example:
```yaml
nodeGroups:
- name: ng1
instanceType: p2.xlarge
instanceType: g5.xlarge
amiFamily: AmazonLinux2
ami: auto
- name: ng2
Expand Down
4 changes: 2 additions & 2 deletions userdocs/src/usage/gpu-support.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ Eksctl supports selecting GPU instance types for nodegroups. Simply supply a
compatible instance type to the create command, or via the config file.

```
eksctl create cluster --node-type=p2.xlarge
eksctl create cluster --node-type=g5.xlarge
```

???+ note
Expand All @@ -23,7 +23,7 @@ To disable the automatic plugin installation, and manually install a specific ve
use `--install-nvidia-plugin=false` with the create command. For example:

```
eksctl create cluster --node-type=p2.xlarge --install-nvidia-plugin=false
eksctl create cluster --node-type=g5.xlarge --install-nvidia-plugin=false
```

and, for versions 0.15.0 and above,
Expand Down
6 changes: 3 additions & 3 deletions userdocs/src/usage/spot-instances.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,9 +96,9 @@ nodeGroups:
desiredCapacity: 1
instancesDistribution:
instanceTypes:
- p2.xlarge
- p2.8xlarge
- p2.16xlarge
- g5.xlarge
- g5.8xlarge
- g5.16xlarge
maxPrice: 0.50
```

Expand Down
Loading