2 changes: 1 addition & 1 deletion charts/karpenter-custom-resources/Chart.yaml
@@ -5,7 +5,7 @@ type: application

# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates.
version: 1.0.0
version: 1.1.0

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application.
62 changes: 62 additions & 0 deletions charts/karpenter-custom-resources/examples/gpu-nodepool.yaml
@@ -0,0 +1,62 @@
# GPU NodePool example for ML/AI workloads
# Copy these values into your values.yaml file

ec2NodeClasses:
  - name: "gpu-compute"
    amiFamily: "AL2"
    role: "KarpenterNodeRole"
    subnetSelectorTerms:
      - tags:
          karpenter.sh/discovery: "my-cluster"
    securityGroupSelectorTerms:
      - tags:
          karpenter.sh/discovery: "my-cluster"
    tags:
      WorkloadType: "GPU"
      Purpose: "ML-Training"
    kubelet:
      maxPods: 110
      clusterDNS: ["10.100.0.10"]
    blockDeviceMappings:
      - deviceName: "/dev/xvda"
        ebs:
          volumeSize: "100Gi" # Larger storage for ML frameworks
          volumeType: "gp3"
          encrypted: true
          deleteOnTermination: true
    metadataOptions:
      httpEndpoint: "enabled"
      httpTokens: "required"

nodePools:
  - name: "gpu-workloads"
    ec2NodeClassName: "gpu-compute"
    template:
      metadata:
        labels:
          workload-type: "gpu"
      spec:
        # Taint GPU nodes so only GPU workloads schedule here
        taints:
          - key: "nvidia.com/gpu"
            effect: "NoSchedule"
        requirements:
          - key: "karpenter.sh/capacity-type"
            operator: In
            values: ["on-demand"] # GPU instances are typically on-demand
          - key: "karpenter.k8s.aws/instance-category"
            operator: In
            values: ["g", "p"] # GPU instance families
          - key: "kubernetes.io/arch"
            operator: In
            values: ["amd64"]
          - key: "kubernetes.io/os"
            operator: In
            values: ["linux"]
    disruption:
      consolidationPolicy: "WhenEmpty" # Conservative for expensive GPU nodes
      consolidateAfter: "5m"
    limits:
      cpu: "500"
      memory: "2000Gi"
      "nvidia.com/gpu": "16" # Limit total GPUs
121 changes: 121 additions & 0 deletions charts/karpenter-custom-resources/examples/multi-environment.yaml
@@ -0,0 +1,121 @@
# Multi-environment NodePool example
# Shows how to create separate node pools for different environments

ec2NodeClasses:
  - name: "production-nodes"
    amiFamily: "AL2"
    role: "KarpenterNodeRole"
    subnetSelectorTerms:
      - tags:
          karpenter.sh/discovery: "prod-cluster"
          Environment: "production"
    securityGroupSelectorTerms:
      - tags:
          karpenter.sh/discovery: "prod-cluster"
          Environment: "production"
    tags:
      Environment: "production"
      CostCenter: "engineering"
    kubelet:
      maxPods: 110
      clusterDNS: ["10.100.0.10"]
      # Resource reservations for production stability
      systemReserved:
        cpu: "200m"
        memory: "500Mi"
        ephemeral-storage: "1Gi"
      kubeReserved:
        cpu: "200m"
        memory: "500Mi"
    blockDeviceMappings:
      - deviceName: "/dev/xvda"
        ebs:
          volumeSize: "50Gi" # Larger storage for production
          volumeType: "gp3"
          encrypted: true
          deleteOnTermination: true
    metadataOptions:
      httpEndpoint: "enabled"
      httpTokens: "required"
    detailedMonitoring: true

  - name: "development-nodes"
    amiFamily: "AL2"
    role: "KarpenterNodeRole"
    subnetSelectorTerms:
      - tags:
          karpenter.sh/discovery: "dev-cluster"
          Environment: "development"
    securityGroupSelectorTerms:
      - tags:
          karpenter.sh/discovery: "dev-cluster"
          Environment: "development"
    tags:
      Environment: "development"
      CostCenter: "engineering"
    kubelet:
      maxPods: 110
      clusterDNS: ["10.100.0.10"]
    blockDeviceMappings:
      - deviceName: "/dev/xvda"
        ebs:
          volumeSize: "20Gi"
          volumeType: "gp3"
          encrypted: true
          deleteOnTermination: true

nodePools:
  - name: "production-workloads"
    ec2NodeClassName: "production-nodes"
    template:
      metadata:
        labels:
          environment: "production"
          team: "platform"
      spec:
        requirements:
          - key: "karpenter.sh/capacity-type"
            operator: In
            values: ["on-demand"] # Production prefers reliability
          - key: "karpenter.k8s.aws/instance-category"
            operator: In
            values: ["m", "c", "r"]
          - key: "karpenter.k8s.aws/instance-generation"
            operator: Gt
            values: ["4"] # Generation 5 or newer for production
          - key: "kubernetes.io/arch"
            operator: In
            values: ["amd64"]
    disruption:
      consolidationPolicy: "WhenEmpty" # Conservative for production
      consolidateAfter: "10m"
    limits:
      cpu: "2000"
      memory: "8000Gi"

  - name: "development-workloads"
    ec2NodeClassName: "development-nodes"
    template:
      metadata:
        labels:
          environment: "development"
          team: "platform"
      spec:
        requirements:
          - key: "karpenter.sh/capacity-type"
            operator: In
            values: ["spot"] # Development can use cheaper spot instances
          - key: "karpenter.k8s.aws/instance-category"
            operator: In
            values: ["m", "c", "r", "t"]
          - key: "kubernetes.io/arch"
            operator: In
            values: ["amd64"]
        # Dev environments can expire nodes to save costs
        expireAfter: "24h"
    disruption:
      consolidationPolicy: "WhenEmptyOrUnderutilized"
      consolidateAfter: "30s"
    limits:
      cpu: "500"
      memory: "2000Gi"
48 changes: 48 additions & 0 deletions charts/karpenter-custom-resources/examples/simple-nodepool.yaml
@@ -0,0 +1,48 @@
# Simple NodePool example for general workloads
# Copy these values into your values.yaml file

ec2NodeClasses:
  - name: "general-compute"
    amiFamily: "AL2"
    role: "KarpenterNodeRole"
    subnetSelectorTerms:
      - tags:
          karpenter.sh/discovery: "my-cluster"
    securityGroupSelectorTerms:
      - tags:
          karpenter.sh/discovery: "my-cluster"
    kubelet:
      maxPods: 110
      clusterDNS: ["10.100.0.10"]
    blockDeviceMappings:
      - deviceName: "/dev/xvda"
        ebs:
          volumeSize: "20Gi"
          volumeType: "gp3"
          encrypted: true
          deleteOnTermination: true

nodePools:
  - name: "general-purpose"
    ec2NodeClassName: "general-compute"
    template:
      spec:
        requirements:
          - key: "karpenter.sh/capacity-type"
            operator: In
            values: ["spot", "on-demand"]
          - key: "karpenter.k8s.aws/instance-category"
            operator: In
            values: ["m", "c", "r"]
          - key: "kubernetes.io/arch"
            operator: In
            values: ["amd64"]
          - key: "kubernetes.io/os"
            operator: In
            values: ["linux"]
    disruption:
      consolidationPolicy: "WhenEmptyOrUnderutilized"
      consolidateAfter: "30s"
    limits:
      cpu: "1000"
      memory: "4000Gi"