Skip to content

Commit f9e36fc

Browse files
authored
chore: Karpenter resources chart (#327)
* Karpenter resources NodePool and EC2NodeClass Helm chart
* Changes after testing
* Conflicts fixed
1 parent be5f806 commit f9e36fc

File tree

6 files changed

+672
-253
lines changed

6 files changed

+672
-253
lines changed

charts/karpenter-custom-resources/Chart.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ type: application
55

66
# This is the chart version. This version number should be incremented each time you make changes
77
# to the chart and its templates.
8-
version: 1.0.0
8+
version: 1.1.0
99

1010
# This is the version number of the application being deployed. This version number should be
1111
# incremented each time you make changes to the application.
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# GPU NodePool example for ML/AI workloads
2+
# Copy these values into your values.yaml file
3+
4+
ec2NodeClasses:
5+
- name: "gpu-compute"
6+
amiFamily: "AL2"
7+
role: "KarpenterNodeRole"
8+
subnetSelectorTerms:
9+
- tags:
10+
karpenter.sh/discovery: "my-cluster"
11+
securityGroupSelectorTerms:
12+
- tags:
13+
karpenter.sh/discovery: "my-cluster"
14+
tags:
15+
WorkloadType: "GPU"
16+
Purpose: "ML-Training"
17+
kubelet:
18+
maxPods: 110
19+
clusterDNS: ["10.100.0.10"]
20+
blockDeviceMappings:
21+
- deviceName: "/dev/xvda"
22+
ebs:
23+
volumeSize: "100Gi" # Larger storage for ML frameworks
24+
volumeType: "gp3"
25+
encrypted: true
26+
deleteOnTermination: true
27+
metadataOptions:
28+
httpEndpoint: "enabled"
29+
httpTokens: "required"
30+
31+
nodePools:
32+
- name: "gpu-workloads"
33+
ec2NodeClassName: "gpu-compute"
34+
template:
35+
metadata:
36+
labels:
37+
workload-type: "gpu"
38+
spec:
39+
# Taint GPU nodes so only GPU workloads schedule here
40+
taints:
41+
- key: "nvidia.com/gpu"
42+
effect: "NoSchedule"
43+
requirements:
44+
- key: "karpenter.sh/capacity-type"
45+
operator: In
46+
values: ["on-demand"] # GPU instances are typically on-demand
47+
- key: "karpenter.k8s.aws/instance-category"
48+
operator: In
49+
values: ["g", "p"] # GPU instance families
50+
- key: "kubernetes.io/arch"
51+
operator: In
52+
values: ["amd64"]
53+
- key: "kubernetes.io/os"
54+
operator: In
55+
values: ["linux"]
56+
disruption:
57+
consolidationPolicy: "WhenEmpty" # Conservative for expensive GPU nodes
58+
consolidateAfter: "5m"
59+
limits:
60+
cpu: "500"
61+
memory: "2000Gi"
62+
"nvidia.com/gpu": "16" # Limit total GPUs
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
# Multi-environment NodePool example
2+
# Shows how to create separate node pools for different environments
3+
4+
ec2NodeClasses:
5+
- name: "production-nodes"
6+
amiFamily: "AL2"
7+
role: "KarpenterNodeRole"
8+
subnetSelectorTerms:
9+
- tags:
10+
karpenter.sh/discovery: "prod-cluster"
11+
Environment: "production"
12+
securityGroupSelectorTerms:
13+
- tags:
14+
karpenter.sh/discovery: "prod-cluster"
15+
Environment: "production"
16+
tags:
17+
Environment: "production"
18+
CostCenter: "engineering"
19+
kubelet:
20+
maxPods: 110
21+
clusterDNS: ["10.100.0.10"]
22+
# Resource reservations for production stability
23+
systemReserved:
24+
cpu: "200m"
25+
memory: "500Mi"
26+
ephemeral-storage: "1Gi"
27+
kubeReserved:
28+
cpu: "200m"
29+
memory: "500Mi"
30+
blockDeviceMappings:
31+
- deviceName: "/dev/xvda"
32+
ebs:
33+
volumeSize: "50Gi" # Larger storage for production
34+
volumeType: "gp3"
35+
encrypted: true
36+
deleteOnTermination: true
37+
metadataOptions:
38+
httpEndpoint: "enabled"
39+
httpTokens: "required"
40+
detailedMonitoring: true
41+
42+
- name: "development-nodes"
43+
amiFamily: "AL2"
44+
role: "KarpenterNodeRole"
45+
subnetSelectorTerms:
46+
- tags:
47+
karpenter.sh/discovery: "dev-cluster"
48+
Environment: "development"
49+
securityGroupSelectorTerms:
50+
- tags:
51+
karpenter.sh/discovery: "dev-cluster"
52+
Environment: "development"
53+
tags:
54+
Environment: "development"
55+
CostCenter: "engineering"
56+
kubelet:
57+
maxPods: 110
58+
clusterDNS: ["10.100.0.10"]
59+
blockDeviceMappings:
60+
- deviceName: "/dev/xvda"
61+
ebs:
62+
volumeSize: "20Gi"
63+
volumeType: "gp3"
64+
encrypted: true
65+
deleteOnTermination: true
66+
67+
nodePools:
68+
- name: "production-workloads"
69+
ec2NodeClassName: "production-nodes"
70+
template:
71+
metadata:
72+
labels:
73+
environment: "production"
74+
team: "platform"
75+
spec:
76+
requirements:
77+
- key: "karpenter.sh/capacity-type"
78+
operator: In
79+
values: ["on-demand"] # Production prefers reliability
80+
- key: "karpenter.k8s.aws/instance-category"
81+
operator: In
82+
values: ["m", "c", "r"]
83+
- key: "karpenter.k8s.aws/instance-generation"
84+
operator: Gt
85+
values: ["4"] # Latest generation for production
86+
- key: "kubernetes.io/arch"
87+
operator: In
88+
values: ["amd64"]
89+
disruption:
90+
consolidationPolicy: "WhenEmpty" # Conservative for production
91+
consolidateAfter: "10m"
92+
limits:
93+
cpu: "2000"
94+
memory: "8000Gi"
95+
96+
- name: "development-workloads"
97+
ec2NodeClassName: "development-nodes"
98+
template:
99+
metadata:
100+
labels:
101+
environment: "development"
102+
team: "platform"
103+
spec:
104+
requirements:
105+
- key: "karpenter.sh/capacity-type"
106+
operator: In
107+
values: ["spot"] # Development can use cheaper spot instances
108+
- key: "karpenter.k8s.aws/instance-category"
109+
operator: In
110+
values: ["m", "c", "r", "t"]
111+
- key: "kubernetes.io/arch"
112+
operator: In
113+
values: ["amd64"]
114+
# Dev environments can expire nodes to save costs
115+
expireAfter: "24h"
116+
disruption:
117+
consolidationPolicy: "WhenEmptyOrUnderutilized"
118+
consolidateAfter: "30s"
119+
limits:
120+
cpu: "500"
121+
memory: "2000Gi"
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# Simple NodePool example for general workloads
2+
# Copy these values into your values.yaml file
3+
4+
ec2NodeClasses:
5+
- name: "general-compute"
6+
amiFamily: "AL2"
7+
role: "KarpenterNodeRole"
8+
subnetSelectorTerms:
9+
- tags:
10+
karpenter.sh/discovery: "my-cluster"
11+
securityGroupSelectorTerms:
12+
- tags:
13+
karpenter.sh/discovery: "my-cluster"
14+
kubelet:
15+
maxPods: 110
16+
clusterDNS: ["10.100.0.10"]
17+
blockDeviceMappings:
18+
- deviceName: "/dev/xvda"
19+
ebs:
20+
volumeSize: "20Gi"
21+
volumeType: "gp3"
22+
encrypted: true
23+
deleteOnTermination: true
24+
25+
nodePools:
26+
- name: "general-purpose"
27+
ec2NodeClassName: "general-compute"
28+
template:
29+
spec:
30+
requirements:
31+
- key: "karpenter.sh/capacity-type"
32+
operator: In
33+
values: ["spot", "on-demand"]
34+
- key: "karpenter.k8s.aws/instance-category"
35+
operator: In
36+
values: ["m", "c", "r"]
37+
- key: "kubernetes.io/arch"
38+
operator: In
39+
values: ["amd64"]
40+
- key: "kubernetes.io/os"
41+
operator: In
42+
values: ["linux"]
43+
disruption:
44+
consolidationPolicy: "WhenEmptyOrUnderutilized"
45+
consolidateAfter: "30s"
46+
limits:
47+
cpu: "1000"
48+
memory: "4000Gi"

0 commit comments

Comments (0)