2 changes: 1 addition & 1 deletion charts/karpenter-custom-resources/Chart.yaml
@@ -5,7 +5,7 @@ type: application

# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates.
version: 1.0.0
version: 1.1.0

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application.
62 changes: 62 additions & 0 deletions charts/karpenter-custom-resources/examples/gpu-nodepool.yaml
@@ -0,0 +1,62 @@
# GPU NodePool example for ML/AI workloads
# Copy these values into your values.yaml file

ec2NodeClasses:
  - name: "gpu-compute"
    amiFamily: "AL2"
    role: "KarpenterNodeRole"
    subnetSelectorTerms:
      - tags:
          karpenter.sh/discovery: "my-cluster"
    securityGroupSelectorTerms:
      - tags:
          karpenter.sh/discovery: "my-cluster"
    tags:
      WorkloadType: "GPU"
      Purpose: "ML-Training"
    kubelet:
      maxPods: 110
      clusterDNS: ["10.100.0.10"]
    blockDeviceMappings:
      - deviceName: "/dev/xvda"
        ebs:
          volumeSize: "100Gi" # Larger storage for ML frameworks
          volumeType: "gp3"
          encrypted: true
          deleteOnTermination: true
    metadataOptions:
      httpEndpoint: "enabled"
      httpTokens: "required"

nodePools:
  - name: "gpu-workloads"
    ec2NodeClassName: "gpu-compute"
    template:
      metadata:
        labels:
          workload-type: "gpu"
      spec:
        # Taint GPU nodes so only GPU workloads schedule here
        taints:
          - key: "nvidia.com/gpu"
            effect: "NoSchedule"
        requirements:
          - key: "karpenter.sh/capacity-type"
            operator: In
            values: ["on-demand"] # GPU instances are typically on-demand
          - key: "karpenter.k8s.aws/instance-category"
            operator: In
            values: ["g", "p"] # GPU instance families
          - key: "kubernetes.io/arch"
            operator: In
            values: ["amd64"]
          - key: "kubernetes.io/os"
            operator: In
            values: ["linux"]
    disruption:
      consolidationPolicy: "WhenEmpty" # Conservative for expensive GPU nodes
      consolidateAfter: "5m"
    limits:
      cpu: "500"
      memory: "2000Gi"
      "nvidia.com/gpu": "16" # Limit total GPUs
121 changes: 121 additions & 0 deletions charts/karpenter-custom-resources/examples/multi-environment.yaml
@@ -0,0 +1,121 @@
# Multi-environment NodePool example
# Shows how to create separate node pools for different environments

ec2NodeClasses:
  - name: "production-nodes"
    amiFamily: "AL2"
    role: "KarpenterNodeRole"
    subnetSelectorTerms:
      - tags:
          karpenter.sh/discovery: "prod-cluster"
          Environment: "production"
    securityGroupSelectorTerms:
      - tags:
          karpenter.sh/discovery: "prod-cluster"
          Environment: "production"
    tags:
      Environment: "production"
      CostCenter: "engineering"
    kubelet:
      maxPods: 110
      clusterDNS: ["10.100.0.10"]
      # Resource reservations for production stability
      systemReserved:
        cpu: "200m"
        memory: "500Mi"
        ephemeral-storage: "1Gi"
      kubeReserved:
        cpu: "200m"
        memory: "500Mi"
    blockDeviceMappings:
      - deviceName: "/dev/xvda"
        ebs:
          volumeSize: "50Gi" # Larger storage for production
          volumeType: "gp3"
          encrypted: true
          deleteOnTermination: true
    metadataOptions:
      httpEndpoint: "enabled"
      httpTokens: "required"
    detailedMonitoring: true

  - name: "development-nodes"
    amiFamily: "AL2"
    role: "KarpenterNodeRole"
    subnetSelectorTerms:
      - tags:
          karpenter.sh/discovery: "dev-cluster"
          Environment: "development"
    securityGroupSelectorTerms:
      - tags:
          karpenter.sh/discovery: "dev-cluster"
          Environment: "development"
    tags:
      Environment: "development"
      CostCenter: "engineering"
    kubelet:
      maxPods: 110
      clusterDNS: ["10.100.0.10"]
    blockDeviceMappings:
      - deviceName: "/dev/xvda"
        ebs:
          volumeSize: "20Gi"
          volumeType: "gp3"
          encrypted: true
          deleteOnTermination: true

nodePools:
  - name: "production-workloads"
    ec2NodeClassName: "production-nodes"
    template:
      metadata:
        labels:
          environment: "production"
          team: "platform"
      spec:
        requirements:
          - key: "karpenter.sh/capacity-type"
            operator: In
            values: ["on-demand"] # Production prefers reliability
          - key: "karpenter.k8s.aws/instance-category"
            operator: In
            values: ["m", "c", "r"]
          - key: "karpenter.k8s.aws/instance-generation"
            operator: Gt
            values: ["4"] # Generation 5 or newer for production
          - key: "kubernetes.io/arch"
            operator: In
            values: ["amd64"]
    disruption:
      consolidationPolicy: "WhenEmpty" # Conservative for production
      consolidateAfter: "10m"
    limits:
      cpu: "2000"
      memory: "8000Gi"

  - name: "development-workloads"
    ec2NodeClassName: "development-nodes"
    template:
      metadata:
        labels:
          environment: "development"
          team: "platform"
      spec:
        requirements:
          - key: "karpenter.sh/capacity-type"
            operator: In
            values: ["spot"] # Development can use cheaper spot instances
          - key: "karpenter.k8s.aws/instance-category"
            operator: In
            values: ["m", "c", "r", "t"]
          - key: "kubernetes.io/arch"
            operator: In
            values: ["amd64"]
        # Dev environments can expire nodes to save costs
        expireAfter: "24h"
    disruption:
      consolidationPolicy: "WhenEmptyOrUnderutilized"
      consolidateAfter: "30s"
    limits:
      cpu: "500"
      memory: "2000Gi"
48 changes: 48 additions & 0 deletions charts/karpenter-custom-resources/examples/simple-nodepool.yaml
@@ -0,0 +1,48 @@
# Simple NodePool example for general workloads
# Copy these values into your values.yaml file

ec2NodeClasses:
  - name: "general-compute"
    amiFamily: "AL2"
    role: "KarpenterNodeRole"
    subnetSelectorTerms:
      - tags:
          karpenter.sh/discovery: "my-cluster"
    securityGroupSelectorTerms:
      - tags:
          karpenter.sh/discovery: "my-cluster"
    kubelet:
      maxPods: 110
      clusterDNS: ["10.100.0.10"]
    blockDeviceMappings:
      - deviceName: "/dev/xvda"
        ebs:
          volumeSize: "20Gi"
          volumeType: "gp3"
          encrypted: true
          deleteOnTermination: true

nodePools:
  - name: "general-purpose"
    ec2NodeClassName: "general-compute"
    template:
      spec:
        requirements:
          - key: "karpenter.sh/capacity-type"
            operator: In
            values: ["spot", "on-demand"]
          - key: "karpenter.k8s.aws/instance-category"
            operator: In
            values: ["m", "c", "r"]
          - key: "kubernetes.io/arch"
            operator: In
            values: ["amd64"]
          - key: "kubernetes.io/os"
            operator: In
            values: ["linux"]
    disruption:
      consolidationPolicy: "WhenEmptyOrUnderutilized"
      consolidateAfter: "30s"
    limits:
      cpu: "1000"
      memory: "4000Gi"