Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
195 changes: 195 additions & 0 deletions cluster-autoscaler/cloudprovider/coreweave/coreweave_instance_types.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
/*
Copyright 2025 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package coreweave

import "fmt"

// InstanceType describes the resources offered by a single CoreWeave instance
// type. Quantities use the same units as the Kubernetes Node resource
// representation so they can be copied into a node template directly.
type InstanceType struct {
	// VCPU is the count of virtual CPU cores.
	VCPU int64
	// MemoryKi is the memory size, expressed in kibibytes (1 Ki = 1024 bytes).
	MemoryKi int64
	// GPU is the count of GPUs; zero for CPU-only instance types.
	GPU int64
	// EphemeralStorageKi is the ephemeral storage size, expressed in
	// kibibytes (1 Ki = 1024 bytes).
	EphemeralStorageKi int64
	// Architecture names the CPU architecture, e.g. "amd64" or "arm64".
	Architecture string
	// MaxPods is the upper bound on pods schedulable onto this instance type.
	MaxPods int64
}

// InstanceTypes maps CoreWeave instance type names to their resource
// specifications. GetInstanceType resolves names against this table, so an
// instance type that is missing here is reported as unknown.
//
// MemoryKi and EphemeralStorageKi values are in kibibytes, as documented on
// InstanceType. Entries with GPU set to 0 are CPU-only instance types.
var InstanceTypes = map[string]*InstanceType{
"b200-8x": {
VCPU: 128,
MemoryKi: 2112277172,
GPU: 8,
EphemeralStorageKi: 30003181568,
Architecture: "amd64",
MaxPods: 110,
},
"cd-gp-a192-genoa": {
VCPU: 192,
MemoryKi: 1583811548,
GPU: 0,
EphemeralStorageKi: 7499230528,
Architecture: "amd64",
MaxPods: 110,
},
"cd-gp-l-a192-genoa": {
VCPU: 192,
MemoryKi: 1583796048,
GPU: 0,
EphemeralStorageKi: 30003181568,
Architecture: "amd64",
MaxPods: 110,
},
"cd-gp-i64-erapids": {
VCPU: 64,
MemoryKi: 526674536,
GPU: 0,
EphemeralStorageKi: 7499230528,
Architecture: "amd64",
MaxPods: 110,
},
"cd-gp-l-i64-erapids": {
VCPU: 64,
MemoryKi: 526668108,
GPU: 0,
EphemeralStorageKi: 15000547328,
Architecture: "amd64",
MaxPods: 110,
},
"cd-gp-i96-icelake": {
VCPU: 96,
MemoryKi: 394209340,
GPU: 0,
EphemeralStorageKi: 6248987968,
Architecture: "amd64",
MaxPods: 110,
},
"cd-hc-a384ib-genoa": {
VCPU: 384,
MemoryKi: 1583672504,
GPU: 0,
EphemeralStorageKi: 30003181568,
Architecture: "amd64",
MaxPods: 110,
},
// NOTE(review): this is the only entry whose MaxPods differs from 110;
// confirm the value is intentional.
"cd-hc-a384-genoa": {
VCPU: 384,
MemoryKi: 1583673336,
GPU: 0,
EphemeralStorageKi: 30003181568,
Architecture: "amd64",
MaxPods: 300,
},
"cd-hp-a96-genoa": {
VCPU: 96,
MemoryKi: 791111968,
GPU: 0,
EphemeralStorageKi: 7499230528,
Architecture: "amd64",
MaxPods: 110,
},
// The only arm64 entry in this table.
"gd-1xgh200": {
VCPU: 72,
MemoryKi: 600218240,
GPU: 1,
EphemeralStorageKi: 7499362648,
Architecture: "arm64",
MaxPods: 110,
},
"gd-8xa100-i128": {
VCPU: 128,
MemoryKi: 2112249840,
GPU: 8,
EphemeralStorageKi: 7499362648,
Architecture: "amd64",
MaxPods: 110,
},
"gd-8xh100ib-i128": {
VCPU: 128,
MemoryKi: 2112109804,
GPU: 8,
EphemeralStorageKi: 30003181568,
Architecture: "amd64",
MaxPods: 110,
},
"gd-8xh200ib-i128": {
VCPU: 128,
MemoryKi: 2112109800,
GPU: 8,
EphemeralStorageKi: 30003181568,
Architecture: "amd64",
MaxPods: 110,
},
"gd-8xl40-i128": {
VCPU: 128,
MemoryKi: 1055335508,
GPU: 8,
EphemeralStorageKi: 7499362648,
Architecture: "amd64",
MaxPods: 110,
},
"gd-8xl40s-i128": {
VCPU: 128,
MemoryKi: 1055337468,
GPU: 8,
EphemeralStorageKi: 7499362648,
Architecture: "amd64",
MaxPods: 110,
},
"rtxp6000-8x": {
VCPU: 128,
MemoryKi: 1055335468,
GPU: 8,
EphemeralStorageKi: 30003181568,
Architecture: "amd64",
MaxPods: 110,
},
"turin-gp-l": {
VCPU: 192,
MemoryKi: 1583282436,
GPU: 0,
EphemeralStorageKi: 30003181568,
Architecture: "amd64",
MaxPods: 110,
},
"turin-gp": {
VCPU: 192,
MemoryKi: 1583297960,
GPU: 0,
EphemeralStorageKi: 7499230528,
Architecture: "amd64",
MaxPods: 110,
},
}

// GetInstanceType looks up instanceTypeName in the InstanceTypes table and
// returns the matching specification. When the name has no entry, it returns
// a nil InstanceType together with an "unknown instance type" error.
func GetInstanceType(instanceTypeName string) (*InstanceType, error) {
	if spec, ok := InstanceTypes[instanceTypeName]; ok {
		return spec, nil
	}
	return nil, fmt.Errorf("unknown instance type: %s", instanceTypeName)
}
107 changes: 105 additions & 2 deletions cluster-autoscaler/cloudprovider/coreweave/coreweave_nodegroup.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,16 @@ package coreweave

import (
"fmt"
"math/rand"
"sync"

apiv1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/config"
"k8s.io/autoscaler/cluster-autoscaler/simulator/framework"
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
"k8s.io/klog/v2"
)

Expand Down Expand Up @@ -123,9 +127,108 @@ func (ng *CoreWeaveNodeGroup) Nodes() ([]cloudprovider.Instance, error) {
}

// TemplateNodeInfo returns a template NodeInfo for the node group.
// The autoscaler uses it to simulate what a new node would look like when
// scaling from zero or when no nodes currently exist in the node group.
//
// The template is derived from the node pool's instance type, which is
// resolved through GetInstanceType. An error is returned when the node pool
// has no instance type, when the instance type is unknown, or when the
// template node cannot be built; the underlying error is wrapped so callers
// can inspect it with errors.Is / errors.As.
func (ng *CoreWeaveNodeGroup) TemplateNodeInfo() (*framework.NodeInfo, error) {
	instanceTypeName := ng.nodepool.GetInstanceType()
	if instanceTypeName == "" {
		return nil, fmt.Errorf("node pool %s has no instance type defined", ng.Name)
	}

	instanceType, err := GetInstanceType(instanceTypeName)
	if err != nil {
		return nil, fmt.Errorf("failed to get instance type info for %s: %w", instanceTypeName, err)
	}

	node, err := ng.buildNodeFromInstanceType(instanceTypeName, instanceType)
	if err != nil {
		return nil, fmt.Errorf("failed to build node from instance type: %w", err)
	}

	// The second argument carries ResourceSlices for DRA. CoreWeave only uses
	// DRA for rack-based instances, which are not supported by the Cluster
	// Autoscaler at this time, so no slices are passed.
	nodeInfo := framework.NewNodeInfo(node, nil)

	return nodeInfo, nil
}

// buildNodeFromInstanceType creates a template Node from the instance type
// and node pool configuration.
//
// The node gets a randomized "<group>-template-<n>" name, labels from
// buildNodeLabels, the node pool's taints, ready conditions, and resources
// from buildResourceList. The returned error is currently always nil; it is
// kept in the signature for callers that already handle it.
func (ng *CoreWeaveNodeGroup) buildNodeFromInstanceType(instanceTypeName string, instanceType *InstanceType) (*apiv1.Node, error) {
	nodeName := fmt.Sprintf("%s-template-%d", ng.Name, rand.Int63())

	capacity := ng.buildResourceList(instanceType)

	labels := ng.buildNodeLabels(nodeName, instanceTypeName, instanceType)

	taints := ng.nodepool.GetNodeTaints()

	node := &apiv1.Node{
		ObjectMeta: metav1.ObjectMeta{
			Name:   nodeName,
			Labels: labels,
		},
		Status: apiv1.NodeStatus{
			// Capacity and Allocatable carry the same values, ignoring
			// resources reserved for system pods. ResourceList is a map, so
			// Allocatable gets its own copy; otherwise a later mutation of
			// one field would silently change the other.
			Capacity:    capacity,
			Allocatable: capacity.DeepCopy(),
			Conditions:  cloudprovider.BuildReadyConditions(),
		},
		Spec: apiv1.NodeSpec{
			Taints: taints,
		},
	}

	return node, nil
}

// buildResourceList translates an instance type specification into the
// ResourceList used for a template node's capacity.
//
// CPU, memory and pods are always present. Ephemeral storage and the
// nvidia.com/gpu resource are only added when the instance type reports a
// positive amount.
func (ng *CoreWeaveNodeGroup) buildResourceList(instanceType *InstanceType) apiv1.ResourceList {
	const (
		// bytesPerKi converts the kibibyte figures on InstanceType to bytes.
		bytesPerKi = 1024
		// defaultMaxPods is the Kubernetes default, used when the instance
		// type does not specify a pod limit.
		defaultMaxPods = 110
	)

	list := apiv1.ResourceList{
		apiv1.ResourceCPU:    *resource.NewQuantity(instanceType.VCPU, resource.DecimalSI),
		apiv1.ResourceMemory: *resource.NewQuantity(instanceType.MemoryKi*bytesPerKi, resource.BinarySI),
	}

	if storageKi := instanceType.EphemeralStorageKi; storageKi > 0 {
		list[apiv1.ResourceEphemeralStorage] = *resource.NewQuantity(storageKi*bytesPerKi, resource.BinarySI)
	}

	if gpuCount := instanceType.GPU; gpuCount > 0 {
		list[gpu.ResourceNvidiaGPU] = *resource.NewQuantity(gpuCount, resource.DecimalSI)
	}

	var podLimit int64 = defaultMaxPods
	if instanceType.MaxPods > 0 {
		podLimit = instanceType.MaxPods
	}
	list[apiv1.ResourcePods] = *resource.NewQuantity(podLimit, resource.DecimalSI)

	return list
}

// buildNodeLabels assembles the label set for a template node.
//
// It sets the stable instance-type, architecture and OS labels, then the
// CoreWeave node pool UID and name labels, and finally copies the node pool's
// own labels on top, so a node pool label wins over any label set earlier
// under the same key. The architecture falls back to cloudprovider.DefaultArch
// when the instance type does not declare one. nodeName is accepted but not
// used by the current implementation.
func (ng *CoreWeaveNodeGroup) buildNodeLabels(nodeName, instanceTypeName string, instanceType *InstanceType) map[string]string {
	arch := instanceType.Architecture
	if arch == "" {
		arch = cloudprovider.DefaultArch
	}

	result := map[string]string{
		apiv1.LabelInstanceTypeStable: instanceTypeName,
		apiv1.LabelArchStable:         arch,
		apiv1.LabelOSStable:           cloudprovider.DefaultOS,
	}

	result[coreWeaveNodePoolUID] = ng.nodepool.GetUID()
	result[coreWeaveNodePoolName] = ng.nodepool.GetName()

	// Node pool labels are applied last so they take precedence.
	for key, value := range ng.nodepool.GetNodeLabels() {
		result[key] = value
	}

	return result
}

// Exist checks if the node group exists.
Expand Down
Loading