Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 9 additions & 15 deletions controllers/providers/aws/aws.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ import (

"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/ec2metadata"
"github.com/aws/aws-sdk-go/aws/session"
"github.com/aws/aws-sdk-go/service/autoscaling"
"github.com/aws/aws-sdk-go/service/autoscaling/autoscalingiface"
"github.com/aws/aws-sdk-go/service/ec2"
Expand Down Expand Up @@ -112,11 +111,12 @@ var (
)

type AwsWorker struct {
AsgClient autoscalingiface.AutoScalingAPI
EksClient eksiface.EKSAPI
IamClient iamiface.IAMAPI
Ec2Client ec2iface.EC2API
Parameters map[string]interface{}
AsgClient autoscalingiface.AutoScalingAPI
EksClient eksiface.EKSAPI
IamClient iamiface.IAMAPI
Ec2Client ec2iface.EC2API
Ec2Metadata *ec2metadata.EC2Metadata
Parameters map[string]interface{}
}

func (w *AwsWorker) WithRetries(f func() bool) error {
Expand Down Expand Up @@ -164,18 +164,12 @@ func GetTagValueByKey(tags []*autoscaling.TagDescription, key string) string {
return ""
}

func GetRegion() (string, error) {
func GetRegion(metadata *ec2metadata.EC2Metadata) (string, error) {
if os.Getenv("AWS_REGION") != "" {
return os.Getenv("AWS_REGION"), nil
}
// Try Derive
var config aws.Config
sess := session.Must(session.NewSessionWithOptions(session.Options{
SharedConfigState: session.SharedConfigEnable,
Config: config,
}))
c := ec2metadata.New(sess)
region, err := c.Region()

region, err := metadata.Region()
if err != nil {
return "", err
}
Expand Down
10 changes: 10 additions & 0 deletions controllers/providers/aws/ec2.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"strings"

"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/ec2metadata"
"github.com/aws/aws-sdk-go/aws/request"
"github.com/aws/aws-sdk-go/aws/session"
"github.com/aws/aws-sdk-go/service/ec2"
Expand Down Expand Up @@ -56,6 +57,15 @@ func GetAwsEc2Client(region string, cacheCfg *cache.Config, maxRetries int, coll
return ec2.New(sess)
}

func GetAwsEc2MetadataClient() *ec2metadata.EC2Metadata {
var config aws.Config
sess := session.Must(session.NewSessionWithOptions(session.Options{
SharedConfigState: session.SharedConfigEnable,
Config: config,
}))
return ec2metadata.New(sess)
}

func (w *AwsWorker) DescribeInstanceOfferings() ([]*ec2.InstanceTypeOffering, error) {
offerings := []*ec2.InstanceTypeOffering{}
err := w.Ec2Client.DescribeInstanceTypeOfferingsPages(&ec2.DescribeInstanceTypeOfferingsInput{}, func(page *ec2.DescribeInstanceTypeOfferingsOutput, lastPage bool) bool {
Expand Down
11 changes: 11 additions & 0 deletions controllers/providers/aws/eks.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ limitations under the License.
package aws

import (
"strings"

"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/awserr"
"github.com/aws/aws-sdk-go/aws/request"
Expand Down Expand Up @@ -210,3 +212,12 @@ func (w *AwsWorker) DescribeFargateProfile() (*eks.FargateProfile, error) {
}
return output.FargateProfile, nil
}

func (w *AwsWorker) GetDNSClusterIP(cluster *eks.Cluster) string {
if cluster == nil {
return ""
}
serviceCidr := aws.StringValue(cluster.KubernetesNetworkConfig.ServiceIpv4Cidr)
// addresses assigned from either the 10.100.0.0/16 or 172.20.0.0/16 CIDR blocks
return strings.ReplaceAll(serviceCidr, "0/16", "10")
}
4 changes: 4 additions & 0 deletions controllers/provisioners/eks/cloud.go
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,10 @@ func (d *DiscoveredState) SetCluster(cluster *eks.Cluster) {
d.Cluster = cluster
}

func (d *DiscoveredState) GetCluster() *eks.Cluster {
return d.Cluster
}

func (d *DiscoveredState) SetVPCId(id string) {
d.VPCId = id
}
Expand Down
7 changes: 5 additions & 2 deletions controllers/provisioners/eks/eks_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,11 +86,14 @@ func MockAwsWorker(asgClient *MockAutoScalingClient, iamClient *MockIamClient, e
func MockEksCluster(version string) *eks.Cluster {
return &eks.Cluster{
CertificateAuthority: &eks.Certificate{
Data: aws.String(""),
Data: aws.String("dGVzdA=="),
},
Endpoint: aws.String("foo.amazonaws.com"),
ResourcesVpcConfig: &eks.VpcConfigResponse{},
Version: &version,
KubernetesNetworkConfig: &eks.KubernetesNetworkConfigResponse{
ServiceIpv4Cidr: aws.String("172.20.0.0/16"),
},
Version: &version,
}
}

Expand Down
29 changes: 25 additions & 4 deletions controllers/provisioners/eks/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ func (ctx *EksInstanceGroupContext) GetBasicUserData(clusterName, args string, k
Echo "Not starting Kubelet due to warmed state."
& C:\ProgramData\Amazon\EC2-Windows\Launch\Scripts\InitializeInstance.ps1 –Schedule
} else {
& $EKSBootstrapScriptFile -EKSClusterName {{ .ClusterName }} -KubeletExtraArgs '{{ .KubeletExtraArgs }}' 3>&1 4>&1 5>&1 6>&1
& $EKSBootstrapScriptFile -EKSClusterName {{ .ClusterName }} {{ .Arguments }} 3>&1 4>&1 5>&1 6>&1
{{range $post := .PostBootstrap}}{{$post}}{{end}}
}
</powershell>`
Expand Down Expand Up @@ -513,13 +513,34 @@ func (ctx *EksInstanceGroupContext) GetComputedBootstrapOptions() *v1alpha1.Boot
func (ctx *EksInstanceGroupContext) GetBootstrapArgs() string {
var (
bootstrapOptions = ctx.GetComputedBootstrapOptions()
state = ctx.GetDiscoveredState()
osFamily = ctx.GetOsFamily()
cluster = state.GetCluster()
clusterIP = ctx.AwsWorker.GetDNSClusterIP(cluster)
)
var sb strings.Builder
switch strings.ToLower(osFamily) {
case OsFamilyWindows:
if state.Cluster != nil {
sb.WriteString(fmt.Sprintf("-Base64ClusterCA %v ", aws.StringValue(state.Cluster.CertificateAuthority.Data)))
sb.WriteString(fmt.Sprintf("-APIServerEndpoint %v ", aws.StringValue(state.Cluster.Endpoint)))
}
sb.WriteString(fmt.Sprintf("-KubeletExtraArgs '%v'", ctx.GetKubeletExtraArgs()))
case OsFamilyAmazonLinux2:
if bootstrapOptions != nil && bootstrapOptions.MaxPods > 0 {
sb.WriteString("--use-max-pods false ")
}
if state.Cluster != nil {
sb.WriteString(fmt.Sprintf("--b64-cluster-ca %v ", aws.StringValue(state.Cluster.CertificateAuthority.Data)))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's some tricky behavior in the EKS AL2 AMI that we might want to solve for before merging this change.
See https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh#L332

Basically, if the cluster ca and api server endpoint are defined, the AMI bypasses calling EKS - which is what we want! Unfortunately, that call also grabs the serviceIpv4Cidr field, which is used later in the bootstrap script to calculate the default ip that should be used for DNS (the kube-dns/core-dns service ip basically). We probably want to replicate this logic and inject the dns ip address in case the user is relying on this behavior.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch, will look into this

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@backjo take a look again, I added discovery of cluster ip in the case of AL2

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The changes look pretty good! If I might suggest one more change - looking up the IPv4 CIDR range of the node that IM is running on adds a somewhat implicit assumption that Instance Manager is running in the same VPC - which I think is generally a fair assumption, though it could make debugging with a remote cluster from an IDE a bit more difficult :P

I think a cleaner approach to determine the VPC CIDR range for the nodes might be to lookup the VPC attached to the EKS cluster - and use the CIDR range associated with it for the fallback logic.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah.. technically we can remove the fallback all together, since reconcile depends on getting cluster information.
I think it's safe to assume we will always have the ServiceCIDR and remove the metadata fallback logic. WDYT?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm fine with it - the only open question I have is whether ServiceCIDR is always defined - I know that it was only added in October 2020 - and all of our clusters are newer than that - so could use some help verifying that even for older clusters, the value is defined.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can confirm I have clusters from 2019 which have this populated.
Makes sense when they add a new API to make sure it's populated across all objects otherwise this would be impossible to use

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've also removed the ec2metadata discovery fallback since we guarantee to always have the cluster payload or fail the reconcile.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cool - sounds good!

sb.WriteString(fmt.Sprintf("--apiserver-endpoint %v ", aws.StringValue(state.Cluster.Endpoint)))
if !common.StringEmpty(clusterIP) {
sb.WriteString(fmt.Sprintf("--dns-cluster-ip %v ", clusterIP))
}
}

if bootstrapOptions != nil && bootstrapOptions.MaxPods > 0 {
sb.WriteString("--use-max-pods false ")
sb.WriteString(fmt.Sprintf("--kubelet-extra-args '%v'", ctx.GetKubeletExtraArgs()))
}
sb.WriteString(fmt.Sprintf("--kubelet-extra-args '%v'", ctx.GetKubeletExtraArgs()))

return sb.String()
}

Expand Down
82 changes: 78 additions & 4 deletions controllers/provisioners/eks/helpers_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ func TestGetBasicUserDataAmazonLinux2(t *testing.T) {
mounts = ctx.GetMountOpts()
)

expectedData := `#!/bin/bash
expectedDataLinux := `#!/bin/bash
foo
mkfs.xfs /dev/xvda
mkdir /mnt/foo
Expand All @@ -159,16 +159,90 @@ if [[ $(type -P $(which aws)) ]] && [[ $(type -P $(which jq)) ]] ; then
fi
fi
set -o xtrace
/etc/eks/bootstrap.sh foo --use-max-pods false --kubelet-extra-args '--node-labels=foo=bar,instancemgr.keikoproj.io/image=ami-123456789012,node.kubernetes.io/role=instance-group-1 --register-with-taints=foo=bar:NoSchedule --eviction-hard=memory.available<300Mi,nodefs.available<5% --system-reserved=memory=2.5Gi --v=2 --max-pods=4'
/etc/eks/bootstrap.sh foo --use-max-pods false --b64-cluster-ca dGVzdA== --apiserver-endpoint foo.amazonaws.com --dns-cluster-ip 172.20.0.10 --kubelet-extra-args '--node-labels=foo=bar,instancemgr.keikoproj.io/image=ami-123456789012,node.kubernetes.io/role=instance-group-1 --register-with-taints=foo=bar:NoSchedule --eviction-hard=memory.available<300Mi,nodefs.available<5% --system-reserved=memory=2.5Gi --v=2 --max-pods=4'
set +o xtrace
bar`
userData := ctx.GetBasicUserData("foo", args, kubeletArgs, userDataPayload, mounts)
basicUserDataDecoded, _ := base64.StdEncoding.DecodeString(userData)
basicUserDataString := string(basicUserDataDecoded)
if basicUserDataString != expectedData {
t.Fatalf("\nExpected: START>%v<END\n Got: START>%v<END", expectedData, basicUserDataString)
if basicUserDataString != expectedDataLinux {
t.Fatalf("\nExpected: START>%v<END\n Got: START>%v<END", expectedDataLinux, basicUserDataString)
}
}

func TestGetBasicUserDataWindows(t *testing.T) {
var (
k = MockKubernetesClientSet()
ig = MockInstanceGroup()
asgMock = NewAutoScalingMocker()
iamMock = NewIamMocker()
eksMock = NewEksMocker()
ec2Mock = NewEc2Mocker()
configuration = ig.GetEKSConfiguration()
)

w := MockAwsWorker(asgMock, iamMock, eksMock, ec2Mock)
ctx := MockContext(ig, k, w)

configuration.BootstrapOptions = &v1alpha1.BootstrapOptions{
MaxPods: 4,
}
configuration.Labels = map[string]string{
"foo": "bar",
}
configuration.Taints = []corev1.Taint{
{
Key: "foo",
Value: "bar",
Effect: "NoSchedule",
},
}

configuration.BootstrapArguments = "--eviction-hard=memory.available<300Mi,nodefs.available<5% --system-reserved=memory=2.5Gi --v=2"
configuration.UserData = []v1alpha1.UserDataStage{
{
Stage: "PreBootstrap",
Data: "foo",
},
{
Stage: "PostBootstrap",
Data: "bar",
},
}

ig.Annotations[OsFamilyAnnotation] = OsFamilyWindows

expectedDataWindows := `
<powershell>
foo
[string]$EKSBinDir = "$env:ProgramFiles\Amazon\EKS"
[string]$EKSBootstrapScriptName = 'Start-EKSBootstrap.ps1'
[string]$EKSBootstrapScriptFile = "$EKSBinDir\$EKSBootstrapScriptName"
[string]$IMDSToken=(curl -UseBasicParsing -Method PUT "http://169.254.169.254/latest/api/token" -H @{ "X-aws-ec2-metadata-token-ttl-seconds" = "21600"} | % { Echo $_.Content})
[string]$InstanceID=(curl -UseBasicParsing -Method GET "http://169.254.169.254/latest/meta-data/instance-id" -H @{ "X-aws-ec2-metadata-token" = "$IMDSToken"} | % { Echo $_.Content})
[string]$Lifecycle = Get-ASAutoScalingInstance $InstanceID | % { Echo $_.LifecycleState}
if ($Lifecycle -like "*Warmed*") {
Echo "Not starting Kubelet due to warmed state."
& C:\ProgramData\Amazon\EC2-Windows\Launch\Scripts\InitializeInstance.ps1 –Schedule
} else {
& $EKSBootstrapScriptFile -EKSClusterName foo -Base64ClusterCA dGVzdA== -APIServerEndpoint foo.amazonaws.com -KubeletExtraArgs '--node-labels=foo=bar,instancemgr.keikoproj.io/image=ami-123456789012,node.kubernetes.io/role=instance-group-1 --register-with-taints=foo=bar:NoSchedule --eviction-hard=memory.available<300Mi,nodefs.available<5% --system-reserved=memory=2.5Gi --v=2 --max-pods=4' 3>&1 4>&1 5>&1 6>&1
bar
}
</powershell>`

var (
args = ctx.GetBootstrapArgs()
kubeletArgs = ctx.GetKubeletExtraArgs()
userDataPayload = ctx.GetUserDataStages()
mounts = ctx.GetMountOpts()
)

userData := ctx.GetBasicUserData("foo", args, kubeletArgs, userDataPayload, mounts)
basicUserDataDecoded, _ := base64.StdEncoding.DecodeString(userData)
basicUserDataString := string(basicUserDataDecoded)
if basicUserDataString != expectedDataWindows {
t.Fatalf("\nExpected: START>%v<END\n Got: START>%v<END", expectedDataWindows, basicUserDataString)
}
}

func TestCustomNetworkingMaxPods(t *testing.T) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -480,7 +480,7 @@ func TestLaunchConfigurationDrifted(t *testing.T) {
LaunchConfigurationName: aws.String("my-launch-config"),
},
input: &CreateConfigurationInput{
SecurityGroups: []string{},
SecurityGroups: []string{},
MetadataOptions: &v1alpha1.MetadataOptions{HttpEndpoint: "enabled"},
},
shouldDrift: true,
Expand Down
12 changes: 7 additions & 5 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,8 @@ func main() {
os.Exit(1)
}

awsRegion, err := aws.GetRegion()
metadata := aws.GetAwsEc2MetadataClient()
awsRegion, err := aws.GetRegion(metadata)
if err != nil {
setupLog.Error(err, "unable to get AWS region")
os.Exit(1)
Expand All @@ -120,10 +121,11 @@ func main() {
cacheCollector := cacheCfg.NewCacheCollector("instance_manager")
controllerCollector := common.NewMetricsCollector()
awsWorker := aws.AwsWorker{
Ec2Client: aws.GetAwsEc2Client(awsRegion, cacheCfg, maxAPIRetries, controllerCollector),
IamClient: aws.GetAwsIamClient(awsRegion, cacheCfg, maxAPIRetries, controllerCollector),
AsgClient: aws.GetAwsAsgClient(awsRegion, cacheCfg, maxAPIRetries, controllerCollector),
EksClient: aws.GetAwsEksClient(awsRegion, cacheCfg, maxAPIRetries, controllerCollector),
Ec2Client: aws.GetAwsEc2Client(awsRegion, cacheCfg, maxAPIRetries, controllerCollector),
IamClient: aws.GetAwsIamClient(awsRegion, cacheCfg, maxAPIRetries, controllerCollector),
AsgClient: aws.GetAwsAsgClient(awsRegion, cacheCfg, maxAPIRetries, controllerCollector),
EksClient: aws.GetAwsEksClient(awsRegion, cacheCfg, maxAPIRetries, controllerCollector),
Ec2Metadata: metadata,
}

metrics.Registry.MustRegister(cacheCollector, controllerCollector)
Expand Down