Skip to content

Commit e19fe06

Browse files
r2k1Copilot
andcommitted
fix(e2e): improve E2E infra setup reliability
Address infrastructure failures that cause cascading test failures (464 failures across 46 builds over 3 weeks). 1. Make stale node GC tolerant (cluster.go): - Ignore NotFound errors on node deletion (already gone) - Log other delete errors as warnings instead of failing - Only return error if zero nodes could be deleted Evidence: 79 failures from 'failed to delete 3 stale nodes' 2. Fix existingVMSS map (cluster.go): - Only add VMSS to existingVMSS if they are being kept - VMSS queued for deletion are excluded, so their stale K8s nodes can be cleaned up in the same pass - If VMSS deletion fails, keep in map to avoid orphaned deletes 3. Retry Bastion subnet GET (cluster.go): - Poll with backoff for up to 30s on transient ARM errors - 404 still handled normally (create subnet) Evidence: 179 failures from 'get subnet AzureBastionSubnet: context deadline exceeded' 4. Retry AKS subnet + route table lookup (aks_model.go): - Poll with backoff for up to 2 minutes - Handles both transient GET failures and kubenet route table propagation delays after cluster create/reuse Evidence: 39 failures from 'AKS subnet has no route table' 5. Retry Firewall creation (aks_model.go): - Poll with backoff for up to 10 minutes - BeginCreateOrUpdate is idempotent, safe to retry Evidence: 90 failures from 'failed to create Firewall: context deadline exceeded' Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 788e191 commit e19fe06

2 files changed

Lines changed: 102 additions & 29 deletions

File tree

e2e/aks_model.go

Lines changed: 47 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"os"
99
"path/filepath"
1010
"strings"
11+
"time"
1112

1213
"github.com/Azure/agentbaker/e2e/config"
1314
"github.com/Azure/agentbaker/e2e/toolkit"
@@ -19,6 +20,7 @@ import (
1920
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v8"
2021
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v7"
2122
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/privatedns/armprivatedns"
23+
"k8s.io/apimachinery/pkg/util/wait"
2224
)
2325

2426
// getLatestGAKubernetesVersion returns the highest GA Kubernetes version for the given location.
@@ -314,13 +316,31 @@ func addFirewallRules(
314316
// routes (managed by cloud-provider-azure) and firewall routes coexist.
315317
// For Azure CNI variants, the subnet may not have any route table, so we
316318
// create and associate a dedicated one before adding the firewall routes.
317-
aksSubnetResp, err := config.Azure.Subnet.Get(ctx, rg, vnet.name, "aks-subnet", nil)
318-
if err != nil {
319-
return fmt.Errorf("failed to get AKS subnet: %w", err)
320-
}
321-
aksRTName, err := ensureFirewallRouteTable(ctx, clusterModel, vnet.name, aksSubnetResp.Subnet)
319+
// Retry the AKS subnet GET and route table lookup to tolerate:
320+
// - transient ARM timeouts on the subnet GET
321+
// - eventual consistency: kubenet route table may still be propagating after cluster create/reuse
322+
var aksRTName string
323+
var lastSubnetErr, lastRTErr error
324+
err = wait.PollUntilContextTimeout(ctx, 5*time.Second, 2*time.Minute, true, func(ctx context.Context) (bool, error) {
325+
callCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
326+
defer cancel()
327+
aksSubnetResp, subnetErr := config.Azure.Subnet.Get(callCtx, rg, vnet.name, "aks-subnet", nil)
328+
if subnetErr != nil {
329+
lastSubnetErr = subnetErr
330+
toolkit.Logf(ctx, "transient error getting AKS subnet (retrying): %v", subnetErr)
331+
return false, nil
332+
}
333+
rtName, rtErr := ensureFirewallRouteTable(callCtx, clusterModel, vnet.name, aksSubnetResp.Subnet)
334+
if rtErr != nil {
335+
lastRTErr = rtErr
336+
toolkit.Logf(ctx, "route table not ready (retrying): %v", rtErr)
337+
return false, nil
338+
}
339+
aksRTName = rtName
340+
return true, nil
341+
})
322342
if err != nil {
323-
return err
343+
return fmt.Errorf("failed to get AKS subnet and route table after retries: %w (last subnet error: %v, last route table error: %v)", err, lastSubnetErr, lastRTErr)
324344
}
325345

326346
// Create AzureFirewallSubnet - this subnet name is required by Azure Firewall
@@ -386,13 +406,28 @@ func addFirewallRules(
386406

387407
firewallName := "abe2e-fw"
388408
firewall := getFirewall(ctx, location, firewallSubnetID, publicIPID)
389-
fwPoller, err := config.Azure.AzureFirewall.BeginCreateOrUpdate(ctx, rg, firewallName, *firewall, nil)
390-
if err != nil {
391-
return fmt.Errorf("failed to start Firewall creation: %w", err)
392-
}
393-
fwResp, err := fwPoller.PollUntilDone(ctx, nil)
409+
var fwResp armnetwork.AzureFirewallsClientCreateOrUpdateResponse
410+
var lastFWErr error
411+
err = wait.PollUntilContextTimeout(ctx, 10*time.Second, 10*time.Minute, true, func(ctx context.Context) (bool, error) {
412+
// Per-attempt timeout so a hung PollUntilDone doesn't consume the entire retry budget.
413+
attemptCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
414+
defer cancel()
415+
fwPoller, fwErr := config.Azure.AzureFirewall.BeginCreateOrUpdate(attemptCtx, rg, firewallName, *firewall, nil)
416+
if fwErr != nil {
417+
lastFWErr = fwErr
418+
toolkit.Logf(ctx, "transient error starting firewall creation (retrying): %v", fwErr)
419+
return false, nil
420+
}
421+
fwResp, fwErr = fwPoller.PollUntilDone(attemptCtx, nil)
422+
if fwErr != nil {
423+
lastFWErr = fwErr
424+
toolkit.Logf(ctx, "firewall creation did not complete (retrying): %v", fwErr)
425+
return false, nil
426+
}
427+
return true, nil
428+
})
394429
if err != nil {
395-
return fmt.Errorf("failed to create Firewall: %w", err)
430+
return fmt.Errorf("failed to create Firewall after retries: %w (last error: %v)", err, lastFWErr)
396431
}
397432

398433
// Get the firewall's private IP address

e2e/cluster.go

Lines changed: 55 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources/v3"
2525
"github.com/google/uuid"
2626
corev1 "k8s.io/api/core/v1"
27+
apierrors "k8s.io/apimachinery/pkg/api/errors"
2728
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2829
"k8s.io/apimachinery/pkg/util/wait"
2930
"k8s.io/client-go/tools/clientcmd"
@@ -336,24 +337,33 @@ func waitForClusterDeletion(ctx context.Context, clusterName, resourceGroupName
336337

337338
func waitUntilClusterReady(ctx context.Context, name, location string) (*armcontainerservice.ManagedCluster, error) {
338339
var cluster armcontainerservice.ManagedClustersClientGetResponse
340+
var clusterDeleted bool
339341
err := wait.PollUntilContextCancel(ctx, time.Second, true, func(ctx context.Context) (bool, error) {
340342
var err error
341343
cluster, err = config.Azure.AKS.Get(ctx, config.ResourceGroupName(location), name, nil)
342344
if err != nil {
345+
var azErr *azcore.ResponseError
346+
if errors.As(err, &azErr) && azErr.StatusCode == 404 {
347+
clusterDeleted = true
348+
return true, nil
349+
}
343350
return false, err
344351
}
345352
switch *cluster.ManagedCluster.Properties.ProvisioningState {
346353
case "Succeeded":
347354
return true, nil
348-
case "Updating", "Assigned", "Creating":
355+
case "Updating", "Assigned", "Creating", "Deleting", "Canceled", "Canceling":
349356
return false, nil
350357
default:
351-
return false, fmt.Errorf("cluster %s is in state %s", name, *cluster.ManagedCluster.Properties.ProvisioningState)
358+
return false, fmt.Errorf("cluster %s is in state %s, won't retry", name, *cluster.ManagedCluster.Properties.ProvisioningState)
352359
}
353360
})
354361
if err != nil {
355362
return nil, fmt.Errorf("failed to wait for cluster %s to be ready: %w", name, err)
356363
}
364+
if clusterDeleted {
365+
return nil, nil
366+
}
357367
return &cluster.ManagedCluster, nil
358368
}
359369

@@ -511,13 +521,30 @@ func createNewBastion(ctx context.Context, cluster *armcontainerservice.ManagedC
511521
}
512522

513523
var bastionSubnetID string
514-
bastionSubnet, subnetGetErr := config.Azure.Subnet.Get(ctx, nodeRG, vnet.name, bastionSubnetName, nil)
515-
if subnetGetErr != nil {
524+
var bastionSubnet armnetwork.SubnetsClientGetResponse
525+
var subnetGetErr error
526+
// Retry the subnet GET with a per-call timeout to tolerate ARM hangs.
527+
// Without this, a single unresponsive GET consumes the entire 20-minute cluster prep budget.
528+
err = wait.PollUntilContextTimeout(ctx, 5*time.Second, 2*time.Minute, true, func(ctx context.Context) (bool, error) {
529+
callCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
530+
defer cancel()
531+
bastionSubnet, subnetGetErr = config.Azure.Subnet.Get(callCtx, nodeRG, vnet.name, bastionSubnetName, nil)
532+
if subnetGetErr == nil {
533+
return true, nil
534+
}
516535
var subnetAzErr *azcore.ResponseError
517-
if !errors.As(subnetGetErr, &subnetAzErr) || subnetAzErr.StatusCode != http.StatusNotFound {
518-
return nil, fmt.Errorf("get subnet %q in vnet %q rg %q: %w", bastionSubnetName, vnet.name, nodeRG, subnetGetErr)
536+
if errors.As(subnetGetErr, &subnetAzErr) && subnetAzErr.StatusCode == http.StatusNotFound {
537+
return true, nil // 404 is expected — will create below
519538
}
539+
toolkit.Logf(ctx, "transient error getting subnet %q (retrying): %v", bastionSubnetName, subnetGetErr)
540+
return false, nil
541+
})
542+
if err != nil {
543+
return nil, fmt.Errorf("get subnet %q in vnet %q rg %q: retries exhausted: %w (last subnet error: %v)", bastionSubnetName, vnet.name, nodeRG, err, subnetGetErr)
544+
}
520545

546+
if subnetGetErr != nil {
547+
// 404 — need to create
521548
toolkit.Logf(ctx, "creating subnet %s in VNet %s (rg %s)", bastionSubnetName, vnet.name, nodeRG)
522549
subnetParams := armnetwork.Subnet{
523550
Properties: &armnetwork.SubnetPropertiesFormat{
@@ -705,28 +732,30 @@ func collectGarbageVMSS(ctx context.Context, cluster *armcontainerservice.Manage
705732
defer toolkit.LogStepCtx(ctx, "collecting garbage VMSS")()
706733
rg := *cluster.Properties.NodeResourceGroup
707734

708-
// Build a set of all existing VMSS names while deleting old ones.
709-
existingVMSS := map[string]struct{}{}
735+
// Build a set of VMSS names that should be kept — exclude VMSS that are
736+
// being deleted so their stale K8s nodes can be cleaned up in the same pass.
737+
keptVMSS := map[string]struct{}{}
710738
pager := config.Azure.VMSS.NewListPager(rg, nil)
711739
for pager.More() {
712740
page, err := pager.NextPage(ctx)
713741
if err != nil {
714742
return fmt.Errorf("failed to get next page of VMSS: %w", err)
715743
}
716744
for _, vmss := range page.Value {
717-
existingVMSS[*vmss.Name] = struct{}{}
718-
719745
if _, ok := vmss.Tags["KEEP_VMSS"]; ok {
746+
keptVMSS[*vmss.Name] = struct{}{}
720747
continue
721748
}
722749
// don't delete managed pools
723750
if _, ok := vmss.Tags["aks-managed-poolName"]; ok {
751+
keptVMSS[*vmss.Name] = struct{}{}
724752
continue
725753
}
726754

727755
// don't delete VMSS created in the last hour. They might be currently used in tests
728756
// extra 10 minutes is a buffer for test cleanup, clock drift and timeout adjustments
729757
if config.Config.TestTimeout == 0 || time.Since(*vmss.Properties.TimeCreated) < config.Config.TestTimeout+10*time.Minute {
758+
keptVMSS[*vmss.Name] = struct{}{}
730759
continue
731760
}
732761

@@ -735,13 +764,16 @@ func collectGarbageVMSS(ctx context.Context, cluster *armcontainerservice.Manage
735764
})
736765
if err != nil {
737766
toolkit.Logf(ctx, "failed to delete vmss %q: %s", *vmss.Name, err)
767+
// Keep in map so we don't try to delete its nodes while VMSS is still around
768+
keptVMSS[*vmss.Name] = struct{}{}
738769
continue
739770
}
740771
toolkit.Logf(ctx, "deleted vmss %q (age: %v)", *vmss.ID, time.Since(*vmss.Properties.TimeCreated))
772+
// Don't add to keptVMSS — nodes from this VMSS should be cleaned up
741773
}
742774
}
743775

744-
if err := collectGarbageNodes(ctx, kube, existingVMSS); err != nil {
776+
if err := collectGarbageNodes(ctx, kube, keptVMSS); err != nil {
745777
return fmt.Errorf("failed to collect garbage K8s nodes: %w", err)
746778
}
747779
return nil
@@ -751,15 +783,15 @@ func collectGarbageVMSS(ctx context.Context, cluster *armcontainerservice.Manage
751783
// longer exists. This prevents stale nodes from accumulating in the cluster
752784
// and overwhelming the cloud-provider-azure route controller with perpetual
753785
// "instance not found" failures.
754-
func collectGarbageNodes(ctx context.Context, kube *Kubeclient, existingVMSS map[string]struct{}) error {
786+
func collectGarbageNodes(ctx context.Context, kube *Kubeclient, keptVMSS map[string]struct{}) error {
755787
defer toolkit.LogStepCtx(ctx, "collecting garbage K8s nodes")()
756788

757789
nodes, err := kube.Typed.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
758790
if err != nil {
759791
return fmt.Errorf("listing K8s nodes for garbage collection: %w", err)
760792
}
761793

762-
var deleteErrors []error
794+
var deleted, failed int
763795
for _, node := range nodes.Items {
764796
// skip managed pool nodes (system nodepool)
765797
if strings.HasPrefix(node.Name, "aks-") {
@@ -772,19 +804,25 @@ func collectGarbageNodes(ctx context.Context, kube *Kubeclient, existingVMSS map
772804
}
773805
vmssName := node.Name[:len(node.Name)-6]
774806

775-
if _, exists := existingVMSS[vmssName]; exists {
807+
if _, exists := keptVMSS[vmssName]; exists {
776808
continue
777809
}
778810

779811
if err := kube.Typed.CoreV1().Nodes().Delete(ctx, node.Name, metav1.DeleteOptions{}); err != nil {
780-
deleteErrors = append(deleteErrors, fmt.Errorf("deleting stale node %q: %w", node.Name, err))
812+
if apierrors.IsNotFound(err) {
813+
toolkit.Logf(ctx, "stale K8s node %q already gone", node.Name)
814+
continue
815+
}
816+
toolkit.Logf(ctx, "warning: failed to delete stale K8s node %q: %v", node.Name, err)
817+
failed++
781818
continue
782819
}
783820
toolkit.Logf(ctx, "deleted stale K8s node %q (VMSS %q not found)", node.Name, vmssName)
821+
deleted++
784822
}
785823

786-
if len(deleteErrors) > 0 {
787-
return fmt.Errorf("failed to delete %d stale nodes, first error: %w", len(deleteErrors), deleteErrors[0])
824+
if failed > 0 && deleted == 0 {
825+
return fmt.Errorf("failed to delete any of %d stale nodes", failed)
788826
}
789827
return nil
790828
}

0 commit comments

Comments
 (0)