@@ -26,6 +26,7 @@ import (
2626 "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources/v3"
2727 "github.com/google/uuid"
2828 corev1 "k8s.io/api/core/v1"
29+ apierrors "k8s.io/apimachinery/pkg/api/errors"
2930 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3031 "k8s.io/apimachinery/pkg/util/wait"
3132 "k8s.io/client-go/tools/clientcmd"
@@ -348,24 +349,33 @@ func waitForClusterDeletion(ctx context.Context, clusterName, resourceGroupName
348349
349350func waitUntilClusterReady (ctx context.Context , name , location string ) (* armcontainerservice.ManagedCluster , error ) {
350351 var cluster armcontainerservice.ManagedClustersClientGetResponse
352+ var clusterDeleted bool
351353 err := wait .PollUntilContextCancel (ctx , time .Second , true , func (ctx context.Context ) (bool , error ) {
352354 var err error
353355 cluster , err = config .Azure .AKS .Get (ctx , config .ResourceGroupName (location ), name , nil )
354356 if err != nil {
357+ var azErr * azcore.ResponseError
358+ if errors .As (err , & azErr ) && azErr .StatusCode == 404 {
359+ clusterDeleted = true
360+ return true , nil
361+ }
355362 return false , err
356363 }
357364 switch * cluster .ManagedCluster .Properties .ProvisioningState {
358365 case "Succeeded" :
359366 return true , nil
360- case "Updating" , "Assigned" , "Creating" :
367+ case "Updating" , "Assigned" , "Creating" , "Deleting" , "Canceled" , "Canceling" :
361368 return false , nil
362369 default :
363- return false , fmt .Errorf ("cluster %s is in state %s" , name , * cluster .ManagedCluster .Properties .ProvisioningState )
370+ return false , fmt .Errorf ("cluster %s is in state %s, won't retry " , name , * cluster .ManagedCluster .Properties .ProvisioningState )
364371 }
365372 })
366373 if err != nil {
367374 return nil , fmt .Errorf ("failed to wait for cluster %s to be ready: %w" , name , err )
368375 }
376+ if clusterDeleted {
377+ return nil , nil
378+ }
369379 return & cluster .ManagedCluster , nil
370380}
371381
@@ -533,13 +543,30 @@ func createNewBastion(ctx context.Context, cluster *armcontainerservice.ManagedC
533543 }
534544
535545 var bastionSubnetID string
536- bastionSubnet , subnetGetErr := config .Azure .Subnet .Get (ctx , nodeRG , vnet .name , bastionSubnetName , nil )
537- if subnetGetErr != nil {
546+ var bastionSubnet armnetwork.SubnetsClientGetResponse
547+ var subnetGetErr error
548+ // Retry the subnet GET with a per-call timeout to tolerate ARM hangs.
549+ // Without this, a single unresponsive GET consumes the entire 20-minute cluster prep budget.
550+ err = wait .PollUntilContextTimeout (ctx , 5 * time .Second , 2 * time .Minute , true , func (ctx context.Context ) (bool , error ) {
551+ callCtx , cancel := context .WithTimeout (ctx , 30 * time .Second )
552+ defer cancel ()
553+ bastionSubnet , subnetGetErr = config .Azure .Subnet .Get (callCtx , nodeRG , vnet .name , bastionSubnetName , nil )
554+ if subnetGetErr == nil {
555+ return true , nil
556+ }
538557 var subnetAzErr * azcore.ResponseError
539- if ! errors .As (subnetGetErr , & subnetAzErr ) || subnetAzErr .StatusCode ! = http .StatusNotFound {
540- return nil , fmt . Errorf ( "get subnet %q in vnet %q rg %q: %w" , bastionSubnetName , vnet . name , nodeRG , subnetGetErr )
558+ if errors .As (subnetGetErr , & subnetAzErr ) && subnetAzErr .StatusCode = = http .StatusNotFound {
559+ return true , nil // 404 is expected — will create below
541560 }
561+ toolkit .Logf (ctx , "transient error getting subnet %q (retrying): %v" , bastionSubnetName , subnetGetErr )
562+ return false , nil
563+ })
564+ if err != nil {
565+ return nil , fmt .Errorf ("get subnet %q in vnet %q rg %q: retries exhausted: %w (last subnet error: %v)" , bastionSubnetName , vnet .name , nodeRG , err , subnetGetErr )
566+ }
542567
568+ if subnetGetErr != nil {
569+ // 404 — need to create
543570 toolkit .Logf (ctx , "creating subnet %s in VNet %s (rg %s)" , bastionSubnetName , vnet .name , nodeRG )
544571 subnetParams := armnetwork.Subnet {
545572 Properties : & armnetwork.SubnetPropertiesFormat {
@@ -727,28 +754,30 @@ func collectGarbageVMSS(ctx context.Context, cluster *armcontainerservice.Manage
727754 defer toolkit .LogStepCtx (ctx , "collecting garbage VMSS" )()
728755 rg := * cluster .Properties .NodeResourceGroup
729756
730- // Build a set of all existing VMSS names while deleting old ones.
731- existingVMSS := map [string ]struct {}{}
757+ // Build a set of VMSS names that should be kept — exclude VMSS that are
758+ // being deleted so their stale K8s nodes can be cleaned up in the same pass.
759+ keptVMSS := map [string ]struct {}{}
732760 pager := config .Azure .VMSS .NewListPager (rg , nil )
733761 for pager .More () {
734762 page , err := pager .NextPage (ctx )
735763 if err != nil {
736764 return fmt .Errorf ("failed to get next page of VMSS: %w" , err )
737765 }
738766 for _ , vmss := range page .Value {
739- existingVMSS [* vmss .Name ] = struct {}{}
740-
741767 if _ , ok := vmss .Tags ["KEEP_VMSS" ]; ok {
768+ keptVMSS [* vmss .Name ] = struct {}{}
742769 continue
743770 }
744771 // don't delete managed pools
745772 if _ , ok := vmss .Tags ["aks-managed-poolName" ]; ok {
773+ keptVMSS [* vmss .Name ] = struct {}{}
746774 continue
747775 }
748776
749777 // don't delete VMSS created in the last hour. They might be currently used in tests
750778 // extra 10 minutes is a buffer for test cleanup, clock drift and timeout adjustments
751779 if config .Config .TestTimeout == 0 || time .Since (* vmss .Properties .TimeCreated ) < config .Config .TestTimeout + 10 * time .Minute {
780+ keptVMSS [* vmss .Name ] = struct {}{}
752781 continue
753782 }
754783
@@ -757,13 +786,16 @@ func collectGarbageVMSS(ctx context.Context, cluster *armcontainerservice.Manage
757786 })
758787 if err != nil {
759788 toolkit .Logf (ctx , "failed to delete vmss %q: %s" , * vmss .Name , err )
789+ // Keep in map so we don't try to delete its nodes while VMSS is still around
790+ keptVMSS [* vmss .Name ] = struct {}{}
760791 continue
761792 }
762793 toolkit .Logf (ctx , "deleted vmss %q (age: %v)" , * vmss .ID , time .Since (* vmss .Properties .TimeCreated ))
794+ // Don't add to keptVMSS — nodes from this VMSS should be cleaned up
763795 }
764796 }
765797
766- if err := collectGarbageNodes (ctx , kube , existingVMSS ); err != nil {
798+ if err := collectGarbageNodes (ctx , kube , keptVMSS ); err != nil {
767799 return fmt .Errorf ("failed to collect garbage K8s nodes: %w" , err )
768800 }
769801 return nil
@@ -773,15 +805,15 @@ func collectGarbageVMSS(ctx context.Context, cluster *armcontainerservice.Manage
773805// longer exists. This prevents stale nodes from accumulating in the cluster
774806// and overwhelming the cloud-provider-azure route controller with perpetual
775807// "instance not found" failures.
776- func collectGarbageNodes (ctx context.Context , kube * Kubeclient , existingVMSS map [string ]struct {}) error {
808+ func collectGarbageNodes (ctx context.Context , kube * Kubeclient , keptVMSS map [string ]struct {}) error {
777809 defer toolkit .LogStepCtx (ctx , "collecting garbage K8s nodes" )()
778810
779811 nodes , err := kube .Typed .CoreV1 ().Nodes ().List (ctx , metav1.ListOptions {})
780812 if err != nil {
781813 return fmt .Errorf ("listing K8s nodes for garbage collection: %w" , err )
782814 }
783815
784- var deleteErrors [] error
816+ var deleted , failed int
785817 for _ , node := range nodes .Items {
786818 // skip managed pool nodes (system nodepool)
787819 if strings .HasPrefix (node .Name , "aks-" ) {
@@ -794,19 +826,25 @@ func collectGarbageNodes(ctx context.Context, kube *Kubeclient, existingVMSS map
794826 }
795827 vmssName := node .Name [:len (node .Name )- 6 ]
796828
797- if _ , exists := existingVMSS [vmssName ]; exists {
829+ if _ , exists := keptVMSS [vmssName ]; exists {
798830 continue
799831 }
800832
801833 if err := kube .Typed .CoreV1 ().Nodes ().Delete (ctx , node .Name , metav1.DeleteOptions {}); err != nil {
802- deleteErrors = append (deleteErrors , fmt .Errorf ("deleting stale node %q: %w" , node .Name , err ))
834+ if apierrors .IsNotFound (err ) {
835+ toolkit .Logf (ctx , "stale K8s node %q already gone" , node .Name )
836+ continue
837+ }
838+ toolkit .Logf (ctx , "warning: failed to delete stale K8s node %q: %v" , node .Name , err )
839+ failed ++
803840 continue
804841 }
805842 toolkit .Logf (ctx , "deleted stale K8s node %q (VMSS %q not found)" , node .Name , vmssName )
843+ deleted ++
806844 }
807845
808- if len ( deleteErrors ) > 0 {
809- return fmt .Errorf ("failed to delete %d stale nodes, first error: %w " , len ( deleteErrors ), deleteErrors [ 0 ] )
846+ if failed > 0 && deleted == 0 {
847+ return fmt .Errorf ("failed to delete any of %d stale nodes" , failed )
810848 }
811849 return nil
812850}
0 commit comments