13 changes: 13 additions & 0 deletions api/hypershift/v1beta1/hostedcluster_conditions.go
@@ -195,6 +195,13 @@ const (
// This condition is used to track the status of the recovery process and to determine if the HostedCluster
// is ready to be used after restoration.
HostedClusterRestoredFromBackup ConditionType = "HostedClusterRestoredFromBackup"

// ControlPlaneToDataPlaneConnectivityHealthy indicates whether the control plane can successfully
// reach the data plane components.
// When True, the control plane has healthy connectivity to the data plane nodes.
// When False, network connectivity issues are preventing the control plane from reaching the data plane.
// A failure here may indicate network policy issues, firewall rules, or infrastructure problems.
ControlPlaneToDataPlaneConnectivityHealthy ConditionType = "ControlPlaneToDataPlaneConnectivityHealthy"
)
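For context, a minimal sketch of how a consumer might inspect this condition once it has been populated (assuming the usual k8s.io/apimachinery/pkg/api/meta and metav1 imports, and an hcp object whose status the controller has already reconciled):

	cond := meta.FindStatusCondition(hcp.Status.Conditions, string(hyperv1.ControlPlaneToDataPlaneConnectivityHealthy))
	if cond != nil && cond.Status != metav1.ConditionTrue {
		// Connectivity is degraded or unknown; cond.Reason and cond.Message carry the details.
	}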

// Reasons.
@@ -251,6 +258,12 @@ const (
RecoveryFinishedReason = "RecoveryFinished"

CloudResourcesCleanupSkippedReason = "CloudResourcesCleanupSkipped"

ControlPlaneToDataPlaneReasonNoKonnectivityAgentPodsFound = "NoKonnectivityAgentPodsFound"

ControlPlaneToDataPlaneReasonLogAccessFailed = "ControlPlaneToDataPlaneLogAccessFailed"

ControlPlaneToDataPlaneReasonNoWorkerNodesAvailable = "NoWorkerNodesAvailable"
)

// Messages.
@@ -71,6 +71,7 @@ import (
"k8s.io/apimachinery/pkg/types"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/apimachinery/pkg/util/sets"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
apiregistrationv1 "k8s.io/kube-aggregator/pkg/apis/apiregistration/v1"
"k8s.io/utils/ptr"
@@ -127,6 +128,7 @@ exec /bin/azure-cloud-node-manager \
type reconciler struct {
client client.Client
uncachedClient client.Client
clientSet *clientset.Clientset
upsert.CreateOrUpdateProvider
platformType hyperv1.PlatformType
rootCA string
@@ -144,6 +146,18 @@ type reconciler struct {
operateOnReleaseImage string
ImageMetaDataProvider util.ImageMetadataProvider
cleanupTracker *util.CleanupTracker

// GetPodLogs is exposed as a struct field so unit tests can stub it; the underlying client-go GetLogs call is hard to mock directly.
GetPodLogs func(ctx context.Context, clientSet *clientset.Clientset, namespace, name, container string) ([]byte, error)
}

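// getPodLogs fetches up to 1 KiB of raw log output from the named container via the kubernetes clientset.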
func getPodLogs(ctx context.Context, clientSet *clientset.Clientset, namespace, name, container string) ([]byte, error) {
limit := int64(1024)
opts := &corev1.PodLogOptions{
Container: container,
LimitBytes: &limit,
}
return clientSet.CoreV1().Pods(namespace).GetLogs(name, opts).DoRaw(ctx)
}
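Per the comment on the GetPodLogs field above, a minimal sketch of how a unit test might stub it (the reconciler's other fields and the returned log line are illustrative assumptions):

	r := &reconciler{
		// ... other fields elided ...
		GetPodLogs: func(ctx context.Context, _ *clientset.Clientset, namespace, name, container string) ([]byte, error) {
			// Pretend the konnectivity-agent pod produced some output.
			return []byte("connected to proxy server"), nil
		},
	}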

// eventHandler is the handler used throughout. As this controller reconciles all kinds of different resources
@@ -181,9 +195,15 @@ func Setup(ctx context.Context, opts *operator.HostedClusterConfigOperatorConfig
return fmt.Errorf("failed to create kubevirt infra uncached client: %w", err)
}

clientSet, err := clientset.NewForConfig(opts.Manager.GetConfig())
if err != nil {
return fmt.Errorf("failed to initialize kubeClient from config: %w", err)
}

c, err := controller.New(ControllerName, opts.Manager, controller.Options{Reconciler: &reconciler{
client: opts.Manager.GetClient(),
uncachedClient: uncachedClient,
clientSet: clientSet,
CreateOrUpdateProvider: opts.TargetCreateOrUpdateProvider,
platformType: opts.PlatformType,
rootCA: opts.InitialCA,
@@ -201,6 +221,7 @@
operateOnReleaseImage: opts.OperateOnReleaseImage,
ImageMetaDataProvider: opts.ImageMetaDataProvider,
cleanupTracker: util.NewCleanupTracker(),
GetPodLogs: getPodLogs,
}})
if err != nil {
return fmt.Errorf("failed to construct controller: %w", err)
@@ -539,6 +560,11 @@ func (r *reconciler) Reconcile(ctx context.Context, _ ctrl.Request) (ctrl.Result
errs = append(errs, fmt.Errorf("failed to reconcile konnectivity agent: %w", err))
}

log.Info("reconciling control plane to data plane connectivity condition")
if err := r.reconcileControlPlaneDataPlaneConnectivityConditions(ctx, hcp, log); err != nil {
errs = append(errs, fmt.Errorf("failed to reconcile ControlPlaneToDataPlaneConnectivityHealthy condition: %w", err))
}

log.Info("reconciling openshift apiserver apiservices")
if err := r.reconcileOpenshiftAPIServerAPIServices(ctx, hcp); err != nil {
errs = append(errs, fmt.Errorf("failed to reconcile openshift apiserver service: %w", err))
@@ -1389,6 +1415,79 @@ func (r *reconciler) reconcileClusterVersion(ctx context.Context, hcp *hyperv1.H
return nil
}

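// reconcileControlPlaneDataPlaneConnectivityConditions sets the ControlPlaneToDataPlaneConnectivityHealthy condition on the HostedControlPlane status, based on whether worker nodes are available and whether running konnectivity-agent pods in the data plane have readable logs.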
func (r *reconciler) reconcileControlPlaneDataPlaneConnectivityConditions(ctx context.Context, hcp *hyperv1.HostedControlPlane, log logr.Logger) error {
patchHCPWithCondition := func(hcp *hyperv1.HostedControlPlane, condition *metav1.Condition) error {
originalHCP := hcp.DeepCopy()
if !meta.SetStatusCondition(&hcp.Status.Conditions, *condition) {
return nil // No status change; avoid unnecessary API call.
}
if err := r.cpClient.Status().Patch(ctx, hcp, client.MergeFrom(originalHCP)); err != nil {
return fmt.Errorf("failed to update HostedControlPlane status with %s condition: %w", condition.Type, err)
}
log.Info(condition.Type + " condition updated")
return nil
}

condition := &metav1.Condition{
Type: string(hyperv1.ControlPlaneToDataPlaneConnectivityHealthy),
Status: metav1.ConditionFalse, // False by default
}
totalNodes, err := util.CountAvailableNodes(ctx, r.client)
if err != nil {
condition.Status = metav1.ConditionUnknown
condition.Reason = hyperv1.ControlPlaneToDataPlaneReasonNoWorkerNodesAvailable
condition.Message = "Unable to count worker nodes: " + err.Error()
return patchHCPWithCondition(hcp, condition)
}
if totalNodes == 0 {
condition.Status = metav1.ConditionUnknown
condition.Reason = hyperv1.ControlPlaneToDataPlaneReasonNoWorkerNodesAvailable
condition.Message = "No worker nodes available"
return patchHCPWithCondition(hcp, condition)
}
var podList corev1.PodList
if err := r.uncachedClient.List(ctx, &podList,
client.MatchingLabels{"app": "konnectivity-agent"}, client.InNamespace("kube-system")); err != nil {
condition.Reason = hyperv1.ControlPlaneToDataPlaneReasonNoKonnectivityAgentPodsFound
condition.Message = "Couldn't list konnectivity-agent pods in the kube-system namespace: " + err.Error()
return patchHCPWithCondition(hcp, condition)
}

logsFound := false
runningPodsFound := false
for _, pod := range podList.Items {
if pod.Status.Phase != corev1.PodRunning {
continue
}
runningPodsFound = true
data, err := r.GetPodLogs(ctx, r.clientSet, pod.Namespace, pod.Name, "konnectivity-agent")
if err != nil {
log.Error(err,
fmt.Sprintf("failed to get logs for konnectivity-agent pod %s/%s", pod.Namespace, pod.Name))
continue
}
if len(data) > 0 {
logsFound = true
break
}
}
if !runningPodsFound {
condition.Reason = hyperv1.ControlPlaneToDataPlaneReasonNoKonnectivityAgentPodsFound
condition.Message = "Couldn't find any running konnectivity-agent pod in the data plane"
} else if !logsFound {
condition.Reason = hyperv1.ControlPlaneToDataPlaneReasonLogAccessFailed
condition.Message = "Failed to read konnectivity-agent logs from the data plane"
} else {
condition.Status = metav1.ConditionTrue
condition.Reason = hyperv1.AsExpectedReason
condition.Message = hyperv1.AllIsWellMessage
}

return patchHCPWithCondition(hcp, condition)
}
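As a rough summary of the branches above, a table-driven test skeleton might pin down the expected status and reason per scenario (the reconciler and fake-object setup are elided and assumed; the case names are illustrative):

	cases := []struct {
		name           string
		expectedStatus metav1.ConditionStatus
		expectedReason string
	}{
		{"counting worker nodes fails", metav1.ConditionUnknown, hyperv1.ControlPlaneToDataPlaneReasonNoWorkerNodesAvailable},
		{"no worker nodes", metav1.ConditionUnknown, hyperv1.ControlPlaneToDataPlaneReasonNoWorkerNodesAvailable},
		{"no running konnectivity-agent pods", metav1.ConditionFalse, hyperv1.ControlPlaneToDataPlaneReasonNoKonnectivityAgentPodsFound},
		{"logs unreadable", metav1.ConditionFalse, hyperv1.ControlPlaneToDataPlaneReasonLogAccessFailed},
		{"logs readable", metav1.ConditionTrue, hyperv1.AsExpectedReason},
	}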

func (r *reconciler) reconcileOpenshiftAPIServerAPIServices(ctx context.Context, hcp *hyperv1.HostedControlPlane) error {
rootCA := cpomanifests.RootCASecret(hcp.Namespace)
if err := r.cpClient.Get(ctx, client.ObjectKeyFromObject(rootCA), rootCA); err != nil {