Fixups
Signed-off-by: Stefan Büringer buringerst@vmware.com
sbueringer committed Feb 9, 2023
1 parent 980dde5 commit 7cf0192
Showing 11 changed files with 179 additions and 108 deletions.
@@ -171,6 +171,8 @@ func (in *KubeadmControlPlane) ValidateUpdate(old runtime.Object) error {
{spec, "machineTemplate", "nodeDeletionTimeout"},
{spec, "replicas"},
{spec, "version"},
{spec, "remediationStrategy"},
{spec, "remediationStrategy", "*"},
{spec, "rolloutAfter"},
{spec, "rolloutBefore", "*"},
{spec, "rolloutStrategy", "*"},
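Two allow-list entries are needed for the new field: `{spec, "remediationStrategy"}` permits setting or clearing the struct itself, while `{spec, "remediationStrategy", "*"}` permits mutating any field nested below it. A rough sketch of the matching idea, not the webhook's actual helper:

```go
// allowedPath is an illustrative re-implementation of the allow-list matching:
// a "*" segment permits that segment and everything nested below it.
func allowedPath(allowed, path []string) bool {
	for i, segment := range path {
		if i >= len(allowed) {
			// path is deeper than this allow-list entry and no "*" was reached
			return false
		}
		if allowed[i] == "*" {
			return true
		}
		if allowed[i] != segment {
			return false
		}
	}
	// an exact-length match (no wildcard) covers the field itself
	return len(path) == len(allowed)
}
```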
@@ -407,6 +407,11 @@ func TestKubeadmControlPlaneValidateUpdate(t *testing.T) {
validUpdate.Spec.RolloutBefore = &RolloutBefore{
CertificatesExpiryDays: pointer.Int32(14),
}
+ validUpdate.Spec.RemediationStrategy = &RemediationStrategy{
+ MaxRetry: pointer.Int32(50),
+ MinHealthyPeriod: &metav1.Duration{Duration: 10 * time.Hour},
+ RetryPeriod: metav1.Duration{Duration: 10 * time.Minute},
+ }
validUpdate.Spec.KubeadmConfigSpec.Format = bootstrapv1.CloudConfig

scaleToZero := before.DeepCopy()
@@ -91,6 +91,10 @@ type KubeadmControlPlaneTemplateResourceSpec struct {
// +optional
// +kubebuilder:default={type: "RollingUpdate", rollingUpdate: {maxSurge: 1}}
RolloutStrategy *RolloutStrategy `json:"rolloutStrategy,omitempty"`

+ // The RemediationStrategy that controls how control plane machine remediation happens.
+ // +optional
+ RemediationStrategy *RemediationStrategy `json:"remediationStrategy,omitempty"`
}

// KubeadmControlPlaneTemplateMachineTemplate defines the template for Machines
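For illustration, a minimal sketch of how the new field might be populated in Go, using the field names and types exercised elsewhere in this commit. The semantics are those implied by checkRetryLimits below: MaxRetry caps how many times KCP retries remediation of the same machine, RetryPeriod is the wait between retries, and MinHealthyPeriod is how long a machine must stay healthy before a new failure starts a fresh retry sequence. The concrete values are illustrative only:

```go
import (
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/utils/pointer"

	controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1"
)

// exampleRemediationStrategy returns an illustrative configuration:
// at most 3 retries per unhealthy machine, 5 minutes between retries,
// and a 1 hour healthy period before the retry sequence starts over.
func exampleRemediationStrategy() *controlplanev1.RemediationStrategy {
	return &controlplanev1.RemediationStrategy{
		MaxRetry:         pointer.Int32(3),
		RetryPeriod:      metav1.Duration{Duration: 5 * time.Minute},
		MinHealthyPeriod: &metav1.Duration{Duration: time.Hour},
	}
}
```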
5 changes: 5 additions & 0 deletions controlplane/kubeadm/api/v1beta1/zz_generated.deepcopy.go

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion controlplane/kubeadm/internal/controllers/helpers.go
@@ -341,7 +341,7 @@ func (r *KubeadmControlPlaneReconciler) generateMachine(ctx context.Context, kcp
}

// Remove the annotation tracking that a remediation is in progress (the remediation completed when
- // the replacement machine have been created above).
+ // the replacement machine has been created above).
delete(kcp.Annotations, controlplanev1.RemediationInProgressAnnotation)

return nil
96 changes: 49 additions & 47 deletions controlplane/kubeadm/internal/controllers/remediation.go
@@ -73,11 +73,6 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
return ctrl.Result{}, kerrors.NewAggregate(errList)
}

- // Returns if another remediation is in progress but the new machine is not yet created.
- if _, ok := controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]; ok {
- return ctrl.Result{}, nil
- }
-
// Gets all machines that have `MachineHealthCheckSucceeded=False` (indicating a problem was detected on the machine)
// and `MachineOwnerRemediated` present, indicating that this controller is responsible for performing remediation.
unhealthyMachines := controlPlane.UnhealthyMachines()
@@ -99,6 +94,16 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
return ctrl.Result{}, nil
}

+ log = log.WithValues("Machine", klog.KObj(machineToBeRemediated), "initialized", controlPlane.KCP.Status.Initialized)
+
+ // Returns if another remediation is in progress but the new Machine is not yet created.
+ // Note: This condition is checked after we check for unhealthy Machines and if machineToBeRemediated
+ // is being deleted to avoid unnecessary logs if no further remediation should be done.
+ if _, ok := controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]; ok {
+ log.Info("Another remediation is already in progress. Skipping remediation.")
+ return ctrl.Result{}, nil
+ }
+
patchHelper, err := patch.NewHelper(machineToBeRemediated, r.Client)
if err != nil {
return ctrl.Result{}, err
@@ -118,7 +123,6 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C

// Before starting remediation, run preflight checks in order to verify it is safe to remediate.
// If any of the following checks fails, we'll surface the reason in the MachineOwnerRemediated condition.
- log = log.WithValues("Machine", klog.KObj(machineToBeRemediated), "initialized", controlPlane.KCP.Status.Initialized)

// Check if KCP is allowed to remediate considering retry limits:
// - Remediation cannot happen because retryPeriod is not yet expired.
@@ -132,9 +136,11 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
return ctrl.Result{}, nil
}

- // Executes checks that applies only if the control plane is already initialized; in this case KCP can
- // remediate only if it can safely assume that the operation preserves the operation state of the existing cluster (or at least it doesn't make it worst).
if controlPlane.KCP.Status.Initialized {
+ // Executes checks that apply only if the control plane is already initialized; in this case KCP can
+ // remediate only if it can safely assume that the operation preserves the operation state of the
+ // existing cluster (or at least it doesn't make it worse).
+
// The cluster MUST have more than one replica, because this is the smallest cluster size that allows any etcd failure tolerance.
if controlPlane.Machines.Len() <= 1 {
log.Info("A control plane machine needs remediation, but the number of current replicas is less or equal to 1. Skipping remediation", "Replicas", controlPlane.Machines.Len())
@@ -163,22 +169,14 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
return ctrl.Result{}, nil
}
}
- }

- // Prepare the info for tracking the remediation progress into the RemediationInProgressAnnotation.
- remediationInProgressValue, err := remediationInProgressData.Marshal()
- if err != nil {
- return ctrl.Result{}, err
- }
-
- // Start remediating the unhealthy control plane machine by deleting it.
- // A new machine will come up completing the operation as part of the regular
+ // Start remediating the unhealthy control plane machine by deleting it.
+ // A new machine will come up completing the operation as part of the regular reconcile.

- // If the control plane is initialized, before deleting the machine:
- // - if the machine hosts the etcd leader, forward etcd leadership to another machine.
- // - delete the etcd member hosted on the machine being deleted.
- // - remove the etcd member from the kubeadm config map (only for kubernetes version older than v1.22.0)
- if controlPlane.KCP.Status.Initialized {
+ // If the control plane is initialized, before deleting the machine:
+ // - if the machine hosts the etcd leader, forward etcd leadership to another machine.
+ // - delete the etcd member hosted on the machine being deleted.
+ // - remove the etcd member from the kubeadm config map (only for kubernetes version older than v1.22.0)
workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, util.ObjectKey(controlPlane.Cluster))
if err != nil {
log.Error(err, "Failed to create client to workload cluster")
@@ -227,6 +225,12 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
log.Info("Remediating unhealthy machine")
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")

+ // Prepare the info for tracking the remediation progress into the RemediationInProgressAnnotation.
+ remediationInProgressValue, err := remediationInProgressData.Marshal()
+ if err != nil {
+ return ctrl.Result{}, err
+ }
+
// Set annotations tracking remediation details so they can be picked up by the machine
// that will be created as part of the scale up action that completes the remediation.
annotations.AddAnnotations(controlPlane.KCP, map[string]string{
@@ -263,49 +267,39 @@ func (r *KubeadmControlPlaneReconciler) checkRetryLimits(log logr.Logger, machin
return remediationInProgressData, true, nil
}

- // Gets MinHealthySeconds and RetryDelaySeconds from the remediation strategy, or use defaults.
+ // Gets MinHealthyPeriod and RetryPeriod from the remediation strategy, or use defaults.
minHealthyPeriod := controlplanev1.DefaultMinHealthyPeriod
if controlPlane.KCP.Spec.RemediationStrategy != nil && controlPlane.KCP.Spec.RemediationStrategy.MinHealthyPeriod != nil {
minHealthyPeriod = controlPlane.KCP.Spec.RemediationStrategy.MinHealthyPeriod.Duration
}

retryPeriod := time.Duration(0)
if controlPlane.KCP.Spec.RemediationStrategy != nil {
retryPeriod = controlPlane.KCP.Spec.RemediationStrategy.RetryPeriod.Duration
}

// Gets the timestamp of the last remediation; if missing, default to a value
- // that ensures both MinHealthySeconds and RetryDelaySeconds are expired.
+ // that ensures both MinHealthyPeriod and RetryPeriod are expired.
// NOTE: this could potentially lead to executing more retries than expected or to executing retries before than
// expected, but this is considered acceptable when the system recovers from someone/something changes or deletes
// the RemediationForAnnotation on Machines.
- max := func(x, y time.Duration) time.Duration {
- if x < y {
- return y
- }
- return x
- }
-
lastRemediationTime := reconciliationTime.Add(-2 * max(minHealthyPeriod, retryPeriod))
if !lastRemediationData.Timestamp.IsZero() {
lastRemediationTime = lastRemediationData.Timestamp.Time
}

- // Check if the machine being remediated has been created as a remediation for a previous unhealthy machine.
- // NOTE: if someone/something changes or deletes the RemediationForAnnotation on Machines, this could potentially
- // lead to executing more retries than expected, but this is considered acceptable in such a case.
- machineRemediationFor := remediationInProgressData.Machine
- if lastRemediationData.Machine != "" {
- // If the remediation is happening before minHealthyPeriod is expired, then KCP considers this
- // as a remediation for the same previously unhealthy machine.
- if lastRemediationTime.Add(minHealthyPeriod).After(reconciliationTime) {
- machineRemediationFor = lastRemediationData.Machine
- log = log.WithValues("RemediationRetryFor", klog.KRef(machineToBeRemediated.Namespace, machineRemediationFor))
- }
+ // Once we get here we already know that there was a last remediation for the Machine.
+ // If the current remediation is happening before minHealthyPeriod is expired, then KCP considers this
+ // as a remediation for the same previously unhealthy machine.
+ // NOTE: If someone/something changes the RemediationForAnnotation on Machines (e.g. changes the Timestamp),
+ // this could potentially lead to executing more retries than expected, but this is considered acceptable in such a case.
+ var retryForSameMachineInProgress bool
+ if lastRemediationTime.Add(minHealthyPeriod).After(reconciliationTime) {
+ retryForSameMachineInProgress = true
+ log = log.WithValues("RemediationRetryFor", klog.KRef(machineToBeRemediated.Namespace, lastRemediationData.Machine))
+ }

- // If remediation is happening for a different machine, this is the first try of a new retry sequence.
- if lastRemediationData.Machine != machineRemediationFor {
+ // If the retry for the same machine is not in progress, this is the first try of a new retry sequence.
+ if !retryForSameMachineInProgress {
return remediationInProgressData, true, nil
}
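A worked example of the window check above, with illustrative values: with a minHealthyPeriod of one hour, a remediation 30 minutes after the last one continues the existing retry sequence, while one 2 hours later starts a new sequence.

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	minHealthyPeriod := time.Hour
	lastRemediationTime := time.Date(2023, 2, 9, 10, 0, 0, 0, time.UTC)

	// 30 minutes later: still inside minHealthyPeriod, so this counts as a
	// retry for the same previously unhealthy machine.
	reconciliationTime := lastRemediationTime.Add(30 * time.Minute)
	fmt.Println(lastRemediationTime.Add(minHealthyPeriod).After(reconciliationTime)) // true

	// 2 hours later: minHealthyPeriod has expired, so this is treated as the
	// first try of a new retry sequence.
	reconciliationTime = lastRemediationTime.Add(2 * time.Hour)
	fmt.Println(lastRemediationTime.Add(minHealthyPeriod).After(reconciliationTime)) // false
}
```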

@@ -315,7 +309,7 @@ func (r *KubeadmControlPlaneReconciler) checkRetryLimits(log logr.Logger, machin
// Check if remediation can happen because retryPeriod is passed.
if lastRemediationTime.Add(retryPeriod).After(reconciliationTime) {
log.Info(fmt.Sprintf("A control plane machine needs remediation, but the operation already failed in the latest %s. Skipping remediation", retryPeriod))
- conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because the operation already failed in the latest %s (RetryDelay)", retryPeriod)
+ conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because the operation already failed in the latest %s (RetryPeriod)", retryPeriod)
return remediationInProgressData, false, nil
}

@@ -329,12 +323,20 @@ func (r *KubeadmControlPlaneReconciler) checkRetryLimits(log logr.Logger, machin
}
}

- // All the check passed, increase the remediation number.
+ // All the check passed, increase the remediation retry count.
remediationInProgressData.RetryCount++

return remediationInProgressData, true, nil
}

+ // max calculates the maximum duration.
+ func max(x, y time.Duration) time.Duration {
+ if x < y {
+ return y
+ }
+ return x
+ }

// canSafelyRemoveEtcdMember assess if it is possible to remove the member hosted on the machine to be remediated
// without loosing etcd quorum.
//
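The remediation bookkeeping above round-trips through annotations: checkRetryLimits reads the previous attempt's data, and reconcileUnhealthyMachines marshals the new data into the RemediationInProgressAnnotation so the replacement machine can carry it forward. A sketch of that round-trip; the RemediationData shape here is inferred from the fields this diff touches (Machine, Timestamp, RetryCount), not copied from the real type:

```go
import (
	"encoding/json"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// RemediationData is an inferred sketch of the payload stored in the
// remediation annotations (machine name, start timestamp, retry count).
type RemediationData struct {
	Machine    string      `json:"machine"`
	Timestamp  metav1.Time `json:"timestamp"`
	RetryCount int         `json:"retryCount"`
}

// Marshal serializes the data for storage in an annotation value.
func (r *RemediationData) Marshal() (string, error) {
	b, err := json.Marshal(r)
	if err != nil {
		return "", err
	}
	return string(b), nil
}
```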